Example #1
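Roughly the imports this snippet relies on; the `dask_cuda.device_host_file` path is an assumption, and `tmp_path`, `num_device_arrays`, `num_host_arrays`, and `array_size_range` are pytest fixtures/parameters whose parametrize decorator is not shown:

from random import randint

import numpy as np
from numpy.testing import assert_array_equal
import cupy

from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
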
def test_device_host_file_short(tmp_path, num_device_arrays, num_host_arrays,
                                array_size_range):
    tmpdir = tmp_path / "storage"
    tmpdir.mkdir()
    dhf = DeviceHostFile(device_memory_limit=1024 * 16,
                         memory_limit=1024 * 16,
                         local_dir=tmpdir)

    host = [("x-%d" % i, np.random.random(randint(*array_size_range)))
            for i in range(num_host_arrays)]
    device = [("dx-%d" % i, cupy.random.random(randint(*array_size_range)))
              for i in range(num_device_arrays)]

    import random

    full = host + device
    random.shuffle(full)

    for i in full:
        dhf[i[0]] = i[1]

    random.shuffle(full)

    for i in full:
        assert_array_equal(i[1], dhf[i[0]])
        del dhf[i[0]]

    assert set(dhf.device.keys()) == set()
    assert set(dhf.host.keys()) == set()
    assert set(dhf.disk.keys()) == set()
Example #2
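The same test extended with a `jit_unspill` parameter and the newer `local_directory` keyword (apparently replacing `local_dir` from Example #1). Approximate imports, where the sources of `assert_eq` and `DeviceHostFile` are assumptions:

from random import randint

import numpy as np
import cupy
from dask.array.utils import assert_eq  # assumed source of assert_eq
from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
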
def test_device_host_file_short(tmp_path, num_device_arrays, num_host_arrays,
                                array_size_range, jit_unspill):
    tmpdir = tmp_path / "storage"
    tmpdir.mkdir()
    dhf = DeviceHostFile(
        device_memory_limit=1024 * 16,
        memory_limit=1024 * 16,
        local_directory=tmpdir,
        jit_unspill=jit_unspill,
    )

    host = [("x-%d" % i, np.random.random(randint(*array_size_range)))
            for i in range(num_host_arrays)]
    device = [("dx-%d" % i, cupy.random.random(randint(*array_size_range)))
              for i in range(num_device_arrays)]

    import random

    full = host + device
    random.shuffle(full)

    for k, v in full:
        dhf[k] = v

    random.shuffle(full)

    for k, original in full:
        acquired = dhf[k]
        assert_eq(original, acquired)
        del dhf[k]

    assert set(dhf.device.keys()) == set()
    assert set(dhf.host.keys()) == set()
    assert set(dhf.disk.keys()) == set()
Example #3
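This test spins up a one-worker cluster whose data store is a DeviceHostFile and checks that persisted cudf partitions spill from device to host (and, when `spills_to_disk` is set, to disk). Approximate imports; `params` is a pytest parameter and `worker_assert` is a helper defined elsewhere in the same test module, neither of which is shown. Note the older `ncores=` keyword, which Example #4 spells `nthreads=`:

import dask
import dask.datasets
import cudf
from distributed import Worker, get_worker, wait
from distributed.utils_test import gen_cluster
from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
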
def test_cudf_device_spill(params):
    @gen_cluster(
        client=True,
        ncores=[("127.0.0.1", 1)],
        Worker=Worker,
        timeout=300,
        worker_kwargs={
            "memory_limit":
            params["memory_limit"],
            "data":
            DeviceHostFile(
                device_memory_limit=params["device_memory_limit"],
                memory_limit=params["memory_limit"],
            ),
        },
        config={
            "distributed.comm.timeouts.connect": "20s",
            "distributed.worker.memory.target": params["host_target"],
            "distributed.worker.memory.spill": params["host_spill"],
        },
    )
    def test_device_spill(client, scheduler, worker):

        cdf = dask.datasets.timeseries(
            dtypes={"x": int, "y": float}, freq="30ms"
        ).map_partitions(cudf.from_pandas)

        sizes = yield client.compute(
            cdf.map_partitions(lambda df: df.__sizeof__()))
        sizes = sizes.tolist()
        nbytes = sum(sizes)
        part_index_nbytes = (yield client.compute(
            cdf.partitions[0].index)).__sizeof__()

        cdf2 = cdf.persist()
        yield wait(cdf2)

        del cdf

        yield client.run(worker_assert, nbytes, 32, 2048 + part_index_nbytes)

        host_chunks = yield client.run(lambda: len(get_worker().data.host))
        disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
        for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
            if params["spills_to_disk"]:
                assert dc > 0
            else:
                assert hc > 0
                assert dc == 0

        del cdf2

        yield client.run(worker_assert, 0, 0, 0)

    test_device_spill()
Example #4
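A later variant of the same spill test: `nthreads=` instead of `ncores=`, an added `host_pause` setting, cudf imported via importorskip, and a reset index to avoid spilling datetime64 data to disk. Approximate imports under the same assumptions as Example #3; `worker_assert` and `delayed_worker_assert` are local helpers not shown here:

import pytest
import dask
import dask.datasets
from distributed import Worker, get_worker, wait
from distributed.utils_test import gen_cluster
from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
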
def test_cudf_device_spill(params):
    @gen_cluster(
        client=True,
        nthreads=[("127.0.0.1", 1)],
        Worker=Worker,
        timeout=60,
        worker_kwargs={
            "memory_limit": params["memory_limit"],
            "data": DeviceHostFile(
                device_memory_limit=params["device_memory_limit"],
                memory_limit=params["memory_limit"],
            ),
        },
        config={
            "distributed.comm.timeouts.connect": "20s",
            "distributed.worker.memory.target": params["host_target"],
            "distributed.worker.memory.spill": params["host_spill"],
            "distributed.worker.memory.pause": params["host_pause"],
        },
    )
    def test_device_spill(client, scheduler, worker):
        cudf = pytest.importorskip("cudf")
        # There's a known issue with datetime64:
        # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940
        # The same error above happens when spilling datetime64 to disk
        cdf = (
            dask.datasets.timeseries(dtypes={"x": int, "y": float}, freq="20ms")
            .reset_index(drop=True)
            .map_partitions(cudf.from_pandas)
        )

        sizes = yield client.compute(cdf.map_partitions(lambda df: df.__sizeof__()))
        sizes = sizes.tolist()
        nbytes = sum(sizes)
        part_index_nbytes = (yield client.compute(cdf.partitions[0].index)).__sizeof__()

        cdf2 = cdf.persist()
        yield wait(cdf2)

        del cdf

        host_chunks = yield client.run(lambda: len(get_worker().data.host))
        disk_chunks = yield client.run(lambda: len(get_worker().data.disk or list()))
        for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
            if params["spills_to_disk"]:
                assert dc > 0
            else:
                assert hc > 0
                assert dc == 0

        yield client.run(worker_assert, nbytes, 32, 2048 + part_index_nbytes)

        del cdf2

        yield client.run(delayed_worker_assert, 0, 0, 0)

    test_device_spill()
Example #5
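The CuPy counterpart of the spill test: a CuPy-backed dask array is persisted and summed, and its chunks are expected to land on host (and on disk when `spills_to_disk` is set). Approximate imports, with the same caveats about the `params` parameter and the local `worker_assert` helper:

import pytest
import dask.array as da
from distributed import Worker, get_worker, wait
from distributed.utils_test import gen_cluster
from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
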
def test_cupy_device_spill(params):
    @gen_cluster(
        client=True,
        nthreads=[("127.0.0.1", 1)],
        Worker=Worker,
        timeout=60,
        worker_kwargs={
            "memory_limit":
            params["memory_limit"],
            "data":
            DeviceHostFile(
                device_memory_limit=params["device_memory_limit"],
                memory_limit=params["memory_limit"],
            ),
        },
        config={
            "distributed.comm.timeouts.connect": "20s",
            "distributed.worker.memory.target": params["host_target"],
            "distributed.worker.memory.spill": params["host_spill"],
            "distributed.worker.memory.pause": params["host_pause"],
        },
    )
    def test_device_spill(client, scheduler, worker):
        cupy = pytest.importorskip("cupy")
        rs = da.random.RandomState(RandomState=cupy.random.RandomState)
        x = rs.random(int(250e6), chunks=10e6)

        xx = x.persist()
        yield wait(xx)

        # Allow up to 1024 bytes overhead per chunk serialized
        yield client.run(worker_assert, x.nbytes, 1024, 1024)

        y = client.compute(x.sum())
        res = yield y

        # The mean of uniform [0, 1) samples should be close to 0.5
        assert abs(res / x.size - 0.5) < 1e-3

        yield client.run(worker_assert, x.nbytes, 1024, 1024)
        host_chunks = yield client.run(lambda: len(get_worker().data.host))
        disk_chunks = yield client.run(
            lambda: len(get_worker().data.disk or list()))
        for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
            if params["spills_to_disk"]:
                assert dc > 0
            else:
                assert hc > 0
                assert dc == 0

    test_device_spill()
Example #6
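A step-by-step walk through DeviceHostFile eviction: each insert is followed by assertions on exactly which keys sit in device, host, and disk. Approximate imports (the DeviceHostFile path is an assumption):

import numpy as np
from numpy.testing import assert_array_equal
import cupy

from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
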
def test_device_host_file_step_by_step(tmp_path):
    tmpdir = tmp_path / "storage"
    tmpdir.mkdir()
    dhf = DeviceHostFile(device_memory_limit=1024 * 16,
                         memory_limit=1024 * 16,
                         local_dir=tmpdir)

    a = np.random.random(1000)
    b = cupy.random.random(1000)

    dhf["a1"] = a

    assert set(dhf.device.keys()) == set()
    assert set(dhf.host.keys()) == set(["a1"])
    assert set(dhf.disk.keys()) == set()

    dhf["b1"] = b

    assert set(dhf.device.keys()) == set(["b1"])
    assert set(dhf.host.keys()) == set(["a1"])
    assert set(dhf.disk.keys()) == set()

    dhf["b2"] = b
    assert set(dhf.device.keys()) == set(["b1", "b2"])
    assert set(dhf.host.keys()) == set(["a1"])
    assert set(dhf.disk.keys()) == set()

    dhf["b3"] = b
    assert set(dhf.device.keys()) == set(["b2", "b3"])
    assert set(dhf.host.keys()) == set(["a1", "b1"])
    assert set(dhf.disk.keys()) == set()

    dhf["a2"] = a
    assert set(dhf.device.keys()) == set(["b2", "b3"])
    assert set(dhf.host.keys()) == set(["a2", "b1"])
    assert set(dhf.disk.keys()) == set(["a1"])

    dhf["b4"] = b
    assert set(dhf.device.keys()) == set(["b3", "b4"])
    assert set(dhf.host.keys()) == set(["a2", "b2"])
    assert set(dhf.disk.keys()) == set(["a1", "b1"])

    dhf["b4"] = b
    assert set(dhf.device.keys()) == set(["b3", "b4"])
    assert set(dhf.host.keys()) == set(["a2", "b2"])
    assert set(dhf.disk.keys()) == set(["a1", "b1"])

    assert_array_equal(dhf["a1"], a)
    del dhf["a1"]
    assert_array_equal(dhf["a2"], a)
    del dhf["a2"]
    assert_array_equal(dhf["b1"], b)
    del dhf["b1"]
    assert_array_equal(dhf["b2"], b)
    del dhf["b2"]
    assert_array_equal(dhf["b3"], b)
    del dhf["b3"]
    assert_array_equal(dhf["b4"], b)
    del dhf["b4"]

    assert set(dhf.device.keys()) == set()
    assert set(dhf.host.keys()) == set()
    assert set(dhf.disk.keys()) == set()
Example #7
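Checks that DeviceHostFile picks up dask's `temporary_directory` configuration for its on-disk storage path. Approximate imports (the DeviceHostFile path is an assumption):

import os

import dask
from dask_cuda.device_host_file import DeviceHostFile  # assumed import path
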
def test_device_host_file_config(tmp_path):
    dhf_disk_path = str(tmp_path / "dask-worker-space" / "storage")
    with dask.config.set(temporary_directory=str(tmp_path)):
        dhf = DeviceHostFile()
        assert os.path.exists(dhf_disk_path)
        assert dhf.disk_func_path == dhf_disk_path