Esempio n. 1
0
    def __init__(self, context, fft, dk, volume, **kwargs):
        """
        Store FFT metadata, precompute wavenumber magnitudes, create the
        random number generator, and build the two parallelized kernels.

        :arg context: forwarded to ``clr.ThreefryGenerator``.
        :arg fft: object exposing ``dtype``, ``rdtype``, ``cdtype`` and a
            ``sub_k`` mapping whose values provide ``.get()``.
        :arg dk: per-axis factors multiplied into the wavenumber grids.
        :arg volume: stored as ``self.volume``.

        The only keyword argument read here is ``seed`` (default ``13298``),
        which seeds the generator.
        """
        self.fft = fft
        self.volume = volume
        self.dtype = fft.dtype
        self.rdtype = fft.rdtype
        self.cdtype = fft.cdtype

        # fetch each axis' wavenumbers and expand them onto dense grids
        axis_k = [k_axis.get() for k_axis in self.fft.sub_k.values()]
        k_grids = np.meshgrid(*axis_k, indexing="ij", sparse=False)
        k_squared = sum(
            (spacing * grid)**2 for spacing, grid in zip(dk, k_grids))
        self.kmags = np.sqrt(k_squared)

        self.rng = clr.ThreefryGenerator(context,
                                         seed=kwargs.pop("seed", 13298))

        def tile_kernel(kernel):
            # bake the numerical constants into the kernel
            kernel = lp.fix_parameters(kernel, pi=np.pi, sqrt2=np.sqrt(2.))
            # "k" is split in chunks of 32 (inner "l.0", outer "g.0")
            kernel = lp.split_iname(kernel, "k", 32,
                                    inner_tag="l.0", outer_tag="g.0")
            # "j" and then "i" get unit-length splits on "g.1" and "g.2"
            for iname, outer in (("j", "g.1"), ("i", "g.2")):
                kernel = lp.split_iname(kernel, iname, 1,
                                        inner_tag="unr", outer_tag=outer)
            return kernel

        self.wkb_knl = tile_kernel(self.get_wkb_knl())
        self.non_wkb_knl = tile_kernel(self.get_non_wkb_knl())
Esempio n. 2
0
def test_histogram(ctx_factory, grid_shape, proc_shape, dtype, num_bins,
                   timing=False):
    """
    Compare ``ps.Histogrammer`` against :func:`numpy.histogram`.

    Two histograms of the same random field are checked: an unweighted
    "count" histogram and a "squared" histogram weighted by ``fx**2``.

    :arg ctx_factory: callable returning an OpenCL context; if falsy, a
        context is chosen via ``ps.choose_device_and_make_context``.
    :arg grid_shape: shape of the global grid.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg dtype: dtype of the random field; also selects the tolerances.
    :arg num_bins: number of histogram bins.
    :arg timing: if true, also time one histogram call and print the result.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    # double precision gets much tighter tolerances than single precision
    if np.dtype(dtype) in (np.dtype("float64"), np.dtype("complex128")):
        max_rtol, avg_rtol = 1e-10, 1e-11
    else:
        max_rtol, avg_rtol = 5e-4, 5e-5

    from pymbolic import var
    _fx = ps.Field("fx")
    # each entry pairs a bin expression with a weight expression; the numpy
    # references below use unit weights and fx**2 weights, respectively
    histograms = {
        "count": (var("abs")(_fx) * num_bins, 1),
        "squared": (var("abs")(_fx) * num_bins, _fx**2),
    }
    hist = ps.Histogrammer(mpi, histograms, num_bins, dtype, rank_shape=rank_shape)

    rng = clr.ThreefryGenerator(ctx, seed=12321)
    fx = rng.uniform(queue, rank_shape, dtype)
    fx_h = fx.get()

    result = hist(queue, fx=fx)

    res = result["count"]
    # np.prod rather than np.product: the latter alias was removed in NumPy 2.0
    assert np.sum(res.astype("int64")) == np.prod(grid_shape), \
        f"Count histogram doesn't sum to grid_size ({np.sum(res)})"

    # reference: num_bins equal-width bins spanning [0, 1]
    bins = np.linspace(0, 1, num_bins+1).astype(dtype)
    weights = np.ones_like(fx_h)
    np_res = np.histogram(fx_h, bins=bins, weights=weights)[0]
    # combine the per-rank reference histograms across ranks
    np_res = mpi.allreduce(np_res)

    max_err, avg_err = get_errs(res, np_res)
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"Histogrammer inaccurate for grid_shape={grid_shape}" \
        f": {max_err=}, {avg_err=}"

    res = result["squared"]
    np_res = np.histogram(fx_h, bins=bins, weights=fx_h**2)[0]
    np_res = mpi.allreduce(np_res)

    max_err, avg_err = get_errs(res, np_res)
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"Histogrammer with weights inaccurate for grid_shape={grid_shape}" \
        f": {max_err=}, {avg_err=}"

    if timing:
        from common import timer
        t = timer(lambda: hist(queue, fx=fx))
        print(f"histogram took {t:.3f} ms for {grid_shape=}, {dtype=}")
Esempio n. 3
0
def test_share_halos(ctx_factory,
                     grid_shape,
                     proc_shape,
                     h,
                     dtype,
                     _grid_shape,
                     pass_grid_shape,
                     timing=False):
    """
    Check that ``share_halos`` reproduces a periodically padded reference.

    A padded global array is built identically on every rank, its padding
    layers are filled by periodic wrapping, and each rank's interior is
    copied out of it. After ``share_halos`` the rank-local padded array
    must equal the corresponding window of the reference.

    :arg ctx_factory: callable returning an OpenCL context.
    :arg grid_shape: global grid shape, used unless ``_grid_shape`` is truthy.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg h: halo width, either an int (used for all three axes) or a
        per-axis sequence.
    :arg dtype: dtype of the test data.
    :arg _grid_shape: optional override for ``grid_shape``.
    :arg pass_grid_shape: whether the grid shape is handed to the
        decomposition at construction.
    :arg timing: if true, time ``share_halos`` and print on rank 0.
    """
    ctx = ctx_factory()

    # a scalar halo width applies to each of the three axes
    if isinstance(h, int):
        h = (h, ) * 3

    queue = cl.CommandQueue(ctx)
    grid_shape = _grid_shape or grid_shape
    decomp_grid_shape = grid_shape if pass_grid_shape else None
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=decomp_grid_shape)
    rank_shape, substart = mpi.get_rank_shape_start(grid_shape)

    # same seed everywhere, so every rank builds the same reference array
    generator = clr.ThreefryGenerator(ctx, seed=12321)
    padded_global_shape = tuple(Ni + 2 * hi for Ni, hi in zip(grid_shape, h))
    data = generator.uniform(queue, padded_global_shape, dtype).get()

    def along(axis, slc):
        # full slices on every axis except ``axis``, which gets ``slc``
        index = [slice(None)] * 3
        index[axis] = slc
        return tuple(index)

    # periodic wrap: lower padding <- upper interior, upper padding <- lower
    for axis in range(3):
        width = h[axis]
        if width > 0:
            data[along(axis, slice(None, width))] = \
                data[along(axis, slice(-2 * width, -width))]
            data[along(axis, slice(-width, None))] = \
                data[along(axis, slice(width, 2 * width))]

    padded_rank_shape = tuple(ni + 2 * hi for ni, hi in zip(rank_shape, h))
    subdata = np.empty(padded_rank_shape, dtype)

    # where this rank's interior sits inside the padded reference ...
    interior_in_global = tuple(
        slice(si + hi, si + ni + hi)
        for ni, si, hi in zip(rank_shape, substart, h))
    # ... and inside the rank-local padded array
    interior_in_local = tuple(
        slice(hi, -hi) if hi > 0 else slice(None) for hi in h)
    subdata[interior_in_local] = data[interior_in_global]

    subdata_device = cla.to_device(queue, subdata)
    mpi.share_halos(queue, subdata_device)
    shared = subdata_device.get()

    # window of the reference covering interior plus padding of this rank
    padded_window = tuple(
        slice(si, si + ri + 2 * hi)
        for ri, si, hi in zip(rank_shape, substart, h))
    assert (shared == data[padded_window]).all(), \
        f"rank {mpi.rank} {mpi.rank_tuple} has incorrect halo data"

    # test that can call with different-shaped input
    if not pass_grid_shape:
        halved_shape = tuple(ni // 2 + 2 * hi for ni, hi in zip(rank_shape, h))
        subdata_device_new = clr.rand(queue, halved_shape, dtype)
        mpi.share_halos(queue, subdata_device_new)

    if timing:
        from common import timer
        elapsed = timer(lambda: mpi.share_halos(queue, fx=subdata_device))
        if mpi.rank == 0:
            print(f"share_halos took {elapsed:.3f} ms for "
                  f"{grid_shape=}, {h=}, {proc_shape=}")
Esempio n. 4
0
def test_gather_scatter(ctx_factory,
                        grid_shape,
                        proc_shape,
                        h,
                        dtype,
                        _grid_shape,
                        pass_grid_shape,
                        timing=False):
    """
    Round-trip data through ``scatter_array`` and ``gather_array``.

    Every combination of device (``cl.Array``) and host (``np.ndarray``)
    source and destination is exercised with rank 0 as root. After each
    scatter the unpadded interior of the rank-local array must equal this
    rank's slice of the global data; after each gather rank 0 must hold
    the global data again.

    :arg ctx_factory: callable returning an OpenCL context.
    :arg grid_shape: global grid shape, used unless ``_grid_shape`` is truthy.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg h: halo width, either an int (used for all three axes) or a
        per-axis sequence.
    :arg dtype: dtype of the test data.
    :arg _grid_shape: optional override for ``grid_shape``.
    :arg pass_grid_shape: accepted but not used by this test.
    :arg timing: if true, time all eight transfers and print on rank 0.
    """
    ctx = ctx_factory()

    # a scalar halo width applies to each of the three axes
    if isinstance(h, int):
        h = (h, ) * 3

    queue = cl.CommandQueue(ctx)
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h)
    rank_shape, substart = mpi.get_rank_shape_start(grid_shape)

    # this rank's portion of the (unpadded) global array; zipping with h
    # mirrors the original iteration even though the width is not needed
    rank_slice = tuple(
        slice(start, start + extent)
        for extent, start, _ in zip(rank_shape, substart, h))
    pencil_shape = tuple(ni + 2 * hi for ni, hi in zip(rank_shape, h))

    # interior of a rank-local padded array
    unpadded_slc = tuple(slice(hi, -hi) if hi > 0 else slice(None) for hi in h)

    # create random data with same seed on all ranks
    generator = clr.ThreefryGenerator(ctx, seed=12321)
    data = generator.uniform(queue, grid_shape, dtype)

    def interior_matches(local_host):
        # does the interior of a scattered array equal this rank's slice?
        return (local_host[unpadded_slc] == data_h[rank_slice]).all()

    def gathered_matches(global_host):
        # does a gathered array reproduce the global data?
        return (global_host == data_h).all()

    # cl.Array -> cl.Array
    subdata = cla.zeros(queue, pencil_shape, dtype)
    mpi.scatter_array(queue, data if mpi.rank == 0 else None, subdata, 0)
    sub_h = subdata.get()
    data_h = data.get()
    assert interior_matches(sub_h)

    data_test = cla.zeros_like(data)
    mpi.gather_array(queue, subdata, data_test if mpi.rank == 0 else None, 0)
    data_test_h = data_test.get()
    if mpi.rank == 0:
        assert gathered_matches(data_test_h)

    # np.ndarray -> np.ndarray
    mpi.scatter_array(queue, data_h if mpi.rank == 0 else None, sub_h, 0)
    assert interior_matches(sub_h)

    mpi.gather_array(queue, sub_h, data_test_h if mpi.rank == 0 else None, 0)
    if mpi.rank == 0:
        assert gathered_matches(data_test_h)

    # scatter cl.Array -> np.ndarray
    sub_h[:] = 0
    mpi.scatter_array(queue, data if mpi.rank == 0 else None, sub_h, 0)
    assert interior_matches(sub_h)

    # gather np.ndarray -> cl.Array
    data_test[:] = 0
    mpi.gather_array(queue, sub_h, data_test if mpi.rank == 0 else None, 0)
    data_test_h = data_test.get()
    if mpi.rank == 0:
        assert gathered_matches(data_test_h)

    # scatter np.ndarray -> cl.Array
    subdata[:] = 0
    mpi.scatter_array(queue, data_h if mpi.rank == 0 else None, subdata, 0)
    sub_h = subdata.get()
    assert interior_matches(sub_h)

    # gather cl.Array -> np.ndarray
    data_test_h[:] = 0
    mpi.gather_array(queue, subdata, data_test_h if mpi.rank == 0 else None, 0)
    if mpi.rank == 0:
        assert gathered_matches(data_test_h)

    if timing:
        from common import timer
        ntime = 25

        # (label, transfer, first array argument, second array argument),
        # listed in the order the timings are reported
        cases = (
            ("scatter cl.Array -> cl.Array", mpi.scatter_array, data, subdata),
            ("scatter cl.Array -> np.ndarray", mpi.scatter_array, data, sub_h),
            ("scatter np.ndarray -> cl.Array",
             mpi.scatter_array, data_h, subdata),
            ("scatter np.ndarray -> np.ndarray",
             mpi.scatter_array, data_h, sub_h),
            ("gather cl.Array -> cl.Array", mpi.gather_array, subdata, data),
            ("gather cl.Array -> np.ndarray",
             mpi.gather_array, subdata, data_h),
            ("gather np.ndarray -> cl.Array", mpi.gather_array, sub_h, data),
            ("gather np.ndarray -> np.ndarray",
             mpi.gather_array, sub_h, data_h),
        )

        times = {}
        for label, transfer, first, second in cases:
            # defaults pin the current case to the callable
            times[label] = timer(
                lambda transfer=transfer, first=first, second=second:
                transfer(queue, first, second, 0),
                ntime=ntime)

        if mpi.rank == 0:
            print(f"{grid_shape=}, {h=}, {proc_shape=}")
            for label, elapsed in times.items():
                print(f"{label} took {elapsed:.3f} ms")
Esempio n. 5
0
def test_dft(ctx_factory,
             grid_shape,
             proc_shape,
             dtype,
             use_fftw,
             timing=False):
    """
    Compare the forward and inverse transforms of ``ps.DFT`` against NumPy.

    Three properties are asserted: ``idft(dft(f)) / grid_size`` recovers
    ``f``, ``dft`` agrees with NumPy's forward transform, and ``idft``
    (divided by ``grid_size``) agrees with NumPy's inverse transform.
    Afterwards every combination of input/output array kinds is run through
    ``dft`` and ``idft`` to catch runtime errors (results are not checked).

    The test is skipped when ``use_fftw`` is false and ``proc_shape``
    describes more than one rank.

    :arg ctx_factory: callable returning an OpenCL context.
    :arg grid_shape: shape of the global grid.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg dtype: dtype of the position-space data; may be given as a string,
        a NumPy scalar type, or a :class:`numpy.dtype`.
    :arg use_fftw: forwarded to ``ps.DFT``.
    :arg timing: if true, each call in the final loops is timed with 20
        repetitions instead of a single one.
    """
    # np.prod rather than np.product: the latter alias was removed in NumPy 2.0
    if not use_fftw and np.prod(proc_shape) > 1:
        pytest.skip("Must use mpi4py-fft on more than one rank.")

    ctx = ctx_factory()

    queue = cl.CommandQueue(ctx)
    h = 1
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    # halo-free decomposition, used below to gather unpadded arrays
    mpi0 = ps.DomainDecomposition(proc_shape, 0, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype, use_fftw=use_fftw)
    grid_size = np.prod(grid_shape)
    rdtype = fft.rdtype

    if fft.is_real:
        np_dft = np.fft.rfftn
        np_idft = np.fft.irfftn
    else:
        np_dft = np.fft.fftn
        np_idft = np.fft.ifftn

    # normalize through np.dtype so the tight tolerance is selected whether
    # dtype arrives as "float64", np.float64 or np.dtype("float64"); comparing
    # a scalar type against string literals is always False
    double_precision = (np.dtype("float64"), np.dtype("complex128"))
    rtol = 1e-11 if np.dtype(dtype) in double_precision else 2e-3

    # the seed depends on the rank, so each rank draws different data
    rng = clr.ThreefryGenerator(ctx, seed=12321 * (mpi.rank + 1))
    fx = rng.uniform(queue, rank_shape, rdtype) + 1e-2
    if not fft.is_real:
        fx = fx + 1j * rng.uniform(queue, rank_shape, rdtype)

    fx = fx.get()

    fk = fft.dft(fx)
    if isinstance(fk, cla.Array):
        fk = fk.get()
    fk, _fk = fk.copy(), fk  # hang on to one that fftw won't overwrite

    fx2 = fft.idft(_fk)
    if isinstance(fx2, cla.Array):
        fx2 = fx2.get()

    # gather with each rank as root in turn so that, presumably, every rank
    # ends up holding the full position-space array
    fx_glb = np.empty(shape=grid_shape, dtype=dtype)
    for root in range(mpi.nranks):
        mpi0.gather_array(queue, fx, fx_glb, root=root)

    fk_glb_np = np.ascontiguousarray(np_dft(fx_glb))
    fx2_glb_np = np.ascontiguousarray(np_idft(fk_glb_np))

    # with fftw, compare only the portion of the global result that the
    # backend reports as local to this rank
    if use_fftw:
        fk_np = fk_glb_np[fft.fft.local_slice(True)]
        fx2_np = fx2_glb_np[fft.fft.local_slice(False)]
    else:
        fk_np = fk_glb_np
        fx2_np = fx2_glb_np

    # the round trip is compared after dividing by the number of grid points
    max_err, avg_err = get_errs(fx, fx2 / grid_size)
    assert max_err < rtol, \
        f"IDFT(DFT(f)) != f for {grid_shape=}, {proc_shape=}: {max_err=}, {avg_err=}"

    max_err, avg_err = get_errs(fk_np, fk)
    assert max_err < rtol, \
        f"DFT disagrees with numpy for {grid_shape=}, {proc_shape=}:"\
        f" {max_err=}, {avg_err=}"

    max_err, avg_err = get_errs(fx2_np, fx2 / grid_size)
    assert max_err < rtol, \
        f"IDFT disagrees with numpy for {grid_shape=}, {proc_shape=}:"\
        f" {max_err=}, {avg_err=}"

    # uninitialized buffers of every supported kind, with and without halos
    fx_cl = cla.empty(queue, rank_shape, dtype)
    pencil_shape = tuple(ni + 2 * h for ni in rank_shape)
    fx_cl_halo = cla.empty(queue, pencil_shape, dtype)
    fx_np = np.empty(rank_shape, dtype)
    fx_np_halo = np.empty(pencil_shape, dtype)
    fk_cl = cla.empty(queue, fft.shape(True), fft.fk.dtype)
    fk_np = np.empty(fft.shape(True), fft.fk.dtype)

    # FIXME: check that these actually produce the correct result
    fx_types = {
        "cl": fx_cl,
        "cl halo": fx_cl_halo,
        "np": fx_np,
        "np halo": fx_np_halo,
        "None": None
    }

    fk_types = {"cl": fk_cl, "np": fk_np, "None": None}

    # run all of these to ensure no runtime errors even if no timing
    ntime = 20 if timing else 1

    from common import timer

    if mpi.rank == 0:
        print(f"N = {grid_shape}, ",
              "complex" if np.dtype(dtype).kind == "c" else "real")

    from itertools import product
    for (a, input_), (b, output) in product(fx_types.items(),
                                            fk_types.items()):
        t = timer(lambda: fft.dft(input_, output), ntime=ntime)
        if mpi.rank == 0:
            print(f"dft({a}, {b}) took {t:.3f} ms")

    for (a, input_), (b, output) in product(fk_types.items(),
                                            fx_types.items()):
        t = timer(lambda: fft.idft(input_, output), ntime=ntime)
        if mpi.rank == 0:
            print(f"idft({a}, {b}) took {t:.3f} ms")
Esempio n. 6
0
def test_field_histogram(ctx_factory, grid_shape, proc_shape, dtype, timing=False):
    """
    Compare ``ps.FieldHistogrammer`` against :func:`numpy.histogram`.

    A random array with outer shape ``(2, 2)`` is histogrammed, and for each
    outer index both the "linear" and the "log" histogram are compared with
    NumPy, using the bin edges the histogrammer itself returns.

    :arg ctx_factory: callable returning an OpenCL context; if falsy, a
        context is chosen via ``ps.choose_device_and_make_context``.
    :arg grid_shape: shape of the global grid.
    :arg proc_shape: process-grid shape passed to ``ps.DomainDecomposition``.
    :arg dtype: dtype of the random field; also selects the tolerances.
    :arg timing: if true, also time one histogram call and print the result.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    pencil_shape = tuple(Ni + 2 * h for Ni in rank_shape)

    num_bins = 432

    # double precision gets much tighter tolerances than single precision
    if np.dtype(dtype) in (np.dtype("float64"), np.dtype("complex128")):
        max_rtol, avg_rtol = 1e-10, 1e-11
    else:
        max_rtol, avg_rtol = 5e-4, 5e-5

    # relative widening applied to the outermost log-bin edges below; it
    # does not depend on the slice, so it is computed once
    eps = 1e-14 if np.dtype(dtype) == np.dtype("float64") else 1e-4

    hist = ps.FieldHistogrammer(mpi, num_bins, dtype,
                                rank_shape=rank_shape, halo_shape=h)

    rng = clr.ThreefryGenerator(ctx, seed=12321)
    fx = rng.uniform(queue, (2, 2)+pencil_shape, dtype, a=-1.2, b=3.)
    # host copy with the halo layers of the last three axes stripped
    fx_h = fx.get()[..., h:-h, h:-h, h:-h]

    result = hist(fx)

    # every index into the leading (non-spatial) axes
    outer_shape = fx.shape[:-3]
    from itertools import product
    slices = list(product(*[range(n) for n in outer_shape]))

    for slc in slices:
        res = result["linear"][slc]
        np_res = np.histogram(fx_h[slc], bins=result["linear_bins"][slc])[0]
        # combine the per-rank reference histograms across ranks
        np_res = mpi.allreduce(np_res)

        max_err, avg_err = get_errs(res, np_res)
        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"linear Histogrammer inaccurate for grid_shape={grid_shape}" \
            f": {max_err=}, {avg_err=}"

        res = result["log"][slc]
        bins = result["log_bins"][slc]

        # avoid FPA comparison issues
        # numpy sometimes doesn't count the actual maximum/minimum
        bins[0] *= (1 - eps)
        bins[-1] *= (1 + eps)

        # the log histogram is compared against magnitudes of the field
        np_res = np.histogram(np.abs(fx_h[slc]), bins=bins)[0]
        np_res = mpi.allreduce(np_res)

        max_err, avg_err = get_errs(res, np_res)
        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"log Histogrammer inaccurate for grid_shape={grid_shape}" \
            f": {max_err=}, {avg_err=}"

    if timing:
        from common import timer
        t = timer(lambda: hist(fx[0, 0]))
        print(f"field histogram took {t:.3f} ms for {grid_shape=}, {dtype=}")