Exemple #1
0
 def zero_mean_array():
     """Return a halo-padded random array whose reported mean was subtracted.

     A random array of ``grid_shape`` is scattered from root 0 into a local
     array padded by ``h`` on each side of every axis of ``rank_shape``.
     The "mean" entry returned by ``statistics`` is subtracted and the
     halos of the result are shared before it is returned.
     """
     global_field = clr.rand(queue, grid_shape, dtype)
     padded_shape = tuple(ni + 2 * h for ni in rank_shape)
     local_field = clr.rand(queue, padded_shape, dtype)
     mpi.scatter_array(queue, global_field, local_field, root=0)

     mean = statistics(local_field)["mean"]
     centered = local_field - mean
     mpi.share_halos(queue, centered)
     return centered
Exemple #2
0
def make_random_array(queue, dtype, size):
    """Return a random array of shape ``(size,)`` with the given dtype.

    For complex dtypes the real and imaginary parts are drawn separately in
    the real dtype looked up in ``TO_REAL`` and combined afterwards.
    """
    from pyopencl.clrandom import rand

    dtype = np.dtype(dtype)
    shape = (size, )

    # Non-complex dtypes can be drawn directly.
    if dtype.kind != "c":
        return rand(queue, shape=shape, dtype=dtype)

    real_dtype = TO_REAL[dtype]
    real_part = rand(queue, shape=shape, dtype=real_dtype).astype(dtype)
    imag_part = rand(queue, shape=shape, dtype=real_dtype).astype(dtype)
    return real_part + imag_part * dtype.type(1j)
Exemple #3
0
def make_random_array(queue, dtype, size):
    """Draw ``size`` random values of type ``dtype`` on ``queue``.

    Complex dtypes are assembled from two independent draws in the
    corresponding real dtype (taken from ``TO_REAL``): one for the real
    part and one, multiplied by ``1j``, for the imaginary part.
    """
    from pyopencl.clrandom import rand

    dtype = np.dtype(dtype)
    is_complex = dtype.kind == "c"
    component_dtype = TO_REAL[dtype] if is_complex else dtype

    def draw():
        return rand(queue, shape=(size,), dtype=component_dtype)

    if not is_complex:
        return draw()
    return draw().astype(dtype) + draw().astype(dtype) * dtype.type(1j)
Exemple #4
0
def test_basic_complex(ctx_factory):
    """Multiply a random complex64 array by a complex scalar and compare
    the result computed on the array with the one computed on its host copy.
    """
    queue = cl.CommandQueue(ctx_factory())

    from pyopencl.clrandom import rand

    size = 500
    shape = (size,)

    real_part = rand(queue, shape=shape, dtype=np.float32).astype(np.complex64)
    imag_part = rand(queue, shape=shape, dtype=np.float32).astype(np.complex64)
    ary = real_part + imag_part * 1j
    c = np.complex64(5+7j)

    host_ary = ary.get()
    scaled = (ary*c).get()
    assert la.norm(scaled - c*host_ary) < 1e-5 * la.norm(host_ary)
Exemple #5
0
def test_reduction_with_new_shape(ctx_factory,
                                  grid_shape,
                                  proc_shape,
                                  dtype,
                                  op,
                                  _grid_shape,
                                  timing=False):
    """Compare a ``ps.Reduction`` with ``reduce_array``, then repeat the
    comparison with the same reducer on an array of half the grid shape.

    ``_grid_shape`` overrides ``grid_shape`` when it is truthy.  ``timing``
    is accepted but not used by this test.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pystella import Field
    reducers = {}
    reducers["avg"] = [(Field("f"), op)]

    reducer = ps.Reduction(mpi, reducers)

    rtol = 5e-14 if dtype == np.float64 else 1e-5

    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        # np.prod replaces the np.product alias, which NumPy 2.0 removed
        avg_test /= np.prod(grid_shape)

    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    # test call to reducer with new shape
    grid_shape = tuple(Ni // 2 for Ni in grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    f = clr.rand(queue, rank_shape, dtype=dtype)
    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction w/new shape innaccurate for {grid_shape=}, {proc_shape=}"
def test_basic_complex(ctx_factory):
    """Check that a complex scalar times a random complex64 array gives the
    same values as the scalar times the array's host copy.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand

    size = 500

    def draw_as_complex():
        return rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64)

    real_part = draw_as_complex()
    imag_part = draw_as_complex()
    ary = real_part + 1j * imag_part
    c = np.complex64(5+7j)

    host_ary = ary.get()
    scaled = (c*ary).get()
    assert la.norm(scaled - c*host_ary) < 1e-5 * la.norm(host_ary)
Exemple #7
0
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` array along axis 1 with ``BitonicSort``
    and compare against ``np.sort``.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if on_pocl and wants_double and get_pocl_version(dev.platform) < (1, 0):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    unsorted = clrandom.rand(
        queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)
    work = unsorted.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        work.finish()

    sorter = BitonicSort(ctx)
    result, _ = sorter(work, axis=1)
    assert np.array_equal(np.sort(unsorted.get(), axis=1), result.get())
Exemple #8
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random keys with ``BitonicSort`` while carrying an index array,
    then check both the sorted keys and the keys gathered through the index.

    Several platform/device combinations are marked xfail or skipped up
    front.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    import sys
    running_on_pypy = "__pypy__" in sys.builtin_module_names

    if not size and running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail(
            "pypy doesn't seem to handle as_strided on "
            "zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    queue_dev = queue.device
    if (queue_dev.platform.vendor == "The pocl project"
            and queue_dev.type & cl.device_type.GPU):
        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
                "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if on_pocl and wants_double and get_pocl_version(dev.platform) < (1, 0):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if platform_name == "Intel(R) OpenCL" and size == 0:
        pytest.xfail("size-0 arange fails on Intel CL")

    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    keys = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)

    work = keys.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        work.finish()
        index.finish()
    sorted_keys, _ = sorter(work, idx=index, axis=0)

    assert np.array_equal(np.sort(keys.get()), sorted_keys.get())

    # A direct comparison of index with np.argsort is not made: it may differ
    # because of identical values in the array.

    # Check values by indices
    assert np.array_equal(keys.get()[np.argsort(keys.get())],
                          keys.get()[index.get()])
Exemple #9
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Run ``BitonicSort`` on random keys together with an index array and
    verify the sorted keys and the keys gathered through the index.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    platform_name = device.platform.name
    if platform_name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    keys = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)
    sorted_keys, _ = sorter(keys.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(keys.get()), sorted_keys.get())

    # index is not compared with np.argsort directly: the two may differ
    # because of identical values in the array.

    # Check values by indices
    assert np.array_equal(keys.get()[np.argsort(keys.get())],
                          keys.get()[index.get()])
Exemple #10
0
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a copy of a random ``(2, size, 3)`` array along axis 1 and
    compare with ``np.sort`` of the original.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    platform_name = device.platform.name
    if platform_name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    shape = (2, size, 3)
    unsorted = clrandom.rand(
        queue, shape, dtype, luxury=None, a=0, b=239482333)

    sorter = BitonicSort(ctx)
    result, _ = sorter(unsorted.copy(), axis=1)
    assert np.array_equal(np.sort(unsorted.get(), axis=1), result.get())
Exemple #11
0
 def __call__(self, *args, **kwargs):
     """
     Forward directly to pyOpenCL's ``rand``.

     No buffers are managed and no kernel code is compiled here, so
     __call__ is overridden to simply draw float32 values of the shape
     given by the required ``shape`` keyword on ``self.queue``. Positional
     arguments and any other keywords are ignored. Returns a
     pyopencl.array.Array.
     """
     queue = self.queue
     shape = kwargs['shape']
     return rand(queue, shape, numpy.float32)
Exemple #12
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random keys with an accompanying index array and check that
    both the keys and the index-gathered keys come out sorted.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    is_apple_cpu = (dev.platform.name == "Apple"
                    and dev.type & cl.device_type.CPU)
    if is_apple_cpu:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")

    is_pocl_double = (dev.platform.name == "Portable Computing Language"
                      and dtype == np.float64)
    if is_pocl_double:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    values = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(ctx)
    sorted_values, _ = bitonic(values.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(values.get()), sorted_values.get())

    # Comparing index with np.argsort is skipped on purpose: it may be False
    # because of identical values in the array.

    # Check values by indices
    assert np.array_equal(values.get()[np.argsort(values.get())],
                          values.get()[index.get()])
Exemple #13
0
 def rand(self, *args):
     """Return a ``myclArray`` of random ``float_`` values.

     The positional arguments form the shape; with no arguments the shape
     passed to ``clrandom.rand`` is ``1``. The range arguments are a=0.0
     and b=1.0.
     """
     # an empty argument tuple is falsy, so it falls back to 1
     shape = args or 1
     out = clrandom.rand(queue, shape, float_, a=0.0, b=1.0)
     # re-brand the array in place as myclArray, then let it re-initialise
     out.__class__ = myclArray
     out.reinit()
     return out
Exemple #14
0
def test_reduction(ctx_factory,
                   grid_shape,
                   proc_shape,
                   dtype,
                   op,
                   _grid_shape,
                   pass_grid_dims,
                   timing=False):
    """Compare a ``ps.Reduction`` that uses a temporary instruction with
    ``reduce_array`` applied to the same expression.

    ``_grid_shape`` overrides ``grid_shape`` when it is truthy.  With
    ``pass_grid_dims`` the reducer is built with explicit ``rank_shape`` and
    ``grid_size``.  With ``timing`` the reduction is also timed and rank 0
    prints the duration and bandwidth.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pymbolic import var
    from pystella import Field
    # the reduced quantity x is defined as f / 2 + .31
    tmp_insns = [(var("x"), Field("f") / 2 + .31)]

    reducers = {}
    reducers["avg"] = [(var("x"), op)]

    if pass_grid_dims:
        # np.prod replaces the np.product alias, which NumPy 2.0 removed
        reducer = ps.Reduction(mpi,
                               reducers,
                               rank_shape=rank_shape,
                               tmp_instructions=tmp_insns,
                               grid_size=np.prod(grid_shape))
    else:
        reducer = ps.Reduction(mpi, reducers, tmp_instructions=tmp_insns)

    f = clr.rand(queue, rank_shape, dtype=dtype)

    import pyopencl.tools as clt
    pool = clt.MemoryPool(clt.ImmediateAllocator(queue))

    result = reducer(queue, f=f, allocator=pool)
    avg = result["avg"]

    # reference value: the same expression evaluated on the array directly
    avg_test = reducer.reduce_array(f / 2 + .31, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        t = timer(lambda: reducer(queue, f=f, allocator=pool), ntime=1000)
        if mpi.rank == 0:
            print(
                f"reduction took {t:.3f} ms for {grid_shape=}, {proc_shape=}")
            bandwidth = f.nbytes / 1024**3 / t * 1000
            print(f"Bandwidth = {bandwidth:.1f} GB/s")
Exemple #15
0
def test_field_statistics(ctx_factory,
                          grid_shape,
                          proc_shape,
                          dtype,
                          _grid_shape,
                          pass_grid_dims,
                          timing=False):
    """Compare the mean and variance from ``ps.FieldStatistics`` with values
    computed on the host from the unpadded interior of the array.

    ``_grid_shape`` overrides ``grid_shape`` when it is truthy.  With
    ``pass_grid_dims`` the statistics object is built with explicit
    ``rank_shape`` and ``grid_size``.  With ``timing`` the call is also timed
    and rank 0 prints the duration.
    """
    ctx = ctx_factory()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    # np.prod replaces the np.product alias, which NumPy 2.0 removed
    grid_size = np.prod(grid_shape)

    # make select parameters local for convenience
    # (the array and the statistics below use a padding of 2, while the
    # decomposition above was created with h = 1)
    h = 2
    f = clr.rand(queue, (2, 1) + tuple(ni + 2 * h for ni in rank_shape),
                 dtype=dtype)

    if pass_grid_dims:
        statistics = ps.FieldStatistics(mpi,
                                        h,
                                        rank_shape=rank_shape,
                                        grid_size=grid_size)
    else:
        statistics = ps.FieldStatistics(mpi, h)

    import pyopencl.tools as clt
    pool = clt.MemoryPool(clt.ImmediateAllocator(queue))

    stats = statistics(f, allocator=pool)
    avg = stats["mean"]
    var = stats["variance"]

    # host reference: sum over the last three axes with the padding removed
    f_h = f.get()
    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h], axis=(-3, -2, -1))
    avg_test = mpi.allreduce(rank_sum) / grid_size

    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h]**2, axis=(-3, -2, -1))
    var_test = mpi.allreduce(rank_sum) / grid_size - avg_test**2

    rtol = 5e-14 if dtype == np.float64 else 1e-5

    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"average innaccurate for {grid_shape=}, {proc_shape=}"

    assert np.allclose(var, var_test, rtol=rtol, atol=0), \
        f"variance innaccurate for {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        t = timer(lambda: statistics(f, allocator=pool))
        if mpi.rank == 0:
            print(
                f"field stats took {t:.3f} ms "
                f"for outer shape {f.shape[:-3]}, {grid_shape=}, {proc_shape=}"
            )
Exemple #16
0
 def _evaluate(self, valuation, cache):
     """Evaluate the first operand, masked by a fresh random mask.

     When ``self.test`` is set the operand's value is returned unchanged.
     Otherwise a mask ``rand >= self.ratio`` is drawn, stored on
     ``self.mask`` and multiplied into the operand; the product is
     memoised in ``cache`` under ``id(self)``.
     """
     if self.test:
         return self.ops[0]._evaluate(valuation, cache)

     key = id(self)
     if key in cache:
         return cache[key]

     queue = pl.qs[0]
     value = self.ops[0]._evaluate(valuation, cache)
     self.mask = clrandom.rand(queue, value.shape, value.dtype) >= self.ratio
     cache[key] = value * self.mask
     return cache[key]
Exemple #17
0
    def rand(shape: Union[tuple[int, ...], int] = (1, 1), gpu=False) -> Tensor:
        """Create a float32 tensor of the given shape filled with random values.

        With ``gpu`` set, the values come from ``clrandom.rand`` on ``QUEUE``
        and the tensor is flagged ``gpu=True``; otherwise they come from
        ``np.random.rand``.
        """
        if gpu:
            device_values = clrandom.rand(QUEUE, shape, np.float32)
            return Tensor(device_values, gpu=True)

        # np.random.rand takes the dimensions as separate arguments
        dims = shape if isinstance(shape, tuple) else (shape,)
        host_values = np.random.rand(*dims).astype(np.float32)
        return Tensor(host_values)
Exemple #18
0
 def randint(self, low, high=None, size=1):
     """Return a ``myclArray`` of random int32 values.

     With only ``low`` given, the range arguments passed to
     ``clrandom.rand`` are a=0 and b=low; with ``high`` given they are
     a=low and b=high. ``size`` is passed through as the shape.
     """
     # Test against None explicitly: a plain truthiness check would treat
     # an explicit high=0 (e.g. randint(-5, 0)) as "high not given".
     if high is not None:
         a, b = low, high
     else:
         a, b = 0, low
     res = clrandom.rand(queue, size, np.int32, a=a, b=b)
     # re-brand the array in place as myclArray, then let it re-initialise
     res.__class__ = myclArray
     res.reinit()
     return res
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a copy of a random ``(2, size, 3)`` array along axis 1 with
    ``BitonicSort`` and compare with ``np.sort``.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    platform_name = ctx.devices[0].platform.name
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    unsorted = clrandom.rand(
        queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)
    bitonic = BitonicSort(ctx)
    result, _ = bitonic(unsorted.copy(), axis=1)

    expected = np.sort(unsorted.get(), axis=1)
    assert np.array_equal(expected, result.get())
Exemple #20
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random keys with ``BitonicSort`` while carrying an index array,
    then verify the sorted keys and the keys gathered through the index.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    if not size and running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail(
            "pypy doesn't seem to handle as_strided on "
            "zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if on_pocl and wants_double and get_pocl_version(dev.platform) < (1, 0):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    keys = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)
    sorted_keys, _ = sorter(keys.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(keys.get()), sorted_keys.get())

    # index is not compared with np.argsort directly: the two may differ
    # because of identical values in the array.

    # Check values by indices
    assert np.array_equal(keys.get()[np.argsort(keys.get())],
                          keys.get()[index.get()])
Exemple #21
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Argsort check for ``BitonicSort``: random values are sorted along
    with an index array, and both outputs are compared with NumPy.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    import sys

    if not size and '__pypy__' in sys.builtin_module_names:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail(
            "pypy doesn't seem to handle as_strided on "
            "zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    pocl_name = "Portable Computing Language"

    if dev.platform.name == pocl_name and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if (dev.platform.name == pocl_name
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    values = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(ctx)
    sorted_values, _ = bitonic(values.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(values.get()), sorted_values.get())

    # Comparing index with np.argsort is skipped on purpose: it may be False
    # because of identical values in the array.

    # Check values by indices
    assert np.array_equal(values.get()[np.argsort(values.get())],
                          values.get()[index.get()])
Exemple #22
0
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a copy of a random ``(2, size, 3)`` array along axis 1 and
    compare with ``np.sort`` of the original.

    NOTE(review): ``ctx_factory`` is accepted but never used; the context is
    obtained from ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    wants_double = dtype == np.float64

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail(
            "Bitonic sort won't work on Apple CPU: "
            "no workgroup parallelism")
    if (platform_name == "Portable Computing Language"
            and wants_double
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    unsorted = clrandom.rand(
        queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)
    bitonic = BitonicSort(ctx)
    result, _ = bitonic(unsorted.copy(), axis=1)
    assert np.array_equal(np.sort(unsorted.get(), axis=1), result.get())
Exemple #23
0
import numpy
from time import time
import os
#os.environ['PYOPENCL_COMPILER_OUTPUT']="1"

# The snippet uses `cl` and `ReductionKernel` without importing them, so the
# imports are made explicit here to keep the script runnable on its own.
import pyopencl as cl
from pyopencl.reduction import ReductionKernel


ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Dot product as a reduction: multiply element-wise, then sum.
dot = ReductionKernel(ctx, dtype_out=numpy.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="__global const float *x, __global const float *y")

import pyopencl.clrandom as cl_rand

x = cl_rand.rand(queue, (1000*1000), dtype=numpy.float32)
y = cl_rand.rand(queue, (1000*1000), dtype=numpy.float32)

# Time the reduction kernel, including fetching the result to the host.
t1 = time()
x_dot_y = dot(x, y).get()
gpu_time = (time()-t1)

# Time numpy.dot on host copies; the two .get() transfers are inside the
# timed region.
t1 = time()
x_dot_y_cpu = numpy.dot(x.get(), y.get())
cpu_time = time()-t1

# print() call instead of the Python 2 print statement, which is a syntax
# error on Python 3; the printed text is unchanged.
print("CPU time (s)", cpu_time)
Exemple #24
0
def test_stencil(ctx_factory,
                 grid_shape,
                 proc_shape,
                 dtype,
                 stream,
                 h=1,
                 timing=False):
    """Apply a six-neighbour sum stencil and compare with NumPy slicing.

    ``y[i, j, k]`` is set to the sum of the six entries of ``x`` offset by
    ``h`` along each axis, where ``x`` is padded by ``h`` on every side.
    With ``stream`` a ``ps.StreamingStencil`` is used (the test is skipped
    if constructing it fails), otherwise a ``ps.Stencil``.  With ``timing``
    the stencil is also timed and the duration and bandwidth are printed.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    x = var("x")
    y = var("y")
    i, j, k = var("i"), var("j"), var("k")

    # indices into x are shifted by h to account for the padding
    map_dict = {}
    map_dict[y[i, j, k]] = (
        x[i + h + h, j + h, k + h] + x[i + h, j + h + h, k + h]
        + x[i + h, j + h, k + h + h] + x[i - h + h, j + h, k + h]
        + x[i + h, j - h + h, k + h] + x[i + h, j + h, k - h + h])

    if stream:
        try:
            stencil_map = ps.StreamingStencil(map_dict,
                                              prefetch_args=["x"],
                                              halo_shape=h)
        except Exception:
            # Any construction failure means the streaming variant cannot be
            # used here.  A bare ``except:`` would also swallow
            # KeyboardInterrupt and SystemExit, so only Exception is caught.
            pytest.skip("StreamingStencil unavailable")
    else:
        stencil_map = ps.Stencil(map_dict, h, prefetch_args=["x"])

    # from here on x and y name the arrays, not the symbolic variables
    x = clr.rand(queue, tuple(ni + 2 * h for ni in rank_shape), dtype)
    y = clr.rand(queue, rank_shape, dtype)

    x_h = x.get()
    y_true = (x_h[2 * h:, h:-h, h:-h] + x_h[h:-h, 2 * h:, h:-h] +
              x_h[h:-h, h:-h, 2 * h:] + x_h[:-2 * h, h:-h, h:-h] +
              x_h[h:-h, :-2 * h, h:-h] + x_h[h:-h, h:-h, :-2 * h])

    stencil_map(queue, x=x, y=y)

    max_rtol = 5e-14 if dtype == np.float64 else 1e-5
    avg_rtol = 5e-14 if dtype == np.float64 else 1e-5

    max_err, avg_err = get_errs(y_true, y.get())
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"y innaccurate for {grid_shape=}, {h=}, {proc_shape=}" \
        f": {max_err=}, {avg_err=}"

    if timing:
        from common import timer
        t = timer(lambda: stencil_map(queue, x=x, y=y)[0])
        print(
            f"stencil took {t:.3f} ms for {grid_shape=}, {h=}, {proc_shape=}")
        bandwidth = (x.nbytes + y.nbytes) / 1024**3 / t * 1000
        print(f"Bandwidth = {bandwidth} GB/s")
Exemple #25
0
def test_elementwise(ctx_factory, grid_shape, proc_shape, dtype, timing=False):
    """Run ``ps.ElementWiseMap`` with temporaries, and then a map with a
    single instruction, comparing each result with the same expressions
    evaluated directly on the arrays.

    With ``timing`` the first map is also timed and the duration and
    bandwidth are printed.
    """
    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    sym_a = var("a")
    sym_b = var("b")

    from pystella.field import Field
    fld_x = Field("x")
    fld_y = Field("y")
    fld_z = Field("z")

    tmp_dict = {
        sym_a[0]: fld_x + 2,
        sym_a[1]: 2 + fld_x * fld_y,
        sym_b: fld_x + fld_y / 2,
    }
    map_dict = {
        fld_x: sym_a[0] * fld_y**2 * fld_x + sym_a[1] * sym_b,
        fld_z: fld_z + sym_a[1] * sym_b,
    }
    single_insn = {fld_x: fld_y + fld_z}

    ew_map = ps.ElementWiseMap(map_dict, tmp_instructions=tmp_dict)

    x = clr.rand(queue, rank_shape, dtype=dtype)
    y = clr.rand(queue, rank_shape, dtype=dtype)
    z = clr.rand(queue, rank_shape, dtype=dtype)

    # expected values are computed before the map is called on x, y and z
    a0 = x + 2
    a1 = 2 + x * y
    b = x + y / 2
    x_true = a0 * y**2 * x + a1 * b
    z_true = z + a1 * b

    ew_map(queue, x=x, y=y, z=z)

    max_rtol = 5e-14 if dtype == np.float64 else 1e-5
    avg_rtol = 5e-14 if dtype == np.float64 else 1e-5

    def check(label, expected, actual):
        max_err, avg_err = get_errs(expected.get(), actual.get())
        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"{label} innaccurate for {grid_shape=}, {proc_shape=}: " \
            f"{max_err=}, {avg_err=}"

    check("x", x_true, x)
    check("z", z_true, z)

    # test success of single instruction
    ew_map_single = ps.ElementWiseMap(single_insn)
    ew_map_single(queue, x=x, y=y, z=z)

    x_true = y + z
    check("x", x_true, x)

    if timing:
        from common import timer
        t = timer(lambda: ew_map(queue, x=x, y=y, z=z)[0])
        print(
            f"elementwise map took {t:.3f} ms for {grid_shape=}, {proc_shape=}"
        )
        bandwidth = 5 * x.nbytes / 1024**3 / t * 1000
        print(f"Bandwidth = {bandwidth:.1f} GB/s")
Exemple #26
0
import pyopencl as cl
from pyopencl import array
from pyopencl import clrandom
import numpy as np
# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Draw a 4000 x 5000 int32 random array, passing a=0 and b=9 as the range
# arguments of clrandom.rand.
s = clrandom.rand(queue, (4000, 5000,), np.int32, luxury=None, a=0, b=9)
Exemple #27
0
 def random(self, size=None):
     """Return a ``myclArray`` of random ``float_`` values.

     ``size`` is used as the shape; any falsy ``size`` (including the
     default ``None``) is replaced by ``1``. The range arguments are a=0.0
     and b=1.0.
     """
     shape = size or 1
     out = clrandom.rand(queue, shape, float_, a=0.0, b=1.0)
     # re-brand the array in place as myclArray, then let it re-initialise
     out.__class__ = myclArray
     out.reinit()
     return out
from pyopencl.scan import GenericScanKernel

# np.cumsum([1, 2, 3])
# np.array([1, 3, 6])

# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
print("queue: ", queue)
print()

# Scan kernel over doubles: input x[i], combined with a+b, written to y[i].
sknl = GenericScanKernel(ctx,
                         np.float64,
                         arguments="double *y, double *x",
                         input_expr="x[i]",
                         scan_expr="a+b",
                         neutral="0",
                         output_statement="y[i] = item;")

n = 10**7
x = clrand.rand(queue, n, np.float64)
print("x:", x)
print()

result = cl.array.empty_like(x)
# result = cl.array.arange(queue, n, dtype=np.float64)
sknl(result, x, queue=queue)
print("result", result)
print()

# Fetch the result to the host once and print that copy, instead of calling
# .get() a second time just for printing.
result_np = result.get()
print("result_np", result_np)
Exemple #29
0
import pyopencl as cl
from pyopencl import array
from pyopencl import clrandom
import numpy as np

# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Draw a 4 x 5 int32 random array, passing a=0 and b=9 as the range
# arguments of clrandom.rand.
s = clrandom.rand(queue, (4, 5), np.int32, luxury=None, a=0, b=9)
Exemple #30
0
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    """Solve with ``ps.SpectralPoissonSolver`` for several ``m_squared``
    values and check that ``lap(fx) - rho - m_squared * fx`` is small
    relative to ``rho``.

    For ``h == 0`` the Laplacian comes from a ``ps.SpectralCollocator``,
    otherwise from a ``ps.FiniteDifferencer`` with matching eigenvalues.
    With ``timing`` the solver is also timed and rank 0 prints the duration.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    # np.prod replaces the np.product alias, which NumPy 2.0 removed
    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    # subtract the reported mean from the source term
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            # drop the padding so fx_h matches the shape of lap and rho
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        max_rtol = 1e-12 if dtype == np.float64 else 1e-4
        avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        # m_squared still holds the last value from the loop above
        time = timer(lambda: solver(queue, fx, rho, m_squared=m_squared), ntime=10)

        if mpi.rank == 0:
            print(f"poisson took {time:.3f} ms for {grid_shape=}, {proc_shape=}")
Exemple #31
0

from collections import defaultdict

# Timings collected per sorter name, one entry per tested size.
times = defaultdict(list)
dtype = np.int32
xs = range(5, 32)
bs = BitonicSort(ctx)
rs = RadixSort(ctx,
               "int *ary",
               key_expr="ary[i]",
               sort_arg_names=["ary"],
               scan_kernel=GenericScanKernel)
for size in xs:
    print("running size=2^{} = {}".format(size, 2**size))
    s = clrandom.rand(queue, (2**size, ), dtype, a=0, b=2**16)
    # NOTE(review): `.microseconds` looks like the sub-second field of a
    # datetime.timedelta; if so, runs longer than one second are
    # under-reported and total_seconds() would be the right call. Confirm
    # against the test_*_speed helpers.
    times['bitonic'].append(test_bitonic_speed(s, bs).microseconds / 1000000.)
    times['radix'].append(test_radix_speed(s, rs).microseconds / 1000000.)
    times['numpy'].append(
        test_numpy_speed(s.get().copy()).microseconds / 1000000.)

# Materialise the keys as a list: on Python 3 a list cannot be concatenated
# with the view returned by dict.keys().
columns = list(times)
print("\t".join(["Size"] + columns))
for idx, s in enumerate(xs):
    print("\t".join(["2^" + str(s)] +
                    [str(times[k][idx]) for k in columns]))

font = {'size': 30}
import matplotlib
import matplotlib.pyplot as plt
from pluck import pluck
Exemple #32
0
def test_share_halos(ctx_factory,
                     grid_shape,
                     proc_shape,
                     h,
                     dtype,
                     _grid_shape,
                     pass_grid_shape,
                     timing=False):
    """Check ``share_halos`` against a reference array built on the host.

    A padded global array is generated from a fixed seed and its halo layers
    are filled on the host by copying from the opposite interior edge along
    each axis.  The interior of this rank's piece is uploaded, halos are
    exchanged with ``mpi.share_halos``, and the result must equal the
    matching padded window of the reference array.

    :arg ctx_factory: callable (no arguments) returning the OpenCL context.
    :arg grid_shape: global grid shape, used unless ``_grid_shape`` is truthy.
    :arg proc_shape: process grid handed to ``ps.DomainDecomposition``.
    :arg h: halo width, either one ``int`` for all axes or one entry per axis.
    :arg dtype: dtype of the generated data.
    :arg _grid_shape: when truthy, replaces ``grid_shape``.
    :arg pass_grid_shape: whether ``grid_shape`` is given to the
        decomposition; when false, an extra call with a differently shaped
        array is also exercised.
    :arg timing: when true, time ``share_halos`` and print it on rank 0.
    """
    ctx = ctx_factory()

    # Promote a scalar halo width to one width per axis.
    if isinstance(h, int):
        h = (h, ) * 3

    queue = cl.CommandQueue(ctx)
    grid_shape = _grid_shape or grid_shape
    decomp_grid_shape = grid_shape if pass_grid_shape else None
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=decomp_grid_shape)
    rank_shape, substart = mpi.get_rank_shape_start(grid_shape)

    # data will be same on each rank
    rng = clr.ThreefryGenerator(ctx, seed=12321)
    padded_grid_shape = tuple(Ni + 2 * hi for Ni, hi in zip(grid_shape, h))
    data = rng.uniform(queue, padded_grid_shape, dtype).get()

    # Fill the reference halos axis by axis (axis 0 first): the low halo
    # takes the last interior layers, the high halo the first interior layers.
    for axis in range(3):
        width = h[axis]
        if width > 0:
            def along(slc, axis=axis):
                index = [slice(None)] * 3
                index[axis] = slc
                return tuple(index)

            data[along(slice(None, width))] = \
                data[along(slice(-2 * width, -width))]
            data[along(slice(-width, None))] = \
                data[along(slice(width, 2 * width))]

    # Host buffer for this rank: only its interior is filled from the
    # reference, the halo layers are left for share_halos to populate.
    padded_rank_shape = tuple(ni + 2 * hi for ni, hi in zip(rank_shape, h))
    subdata = np.empty(padded_rank_shape, dtype)
    rank_slice = tuple(
        slice(start + hi, start + ni + hi)
        for ni, start, hi in zip(rank_shape, substart, h))
    unpadded_slc = tuple(
        slice(hi, -hi) if hi > 0 else slice(None)
        for hi in h)
    subdata[unpadded_slc] = data[rank_slice]

    subdata_device = cla.to_device(queue, subdata)
    mpi.share_halos(queue, subdata_device)
    subdata2 = subdata_device.get()

    # Window of the reference covering this rank's interior plus halos.
    pencil_slice = tuple(
        slice(start, start + ni + 2 * hi)
        for ni, start, hi in zip(rank_shape, substart, h))
    expected = data[pencil_slice]
    assert (subdata2 == expected).all(), \
        f"rank {mpi.rank} {mpi.rank_tuple} has incorrect halo data"

    # test that can call with different-shaped input
    if not pass_grid_shape:
        half_shape = tuple(ni // 2 + 2 * hi for ni, hi in zip(rank_shape, h))
        subdata_device_new = clr.rand(queue, half_shape, dtype)
        mpi.share_halos(queue, subdata_device_new)

    if timing:
        from common import timer

        def exchange():
            return mpi.share_halos(queue, fx=subdata_device)

        t = timer(exchange)
        if mpi.rank == 0:
            print(f"share_halos took {t:.3f} ms for "
                  f"{grid_shape=}, {h=}, {proc_shape=}")
Exemple #33
0
# simulation parameters: grid layout, halo width, precision and step sizes
grid_shape = (128, 128, 128)
proc_shape = (1, 1, 1)
rank_shape = tuple(
    n_grid // n_proc for n_grid, n_proc in zip(grid_shape, proc_shape))
halo_shape = 1
dtype = "float64"
dx = tuple(10 / n_grid for n_grid in grid_shape)
dt = min(dx) / 10  # time step is a tenth of the smallest grid spacing

# pyopencl context and command queue, plus the domain decomposition
# (the halo-sharer)
ctx = ps.choose_device_and_make_context()
queue = cl.CommandQueue(ctx)
decomp = ps.DomainDecomposition(proc_shape, halo_shape, rank_shape)

# random initial data: f and dfdt are padded by halo_shape on both sides of
# every axis, while lap_f has the unpadded rank shape and starts at zero
padded_shape = tuple(ni + 2 * halo_shape for ni in rank_shape)
f = clr.rand(queue, padded_shape, dtype)
dfdt = clr.rand(queue, padded_shape, dtype)
lap_f = cla.zeros(queue, rank_shape, dtype)

# symbolic system of equations; the trailing underscore keeps the symbolic
# field from overwriting the array f
f_ = ps.DynamicField("f", offset="h")
rhs_dict = {}
rhs_dict[f_] = f_.dot  # df/dt = \dot{f}
rhs_dict[f_.dot] = f_.lap  # d\dot{f}/dt = \nabla^2 f

# kernels for time stepping and for computing derivatives
stepper = ps.LowStorageRK54(rhs_dict, dt=dt, halo_shape=halo_shape)
derivs = ps.FiniteDifferencer(decomp, halo_shape, dx)

t = 0.0
Exemple #34
0
def test_scalar_energy(ctx_factory,
                       grid_shape,
                       proc_shape,
                       h,
                       dtype,
                       timing=False):
    """Compare a ``ps.Reduction`` over a ``ps.ScalarSector`` with NumPy.

    Random device arrays are generated for two scalar fields, their time
    derivatives and their Laplacians.  The kinetic, gradient and potential
    entries returned by the reduction are then checked against the same
    quantities computed on the host from the interior (halo-free) part of
    each array.

    :arg ctx_factory: callable returning an OpenCL context; when falsy,
        ``ps.choose_device_and_make_context()`` is used instead.
    :arg grid_shape: global grid shape.
    :arg proc_shape: process grid handed to ``ps.DomainDecomposition``.
    :arg h: integer halo width applied to every axis.
    :arg dtype: dtype of the generated arrays; also selects the tolerances.
    :arg timing: when true, time the reduction and print it on rank 0.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    # np.product was a deprecated alias that NumPy 2.0 removed; np.prod is
    # the supported spelling and returns the same value.
    grid_size = np.prod(grid_shape)

    nscalars = 2

    def potential(f):
        phi, chi = f[0], f[1]
        return 1 / 2 * phi**2 + 1 / 2 * chi**2 + 1 / 2 * phi**2 * chi**2

    scalar_sector = ps.ScalarSector(nscalars, potential=potential)
    scalar_energy = ps.Reduction(mpi,
                                 scalar_sector,
                                 rank_shape=rank_shape,
                                 grid_size=grid_size,
                                 halo_shape=h)

    pencil_shape = tuple(ni + 2 * h for ni in rank_shape)
    f = clr.rand(queue, (nscalars, ) + pencil_shape, dtype)
    dfdt = clr.rand(queue, (nscalars, ) + pencil_shape, dtype)
    lap = clr.rand(queue, (nscalars, ) + rank_shape, dtype)

    energy = scalar_energy(queue, f=f, dfdt=dfdt, lap_f=lap, a=np.array(1.))

    # Slice that strips the halo layers.  With h == 0 a literal `h:-h` would
    # be `0:0` and select nothing, so fall back to the full extent.
    if h > 0:
        interior = (slice(h, -h), ) * 3
    else:
        interior = (slice(None), ) * 3

    kin_test = []
    grad_test = []
    for fld in range(nscalars):
        df_h = dfdt[fld].get()
        rank_sum = np.sum(df_h[interior]**2)
        kin_test.append(1 / 2 * mpi.allreduce(rank_sum) / grid_size)

        f_h = f[fld].get()
        lap_h = lap[fld].get()

        rank_sum = np.sum(-f_h[interior] * lap_h)
        grad_test.append(1 / 2 * mpi.allreduce(rank_sum) / grid_size)

    energy_test = {}
    energy_test["kinetic"] = np.array(kin_test)
    energy_test["gradient"] = np.array(grad_test)

    phi = f[0].get()[interior]
    chi = f[1].get()[interior]
    pot_rank = np.sum(potential([phi, chi]))
    energy_test["potential"] = np.array(mpi.allreduce(pot_rank) / grid_size)

    max_rtol = 1e-14 if dtype == np.float64 else 1e-5
    avg_rtol = 1e-14 if dtype == np.float64 else 1e-5

    # Every quantity the reduction returns must match its host reference.
    for key, value in energy.items():
        max_err, avg_err = get_errs(value, energy_test[key])
        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"{key} inaccurate for {nscalars=}, {grid_shape=}, {proc_shape=}" \
            f": {max_err=}, {avg_err=}"

    if timing:
        from common import timer
        t = timer(lambda: scalar_energy(
            queue, a=np.array(1.), f=f, dfdt=dfdt, lap_f=lap))
        if mpi.rank == 0:
            print(f"scalar energy took {t:.3f} "
                  f"ms for {nscalars=}, {grid_shape=}, {proc_shape=}")
Exemple #35
0
import pyopencl as cl
import pyopencl.clrandom as clrand
from pyopencl.elementwise import ElementwiseKernel
import numpy as np

# OpenCL context and command queue used by everything below
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# two random float32 device vectors of length n
n = 10 ** 6
a, b = (clrand.rand(queue, n, np.float32) for _ in range(2))

# linear combination written with array arithmetic, copied back to the host
c1 = 5 * a + 6 * b
result_np = c1.get()

# the same linear combination as a single custom elementwise kernel
kernel_arguments = "float a, float *x, float b, float *y, float *c"
kernel_operation = "c[i] = a * x[i] + b * y[i]"
lin_comb = ElementwiseKernel(ctx, kernel_arguments, kernel_operation)

c2 = cl.array.empty_like(a)
lin_comb(5, a, 6, b, c2)
result_np = c2.get()  # replaces the host copy taken from c1 above
Exemple #36
0
        round(0.1 * 100 / (nz**(3 - ndim) * n**ndim *
                           np.dtype(dtype).itemsize * ndim * 2 * 2 / 1024**3)))
    nb = max(nb, 1)
    nb = min(nb, 1000)
    # print("%4d (nb=%4d)"%(n, nb))

    if ndim == 1:
        sh = nz, nz, n
    elif ndim == 2:
        sh = nz, n, n
    else:
        sh = n, n, n

    # OpenCL backends
    if has_pyvkfft_opencl or has_gpyfft:
        d = clrandom.rand(cq, shape=sh, dtype=np.float32).astype(dtype)

    if has_pyvkfft_opencl:
        dt = 0
        try:
            app = clVkFFTApp(d.shape, d.dtype, queue=cq, ndim=ndim)
            for i in range(nb_repeat):
                cq.finish()
                t0 = timeit.default_timer()
                for i in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cq.finish()
                dt1 = timeit.default_timer() - t0
                if dt == 0:
                    dt = dt1