def zero_mean_array():
    """Build a random halo-padded array with its mean subtracted.

    Reads ``queue``, ``grid_shape``, ``rank_shape``, ``h``, ``dtype``,
    ``mpi`` and ``statistics`` from the surrounding scope. A random
    full-grid array is scattered from root 0 into the padded local array,
    the mean reported by ``statistics`` is subtracted, and halos are shared
    before the result is returned.
    """
    padded_shape = tuple(ni + 2 * h for ni in rank_shape)

    global_field = clr.rand(queue, grid_shape, dtype)
    local_field = clr.rand(queue, padded_shape, dtype)
    mpi.scatter_array(queue, global_field, local_field, root=0)

    mean = statistics(local_field)["mean"]
    centered = local_field - mean
    mpi.share_halos(queue, centered)
    return centered
def make_random_array(queue, dtype, size):
    """Return a random 1-D device array of length *size* and type *dtype*.

    For complex dtypes the real and imaginary parts are drawn separately
    using the real dtype looked up in ``TO_REAL``.
    """
    from pyopencl.clrandom import rand

    dtype = np.dtype(dtype)
    shape = (size,)

    # Non-complex dtypes can be generated directly.
    if dtype.kind != "c":
        return rand(queue, shape=shape, dtype=dtype)

    real_dtype = TO_REAL[dtype]
    real_part = rand(queue, shape=shape, dtype=real_dtype).astype(dtype)
    imag_part = rand(queue, shape=shape, dtype=real_dtype).astype(dtype)
    return real_part + imag_part * dtype.type(1j)
def make_random_array(queue, dtype, size):
    """Create a length-*size* random array on *queue* with the given *dtype*.

    Complex dtypes are assembled from two independent real-valued draws
    (real dtype taken from ``TO_REAL``); every other dtype is drawn directly.
    """
    from pyopencl.clrandom import rand

    dtype = np.dtype(dtype)
    is_complex = dtype.kind == "c"

    if not is_complex:
        return rand(queue, shape=(size,), dtype=dtype)

    real_dtype = TO_REAL[dtype]
    re = rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
    im = rand(queue, shape=(size,), dtype=real_dtype).astype(dtype)
    imaginary_unit = dtype.type(1j)
    return re + im * imaginary_unit
def test_basic_complex(ctx_factory):
    """Multiply a complex64 device array by a complex scalar and compare to NumPy."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand

    size = 500
    shape = (size,)
    real_part = rand(queue, shape=shape, dtype=np.float32).astype(np.complex64)
    imag_part = rand(queue, shape=shape, dtype=np.float32).astype(np.complex64)
    ary = real_part + imag_part * 1j

    c = np.complex64(5+7j)
    host_ary = ary.get()

    # Array-on-the-left product: exercises the array's own multiplication.
    device_product = (ary*c).get()
    error = la.norm(device_product - c*host_ary)
    assert error < 1e-5 * la.norm(host_ary)
def test_reduction_with_new_shape(ctx_factory, grid_shape, proc_shape, dtype, op,
                                  _grid_shape, timing=False):
    """Check that one ``ps.Reduction`` can be called on arrays of two shapes.

    The reducer is built once and compared against ``reducer.reduce_array``
    for the rank-local array. It is then called again on an array for a grid
    whose every dimension is halved.

    ``ctx_factory`` may be falsy, in which case
    ``ps.choose_device_and_make_context()`` supplies the context.
    ``_grid_shape`` overrides ``grid_shape`` when truthy. ``timing`` is not
    used by this test.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pystella import Field

    reducers = {}
    reducers["avg"] = [(Field("f"), op)]

    reducer = ps.Reduction(mpi, reducers)

    f = clr.rand(queue, rank_shape, dtype=dtype)

    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    # test call to reducer with new shape
    grid_shape = tuple(Ni // 2 for Ni in grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    f = clr.rand(queue, rank_shape, dtype=dtype)

    result = reducer(queue, f=f)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction w/new shape innaccurate for {grid_shape=}, {proc_shape=}"
def test_basic_complex(ctx_factory):
    """Check scalar-on-the-left complex products of a device array against NumPy."""
    queue = cl.CommandQueue(ctx_factory())

    from pyopencl.clrandom import rand

    size = 500

    def draw_complex64():
        # One float32 draw promoted to complex64.
        return rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64)

    real_part = draw_complex64()
    # Scalar on the left: exercises the array's reflected multiplication.
    ary = real_part + 1j * draw_complex64()

    c = np.complex64(5+7j)
    host_ary = ary.get()

    device_product = (c*ary).get()
    assert la.norm(device_product - c*host_ary) < 1e-5 * la.norm(host_ary)
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` device array along axis 1 and compare to NumPy.

    Known-bad platform/dtype combinations are declared as xfail or skip
    before any data is generated.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if on_pocl and wants_double and get_pocl_version(dev.platform) < (1, 0):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    to_sort = s.copy()

    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        to_sort.finish()

    sorter = BitonicSort(ctx)
    sorted_dev, _evt = sorter(to_sort, axis=1)

    expected = np.sort(s.get(), axis=1)
    assert np.array_equal(expected, sorted_dev.get())
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort a random 1-D device array with an index array and compare to NumPy.

    Both the sorted values and the values gathered through the permuted
    index array are checked. Known-bad interpreter/platform/dtype
    combinations are declared as xfail or skip up front.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        # The trailing space after the comma was missing, so the two literals
        # used to be joined as "Nvidia,at least".
        pytest.xfail("bitonic argsort fails on POCL + Nvidia, "
                     "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Bitonic-sort a random 1-D array together with an index array.

    The sorted values must equal ``np.sort`` of the input, and gathering the
    input through the permuted indices must give the same values as
    gathering through ``np.argsort``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)
    sorted_dev, _evt = sorter(m.copy(), idx=index, axis=0)

    host_m = m.get()
    assert np.array_equal(np.sort(host_m), sorted_dev.get())

    # The index arrays themselves may differ when the input holds identical
    # values, so np.argsort(m.get()) is not compared with index.get()
    # directly; the gathered values are compared instead.
    via_numpy = host_m[np.argsort(host_m)]
    via_device = host_m[index.get()]
    assert np.array_equal(via_numpy, via_device)
def test_bitonic_sort(ctx_factory, size, dtype):
    """Bitonic-sort a random ``(2, size, 3)`` array along axis 1; compare to NumPy."""
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    shape = (2, size, 3)
    s = clrandom.rand(queue, shape, dtype, luxury=None, a=0, b=239482333)

    sorter = BitonicSort(ctx)
    sorted_dev, _evt = sorter(s.copy(), axis=1)

    expected = np.sort(s.get(), axis=1)
    assert np.array_equal(expected, sorted_dev.get())
def __call__(self, *args, **kwargs):
    """
    Return random float32 values as a pyopencl.array.Array.

    No buffers are managed and no kernel code is compiled here, so
    __call__ is overridden to hand the request straight to the pyOpenCl
    ``rand`` function. The shape is read from the ``shape`` keyword
    argument; positional arguments are ignored.
    """
    queue = self.queue
    shape = kwargs['shape']
    return rand(queue, shape, numpy.float32)
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check bitonic sort with an accompanying index array against NumPy.

    Verifies the sorted values, and that the permuted index array selects
    the same values from the input as ``np.argsort`` does.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    dev = ctx.devices[0]

    is_apple_cpu = (dev.platform.name == "Apple"
                    and dev.type & cl.device_type.CPU)
    if is_apple_cpu:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    is_pocl_double = (dev.platform.name == "Portable Computing Language"
                      and dtype == np.float64)
    if is_pocl_double:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(ctx)
    ms, _evt = bitonic(m.copy(), idx=index, axis=0)

    unsorted_host = m.get()
    assert np.array_equal(np.sort(unsorted_host), ms.get())

    # Equality of index.get() with np.argsort(m.get()) may be False because
    # of identical values in the array, so check values by indices instead.
    expected_values = unsorted_host[np.argsort(unsorted_host)]
    assert np.array_equal(expected_values, unsorted_host[index.get()])
def rand(self, *args):
    """Return a ``myclArray`` of random values drawn with ``a=0.0, b=1.0``.

    The positional arguments form the shape; with no arguments the shape
    is ``1``. The element type is ``float_``.
    """
    shape = args if args else 1
    res = clrandom.rand(queue, shape, float_, a=0.0, b=1.0)

    # Re-tag the pyopencl array in place rather than constructing
    # myclArray(queue, res.shape, res.dtype, data=res.data).
    res.__class__ = myclArray
    res.reinit()
    return res
def test_reduction(ctx_factory, grid_shape, proc_shape, dtype, op,
                   _grid_shape, pass_grid_dims, timing=False):
    """Compare ``ps.Reduction`` of ``f / 2 + .31`` with ``reducer.reduce_array``.

    The reduced expression is routed through a temporary instruction ``x``.
    When ``pass_grid_dims`` is true the reducer is also given ``rank_shape``
    and ``grid_size`` explicitly. With ``timing`` enabled, rank 0 prints the
    elapsed time and a bandwidth estimate.

    ``ctx_factory`` may be falsy, in which case
    ``ps.choose_device_and_make_context()`` supplies the context, and
    ``_grid_shape`` overrides ``grid_shape`` when truthy.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    from pymbolic import var
    from pystella import Field

    tmp_insns = [(var("x"), Field("f") / 2 + .31)]

    reducers = {}
    reducers["avg"] = [(var("x"), op)]

    if pass_grid_dims:
        # np.prod replaces np.product, which was removed in NumPy 2.0.
        reducer = ps.Reduction(mpi, reducers, rank_shape=rank_shape,
                               tmp_instructions=tmp_insns,
                               grid_size=np.prod(grid_shape))
    else:
        reducer = ps.Reduction(mpi, reducers, tmp_instructions=tmp_insns)

    f = clr.rand(queue, rank_shape, dtype=dtype)

    import pyopencl.tools as clt
    pool = clt.MemoryPool(clt.ImmediateAllocator(queue))

    result = reducer(queue, f=f, allocator=pool)
    avg = result["avg"]

    avg_test = reducer.reduce_array(f / 2 + .31, op)
    if op == "avg":
        avg_test /= np.prod(grid_shape)

    rtol = 5e-14 if dtype == np.float64 else 1e-5
    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"{op} reduction innaccurate for {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        t = timer(lambda: reducer(queue, f=f, allocator=pool), ntime=1000)
        if mpi.rank == 0:
            print(
                f"reduction took {t:.3f} ms for {grid_shape=}, {proc_shape=}")
            bandwidth = f.nbytes / 1024**3 / t * 1000
            print(f"Bandwidth = {bandwidth:.1f} GB/s")
def test_field_statistics(ctx_factory, grid_shape, proc_shape, dtype, _grid_shape,
                          pass_grid_dims, timing=False):
    """Compare ``ps.FieldStatistics`` mean and variance with NumPy sums.

    A random array with outer shape ``(2, 1)`` and a halo of width 2 is
    generated. The reference values are computed on the host from the
    unpadded interior and combined across ranks with ``mpi.allreduce``.
    When ``pass_grid_dims`` is true the statistics object is also given
    ``rank_shape`` and ``grid_size`` explicitly.
    """
    ctx = ctx_factory()

    queue = cl.CommandQueue(ctx)
    h = 1
    grid_shape = _grid_shape or grid_shape
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    # np.prod replaces np.product, which was removed in NumPy 2.0.
    grid_size = np.prod(grid_shape)

    # From here on the array is padded with h = 2, although the
    # decomposition above was constructed with h = 1.
    h = 2
    f = clr.rand(queue, (2, 1) + tuple(ni + 2 * h for ni in rank_shape),
                 dtype=dtype)

    if pass_grid_dims:
        statistics = ps.FieldStatistics(mpi, h, rank_shape=rank_shape,
                                        grid_size=grid_size)
    else:
        statistics = ps.FieldStatistics(mpi, h)

    import pyopencl.tools as clt
    pool = clt.MemoryPool(clt.ImmediateAllocator(queue))

    stats = statistics(f, allocator=pool)
    avg = stats["mean"]
    var = stats["variance"]

    f_h = f.get()
    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h], axis=(-3, -2, -1))
    avg_test = mpi.allreduce(rank_sum) / grid_size

    rank_sum = np.sum(f_h[..., h:-h, h:-h, h:-h]**2, axis=(-3, -2, -1))
    var_test = mpi.allreduce(rank_sum) / grid_size - avg_test**2

    rtol = 5e-14 if dtype == np.float64 else 1e-5

    assert np.allclose(avg, avg_test, rtol=rtol, atol=0), \
        f"average innaccurate for {grid_shape=}, {proc_shape=}"

    assert np.allclose(var, var_test, rtol=rtol, atol=0), \
        f"variance innaccurate for {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        t = timer(lambda: statistics(f, allocator=pool))
        if mpi.rank == 0:
            print(
                f"field stats took {t:.3f} ms "
                f"for outer shape {f.shape[:-3]}, {grid_shape=}, {proc_shape=}"
            )
def _evaluate(self, valuation, cache):
    """Evaluate the first operand and apply a random mask, caching the result.

    When ``self.test`` is truthy the operand's value is returned unchanged.
    Otherwise a uniform random array of the operand's shape and dtype is
    drawn, ``self.mask`` is set to ``random >= self.ratio``, and the masked
    operand is stored in *cache* under ``id(self)``. Later calls with the
    same cache return the stored value.
    """
    if self.test:
        return self.ops[0]._evaluate(valuation, cache)

    key = id(self)
    if key in cache:
        return cache[key]

    q = pl.qs[0]
    operand = self.ops[0]._evaluate(valuation, cache)
    noise = clrandom.rand(q, operand.shape, operand.dtype)
    self.mask = noise >= self.ratio
    cache[key] = operand * self.mask
    return cache[key]
def rand(shape: Union[tuple[int, ...], int] = (1, 1), gpu=False) -> Tensor:
    """Return a float32 tensor of random values with the given shape.

    With ``gpu`` set, the values are drawn with ``clrandom.rand`` on
    ``QUEUE`` and wrapped in a GPU tensor; otherwise NumPy draws them on
    the host.
    """
    if gpu:
        device_values = clrandom.rand(QUEUE, shape, np.float32)
        return Tensor(device_values, gpu=True)

    # np.random.rand takes dimensions as separate positional arguments.
    dims = shape if isinstance(shape, tuple) else (shape,)
    host_values = np.random.rand(*dims).astype(np.float32)
    return Tensor(host_values)
def randint(self, low, high=None, size=1):
    """Return a ``myclArray`` of random ``int32`` values.

    With only *low* given, the bounds passed to ``clrandom.rand`` are
    ``a=0, b=low``. With *high* given they are ``a=low, b=high``.

    :param low: lower bound, or the upper bound when *high* is omitted.
    :param high: upper bound; ``None`` means "not given".
    :param size: shape passed on to ``clrandom.rand``.
    """
    # Compare against None explicitly: an explicit high=0 (for example
    # randint(-5, 0)) is a valid bound and must not be treated as omitted.
    if high is not None:
        a, b = low, high
    else:
        a, b = 0, low

    res = clrandom.rand(queue, size, np.int32, a=a, b=b)
    # Re-tag the pyopencl array in place as the wrapper type.
    res.__class__ = myclArray
    res.reinit()
    return res
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` device array along axis 1; compare to NumPy."""
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    platform_name = ctx.devices[0].platform.name
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)

    sorter = BitonicSort(ctx)
    sorted_dev, _evt = sorter(s.copy(), axis=1)

    reference = np.sort(s.get(), axis=1)
    assert np.array_equal(reference, sorted_dev.get())
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check bitonic sort with an index array against ``np.sort`` / ``np.argsort``.

    Interpreter, platform and dtype combinations known not to work are
    declared as xfail or skip before any data is generated.
    """
    import sys

    running_on_pypy = '__pypy__' in sys.builtin_module_names
    if running_on_pypy and not size:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if on_pocl and wants_double and get_pocl_version(dev.platform) < (1, 0):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")
    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)
    sorted_dev, _evt = sorter(m.copy(), idx=index, axis=0)

    host_m = m.get()
    assert np.array_equal(np.sort(host_m), sorted_dev.get())

    # index.get() may differ from np.argsort(m.get()) because of identical
    # values in the array, so the gathered values are compared instead.
    assert np.array_equal(host_m[np.argsort(host_m)], host_m[index.get()])
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random values with an attached index array and validate both outputs.

    The sorted values are compared with ``np.sort``; the index array is
    validated by gathering the input through it and comparing with the
    gather through ``np.argsort``.
    """
    import sys

    is_pypy = '__pypy__' in sys.builtin_module_names
    empty_input = not size
    if empty_input and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    dev = ctx.devices[0]

    pocl_name = "Portable Computing Language"
    if dev.platform.name == pocl_name and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")

    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    if (dev.platform.name == pocl_name
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(ctx)
    ms, _evt = bitonic(m.copy(), idx=index, axis=0)

    values = m.get()
    assert np.array_equal(np.sort(values), ms.get())

    # A direct comparison of np.argsort(m.get()) with index.get() may be
    # False because of identical values in the array; check values by
    # indices instead.
    gathered_reference = values[np.argsort(values)]
    gathered_device = values[index.get()]
    assert np.array_equal(gathered_reference, gathered_device)
def test_bitonic_sort(ctx_factory, size, dtype):
    """Bitonic-sort a random ``(2, size, 3)`` array along axis 1 and verify it.

    Platforms and dtypes known not to work are declared as xfail or skip
    first.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    dev = ctx.devices[0]

    is_double = dtype == np.float64
    platform_name = dev.platform.name

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    if (platform_name == "Portable Computing Language"
            and is_double
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if is_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)

    bitonic = BitonicSort(ctx)
    sgs, _evt = bitonic(s.copy(), axis=1)

    host_sorted = np.sort(s.get(), axis=1)
    assert np.array_equal(host_sorted, sgs.get())
import numpy
from time import time
import os
#os.environ['PYOPENCL_COMPILER_OUTPUT']="1"

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Reduction kernel computing sum(x[i] * y[i]) in float32.
dot = ReductionKernel(ctx, dtype_out=numpy.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="__global const float *x, __global const float *y")

import pyopencl.clrandom as cl_rand
x = cl_rand.rand(queue, (1000*1000), dtype=numpy.float32)
y = cl_rand.rand(queue, (1000*1000), dtype=numpy.float32)

# Time the device reduction; .get() copies the result back to the host.
t1 = time()
x_dot_y = dot(x, y).get()
gpu_time = (time()-t1)

# Time the same dot product with NumPy (includes copying x and y to the host).
t1 = time()
x_dot_y_cpu = numpy.dot(x.get(), y.get())
cpu_time = time()-t1

# A single %-formatted argument prints the same text on Python 2 and 3;
# the former `print "...", value` statement is a syntax error on Python 3.
print("CPU time (s) %s" % cpu_time)
def test_stencil(ctx_factory, grid_shape, proc_shape, dtype, stream, h=1,
                 timing=False):
    """Check a six-neighbour sum stencil against a NumPy slicing reference.

    ``stream`` selects ``ps.StreamingStencil`` (the test is skipped if its
    construction raises) instead of ``ps.Stencil``. With ``timing`` enabled
    the elapsed time and a bandwidth estimate are printed.
    """
    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()
    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    x_sym = var("x")
    y_sym = var("y")
    i, j, k = var("i"), var("j"), var("k")

    # Sum of the neighbours at distance h along each axis; the extra "+ h"
    # in every index is the offset into the halo-padded input.
    neighbour_sum = (x_sym[i + h + h, j + h, k + h]
                     + x_sym[i + h, j + h + h, k + h]
                     + x_sym[i + h, j + h, k + h + h]
                     + x_sym[i - h + h, j + h, k + h]
                     + x_sym[i + h, j - h + h, k + h]
                     + x_sym[i + h, j + h, k - h + h])
    map_dict = {y_sym[i, j, k]: neighbour_sum}

    if not stream:
        stencil_map = ps.Stencil(map_dict, h, prefetch_args=["x"])
    else:
        try:
            stencil_map = ps.StreamingStencil(map_dict, prefetch_args=["x"],
                                              halo_shape=h)
        except:  # noqa
            pytest.skip("StreamingStencil unavailable")

    padded_shape = tuple(ni + 2 * h for ni in rank_shape)
    x = clr.rand(queue, padded_shape, dtype)
    y = clr.rand(queue, rank_shape, dtype)

    x_h = x.get()
    y_true = (x_h[2 * h:, h:-h, h:-h]
              + x_h[h:-h, 2 * h:, h:-h]
              + x_h[h:-h, h:-h, 2 * h:]
              + x_h[:-2 * h, h:-h, h:-h]
              + x_h[h:-h, :-2 * h, h:-h]
              + x_h[h:-h, h:-h, :-2 * h])

    stencil_map(queue, x=x, y=y)

    tolerance = 5e-14 if dtype == np.float64 else 1e-5
    max_rtol = tolerance
    avg_rtol = tolerance

    max_err, avg_err = get_errs(y_true, y.get())
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"y innaccurate for {grid_shape=}, {h=}, {proc_shape=}" \
        f": {max_err=}, {avg_err=}"

    if not timing:
        return

    from common import timer
    t = timer(lambda: stencil_map(queue, x=x, y=y)[0])
    print(
        f"stencil took {t:.3f} ms for {grid_shape=}, {h=}, {proc_shape=}")
    bandwidth = (x.nbytes + y.nbytes) / 1024**3 / t * 1000
    print(f"Bandwidth = {bandwidth} GB/s")
def test_elementwise(ctx_factory, grid_shape, proc_shape, dtype, timing=False):
    """Check ``ps.ElementWiseMap`` against the same arithmetic on device arrays.

    A map with temporary instructions updates ``x`` and ``z``; a second,
    single-instruction map then sets ``x = y + z``. With ``timing`` enabled
    the elapsed time and a bandwidth estimate are printed.
    """
    ctx = ctx_factory() if ctx_factory else ps.choose_device_and_make_context()
    queue = cl.CommandQueue(ctx)
    rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))

    from pymbolic import var
    from pystella.field import Field

    a_sym = var("a")
    b_sym = var("b")
    x_fld = Field("x")
    y_fld = Field("y")
    z_fld = Field("z")

    tmp_dict = {a_sym[0]: x_fld + 2,
                a_sym[1]: 2 + x_fld * y_fld,
                b_sym: x_fld + y_fld / 2}
    map_dict = {x_fld: a_sym[0] * y_fld**2 * x_fld + a_sym[1] * b_sym,
                z_fld: z_fld + a_sym[1] * b_sym}
    single_insn = {x_fld: y_fld + z_fld}

    ew_map = ps.ElementWiseMap(map_dict, tmp_instructions=tmp_dict)

    x = clr.rand(queue, rank_shape, dtype=dtype)
    y = clr.rand(queue, rank_shape, dtype=dtype)
    z = clr.rand(queue, rank_shape, dtype=dtype)

    # Reference results, computed before the map overwrites x and z.
    a0 = x + 2
    a1 = 2 + x * y
    b = x + y / 2
    x_true = a0 * y**2 * x + a1 * b
    z_true = z + a1 * b

    ew_map(queue, x=x, y=y, z=z)

    tolerance = 5e-14 if dtype == np.float64 else 1e-5
    max_rtol = tolerance
    avg_rtol = tolerance

    max_err, avg_err = get_errs(x_true.get(), x.get())
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"x innaccurate for {grid_shape=}, {proc_shape=}: {max_err=}, {avg_err=}"

    max_err, avg_err = get_errs(z_true.get(), z.get())
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"z innaccurate for {grid_shape=}, {proc_shape=}: {max_err=}, {avg_err=}"

    # test success of single instruction
    ew_map_single = ps.ElementWiseMap(single_insn)
    ew_map_single(queue, x=x, y=y, z=z)

    x_true = y + z
    max_err, avg_err = get_errs(x_true.get(), x.get())
    assert max_err < max_rtol and avg_err < avg_rtol, \
        f"x innaccurate for {grid_shape=}, {proc_shape=}: {max_err=}, {avg_err=}"

    if not timing:
        return

    from common import timer
    t = timer(lambda: ew_map(queue, x=x, y=y, z=z)[0])
    print(
        f"elementwise map took {t:.3f} ms for {grid_shape=}, {proc_shape=}"
    )
    bandwidth = 5 * x.nbytes / 1024**3 / t * 1000
    print(f"Bandwidth = {bandwidth:.1f} GB/s")
import pyopencl as cl
from pyopencl import array
from pyopencl import clrandom
import numpy as np

# Create a context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Draw a 4000 x 5000 int32 array on the device with bounds a=0, b=9.
s = clrandom.rand(
    queue,
    shape=(4000, 5000),
    dtype=np.int32,
    luxury=None,
    a=0,
    b=9,
)
def random(self, size=None):
    """Return a ``myclArray`` of random ``float_`` values with ``a=0.0, b=1.0``.

    A falsy *size* (the default ``None`` included) is replaced by ``1``.
    """
    _size = size or 1
    res = clrandom.rand(queue, _size, float_, a=0.0, b=1.0)

    # Re-tag the pyopencl array in place rather than constructing
    # myclArray(queue, res.shape, res.dtype, data=res.data).
    res.__class__ = myclArray
    res.reinit()
    return res
from pyopencl.scan import GenericScanKernel

# Running sum, in the sense of:
# np.cumsum([1, 2, 3])
# np.array([1, 3, 6])

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
print("queue: ", queue)
print()

# Scan kernel over doubles: y[i] receives the scanned value of x[0..i].
sknl = GenericScanKernel(
    ctx,
    np.float64,
    arguments="double *y, double *x",
    input_expr="x[i]",
    scan_expr="a+b",
    neutral="0",
    output_statement="y[i] = item;",
)

n = 10**7
x = clrand.rand(queue, shape=n, dtype=np.float64)
print("x:", x)
print()

# Output buffer with the same shape and dtype as x
# (alternative: cl.array.arange(queue, n, dtype=np.float64)).
result = cl.array.empty_like(x)
sknl(result, x, queue=queue)
print("result", result)
print()

result_np = result.get()
print("result_np", result.get())
import pyopencl as cl
from pyopencl import array
from pyopencl import clrandom
import numpy as np

# Create a context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Draw a small 4 x 5 int32 array on the device with bounds a=0, b=9.
s = clrandom.rand(
    queue,
    shape=(4, 5),
    dtype=np.int32,
    luxury=None,
    a=0,
    b=9,
)
def test_spectral_poisson(ctx_factory, grid_shape, proc_shape, h, dtype,
                          timing=False):
    """Check ``ps.SpectralPoissonSolver`` by re-applying the Laplacian.

    For several values of ``m_squared`` the solver output ``fx`` is
    differentiated, and ``lap - rho - m_squared * fx`` is compared against
    zero relative to ``|rho|``. With ``h == 0`` derivatives come from
    ``ps.SpectralCollocator`` and the eigenvalues are ``-k**2``; otherwise
    a finite-difference operator of halo width ``h`` supplies both.

    ``ctx_factory`` may be falsy, in which case
    ``ps.choose_device_and_make_context()`` supplies the context.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)
    fft = ps.DFT(mpi, ctx, queue, grid_shape, dtype)

    L = (3, 5, 7)
    dx = tuple(Li / Ni for Li, Ni in zip(L, grid_shape))
    dk = tuple(2 * np.pi / Li for Li in L)

    if h == 0:
        def get_evals_2(k, dx):
            return - k**2

        derivs = ps.SpectralCollocator(fft, dk)
    else:
        from pystella.derivs import SecondCenteredDifference
        get_evals_2 = SecondCenteredDifference(h).get_eigenvalues
        derivs = ps.FiniteDifferencer(mpi, h, dx, stream=False)

    solver = ps.SpectralPoissonSolver(fft, dk, dx, get_evals_2)

    pencil_shape = tuple(ni + 2*h for ni in rank_shape)

    # np.prod replaces np.product, which was removed in NumPy 2.0.
    statistics = ps.FieldStatistics(mpi, 0, rank_shape=rank_shape,
                                    grid_size=np.prod(grid_shape))

    fx = cla.empty(queue, pencil_shape, dtype)
    rho = clr.rand(queue, rank_shape, dtype)
    # Subtract the mean from the source before solving.
    rho -= statistics(rho)["mean"]
    lap = cla.empty(queue, rank_shape, dtype)
    rho_h = rho.get()

    for m_squared in (0, 1.2, 19.2):
        solver(queue, fx, rho, m_squared=m_squared)
        fx_h = fx.get()
        if h > 0:
            # Drop the halo so fx_h matches the unpadded lap and rho.
            fx_h = fx_h[h:-h, h:-h, h:-h]

        derivs(queue, fx=fx, lap=lap)

        diff = np.fabs(lap.get() - rho_h - m_squared * fx_h)
        max_err = np.max(diff) / cla.max(clm.fabs(rho))
        avg_err = np.sum(diff) / cla.sum(clm.fabs(rho))

        max_rtol = 1e-12 if dtype == np.float64 else 1e-4
        avg_rtol = 1e-13 if dtype == np.float64 else 1e-5

        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"solution inaccurate for {h=}, {grid_shape=}, {proc_shape=}"

    if timing:
        from common import timer
        # Times the solver with the last m_squared from the loop above.
        time = timer(lambda: solver(queue, fx, rho, m_squared=m_squared),
                     ntime=10)

        if mpi.rank == 0:
            print(f"poisson took {time:.3f} ms for {grid_shape=}, {proc_shape=}")
from collections import defaultdict

# Maps algorithm name -> list of timings in seconds, one entry per size in xs.
times = defaultdict(list)

dtype = np.int32
xs = range(5, 32)
bs = BitonicSort(ctx)
rs = RadixSort(ctx, "int *ary", key_expr="ary[i]", sort_arg_names=["ary"],
               scan_kernel=GenericScanKernel)

for size in xs:
    print("running size=2^{} = {}".format(size, 2**size))
    s = clrandom.rand(queue, (2**size, ), dtype, a=0, b=2**16)
    # NOTE(review): if these helpers return datetime.timedelta objects,
    # `.microseconds` is only the sub-second component and runs longer than
    # one second would be under-reported; `.total_seconds()` may be what is
    # intended -- confirm against the helper definitions.
    times['bitonic'].append(test_bitonic_speed(s, bs).microseconds / 1000000.)
    times['radix'].append(test_radix_speed(s, rs).microseconds / 1000000.)
    times['numpy'].append(
        test_numpy_speed(s.get().copy()).microseconds / 1000000.)

# list(...) is required here: on Python 3, keys() returns a view, and
# `list + view` raises TypeError.
print("\t".join(["Size"] + list(times.keys())))
for idx, s in enumerate(xs):
    print("\t".join(["2^" + str(s)] +
                    [str(times[k][idx]) for k in times.keys()]))

font = {'size': 30}
import matplotlib
import matplotlib.pyplot as plt
from pluck import pluck
def test_share_halos(ctx_factory, grid_shape, proc_shape, h, dtype,
                     _grid_shape, pass_grid_shape, timing=False):
    """Check ``mpi.share_halos`` against a periodically wrapped global array.

    Every rank generates the same seeded global array, wraps its halos
    periodically on the host, copies its own interior to the device, and
    then requires the device array after ``share_halos`` to equal the
    matching padded slice of the global array. An integer ``h`` is expanded
    to one halo width per axis. When ``pass_grid_shape`` is false, a call
    with a differently shaped array is exercised as well.
    """
    ctx = ctx_factory()

    if isinstance(h, int):
        h = (h,) * 3

    queue = cl.CommandQueue(ctx)
    grid_shape = _grid_shape or grid_shape
    decomp_grid_shape = grid_shape if pass_grid_shape else None
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=decomp_grid_shape)
    rank_shape, substart = mpi.get_rank_shape_start(grid_shape)

    # data will be same on each rank
    rng = clr.ThreefryGenerator(ctx, seed=12321)
    padded_grid_shape = tuple(Ni + 2 * hi for Ni, hi in zip(grid_shape, h))
    data = rng.uniform(queue, padded_grid_shape, dtype).get()

    def along(axis, slc):
        # Index tuple selecting `slc` on `axis` and everything elsewhere.
        index = [slice(None)] * 3
        index[axis] = slc
        return tuple(index)

    # Periodic wrap, one axis at a time in axis order.
    for axis in range(3):
        hi = h[axis]
        if hi > 0:
            data[along(axis, slice(None, hi))] = \
                data[along(axis, slice(-2 * hi, -hi))]
            data[along(axis, slice(-hi, None))] = \
                data[along(axis, slice(hi, 2 * hi))]

    padded_rank_shape = tuple(ni + 2 * hi for ni, hi in zip(rank_shape, h))
    subdata = np.empty(padded_rank_shape, dtype)

    rank_slice = tuple(
        slice(si + hi, si + ni + hi)
        for ni, si, hi in zip(rank_shape, substart, h))
    unpadded_slc = tuple(
        slice(hi, -hi) if hi > 0 else slice(None) for hi in h)
    subdata[unpadded_slc] = data[rank_slice]

    subdata_device = cla.to_device(queue, subdata)
    mpi.share_halos(queue, subdata_device)
    subdata2 = subdata_device.get()

    pencil_slice = tuple(
        slice(si, si + ri + 2 * hi)
        for ri, si, hi in zip(rank_shape, substart, h))
    halos_match = (subdata2 == data[pencil_slice]).all()
    assert halos_match, \
        f"rank {mpi.rank} {mpi.rank_tuple} has incorrect halo data"

    # test that can call with different-shaped input
    if not pass_grid_shape:
        new_shape = tuple(ni // 2 + 2 * hi for ni, hi in zip(rank_shape, h))
        subdata_device_new = clr.rand(queue, new_shape, dtype)
        mpi.share_halos(queue, subdata_device_new)

    if timing:
        from common import timer
        t = timer(lambda: mpi.share_halos(queue, fx=subdata_device))
        if mpi.rank == 0:
            print(f"share_halos took {t:.3f} ms for "
                  f"{grid_shape=}, {h=}, {proc_shape=}")
# simulation parameters
grid_shape = (128, 128, 128)
proc_shape = (1, 1, 1)
rank_shape = tuple(Ni // pi for Ni, pi in zip(grid_shape, proc_shape))
halo_shape = 1
dtype = "float64"
dx = tuple(10 / Ni for Ni in grid_shape)
dt = min(dx) / 10

# OpenCL context and queue, plus the domain decomposition used to share halos
ctx = ps.choose_device_and_make_context()
queue = cl.CommandQueue(ctx)
decomp = ps.DomainDecomposition(proc_shape, halo_shape, rank_shape)

# random initial data; f and dfdt are padded by the halo on every side,
# lap_f is unpadded and starts at zero
f = clr.rand(
    queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
dfdt = clr.rand(
    queue, tuple(ni + 2 * halo_shape for ni in rank_shape), dtype)
lap_f = cla.zeros(queue, rank_shape, dtype)

# symbolic system of equations
f_ = ps.DynamicField("f", offset="h")  # don't overwrite f
rhs_dict = {}
rhs_dict[f_] = f_.dot  # df/dt = \dot{f}
rhs_dict[f_.dot] = f_.lap  # d\dot{f}/dt = \nabla^2 f

# kernels for time stepping and for computing derivatives
stepper = ps.LowStorageRK54(rhs_dict, dt=dt, halo_shape=halo_shape)
derivs = ps.FiniteDifferencer(decomp, halo_shape, dx)

t = 0.
def test_scalar_energy(ctx_factory, grid_shape, proc_shape, h, dtype,
                       timing=False):
    """Compare the ``ps.ScalarSector`` energy reduction with NumPy sums.

    Two scalar fields with potential
    ``phi**2 / 2 + chi**2 / 2 + phi**2 * chi**2 / 2`` are filled with random
    data. Kinetic, gradient and potential energies returned by
    ``ps.Reduction`` are checked against host sums over the unpadded
    interior, combined across ranks with ``mpi.allreduce``.

    ``ctx_factory`` may be falsy, in which case
    ``ps.choose_device_and_make_context()`` supplies the context.
    """
    if ctx_factory:
        ctx = ctx_factory()
    else:
        ctx = ps.choose_device_and_make_context()

    queue = cl.CommandQueue(ctx)
    mpi = ps.DomainDecomposition(proc_shape, h, grid_shape=grid_shape)
    rank_shape, _ = mpi.get_rank_shape_start(grid_shape)

    # np.prod replaces np.product, which was removed in NumPy 2.0.
    grid_size = np.prod(grid_shape)

    nscalars = 2

    def potential(f):
        phi, chi = f[0], f[1]
        return 1 / 2 * phi**2 + 1 / 2 * chi**2 + 1 / 2 * phi**2 * chi**2

    scalar_sector = ps.ScalarSector(nscalars, potential=potential)
    scalar_energy = ps.Reduction(mpi, scalar_sector, rank_shape=rank_shape,
                                 grid_size=grid_size, halo_shape=h)

    pencil_shape = tuple(ni + 2 * h for ni in rank_shape)
    f = clr.rand(queue, (nscalars, ) + pencil_shape, dtype)
    dfdt = clr.rand(queue, (nscalars, ) + pencil_shape, dtype)
    lap = clr.rand(queue, (nscalars, ) + rank_shape, dtype)

    energy = scalar_energy(queue, f=f, dfdt=dfdt, lap_f=lap, a=np.array(1.))

    kin_test = []
    grad_test = []
    for fld in range(nscalars):
        df_h = dfdt[fld].get()
        rank_sum = np.sum(df_h[h:-h, h:-h, h:-h]**2)
        kin_test.append(1 / 2 * mpi.allreduce(rank_sum) / grid_size)

        f_h = f[fld].get()
        lap_h = lap[fld].get()

        # f is halo-padded while lap is not, so only f is sliced.
        rank_sum = np.sum(-f_h[h:-h, h:-h, h:-h] * lap_h)
        grad_test.append(1 / 2 * mpi.allreduce(rank_sum) / grid_size)

    energy_test = {}
    energy_test["kinetic"] = np.array(kin_test)
    energy_test["gradient"] = np.array(grad_test)

    phi = f[0].get()[h:-h, h:-h, h:-h]
    chi = f[1].get()[h:-h, h:-h, h:-h]
    pot_rank = np.sum(potential([phi, chi]))
    energy_test["potential"] = np.array(mpi.allreduce(pot_rank) / grid_size)

    max_rtol = 1e-14 if dtype == np.float64 else 1e-5
    avg_rtol = 1e-14 if dtype == np.float64 else 1e-5

    for key, value in energy.items():
        max_err, avg_err = get_errs(value, energy_test[key])
        assert max_err < max_rtol and avg_err < avg_rtol, \
            f"{key} inaccurate for {nscalars=}, {grid_shape=}, {proc_shape=}" \
            f": {max_err=}, {avg_err=}"

    if timing:
        from common import timer
        t = timer(lambda: scalar_energy(
            queue, a=np.array(1.), f=f, dfdt=dfdt, lap_f=lap))
        if mpi.rank == 0:
            print(f"scalar energy took {t:.3f} "
                  f"ms for {nscalars=}, {grid_shape=}, {proc_shape=}")
import pyopencl as cl
import pyopencl.clrandom as clrand
from pyopencl.elementwise import ElementwiseKernel
import numpy as np

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Two random float32 vectors of one million elements each.
n = 10 ** 6
a = clrand.rand(queue, shape=n, dtype=np.float32)
b = clrand.rand(queue, shape=n, dtype=np.float32)

# Linear combination using array arithmetic.
c1 = 5 * a + 6 * b
result_np = c1.get()

# The same linear combination as a custom elementwise kernel.
lin_comb = ElementwiseKernel(
    ctx,
    "float a, float *x, float b, float *y, float *c",
    "c[i] = a * x[i] + b * y[i]",
)
c2 = cl.array.empty_like(a)
lin_comb(5, a, 6, b, c2)
result_np = c2.get()
round(0.1 * 100 / (nz**(3 - ndim) * n**ndim * np.dtype(dtype).itemsize * ndim * 2 * 2 / 1024**3))) nb = max(nb, 1) nb = min(nb, 1000) # print("%4d (nb=%4d)"%(n, nb)) if ndim == 1: sh = nz, nz, n elif ndim == 2: sh = nz, n, n else: sh = n, n, n # OpenCL backends if has_pyvkfft_opencl or has_gpyfft: d = clrandom.rand(cq, shape=sh, dtype=np.float32).astype(dtype) if has_pyvkfft_opencl: dt = 0 try: app = clVkFFTApp(d.shape, d.dtype, queue=cq, ndim=ndim) for i in range(nb_repeat): cq.finish() t0 = timeit.default_timer() for i in range(nb): d = app.ifft(d) d = app.fft(d) cq.finish() dt1 = timeit.default_timer() - t0 if dt == 0: dt = dt1