def test_divide_array(ctx_factory):
    """Check array-by-array division on the device against NumPy.

    Every ordered pair of the supported float/complex dtypes is tried, and
    the quotient is formed in both operand orders.  The device result must
    match the host result to within 1e-3 element-wise and carry the same
    dtype object.
    """
    from itertools import product

    from pyopencl.characterize import has_double_support

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.float32, np.complex64)
    if has_double_support(queue.device):
        dtypes += (np.float64, np.complex128)

    for dtype_a, dtype_b in product(dtypes, repeat=2):
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)

        # First a / b, then b / a; each quotient is checked before the next
        # one is computed.
        operand_orders = (
                ((a, b), (a_gpu, b_gpu)),
                ((b, a), (b_gpu, a_gpu)),
                )
        for (num, den), (num_gpu, den_gpu) in operand_orders:
            expected = num / den
            actual = num_gpu / den_gpu
            assert (np.abs(actual.get() - expected) < 1e-3).all()
            assert actual.dtype is expected.dtype
def test_dot(ctx_factory):
    """Compare ``cl_array.dot``/``cl_array.vdot`` with NumPy for mixed dtypes.

    Skipped when ``mako`` is not importable.  For every ordered pair of
    supported dtypes, two random device vectors of length 200000 are drawn
    and the relative error of each product must be below 1e-4.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes += [np.float64, np.complex128]

    shape = (200000,)

    def rel_err(actual, expected):
        return abs(actual - expected) / abs(expected)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, shape, a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, shape, b_dtype)
            b = b_gpu.get()

            dot_ref = np.dot(a, b)
            dot_dev = cl_array.dot(a_gpu, b_gpu).get()
            assert rel_err(dot_dev, dot_ref) < 1e-4

            vdot_ref = np.vdot(a, b)
            vdot_dev = cl_array.vdot(a_gpu, b_gpu).get()
            assert rel_err(vdot_dev, vdot_ref) < 1e-4
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Build the :class:`ReductionKernel` that computes a dot product.

    The per-element map expression, the output dtype and the dtype of *b*
    come from ``_get_dot_expr``.  For a complex output dtype the reduction
    and neutral element use the ``<name>_add`` / ``<name>_new`` helpers named
    by ``complex_dtype_to_name``; otherwise plain ``a+b`` and ``0`` are used.

    :arg ctx: context whose first device decides double-precision support.
    :returns: a :class:`ReductionKernel` taking two ``const`` pointer
        arguments ``a`` and ``b``.
    """
    from pyopencl.characterize import has_double_support

    double_ok = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=double_ok)

    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name

        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = "%s_add(a, b)" % dtname
        neutral_expr = "%s_new(0, 0)" % dtname
    else:
        reduce_expr = "a+b"
        neutral_expr = "0"

    type_names = {
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            }
    arguments = (
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % type_names)

    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
            reduce_expr=reduce_expr, map_expr=map_expr,
            arguments=arguments)
def test_random(ctx_factory):
    """Exercise :class:`RanluxGenerator` range guarantees.

    For several array sizes and each supported float dtype, uniform fills
    must lie strictly inside (0, 1) and, with ``a=4, b=7``, strictly inside
    (4, 7); a normal draw is requested as well.  Finally a large int32
    uniform draw with ``a=200, b=300`` must lie in [200, 300).
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    float_dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        float_dtypes.append(np.float64)

    gen = RanluxGenerator(queue, 5120)

    for ary_size in [300, 301, 302, 303, 10007]:
        for dtype in float_dtypes:
            # default range
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran)
            host = ran.get()
            assert (0 < host).all()
            assert (host < 1).all()

            gen.synchronize(queue)

            # explicit range
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran, a=4, b=7)
            host = ran.get()
            assert (4 < host).all()
            assert (host < 7).all()

            ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    for dtype in [np.int32]:
        ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
        host = ran.get()
        assert (200 <= host).all()
        assert (host < 300).all()
def test_astype(ctx_factory):
    """Check that device-side ``astype`` agrees with host-side ``astype``.

    Skipped without double-precision support.  float32 -> float64 must be
    exact; float64 -> float32 must agree to a relative norm below 1e-7.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    device = context.devices[0]
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)

    # widening conversion
    single_dev = clrand(queue, (2000,), dtype=np.float32)
    host_cast = single_dev.get().astype(np.float64)
    dev_cast = single_dev.astype(np.float64).get()

    assert dev_cast.dtype == np.float64
    assert la.norm(host_cast - dev_cast) == 0, (host_cast, dev_cast)

    # narrowing conversion
    double_dev = clrand(queue, (2000,), dtype=np.float64)
    host_cast = double_dev.get().astype(np.float32)
    dev_cast = double_dev.astype(np.float32).get()

    assert dev_cast.dtype == np.float32
    assert la.norm(host_cast - dev_cast) / la.norm(host_cast) < 1e-7
def test_dot(ctx_factory):
    """Validate device ``dot`` and ``vdot`` against their NumPy counterparts.

    Requires ``mako`` (skipped otherwise).  All ordered dtype pairs drawn
    from the supported float/complex types are tested on random vectors of
    200000 entries, with a relative tolerance of 1e-4.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    # Same visiting order as two nested loops: a_dtype outer, b_dtype inner.
    dtype_pairs = [
            (a_dtype, b_dtype)
            for a_dtype in dtypes
            for b_dtype in dtypes]

    for a_dtype, b_dtype in dtype_pairs:
        print(a_dtype, b_dtype)

        a_gpu = general_clrand(queue, (200000,), a_dtype)
        a = a_gpu.get()
        b_gpu = general_clrand(queue, (200000,), b_dtype)
        b = b_gpu.get()

        expected = np.dot(a, b)
        actual = cl_array.dot(a_gpu, b_gpu).get()
        assert abs(actual - expected) / abs(expected) < 1e-4

        expected = np.vdot(a, b)
        actual = cl_array.vdot(a_gpu, b_gpu).get()
        assert abs(actual - expected) / abs(expected) < 1e-4
def _get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="", arg_prep="",
        device=None, max_group_size=None):
    """Render the reduction kernel source and pick its work-group size.

    :arg device: if given, only this device is considered; otherwise all
        devices of *ctx* are.
    :arg max_group_size: optional upper bound on the chosen group size.
    :returns: a record with attributes ``context``, ``source`` and
        ``group_size``.

    The group size is the smallest per-device candidate, where each
    candidate is derived from the device's ``max_work_group_size`` and
    *out_type_size* and then rounded down to a power of two.
    """
    devices = ctx.devices if device is None else [device]

    # {{{ compute group size

    def get_dev_group_size(dev):
        from pytools import div_ceil
        from pyopencl.tools import bitlog2

        # dirty fix for the RV770 boards
        if "RV770" in dev.name:
            wg_limit = 64
        else:
            wg_limit = dev.max_work_group_size

        # compute lmem limit
        lmem_limit = div_ceil(wg_limit, out_type_size)
        candidate = min(wg_limit, lmem_limit)

        # round down to power of 2
        return 2**bitlog2(candidate)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support

    rendered = Template(KERNEL).render(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=all(has_double_support(dev) for dev in devices),
        )
    src = str(rendered)

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
def get_write_kernel(self, index_dtype):
    """Build and return the "write" stage kernel of the list builder.

    For every output list a vector argument ``plb_<name>_list`` is added.
    Lists that do not share a count additionally get a
    ``plb_<name>_start_index`` kernel argument and a ``plb_<name>_index``
    pointer on the user-function side.  The template is rendered with
    ``is_count_stage=False``, compiled with ``self.options``, and the
    kernel's scalar argument dtypes are set before it is returned.

    :arg index_dtype: dtype used for the start-index arrays and for the
        trailing scalar kernel argument.
    """
    from pyopencl.tools import VectorArg, OtherArg

    index_ctype = dtype_to_ctype(index_dtype)

    kernel_list_args = []
    user_list_args = []
    # Pieces of the argument list text; each ends in ", " so they can simply
    # be concatenated.
    arg_value_parts = []

    for name, dtype in self.list_names_and_dtypes:
        list_name = "plb_%s_list" % name
        list_arg = VectorArg(dtype, list_name)

        kernel_list_args.append(list_arg)
        user_list_args.append(list_arg)

        if name in self.count_sharing:
            arg_value_parts.append("%s, " % list_name)
            continue

        start_index_arg = VectorArg(
                index_dtype, "plb_%s_start_index" % name)
        kernel_list_args.append(start_index_arg)

        index_name = "plb_%s_index" % name
        index_decl = "%s *%s" % (index_ctype, index_name)
        user_list_args.append(OtherArg(index_decl, index_name))

        arg_value_parts.append("%s, &%s, " % (list_name, index_name))

    kernel_list_arg_values = "".join(arg_value_parts)
    kernel_name = self.name_prefix + "_write"

    from pyopencl.characterize import has_double_support

    rendered = _LIST_BUILDER_TEMPLATE.render(
        is_count_stage=False,
        kernel_name=kernel_name,
        double_support=all(has_double_support(dev)
            for dev in self.context.devices),
        debug=self.debug,
        do_not_vectorize=self.do_not_vectorize(),
        kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
        kernel_list_arg_values=kernel_list_arg_values,
        user_list_arg_decl=_get_arg_decl(user_list_args),
        user_list_args=_get_arg_list(user_list_args),
        user_arg_decl=_get_arg_decl(self.arg_decls),
        user_args=_get_arg_list(self.arg_decls),
        list_names_and_dtypes=self.list_names_and_dtypes,
        count_sharing=self.count_sharing,
        name_prefix=self.name_prefix,
        generate_template=self.generate_template,
        preamble=self.preamble,
        index_type=index_ctype,
        )
    src = str(rendered)

    prg = cl.Program(self.context, src).build(self.options)
    knl = getattr(prg, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes

    scalar_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args + self.arg_decls)
    knl.set_scalar_arg_dtypes(scalar_dtypes + [index_dtype])

    return knl
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` array along axis 1 and compare to NumPy.

    :arg ctx_factory: callable returning the OpenCL context to test on.
    :arg size: extent of the sorted axis.
    :arg dtype: element dtype; float64 is skipped on devices without double
        support.

    Known-bad platform combinations are marked as expected failures.
    """
    # Use the context handed in by the test harness.  The fixture used to be
    # accepted but ignored in favor of cl.create_some_context(), so the test
    # did not necessarily run on the device it was invoked for.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None,
            a=0, b=239482333)
    sorter = BitonicSort(ctx)

    # Sort a copy so that the unsorted input stays available as reference.
    sgs, evt = sorter(s.copy(), axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random values together with an index array and verify both.

    :arg ctx_factory: callable returning the OpenCL context to test on.
    :arg size: number of elements to sort.
    :arg dtype: element dtype; float64 is skipped on devices without double
        support.

    The sorted values must equal ``np.sort`` of the input, and gathering the
    input through the permuted index array must reproduce the same values as
    gathering through ``np.argsort``.  Known-bad platform combinations are
    marked as expected failures.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context handed in by the test harness.  The fixture used to be
    # accepted but ignored in favor of cl.create_some_context(), so the test
    # did not necessarily run on the device it was invoked for.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
                "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
def test_divide_inplace_scalar(ctx_factory):
    """Check that ``a_gpu /= s`` mirrors ``a /= s`` on a NumPy array.

    All ordered (array dtype, scalar dtype) pairs are tried.  Where NumPy
    rejects the in-place division with ``TypeError`` the device array must
    do the same; otherwise values (tolerance 1e-3) and dtype must match.
    Expected to fail on the Apple platform.
    """
    from itertools import product

    from pyopencl.characterize import has_double_support

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Apple":
        pytest.xfail("Apple CL compiler crashes on this.")

    dtypes = (np.uint8, np.uint16, np.uint32,
                  np.int8, np.int16, np.int32,
                  np.float32, np.complex64)
    if has_double_support(queue.device):
        dtypes += (np.float64, np.complex128)

    for dtype_a, dtype_s in product(dtypes, repeat=2):
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        s = dtype_s(40)
        a_gpu = cl_array.to_device(queue, a)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a /= s
        except TypeError:
            with np.testing.assert_raises(TypeError):
                a_gpu /= s
            continue

        a_gpu /= s
        assert (np.abs(a_gpu.get() - a) < 1e-3).all()
        assert a_gpu.dtype is a.dtype
def test_lin_comb_diff(ctx_factory, arg_type):
    """Run the ``lin_comb_diff`` kernel and check ``res == c + 2*a + 3*b``.

    The result is verified twice, once with device array arithmetic and
    once with NumPy, each to a norm of at most 2e-4.  Skipped for double
    precision *arg_type* on devices without double support.
    """
    ctx = ctx_factory()
    dev, = ctx.devices

    if not has_double_support(dev) \
            and arg_type in (np.float64, np.complex128):
        pytest.skip("Device does not support double.")

    n = 100000
    # Draw order (a, b, c) matters for reproducing the same random stream.
    a_np = np.random.randn(n).astype(arg_type)
    b_np = np.random.randn(n).astype(arg_type)
    c_np = (np.random.randn(n) * 10).astype(arg_type)

    queue = cl.CommandQueue(ctx)

    a_g = cl.array.to_device(queue, a_np)
    b_g = cl.array.to_device(queue, b_np)
    c_g = cl.array.to_device(queue, c_np)
    res_g = cl.array.empty_like(a_g)

    lin_comb_diff = lin_comb_diff_kernel(ctx, arg_type, arg_type,
                                         arg_type, np.float32, 2)
    gs, ls = get_group_sizes(n, dev, lin_comb_diff)

    evt = run_elwise_kernel(lin_comb_diff, queue, gs, ls, n, [],
                            res_g, c_g, a_g, b_g, 2, 3)
    evt.wait()

    # Check on GPU with PyOpenCL Array:
    dev_diff = (res_g - (c_g + 2 * a_g + 3 * b_g)).get()
    assert np.linalg.norm(dev_diff) <= 2e-4

    # Check on CPU with Numpy:
    res_np = res_g.get()
    host_diff = res_np - (c_np + 2 * a_np + 3 * b_np)
    assert np.linalg.norm(host_diff) <= 2e-4
def test_divide_inplace_array(ctx_factory):
    """Check in-place array-by-array division on the device against NumPy.

    All ordered dtype pairs are tried.  Pairs for which the device division
    raises ``TypeError`` are tolerated without further checks; for the rest,
    values (tolerance 1e-3) and dtype must match the NumPy in-place result.
    """
    from itertools import product

    from pyopencl.characterize import has_double_support

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.uint8, np.uint16, np.uint32,
                  np.int8, np.int16, np.int32,
                  np.float32, np.complex64)
    if has_double_support(queue.device):
        dtypes += (np.float64, np.complex128)

    for dtype_a, dtype_b in product(dtypes, repeat=2):
        print(dtype_a, dtype_b)
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a_gpu /= b_gpu
        except TypeError:
            # pass for now, as numpy casts differently for in-place and
            # out-place true_divide
            # with np.testing.assert_raises(TypeError):
            #     a /= b
            continue

        a /= b
        assert (np.abs(a_gpu.get() - a) < 1e-3).all()
        assert a_gpu.dtype is a.dtype
def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` array along axis 1 and compare to NumPy.

    :arg ctx_factory: callable returning the OpenCL context to test on.
    :arg size: extent of the sorted axis.
    :arg dtype: element dtype; float64 is skipped on devices without double
        support.

    Known-bad platform combinations are marked as expected failures.
    """
    # Use the context handed in by the test harness.  The fixture used to be
    # accepted but ignored in favor of cl.create_some_context(), so the test
    # did not necessarily run on the device it was invoked for.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    # Sort a copy so that the unsorted input stays available as reference.
    sgs = s.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        sgs.finish()
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(sgs, axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def test(ctx_factory):
    """Compare the ``clmath`` function *name* with its NumPy counterpart.

    Relies on names from the enclosing scope (``name``, ``sizes``, ``a``,
    ``b``, ``threshold``, ``use_complex``, ``numpy_func_names``).  For every
    size and dtype, the function is evaluated on an ``arange`` over
    ``[a, b)``; complex inputs get an imaginary part equal to the real part.
    The maximum absolute error must not exceed the threshold, which for
    complex dtypes is *use_complex* itself when that is a float.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    # Real dtypes first, then complex ones, each optionally with the
    # double-precision variant.
    real_dtypes = [np.float32]
    complex_dtypes = [np.complex64]
    if has_double_support(context.devices[0]):
        real_dtypes.append(np.float64)
        complex_dtypes.append(np.complex128)

    dtypes = real_dtypes + complex_dtypes if use_complex else real_dtypes

    for s in sizes:
        for dtype in dtypes:
            dtype = np.dtype(dtype)
            is_complex = dtype.kind == "c"

            args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
            if is_complex:
                args = args + dtype.type(1j) * args

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            if is_complex and isinstance(use_complex, float):
                my_threshold = use_complex
            else:
                my_threshold = threshold

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= my_threshold).all(), (max_err, name, dtype)
def test_divide_scalar(ctx_factory):
    """Check array/scalar and scalar/array division against NumPy.

    All ordered (array dtype, scalar dtype) pairs are tried; values must
    agree to 1e-3 element-wise and result dtypes must be the same object.
    Expected to fail on the Apple platform.
    """
    from itertools import product

    from pyopencl.characterize import has_double_support

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Apple":
        pytest.xfail("Apple CL compiler crashes on this.")

    dtypes = (np.uint8, np.uint16, np.uint32,
                  np.int8, np.int16, np.int32,
                  np.float32, np.complex64)
    if has_double_support(queue.device):
        dtypes += (np.float64, np.complex128)

    for dtype_a, dtype_s in product(dtypes, repeat=2):
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        s = dtype_s(40)
        a_gpu = cl_array.to_device(queue, a)

        # array divided by scalar
        host_quot = a / s
        dev_quot = a_gpu / s
        assert (np.abs(dev_quot.get() - host_quot) < 1e-3).all()
        assert dev_quot.dtype is host_quot.dtype

        # scalar divided by array
        host_quot = s / a
        dev_quot = s / a_gpu
        assert (np.abs(dev_quot.get() - host_quot) < 1e-3).all()
        assert dev_quot.dtype is host_quot.dtype
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None,
        dtype_b=None, conjugate_first=False):
    """Build a :class:`ReductionKernel` for a dot product over a subset.

    Elements are addressed through ``lookup_tbl[i]`` instead of ``i``; the
    map expression, output dtype and dtype of *b* come from
    ``_get_dot_expr``.

    :arg dtype_subset: dtype of the entries of ``lookup_tbl``.
    :returns: a :class:`ReductionKernel` whose arguments are, in order,
        ``lookup_tbl``, ``a`` and ``b``.
    """
    from pyopencl.characterize import has_double_support

    double_ok = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=double_ok,
            index_expr="lookup_tbl[i]")

    type_names = {
            "tp_lut": dtype_to_ctype(dtype_subset),
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            }

    # important: lookup_tbl must be first--it controls the length
    arguments = (
            "const %(tp_lut)s *lookup_tbl, "
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % type_names)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=arguments)
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    """Build a :class:`ReductionKernel` computing the dot product of two arrays.

    :arg ctx: context; its first device decides whether double precision may
        be used when the output dtype is inferred.
    :arg dtype_out: result dtype, or *None* to infer it from the operands via
        ``get_common_dtype``.
    :arg dtype_a: dtype of the first operand.
    :arg dtype_b: dtype of the second operand; defaults to *dtype_a*, or to
        *dtype_out* if *dtype_a* is *None* as well.
    :returns: a :class:`ReductionKernel` with arguments ``a`` and ``b``, the
        sum ``a+b`` as reduction and ``0`` as neutral element.
    """
    # Fill in the missing second operand dtype.
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(dtype_a.type(0), dtype_b.type(0),
                has_double_support(ctx.devices[0]))

    # Dtype of the real part of each operand and of the result.
    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        # Both complex: cast each operand to the output type where needed and
        # multiply with the complex "<name>_mul" helper.
        a = "a[i]"
        b = "b[i]"
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
    else:
        # At most one complex operand: a plain "*" is used.
        a = "a[i]"
        b = "b[i]"

        # NOTE(review): all four casts below are applied only when the
        # output is complex -- confirm that this nesting is intended.
        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            # Real operands get a C-style cast to the result's real type.
            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments="const %(tp_a)s *a, "
            "const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
def __init__(self, queue, num_work_items,
        luxury=None, seed=None, no_warmup=False,
        use_legacy_init=False, max_work_items=None):
    """Set up generator state for *num_work_items* work items.

    :arg queue: command queue; supplies the context and the device whose
        double-precision support is recorded in ``self.support_double``.
    :arg num_work_items: number of work items to initialize state for.
    :arg luxury: stored as ``self.luxury``; defaults to 4 when *None*.
    :arg seed: integer seed passed to the initialization kernel as a
        ``uint32``.  When *None*, a seed below ``2**31`` is derived from the
        current time in microseconds.
    :arg no_warmup: stored as ``self.no_warmup``.
    :arg use_legacy_init: stored as ``self.use_legacy_init``.
    :arg max_work_items: stored as ``self.max_work_items``.

    The state array holds 112 bytes per work item and is filled by the
    ``init_ranlux`` kernel built here.
    """
    if luxury is None:
        luxury = 4

    if seed is None:
        from time import time
        # The parentheses are essential: "%" binds tighter than "<<", so the
        # former "x % 2<<30" evaluated to "(x % 2) << 30" and produced only
        # two distinct seeds (0 and 2**30).
        seed = int(time()*1e6) % (2 << 30)

    self.context = queue.context
    self.luxury = luxury
    self.num_work_items = num_work_items

    from pyopencl.characterize import has_double_support
    self.support_double = has_double_support(queue.device)

    self.no_warmup = no_warmup
    self.use_legacy_init = use_legacy_init
    self.max_work_items = max_work_items

    # The attributes above are assigned before generate_settings_defines()
    # is called, as in the original statement order.
    src = """
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        kernel void init_ranlux(unsigned seeds,
            global ranluxcl_state_t *ranluxcltab)
        {
          if (get_global_id(0) < %(num_work_items)d)
            ranluxcl_initialization(seeds, ranluxcltab);
        }
        """ % {
                "defines": self.generate_settings_defines(),
                "num_work_items": num_work_items
            }
    prg = cl.Program(queue.context, src).build()

    # {{{ compute work group size

    # None lets the implementation choose; a size of 1 is forced only for
    # the Apple CPU device on Mac OS X 10.7.
    wg_size = None

    import sys
    import platform
    if ("darwin" in sys.platform
            and "Apple" in queue.device.platform.vendor
            and platform.mac_ver()[0].startswith("10.7")
            and queue.device.type == cl.device_type.CPU):
        wg_size = (1,)

    self.wg_size = wg_size

    # }}}

    self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
    self.state.fill(17)

    prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
            self.state.data)
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    """Build a :class:`ReductionKernel` computing the dot product of two arrays.

    :arg ctx: context; its first device decides whether double precision may
        be used when the output dtype is inferred.
    :arg dtype_out: result dtype, or *None* to infer it from the operands via
        ``get_common_dtype``.
    :arg dtype_a: dtype of the first operand.
    :arg dtype_b: dtype of the second operand; defaults to *dtype_a*, or to
        *dtype_out* if *dtype_a* is *None* as well.
    :returns: a :class:`ReductionKernel` with ``__global const`` pointer
        arguments ``a`` and ``b``, the sum ``a+b`` as reduction and ``0`` as
        neutral element.
    """
    # Fill in the missing second operand dtype.
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support(ctx.devices[0]))

    # Dtype of the real part of each operand and of the result.
    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        # Both complex: cast each operand to the output type where needed and
        # multiply with the complex "<name>_mul" helper.
        a = "a[i]"
        b = "b[i]"
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), a, b)
    else:
        # At most one complex operand: a plain "*" is used.
        a = "a[i]"
        b = "b[i]"

        # NOTE(review): all four casts below are applied only when the
        # output is complex -- confirm that this nesting is intended.
        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            # Real operands get a C-style cast to the result's real type.
            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "__global const %(tp_a)s *a, "
            "__global const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
def test_get_kernels(ctx_factory, res_type, arg_type, weight_type):
    """Smoke-test that ``lin_comb_kernel`` can be built for lengths 1 and 2.

    Skipped when any of the three types is double precision and the device
    lacks double support.
    """
    ctx = ctx_factory()
    dev, = ctx.devices

    double_types = (np.float64, np.complex128)
    if not has_double_support(dev) and any(
            t in double_types for t in (res_type, arg_type, weight_type)):
        pytest.skip("Device does not support double.")

    for length in (1, 2):
        lin_comb_kernel(ctx, res_type, arg_type, weight_type, length)
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    """Check that uniform fills from *rng_class* stay inside their range.

    For each supported float dtype, a default fill must lie in [0, 1] and a
    fill with ``a=4, b=7`` in [4, 7]; offending positions and values are
    printed before the assertion fails.  A normal draw is requested as well.
    With *plot_hist*, histograms are shown using matplotlib.  The Ranlux
    generator on a POCL GPU device is an expected failure.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    is_ranlux = rng_class is RanluxGenerator

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU \
            and rng_class is RanluxGenerator:
        pytest.xfail("ranlux test fails on POCL + Nvidia,"
                "at least the Titan V, as of pocl 1.6, 2021-01-20")

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    gen = rng_class(queue, 5120) if is_ranlux else rng_class(context)

    def maybe_plot(ary):
        # Optional visual inspection; does nothing unless plot_hist is set.
        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ary.get(), 30)
            pt.show()

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)
        maybe_plot(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if is_ranlux:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in [4 <= ran_host, ran_host <= 7]:
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)
        maybe_plot(ran)
def get_count_kernel(self, index_dtype):
    """Build and return the "count" stage kernel of the list builder.

    Every list that does not share its count gets a ``plb_<name>_count``
    vector argument on the kernel side and a ``plb_loc_<name>_count``
    pointer on the user-function side.  The template is rendered with
    ``is_count_stage=True``, compiled with ``self.options``, and the
    kernel's scalar argument dtypes are set before it is returned.

    :arg index_dtype: dtype of the count arrays and of the trailing scalar
        kernel argument.
    """
    from pyopencl.tools import VectorArg, OtherArg

    index_ctype = dtype_to_ctype(index_dtype)

    kernel_list_args = [
            VectorArg(index_dtype, "plb_%s_count" % name)
            for name, dtype in self.list_names_and_dtypes
            if name not in self.count_sharing]

    user_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name not in self.count_sharing:
            loc_count_name = "plb_loc_%s_count" % name
            decl = "%s *%s" % (index_ctype, loc_count_name)
            user_list_args.append(OtherArg(decl, loc_count_name))

    kernel_name = self.name_prefix+"_count"

    from pyopencl.characterize import has_double_support

    rendered = _LIST_BUILDER_TEMPLATE.render(
        is_count_stage=True,
        kernel_name=kernel_name,
        double_support=all(has_double_support(dev)
            for dev in self.context.devices),
        debug=self.debug,
        do_not_vectorize=self.do_not_vectorize(),
        eliminate_empty_output_lists=self.eliminate_empty_output_lists,
        kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
        kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
        user_list_arg_decl=_get_arg_decl(user_list_args),
        user_list_args=_get_arg_list(user_list_args),
        user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
        user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
        user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
        arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),
        list_names_and_dtypes=self.list_names_and_dtypes,
        count_sharing=self.count_sharing,
        name_prefix=self.name_prefix,
        generate_template=self.generate_template,
        preamble=self.preamble,
        index_type=index_ctype,
        )
    src = str(rendered)

    prg = cl.Program(self.context, src).build(self.options)
    knl = getattr(prg, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes

    scalar_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls)
    knl.set_scalar_arg_dtypes(scalar_dtypes + [index_dtype])

    return knl
def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
    """Smoke-test ``uniform`` (and ``normal`` for non-integer dtypes).

    Ten values of *dtype* are drawn from a generator built from the
    context.  float64 is skipped on devices without double support.
    """
    cl_ctx = ctx_factory()

    if dtype == np.float64 and not has_double_support(cl_ctx.devices[0]):
        pytest.skip("double precision not supported on this device")

    rng = rng_class(cl_ctx)
    size = 10
    integer_dtypes = (np.int32, np.int64)

    with cl.CommandQueue(cl_ctx) as queue:
        rng.uniform(queue, size, dtype)

        # normal() is only exercised for dtypes other than int32/int64
        if dtype not in integer_dtypes:
            rng.normal(queue, size, dtype)
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    """Check that uniform fills from *rng_class* stay inside their range.

    For each supported float dtype, a default fill must lie in [0, 1] and a
    fill with ``a=4, b=7`` in [4, 7]; offending positions and values are
    printed before the assertion fails.  A normal draw is requested as well.
    With *plot_hist*, histograms are shown using matplotlib.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    uses_ranlux = rng_class is RanluxGenerator
    if uses_ranlux:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    def show_histogram(ary):
        import matplotlib.pyplot as pt
        pt.hist(ary.get(), 30)
        pt.show()

    for dtype in dtypes:
        print(dtype)

        # default range
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)
        if plot_hist:
            show_histogram(ran)

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if uses_ranlux:
            gen.synchronize(queue)

        # explicit range
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in [4 <= ran_host, ran_host <= 7]:
            good = cond.all()
            if not good:
                outliers = ~cond
                print(np.where(outliers))
                print(ran_host[outliers])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)
        if plot_hist:
            show_histogram(ran)
def test_hankel_01_complex(ctx_factory, ref_src):
    """Compare ``clmath.hankel_01`` at complex arguments with a reference.

    :arg ref_src: ``"pyfmmlib"`` or ``"scipy"``, selecting the reference
        implementation (the test is skipped if it cannot be imported); any
        other value raises :exc:`ValueError`.

    The maximum relative error must be below 4e-13 for H0 and below 2e-13
    for H1.  Skipped without double-precision support.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        from pytest import skip
        skip("no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    radii = np.logspace(-5, 2, n)
    phases = np.exp(1j * 2 * np.pi * np.random.rand(n))
    z = radii * phases

    def get_err(check, ref):
        return np.max(np.abs(check-ref)) / np.max(np.abs(ref))

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)
    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)
    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = np.abs(h0_dev.get() - h0_ref)/np.abs(h0_ref)
    rel_err_h1 = np.abs(h1_dev.get() - h1_ref)/np.abs(h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)

    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    # Flip to 1 to inspect the error distribution visually.
    if 0:
        import matplotlib.pyplot as pt
        magnitudes = np.abs(z)
        pt.loglog(magnitudes, rel_err_h0)
        pt.loglog(magnitudes, rel_err_h1)
        pt.show()
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Build the :class:`ReductionKernel` that computes a dot product.

    The map expression, output dtype and dtype of *b* come from
    ``_get_dot_expr``; the reduction is the sum ``a+b`` with neutral
    element ``0``.

    :arg ctx: context whose first device decides double-precision support.
    :returns: a :class:`ReductionKernel` taking two ``const`` pointer
        arguments ``a`` and ``b``.
    """
    from pyopencl.characterize import has_double_support

    double_ok = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=double_ok)

    type_names = {
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            }
    arguments = (
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % type_names)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=arguments)
def test_hankel_01_complex(ctx_factory, ref_src):
    """Validate ``clmath.hankel_01`` for complex input against a reference.

    :arg ref_src: name of the reference implementation, ``"pyfmmlib"`` or
        ``"scipy"``; the test is skipped if it cannot be imported, and any
        other value raises :exc:`ValueError`.

    Requires double-precision support.  The worst relative error must stay
    below 4e-13 (H0) and 2e-13 (H1).
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    double_ok = has_double_support(ctx.devices[0])
    if not double_ok:
        from pytest import skip
        skip(
            "no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    z = (np.logspace(-5, 2, n) * np.exp(1j * 2 * np.pi * np.random.rand(n)))

    def get_err(check, ref):
        return np.max(np.abs(check - ref)) / np.max(np.abs(ref))

    def pointwise_rel_err(dev_values, ref_values):
        return np.abs(dev_values.get() - ref_values) / np.abs(ref_values)

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)
    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)
    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = pointwise_rel_err(h0_dev, h0_ref)
    rel_err_h1 = pointwise_rel_err(h1_dev, h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)

    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    # Disabled plotting aid for manual debugging.
    if 0:
        import matplotlib.pyplot as pt
        pt.loglog(np.abs(z), rel_err_h0)
        pt.loglog(np.abs(z), rel_err_h1)
        pt.show()
def test_random(ctx_getter):
    """Check that ``clrand`` yields values in [0, 1) for each float dtype."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    dtypes = [np.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(np.float64)

    for dtype in dtypes:
        samples = clrand(context, queue, (10, 100), dtype=dtype).get()

        assert (0 <= samples).all()
        assert (samples < 1).all()
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random values together with an index array and verify both.

    :arg ctx_factory: callable returning the OpenCL context to test on.
    :arg size: number of elements to sort.
    :arg dtype: element dtype; float64 is skipped on devices without double
        support.

    The sorted values must equal ``np.sort`` of the input, and gathering the
    input through the permuted index array must reproduce the same values as
    gathering through ``np.argsort``.  Known-bad platform combinations are
    marked as expected failures.
    """
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context handed in by the test harness.  The fixture used to be
    # accepted but ignored in favor of cl.create_some_context(), so the test
    # did not necessarily run on the device it was invoked for.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # Sort a copy so that the unsorted input stays available as reference.
    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
def test_pow_neg1_vs_inv(ctx_factory):
    """Check that ``a ** (-1)`` and ``1 / a`` agree with NumPy's reciprocal.

    A random complex128 device array of 20000 entries is inverted both ways;
    each result must match ``1 / a`` computed on the host to a relative
    error (infinity norm over 2-norm of the reference) below 1e-13.
    Skipped on devices without double-precision support.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    if not has_double_support(device):
        # Import from pytest directly, as the other tests do, rather than
        # from the legacy "py.test" namespace of the separate "py" package.
        from pytest import skip
        skip("double precision not supported on %s" % device)

    a_dev = make_random_array(queue, np.complex128, 20000)

    res1 = (a_dev ** (-1)).get()
    res2 = (1/a_dev).get()
    ref = 1/a_dev.get()

    assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13
    assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random data with BitonicSort while carrying an index array along.

    :param ctx_factory: callable returning the OpenCL context to test on.
    :param size: number of random elements to sort.
    :param dtype: element dtype of the random input.

    Verifies both the sorted values and that the permuted indices select the
    values in sorted order.
    """
    import sys

    is_pypy = "__pypy__" in sys.builtin_module_names
    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Honor the fixture instead of creating an unrelated context, so the
    # test is executed on the device selected by the test parametrization.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name

    if (platform_name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (platform_name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                "parallelism")
    if (platform_name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # The sorter receives a copy; ``m`` itself stays unsorted.
    ms, _ = sorterm(m.copy(), idx=index, axis=0)

    # One device-to-host transfer of the reference data is enough.
    unsorted_host = m.get()

    assert np.array_equal(np.sort(unsorted_host), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(
            unsorted_host[np.argsort(unsorted_host)],
            unsorted_host[index.get()])
def test_dot(ctx_factory):
    """Compare ``cl_array.dot``/``vdot`` with numpy for every dtype pair.

    complex128 is left out on devices reporting the "apple" struct argument
    counting bug; double types are left out without double support.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        dtypes.append(np.float64)
        if has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    vec_shape = (200000,)

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, vec_shape, a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, vec_shape, b_dtype)
            b = b_gpu.get()

            expected_dot = np.dot(a, b)
            actual_dot = cl_array.dot(a_gpu, b_gpu).get()
            assert abs(actual_dot - expected_dot) / abs(expected_dot) < 1e-4

            try:
                expected_vdot = np.vdot(a, b)
            except NotImplementedError:
                import sys
                # Only PyPy is allowed to lack vdot; re-raise elsewhere.
                if "__pypy__" not in sys.builtin_module_names:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            actual_vdot = cl_array.vdot(a_gpu, b_gpu).get()
            rel_err = abs(actual_vdot - expected_vdot) / abs(expected_vdot)
            assert rel_err < 1e-4, rel_err
def test_dot(ctx_factory):
    """Check device ``dot`` and ``vdot`` against numpy on random vectors.

    Every ordered pair of the supported dtypes is exercised with a relative
    tolerance of 1e-4.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        # complex128 is excluded when the "apple" struct arg bug is present.
        apple_bug = has_struct_arg_count_bug(dev) == "apple"
        dtypes.extend(
                [np.float64] if apple_bug else [np.float64, np.complex128])

    n = 200000

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)

            a_gpu = general_clrand(queue, (n,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (n,), b_dtype)
            b = b_gpu.get()

            host_dot = np.dot(a, b)
            dev_dot = cl_array.dot(a_gpu, b_gpu).get()
            assert abs(dev_dot - host_dot) / abs(host_dot) < 1e-4

            try:
                host_vdot = np.vdot(a, b)
            except NotImplementedError:
                import sys
                running_on_pypy = "__pypy__" in sys.builtin_module_names
                if not running_on_pypy:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            dev_vdot = cl_array.vdot(a_gpu, b_gpu).get()
            rel_err = abs(dev_vdot - host_vdot) / abs(host_vdot)
            assert rel_err < 1e-4, rel_err
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None,
        dtype_b=None, conjugate_first=False):
    """Build a :class:`ReductionKernel` for a dot product over a subset.

    The map expression comes from ``_get_dot_expr`` with elements addressed
    through ``lookup_tbl[i]``; the reduction is ``a+b`` with neutral ``0``.

    :param ctx: context whose first device is queried for double support.
    :param dtype_out: requested result dtype (may be adjusted by
        ``_get_dot_expr``).
    :param dtype_subset: dtype of the ``lookup_tbl`` index array.
    :param dtype_a: dtype of the first operand.
    :param dtype_b: dtype of the second operand (may be adjusted by
        ``_get_dot_expr``).
    :param conjugate_first: forwarded to ``_get_dot_expr``.
    """
    from pyopencl.characterize import has_double_support

    device_has_doubles = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=device_has_doubles,
            index_expr="lookup_tbl[i]")

    # important: lookup_tbl must be first--it controls the length
    arg_ctypes = {
            "tp_lut": dtype_to_ctype(dtype_subset),
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            }
    kernel_arguments = (
            "const %(tp_lut)s *lookup_tbl, "
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % arg_ctypes)

    return ReductionKernel(
            ctx, dtype_out,
            neutral="0",
            reduce_expr="a+b",
            map_expr=map_expr,
            arguments=kernel_arguments)
def test_minmax(ctx_factory):
    """Check ``cl_array.min``/``max`` against numpy on random arrays."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    # float64 is only tested (first) when the device supports it.
    dtypes = [np.float32, np.int32]
    if has_double_support(context.devices[0]):
        dtypes.insert(0, np.float64)

    for what in ("min", "max"):
        host_reduce = getattr(np, what)
        dev_reduce = getattr(cl_array, what)

        for dtype in dtypes:
            a_gpu = clrand(queue, (200000,), dtype)
            a = a_gpu.get()

            op_a = host_reduce(a)
            op_a_gpu = dev_reduce(a_gpu).get()

            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
    """Smoke-test ``uniform`` (and ``normal`` for non-integer dtypes).

    :param ctx_factory: callable returning the OpenCL context to test on.
    :param rng_class: callable building the generator from a context.
    :param dtype: dtype of the generated numbers.
    """
    cl_ctx = ctx_factory()

    needs_double = dtype == np.float64
    if needs_double and not has_double_support(cl_ctx.devices[0]):
        pytest.skip("double precision not supported on this device")

    rng = rng_class(cl_ctx)
    size = 10

    with cl.CommandQueue(cl_ctx) as queue:
        device = queue.device

        if (device.platform.vendor == "The pocl project"
                and device.type & cl.device_type.GPU
                and rng_class is make_ranlux_generator):
            pytest.xfail("ranlux test fails on POCL + Nvidia,"
                    "at least the K40, as of pocl 1.6, 2021-01-20")

        rng.uniform(queue, size, dtype)

        # normal() is not exercised for the integer dtypes.
        integer_dtypes = (np.int32, np.int64)
        if dtype not in integer_dtypes:
            rng.normal(queue, size, dtype)
def test_pow_neg1_vs_inv(ctx_factory):
    """Check ``a ** (-1)`` and ``1 / a`` on the device against the host.

    Skipped without double support; xfailed when the device reports the
    "apple" struct argument counting bug.
    """
    from pytest import skip, xfail

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    device = ctx.devices[0]

    if not has_double_support(device):
        skip("double precision not supported on %s" % device)
    if has_struct_arg_count_bug(device) == "apple":
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    via_pow = (a_dev ** (-1)).get()
    via_div = (1 / a_dev).get()
    ref = 1 / a_dev.get()

    ref_norm = la.norm(ref)
    for result in (via_pow, via_div):
        assert la.norm(result - ref, np.inf) / ref_norm < 1e-13
def test_pow_neg1_vs_inv(ctx_factory):
    """Verify that negative-one power and reciprocal match the host result.

    Both device results are compared against ``1 / a`` computed on the host
    with a relative tolerance of 1e-13.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    device = ctx.devices[0]

    double_ok = has_double_support(device)
    if not double_ok:
        from pytest import skip
        skip("double precision not supported on %s" % device)

    bug_kind = has_struct_arg_count_bug(device)
    if bug_kind == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    res_pow = (a_dev ** (-1)).get()
    res_inv = (1/a_dev).get()
    ref = 1/a_dev.get()

    def rel_inf_error(result):
        # Infinity-norm error, scaled by the default norm of the reference.
        return la.norm(result-ref, np.inf) / la.norm(ref)

    assert rel_inf_error(res_pow) < 1e-13
    assert rel_inf_error(res_inv) < 1e-13
def test(ctx_factory):
    """Compare the ``clmath`` function ``name`` with its numpy counterpart.

    ``name``, ``a``, ``b``, ``sizes``, ``threshold``, ``use_complex`` and
    ``numpy_func_names`` are taken from the enclosing scope.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    dev = context.devices[0]
    double_ok = has_double_support(dev)

    dtypes = [np.float32]
    if double_ok:
        dtypes.append(np.float64)
    if use_complex:
        dtypes.append(np.complex64)
        # complex128 needs doubles and a device without the "apple" bug.
        if double_ok and has_struct_arg_count_bug(dev) != "apple":
            dtypes.append(np.complex128)

    for s in sizes:
        for dtype in dtypes:
            dtype = np.dtype(dtype)
            is_complex = dtype.kind == "c"

            args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
            if is_complex:
                # args = args + dtype.type(1j) * args
                args = args + args * dtype.type(1j)

            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            # A float-valued use_complex doubles as the complex threshold.
            if is_complex and isinstance(use_complex, float):
                my_threshold = use_complex
            else:
                my_threshold = threshold

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= my_threshold).all(), (max_err, name, dtype)
def test_subset_minmax(ctx_factory):
    """Check ``cl_array.subset_min`` against numpy on an index subset.

    The subset is an increasing index list that leaves out the positive
    multiples of ``gran``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    dtypes = [np.float32, np.int32]
    if has_double_support(context.devices[0]):
        dtypes.insert(0, np.float64)

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        a = a_gpu.get()

        # Start from a zeroed device array and fill the host copy in.
        meaningful_indices_gpu = cl_array.zeros(
                queue, l_m, dtype=np.int32)
        meaningful_indices = meaningful_indices_gpu.get()

        next_index = 0
        for slot in range(len(meaningful_indices)):
            meaningful_indices[slot] = next_index
            next_index += 1
            # Step over indices that are multiples of gran.
            if next_index % gran == 0:
                next_index += 1

        meaningful_indices_gpu = cl_array.to_device(
                queue, meaningful_indices)

        b = a[meaningful_indices]

        min_a = np.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
def test(ctx_getter):
    """Compare the ``clmath`` function ``name`` with its numpy counterpart.

    ``name``, ``a``, ``b``, ``sizes``, ``threshold`` and
    ``numpy_func_names`` are taken from the enclosing scope.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    gpu_func = getattr(clmath, name)
    cpu_func = getattr(np, numpy_func_names.get(name, name))

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    for s in sizes:
        for dtype in dtypes:
            # Build the input in the dtype under test. This was hard-coded
            # to float32, so the float64 iteration merely repeated the
            # float32 check.
            args = cl_array.arange(queue, a, b, (b-a)/s, dtype=dtype)
            gpu_results = gpu_func(args).get()
            cpu_results = cpu_func(args.get())

            max_err = np.max(np.abs(cpu_results - gpu_results))
            assert (max_err <= threshold).all(), \
                    (max_err, name, dtype)
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Build a :class:`ReductionKernel` computing a dot product.

    For complex result dtypes the reduction uses the ``<name>_add`` and
    ``<name>_new`` helpers, with ``<name>`` given by
    ``complex_dtype_to_name``; otherwise plain ``a+b`` with neutral ``0``.

    :param ctx: context whose first device is queried for double support.
    :param dtype_out: requested result dtype (may be adjusted by
        ``_get_dot_expr``).
    :param dtype_a: dtype of the first operand.
    :param dtype_b: dtype of the second operand (may be adjusted by
        ``_get_dot_expr``).
    :param conjugate_first: forwarded to ``_get_dot_expr``.
    """
    from pyopencl.characterize import has_double_support

    device_has_doubles = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=device_has_doubles)

    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = "%s_add(a, b)" % dtname
        neutral_expr = "%s_new(0, 0)" % dtname
    else:
        reduce_expr = "a+b"
        neutral_expr = "0"

    arg_ctypes = {
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            }
    kernel_arguments = (
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % arg_ctypes)

    return ReductionKernel(
            ctx, dtype_out,
            neutral=neutral_expr,
            reduce_expr=reduce_expr,
            map_expr=map_expr,
            arguments=kernel_arguments)
def test_astype(ctx_getter):
    """Check ``astype`` between float32 and float64 against the host cast.

    Skipped when the device has no double precision support.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    device = context.devices[0]
    if not has_double_support(device):
        # Report a skip instead of returning silently: a bare return made
        # the test show up as passed although nothing was checked.
        from pytest import skip
        skip("double precision not supported on %s" % device)

    # NOTE(review): clrand is called here as (context, queue, shape, dtype),
    # while other tests in this file call it as (queue, shape, dtype) --
    # confirm which signature the installed pyopencl expects.
    a_gpu = clrand(context, queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    # Widening float32 -> float64 must be exact.
    assert la.norm(a - a2) == 0, (a, a2)

    a_gpu = clrand(context, queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2)/la.norm(a) < 1e-7
def test_bitonic_sort(ctx_factory, size, dtype):
    """Check BitonicSort along axis 1 of a ``(2, size, 3)`` random array.

    :param ctx_factory: callable returning the OpenCL context to test on.
    :param size: length of the sorted axis.
    :param dtype: element dtype of the random input.
    """
    # Use the context handed in by the fixture so that the test runs on the
    # device it was parametrized for (the fixture used to be ignored here).
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None,
            a=0, b=239482333)
    sorter = BitonicSort(ctx)

    # Sort a copy so that ``s`` keeps the unsorted reference data.
    sgs, _ = sorter(s.copy(), axis=1)

    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def __init__(self, queue, num_work_items=None,
        luxury=None, seed=None, no_warmup=False,
        use_legacy_init=False, max_work_items=None):
    """
    :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
    :param luxury: the "luxury value" of the generator, and should be 0-4,
        where 0 is fastest and 4 produces the best numbers. It can also be
        >=24, in which case it directly sets the p-value of RANLUXCL.
        Defaults to 4 if `None`.
    :param num_work_items: is the number of generators to initialize,
        usually corresponding to the number of work-items in the NDRange
        RANLUXCL will be used with.
        May be `None`, in which case a default value is used.
    :param seed: seed handed to the initialization kernel as an unsigned
        32-bit integer. If `None`, a seed is derived from the current time.
    :param max_work_items: should reflect the maximum number of work-items
        that will be used on any parallel instance of RANLUXCL. So for
        instance if we are launching 5120 work-items on GPU1 and 10240
        work-items on GPU2, GPU1's RANLUXCLTab would be generated by
        calling ranluxcl_initialization with numWorkitems = 5120 while
        GPU2's RANLUXCLTab would use numWorkitems = 10240. However
        maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it
        must be set to the same value for both. (may be `None`)

    .. versionchanged:: 2013.1

        Added default value for `num_work_items`.
    """

    if luxury is None:
        luxury = 4

    if num_work_items is None:
        if queue.device.type & cl.device_type.CPU:
            num_work_items = 8 * queue.device.max_compute_units
        else:
            num_work_items = 64 * queue.device.max_compute_units

    if seed is None:
        from time import time
        # ``%`` binds tighter than ``<<``. Without the parentheses this is
        # ``(x % 2) << 30``, which can only ever be 0 or 2**30, so nearly
        # all default-seeded generators produced the same streams.
        seed = int(time()*1e6) % (2 << 30)

    self.context = queue.context
    self.luxury = luxury
    self.num_work_items = num_work_items

    from pyopencl.characterize import has_double_support
    self.support_double = has_double_support(queue.device)

    self.no_warmup = no_warmup
    self.use_legacy_init = use_legacy_init
    self.max_work_items = max_work_items

    src = """
        %(defines)s

        #include <pyopencl-ranluxcl.cl>

        kernel void init_ranlux(unsigned seeds,
            global ranluxcl_state_t *ranluxcltab)
        {
          if (get_global_id(0) < %(num_work_items)d)
            ranluxcl_initialization(seeds, ranluxcltab);
        }
        """ % {
                "defines": self.generate_settings_defines(),
                "num_work_items": num_work_items
            }
    prg = cl.Program(queue.context, src).build()

    # {{{ compute work group size

    wg_size = None

    import sys
    import platform
    # Force a work group size of one on the Apple CPU implementation
    # shipped with macOS 10.7.
    if ("darwin" in sys.platform
            and "Apple" in queue.device.platform.vendor
            and platform.mac_ver()[0].startswith("10.7")
            and queue.device.type & cl.device_type.CPU):
        wg_size = (1,)

    self.wg_size = wg_size

    # }}}

    # One 112-byte state record per generator.
    self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
    self.state.fill(17)

    prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
            self.state.data)
def test_bessel_j(ctx_factory):
    """Compare ``clmath.bessel_jn`` with scipy for orders 0..29.

    If ``hellskitchen`` is importable, its ``jfuns2d`` is used as a second
    reference. Skipped without scipy or without double precision support.
    """
    # Import from ``pytest`` directly rather than the legacy ``py.test``
    # alias, matching the other tests in this file.
    from pytest import skip

    try:
        import scipy.special as spec
    except ImportError:
        skip("scipy not present--cannot test Bessel function")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        skip("no double precision support--cannot test bessel function")

    nterms = 30

    try:
        from hellskitchen._native import jfuns2d
    except ImportError:
        use_hellskitchen = False
    else:
        use_hellskitchen = True

    # A narrower argument range is used when hellskitchen is the reference.
    if use_hellskitchen:
        a = np.logspace(-3, 3, 10 ** 6)
    else:
        a = np.logspace(-5, 5, 10 ** 6)

    if use_hellskitchen:
        hellskitchen_result = np.empty((len(a), nterms), dtype=np.complex128)
        for i, a_i in enumerate(a):
            if i % 10000 == 0:
                # Progress report; written as a call so that the module
                # also parses on Python 3.
                print("%.1f %%" % (100 * i / len(a)))
            ier, fjs, _, _ = jfuns2d(nterms, a_i, 1, 0, 10000)
            hellskitchen_result[i] = fjs[:nterms]
        assert ier == 0

    a_dev = cl_array.to_device(queue, a)

    for n in range(0, nterms):
        cl_bessel = clmath.bessel_jn(n, a_dev).get()
        scipy_bessel = spec.jn(n, a)

        error_scipy = np.max(np.abs(cl_bessel - scipy_bessel))
        assert error_scipy < 1e-10, error_scipy

        if use_hellskitchen:
            hk_bessel = hellskitchen_result[:, n]
            error_hk = np.max(np.abs(cl_bessel - hk_bessel))
            assert error_hk < 1e-10, error_hk
            error_hk_scipy = np.max(np.abs(scipy_bessel - hk_bessel))
            print(n, error_scipy, error_hk, error_hk_scipy)
        else:
            print(n, error_scipy)

        assert not np.isnan(cl_bessel).any()

        # Debugging aid, disabled by the constant 0: plot the error curves.
        if 0 and n == 15:
            import matplotlib.pyplot as pt
            # pt.plot(scipy_bessel)
            # pt.plot(cl_bessel)

            pt.loglog(a, np.abs(cl_bessel - scipy_bessel), label="vs scipy")
            if use_hellskitchen:
                pt.loglog(a, np.abs(cl_bessel - hk_bessel),
                        label="vs hellskitchen")
            pt.legend()
            pt.show()
def get_write_kernel(self, index_dtype):
    """Render and build the write-stage kernel of the list builder.

    :param index_dtype: dtype used for the start-index arrays and the
        per-list index variables.
    :returns: the built kernel named ``<name_prefix>_write`` with its scalar
        argument dtypes set.
    """
    from pyopencl.characterize import has_double_support
    from pyopencl.tools import (
            VectorArg, OtherArg, get_arg_list_scalar_arg_dtypes)

    index_ctype = dtype_to_ctype(index_dtype)

    kernel_list_args = []
    user_list_args = []
    arg_value_chunks = []

    for name, dtype in self.list_names_and_dtypes:
        list_name = "plb_%s_list" % name
        list_arg = VectorArg(dtype, list_name)

        kernel_list_args.append(list_arg)
        user_list_args.append(list_arg)

        if name in self.count_sharing:
            # Count-sharing lists get no start index or index variable.
            arg_value_chunks.append("%s, " % list_name)
        else:
            kernel_list_args.append(
                    VectorArg(index_dtype, "plb_%s_start_index" % name))

            index_name = "plb_%s_index" % name
            user_list_args.append(
                    OtherArg("%s *%s" % (index_ctype, index_name),
                        index_name))

            arg_value_chunks.append("%s, &%s, " % (list_name, index_name))

    kernel_list_arg_values = "".join(arg_value_chunks)
    kernel_name = self.name_prefix + "_write"

    # Double support is only reported if every device in the context has it.
    all_devices_have_doubles = all(
            has_double_support(dev) for dev in self.context.devices)

    src = str(_LIST_BUILDER_TEMPLATE.render(
        is_count_stage=False,
        kernel_name=kernel_name,
        double_support=all_devices_have_doubles,
        debug=self.debug,
        do_not_vectorize=self.do_not_vectorize(),

        kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
        kernel_list_arg_values=kernel_list_arg_values,
        user_list_arg_decl=_get_arg_decl(user_list_args),
        user_list_args=_get_arg_list(user_list_args),
        user_arg_decl=_get_arg_decl(self.arg_decls),
        user_args=_get_arg_list(self.arg_decls),

        list_names_and_dtypes=self.list_names_and_dtypes,
        count_sharing=self.count_sharing,
        name_prefix=self.name_prefix,
        generate_template=self.generate_template,
        preamble=self.preamble,

        index_type=index_ctype,
        ))

    prg = cl.Program(self.context, src).build(self.options)
    knl = getattr(prg, kernel_name)

    scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args + self.arg_decls)
    knl.set_scalar_arg_dtypes(scalar_arg_dtypes + [index_dtype])

    return knl
# Maps numpy scalar types to the names of the corresponding OpenCL C types.
cl_buffer_datatype_dict = {
    # ``np.bool`` was an alias of the builtin ``bool`` in older NumPy, was
    # removed in NumPy 1.24 and returned in 2.0 as an alias of ``np.bool_``.
    # Listing both targets keeps every former lookup working and avoids an
    # AttributeError at import time on the versions without ``np.bool``.
    bool: "bool",
    np.bool_: "bool",
    np.uint8: "uchar",
    np.uint16: "ushort",
    np.uint32: "uint",
    np.uint64: "ulong",
    np.int8: "char",
    np.int16: "short",
    np.int32: "int",
    np.int64: "long",
    np.float32: "float",
    np.complex64: "cfloat_t",
}

# float64 is only offered when the selected device supports doubles.
if characterize.has_double_support(get_device().device):
    cl_buffer_datatype_dict[np.float64] = "double"


def abspath(myPath):
    """ Get absolute path to resource, works for dev and for PyInstaller

    :param myPath: path of the resource.
    :returns: if ``sys._MEIPASS`` is set, the basename of *myPath* joined
        onto it; otherwise *myPath* joined onto the absolute directory of
        this module.
    """
    # PyInstaller creates a temp folder and stores path in _MEIPASS
    bundle_dir = getattr(sys, "_MEIPASS", None)
    if bundle_dir is not None:
        return os.path.join(bundle_dir, os.path.basename(myPath))

    base_path = os.path.abspath(os.path.dirname(__file__))
    return os.path.join(base_path, myPath)
def test_mix_complex(ctx_factory):
    """Check mixed-dtype binary operators on the device against the host.

    Runs add, sub, mul, truediv and pow over real/complex dtype pairs in
    both orders, with array/array, array/scalar and scalar/array operands.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    dtypes = [
            (np.float32, np.complex64),
            #(np.int32, np.complex64),
            ]

    dev = context.devices[0]
    if has_double_support(dev):
        dtypes.append((np.float32, np.float64))
        # complex128 pairs are left out when the "apple" bug is present.
        if has_struct_arg_count_bug(dev) != "apple":
            dtypes.extend([
                (np.float32, np.complex128),
                (np.float64, np.complex64),
                (np.float64, np.complex128),
                ])

    def make_operand(dtype, is_scalar):
        # Returns (operand used in the device expression, host equivalent).
        if is_scalar:
            value = make_random_array(queue, dtype, 1).get()[0]
            return value, value
        ary = make_random_array(queue, dtype, size)
        return ary, ary.get()

    scalar_combinations = [
            (False, False),
            (False, True),
            (True, False),
            ]

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for dtype_a0, dtype_b0 in dtypes:
            for dtype_a, dtype_b in [
                    (dtype_a0, dtype_b0),
                    (dtype_b0, dtype_a0),
                    ]:
                for is_scalar_a, is_scalar_b in scalar_combinations:
                    ary_a, host_ary_a = make_operand(dtype_a, is_scalar_a)
                    ary_b, host_ary_b = make_operand(dtype_b, is_scalar_b)

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
                                host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    err = la.norm(host_result-dev_result)/la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct