def test_key_value_sorter(ctx_factory):
    """Check KeyValueSorter output against a grouping computed on the host.

    Random int32 keys below ``nkeys`` and random values are sorted on the
    device; for every key the slice of ``lists`` delimited by ``starts``
    must hold the same values as the host-side grouping.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 10**5
    nkeys = 2000

    from pyopencl.clrandom import rand as clrand
    keys = clrand(queue, n, np.int32, b=nkeys)
    values = clrand(queue, n, np.int32, b=n).astype(np.int64)

    assert np.max(keys.get()) < nkeys

    from pyopencl.algorithm import KeyValueSorter
    sorter = KeyValueSorter(context)
    starts, lists, evt = sorter(
        queue, keys, values, nkeys, starts_dtype=np.int32)

    starts = starts.get()
    lists = lists.get()

    # Reference grouping built on the host: key -> list of its values.
    expected = {}
    for key, value in zip(keys.get(), values.get()):
        expected.setdefault(key, []).append(value)

    for key in range(nkeys):
        begin, stop = starts[key:key + 2]
        assert sorted(expected[key]) == sorted(lists[begin:stop])
def test_comparisons(ctx_factory):
    """Compare device comparison operators with their numpy counterparts.

    Each operator is exercised array-vs-array, array-vs-scalar and
    scalar-vs-array.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
    b_dev = clrand(queue, (ary_len,), dtype=np.float32)

    a = a_dev.get()
    b = b_dev.get()

    import operator as o

    # (device operands, matching host operands), checked in this order.
    operand_pairs = (
        ((a_dev, b_dev), (a, b)),
        ((a_dev, 0), (a, 0)),
        ((0, b_dev), (0, b)),
    )

    for op in (o.eq, o.ne, o.le, o.lt, o.ge, o.gt):
        for dev_operands, host_operands in operand_pairs:
            res_dev = op(*dev_operands)
            res = op(*host_operands)
            assert (res_dev.get() == res).all()
def test_astype(ctx_factory):
    """Check device-side ``astype`` between float32 and float64.

    Skipped when the first device of the context lacks double support.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    device = context.devices[0]
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % context.devices[0])

    # float32 -> float64: widening must agree exactly with numpy.
    single_dev = clrand(queue, (2000,), dtype=np.float32)
    widened_host = single_dev.get().astype(np.float64)
    widened_dev = single_dev.astype(np.float64).get()

    assert widened_dev.dtype == np.float64
    assert la.norm(widened_host - widened_dev) == 0, (widened_host, widened_dev)

    # float64 -> float32: narrowing is compared with a relative tolerance.
    double_dev = clrand(queue, (2000,), dtype=np.float64)
    narrowed_host = double_dev.get().astype(np.float32)
    narrowed_dev = double_dev.astype(np.float32).get()

    assert narrowed_dev.dtype == np.float32
    rel_err = la.norm(narrowed_host - narrowed_dev) / la.norm(narrowed_host)
    assert rel_err < 1e-7
def general_clrand(queue, shape, dtype):
    """Return a random device array of *dtype*, including complex dtypes.

    For a complex *dtype* two independent random arrays of the matching
    real dtype are drawn and combined as ``real + 1j*imag``; any other
    dtype is forwarded to ``pyopencl.clrandom.rand`` unchanged.
    """
    from pyopencl.clrandom import rand as clrand

    dtype = np.dtype(dtype)
    if dtype.kind != "c":
        return clrand(queue, shape, dtype)

    # The real dtype that matches this complex dtype's components.
    real_dtype = dtype.type(0).real.dtype
    real_part = clrand(queue, shape, real_dtype)
    imag_part = clrand(queue, shape, real_dtype)
    return real_part + 1j*imag_part
def test_dot(ctx_getter):
    """Compare ``cl_array.dot`` with ``numpy.dot`` on two random vectors.

    The relative error between device and host results must stay below 1e-4.
    """
    import pyopencl as cl

    # The fixture was previously ignored, leaving ``context`` and ``queue``
    # undefined; build both from it before drawing random data.
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (200000,))
    a = a_gpu.get()
    b_gpu = clrand(context, queue, (200000,))
    b = b_gpu.get()

    dot_ab = numpy.dot(a, b)
    dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

    assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
def test_dot(ctx_getter):
    """Compare ``cl_array.dot`` with ``np.dot`` on two float32 vectors.

    The relative error between device and host results must stay below 1e-4.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    vec_len = 200000

    a_gpu = clrand(context, queue, (vec_len,), np.float32)
    a = a_gpu.get()
    b_gpu = clrand(context, queue, (vec_len,), np.float32)
    b = b_gpu.get()

    expected = np.dot(a, b)
    actual = cl_array.dot(a_gpu, b_gpu).get()

    rel_err = abs(actual-expected)/abs(expected)
    assert rel_err < 1e-4
def test_unique(ctx_factory):
    """Check ``pyopencl.algorithm.unique`` against ``np.unique``.

    Runs once per size in ``scan_test_counts`` on sorted random int32 data
    and triggers a garbage collection after every size.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from gc import collect
    from pyopencl.algorithm import unique
    from pyopencl.clrandom import rand as clrand

    for n in scan_test_counts:
        random_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)

        # The input is sorted on the host and uploaded again.
        sorted_host = np.sort(random_dev.get())
        sorted_dev = cl_array.to_device(queue, sorted_host)

        expected = np.unique(sorted_host)

        unique_dev, count_dev, evt = unique(sorted_dev)
        count = count_dev.get()

        assert (unique_dev.get()[:count] == expected).all()

        collect()
def test_struct_reduce(ctx_factory):
    """Reduce an int32 array into a min/max struct and compare with numpy.

    A ``ReductionKernel`` whose result dtype comes from ``make_mmc_dtype``
    collects the running minimum and maximum; both fields must match
    ``np.min`` / ``np.max`` of the host copy.
    """
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev, = context.devices
    if (dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple"
            and dev.driver_version == "8.12.47 310.40.00.05f01"):
        pytest.skip("causes a compiler hang on Apple/Nv GPU")

    mmc_dtype, mmc_c_decl = make_mmc_dtype(dev)

    # The kernel source must keep its line breaks: it contains ``//`` line
    # comments, which would otherwise swallow every definition after them.
    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    red = ReductionKernel(
        context, mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble)

    minmax = red(a_gpu).get()

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
def test_subset_minmax(ctx_getter):
    """Check ``cl_array.subset_min`` against ``numpy.min`` over an index subset.

    The subset starts at index 0 and then skips every index that is a
    multiple of ``gran``. It is tested for float32 and int32, plus float64
    when the device supports doubles.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    # The double-support probe needs a device to inspect.
    if has_double_support(context.devices[0]):
        dtypes = [numpy.float64, numpy.float32, numpy.int32]
    else:
        dtypes = [numpy.float32, numpy.int32]

    for dtype in dtypes:
        a_gpu = clrand(context, queue, (l_a,), dtype)
        a = a_gpu.get()

        # NOTE(review): assumes zeros/to_device take (context, queue, ...)
        # like clrand does in this API generation -- confirm against the
        # pyopencl.array version this file targets.
        meaningful_indices_gpu = cl_array.zeros(
            context, queue, l_m, dtype=numpy.int32)
        meaningful_indices = meaningful_indices_gpu.get()

        # Fill in 0, 1, 2, 3, 4, 6, 7, ...: after each store, step past the
        # next multiple of ``gran``.
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(
            context, queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = numpy.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
def test_partition(ctx_factory):
    """Check ``pyopencl.algorithm.partition`` against numpy boolean masks.

    For each size in ``scan_test_counts`` random int32 data is split by
    ``ary[i] > 300`` on the device and compared with the host split.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import partition
    from pyopencl.clrandom import rand as clrand

    for n in scan_test_counts:
        print("part", n)

        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        above = a > crit
        true_host = a[above]
        false_host = a[a <= crit]

        true_dev, false_dev, count_true_dev, evt = partition(
            a_dev, "ary[i] > myval", [("myval", crit)])

        n_true = count_true_dev.get()

        assert (true_dev.get()[:n_true] == true_host).all()
        assert (false_dev.get()[:n-n_true] == false_host).all()
def test_elwise_kernel_with_options(ctx_factory):
    """Check that build options passed to ElementwiseKernel take effect.

    The kernel adds one to its input only when ``ADD_ONE`` is defined; the
    define is supplied through ``options`` and the result is compared with
    ``input + 1`` on the host.
    """
    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    in_gpu = clrand(queue, (50,), np.float32)

    options = ['-D', 'ADD_ONE']
    # Preprocessor directives must each start their own line, so the
    # operation string has to keep its line breaks.
    add_one = ElementwiseKernel(
        context,
        "float* out, const float *in",
        """
        out[i] = in[i]
        #ifdef ADD_ONE
            +1
        #endif
        ;
        """,
        options=options,
    )

    out_gpu = cl_array.empty_like(in_gpu)
    add_one(out_gpu, in_gpu)

    gt = in_gpu.get() + 1
    gv = out_gpu.get()
    assert la.norm(gv - gt) < 1e-5
def test_concatenate(ctx_factory):
    """Check ``cl.array.concatenate`` against ``np.concatenate``.

    Three float32 arrays that differ only in their first dimension are
    joined on the device and on the host; the results must match exactly.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_dev = clrand(queue, (5, 15, 20), dtype=np.float32)
    b_dev = clrand(queue, (4, 15, 20), dtype=np.float32)
    c_dev = clrand(queue, (3, 15, 20), dtype=np.float32)

    host_parts = (a_dev.get(), b_dev.get(), c_dev.get())

    cat_dev = cl.array.concatenate((a_dev, b_dev, c_dev))
    cat = np.concatenate(host_parts)

    assert la.norm(cat - cat_dev.get()) == 0
def test_slice(ctx_factory):
    """Check reading from and assigning to random 1-D device slices.

    Three rounds of 20 random ``[start:end]`` ranges each: scaling a slice
    read from the device, assigning a host array into a device slice, and
    assigning a device array into a device slice.
    """
    if _PYPY:
        pytest.xfail("numpypy: spurious as_strided failure")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    tp = np.float32
    ary_len = 20000

    a_gpu = clrand(queue, (ary_len,), dtype=tp)
    b_gpu = clrand(queue, (ary_len,), dtype=tp)
    a = a_gpu.get()
    b = b_gpu.get()

    from random import randrange

    def random_bounds():
        # Draw ``start`` first, then ``end`` from [start, ary_len).
        start = randrange(ary_len)
        return start, randrange(start, ary_len)

    # Read a slice and scale it.
    for _ in range(20):
        start, end = random_bounds()

        scaled_dev = tp(2)*a_gpu[start:end]
        scaled_host = tp(2)*a[start:end]

        assert la.norm(scaled_dev.get() - scaled_host) == 0

    # Assign host data into a device slice.
    for _ in range(20):
        start, end = random_bounds()

        a_gpu[start:end] = tp(2)*b[start:end]
        a[start:end] = tp(2)*b[start:end]

        assert la.norm(a_gpu.get() - a) == 0

    # Assign device data into a device slice.
    for _ in range(20):
        start, end = random_bounds()

        a_gpu[start:end] = tp(2)*b_gpu[start:end]
        a[start:end] = tp(2)*b[start:end]

        assert la.norm(a_gpu.get() - a) == 0
def test_elwise_kernel(ctx_getter):
    """Check a custom ElementwiseKernel computing ``a*x + b*y``.

    The kernel result is compared with the same linear combination built
    from device array arithmetic.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    from pyopencl.elementwise import ElementwiseKernel

    x_gpu = clrand(context, queue, (50,), numpy.float32)
    y_gpu = clrand(context, queue, (50,), numpy.float32)

    lin_comb = ElementwiseKernel(
        context,
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

    z_gpu = cl_array.empty_like(x_gpu)
    lin_comb(5, x_gpu, 6, y_gpu, z_gpu)

    difference = z_gpu - (5*x_gpu+6*y_gpu)
    assert la.norm(difference.get()) < 1e-5
def test_slice(ctx_factory):
    """Check reading from and assigning to random 1-D device slices.

    Three rounds of 20 random ``[start:end]`` ranges each: scaling a slice
    read from the device, assigning a host array into a device slice, and
    assigning a device array into a device slice.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    from random import randrange

    tp = np.float32
    ary_len = 20000

    a_gpu = clrand(queue, (ary_len,), dtype=tp)
    b_gpu = clrand(queue, (ary_len,), dtype=tp)
    a = a_gpu.get()
    b = b_gpu.get()

    def random_bounds():
        # Draw ``start`` first, then ``end`` from [start, ary_len).
        start = randrange(ary_len)
        return start, randrange(start, ary_len)

    # Read a slice and scale it.
    for _ in range(20):
        start, end = random_bounds()

        scaled_dev = tp(2) * a_gpu[start:end]
        scaled_host = tp(2) * a[start:end]

        assert la.norm(scaled_dev.get() - scaled_host) == 0

    # Assign host data into a device slice.
    for _ in range(20):
        start, end = random_bounds()

        a_gpu[start:end] = tp(2) * b[start:end]
        a[start:end] = tp(2) * b[start:end]

        assert la.norm(a_gpu.get() - a) == 0

    # Assign device data into a device slice.
    for _ in range(20):
        start, end = random_bounds()

        a_gpu[start:end] = tp(2) * b_gpu[start:end]
        a[start:end] = tp(2) * b[start:end]

        assert la.norm(a_gpu.get() - a) == 0
def test_if_positive(ctx_factory):
    """Check ``cl_array.maximum`` / ``minimum`` against numpy.

    Both element-wise results must match the host results exactly; the
    device and host maxima are also printed.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_gpu = clrand(queue, (ary_len,), np.float32)
    b_gpu = clrand(queue, (ary_len,), np.float32)
    a = a_gpu.get()
    b = b_gpu.get()

    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)

    print(max_a_b_gpu)
    print(np.maximum(a, b))

    max_err = la.norm(max_a_b_gpu.get() - np.maximum(a, b))
    assert max_err == 0
    min_err = la.norm(min_a_b_gpu.get() - np.minimum(a, b))
    assert min_err == 0
def test_if_positive(ctx_getter):
    """Check ``cl_array.maximum`` / ``minimum`` against numpy.

    Both element-wise results must match the host results exactly; the
    device and host maxima are also printed.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_gpu = clrand(context, queue, (ary_len,), numpy.float32)
    b_gpu = clrand(context, queue, (ary_len,), numpy.float32)
    a = a_gpu.get()
    b = b_gpu.get()

    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)

    print(max_a_b_gpu)
    print(numpy.maximum(a, b))

    max_err = la.norm(max_a_b_gpu.get() - numpy.maximum(a, b))
    assert max_err == 0
    min_err = la.norm(min_a_b_gpu.get() - numpy.minimum(a, b))
    assert min_err == 0
def test_struct_reduce(ctx_factory):
    """Reduce an int32 array into a min/max struct and compare with numpy.

    A ``ReductionKernel`` whose result dtype comes from ``make_mmc_dtype``
    collects the running minimum and maximum; both fields must match
    ``np.min`` / ``np.max`` of the host copy.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])

    # The kernel source must keep its line breaks: it contains ``//`` line
    # comments, which would otherwise swallow every definition after them.
    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    red = ReductionKernel(
        context, mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble)

    minmax = red(a_gpu).get()

    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
def test_diff(ctx_factory):
    """Check ``cl.array.diff`` against ``np.diff`` on random float32 data."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_dev = clrand(queue, (ary_len,), dtype=np.float32)
    a = a_dev.get()

    diff_dev = cl.array.diff(a_dev).get()
    diff_host = np.diff(a)

    err = la.norm(diff_dev - diff_host)
    assert err < 1e-4
def test_sum(ctx_getter):
    """Compare ``cl_array.sum`` with ``np.sum`` on a random float32 vector.

    The relative error between device and host sums must stay below 1e-4.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (200000,), np.float32)
    a = a_gpu.get()

    expected = np.sum(a)
    actual = cl_array.sum(a_gpu).get()

    rel_err = abs(actual-expected)/abs(expected)
    assert rel_err < 1e-4
def test_sum(ctx_getter):
    """Compare ``cl_array.sum`` with ``numpy.sum`` on a random vector.

    The relative error between device and host sums must stay below 1e-4.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(context, queue, (200000,))
    a = a_gpu.get()

    sum_a = numpy.sum(a)

    # The unused ``pycuda.reduction`` import that used to sit here was
    # removed: it made this OpenCL test fail wherever PyCUDA is missing.
    sum_a_gpu = cl_array.sum(a_gpu).get()

    assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
def test_transpose(ctx_factory):
    """Check that ``.T`` of a 3-D device array matches numpy's ``.T``."""
    if _PYPY:
        pytest.xfail("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    # FIXME: not contiguous
    #assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0)))
    transposed_dev = a_gpu.T.get()
    assert np.array_equal(transposed_dev, a.T)
def test_astype(ctx_getter):
    """Check device-side ``astype`` between float32 and float64.

    Skipped (rather than silently passed) when the first device of the
    context lacks double support.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        # Report a skip so the run does not show a pass for a test that
        # checked nothing.
        from pytest import skip
        skip("double precision not supported on %s" % context.devices[0])

    # float32 -> float64: widening must agree exactly with numpy.
    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float32)

    a = a_gpu.get().astype(numpy.float64)
    a2 = a_gpu.astype(numpy.float64).get()

    assert a2.dtype == numpy.float64
    assert la.norm(a - a2) == 0, (a, a2)

    # float64 -> float32: narrowing is compared with a relative tolerance.
    a_gpu = clrand(context, queue, (2000,), dtype=numpy.float64)

    a = a_gpu.get().astype(numpy.float32)
    a2 = a_gpu.astype(numpy.float32).get()

    assert a2.dtype == numpy.float32
    assert la.norm(a - a2)/la.norm(a) < 1e-7
def test_newaxis(ctx_factory):
    """Check ``np.newaxis`` indexing on a device array against numpy.

    Shapes must match, and strides must match on every axis whose extent
    is greater than one.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    a_gpu = clrand(queue, (10, 20, 30), dtype=np.float32)
    a = a_gpu.get()

    b_gpu = a_gpu[:, np.newaxis]
    b = a[:, np.newaxis]

    assert b_gpu.shape == b.shape

    for axis in range(b.ndim):
        if b.shape[axis] <= 1:
            # Axes of extent one are not compared.
            continue
        assert b_gpu.strides[axis] == b.strides[axis]
def test_slice(ctx_getter):
    """Check that random 1-D slices of a device array match numpy slices.

    200 random ``[start:end]`` ranges are read from the device and compared
    exactly with the same slice of the host copy.
    """
    import pyopencl as cl

    # The fixture was previously ignored, leaving ``context`` and ``queue``
    # undefined; build both from it before drawing random data.
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    ary_len = 20000
    a_gpu = clrand(context, queue, (ary_len,))
    a = a_gpu.get()

    from random import randrange

    for _ in range(200):
        start = randrange(ary_len)
        end = randrange(start, ary_len)

        a_gpu_slice = a_gpu[start:end]
        a_slice = a[start:end]

        assert la.norm(a_gpu_slice.get()-a_slice) == 0
def test_view_and_strides(ctx_factory):
    """Check that ``view()`` of a 2-D sub-array keeps shape and strides.

    The final element-wise comparison with the host slice is wrapped in
    ``pytest.raises(AssertionError)``, i.e. it is expected to fail.
    """
    import pytest
    from pyopencl.clrandom import rand as clrand

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    full = clrand(queue, (5, 10), dtype=np.float32)
    sub = full[:3, :5]
    sub_view = sub.view()

    assert sub_view.shape == sub.shape
    assert sub_view.strides == sub.strides

    with pytest.raises(AssertionError):
        assert (sub_view.get() == full.get()[:3, :5]).all()
def test_random(ctx_getter):
    """Check that ``clrand`` samples lie in the half-open interval [0, 1).

    Tested for float32, and for float64 when the device supports doubles.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    dtypes = [numpy.float32]
    if has_double_support(context.devices[0]):
        dtypes.append(numpy.float64)

    for dtype in dtypes:
        samples = clrand(context, queue, (10, 100), dtype=dtype).get()

        assert (0 <= samples).all()
        assert (samples < 1).all()
def test_copy_if(ctx_factory):
    """Check ``pyopencl.algorithm.copy_if`` against a numpy boolean mask.

    For each size in ``scan_test_counts`` the elements greater than 300 are
    selected on the device and compared with the host selection; a garbage
    collection runs after every size.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from gc import collect
    from pyopencl.algorithm import copy_if
    from pyopencl.clrandom import rand as clrand

    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        selected = a[a > crit]

        selected_dev, count_dev = copy_if(
            a_dev, "ary[i] > myval", [("myval", crit)])

        n_selected = count_dev.get()
        assert (selected_dev.get()[:n_selected] == selected).all()

        collect()
def test_view_and_strides(ctx_factory):
    """Check that ``view()`` of a 2-D sub-array keeps shape and strides.

    The final element-wise comparison with the host slice is wrapped in
    ``pytest.raises(AssertionError)``, i.e. it is expected to fail.
    """
    if _PYPY:
        # pytest.xfail raises, so nothing after it runs; the dead ``return``
        # that used to follow was removed.
        pytest.xfail("numpypy: no array creation from __array_interface__")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    x = clrand(queue, (5, 10), dtype=np.float32)
    y = x[:3, :5]
    yv = y.view()

    assert yv.shape == y.shape
    assert yv.strides == y.strides

    with pytest.raises(AssertionError):
        assert (yv.get() == x.get()[:3, :5]).all()
def no_test_slice(ctx_factory):
    """Compare 200 random 1-D device slices with the matching numpy slices.

    The ``no_`` name prefix keeps pytest from collecting this function.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand
    from random import randrange

    ary_len = 20000
    a_gpu = clrand(queue, (ary_len,))
    a = a_gpu.get()

    for _ in range(200):
        # Draw ``start`` first, then ``end`` from [start, ary_len).
        start = randrange(ary_len)
        end = randrange(start, ary_len)

        dev_slice = a_gpu[start:end]
        host_slice = a[start:end]

        assert la.norm(dev_slice.get() - host_slice) == 0
def test_partition(ctx_factory):
    """Check ``pyopencl.algorithm.partition`` against numpy boolean masks.

    For each size in ``scan_test_counts`` random int32 data is split by
    ``ary[i] > 300`` on the device and compared with the host split.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import partition
    from pyopencl.clrandom import rand as clrand

    for n in scan_test_counts:
        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
        a = a_dev.get()

        crit = a_dev.dtype.type(300)
        true_host = a[a > crit]
        false_host = a[a <= crit]

        true_dev, false_dev, count_true_dev = partition(
            a_dev, "ary[i] > myval", [("myval", crit)])

        n_true = count_true_dev.get()

        assert (true_dev.get()[:n_true] == true_host).all()
        assert (false_dev.get()[:n-n_true] == false_host).all()
def test_minmax(ctx_factory):
    """Check ``cl_array.min`` / ``max`` against numpy for several dtypes.

    float32 and int32 are always tested; float64 is tested first when the
    device supports doubles.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    dtypes = (
        [np.float64, np.float32, np.int32]
        if has_double_support(context.devices[0])
        else [np.float32, np.int32])

    for what in ["min", "max"]:
        # Look up the reduction with the same name on both sides.
        host_op = getattr(np, what)
        dev_op = getattr(cl_array, what)

        for dtype in dtypes:
            a_gpu = clrand(queue, (200000,), dtype)
            a = a_gpu.get()

            op_a = host_op(a)
            op_a_gpu = dev_op(a_gpu).get()

            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
def test_unique(ctx_factory):
    """Check ``pyopencl.algorithm.unique`` against ``np.unique``.

    Runs once per size in ``scan_test_counts`` on sorted random int32 data
    and triggers a garbage collection after every size.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from gc import collect
    from pyopencl.algorithm import unique
    from pyopencl.clrandom import rand as clrand

    for n in scan_test_counts:
        random_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)

        # The input is sorted on the host and uploaded again.
        sorted_host = np.sort(random_dev.get())
        sorted_dev = cl_array.to_device(queue, sorted_host)

        expected = np.unique(sorted_host)

        unique_dev, count_dev = unique(sorted_dev)
        count = count_dev.get()

        assert (unique_dev.get()[:count] == expected).all()

        collect()
def test_event_management(ctx_factory):
    """Check how many events device arrays hold after various operations.

    A fresh random array holds one event, ``finish()`` leaves none, every
    arithmetic result holds one, ten fills leave ten events on the target,
    and after a further thousand fills fewer than 100 remain.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    x = clrand(queue, (5, 10), dtype=np.float32)
    assert len(x.events) == 1, len(x.events)

    x.finish()
    assert len(x.events) == 0

    # Each operation yields a new array that must carry exactly one event.
    arithmetic_ops = (
        lambda ary: ary + ary,
        lambda ary: ary * ary,
        lambda ary: 2 * ary,
        lambda ary: 2 / ary,
        lambda ary: ary / 2,
        lambda ary: ary**2,
        lambda ary: 2**ary,
    )
    for apply_op in arithmetic_ops:
        y = apply_op(x)
        assert len(y.events) == 1

    for _ in range(10):
        x.fill(0)

    assert len(x.events) == 10

    for _ in range(1000):
        x.fill(0)

    assert len(x.events) < 100
def test_subset_minmax(ctx_getter):
    """Check ``cl_array.subset_min`` against ``numpy.min`` over an index subset.

    The subset starts at index 0 and then skips every index that is a
    multiple of ``gran``. It is tested for float32 and int32, plus float64
    when the device supports doubles.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    # The double-support probe needs a device to inspect.
    if has_double_support(context.devices[0]):
        dtypes = [numpy.float64, numpy.float32, numpy.int32]
    else:
        dtypes = [numpy.float32, numpy.int32]

    for dtype in dtypes:
        a_gpu = clrand(context, queue, (l_a, ), dtype)
        a = a_gpu.get()

        # NOTE(review): assumes zeros/to_device take (context, queue, ...)
        # like clrand does in this API generation -- confirm against the
        # pyopencl.array version this file targets.
        meaningful_indices_gpu = cl_array.zeros(
            context, queue, l_m, dtype=numpy.int32)
        meaningful_indices = meaningful_indices_gpu.get()

        # Fill in 0, 1, 2, 3, 4, 6, 7, ...: after each store, step past the
        # next multiple of ``gran``.
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(
            context, queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = numpy.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
def test_bitwise(ctx_factory):
    """Check bitwise operators on device arrays against numpy.

    For every ordered pair of signed integer dtypes this tests ``&``, ``|``
    and ``^`` (array-array, array-scalar, scalar-array), their in-place
    forms (which must return the same object), and unary ``~``.
    """
    if _PYPY:
        pytest.xfail("numpypy: missing bitwise ops")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from itertools import product

    dtypes = [np.dtype(t) for t in (np.int64, np.int32, np.int16, np.int8)]

    from pyopencl.clrandom import rand as clrand
    import operator as o

    ary_len = 16
    int32_min = np.iinfo(np.int32).min
    int32_max = np.iinfo(np.int32).max

    def rand_int64(shape):
        # int64 samples drawn with bounds a=int32_min, b=int32_max + 1.
        return clrand(queue, shape, a=int32_min, b=1 + int32_max,
                      dtype=np.int64)

    for a_dtype, b_dtype in product(dtypes, dtypes):
        np.random.seed(10)

        a_dev = rand_int64((ary_len, )).astype(a_dtype)
        b_dev = rand_int64((ary_len, )).astype(b_dtype)

        a = a_dev.get()
        b = b_dev.get()
        s = int(rand_int64(()).astype(b_dtype).get())

        # Out-of-place operators: (device operands, host operands).
        binary_cases = (
            ((a_dev, b_dev), (a, b)),
            ((a_dev, s), (a, s)),
            ((s, b_dev), (s, b)),
        )
        for op in [o.and_, o.or_, o.xor]:
            for dev_operands, host_operands in binary_cases:
                res_dev = op(*dev_operands)
                res = op(*host_operands)
                assert (res_dev.get() == res).all()

        # In-place operators must hand back the very object they modified.
        for op in [o.iand, o.ior, o.ixor]:
            for rhs_dev, rhs_host in ((b_dev, b), (s, s)):
                res_dev = a_dev.copy()
                op_res = op(res_dev, rhs_dev)
                assert op_res is res_dev

                res = a.copy()
                op(res, rhs_host)

                assert (res_dev.get() == res).all()

        # Test unary ~
        res_dev = ~a_dev
        res = ~a  # pylint:disable=invalid-unary-operand-type
        assert (res_dev.get() == res).all()
} minmax_collector agg_mmc(minmax_collector a, minmax_collector b) { minmax_collector result = a; if (b.cur_min < result.cur_min) result.cur_min = b.cur_min; if (b.cur_max > result.cur_max) result.cur_max = b.cur_max; return result; } """ from pyopencl.clrandom import rand as clrand a_gpu = clrand(queue, (20000, ), dtype=np.int32, a=0, b=10**6) a = a_gpu.get() from pyopencl.reduction import ReductionKernel red = ReductionKernel(ctx, mmc_dtype, neutral="mmc_neutral()", reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])", arguments="__global int *x", preamble=preamble) minmax = red(a_gpu).get() assert abs(minmax["cur_min"] - np.min(a)) < 1e-5 assert abs(minmax["cur_max"] - np.max(a)) < 1e-5