Beispiel #1
0
def test_divide_array(ctx_factory):
    """Test the division of an array and a scalar. """

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.float32, np.complex64)
    from pyopencl.characterize import has_double_support
    if has_double_support(queue.device):
        dtypes = dtypes + (np.float64, np.complex128)

    from itertools import product

    for dtype_a, dtype_b in product(dtypes, repeat=2):

        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)
        c = a / b
        c_gpu = (a_gpu / b_gpu)
        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
        assert c_gpu.dtype is c.dtype

        d = b / a
        d_gpu = (b_gpu / a_gpu)
        assert (np.abs(d_gpu.get() - d) < 1e-3).all()
        assert d_gpu.dtype is d.dtype
Beispiel #2
0
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000, ), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000, ), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            vdot_ab = np.vdot(a, b)
            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4
def get_dot_kernel(ctx,
                   dtype_out,
                   dtype_a=None,
                   dtype_b=None,
                   conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out,
        dtype_a,
        dtype_b,
        conjugate_first,
        has_double_support=has_double_support(ctx.devices[0]))

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = "%s_add(a, b)" % dtname
        neutral_expr = "%s_new(0, 0)" % dtname

    return ReductionKernel(ctx,
                           dtype_out,
                           neutral=neutral_expr,
                           reduce_expr=reduce_expr,
                           map_expr=map_expr,
                           arguments=("const %(tp_a)s *a, "
                                      "const %(tp_b)s *b" % {
                                          "tp_a": dtype_to_ctype(dtype_a),
                                          "tp_b": dtype_to_ctype(dtype_b),
                                      }))
Beispiel #4
0
def test_random(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import RanluxGenerator

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    gen = RanluxGenerator(queue, 5120)

    for ary_size in [300, 301, 302, 303, 10007]:
        for dtype in dtypes:
            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran)
            assert (0 < ran.get()).all()
            assert (ran.get() < 1).all()

            gen.synchronize(queue)

            ran = cl_array.zeros(queue, ary_size, dtype)
            gen.fill_uniform(ran, a=4, b=7)
            assert (4 < ran.get()).all()
            assert (ran.get() < 7).all()

            ran = gen.normal(queue, (10007,), dtype, mu=4, sigma=3)

    dtypes = [np.int32]
    for dtype in dtypes:
        ran = gen.uniform(queue, (10000007,), dtype, a=200, b=300)
        assert (200 <= ran.get()).all()
        assert (ran.get() < 300).all()
Beispiel #5
0
def test_astype(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        from pytest import skip
        skip("double precision not supported on %s" % context.devices[0])

    a_gpu = clrand(queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    assert la.norm(a - a2) == 0, (a, a2)

    a_gpu = clrand(queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2) / la.norm(a) < 1e-7
Beispiel #6
0
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = [np.float32, np.complex64]
    if has_double_support(context.devices[0]):
        dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            vdot_ab = np.vdot(a, b)
            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4
Beispiel #7
0
def _get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="", arg_prep="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
Beispiel #8
0
    def get_write_kernel(self, index_dtype):
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg

        kernel_list_args = []
        kernel_list_arg_values = ""
        user_list_args = []

        for name, dtype in self.list_names_and_dtypes:
            list_name = "plb_%s_list" % name
            list_arg = VectorArg(dtype, list_name)

            kernel_list_args.append(list_arg)
            user_list_args.append(list_arg)

            if name in self.count_sharing:
                kernel_list_arg_values += "%s, " % list_name
                continue

            kernel_list_args.append(VectorArg(index_dtype, "plb_%s_start_index" % name))

            index_name = "plb_%s_index" % name
            user_list_args.append(OtherArg("%s *%s" % (index_ctype, index_name), index_name))

            kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)

        kernel_name = self.name_prefix + "_write"

        from pyopencl.characterize import has_double_support

        src = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=False,
            kernel_name=kernel_name,
            double_support=all(has_double_support(dev) for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=kernel_list_arg_values,
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl=_get_arg_decl(self.arg_decls),
            user_args=_get_arg_list(self.arg_decls),
            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,
            index_type=index_ctype,
        )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes

        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(kernel_list_args + self.arg_decls) + [index_dtype])

        return knl
Beispiel #9
0
def test_bitonic_sort(ctx_factory, size, dtype):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64 and get_pocl_version(dev.platform) <
        (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (
        2,
        size,
        3,
    ),
                      dtype,
                      luxury=None,
                      a=0,
                      b=239482333)
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(s.copy(), axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
Beispiel #10
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
                "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
Beispiel #11
0
def test_divide_inplace_scalar(ctx_factory):
    """Test inplace division of arrays and a scalar."""

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Apple":
        pytest.xfail("Apple CL compiler crashes on this.")

    dtypes = (np.uint8, np.uint16, np.uint32, np.int8, np.int16, np.int32,
              np.float32, np.complex64)
    from pyopencl.characterize import has_double_support
    if has_double_support(queue.device):
        dtypes = dtypes + (np.float64, np.complex128)

    from itertools import product

    for dtype_a, dtype_s in product(dtypes, repeat=2):

        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        s = dtype_s(40)
        a_gpu = cl_array.to_device(queue, a)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a /= s
        except TypeError:
            with np.testing.assert_raises(TypeError):
                a_gpu /= s
        else:
            a_gpu /= s
            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
            assert a_gpu.dtype is a.dtype
Beispiel #12
0
def test_lin_comb_diff(ctx_factory, arg_type):
    ctx = ctx_factory()
    dev, = ctx.devices
    if not has_double_support(dev):
        if arg_type in (np.float64, np.complex128):
            pytest.skip('Device does not support double.')
    n = 100000
    a_np = (np.random.randn(n)).astype(arg_type)
    b_np = (np.random.randn(n)).astype(arg_type)
    c_np = (np.random.randn(n) * 10).astype(arg_type)
    queue = cl.CommandQueue(ctx)

    a_g = cl.array.to_device(queue, a_np)
    b_g = cl.array.to_device(queue, b_np)
    c_g = cl.array.to_device(queue, c_np)
    res_g = cl.array.empty_like(a_g)
    lin_comb_diff = lin_comb_diff_kernel(ctx, arg_type, arg_type,
                                         arg_type, np.float32, 2)
    gs, ls = get_group_sizes(n, dev, lin_comb_diff)

    evt = run_elwise_kernel(lin_comb_diff, queue, gs, ls, n, [],
                            res_g, c_g, a_g, b_g, 2, 3)
    evt.wait()

    # Check on GPU with PyOpenCL Array:
    assert np.linalg.norm((res_g - (c_g + 2 * a_g + 3 * b_g)).get()) <= 2e-4

    # Check on CPU with Numpy:
    res_np = res_g.get()
    assert np.linalg.norm(res_np - (c_np + 2 * a_np + 3 * b_np)) <= 2e-4
Beispiel #13
0
def test_divide_inplace_array(ctx_factory):
    """Test inplace division of arrays."""

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtypes = (np.uint8, np.uint16, np.uint32, np.int8, np.int16, np.int32,
              np.float32, np.complex64)
    from pyopencl.characterize import has_double_support
    if has_double_support(queue.device):
        dtypes = dtypes + (np.float64, np.complex128)

    from itertools import product

    for dtype_a, dtype_b in product(dtypes, repeat=2):
        print(dtype_a, dtype_b)
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(dtype_b)

        a_gpu = cl_array.to_device(queue, a)
        b_gpu = cl_array.to_device(queue, b)

        # ensure the same behavior as inplace numpy.ndarray division
        try:
            a_gpu /= b_gpu
        except TypeError:
            # pass for now, as numpy casts differently for in-place and out-place
            # true_divide
            pass
            # with np.testing.assert_raises(TypeError):
            #     a /= b
        else:
            a /= b
            assert (np.abs(a_gpu.get() - a) < 1e-3).all()
            assert a_gpu.dtype is a.dtype
Beispiel #14
0
def test_bitonic_sort(ctx_factory, size, dtype):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sgs = s.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        sgs.finish()
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(sgs, axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
Beispiel #15
0
def test_astype(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        from pytest import skip
        skip("double precision not supported on %s" % context.devices[0])

    a_gpu = clrand(queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    assert la.norm(a - a2) == 0, (a, a2)

    a_gpu = clrand(queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2) / la.norm(a) < 1e-7
Beispiel #16
0
    def test(ctx_factory):
        context = ctx_factory()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        if has_double_support(context.devices[0]):
            if use_complex:
                dtypes = [np.float32, np.float64, np.complex64, np.complex128]
            else:
                dtypes = [np.float32, np.float64]
        else:
            if use_complex:
                dtypes = [np.float32, np.complex64]
            else:
                dtypes = [np.float32]

        for s in sizes:
            for dtype in dtypes:
                dtype = np.dtype(dtype)

                args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
                if dtype.kind == "c":
                    args = args + dtype.type(1j) * args

                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                my_threshold = threshold
                if dtype.kind == "c" and isinstance(use_complex, float):
                    my_threshold = use_complex

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= my_threshold).all(), (max_err, name, dtype)
Beispiel #17
0
def test_divide_scalar(ctx_factory):
    """Test the division of an array and a scalar."""

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if queue.device.platform.name == "Apple":
        pytest.xfail("Apple CL compiler crashes on this.")

    dtypes = (np.uint8, np.uint16, np.uint32, np.int8, np.int16, np.int32,
              np.float32, np.complex64)
    from pyopencl.characterize import has_double_support
    if has_double_support(queue.device):
        dtypes = dtypes + (np.float64, np.complex128)

    from itertools import product

    for dtype_a, dtype_s in product(dtypes, repeat=2):
        a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(dtype_a)
        s = dtype_s(40)
        a_gpu = cl_array.to_device(queue, a)

        b = a / s
        b_gpu = a_gpu / s
        assert (np.abs(b_gpu.get() - b) < 1e-3).all()
        assert b_gpu.dtype is b.dtype

        c = s / a
        c_gpu = s / a_gpu
        assert (np.abs(c_gpu.get() - c) < 1e-3).all()
        assert c_gpu.dtype is c.dtype
def get_subset_dot_kernel(ctx,
                          dtype_out,
                          dtype_subset,
                          dtype_a=None,
                          dtype_b=None,
                          conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out,
        dtype_a,
        dtype_b,
        conjugate_first,
        has_double_support=has_double_support(ctx.devices[0]),
        index_expr="lookup_tbl[i]")

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(
        ctx,
        dtype_out,
        neutral="0",
        reduce_expr="a+b",
        map_expr=map_expr,
        arguments=("const %(tp_lut)s *lookup_tbl, "
                   "const %(tp_a)s *a, "
                   "const %(tp_b)s *b" % {
                       "tp_lut": dtype_to_ctype(dtype_subset),
                       "tp_a": dtype_to_ctype(dtype_a),
                       "tp_b": dtype_to_ctype(dtype_b),
                   }))
Beispiel #19
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(dtype_a.type(0), dtype_b.type(0),
                                     has_double_support(ctx.devices[0]))

    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        a = "a[i]"
        b = "b[i]"
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
    else:
        a = "a[i]"
        b = "b[i]"

        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx,
                           dtype_out,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr=map_expr,
                           arguments="const %(tp_a)s *a, "
                           "const %(tp_b)s *b" % {
                               "tp_a": dtype_to_ctype(dtype_a),
                               "tp_b": dtype_to_ctype(dtype_b),
                           })
Beispiel #20
0
    def __init__(self, queue, num_work_items,
            luxury=None, seed=None, no_warmup=False,
            use_legacy_init=False, max_work_items=None):
        if luxury is None:
            luxury = 4

        if seed is None:
            from time import time
            seed = int(time()*1e6) % 2<<30

        self.context = queue.context
        self.luxury = luxury
        self.num_work_items = num_work_items

        from pyopencl.characterize import has_double_support
        self.support_double = has_double_support(queue.device)

        self.no_warmup = no_warmup
        self.use_legacy_init = use_legacy_init
        self.max_work_items = max_work_items

        src = """
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab)
            {
              if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
            }
            """ % {
                    "defines": self.generate_settings_defines(),
                    "num_work_items": num_work_items
                }
        prg = cl.Program(queue.context, src).build()

        # {{{ compute work group size

        wg_size = None

        import sys
        import platform
        if ("darwin" in sys.platform
                and "Apple" in queue.device.platform.vendor
                and platform.mac_ver()[0].startswith("10.7")
                and queue.device.type == cl.device_type.CPU):
            wg_size = (1,)

        self.wg_size = wg_size

        # }}}

        self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
        self.state.fill(17)

        prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
                self.state.data)
Beispiel #21
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0), has_double_support(ctx.devices[0]))

    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        a = "a[i]"
        b = "b[i]"
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), a, b)
    else:
        a = "a[i]"
        b = "b[i]"

        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "__global const %(tp_a)s *a, "
            "__global const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
Beispiel #22
0
def test_get_kernels(ctx_factory, res_type, arg_type, weight_type):
    ctx = ctx_factory()
    dev, = ctx.devices
    if not has_double_support(dev):
        for t in res_type, arg_type, weight_type:
            if t in (np.float64, np.complex128):
                pytest.skip('Device does not support double.')
    for length in range(1, 3):
        lin_comb_kernel(ctx, res_type, arg_type, weight_type, length)
Beispiel #23
0
def test_random_float_in_range(ctx_factory,
                               rng_class,
                               ary_size,
                               plot_hist=False):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU \
            and rng_class is RanluxGenerator:
        pytest.xfail("ranlux test fails on POCL + Nvidia,"
                     "at least the Titan V, as of pocl 1.6, 2021-01-20")

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    if rng_class is RanluxGenerator:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if rng_class is RanluxGenerator:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in [4 <= ran_host, ran_host <= 7]:
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()
Beispiel #24
0
    def get_count_kernel(self, index_dtype):
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg
        kernel_list_args = [
                VectorArg(index_dtype, "plb_%s_count" % name)
                for name, dtype in self.list_names_and_dtypes
                if name not in self.count_sharing]

        user_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            name = "plb_loc_%s_count" % name
            user_list_args.append(OtherArg("%s *%s" % (
                index_ctype, name), name))

        kernel_name = self.name_prefix+"_count"

        from pyopencl.characterize import has_double_support
        src = _LIST_BUILDER_TEMPLATE.render(
                is_count_stage=True,
                kernel_name=kernel_name,
                double_support=all(has_double_support(dev) for dev in
                    self.context.devices),
                debug=self.debug,
                do_not_vectorize=self.do_not_vectorize(),
                eliminate_empty_output_lists=self.eliminate_empty_output_lists,

                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
                user_list_arg_decl=_get_arg_decl(user_list_args),
                user_list_args=_get_arg_list(user_list_args),
                user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
                user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
                user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
                arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

                list_names_and_dtypes=self.list_names_and_dtypes,
                count_sharing=self.count_sharing,
                name_prefix=self.name_prefix,
                generate_template=self.generate_template,
                preamble=self.preamble,

                index_type=index_ctype,
                )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls) + [index_dtype])

        return knl
Beispiel #25
0
    def get_count_kernel(self, index_dtype):
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg
        kernel_list_args = [
                VectorArg(index_dtype, "plb_%s_count" % name)
                for name, dtype in self.list_names_and_dtypes
                if name not in self.count_sharing]

        user_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            name = "plb_loc_%s_count" % name
            user_list_args.append(OtherArg("%s *%s" % (
                index_ctype, name), name))

        kernel_name = self.name_prefix+"_count"

        from pyopencl.characterize import has_double_support
        src = _LIST_BUILDER_TEMPLATE.render(
                is_count_stage=True,
                kernel_name=kernel_name,
                double_support=all(has_double_support(dev) for dev in
                    self.context.devices),
                debug=self.debug,
                do_not_vectorize=self.do_not_vectorize(),
                eliminate_empty_output_lists=self.eliminate_empty_output_lists,

                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
                user_list_arg_decl=_get_arg_decl(user_list_args),
                user_list_args=_get_arg_list(user_list_args),
                user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
                user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
                user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
                arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

                list_names_and_dtypes=self.list_names_and_dtypes,
                count_sharing=self.count_sharing,
                name_prefix=self.name_prefix,
                generate_template=self.generate_template,
                preamble=self.preamble,

                index_type=index_ctype,
                )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls) + [index_dtype])

        return knl
Beispiel #26
0
def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
    cl_ctx = ctx_factory()
    if dtype == np.float64 and not has_double_support(cl_ctx.devices[0]):
        pytest.skip("double precision not supported on this device")
    rng = rng_class(cl_ctx)

    size = 10

    with cl.CommandQueue(cl_ctx) as queue:
        rng.uniform(queue, size, dtype)

        if dtype not in (np.int32, np.int64):
            rng.normal(queue, size, dtype)
Beispiel #27
0
def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
    cl_ctx = ctx_factory()
    if dtype == np.float64 and not has_double_support(cl_ctx.devices[0]):
        pytest.skip("double precision not supported on this device")
    rng = rng_class(cl_ctx)

    size = 10

    with cl.CommandQueue(cl_ctx) as queue:
        rng.uniform(queue, size, dtype)

        if dtype not in (np.int32, np.int64):
            rng.normal(queue, size, dtype)
Beispiel #28
0
def test_random_float_in_range(ctx_factory,
                               rng_class,
                               ary_size,
                               plot_hist=False):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    if rng_class is RanluxGenerator:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if rng_class is RanluxGenerator:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in [4 <= ran_host, ran_host <= 7]:
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()
Beispiel #29
0
def test_hankel_01_complex(ctx_factory, ref_src):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        from pytest import skip
        skip("no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    z = (
        np.logspace(-5, 2, n)
        * np.exp(1j * 2 * np.pi * np.random.rand(n)))

    def get_err(check, ref):
        return np.max(np.abs(check-ref)) / np.max(np.abs(ref))

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)

    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)

    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = np.abs(h0_dev.get() - h0_ref)/np.abs(h0_ref)
    rel_err_h1 = np.abs(h1_dev.get() - h1_ref)/np.abs(h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)

    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    if 0:
        import matplotlib.pyplot as pt
        pt.loglog(np.abs(z), rel_err_h0)
        pt.loglog(np.abs(z), rel_err_h1)
        pt.show()
Beispiel #30
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]))

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
Beispiel #31
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]))

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
Beispiel #32
0
def test_hankel_01_complex(ctx_factory, ref_src):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        from pytest import skip
        skip(
            "no double precision support--cannot test complex bessel function")

    n = 10**6
    np.random.seed(11)
    z = (np.logspace(-5, 2, n) * np.exp(1j * 2 * np.pi * np.random.rand(n)))

    def get_err(check, ref):
        return np.max(np.abs(check - ref)) / np.max(np.abs(ref))

    if ref_src == "pyfmmlib":
        pyfmmlib = pytest.importorskip("pyfmmlib")
        h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1)
    elif ref_src == "scipy":
        spec = pytest.importorskip("scipy.special")
        h0_ref = spec.hankel1(0, z)
        h1_ref = spec.hankel1(1, z)

    else:
        raise ValueError("ref_src")

    z_dev = cl_array.to_device(queue, z)

    h0_dev, h1_dev = clmath.hankel_01(z_dev)

    rel_err_h0 = np.abs(h0_dev.get() - h0_ref) / np.abs(h0_ref)
    rel_err_h1 = np.abs(h1_dev.get() - h1_ref) / np.abs(h1_ref)

    max_rel_err_h0 = np.max(rel_err_h0)
    max_rel_err_h1 = np.max(rel_err_h1)

    print("H0", max_rel_err_h0)
    print("H1", max_rel_err_h1)

    assert max_rel_err_h0 < 4e-13
    assert max_rel_err_h1 < 2e-13

    if 0:
        import matplotlib.pyplot as pt
        pt.loglog(np.abs(z), rel_err_h0)
        pt.loglog(np.abs(z), rel_err_h1)
        pt.show()
Beispiel #33
0
def test_random(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    for dtype in dtypes:
        a = clrand(context, queue, (10, 100), dtype=dtype).get()

        assert (0 <= a).all()
        assert (a < 1).all()
Beispiel #34
0
def test_random_float_in_range(ctx_factory, rng_class, ary_size, plot_hist=False):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    if has_double_support(context.devices[0]):
        dtypes = [np.float32, np.float64]
    else:
        dtypes = [np.float32]

    if rng_class is RanluxGenerator:
        gen = rng_class(queue, 5120)
    else:
        gen = rng_class(context)

    for dtype in dtypes:
        print(dtype)
        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()

        assert (0 <= ran.get()).all()
        assert (ran.get() <= 1).all()

        if rng_class is RanluxGenerator:
            gen.synchronize(queue)

        ran = cl_array.zeros(queue, ary_size, dtype)
        gen.fill_uniform(ran, a=4, b=7)
        ran_host = ran.get()

        for cond in [4 <= ran_host,  ran_host <= 7]:
            good = cond.all()
            if not good:
                print(np.where(~cond))
                print(ran_host[~cond])
            assert good

        ran = gen.normal(queue, ary_size, dtype, mu=10, sigma=3)

        if plot_hist:
            import matplotlib.pyplot as pt
            pt.hist(ran.get(), 30)
            pt.show()
Beispiel #35
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64 and get_pocl_version(dev.platform) <
        (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
Beispiel #36
0
def test_pow_neg1_vs_inv(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    if not has_double_support(device):
        from py.test import skip
        skip("double precision not supported on %s" % device)

    a_dev = make_random_array(queue, np.complex128, 20000)

    res1 = (a_dev ** (-1)).get()
    res2 = (1/a_dev).get()
    ref = 1/a_dev.get()

    assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13
    assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13
Beispiel #37
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms, evt = sorterm(m.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
Beispiel #38
0
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            dtypes.extend([np.float64])
        else:
            dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            try:
                vdot_ab = np.vdot(a, b)
            except NotImplementedError:
                import sys
                is_pypy = '__pypy__' in sys.builtin_module_names
                if is_pypy:
                    print("PYPY: VDOT UNIMPLEMENTED")
                    continue
                else:
                    raise

            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
            assert rel_err < 1e-4, rel_err
Beispiel #39
0
def test_dot(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dev = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            dtypes.extend([np.float64])
        else:
            dtypes.extend([np.float64, np.complex128])

    for a_dtype in dtypes:
        for b_dtype in dtypes:
            print(a_dtype, b_dtype)
            a_gpu = general_clrand(queue, (200000,), a_dtype)
            a = a_gpu.get()
            b_gpu = general_clrand(queue, (200000,), b_dtype)
            b = b_gpu.get()

            dot_ab = np.dot(a, b)
            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()

            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4

            try:
                vdot_ab = np.vdot(a, b)
            except NotImplementedError:
                import sys
                is_pypy = "__pypy__" in sys.builtin_module_names
                if is_pypy:
                    print("PYPY: VDOT UNIMPLEMENTED")
                    continue
                else:
                    raise

            vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get()

            rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab)
            assert rel_err < 1e-4, rel_err
Beispiel #40
0
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]),
            index_expr="lookup_tbl[i]")

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=(
                "const %(tp_lut)s *lookup_tbl, "
                "const %(tp_a)s *a, "
                "const %(tp_b)s *b" % {
                    "tp_lut": dtype_to_ctype(dtype_subset),
                    "tp_a": dtype_to_ctype(dtype_a),
                    "tp_b": dtype_to_ctype(dtype_b),
                    }))
Beispiel #41
0
def test_minmax(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if has_double_support(context.devices[0]):
        dtypes = [np.float64, np.float32, np.int32]
    else:
        dtypes = [np.float32, np.int32]

    for what in ["min", "max"]:
        for dtype in dtypes:
            a_gpu = clrand(queue, (200000,), dtype)
            a = a_gpu.get()

            op_a = getattr(np, what)(a)
            op_a_gpu = getattr(cl_array, what)(a_gpu).get()

            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
Beispiel #42
0
def test_clrandom_dtypes(ctx_factory, rng_class, dtype):
    cl_ctx = ctx_factory()
    if dtype == np.float64 and not has_double_support(cl_ctx.devices[0]):
        pytest.skip("double precision not supported on this device")
    rng = rng_class(cl_ctx)

    size = 10

    with cl.CommandQueue(cl_ctx) as queue:
        device = queue.device
        if device.platform.vendor == "The pocl project" \
                and device.type & cl.device_type.GPU \
                and rng_class is make_ranlux_generator:
            pytest.xfail("ranlux test fails on POCL + Nvidia,"
                         "at least the K40, as of pocl 1.6, 2021-01-20")

        rng.uniform(queue, size, dtype)

        if dtype not in (np.int32, np.int64):
            rng.normal(queue, size, dtype)
Beispiel #43
0
def test_pow_neg1_vs_inv(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)
    if has_struct_arg_count_bug(device) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    res1 = (a_dev**(-1)).get()
    res2 = (1 / a_dev).get()
    ref = 1 / a_dev.get()

    assert la.norm(res1 - ref, np.inf) / la.norm(ref) < 1e-13
    assert la.norm(res2 - ref, np.inf) / la.norm(ref) < 1e-13
Beispiel #44
0
def test_pow_neg1_vs_inv(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    device = ctx.devices[0]
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)
    if has_struct_arg_count_bug(device) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    a_dev = make_random_array(queue, np.complex128, 20000)

    res1 = (a_dev ** (-1)).get()
    res2 = (1/a_dev).get()
    ref = 1/a_dev.get()

    assert la.norm(res1-ref, np.inf) / la.norm(ref) < 1e-13
    assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13
Beispiel #45
0
    def test(ctx_factory):
        context = ctx_factory()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        dev = context.devices[0]

        if has_double_support(dev):
            if use_complex and has_struct_arg_count_bug(dev) == "apple":
                dtypes = [np.float32, np.float64, np.complex64]
            elif use_complex:
                dtypes = [np.float32, np.float64, np.complex64, np.complex128]
            else:
                dtypes = [np.float32, np.float64]
        else:
            if use_complex:
                dtypes = [np.float32, np.complex64]
            else:
                dtypes = [np.float32]

        for s in sizes:
            for dtype in dtypes:
                dtype = np.dtype(dtype)

                args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
                if dtype.kind == "c":
                    # args = args + dtype.type(1j) * args
                    args = args + args * dtype.type(1j)

                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                my_threshold = threshold
                if dtype.kind == "c" and isinstance(use_complex, float):
                    my_threshold = use_complex

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= my_threshold).all(), \
                        (max_err, name, dtype)
Beispiel #46
0
def test_subset_minmax(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    if has_double_support(context.devices[0]):
        dtypes = [np.float64, np.float32, np.int32]
    else:
        dtypes = [np.float32, np.int32]

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        a = a_gpu.get()

        meaningful_indices_gpu = cl_array.zeros(
                queue, l_m, dtype=np.int32)
        meaningful_indices = meaningful_indices_gpu.get()
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(
                queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = np.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
Beispiel #47
0
def test_subset_minmax(ctx_factory):
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    if has_double_support(context.devices[0]):
        dtypes = [np.float64, np.float32, np.int32]
    else:
        dtypes = [np.float32, np.int32]

    for dtype in dtypes:
        a_gpu = clrand(queue, (l_a,), dtype)
        a = a_gpu.get()

        meaningful_indices_gpu = cl_array.zeros(
                queue, l_m, dtype=np.int32)
        meaningful_indices = meaningful_indices_gpu.get()
        j = 0
        for i in range(len(meaningful_indices)):
            meaningful_indices[i] = j
            j = j + 1
            if j % gran == 0:
                j = j + 1

        meaningful_indices_gpu = cl_array.to_device(
                queue, meaningful_indices)
        b = a[meaningful_indices]

        min_a = np.min(b)
        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()

        assert min_a_gpu == min_a
Beispiel #48
0
    def test(ctx_getter):
        context = ctx_getter()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        if has_double_support(context.devices[0]):
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for s in sizes:
            for dtype in dtypes:
                args = cl_array.arange(queue, a, b, (b-a)/s, 
                        dtype=np.float32)
                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)
Beispiel #49
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    from pyopencl.characterize import has_double_support
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]))

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = "%s_add(a, b)" % dtname
        neutral_expr = "%s_new(0, 0)" % dtname

    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
            reduce_expr=reduce_expr, map_expr=map_expr,
            arguments=(
                "const %(tp_a)s *a, "
                "const %(tp_b)s *b" % {
                    "tp_a": dtype_to_ctype(dtype_a),
                    "tp_b": dtype_to_ctype(dtype_b),
                    }))
Beispiel #50
0
def test_astype(ctx_getter):
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    from pyopencl.clrandom import rand as clrand

    if not has_double_support(context.devices[0]):
        return

    a_gpu = clrand(context, queue, (2000,), dtype=np.float32)

    a = a_gpu.get().astype(np.float64)
    a2 = a_gpu.astype(np.float64).get()

    assert a2.dtype == np.float64
    assert la.norm(a - a2) == 0, (a, a2)

    a_gpu = clrand(context, queue, (2000,), dtype=np.float64)

    a = a_gpu.get().astype(np.float32)
    a2 = a_gpu.astype(np.float32).get()

    assert a2.dtype == np.float32
    assert la.norm(a - a2)/la.norm(a) < 1e-7
Beispiel #51
0
def test_bitonic_sort(ctx_factory, size, dtype):
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sorter = BitonicSort(ctx)
    sgs, evt = sorter(s.copy(), axis=1)
    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
Beispiel #52
0
    def __init__(self, queue, num_work_items=None,
            luxury=None, seed=None, no_warmup=False,
            use_legacy_init=False, max_work_items=None):
        """
        :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
        :param luxury: the "luxury value" of the generator, and should be 0-4,
            where 0 is fastest and 4 produces the best numbers. It can also be
            >=24, in which case it directly sets the p-value of RANLUXCL.
        :param num_work_items: is the number of generators to initialize,
            usually corresponding to the number of work-items in the NDRange
            RANLUXCL will be used with.  May be `None`, in which case a default
            value is used.
        :param max_work_items: should reflect the maximum number of work-items
            that will be used on any parallel instance of RANLUXCL. So for
            instance if we are launching 5120 work-items on GPU1 and 10240
            work-items on GPU2, GPU1's RANLUXCLTab would be generated by
            calling ranluxcl_intialization with numWorkitems = 5120 while
            GPU2's RANLUXCLTab would use numWorkitems = 10240. However
            maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it
            must be set to the same value for both. (may be `None`)

        .. versionchanged:: 2013.1
            Added default value for `num_work_items`.
        """

        if luxury is None:
            luxury = 4

        if num_work_items is None:
            if queue.device.type & cl.device_type.CPU:
                num_work_items = 8 * queue.device.max_compute_units
            else:
                num_work_items = 64 * queue.device.max_compute_units

        if seed is None:
            from time import time
            seed = int(time()*1e6) % 2 << 30

        self.context = queue.context
        self.luxury = luxury
        self.num_work_items = num_work_items

        from pyopencl.characterize import has_double_support
        self.support_double = has_double_support(queue.device)

        self.no_warmup = no_warmup
        self.use_legacy_init = use_legacy_init
        self.max_work_items = max_work_items

        src = """
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            kernel void init_ranlux(unsigned seeds,
                global ranluxcl_state_t *ranluxcltab)
            {
              if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
            }
            """ % {
                    "defines": self.generate_settings_defines(),
                    "num_work_items": num_work_items
                }
        prg = cl.Program(queue.context, src).build()

        # {{{ compute work group size

        wg_size = None

        import sys
        import platform
        if ("darwin" in sys.platform
                and "Apple" in queue.device.platform.vendor
                and platform.mac_ver()[0].startswith("10.7")
                and queue.device.type & cl.device_type.CPU):
            wg_size = (1,)

        self.wg_size = wg_size

        # }}}

        self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
        self.state.fill(17)

        prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
                self.state.data)
Beispiel #53
0
def test_bessel_j(ctx_factory):
    try:
        import scipy.special as spec
    except ImportError:
        from py.test import skip

        skip("scipy not present--cannot test Bessel function")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if not has_double_support(ctx.devices[0]):
        from py.test import skip

        skip("no double precision support--cannot test bessel function")

    nterms = 30

    try:
        from hellskitchen._native import jfuns2d
    except ImportError:
        use_hellskitchen = False
    else:
        use_hellskitchen = True

    if use_hellskitchen:
        a = np.logspace(-3, 3, 10 ** 6)
    else:
        a = np.logspace(-5, 5, 10 ** 6)

    if use_hellskitchen:
        hellskitchen_result = np.empty((len(a), nterms), dtype=np.complex128)
        for i, a_i in enumerate(a):
            if i % 10000 == 0:
                print "%.1f %%" % (100 * i / len(a))
            ier, fjs, _, _ = jfuns2d(nterms, a_i, 1, 0, 10000)
            hellskitchen_result[i] = fjs[:nterms]
        assert ier == 0

    a_dev = cl_array.to_device(queue, a)

    for n in range(0, nterms):
        cl_bessel = clmath.bessel_jn(n, a_dev).get()
        scipy_bessel = spec.jn(n, a)

        error_scipy = np.max(np.abs(cl_bessel - scipy_bessel))
        assert error_scipy < 1e-10, error_scipy

        if use_hellskitchen:
            hk_bessel = hellskitchen_result[:, n]
            error_hk = np.max(np.abs(cl_bessel - hk_bessel))
            assert error_hk < 1e-10, error_hk
            error_hk_scipy = np.max(np.abs(scipy_bessel - hk_bessel))
            print (n, error_scipy, error_hk, error_hk_scipy)
        else:
            print (n, error_scipy)

        assert not np.isnan(cl_bessel).any()

        if 0 and n == 15:
            import matplotlib.pyplot as pt

            # pt.plot(scipy_bessel)
            # pt.plot(cl_bessel)

            pt.loglog(a, np.abs(cl_bessel - scipy_bessel), label="vs scipy")
            if use_hellskitchen:
                pt.loglog(a, np.abs(cl_bessel - hk_bessel), label="vs hellskitchen")
            pt.legend()
            pt.show()
    def get_write_kernel(self, index_dtype):
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg
        kernel_list_args = []
        kernel_list_arg_values = ""
        user_list_args = []

        for name, dtype in self.list_names_and_dtypes:
            list_name = "plb_%s_list" % name
            list_arg = VectorArg(dtype, list_name)

            kernel_list_args.append(list_arg)
            user_list_args.append(list_arg)

            if name in self.count_sharing:
                kernel_list_arg_values += "%s, " % list_name
                continue

            kernel_list_args.append(
                VectorArg(index_dtype, "plb_%s_start_index" % name))

            index_name = "plb_%s_index" % name
            user_list_args.append(
                OtherArg("%s *%s" % (index_ctype, index_name), index_name))

            kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)

        kernel_name = self.name_prefix + "_write"

        from pyopencl.characterize import has_double_support
        src = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=False,
            kernel_name=kernel_name,
            double_support=all(
                has_double_support(dev) for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=kernel_list_arg_values,
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl=_get_arg_decl(self.arg_decls),
            user_args=_get_arg_list(self.arg_decls),
            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,
            index_type=index_ctype,
        )

        src = str(src)

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        knl.set_scalar_arg_dtypes(
            get_arg_list_scalar_arg_dtypes(kernel_list_args + self.arg_decls) +
            [index_dtype])

        return knl
Beispiel #55
0
cl_buffer_datatype_dict = {
    np.bool: "bool",
    np.uint8: "uchar",
    np.uint16: "ushort",
    np.uint32: "uint",
    np.uint64: "ulong",
    np.int8: "char",
    np.int16: "short",
    np.int32: "int",
    np.int64: "long",
    np.float32: "float",
    np.complex64: "cfloat_t",
}

if characterize.has_double_support(get_device().device):
    cl_buffer_datatype_dict[np.float64] = "double"


def abspath(myPath):
    """ Get absolute path to resource, works for dev and for PyInstaller """

    try:
        # PyInstaller creates a temp folder and stores path in _MEIPASS
        base_path = sys._MEIPASS
        return os.path.join(base_path, os.path.basename(myPath))
    except Exception:
        base_path = os.path.abspath(os.path.dirname(__file__))
        return os.path.join(base_path, myPath)

Beispiel #56
0
def test_mix_complex(ctx_factory):
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    dtypes = [
            (np.float32, np.complex64),
            #(np.int32, np.complex64),
            ]

    dev = context.devices[0]
    if has_double_support(dev) and has_struct_arg_count_bug(dev) == "apple":
        dtypes.extend([
            (np.float32, np.float64),
            ])
    elif has_double_support(dev):
        dtypes.extend([
            (np.float32, np.float64),
            (np.float32, np.complex128),
            (np.float64, np.complex64),
            (np.float64, np.complex128),
            ])

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for dtype_a0, dtype_b0 in dtypes:
            for dtype_a, dtype_b in [
                    (dtype_a0, dtype_b0),
                    (dtype_b0, dtype_a0),
                    ]:
                for is_scalar_a, is_scalar_b in [
                        (False, False),
                        (False, True),
                        (True, False),
                        ]:
                    if is_scalar_a:
                        ary_a = make_random_array(queue, dtype_a, 1).get()[0]
                        host_ary_a = ary_a
                    else:
                        ary_a = make_random_array(queue, dtype_a, size)
                        host_ary_a = ary_a.get()

                    if is_scalar_b:
                        ary_b = make_random_array(queue, dtype_b, 1).get()[0]
                        host_ary_b = ary_b
                    else:
                        ary_b = make_random_array(queue, dtype_b, size)
                        host_ary_b = ary_b.get()

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
                                host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    err = la.norm(host_result-dev_result)/la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct