Example #1
0
def test_custom_type_fill(ctx_factory):
    """Zero-fill a device array of a registered struct dtype and verify it.

    The test is skipped when ``has_struct_arg_count_bug`` returns a truthy
    value for the queue's device.
    """
    queue = cl.CommandQueue(ctx_factory())

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.skip("device has LLVM arg counting bug")

    # Struct with three int32 members.
    host_dtype = np.dtype(
        [(member, np.int32) for member in ("cur_min", "cur_max", "pad")])

    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    type_name = "mmc_type"
    # The C declaration returned alongside the dtype is not used here.
    matched_dtype, _ = match_dtype_to_c_struct(
        queue.device, type_name, host_dtype)
    registered_dtype = get_or_register_dtype(type_name, matched_dtype)

    count = 1000
    dev_ary = cl.array.empty(queue, count, dtype=registered_dtype)
    dev_ary.fill(np.zeros((), registered_dtype))

    host_ary = dev_ary.get()
    expected = np.zeros(count, registered_dtype)

    assert np.array_equal(expected, host_ary)
Example #2
0
def test_custom_type_fill(ctx_factory):
    """Check that ``fill`` works on a device array with a custom struct dtype.

    Skips when ``has_struct_arg_count_bug`` reports a problem for the device
    backing the command queue.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.skip("device has LLVM arg counting bug")

    struct_fields = [
        ("cur_min", np.int32),
        ("cur_max", np.int32),
        ("pad", np.int32),
    ]

    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    struct_name = "mmc_type"
    # Index 0 of the result is the dtype; the C declaration at index 1 is
    # not needed by this test.
    struct_dtype = match_dtype_to_c_struct(
        queue.device, struct_name, np.dtype(struct_fields))[0]
    struct_dtype = get_or_register_dtype(struct_name, struct_dtype)

    nentries = 1000
    filled = cl.array.empty(queue, nentries, dtype=struct_dtype)
    filled.fill(np.zeros((), struct_dtype))

    assert np.array_equal(np.zeros(nentries, struct_dtype), filled.get())
Example #3
0
    def __init__(self,
                 ctx_getter=cl.create_some_context,
                 enable_extents=False):
        """Build a tree and an FMM traversal and keep their host copies.

        :arg ctx_getter: zero-argument callable returning an OpenCL context.
        :arg enable_extents: when true, random target radii are generated.
            They are not passed to the tree builder in this version.

        Sets ``self.tree``, ``self.trav``, ``self.input`` and ``self.pot``.
        """
        cl_ctx = ctx_getter()
        queue = cl.CommandQueue(cl_ctx)

        from pyopencl.characterize import has_struct_arg_count_bug
        if has_struct_arg_count_bug(queue.device):
            pytest.xfail(
                "won't work on devices with the struct arg count issue")

        logging.basicConfig(level=logging.INFO)

        dims = 2
        nsources = ntargets = 9000000
        dtype = np.float32

        from boxtree.fmm import drive_fmm
        sources = p_normal(queue, nsources, dims, dtype, seed=15)
        # Targets and radii are generated but left out of the tree build
        # below.
        targets = p_normal(queue, ntargets, dims, dtype, seed=15)

        from pyopencl.clrandom import PhiloxGenerator
        rng = PhiloxGenerator(queue.context, seed=12)

        target_radii = (
            2**rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)
            if enable_extents else None)

        from boxtree import TreeBuilder
        build_tree = TreeBuilder(cl_ctx)

        # Disabled keyword arguments: targets=targets,
        # target_radii=target_radii, stick_out_factor=0.25
        tree, _ = build_tree(
            queue, sources, max_particles_in_box=30, debug=True)

        from boxtree.traversal import FMMTraversalBuilder
        build_traversal = FMMTraversalBuilder(cl_ctx)
        dev_trav, _ = build_traversal(queue, tree, debug=True)

        # One unit weight per source.
        weights = np.ones(nsources)
        weights_sum = np.sum(weights)

        host_trav = dev_trav.get(queue=queue)
        host_tree = host_trav.tree

        self.tree, self.trav = host_tree, host_trav
        self.input = [host_tree, weights, weights_sum, host_trav]
        self.pot = None
Example #4
0
def test_fmm_float32(ctx_getter=cl.create_some_context, enable_extents=True):
    """Run ``drive_fmm`` with unit weights on float32 particles and time it.

    :arg ctx_getter: zero-argument callable returning an OpenCL context.
    :arg enable_extents: when true, random target radii are passed to the
        tree builder; otherwise *None* is passed.

    Prints the wall-clock time spent in ``drive_fmm`` and asserts that every
    entry of the returned potential equals the sum of the weights.
    """
    from time import time

    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    from pyopencl.characterize import has_struct_arg_count_bug
    if has_struct_arg_count_bug(queue.device):
        pytest.xfail("won't work on devices with the struct arg count issue")

    logging.basicConfig(level=logging.INFO)

    dims = 2
    nsources = ntargets = 3000000
    dtype = np.float32

    from boxtree.fmm import drive_fmm
    sources = p_normal(queue, nsources, dims, dtype, seed=15)
    targets = p_normal(queue, ntargets, dims, dtype, seed=15)

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=12)

    target_radii = (
        2**rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)
        if enable_extents else None)

    from boxtree import TreeBuilder
    build_tree = TreeBuilder(ctx)

    tree, _ = build_tree(
        queue, sources,
        targets=targets,
        max_particles_in_box=30,
        target_radii=target_radii,
        stick_out_factor=0.25,
        debug=True)

    from boxtree.traversal import FMMTraversalBuilder
    build_traversal = FMMTraversalBuilder(ctx)
    dev_trav, _ = build_traversal(queue, tree, debug=True)

    # One unit weight per source.
    weights = np.ones(nsources)
    weights_sum = np.sum(weights)

    host_trav = dev_trav.get(queue=queue)
    wrangler = ConstantOneExpansionWrangler(host_trav.tree)

    t_start = time()
    pot = drive_fmm(host_trav, wrangler, weights)
    print(time() - t_start)

    assert (pot == weights_sum).all()
Example #5
0
def test_dot(ctx_factory):
    """Compare device ``dot`` and ``vdot`` against numpy for all dtype pairs.

    Requires ``mako``. Double-precision dtypes are only tested when the
    device supports them, and complex128 is left out when
    ``has_struct_arg_count_bug`` reports ``"apple"``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    device = context.devices[0]

    dtypes = [np.float32, np.complex64]
    if has_double_support(device):
        dtypes.append(np.float64)
        if has_struct_arg_count_bug(device) != "apple":
            dtypes.append(np.complex128)

    shape = (200000,)
    rtol = 1e-4

    for dtype_a in dtypes:
        for dtype_b in dtypes:
            print(dtype_a, dtype_b)
            dev_a = general_clrand(queue, shape, dtype_a)
            host_a = dev_a.get()
            dev_b = general_clrand(queue, shape, dtype_b)
            host_b = dev_b.get()

            ref_dot = np.dot(host_a, host_b)
            dev_dot = cl_array.dot(dev_a, dev_b).get()

            assert abs(dev_dot - ref_dot) / abs(ref_dot) < rtol

            try:
                ref_vdot = np.vdot(host_a, host_b)
            except NotImplementedError:
                # Only tolerated on PyPy; anywhere else this is an error.
                import sys
                if "__pypy__" not in sys.builtin_module_names:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            dev_vdot = cl_array.vdot(dev_a, dev_b).get()

            rel_err = abs(dev_vdot - ref_vdot) / abs(ref_vdot)
            assert rel_err < rtol, rel_err
Example #6
0
def test_dot(ctx_factory):
    """Check ``cl_array.dot`` and ``cl_array.vdot`` against numpy.

    Every ordered pair of the selected dtypes is tested on random vectors of
    length 200000 with a relative tolerance of 1e-4. The test needs
    ``mako`` and is skipped without it.
    """
    from pytest import importorskip
    importorskip("mako")

    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    dev = cl_ctx.devices[0]

    # Single precision is always tested; double precision depends on the
    # device, and complex128 additionally on the "apple" struct arg bug.
    tested_dtypes = [np.float32, np.complex64]
    if has_double_support(dev):
        if has_struct_arg_count_bug(dev) == "apple":
            tested_dtypes += [np.float64]
        else:
            tested_dtypes += [np.float64, np.complex128]

    def relative_error(value, reference):
        """Return ``|value - reference| / |reference|``."""
        return abs(value - reference) / abs(reference)

    for a_dtype in tested_dtypes:
        for b_dtype in tested_dtypes:
            print(a_dtype, b_dtype)
            a_dev = general_clrand(queue, (200000,), a_dtype)
            a_host = a_dev.get()
            b_dev = general_clrand(queue, (200000,), b_dtype)
            b_host = b_dev.get()

            expected_dot = np.dot(a_host, b_host)
            actual_dot = cl_array.dot(a_dev, b_dev).get()

            assert relative_error(actual_dot, expected_dot) < 1e-4

            try:
                expected_vdot = np.vdot(a_host, b_host)
            except NotImplementedError:
                import sys
                running_on_pypy = '__pypy__' in sys.builtin_module_names
                if not running_on_pypy:
                    raise
                print("PYPY: VDOT UNIMPLEMENTED")
                continue

            actual_vdot = cl_array.vdot(a_dev, b_dev).get()

            rel_err = relative_error(actual_vdot, expected_vdot)
            assert rel_err < 1e-4, rel_err
Example #7
0
def test_pow_neg1_vs_inv(ctx_factory):
    """Check that ``a**(-1)`` and ``1/a`` on the device match the host inverse.

    Skipped when the device lacks double support; xfailed when
    ``has_struct_arg_count_bug`` reports ``"apple"``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)
    if has_struct_arg_count_bug(dev) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    dev_ary = make_random_array(queue, np.complex128, 20000)

    via_pow = (dev_ary ** (-1)).get()
    via_div = (1/dev_ary).get()
    expected = 1/dev_ary.get()

    # Max-norm error relative to the default norm of the reference.
    tol = 1e-13
    for candidate in (via_pow, via_div):
        assert la.norm(candidate-expected, np.inf) / la.norm(expected) < tol
Example #8
0
def test_pow_neg1_vs_inv(ctx_factory):
    """Compare device-side ``a**(-1)`` and ``1/a`` with a host reference.

    Uses 20000 random complex128 values. The test is skipped on devices
    without double support and xfailed when ``has_struct_arg_count_bug``
    returns ``"apple"``.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    device = cl_ctx.devices[0]
    if not has_double_support(device):
        from pytest import skip
        skip("double precision not supported on %s" % device)
    if has_struct_arg_count_bug(device) == "apple":
        from pytest import xfail
        xfail("apple struct arg counting broken")

    values_dev = make_random_array(queue, np.complex128, 20000)

    pow_result = (values_dev**(-1)).get()
    div_result = (1 / values_dev).get()
    reference = 1 / values_dev.get()

    def scaled_max_error(result):
        """Return the inf-norm of the error divided by the reference norm."""
        return la.norm(result - reference, np.inf) / la.norm(reference)

    assert scaled_max_error(pow_result) < 1e-13
    assert scaled_max_error(div_result) < 1e-13
Example #9
0
    def test(ctx_factory):
        """Compare the ``clmath`` function *name* with its numpy counterpart.

        The function is evaluated on ``arange(a, b, (b - a) / s)`` for every
        ``s`` in *sizes* and every selected dtype. The maximum absolute
        difference must not exceed the threshold.
        """
        context = ctx_factory()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        dev = context.devices[0]

        # float32 is always tested; float64 needs device support; complex
        # types are added only if requested, and complex128 is dropped when
        # the "apple" struct arg bug is reported.
        double_ok = has_double_support(dev)
        dtypes = [np.float32]
        if double_ok:
            dtypes.append(np.float64)
        if use_complex:
            dtypes.append(np.complex64)
            if double_ok and has_struct_arg_count_bug(dev) != "apple":
                dtypes.append(np.complex128)

        for s in sizes:
            for dtype in map(np.dtype, dtypes):
                is_complex = dtype.kind == "c"

                args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
                if is_complex:
                    # Give the arguments an imaginary part equal to the
                    # real part.
                    args = args + args * dtype.type(1j)

                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                # A float-valued use_complex overrides the threshold for
                # complex dtypes.
                if is_complex and isinstance(use_complex, float):
                    my_threshold = use_complex
                else:
                    my_threshold = threshold

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= my_threshold).all(), \
                        (max_err, name, dtype)
Example #10
0
    def test(ctx_factory):
        """Check the ``clmath`` function *name* against numpy on a value grid.

        For each ``s`` in *sizes* and each selected dtype the device result
        on ``arange(a, b, (b-a)/s)`` is compared with the numpy result.
        """
        cl_ctx = ctx_factory()
        queue = cl.CommandQueue(cl_ctx)

        device_func = getattr(clmath, name)
        host_func = getattr(np, numpy_func_names.get(name, name))

        device = cl_ctx.devices[0]

        real_dtypes = [np.float32]
        complex_dtypes = [np.complex64]
        if has_double_support(device):
            real_dtypes = [np.float32, np.float64]
            # complex128 is only usable without the "apple" struct arg bug.
            if use_complex and has_struct_arg_count_bug(device) != "apple":
                complex_dtypes = [np.complex64, np.complex128]

        if use_complex:
            tested_dtypes = real_dtypes + complex_dtypes
        else:
            tested_dtypes = real_dtypes

        for s in sizes:
            for dtype in tested_dtypes:
                dtype = np.dtype(dtype)

                dev_args = cl_array.arange(queue, a, b, (b-a)/s, dtype=dtype)
                if dtype.kind == "c":
                    # Imaginary part equal to the real part.
                    dev_args = dev_args + dev_args * dtype.type(1j)

                dev_values = device_func(dev_args).get()
                host_values = host_func(dev_args.get())

                # A float-valued use_complex replaces the threshold for
                # complex dtypes.
                use_override = (
                    dtype.kind == "c" and isinstance(use_complex, float))
                my_threshold = use_complex if use_override else threshold

                max_err = np.max(np.abs(host_values - dev_values))
                assert (max_err <= my_threshold).all(), \
                        (max_err, name, dtype)
Example #11
0
def test_mix_complex(ctx_factory):
    """Check mixed real/complex arithmetic between device arrays and scalars.

    For each operator, dtype pair (in both orders) and array/scalar
    combination, the device result is compared with the result of the same
    operation on host data, with a relative tolerance of 1e-4.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 10

    # The (np.int32, np.complex64) pair is deliberately not tested.
    dtype_pairs = [(np.float32, np.complex64)]

    dev = context.devices[0]
    if has_double_support(dev):
        dtype_pairs.append((np.float32, np.float64))
        # complex128 is left out when the "apple" struct arg bug is reported.
        if has_struct_arg_count_bug(dev) != "apple":
            dtype_pairs.extend([
                (np.float32, np.complex128),
                (np.float64, np.complex64),
                (np.float64, np.complex128),
                ])

    def make_operand(dtype, as_scalar):
        """Return ``(operand, host_operand)`` for a random scalar or array."""
        if as_scalar:
            scalar = make_random_array(queue, dtype, 1).get()[0]
            return scalar, scalar

        dev_ary = make_random_array(queue, dtype, size)
        return dev_ary, dev_ary.get()

    # At least one operand must be a device array.
    scalar_flags = [
            (False, False),
            (False, True),
            (True, False),
            ]

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for dtype_a0, dtype_b0 in dtype_pairs:
            for dtype_a, dtype_b in [
                    (dtype_a0, dtype_b0),
                    (dtype_b0, dtype_a0),
                    ]:
                for is_scalar_a, is_scalar_b in scalar_flags:
                    ary_a, host_ary_a = make_operand(dtype_a, is_scalar_a)
                    ary_b, host_ary_b = make_operand(dtype_b, is_scalar_b)

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: %s DEV_DTYPE: %s" % (
                                host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    diff_norm = la.norm(host_result-dev_result)
                    err = diff_norm/la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct
Example #12
0
def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
    """Emit invoker source that passes value (non-array) arguments.

    :arg gen: callable that receives each piece of generated Python source.
    :arg kernel: object whose ``name`` is used in warning messages.
    :arg cl_kernel: OpenCL kernel. Its ``context.devices`` are checked for
        the struct argument counting bug, and its ``num_args`` must match
        the number of OpenCL arguments accounted for here.
    :arg impl_arg_info: sequence of argument descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :arg options: object providing ``skip_arg_checks``.
    :returns: a dict mapping each index into *impl_arg_info* to the index of
        the first OpenCL kernel argument used for it.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    """
    from warnings import warn

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    devices = cl_kernel.context.devices

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # Without the check available, assume no device has the bug.
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                for dev in devices]

    # The workaround is only enabled if *every* device is affected; a mixed
    # set of devices merely triggers a warning for complex arguments.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Number of floating point scalars passed so far (a complex counts as
    # two).
    fp_arg_count = 0

    for arg_idx, arg in enumerate(impl_arg_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if arg.arg_class is not lp.ValueArg:
            assert issubclass(arg.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen("# {{{ process %s" % arg.name)
        gen("")

        if not options.skip_arg_checks:
            gen("""
                if {name} is None:
                    raise RuntimeError("input argument '{name}' must "
                        "be supplied")
                """.format(name=arg.name))

        if sys.version_info < (2, 7) and arg.dtype.kind == "i":
            gen("# cast to long to avoid trouble with struct packing")
            gen("%s = long(%s)" % (arg.name, arg.name))
            gen("")

        if arg.dtype.char == "V":
            # Struct-typed values are handed to set_arg without packing.
            gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
            cl_arg_idx += 1

        elif arg.dtype.kind == "c":
            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if arg.dtype == np.complex64:
                arg_char = "f"
            elif arg.dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % arg.dtype)

            # NOTE(review): the origin of the limit of 8 floating point
            # arguments is not visible here -- confirm against the bug
            # report referenced above.
            if (work_around_arg_count_bug
                    and arg.dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary part as two separate
                # kernel arguments.
                gen(
                        "buf = _lpy_pack('{arg_char}', {arg_var}.real)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1

                gen(
                        "buf = _lpy_pack('{arg_char}', {arg_var}.imag)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1
            else:
                # Regular case: pack both parts into one kernel argument.
                gen(
                        "buf = _lpy_pack('{arg_char}{arg_char}', "
                        "{arg_var}.real, {arg_var}.imag)"
                        .format(arg_char=arg_char, arg_var=arg.name))
                gen(
                        "cl_kernel.set_arg({cl_arg_idx}, buf)"
                        .format(cl_arg_idx=cl_arg_idx))
                cl_arg_idx += 1

            fp_arg_count += 2

        else:
            if arg.dtype.kind == "f":
                fp_arg_count += 1

            gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
                    % (cl_arg_idx, arg.dtype.char, arg.name))

            cl_arg_idx += 1

        gen("")

        gen("# }}}")
        gen("")

    assert cl_arg_idx == cl_kernel.num_args

    return arg_idx_to_cl_arg_idx
Example #13
0
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build the invoker statements that pass value (non-array) arguments.

    :arg kernel: provides ``options`` (read for ``skip_arg_checks``) and
        ``name`` (used in warning messages).
    :arg devices: sequence of devices. Entries may be *None*; those are
        treated as not having the struct argument counting bug, and a
        warning is issued.
    :arg implemented_data_info: sequence of descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)``: the
        generated statements wrapped in a ``genpy.Suite``, a dict mapping
        each index into *implemented_data_info* to the index of the first
        OpenCL argument used for it, and the total number of OpenCL
        arguments accounted for.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    :raises LoopyError: for a value argument whose dtype is not handled.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # Without the check available, assume no device has the bug.
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed"
                .format(knl_name=kernel.name))

    # The workaround is only enabled if *every* device is affected; a mixed
    # set of devices merely triggers a warning for complex arguments.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Number of floating point scalars passed so far (a complex counts as
    # two).
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            py_type = "long" if sys.version_info < (3,) else "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # Composite values are handed to set_arg without packing.
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            # NOTE(review): the origin of the limit of 8 floating point
            # arguments is not visible here -- confirm against the bug
            # report referenced above.
            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary part as two separate
                # kernel arguments.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                # Regular case: pack both parts into one kernel argument.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
Example #14
0
def test_mix_complex(ctx_factory):
    """Verify arithmetic that mixes real and complex operands on the device.

    Each operator is applied to every dtype pair in both orders, with the
    operands being two arrays, an array and a scalar, or a scalar and an
    array. The device result must agree with the host result to a relative
    error below 1e-4.
    """
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)

    size = 10

    # The (np.int32, np.complex64) pair is deliberately not tested.
    tested_pairs = [(np.float32, np.complex64)]

    device = cl_ctx.devices[0]
    double_ok = has_double_support(device)
    if double_ok and has_struct_arg_count_bug(device) == "apple":
        # complex128 is left out on devices reporting the "apple" bug.
        tested_pairs.append((np.float32, np.float64))
    elif double_ok:
        tested_pairs += [
            (np.float32, np.float64),
            (np.float32, np.complex128),
            (np.float64, np.complex64),
            (np.float64, np.complex128),
        ]

    def random_operand(dtype, scalar):
        """Return ``(operand, host_operand)`` for a random scalar or array."""
        if not scalar:
            on_device = make_random_array(queue, dtype, size)
            return on_device, on_device.get()

        value = make_random_array(queue, dtype, 1).get()[0]
        return value, value

    from operator import add, mul, sub, truediv
    for op in [add, sub, mul, truediv, pow]:
        for first_dtype, second_dtype in tested_pairs:
            for dtype_a, dtype_b in [
                (first_dtype, second_dtype),
                (second_dtype, first_dtype),
            ]:
                # At least one operand must be a device array.
                for is_scalar_a, is_scalar_b in [
                    (False, False),
                    (False, True),
                    (True, False),
                ]:
                    ary_a, host_ary_a = random_operand(dtype_a, is_scalar_a)
                    ary_b, host_ary_b = random_operand(dtype_b, is_scalar_b)

                    print(op, dtype_a, dtype_b, is_scalar_a, is_scalar_b)
                    dev_result = op(ary_a, ary_b).get()
                    host_result = op(host_ary_a, host_ary_b)

                    if host_result.dtype != dev_result.dtype:
                        # This appears to be a numpy bug, where we get
                        # served a Python complex that is really a
                        # smaller numpy complex.

                        print("HOST_DTYPE: {} DEV_DTYPE: {}".format(
                            host_result.dtype, dev_result.dtype))

                        dev_result = dev_result.astype(host_result.dtype)

                    difference_norm = la.norm(host_result - dev_result)
                    err = difference_norm / la.norm(host_result)
                    print(err)
                    correct = err < 1e-4
                    if not correct:
                        print(host_result)
                        print(dev_result)
                        print(host_result - dev_result)

                    assert correct
Example #15
0
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Build the invoker statements that pass value (non-array) arguments.

    :arg kernel: provides ``options`` (read for ``skip_arg_checks``) and
        ``name`` (used in warning messages).
    :arg devices: sequence of devices. Entries may be *None*; those are
        treated as not having the struct argument counting bug, and a
        warning is issued.
    :arg implemented_data_info: sequence of descriptors providing
        ``arg_class``, ``name`` and ``dtype``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)``: the
        generated statements wrapped in a ``genpy.Suite``, a dict mapping
        each index into *implemented_data_info* to the index of the first
        OpenCL argument used for it, and the total number of OpenCL
        arguments accounted for.
    :raises TypeError: for complex dtypes other than complex64/complex128.
    :raises LoopyError: for a value argument whose dtype is not handled.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # Without the check available, assume no device has the bug.
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed. To avoid this, "
                "pass target=lp.PyOpenCLTarget(dev) when creating "
                "the kernel."
                .format(knl_name=kernel.name))

    # The workaround is only enabled if *every* device is affected; a mixed
    # set of devices merely triggers a warning for complex arguments.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Number of floating point scalars passed so far (a complex counts as
    # two).
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            py_type = "long" if sys.version_info < (3,) else "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # Composite values are handed to set_arg without packing.
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled)".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            # NOTE(review): the origin of the limit of 8 floating point
            # arguments is not visible here -- confirm against the bug
            # report referenced above.
            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: pass real and imaginary part as two separate
                # kernel arguments.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                # Regular case: pack both parts into one kernel argument.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
Example #16
0
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Generate invoker code that sets the by-value arguments of a kernel.

    Walks *implemented_data_info* in order. Array arguments are only counted
    (each is assumed to occupy exactly one OpenCL argument slot); for every
    :class:`loopy.ValueArg` an entry is queued so that the generated code
    passes it to ``_lpy_knl`` through ``_set_arg_buf_multi`` or
    ``_set_arg_buf_pack_multi``.

    :arg kernel: the kernel being invoked; only ``kernel.options`` and
        ``kernel.name`` are read here.
    :arg devices: a sized iterable of target devices. Entries may be *None*,
        in which case a warning is issued and no struct-argument-count
        workaround is assumed for that entry.
    :arg implemented_data_info: sequence of per-argument records providing
        ``arg_class``, ``dtype`` and ``name``.
    :returns: a tuple ``(suite, arg_idx_to_cl_arg_idx, cl_arg_idx)`` where
        *suite* is a :class:`genpy.Suite` with the generated statements,
        *arg_idx_to_cl_arg_idx* maps each index into *implemented_data_info*
        to the OpenCL argument index at which that argument starts, and
        *cl_arg_idx* is the total number of OpenCL argument slots consumed.
    :raises TypeError: for a complex argument that is neither ``complex64``
        nor ``complex128``.
    :raises LoopyError: for a value argument whose type is neither composite,
        complex, nor a :class:`NumpyType`.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # Without the characterization helper, assume no device is affected.
        count_bug_per_dev = [False] * len(devices)

    else:
        # Unknown (None) devices are treated as unaffected.
        count_bug_per_dev = [
            dev is not None and has_struct_arg_count_bug(dev)
            for dev in devices
        ]

    if any(dev is None for dev in devices):
        warn(f"{kernel.name}: device not supplied to PyOpenCLTarget--"
             "workarounds for broken OpenCL implementations "
             "(such as those relating to complex numbers) "
             "may not be enabled when needed. To avoid this, "
             "pass target=lp.PyOpenCLTarget(dev) when creating "
             "the kernel.")

    if any(count_bug_per_dev):
        # The workaround changes how arguments are passed, so it is only
        # enabled when every device needs it; a mixed set only warns.
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Number of floating point scalars passed so far (a complex counts as 2).
    fp_arg_count = 0

    from genpy import If, Raise, Statement as S, Suite

    result = []
    gen = result.append

    # Flat lists consumed by the generated '_multi' calls:
    #   buf_indices_and_args:      [idx, expr, idx, expr, ...]
    #   buf_pack_indices_and_args: [idx, typechar, expr, idx, typechar, ...]
    buf_indices_and_args = []
    buf_pack_indices_and_args = []

    from pyopencl.invoker import BUF_PACK_TYPECHARS

    def add_buf_arg(arg_idx, typechar, expr_str):
        """Queue scalar *expr_str* with struct type char *typechar*."""
        if typechar in BUF_PACK_TYPECHARS:
            buf_pack_indices_and_args.append(arg_idx)
            buf_pack_indices_and_args.append(repr(typechar.encode()))
            buf_pack_indices_and_args.append(expr_str)
        else:
            # NOTE(review): this emits a call to 'pack', while the complex
            # fallback below emits '_lpy_pack' -- confirm that both names
            # are defined in the generated invoker.
            buf_indices_and_args.append(arg_idx)
            buf_indices_and_args.append(f"pack('{typechar}', {expr_str})")

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        if not options.skip_arg_checks:
            gen(
                If(
                    f"{idi.name} is None",
                    Raise('RuntimeError("input argument \'{name}\' '
                          'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_composite():
            # Composite values are handed over unchanged.
            buf_indices_and_args.append(cl_arg_idx)
            buf_indices_and_args.append(f"{idi.name}")

            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn(f"{kernel.name}: arguments include complex numbers, and "
                     "some (but not all) of the target devices mishandle "
                     "struct kernel arguments (hence the workaround is "
                     "disabled)")

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            # NOTE(review): the limit of 8 floating point scalars is taken
            # over unchanged; its origin is not visible here -- presumably
            # tied to the bug linked above, TODO confirm.
            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround: real and imaginary parts each take their own
                # OpenCL argument slot.
                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.real")
                cl_arg_idx += 1

                add_buf_arg(cl_arg_idx, arg_char, f"{idi.name}.imag")
                cl_arg_idx += 1
            else:
                # Regular path: both parts packed into a single argument.
                buf_indices_and_args.append(cl_arg_idx)
                buf_indices_and_args.append(
                    f"_lpy_pack('{arg_char}{arg_char}', "
                    f"{idi.name}.real, {idi.name}.imag)")
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            add_buf_arg(cl_arg_idx, idi.dtype.dtype.char, idi.name)
            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'" %
                             idi.dtype)

    # Emit at most one '_multi' call per argument kind.
    for arg_kind, args_and_indices, entry_length in [
        ("_buf", buf_indices_and_args, 2),
        ("_buf_pack", buf_pack_indices_and_args, 3),
    ]:
        assert len(args_and_indices) % entry_length == 0
        if args_and_indices:
            flat_args = ", ".join(str(entry) for entry in args_and_indices)
            gen(S(f"_lpy_knl._set_arg{arg_kind}_multi(({flat_args},), )"))

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx