Esempio n. 1
0
def get_target(lang, device=None, compiler=None):
    """
    Build the loopy target object that corresponds to a language name.

    Parameters
    ----------
    lang : str
        The language to generate code for.  It is first handed to
        ``utils.check_lang``; the names dispatched on here are
        'opencl', 'c', 'cuda' and 'ispc'
    device : :class:`pyopencl.Device`
        Only used when `lang` is 'opencl', in which case it is forwarded to
        :class:`loopy.PyOpenCLTarget`
    compiler : str
        Only used when `lang` is 'c', in which case it is forwarded to
        :class:`loopy.ExecutableCTarget` as the C-compiler

    Returns
    -------
    The loopy target instance for `lang`, or None when `lang` is accepted by
    ``utils.check_lang`` but is not one of the names dispatched on here
    """

    utils.check_lang(lang)

    # one guard clause per supported language
    if lang == 'opencl':
        return lp.PyOpenCLTarget(device=device)
    if lang == 'c':
        return lp.ExecutableCTarget(compiler=compiler)
    if lang == 'cuda':
        return lp.CudaTarget()
    if lang == 'ispc':
        return lp.ISPCTarget()

    # no branch matched
    return None
Esempio n. 2
0
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Generate code for nested reductions whose inner bound is data-dependent.

    The bound ``npart`` of the innermost reduction is read from
    ``nparticles_per_box`` inside the loop nest.  The test only builds the
    kernel and prints the generated device code.
    """
    dtype = np.dtype(np.int32)
    ctx = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]

    instructions = """
            for itgt
                for isrc_box
                    <> npart = nparticles_per_box[isrc_box]
                    <> boxsum = sum(isrc, isrc+isrc_box+itgt)
                end
                a[itgt] = sum(isrc_box, boxsum)
            end
            """

    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", dtype, ("n", )),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes", )),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="ntgts>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
Esempio n. 3
0
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a domain dependency carried only by a temporary.

    The point of this test is that it shows a dependency between domains
    that is exclusively mediated by the ``row_len`` temporary.  It also makes
    sure that ``row_len`` gets read before any conditionals use it.

    Splitting and parallel-tagging ``jj`` afterwards is expected to make code
    generation raise :class:`RuntimeError`.
    """
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_data = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape="n,n", order="C"),
        lp.ValueArg("n", np.int32),
    ]

    knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        target=lp.PyOpenCLTarget(ctx.devices[0]),
        name="loopy_kernel",
    )

    # the jj domain (index 1) must be nested under the i domain (index 0)
    parents = knl["loopy_kernel"].parents_per_domain()
    assert parents[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)

    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    with pytest.raises(RuntimeError):
        list(lp.generate_code_v2(knl_bad))
Esempio n. 4
0
    def __init__(self, map_instructions, **kwargs):
        """
        Assemble, specialize and parallelize the kernel stored in ``self.knl``.

        :arg map_instructions: The instructions handed to
            :meth:`make_kernel` as its first argument. A :class:`dict` is
            converted to a list of its ``(key, value)`` items.

        The following keyword arguments are consumed here:

        :arg map_dict: Deprecated alias that replaces *map_instructions*.

        :arg tmp_instructions: Second argument of :meth:`make_kernel`;
            a :class:`dict` is converted to a list of items.
            Defaults to the value of *tmp_dict* if that was passed, else ``[]``.

        :arg tmp_dict: Deprecated alias for *tmp_instructions*.

        :arg args: Kernel arguments. Defaults to ``[...]``.

        :arg dtype: Stored as ``self.dtype``. Defaults to *None*.

        :arg lsize: Local size passed to :meth:`parallelize`.
            Defaults to ``(16, 4, 1)``.

        :arg rank_shape: If not *None*, a 3-sequence used to fix the
            parameters ``Nx``, ``Ny`` and ``Nz``.

        :arg halo_shape: If an :class:`int`, fixes the parameter ``h``;
            if a :class:`tuple` or :class:`list`, fixes ``hx``, ``hy``
            and ``hz``.

        :arg domains: Loop domain description passed to :meth:`make_kernel`.

        Any remaining keyword arguments override the default keyword
        arguments passed on to :meth:`make_kernel`.
        """
        from warnings import warn

        # --- map instructions (with deprecated alias) ---
        if "map_dict" in kwargs:
            warn("Passing map_dict is deprecated. Pass map_instructions instead.",
                 DeprecationWarning, stacklevel=2)
            map_instructions = kwargs.pop("map_dict")

        if isinstance(map_instructions, dict):
            map_instructions = list(map_instructions.items())
        self.map_instructions = map_instructions

        # --- temporary instructions (with deprecated alias) ---
        fallback_tmp = []
        if "tmp_dict" in kwargs:
            warn("Passing tmp_dict is deprecated. Pass tmp_instructions instead.",
                 DeprecationWarning, stacklevel=2)
            fallback_tmp = kwargs.pop("tmp_dict")

        tmp_instructions = kwargs.pop("tmp_instructions", fallback_tmp)
        if isinstance(tmp_instructions, dict):
            tmp_instructions = list(tmp_instructions.items())
        self.tmp_instructions = tmp_instructions

        self.args = kwargs.pop("args", [...])
        self.dtype = kwargs.pop("dtype", None)

        # default local size which saturates memory bandwidth
        self.lsize = kwargs.pop("lsize", (16, 4, 1))
        rank_shape = kwargs.pop("rank_shape", None)
        halo_shape = kwargs.pop("halo_shape", None)

        default_domains = \
            "[Nx, Ny, Nz] -> {[i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz}"
        domains = kwargs.pop("domains", default_domains)

        # whatever is still left in kwargs overrides these defaults
        kernel_kwargs = {
            "seq_dependencies": True,
            "default_offset": lp.auto,
            "target": lp.PyOpenCLTarget(),
            "lang_version": (2018, 2),
        }
        kernel_kwargs.update(kwargs)

        knl = self.make_kernel(self.map_instructions, self.tmp_instructions,
                               self.args, domains, **kernel_kwargs)

        # fix the halo parameter(s), either one shared value or one per axis
        if isinstance(halo_shape, int):
            knl = lp.fix_parameters(knl, h=halo_shape)
        elif isinstance(halo_shape, (tuple, list)):
            knl = lp.fix_parameters(
                knl, hx=halo_shape[0], hy=halo_shape[1], hz=halo_shape[2]
            )

        knl = self.parallelize(knl, self.lsize)

        if rank_shape is not None:
            knl = lp.fix_parameters(
                knl, Nx=rank_shape[0], Ny=rank_shape[1], Nz=rank_shape[2]
            )

        self.knl = lp.remove_unused_inames(knl)
    def __test(loop_size, vec_width):
        """Check the deep specializer's rewrite of the 'set' instruction.

        Builds a small kernel of extent *loop_size*, splits ``i`` by
        *vec_width*, feeds it through the specializer returned by
        ``get_deep_specializer`` and asserts that the generated code contains
        ``x`` divided by ``min(loop_size, vec_width)``.
        """
        domain = '{{[i]: 0 <= i < {}}}'.format(loop_size)
        instructions = """
            <> x = 1.0
            a1[0] = a1[0] + x {id=set}
            ... lbarrier {id=wait, dep=set}
            for i
                a1[0] = a1[0] + 1 {id=a1, dep=set:wait, nosync=set}
            end
            """
        a1_arg = lp.GlobalArg(
            'a1', shape=(loop_size, ), order='C', dtype=np.float32)

        knl = lp.make_kernel(
            domain, instructions, [a1_arg], target=lp.PyOpenCLTarget())

        # minimal stand-in for an options object carrying the attributes below
        option_attrs = {
            'depth': vec_width,
            'order': 'C',
            'use_atomics': True
        }
        loopy_opts = type('', (object, ), option_attrs)

        knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0')

        # feed through deep specializer
        _unused, specializer = get_deep_specializer(
            loopy_opts,
            atomic_ids=['a1'],
            split_ids=['set'],
            use_atomics=True,
            is_write_race=True,
            split_size=loop_size)
        knl = specializer(knl)

        divisor = np.minimum(loop_size, vec_width)
        expected = 'x / {:.1f}f'.format(divisor)
        generated = lp.generate_code(knl)[0]
        assert expected in generated
Esempio n. 6
0
def test_triangle_domain(ctx_factory):
    """Build a kernel over the triangular domain ``i <= j`` and print its code."""
    ctx = ctx_factory()
    target = lp.PyOpenCLTarget(ctx.devices[0])

    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1",
        target=target,
    )

    print(knl)

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
Esempio n. 7
0
def test_rob_stroud_bernstein(ctx_factory):
    """Build, transform and generate code for the Bernstein kernel.

    The kernel is specialized to ``nqp1d=7`` and ``deg=4``, ``el`` is split
    and tagged for parallel execution, and the remaining inames are tagged
    before dtypes are added and the generated code is printed.
    """
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    domain = "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }"

    instructions = """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp,dep=init_w:aind_init}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """

    kernel_data = [
        # Must declare coeffs to have "no" shape, to keep loopy
        # from trying to figure out the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "...",
    ]

    knl = lp.make_kernel(
        domain,
        instructions,
        kernel_data,
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # parallelize over elements: work-items first, then groups with ILP
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(
        knl, "el_outer", 2,
        outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))

    iname_tags = {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"}
    knl = lp.tag_inames(knl, iname_tags)

    dtypes = {
        "qpts": np.float32,
        "coeffs": np.float32,
        "tmp": np.float32,
    }
    knl = lp.add_dtypes(knl, dtypes)

    print(lp.generate_code_v2(knl))
Esempio n. 8
0
def test_eq_constraint(ctx_factory):
    """Split a fixed-extent iname twice by 16 and print the device code."""
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    kernel_data = [
        lp.GlobalArg("a", np.float32, shape=(1000, )),
        lp.GlobalArg("b", np.float32, shape=(1000, )),
    ]
    knl = lp.make_kernel(
        "{[i]: 0<= i < 32}",
        ["a[i] = b[i]"],
        kernel_data,
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    # first split maps onto groups, second split maps onto work-items
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
Esempio n. 9
0
def test_assume(ctx_factory):
    """Check that added assumptions remove all conditionals.

    With ``n mod 16 = 0`` and ``n > 10`` assumed after splitting ``i`` by 16,
    the generated device code must not contain the text ``if``.
    """
    ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("a", np.float32, shape="n"), "..."]
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        kernel_data,
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")

    for assumption in ("n mod 16 = 0", "n > 10"):
        knl = lp.assume(knl, assumption)

    code = lp.generate_code_v2(knl).device_code()
    assert "if" not in code
Esempio n. 10
0
def test_divisibility_assumption(ctx_factory):
    """Check that a divisibility assumption yields branch-free code.

    The kernel assumes that ``n`` is a multiple of 16, so splitting ``i`` by
    16 must produce device code without the text ``if``.  The split kernel is
    then compared against the unsplit reference for ``n = 16**3``.
    """
    ctx = ctx_factory()

    kernel_data = [
        lp.GlobalArg("a", np.float32, shape=("n", )),
        lp.GlobalArg("b", np.float32, shape=("n", )),
        lp.ValueArg("n", np.int32),
    ]
    ref_knl = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        ["b[i] = 2*a[i]"],
        kernel_data,
        assumptions="n>=1 and (exists zz: n = 16*zz)",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    knl = lp.split_iname(ref_knl, "i", 16)

    code = lp.generate_code_v2(knl).device_code()
    assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 16**3})
Esempio n. 11
0
def test_dependent_loop_bounds(ctx_factory):
    """Generate code for a reduction whose extent is computed in the kernel.

    ``row_len`` is the difference of two consecutive ``a_rowstarts`` entries
    and bounds the ``jj`` domain.  The test only prints the device code.
    """
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
        "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", dtype),
        lp.GlobalArg("a_sum", dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="n>=1 and row_len>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
Esempio n. 12
0
def test_dependent_loop_bounds_2(ctx_factory):
    """Generate parallel code for a reduction with a kernel-computed extent.

    ``row_start`` and ``row_len`` are temporaries derived from
    ``a_rowstarts``; ``row_len`` bounds the ``jj`` domain.  The ``i`` iname is
    split by 128 and tagged for parallel execution before the device code is
    printed.
    """
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_start = a_rowstarts[i]",
        "<> row_len = a_rowstarts[i+1] - row_start",
        "ax[i] = sum(jj, a_values[[row_start+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", dtype, strides=(1, )),
        lp.GlobalArg("ax", dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    knl = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="n>=1 and row_len>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
Esempio n. 13
0
 def get_loopy_target(self) -> "loopy.PyOpenCLTarget":
     """Return a loopy PyOpenCL target for this object's queue.

     The target is built from ``self.queue.device`` when ``self.queue`` is
     set, and from *None* otherwise.
     """
     import loopy as lp

     device = None if self.queue is None else self.queue.device
     return lp.PyOpenCLTarget(device)
Esempio n. 14
0
    def __inner(queue=None):
        """
        Run every kernel call against its matching kernels and gather outputs.

        ``kernel_calls``, ``knl``, ``editor`` and ``device`` are taken from the
        enclosing scope.  ``kernel_calls`` is either indexable (each element
        is run in turn) or, if indexing it raises :class:`TypeError`, treated
        as a single call.

        Parameters
        ----------
        queue :
            Forwarded unchanged as the second argument of each kernel call

        Returns
        -------
        list
            One entry per kernel call; each entry is the list of merged
            output arrays, or the untouched ``None`` placeholders if no
            kernel was run for that call
        """
        output = []
        next_index = 0
        final_pass = False
        while not final_pass:
            # handle weirdness between list / non-list input
            try:
                kc = kernel_calls[next_index]
                next_index += 1
            except IndexError:
                # reached end of list
                break
            except TypeError:
                # not a list: use the object itself and stop after this pass
                final_pass = True
                kc = kernel_calls

            # create the (still empty) reference outputs
            out_ref = ([None for _ in kc.out_mask]
                       if kc.out_mask is not None else [None])

            found = False
            # run kernels
            for candidate in knl:
                # skip kernels that this call does not apply to
                if not kc.is_my_kernel(candidate):
                    continue
                found = True

                # set the editor to avoid intel bugs
                test_knl = editor(candidate)
                if isinstance(test_knl.target, lp.PyOpenCLTarget):
                    # recreate with device
                    device_target = lp.PyOpenCLTarget(device=device)
                    test_knl = test_knl.copy(target=device_target)

                # check for chaining
                if kc.chain:
                    kc.chain(kc, output)

                # run!
                out = kc(test_knl, queue)

                if kc.post_process:
                    kc.post_process(kc, out)

                # output mapping
                if not any(x is not None for x in out_ref):
                    # if the outputs are none, we init to zeros
                    # and avoid copying zeros over later data!
                    out_ref = [np.zeros_like(x) for x in out]

                for ind, result in enumerate(out):
                    # copy over only the non-zero entries; deliberately no
                    # finite check here (inf / nan are copied as well) so
                    # that bad data is never masked
                    is_zero = result == 0
                    copy_inds = np.where(np.logical_not(is_zero))
                    out_ref[ind][copy_inds] = result[copy_inds]

            output.append(out_ref)
            assert found or kc.allow_skip, (
                'No kernels could be found to match kernel call {}'.format(
                    kc.name))
        return output
Esempio n. 15
0
def test_rob_stroud_bernstein_full(ctx_factory):
    """Build the two-stage Bernstein kernel, pickle it and generate code.

    The kernel is specialized to ``nqp1d=7`` and ``deg=4``, round-tripped
    through :mod:`pickle`, given dtypes, and its generated code is printed.
    The parallelizing transformations are currently disabled.
    """
    # for verbose output: logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    domain = "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }"

    instructions = """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg  {id=w2_init}

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]  {id=res2,dep=w2_init}
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)  \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """

    kernel_data = [
        # Must declare coeffs to have "no" shape, to keep loopy
        # from trying to figure out the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "...",
    ]

    knl = lp.make_kernel(
        domain,
        instructions,
        kernel_data,
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]),
    )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # parallelization is deliberately switched off for this test
    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(
            knl, "el_outer", 2,
            outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(
            knl, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})

    # round-trip the kernel through pickle (in-process, trusted data)
    import pickle
    knl = pickle.loads(pickle.dumps(knl))

    dtypes = {
        "qpts": np.float32,
        "tmp": np.float32,
        "coeffs": np.float32,
        "result": np.float32,
    }
    knl = lp.add_dtypes(knl, dtypes)

    print(lp.generate_code_v2(knl))
Esempio n. 16
0
 def get_loopy_target(self) -> "loopy.PyOpenCLTarget":
     """Return a :class:`loopy.PyOpenCLTarget` built from ``self.device``."""
     # The return annotation names the class that is actually instantiated
     # below (``loopy.PyOpenCLTarget``).
     import loopy as lp
     return lp.PyOpenCLTarget(self.device)