Example #1
0
def test_diamond_tiling(ctx_factory, interactive=False):
    """Compare a domain-remapped wave-equation kernel against its reference.

    A kernel updating ``u[ix, it+2]`` from the two previous ``it`` levels is
    built once. The reference copy only receives the loop priority
    ``it, ix``; the tested copy has its ``(ix, it)`` domain remapped to
    ``(tx, tt, tparity, itt, itx)`` through an ISL map and is then
    prioritized in that new iname order.

    :arg ctx_factory: zero-argument callable returning the context to use.
    :arg interactive: when true, run the remapped kernel on a Gaussian
        initial condition and display the result with matplotlib instead of
        running the automatic comparison against the reference.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    base_knl = lp.make_kernel(
        "[nx,nt] -> {[ix, it]: 1<=ix<nx-1 and 0<=it<nt}", """
        u[ix, it+2] = (
            2*u[ix, it+1]
            + dt**2/dx**2 * (u[ix+1, it+1] - 2*u[ix, it+1] + u[ix-1, it+1])
            - u[ix, it])
        """)

    # The reference keeps the original domain and is only given a loop order.
    ref_knl = lp.prioritize_loops(base_knl, "it, ix")

    import islpy as isl
    tiling_map = isl.BasicMap(
        "[nx,nt] -> {[ix, it] -> [tx, tt, tparity, itt, itx]: "
        "16*(tx - tt) + itx - itt = ix - it and "
        "16*(tx + tt + tparity) + itt + itx = ix + it and "
        "0<=tparity<2 and 0 <= itx - itt < 16 and 0 <= itt+itx < 16}")
    knl = lp.prioritize_loops(
        lp.map_domain(base_knl, tiling_map), "tt,tparity,tx,itt,itx")

    if not interactive:
        dtypes = {"dt,dx,u": np.float64}
        knl = lp.add_and_infer_dtypes(knl, dtypes)
        ref_knl = lp.add_and_infer_dtypes(ref_knl, dtypes)

        lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters={"nx": 200, "nt": 300, "dx": 1, "dt": 1})
        return

    # Interactive mode: the first two time levels hold the same Gaussian.
    nx = 43
    x = np.linspace(-1, 1, nx)
    dx = x[1] - x[0]
    u = np.zeros((nx, 200))
    u[:, 0] = u[:, 1] = np.exp(-100 * x**2)

    u_dev = cl.array.to_device(queue, u)
    knl(queue, u=u_dev, dx=dx, dt=dx)

    import matplotlib.pyplot as plt
    plt.imshow(u_dev.get().T)
    plt.show()
def build_loopy_kernel_A_text():
    """Generate C source and header text for the ``kernel_tensor_A`` kernel.

    The kernel computes ``A[i,j] = c*sum(k, B[k,i]*B[k,j])`` with ``n`` fixed
    to 3 and ``m`` fixed to 2, using double precision for ``A``, ``B`` and
    ``c``, and is generated for loopy's C target.

    :returns: a tuple ``(knl_name, knl_call, knl_c, knl_h)``: the kernel
        name, a C statement calling the kernel, the generated device code
        and the generated header. In both generated texts ``__restrict__``
        is replaced by ``restrict``.
    """
    knl_name = "kernel_tensor_A"
    double = np.dtype(np.double)

    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        """
            A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(knl, {"A": double, "B": double, "c": double})
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")

    # Device code is generated first, then the header, as separate steps.
    knl_c = lp.generate_code_v2(knl).device_code()
    knl_h = str(lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c, knl_h = (
        utils.replace_strings(knl_c, replacements),
        utils.replace_strings(knl_h, replacements))

    knl_call = "%s(A, &B[0][0], 1.0/(2.0*Ae));" % knl_name

    return knl_name, knl_call, knl_c, knl_h
Example #3
0
 def variant_2(knl):
     """Split ``i``/``j`` by 16 and prefetch ``a`` over its bounding box.

     ``i`` is split with tags ``g.1``/``l.1`` and ``j`` with ``g.0``/``l.0``.
     ``a`` is then prefetched over ``i_inner`` and ``j_inner`` with
     ``fetch_bounding_box=True``, and the two outer fetch inames are given
     loop priority.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     for iname, axis in (("i", 1), ("j", 0)):
         knl = lp.split_iname(
             knl, iname, 16,
             outer_tag="g.%d" % axis, inner_tag="l.%d" % axis)

     knl = lp.add_prefetch(
         knl, "a", ["i_inner", "j_inner"],
         fetch_bounding_box=True, default_tag="l.auto")
     return lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
Example #4
0
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two substitution rules into the same ``ii,jj`` inames.

    Accesses to ``D1`` and ``D2`` are extracted into substitution rules and
    each rule is precomputed with ``precompute_inames="ii,jj"``, so the second
    precompute names inames the first one already requested. The transformed
    kernel is compared against the untransformed one.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)
    knl = lp.add_and_infer_dtypes(
        knl, {"u": np.float32, "D1": np.float32, "D2": np.float32})
    knl = lp.fix_parameters(knl, n=13)

    # Snapshot taken before any substitution/precompute transformation.
    ref_knl = knl

    rules = (("D1_subst", "D1", "i,j"), ("D2_subst", "D2", "i,k"))

    # Both extractions run before either precompute, as in a plain sequence.
    for rule_name, array_name, _ in rules:
        knl = lp.extract_subst(
            knl, rule_name, "%s[ii,jj]" % array_name, parameters="ii,jj")
    for rule_name, _, sweep_inames in rules:
        knl = lp.precompute(
            knl, rule_name, sweep_inames,
            default_tag="for", precompute_inames="ii,jj")

    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"E": 200})
Example #5
0
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two substitution rules into the same ``ii,jj`` inames.

    Accesses to ``D1`` and ``D2`` are extracted into substitution rules and
    each rule is precomputed with ``precompute_inames="ii,jj"``, so the second
    precompute names inames the first one already requested. The transformed
    kernel is compared against the untransformed one.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)
    knl = lp.add_and_infer_dtypes(
        knl, {"u": np.float32, "D1": np.float32, "D2": np.float32})
    knl = lp.fix_parameters(knl, n=13)

    # Snapshot taken before any substitution/precompute transformation.
    ref_knl = knl

    rules = (("D1_subst", "D1", "i,j"), ("D2_subst", "D2", "i,k"))

    # Both extractions run before either precompute, as in a plain sequence.
    for rule_name, array_name, _ in rules:
        knl = lp.extract_subst(
            knl, rule_name, "%s[ii,jj]" % array_name, parameters="ii,jj")
    for rule_name, _, sweep_inames in rules:
        knl = lp.precompute(
            knl, rule_name, sweep_inames,
            default_tag="for", precompute_inames="ii,jj")

    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"E": 200})
Example #6
0
def test_prefetch_local_into_private():
    """Chain two prefetches: ``mat`` into ``s_mat``, then ``s_mat`` into ``p_mat``.

    Regression test for
    https://gitlab.tiker.net/inducer/loopy/-/issues/210

    The test contains no assertion; it passes when both ``add_prefetch``
    calls complete without raising.
    """
    n_rows = 32
    n_cols = 32
    n_vecs = 32

    args = [
        lp.GlobalArg("result", np.float32, shape=(n_rows, n_vecs), order="C"),
        lp.GlobalArg("mat", np.float32, shape=(n_rows, n_cols), order="C"),
        lp.GlobalArg("vec", np.float32, shape=(n_cols, n_vecs), order="C")
    ]

    knl = lp.make_kernel(
        """{[k,i,j]:
            0<=k<n_vecs and
            0<=i<m and
            0<=j<n}""",
        """
        result[i,k] = sum(j, mat[i, j] * vec[j, k])
        """,
        kernel_data=args,
        assumptions="n > 0 \
                     and m > 0 \
                     and n_vecs > 0",
        name="mxm"
    )

    knl = lp.fix_parameters(knl, m=n_rows, n=n_cols, n_vecs=n_vecs)
    knl = lp.prioritize_loops(knl, "i,k,j")

    # First stage fetches the global array, second stage fetches from the
    # temporary created by the first.
    for source, sweep, temporary in (
            ("mat", "i, j", "s_mat"),
            ("s_mat", "j", "p_mat")):
        knl = lp.add_prefetch(
            knl, source, sweep, temporary_name=temporary, default_tag="for")
Example #7
0
def test_generate_c_snippet():
    """Build a two-reduction kernel from pymbolic objects and print its code.

    Both instructions are ``lp.Assignment`` objects whose right-hand sides
    are ``sum`` reductions over ``k`` created with
    ``allow_simultaneous=True``. After splitting ``k`` by 4 with the inner
    part tagged ``unr``, the generated code is printed.
    """
    from functools import partial

    from pymbolic import var

    I, f, df, q_v, eN, k, u = [  # noqa
        var(name) for name in ("I", "f", "df", "q_v", "eN", "k", "u")]

    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    instructions = [
        lp.Assignment(f[I], l_sum(k, q_v[k, I] * u)),
        lp.Assignment(df[I], l_sum(k, q_v[k, I])),
    ]
    kernel_data = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_data,
        target=CTarget(),
        assumptions="nQuad>=1")

    play_with_prefetching = False  # enable to play with prefetching
    if play_with_prefetching:
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")
    print(lp.generate_code_v2(knl))
Example #8
0
 def variant_2(knl):
     """Split ``i``/``j`` by 16 and prefetch ``a`` over its bounding box.

     ``i`` is split with tags ``g.1``/``l.1`` and ``j`` with ``g.0``/``l.0``.
     ``a`` is then prefetched over ``i_inner`` and ``j_inner`` with
     ``fetch_bounding_box=True``, and the two outer fetch inames are given
     loop priority.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
     # default_tag is spelled out rather than left to the library default.
     # NOTE(review): "l.auto" is understood to be the value loopy falls back
     # to (with a deprecation warning) when the argument is omitted - confirm
     # against the loopy version in use.
     knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
             fetch_bounding_box=True, default_tag="l.auto")
     knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
     return knl
Example #9
0
    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
                             centers_is_obj_array):
        """Return ``self.get_kernel()`` with array-axis tags and iname splits applied.

        For each flag that is true, the matching array (``tgt``, ``src``,
        ``center``) has its axes tagged ``"sep,C"``. If the first device of
        ``self.context`` has the CPU bit set in its type, ``itgt`` is split
        by 16 (tagged ``g.0``/``l.0``), ``isrc`` is split by 256 and
        ``isrc_outer``/``itgt_inner`` are prioritized. Otherwise a warning is
        issued and ``itgt`` is split by 128 with outer tag ``g.0``.

        :returns: the transformed loopy kernel.
        """
        # FIXME specialize/tune for GPU/CPU
        loopy_knl = self.get_kernel()

        obj_array_flags = (
            (targets_is_obj_array, "tgt"),
            (sources_is_obj_array, "src"),
            (centers_is_obj_array, "center"),
        )
        for is_obj_array, array_name in obj_array_flags:
            if is_obj_array:
                loopy_knl = lp.tag_array_axes(loopy_knl, array_name, "sep,C")

        import pyopencl as cl
        dev = self.context.devices[0]

        if not (dev.type & cl.device_type.CPU):
            from warnings import warn
            warn(
                "don't know how to tune layer potential computation for '%s'" %
                dev)
            return lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")

        loopy_knl = lp.split_iname(
            loopy_knl, "itgt", 16, outer_tag="g.0", inner_tag="l.0")
        loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
        return lp.prioritize_loops(loopy_knl, ["isrc_outer", "itgt_inner"])
Example #10
0
 def variant_1(knl):
     """Split ``i``/``j`` by 16 and prefetch ``a`` over the inner inames.

     ``i`` is split with tags ``g.1``/``l.1`` and ``j`` with ``g.0``/``l.0``.
     ``a`` is then prefetched over ``i_inner`` and ``j_inner`` with
     ``default_tag="l.auto"``, and the two outer fetch inames are given loop
     priority.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     for iname, axis in (("i", 1), ("j", 0)):
         knl = lp.split_iname(
             knl, iname, 16,
             outer_tag="g.%d" % axis, inner_tag="l.%d" % axis)

     knl = lp.add_prefetch(
         knl, "a", ["i_inner", "j_inner"], default_tag="l.auto")
     return lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
Example #11
0
def test_poisson_fem(ctx_factory):
    """Test precomputing the ``dpsi`` rule in a Poisson FEM assembly kernel.

    The kernel (credited in the original comment to Peter Coogan and Rob
    Kirby) assembles ``Ael[c,i,j]`` from a substitution rule ``dpsi``. Each
    enabled variant precomputes ``dpsi`` and the result is compared against
    the kernel as it was right after ``fix_parameters``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    # Reference: parameters fixed, no priorities or precomputes applied.
    ref_knl = knl

    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        precomputed = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for")
        return lp.prioritize_loops(precomputed, "c,i,j")

    def variant_2(knl):
        precomputed = lp.precompute(knl, "dpsi", "i,ell", default_tag="for")
        return lp.prioritize_loops(precomputed, "c,i,j")

    def add_types(knl):
        float_args = {
            "w": np.float32,
            "J": np.float32,
            "DPsi": np.float32,
            "DFinv": np.float32,
        }
        return lp.add_and_infer_dtypes(knl, float_args)

    # variant_1 is defined above but currently not enabled.
    enabled_variants = [
        #variant_1,
        variant_2,
    ]

    for variant in enabled_variants:
        knl = variant(knl)

        lp.auto_test_vs_ref(
            add_types(ref_knl), ctx, add_types(knl),
            parameters={"n": 5, "nels": 15, "nbf": 5, "sdim": 2, "nqp": 7})
Example #12
0
def test_poisson_fem(ctx_factory):
    """Test precomputing the ``dpsi`` rule in a Poisson FEM assembly kernel.

    The kernel (credited in the original comment to Peter Coogan and Rob
    Kirby) assembles ``Ael[c,i,j]`` from a substitution rule ``dpsi``. Each
    enabled variant precomputes ``dpsi`` and the result is compared against
    the kernel as it was right after ``fix_parameters``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    # Reference: parameters fixed, no priorities or precomputes applied.
    ref_knl = knl

    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        precomputed = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for")
        return lp.prioritize_loops(precomputed, "c,i,j")

    def variant_2(knl):
        precomputed = lp.precompute(knl, "dpsi", "i,ell", default_tag="for")
        return lp.prioritize_loops(precomputed, "c,i,j")

    def add_types(knl):
        float_args = {
            "w": np.float32,
            "J": np.float32,
            "DPsi": np.float32,
            "DFinv": np.float32,
        }
        return lp.add_and_infer_dtypes(knl, float_args)

    # variant_1 is defined above but currently not enabled.
    enabled_variants = [
        #variant_1,
        variant_2,
    ]

    for variant in enabled_variants:
        knl = variant(knl)

        lp.auto_test_vs_ref(
            add_types(ref_knl), ctx, add_types(knl),
            parameters={"n": 5, "nels": 15, "nbf": 5, "sdim": 2, "nqp": 7})
Example #13
0
 def variant_overfetch(knl):
     """Split ``i``/``j`` by 16 with slabs and prefetch ``a`` over its bounding box.

     Both splits pass ``slabs=(1, 1)``; ``i`` is tagged ``g.1``/``l.1`` and
     ``j`` is tagged ``g.0``/``l.0``. ``a`` is then prefetched over
     ``i_inner`` and ``j_inner`` with ``fetch_bounding_box=True`` and
     ``default_tag="l.auto"``, and the outer fetch inames are prioritized.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     for iname, axis in (("i", 1), ("j", 0)):
         knl = lp.split_iname(
             knl, iname, 16,
             outer_tag="g.%d" % axis, inner_tag="l.%d" % axis,
             slabs=(1, 1))

     knl = lp.add_prefetch(
         knl, "a", ["i_inner", "j_inner"],
         fetch_bounding_box=True, default_tag="l.auto")
     return lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
Example #14
0
 def test_split_iname3(self):
     """Split one of two inames."""
     from loopy.target.ispc import ISPCTarget

     knl = lp.make_kernel(
         "{[i,j]:0<=i,j<n}",
         "out[i, j] = in[i, j]",
         target=ISPCTarget())

     # Only ``i`` is split; ``j`` is placed between the two halves.
     knl = lp.split_iname(knl, "i", 8)
     knl = lp.prioritize_loops(knl, ["i_outer", "j", "i_inner"])

     print(self._dtype_and_code(knl))
Example #15
0
def generate(impero_c, args, precision, scalar_type, kernel_name="loopy_kernel", index_names=None):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names, in any form accepted by
        :class:`dict`; ``None`` (the default) means no pre-assigned names
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # Indices without a pre-assigned name fall back to the prefix "i".
    ctx.index_names = defaultdict(
        lambda: "i", () if index_names is None else index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constants become read-only temporaries initialised from their array.
            data.append(lp.TemporaryVariable(name, shape=temp.shape, dtype=temp.array.dtype, initializer=temp.array, address_space=lp.AddressSpace.LOCAL, read_only=True))
        else:
            # Extents of the indices recorded for this temporary come first,
            # followed by the temporary's own shape.
            shape = tuple(index.extent for index in ctx.indices[temp]) + temp.shape
            data.append(lp.TemporaryVariable(name, shape=shape, dtype=numpy.float64, initializer=None, address_space=lp.AddressSpace.LOCAL, read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains: one set ``0 <= idx < extent`` per index
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx])) & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name, target=lp.CTarget(),
                           seq_dependencies=True, silenced_warnings=["summing_if_branches_ops"])

    # Prevent loop interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions:
    # earlier instructions receive the larger priority values.
    n_insns = len(knl.instructions)
    knl = knl.copy(instructions=[
        insn.copy(priority=n_insns - i)
        for i, insn in enumerate(knl.instructions)])

    return knl
Example #16
0
def test_fuse_kernels(ctx_factory):
    """Fuse two Fortran-derived kernels and compare with a combined kernel.

    ``xderiv`` and ``yderiv`` are parsed from the same Fortran template with
    different loop bodies, fused, and compared against ``xyderiv``, which is
    parsed from the template with both bodies concatenated.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_single(inner, name):
        # The parser result is expected to hold exactly one kernel.
        parsed, = lp.parse_fortran(
            fortran_template.format(inner=inner, name=name))
        return parsed

    xderiv = parse_single(xd_line, "xderiv")
    yderiv = parse_single(yd_line, "yderiv")
    xyderiv = parse_single(xd_line + "\n" + yd_line, "xyderiv")

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    for temporary in ("prev", "prev_0"):
        knl = lp.assignment_to_subst(knl, temporary)

    ctx = ctx_factory()
    lp.auto_test_vs_ref(
        xyderiv, ctx, knl, parameters={"nelements": 20, "ndofs": 4})
Example #17
0
 def variant_gpu(knl):
     """Expand substitutions, split ``i``/``j`` by 256 and prefetch ``x`` twice.

     ``i`` is split with tags ``g.0``/``l.0``; ``j`` is split without tags.
     ``x[j,k]`` is prefetched over ``j_inner`` and ``k`` into fetch inames
     ``x_fetch_j`` (then tagged ``l.0``) and ``x_fetch_k`` (then tagged
     ``unr``). ``x[i,k]`` is prefetched over ``k``. Finally ``j_outer`` and
     ``j_inner`` are prioritized.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     knl = lp.expand_subst(knl)

     knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "j", 256)

     fetch_inames = ["x_fetch_j", "x_fetch_k"]
     knl = lp.add_prefetch(
         knl, "x[j,k]", ["j_inner", "k"], fetch_inames, default_tag=None)
     knl = lp.tag_inames(knl, {"x_fetch_k": "unr", "x_fetch_j": "l.0"})

     knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
     return lp.prioritize_loops(knl, ["j_outer", "j_inner"])
Example #18
0
def test_fuse_kernels(ctx_factory):
    """Fuse two Fortran-derived kernels and compare with a combined kernel.

    ``xderiv`` and ``yderiv`` are parsed from the same Fortran template with
    different loop bodies and fused with a ``data_flow`` entry for
    ``result``. The fused kernel, looked up as ``xderiv_and_yderiv``, is
    compared against ``xyderiv``, parsed from the template with both bodies
    concatenated.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse(inner, name):
        return lp.parse_fortran(
            fortran_template.format(inner=inner, name=name))

    xderiv = parse(xd_line, "xderiv")
    yderiv = parse(yd_line, "yderiv")
    xyderiv = parse(xd_line + "\n" + yd_line, "xyderiv")

    fused_name = "xderiv_and_yderiv"

    knl = lp.fuse_kernels(
        (xderiv["xderiv"], yderiv["yderiv"]),
        data_flow=[("result", 0, 1)])
    prioritized = lp.prioritize_loops(knl[fused_name], "e,i,j,k")
    knl = knl.with_kernel(prioritized)

    assert len(knl[fused_name].temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(
        xyderiv, ctx, knl, parameters={"nelements": 20, "ndofs": 4})
Example #19
0
def test_chunk_iname(ctx_factory):
    """Check ``chunk_iname`` on a one-dimensional scaling kernel.

    ``i`` is chunked with a count of 3 and inner tag ``l.0``; the result is
    compared against the unchunked kernel for ``n=130``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    ref_knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        assumptions="n>0")

    chunked = lp.chunk_iname(ref_knl, "i", 3, inner_tag="l.0")
    chunked = lp.prioritize_loops(chunked, "i_outer, i_inner")

    lp.auto_test_vs_ref(ref_knl, ctx, chunked, parameters={"n": 130})
Example #20
0
def test_assume(ctx_factory):
    """Check that assumptions on ``n`` remove conditionals from the code.

    After splitting ``i`` by 16, the kernel is told that ``n`` is a multiple
    of 16 and greater than 10; the generated device code must then not
    contain the substring ``"if"``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."],
        target=lp.PyOpenCLTarget(ctx.devices[0]))

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")
    for assumption in ("n mod 16 = 0", "n > 10"):
        knl = lp.assume(knl, assumption)

    device_code = lp.generate_code_v2(knl).device_code()
    assert "if" not in device_code
Example #21
0
def tune_geom_kernel(knl, shape=None, ng=None):
    """Parameters specific to kernels where one array is 2D and one is 3D.

    Splits ``k``, ``j`` and ``i`` using the module-level ``lsize``, ``otags``
    and ``itags`` and, when *shape* is given, sets a loop priority whose
    leading inames depend on ``len(shape)``.

    :arg knl: the kernel to transform.
    :arg shape: optional; only its length is used, to choose between the
        ``mu, nu``, ``mu`` and plain loop-priority strings.
    :arg ng: not used by this function.
    :returns: the transformed kernel.
    """
    # These kernels get run on lots of grid sizes, so the parameters are not
    # fixed from ``shape`` here (various vectors/tensors would need handling).

    # Worthwhile to cache so explicitly?
    knl = _lp.split_iname(knl, "k", lsize[0], outer_tag="for", inner_tag="unr")
    knl = _lp.split_iname(
        knl, "j", lsize[1], outer_tag=otags[0], inner_tag=itags[0])
    knl = _lp.split_iname(
        knl, "i", lsize[2], outer_tag=otags[1], inner_tag=itags[1])

    # TODO caching geom values is especially important. Try things here...
    if shape is not None:
        spatial_order = "i_outer,i_inner,j_outer,j_inner,k_outer,k_inner"
        rank = len(shape)
        if rank > 4:
            leading = "mu, nu,"
        elif rank > 3:
            leading = "mu,"
        else:
            leading = ""
        knl = _lp.prioritize_loops(knl, leading + spatial_order)

    # Currently these functions get used on frontend, too, so the
    # ``no_numpy`` option is not set here.

    return knl
Example #22
0
def test_chunk_iname(ctx_factory):
    """Check ``chunk_iname`` on a one-dimensional scaling kernel.

    ``i`` is chunked with a count of 3 and inner tag ``l.0``; the result is
    compared against the unchunked kernel for ``n=130``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."]
    ref_knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_data,
        assumptions="n>0")

    chunked = lp.prioritize_loops(
        lp.chunk_iname(ref_knl, "i", 3, inner_tag="l.0"),
        "i_outer, i_inner")

    lp.auto_test_vs_ref(ref_knl, ctx, chunked, parameters={"n": 130})
Example #23
0
def test_fuse_kernels(ctx_factory):
    """Fuse two Fortran-derived kernels and compare with a combined kernel.

    ``xderiv`` and ``yderiv`` are parsed from the same Fortran template with
    different loop bodies and fused with a ``data_flow`` entry for
    ``result``. The fused kernel is compared against ``xyderiv``, parsed
    from the template with both bodies concatenated.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_single(inner, name):
        # The parser result is expected to hold exactly one kernel.
        parsed, = lp.parse_fortran(
            fortran_template.format(inner=inner, name=name))
        return parsed

    xderiv = parse_single(xd_line, "xderiv")
    yderiv = parse_single(yd_line, "yderiv")
    xyderiv = parse_single(xd_line + "\n" + yd_line, "xyderiv")

    knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(
        xyderiv, ctx, knl, parameters={"nelements": 20, "ndofs": 4})
Example #24
0
def test_numba_cuda_target():
    """Generate and print code for a pairwise-distance kernel on the Numba CUDA target.

    The kernel computes ``D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))``. It is
    split, given a prefetch of ``X[i,:]``, specialised to ``N=3`` and typed
    with ``X`` as ``float32`` before all generated code is printed. The test
    contains no assertion; it passes when code generation completes.
    """
    knl = lp.make_kernel("{[i,j,k]: 0<=i,j<M and 0<=k<N}",
                         "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
                         target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1))
    # default_tag is spelled out rather than left to the library default.
    # NOTE(review): "l.auto" is understood to be the value loopy falls back
    # to (with a deprecation warning) when the argument is omitted - confirm
    # against the loopy version in use.
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
Example #25
0
def test_prefetch_through_indirect_access():
    """Expect ``add_prefetch`` to raise for an indirectly indexed array.

    ``map1`` is indexed by ``indirect[i]``; requesting a prefetch of
    ``map1[:, j]`` must raise :class:`LoopyError`.
    """
    kernel_data = [
        lp.GlobalArg("a", strides=(2, 1), dtype=int),
        lp.GlobalArg("map1", shape=(10, 10), dtype=int),
        "...",
    ]

    knl = lp.make_kernel(
        "{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
        """
        for i, j, k
            a[map1[indirect[i], j], k] = 2
        end
        """,
        kernel_data,
        target=lp.CTarget())

    knl = lp.prioritize_loops(knl, "i,j,k")

    with pytest.raises(LoopyError):
        lp.add_prefetch(knl, "map1[:, j]")
Example #26
0
def test_numba_cuda_target():
    """Generate and print code for a pairwise-distance kernel on the Numba CUDA target.

    The kernel computes ``D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))``. It is
    split, given a prefetch of ``X[i,:]``, specialised to ``N=3`` and typed
    with ``X`` as ``float32`` before all generated code is printed. The test
    contains no assertion; it passes when code generation completes.
    """
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())
    knl = lp.assume(knl, "M>0")

    # Iname splitting and prefetching.
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", 128, inner_tag="l.0", slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")

    # Specialisation, ordering and tagging.
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    generated = lp.generate_code_v2(knl)
    print(generated.all_code())
Example #27
0
    def get_optimized_kernel(self):
        """Return ``self.get_kernel()`` with device-dependent iname splits applied.

        If the first device of ``self.context`` has the CPU bit set in its
        type, ``itgt`` is split by 16 (tagged ``g.0``/``l.0``), ``isrc`` is
        split by 256 and ``isrc_outer``/``itgt_inner`` are prioritized.
        Otherwise a warning is issued and ``itgt`` is split by 128 with outer
        tag ``g.0``.

        :returns: the transformed loopy kernel.
        """
        # FIXME specialize/tune for GPU/CPU
        loopy_knl = self.get_kernel()

        import pyopencl as cl
        dev = self.context.devices[0]

        if not (dev.type & cl.device_type.CPU):
            from warnings import warn
            warn("don't know how to tune layer potential computation for '%s'" % dev)
            return lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")

        loopy_knl = lp.split_iname(
            loopy_knl, "itgt", 16, outer_tag="g.0", inner_tag="l.0")
        loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
        return lp.prioritize_loops(loopy_knl, ["isrc_outer", "itgt_inner"])
Example #28
0
def test_prefetch_through_indirect_access():
    """Expect ``add_prefetch`` to raise for an indirectly indexed array.

    ``map1`` is indexed by ``indirect[i]``; requesting a prefetch of
    ``map1[:, j]`` must raise :class:`LoopyError`.
    """
    a_arg = lp.GlobalArg("a", strides=(2, 1), dtype=int)
    map1_arg = lp.GlobalArg("map1", shape=(10, 10), dtype=int)

    knl = lp.make_kernel(
        "{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
        """
        for i, j, k
            a[map1[indirect[i], j], k] = 2
        end
        """,
        [a_arg, map1_arg, "..."],
        target=lp.CTarget())

    knl = lp.prioritize_loops(knl, "i,j,k")

    with pytest.raises(LoopyError):
        lp.add_prefetch(knl, "map1[:, j]")
Example #29
0
def test_assume(ctx_factory):
    """Check that assumptions on ``n`` remove conditionals from the code.

    After splitting ``i`` by 16, the kernel is told that ``n`` is a multiple
    of 16 and greater than 10. Every generated loop schedule is compiled and
    its code must not contain the substring ``"if"``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")
    for assumption in ("n mod 16 = 0", "n > 10"):
        knl = lp.assume(knl, assumption)
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    for scheduled_knl in lp.generate_loop_schedules(knl):
        print(scheduled_knl)
        compiled = lp.CompiledKernel(ctx, scheduled_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
Example #30
0
def test_assume(ctx_factory):
    """Check that assumptions on ``n`` remove conditionals from the code.

    After splitting ``i`` by 16, the kernel is told that ``n`` is a multiple
    of 16 and greater than 10. Every generated loop schedule is compiled and
    its code must not contain the substring ``"if"``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("a", np.float32, shape="n"), "..."]
    knl = lp.make_kernel("{[i]: 0<=i<n}", "a[i] = a[i] + 1", kernel_data)

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")
    for assumption in ("n mod 16 = 0", "n > 10"):
        knl = lp.assume(knl, assumption)
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    for scheduled_knl in lp.generate_loop_schedules(knl):
        print(scheduled_knl)
        compiled = lp.CompiledKernel(ctx, scheduled_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
Example #31
0
def test_generate_c_snippet():
    """Build a two-reduction kernel from pymbolic objects and print its body.

    Both instructions are ``lp.Assignment`` objects whose right-hand sides
    are ``sum`` reductions over ``k`` created with
    ``allow_simultaneous=True``. After splitting ``k`` by 4 with the inner
    part tagged ``unr``, the kernel is preprocessed, scheduled and the
    generated body is printed.
    """
    from functools import partial

    from pymbolic import var

    I, f, df, q_v, eN, k, u = [  # noqa
        var(name) for name in ("I", "f", "df", "q_v", "eN", "k", "u")]

    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    instructions = [
        lp.Assignment(f[I], l_sum(k, q_v[k, I]*u)),
        lp.Assignment(df[I], l_sum(k, q_v[k, I])),
    ]
    kernel_data = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_data,
        target=CTarget(),
        assumptions="nQuad>=1")

    play_with_prefetching = False  # enable to play with prefetching
    if play_with_prefetching:
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    print(lp.generate_body(scheduled))
Example #32
0
 def variant_2(knl):
     """Precompute ``dpsi`` over ``i,ell`` and prioritize ``c,i,j``.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     precomputed = lp.precompute(knl, "dpsi", "i,ell", default_tag="for")
     return lp.prioritize_loops(precomputed, "c,i,j")
Example #33
0
 def variant_1(knl):
     """Split ``im_x`` by 16 (inner tag ``l.0``) and set the loop priority.

     :arg knl: the kernel to transform.
     :returns: the transformed kernel.
     """
     split = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
     return lp.prioritize_loops(
         split, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
Example #34
0
def generate(builder, wrapper_name=None):
    """Build the loopy wrapper kernel described by *builder*.

    :arg builder: object supplying the instructions (``emit_instructions()``),
        wrapper arguments, layer extents, loop/layer indices, the kernel to
        call and the ``single_cell``/``extruded`` flags read below.
    :arg wrapper_name: name of the generated kernel; when ``None`` it
        defaults to ``"wrap_" + builder.kernel.name``.
    :returns: the kernel created by ``loopy.make_kernel`` after loop
        priorities, ``builder.kernel``, the preamble and the PETSc function
        lookup have been registered on it.
    """
    # Names of the outer loop indices handed to loop_nesting() below.
    if builder.layer_index is not None:
        outer_inames = frozenset(
            [builder._loop_index.name, builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    # Shared state; it reaches statement() through context.parameters below.
    # NOTE(review): domains/assumptions/temporaries start empty and are read
    # after statement() has run -- presumably filled in there; confirm.
    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    # One slot per wrapper argument; slots still None later get a GlobalArg.
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    # A second merger is built from the initialisers alone; it is also the
    # one applied to mapper.initialisers just below.
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [
        tuple(merger(i) for i in inits) for inits in mapper.initialisers
    ]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    # One zero-based counter per (prefix, postfix) pair.
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node,
                      (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            # The original number is dropped and replaced by the counter.
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [
        rename_nodes(inits, replacements) for inits in mapper.initialisers
    ]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    # Layer extents go through the same mapping and renaming, so the names
    # stored on parameters are refreshed afterwards.
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        # Constrain the domain whose first set dimension is the loop index.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(
                    d.add_constraint(
                        isl.Constraint.eq_from_names(d.space, {
                            "n": 1,
                            "start": -1
                        })))
            else:
                new_domains.append(d)
        domains = new_domains
        if builder.extruded:
            # Same treatment for the domain of the layer index.
            new_domains = []
            for d in domains:
                if d.get_dim_name(isl.dim_type.set,
                                  0) == builder.layer_index.name:
                    # layer = t1 - 1
                    t1 = parameters.layer_end
                    new_domains.append(
                        d.add_constraint(
                            isl.Constraint.eq_from_names(
                                d.space, {
                                    "layer": 1,
                                    t1: -1,
                                    1: 1
                                })))
                else:
                    new_domains.append(d)
        domains = new_domains

    # The tuple-unpacking requires the combined assumptions to consist of
    # exactly one basic set; anything else raises ValueError.
    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Extra assumptions: start >= 0, start < end (single cell) or
    # start <= end (otherwise), and layer_start <= layer_end when extruded.
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(
        ["#include <math.h>", "#include <complex.h>", "#include <petsc.h>"])
    # Sorted so the preamble text does not depend on set iteration order.
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # The kernel is itself a loopy kernel: register it and inline it.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper
Example #35
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    """Transform the fused strong-volume kernels and test them against a reference.

    The Fortran source ``strongVolumeKernels.f90`` next to this file is
    parsed, its ``strongVolumeKernelR`` and ``strongVolumeKernelS`` kernels
    are fused, and a sequence of loopy transformations is applied.  After
    selected stages the current kernel is snapshotted into ``tap_hsv``;
    *opt_level* picks which snapshot is finally compared against the
    untransformed ``ref_hsv`` by ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: values greater than 1 enable the ILP split of ``k``.
    :arg Nq: value the ``Nq`` kernel parameter is fixed to.
    :arg opt_level: stage selector; ``tap_hsv`` is only bound when this
        equals one of 0..7 or is >= 8.
    """
    pytest.importorskip("fparser")
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__),
                            "strongVolumeKernels.f90")
    with open(filename) as sourcef:
        source = sourcef.read()

    # Substitute a concrete Fortran type for the "datafloat" placeholder.
    source = source.replace("datafloat", "real*4")

    program = lp.parse_fortran(source, filename, seq_dependencies=False)

    hsv_r, hsv_s = program["strongVolumeKernelR"], program[
        "strongVolumeKernelS"]

    # Tag instructions by kernel of origin so they stay addressable once fused.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s
    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    from loopy.frontend.fortran.translator import specialize_fortran_division
    hsv = specialize_fortran_division(hsv)

    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Everything below is an optimisation; this is the reference kernel.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv,
                          "D[:,:]",
                          fetch_outer_inames="e",
                          default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        # Unpacking asserts that each such instruction has exactly one assignee.
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # NOTE(review): the split factor is the literal 2, not ilp_multiple
        # -- confirm this is intended.
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the R and S kernels.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # R and S flux instructions are paired up and handled with mirrored
    # sweep/precompute inames.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Unpacking requires exactly one reader of the flux in this kernel.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # The axis order of the temporary depends on which kernel the
            # flux belongs to.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Derive a per-flux name for the "n" iname used by the reader.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        # Variables starting with "Jinv" or containing "_s" are skipped.
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                            lp.find_one_rule_matching(
                                hsv, prep_var_name + "_*subst*"),
                            default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv,
                          "Q[ii,jj,k,:,:,e]",
                          sweep_inames=ilp_inames,
                          default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # On the main path the *_inner field inames are vectorised ("vec")
    # rather than unrolled as in the level 4-6 snapshots above.
    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on only the snapshot chosen by opt_level is used.
    hsv = tap_hsv

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
                             "-cl-finite-math-only", "-cl-mad-enable",
                             "-cl-no-signed-zeros"
                         ])

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv, subgroup_size=32)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    # FIXME: renaming's a bit tricky in this program model.
    # add a simple transformation for it
    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Example #36
0
 def variant_1(knl):
     """Prefetch ``a`` and ``b``, order loops ``i``, ``j`` and nest ``b_fetch`` writes in ``i``."""
     for array_name in ("a", "b"):
         knl = lp.add_prefetch(knl, array_name, default_tag="l.auto")
     ordered = lp.prioritize_loops(knl, ["i", "j"])
     return lp.add_inames_to_insn(ordered, "i", "writes:b_fetch")
Example #37
0
def set_up_volume_loop(kernel, Nq):  # noqa
    """Fix ``Nq``, order and tag the ``e,k,j,i`` loops, and assume ``elements >= 1``.

    :arg kernel: kernel to transform.
    :arg Nq: value the ``Nq`` parameter is fixed to.
    :returns: the transformed kernel.
    """
    with_fixed_nq = lp.fix_parameters(kernel, Nq=Nq)
    ordered = lp.prioritize_loops(with_fixed_nq, "e,k,j,i")
    tagged = lp.tag_inames(ordered, {"e": "g.0", "j": "l.1", "i": "l.0"})
    return lp.assume(tagged, "elements >= 1")
Example #38
0
 def variant_1(knl):
     """Tile ``im_x`` in chunks of 16 (inner iname tagged ``l.0``), then order the loops."""
     loop_order = "iimg,im_x_outer,im_y,ifeat,f_x,f_y"
     return lp.prioritize_loops(
         lp.split_iname(knl, "im_x", 16, inner_tag="l.0"), loop_order)
Example #39
0
    def get_kernel(self):
        """Assemble and return the loopy kernel for this object.

        The instruction list is built from the scaling assignment, a loop
        nest over ``itgt_box``/``itgt``/``isrc_box``, one coefficient load
        per entry of ``self.expansion``, the instructions returned by
        ``get_loopy_insns_and_result_names()`` and one ``result`` update per
        result name.  The kernel then has its ``idim*`` inames tagged
        ``unr``, its loops prioritized, and is passed through
        ``self.expansion.prepare_loopy_kernel`` before being returned.
        """
        ncoeffs = len(self.expansion)

        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        domains = [
            "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
            "{[itgt]: itgt_start<=itgt<itgt_end}",
            "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
            "{[idim]: 0<=idim<dim}",
        ]

        # Opens the itgt_box / itgt / isrc_box loop nest.
        loop_nest_open = """
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    for itgt
                        <> tgt[idim] = targets[idim,itgt]

                        <> isrc_box_start = source_box_starts[itgt_box]
                        <> isrc_box_end = source_box_starts[itgt_box+1]

                        for isrc_box
                            <> src_ibox = source_box_lists[isrc_box]
                            """

        # Template for loading one expansion coefficient.
        coeff_load = """
                            <> coeff{coeffidx} = \
                                src_expansions[src_ibox - src_base_ibox, {coeffidx}]
                            """

        center_and_offset = """

                            <> center[idim] = centers[idim, src_ibox] {dup=idim}
                            <> b[idim] = tgt[idim] - center[idim] {dup=idim}

                            """

        isrc_box_close = """
                        end
                        """

        # Template for accumulating one result component.
        result_update = """
                        result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                                kernel_scaling * simul_reduce(sum, isrc_box,
                                result_{resultidx}_p) {{id_prefix=write_result}}
                        """

        loop_nest_close = """
                    end
                end
                """

        instructions = (
            self.get_kernel_scaling_assignment()
            + [loop_nest_open]
            + [coeff_load.format(coeffidx=icoeff)
               for icoeff in range(ncoeffs)]
            + [center_and_offset]
            + loopy_insns
            + [isrc_box_close]
            + [result_update.format(resultidx=iresult)
               for iresult in range(len(result_names))]
            + [loop_nest_close])

        kernel_data = [
            lp.GlobalArg("targets",
                         None,
                         shape=(self.dim, "ntargets"),
                         dim_tags="sep,C"),
            lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                         None,
                         shape=None),
            lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
            lp.GlobalArg("src_expansions",
                         None,
                         shape=("nsrc_level_boxes", ncoeffs),
                         offset=lp.auto),
            lp.ValueArg("src_base_ibox", np.int32),
            lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
            lp.ValueArg("ntargets", np.int32),
            lp.GlobalArg("result",
                         None,
                         shape="nresults,ntargets",
                         dim_tags="sep,C"),
            lp.GlobalArg("source_box_starts, source_box_lists,",
                         None,
                         shape=None,
                         offset=lp.auto),
            "...",
        ] + [arg.loopy_arg for arg in self.expansion.get_args()]

        loopy_knl = lp.make_kernel(
            domains,
            instructions,
            kernel_data,
            name=self.name,
            assumptions="ntgt_boxes>=1",
            silenced_warnings="write_race(write_result*)",
            default_offset=lp.auto,
            fixed_parameters={"dim": self.dim,
                              "nresults": len(result_names)})

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
        return self.expansion.prepare_loopy_kernel(loopy_knl)
Example #40
0
# Build and compile three kernels -- model, network and time-stepping
# scheme -- each passed through batch_knl() before compilation.
osc = model.HMJE()
osc.dt = 0.1
osc_knl = osc.kernel(target)
osc_knl = batch_knl(osc_knl)
osc_fn = compiler.CompiledKernel(osc_knl, comp())

# Network kernel, built from the model and a Diff coupling on it.
cfun = coupling.Diff(osc)
net = network.Network(osc, cfun)
net_knl = net.kernel(target)
net_knl = batch_knl(net_knl)
net_fn = compiler.CompiledKernel(net_knl, comp())

# Euler stepping kernel using the model's dt; it additionally gets a loop
# priority, a fixed nsvar and an "ilp" tag on iname j.
scm = scheme.EulerStep(osc.dt)
scm_knl = scm.kernel(target)
scm_knl = batch_knl(scm_knl)
scm_knl = lp.prioritize_loops(scm_knl, ['i_subj', 'i', 'j'])
scm_knl = lp.fix_parameters(scm_knl, nsvar=len(osc.state_sym))
scm_knl = lp.tag_inames(scm_knl, [('j', 'ilp')])
scm_fn = compiler.CompiledKernel(scm_knl, comp())

# build other data arrays
# Each zeros() call allocates one array whose leading axis of length 3 is
# unpacked into three separate arrays.
# NOTE: `next` and `input` shadow Python builtins; the names are left
# unchanged because code outside this view may refer to them.
next, state, drift = np.zeros((3, nsubj, nnode, osc.state_sym.size),
                              np.float32)
input, param, diffs = np.zeros((3, nsubj, nnode, 6), np.float32)
obsrv = np.zeros((Dmax + 3, nsubj, nnode, 2), np.float32)
LOG.info('obsrv %r %.3f MB', obsrv.shape, obsrv.nbytes / 2**20)


# step function
def step(n_step=1):
    for _ in range(n_step):
Example #41
0
def generate(builder, wrapper_name=None):
    """Build the loopy wrapper kernel described by *builder*.

    :arg builder: object supplying the instructions (``emit_instructions()``),
        wrapper arguments, layer extents, loop/layer indices, the kernel to
        call and the ``single_cell``/``extruded`` flags read below.
    :arg wrapper_name: name of the generated kernel; when ``None`` it
        defaults to ``"wrap_" + builder.kernel.name``.
    :returns: the kernel created by ``loopy.make_kernel`` after loop
        priorities, ``builder.kernel``, the preamble and the PETSc function
        lookup have been registered on it.
    """
    # Names of the outer loop indices handed to loop_nesting() below.
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name,
                                  builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    # Shared state; it reaches statement() through context.parameters below.
    # NOTE(review): domains/assumptions/temporaries start empty and are read
    # after statement() has run -- presumably filled in there; confirm.
    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    # One slot per wrapper argument; slots still None later get a GlobalArg.
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    # A second merger is built from the initialisers alone; it is also the
    # one applied to mapper.initialisers just below.
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits) for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    # One zero-based counter per (prefix, postfix) pair.
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument, NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            # The original number is dropped and replaced by the counter.
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements) for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments, replacements)
    # Layer extents go through the same mapping and renaming, so the names
    # stored on parameters are refreshed afterwards.
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents], replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames, parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        # Constrain the domain whose first set dimension is the loop index.
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(isl.Constraint.eq_from_names(d.space, {"n": 1, "start": -1})))
            else:
                new_domains.append(d)
        domains = new_domains
        if builder.extruded:
            # Same treatment for the domain of the layer index.
            new_domains = []
            for d in domains:
                if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                    # layer = t1 - 1
                    t1 = parameters.layer_end
                    new_domains.append(d.add_constraint(isl.Constraint.eq_from_names(d.space, {"layer": 1, t1: -1, 1: 1})))
                else:
                    new_domains.append(d)
        domains = new_domains

    # The tuple-unpacking requires the combined assumptions to consist of
    # exactly one basic set; anything else raises ValueError.
    assumptions, = reduce(operator.and_,
                          parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True, ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    # Extra assumptions: start >= 0, start < end (single cell) or
    # start <= end (otherwise), and layer_start <= layer_end when extruded.
    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>"])
    # Sorted so the preamble text does not depend on set iteration order.
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        # The kernel is itself a loopy kernel: register it and inline it.
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import _match_caller_callee_argument_dimension_
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code, tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(wrapper, petsc_function_lookup)

    return wrapper
Example #42
0
File: e2p.py Project: inducer/sumpy
    def get_kernel(self):
        """Assemble and return the loopy kernel for this object.

        The instruction list is built from the scaling assignment, a loop
        nest over ``itgt_box``/``itgt``/``isrc_box``, one coefficient load
        per entry of ``self.expansion``, the instructions returned by
        ``get_loopy_insns_and_result_names()`` and one ``result`` update per
        result name.  The kernel is created with
        ``MOST_RECENT_LANGUAGE_VERSION``, has its ``idim*`` inames tagged
        ``unr`` and its loops prioritized, and is passed through
        ``self.expansion.prepare_loopy_kernel`` before being returned.
        """
        ncoeffs = len(self.expansion)

        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        domains = [
            "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
            "{[itgt]: itgt_start<=itgt<itgt_end}",
            "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
            "{[idim]: 0<=idim<dim}",
        ]

        # Opens the itgt_box / itgt / isrc_box loop nest.
        loop_nest_open = """
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    for itgt
                        <> tgt[idim] = targets[idim,itgt]

                        <> isrc_box_start = source_box_starts[itgt_box]
                        <> isrc_box_end = source_box_starts[itgt_box+1]

                        for isrc_box
                            <> src_ibox = source_box_lists[isrc_box]
                            """

        # Template for loading one expansion coefficient.
        coeff_load = """
                            <> coeff{coeffidx} = \
                                src_expansions[src_ibox - src_base_ibox, {coeffidx}]
                            """

        center_and_offset = """

                            <> center[idim] = centers[idim, src_ibox] {dup=idim}
                            <> b[idim] = tgt[idim] - center[idim] {dup=idim}

                            """

        isrc_box_close = """
                        end
                        """

        # Template for accumulating one result component.
        result_update = """
                        result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                                kernel_scaling * simul_reduce(sum, isrc_box,
                                result_{resultidx}_p) {{id_prefix=write_result}}
                        """

        loop_nest_close = """
                    end
                end
                """

        instructions = (
            self.get_kernel_scaling_assignment()
            + [loop_nest_open]
            + [coeff_load.format(coeffidx=icoeff)
               for icoeff in range(ncoeffs)]
            + [center_and_offset]
            + loopy_insns
            + [isrc_box_close]
            + [result_update.format(resultidx=iresult)
               for iresult in range(len(result_names))]
            + [loop_nest_close])

        kernel_data = [
            lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"),
                dim_tags="sep,C"),
            lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
            lp.GlobalArg("src_expansions", None,
                shape=("nsrc_level_boxes", ncoeffs), offset=lp.auto),
            lp.ValueArg("src_base_ibox", np.int32),
            lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
            lp.ValueArg("ntargets", np.int32),
            lp.GlobalArg("result", None, shape="nresults,ntargets",
                dim_tags="sep,C"),
            lp.GlobalArg("source_box_starts, source_box_lists,",
                None, shape=None, offset=lp.auto),
            "...",
        ] + [arg.loopy_arg for arg in self.expansion.get_args()]

        loopy_knl = lp.make_kernel(
                domains,
                instructions,
                kernel_data,
                name=self.name,
                assumptions="ntgt_boxes>=1",
                silenced_warnings="write_race(write_result*)",
                default_offset=lp.auto,
                fixed_parameters={
                    "dim": self.dim,
                    "nresults": len(result_names)},
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
        return self.expansion.prepare_loopy_kernel(loopy_knl)
Example #43
0
def generate(impero_c,
             args,
             scalar_type,
             kernel_name="loopy_kernel",
             index_names=None,
             return_increments=True):
    """Generates loopy code.

    Builds a :class:`LoopyContext`, declares one loopy temporary per entry
    of ``impero_c.temporaries``, translates the Impero tree into
    instructions, and assembles everything with ``lp.make_function``.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg scalar_type: type of scalars; it is also passed to ``numpy.finfo``
        to obtain the resolution stored as ``ctx.epsilon``
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names, in any form accepted as the
        initial content of a ``defaultdict`` (mapping or iterable of pairs);
        ``None`` (the default) means no pre-assigned names
    :arg return_increments: Does codegen for Return nodes increment the lvalue, or assign?
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    # ``None`` stands in for "no names" so that no mutable default object is
    # shared between calls; unknown indices fall back to the name "i".
    ctx.index_names = defaultdict(
        lambda: "i", () if index_names is None else index_names)
    ctx.epsilon = numpy.finfo(scalar_type).resolution
    ctx.scalar_type = scalar_type
    ctx.return_increments = return_increments

    # Create arguments: the caller's args followed by temporaries t0, t1, ...
    data = list(args)
    for i, (temp, dtype) in enumerate(
            assign_dtypes(impero_c.temporaries, scalar_type)):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            # Constants become read-only temporaries initialised from their
            # array value.
            data.append(
                lp.TemporaryVariable(name,
                                     shape=temp.shape,
                                     dtype=dtype,
                                     initializer=temp.array,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=True))
        else:
            # Writable temporaries get one leading axis per index recorded
            # for them in ctx.indices, followed by the temporary's own shape.
            shape = tuple(index.extent
                          for index in ctx.indices[temp]) + temp.shape
            data.append(
                lp.TemporaryVariable(name,
                                     shape=shape,
                                     dtype=dtype,
                                     initializer=None,
                                     address_space=lp.AddressSpace.LOCAL,
                                     read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    # NOTE(review): ctx.index_extent is presumably filled in while the
    # instructions are generated, so this must stay after statement() --
    # confirm against LoopyContext.
    domains = create_domains(ctx.index_extent.items())

    # Create loopy kernel
    knl = lp.make_function(domains,
                           instructions,
                           data,
                           name=kernel_name,
                           target=lp.CTarget(),
                           seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"],
                           lang_version=(2018, 2))

    # Prevent loop interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    return knl
Example #44
0
def set_up_volume_loop(kernel, Nq):  # noqa
    """Apply the volume-loop transformations to *kernel* and return it.

    In order: fix the parameter ``Nq`` to the given value, prioritize the
    loops as ``e,k,j,i``, tag ``e``/``j``/``i`` as ``g.0``/``l.1``/``l.0``,
    and add the assumption ``elements >= 1``.
    """
    iname_tags = {"e": "g.0", "j": "l.1", "i": "l.0"}

    fixed = lp.fix_parameters(kernel, Nq=Nq)
    ordered = lp.prioritize_loops(fixed, "e,k,j,i")
    tagged = lp.tag_inames(ordered, iname_tags)
    return lp.assume(tagged, "elements >= 1")
Example #45
0
 def variant_2(knl):
     """Precompute ``dpsi`` over ``i,ell``, then prioritize loops ``c,i,j``."""
     precomputed = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
     return lp.prioritize_loops(precomputed, "c,i,j")
Example #46
0
 def variant_1(knl):
     """Prefetch ``a`` and ``b``, order loops ``i`` then ``j``, and add
     iname ``i`` to the instructions matching ``writes:b_fetch``.
     """
     # Same prefetch settings for both arrays, applied in the order a, b.
     for array_name in ("a", "b"):
         knl = lp.add_prefetch(knl, array_name, default_tag="l.auto")
     knl = lp.prioritize_loops(knl, ["i", "j"])
     return lp.add_inames_to_insn(knl, "i", "writes:b_fetch")