コード例 #1
0
ファイル: qbx.py プロジェクト: benSepanski/sumpy
    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
                             centers_is_obj_array):
        """Return the kernel from ``self.get_kernel()`` with array-axis tags
        and a device-dependent iname split applied.

        For every flag that is true, the matching kernel argument ("tgt",
        "src" or "center") is passed to ``lp.tag_array_axes`` with the tag
        string "sep,C".  The split of "itgt" then depends on whether the first
        device of ``self.context`` reports itself as a CPU.
        """
        # FIXME specialize/tune for GPU/CPU
        knl = self.get_kernel()

        tag_requests = (
            (targets_is_obj_array, "tgt"),
            (sources_is_obj_array, "src"),
            (centers_is_obj_array, "center"),
        )
        for requested, arg_name in tag_requests:
            if requested:
                knl = lp.tag_array_axes(knl, arg_name, "sep,C")

        import pyopencl as cl
        dev = self.context.devices[0]

        if not (dev.type & cl.device_type.CPU):
            # No dedicated tuning for this device type: warn and fall back to
            # a plain outer split of the target loop.
            from warnings import warn
            warn(
                "don't know how to tune layer potential computation for '%s'" %
                dev)
            return lp.split_iname(knl, "itgt", 128, outer_tag="g.0")

        knl = lp.split_iname(
            knl, "itgt", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "isrc", 256)
        return lp.prioritize_loops(knl, ["isrc_outer", "itgt_inner"])
コード例 #2
0
ファイル: p2p.py プロジェクト: inducer/sumpy
    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
        """Return ``self.get_kernel()`` with optional "sep,C" axis tags on
        "sources" / "targets" and the "imat" iname split by 1024.
        """
        # FIXME
        kernel = self.get_kernel()

        # Sources are tagged before targets, as requested by the flags.
        for is_obj_array, arg_name in (
                (sources_is_obj_array, "sources"),
                (targets_is_obj_array, "targets")):
            if is_obj_array:
                kernel = lp.tag_array_axes(kernel, arg_name, "sep,C")

        return lp.split_iname(kernel, "imat", 1024, outer_tag="g.0")
コード例 #3
0
ファイル: p2p.py プロジェクト: stjordanis/sumpy
    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array):
        """Fetch the kernel via ``self.get_kernel()``, tag "sources" and/or
        "targets" with "sep,C" when the matching flag is true, and split the
        "imat" iname by 1024 with the outer part tagged "g.0".
        """
        # FIXME
        result = self.get_kernel()

        args_to_tag = [
            arg_name
            for flag, arg_name in (
                (sources_is_obj_array, "sources"),
                (targets_is_obj_array, "targets"))
            if flag
        ]
        for arg_name in args_to_tag:
            result = lp.tag_array_axes(result, arg_name, "sep,C")

        result = lp.split_iname(result, "imat", 1024, outer_tag="g.0")
        return result
コード例 #4
0
ファイル: interactions.py プロジェクト: choward1491/pytential
    def get_optimized_kernel(self, is_sources_obj_array, is_centers_obj_array):
        """Return ``self.get_kernel()`` with optional "sep,C" axis tags on
        "sources" / "qbx_centers", "itgt_center" split by 16, and the result
        passed through ``self._allow_redundant_execution_of_knl_scaling``.
        """
        # FIXME
        kernel = self.get_kernel()

        for is_obj_array, arg_name in (
                (is_sources_obj_array, "sources"),
                (is_centers_obj_array, "qbx_centers")):
            if is_obj_array:
                kernel = lp.tag_array_axes(kernel, arg_name, "sep,C")

        kernel = lp.split_iname(kernel, "itgt_center", 16, outer_tag="g.0")
        return self._allow_redundant_execution_of_knl_scaling(kernel)
コード例 #5
0
ファイル: interactions.py プロジェクト: choward1491/pytential
    def get_optimized_kernel(self, is_targets_obj_array, is_centers_obj_array):
        """Return ``self.get_kernel()`` with optional "sep,C" axis tags on
        "targets" / "qbx_centers", the "iglobal_center" iname tagged "g.0",
        and the result passed through
        ``self._allow_redundant_execution_of_knl_scaling``.
        """
        # FIXME
        kernel = self.get_kernel()

        for is_obj_array, arg_name in (
                (is_targets_obj_array, "targets"),
                (is_centers_obj_array, "qbx_centers")):
            if is_obj_array:
                kernel = lp.tag_array_axes(kernel, arg_name, "sep,C")

        kernel = lp.tag_inames(kernel, {"iglobal_center": "g.0"})
        return self._allow_redundant_execution_of_knl_scaling(kernel)
コード例 #6
0
ファイル: qbx.py プロジェクト: benSepanski/sumpy
    def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
                             centers_is_obj_array):
        """Return ``self.get_kernel()`` with optional "sep,C" axis tags on
        "tgt" / "src" / "center" and the "imat" iname split by 1024 (outer
        part tagged "g.0").
        """
        knl = self.get_kernel()

        tag_requests = (
            (targets_is_obj_array, "tgt"),
            (sources_is_obj_array, "src"),
            (centers_is_obj_array, "center"),
        )
        for requested, arg_name in tag_requests:
            if requested:
                knl = lp.tag_array_axes(knl, arg_name, "sep,C")

        return lp.split_iname(knl, "imat", 1024, outer_tag="g.0")
コード例 #7
0
ファイル: test_transform.py プロジェクト: connorjward/loopy
def test_tag_data_axes(ctx_factory):
    """Check that ``lp.tag_array_axes`` rejects two invalid tag strings and
    that a kernel tagged with "N1,N0,N2" still matches the untagged
    reference under ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    reference = lp.make_kernel("{ [i,j,k]: 0<=i,j,k<n }", "out[i,j,k] = 15")

    # Both of these tag strings are expected to raise LoopyError.
    for rejected_tags in ("N1,N0,N5", "N1,N0,c"):
        with pytest.raises(lp.LoopyError):
            lp.tag_array_axes(reference, "out", rejected_tags)

    tagged = lp.tag_array_axes(reference, "out", "N1,N0,N2")
    tagged = lp.tag_inames(tagged, {"j": "g.0", "i": "g.1"})

    lp.auto_test_vs_ref(reference, ctx, tagged, parameters={"n": 20})
コード例 #8
0
def set_q_storage_format(kernel, name):
    """Apply a fixed sequence of array-layout transformations to array *name*.

    The axes are named "i,j,k,field,e", ``lp.split_array_dim`` is called with
    ``(name, 3, "F")`` and a count of 4 (inames are not split automatically),
    and the array is finally tagged "N0,N1,N2,vec,N4,N3".  The transformed
    kernel is returned.
    """
    named = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")

    split_spec = (name, 3, "F")
    split = lp.split_array_dim(named, split_spec, 4, auto_split_inames=False)

    return lp.tag_array_axes(split, name, "N0,N1,N2,vec,N4,N3")
コード例 #9
0
    def copy_targets_kernel(self):
        """Build the "copy_targets" loopy kernel, which assigns
        ``targets[dim, i] = points[dim, i]``.

        ``ndims`` is fixed to ``self.ambient_dim``, the "i" iname is split by
        128, "points" and "targets" get their axis tags, and the "dim" iname
        is tagged "ilp".
        """
        domain = """{[dim,i]:
                0<=dim<ndims and
                0<=i<npoints}"""
        instructions = """
                targets[dim, i] = points[dim, i]
                """

        kernel = lp.make_kernel(
            domain,
            instructions,
            default_offset=lp.auto,
            name="copy_targets",
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        kernel = lp.fix_parameters(kernel, ndims=self.ambient_dim)
        kernel = lp.split_iname(
            kernel, "i", 128, inner_tag="l.0", outer_tag="g.0")

        axis_tags = (
            ("points", "sep, C"),
            ("targets", "stride:auto, stride:1"),
        )
        for array_name, tags in axis_tags:
            kernel = lp.tag_array_axes(kernel, array_name, tags)

        return lp.tag_inames(kernel, {"dim": "ilp"})
コード例 #10
0
ファイル: unregularized.py プロジェクト: inducer/pytential
    def copy_targets_kernel(self):
        """Return the "copy_targets" loopy kernel.

        The kernel body is ``targets[dim, i] = points[dim, i]``.  After
        creation, ``ndims`` is fixed to ``self.ambient_dim``, "i" is split by
        128 ("g.0" outer / "l.0" inner), the "points" and "targets" arrays are
        axis-tagged, and "dim" is tagged "ilp".
        """
        loop_domain = """{[dim,i]:
                0<=dim<ndims and
                0<=i<npoints}"""
        copy_insn = """
                targets[dim, i] = points[dim, i]
                """

        result = lp.make_kernel(
            loop_domain, copy_insn,
            default_offset=lp.auto, name="copy_targets",
            lang_version=MOST_RECENT_LANGUAGE_VERSION)
        result = lp.fix_parameters(result, ndims=self.ambient_dim)
        result = lp.split_iname(
            result, "i", 128, inner_tag="l.0", outer_tag="g.0")

        result = lp.tag_array_axes(result, "points", "sep, C")
        result = lp.tag_array_axes(result, "targets", "stride:auto, stride:1")

        result = lp.tag_inames(result, {"dim": "ilp"})
        return result
コード例 #11
0
def set_q_storage_format(kernel, name):
    """Return *kernel* with the storage layout of array *name* rewritten.

    Three loopy transformations are applied in order: the axes are named
    "i,j,k,field,e"; ``lp.split_array_dim`` is called with ``(name, 3, "F")``
    and a count of 4 without automatic iname splitting; and the array is
    tagged "N0,N1,N2,vec,N4,N3".
    """
    result = lp.set_array_axis_names(kernel, name, "i,j,k,field,e")
    result = lp.split_array_dim(
        result, (name, 3, "F"), 4, auto_split_inames=False)
    result = lp.tag_array_axes(result, name, "N0,N1,N2,vec,N4,N3")
    return result
コード例 #12
0
        def prg():
            """Build the "diff" loopy program computing
            ``result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])`` and tag
            both axes of "mat" with "stride:auto".
            """
            domain = """{[iel, idof, j]:
                    0<=iel<nelements and
                    0<=idof<ndiscr_nodes_out and
                    0<=j<ndiscr_nodes_in}"""
            instruction = (
                "result[iel, idof] = sum(j, mat[idof, j] * vec[iel, j])")

            program = make_loopy_program(domain, instruction, name="diff")
            return lp.tag_array_axes(program, "mat", "stride:auto,stride:auto")
コード例 #13
0
ファイル: test_target.py プロジェクト: sailfish009/loopy
def test_cuda_short_vector():
    """Generate and print device code for a CUDA-target kernel whose "i"
    iname is split by 4 with a "vec" inner tag and whose arrays "a" and "out"
    are split and tagged "C,vec".
    """
    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())

    kernel = lp.set_options(kernel, write_code=True)

    # Split the loop and the arrays by the same factor of 4.
    kernel = lp.split_iname(kernel, "i", 4, slabs=(0, 1), inner_tag="vec")
    kernel = lp.split_array_axis(kernel, "a,out", axis_nr=0, count=4)
    kernel = lp.tag_array_axes(kernel, "a,out", "C,vec")

    kernel = lp.set_options(kernel, write_wrapper=True)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    generated = lp.generate_code_v2(kernel)
    print(generated.device_code())
コード例 #14
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_cuda_short_vector():
    """Build ``out[i] = 2*a[i]`` for ``lp.CudaTarget``, vectorize it by 4 and
    print the generated device code.
    """
    vec_width = 4
    arrays = "a,out"

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }", "out[i] = 2*a[i]", target=lp.CudaTarget())
    knl = lp.set_options(knl, write_code=True)

    knl = lp.split_iname(knl, "i", vec_width, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, arrays, axis_nr=0, count=vec_width)
    knl = lp.tag_array_axes(knl, arrays, "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
コード例 #15
0
        def prg(nmatrices):
            """Build the "diff" loopy program for *nmatrices* matrices.

            The ``nmatrices`` parameter is fixed to the given value, the
            "imatrix" iname is tagged "unr", and the "result" array is tagged
            "sep,c,c".
            """
            domain = """{[imatrix, iel, idof, j]:
                    0<=imatrix<nmatrices and
                    0<=iel<nelements and
                    0<=idof<nunit_nodes_out and
                    0<=j<nunit_nodes_in}"""
            instructions = """
                result[imatrix, iel, idof] = sum(
                        j, diff_mat[imatrix, idof, j] * vec[iel, j])
                """

            program = make_loopy_program(domain, instructions, name="diff")
            program = lp.fix_parameters(program, nmatrices=nmatrices)
            program = lp.tag_inames(program, "imatrix: unr")
            return lp.tag_array_axes(program, "result", "sep,c,c")
コード例 #16
0
ファイル: test_target.py プロジェクト: tj-sun/loopy
def test_numba_cuda_target():
    """Generate and print code for a pairwise-distance kernel on
    ``lp.NumbaCudaTarget``.

    The kernel computes ``D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))``.  It
    is split, prefetched and tagged before code generation; the test only
    checks that code generation runs, and prints the result.
    """
    knl = lp.make_kernel("{[i,j,k]: 0<=i,j<M and 0<=k<N}",
                         "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
                         target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    # Pass default_tag explicitly rather than relying on add_prefetch's
    # implicit default, which newer loopy versions deprecate.
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
コード例 #17
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_numba_cuda_target():
    """Build a pairwise-distance kernel for ``lp.NumbaCudaTarget``, apply a
    fixed list of loopy transformations and print all generated code.
    """
    kernel = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())
    kernel = lp.assume(kernel, "M>0")

    # Work decomposition of the two outer loops.
    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0")
    kernel = lp.split_iname(
        kernel, "j", 128, inner_tag="l.0", slabs=(0, 1))

    kernel = lp.add_prefetch(kernel, "X[i,:]", default_tag="l.auto")
    kernel = lp.fix_parameters(kernel, N=3)
    kernel = lp.prioritize_loops(kernel, "i_inner,j_outer")
    kernel = lp.tag_inames(kernel, "k:unr")
    kernel = lp.tag_array_axes(kernel, "X", "N0,N1")

    kernel = lp.add_and_infer_dtypes(kernel, {"X": np.float32})

    generated = lp.generate_code_v2(kernel)
    print(generated.all_code())
コード例 #18
0
ファイル: __init__.py プロジェクト: mattwala/meshmode
        def knl():
            """Build the "nodes" loopy kernel, which applies
            ``resampling_mat`` to ``nodes`` along the last axis.

            The "i" iname is split by 16 (inner part "l.0"), "k" is tagged
            "g.0", and all three axes of "result" are tagged "stride:auto".
            """
            domain = """{[d,k,i,j]:
                    0<=d<dims and
                    0<=k<nelements and
                    0<=i<ndiscr_nodes and
                    0<=j<nmesh_nodes}"""
            instructions = """
                    result[d, k, i] = \
                        sum(j, resampling_mat[i, j] * nodes[d, k, j])
                    """

            kernel = lp.make_kernel(
                domain, instructions,
                name="nodes",
                default_offset=lp.auto)

            kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
            kernel = lp.tag_inames(kernel, {"k": "g.0"})
            return lp.tag_array_axes(
                kernel, "result", "stride:auto,stride:auto,stride:auto")
コード例 #19
0
        def knl():
            """Return the "nodes" loopy kernel:
            ``result[d, k, i] = sum(j, resampling_mat[i, j] * nodes[d, k, j])``
            with "i" split by 16, "k" tagged "g.0" and "result" tagged with
            automatic strides on all three axes.
            """
            result_knl = lp.make_kernel(
                """{[d,k,i,j]:
                    0<=d<dims and
                    0<=k<nelements and
                    0<=i<ndiscr_nodes and
                    0<=j<nmesh_nodes}""",
                """
                    result[d, k, i] = \
                        sum(j, resampling_mat[i, j] * nodes[d, k, j])
                    """,
                name="nodes",
                default_offset=lp.auto)

            result_axis_tags = ",".join(["stride:auto"] * 3)

            result_knl = lp.split_iname(result_knl, "i", 16, inner_tag="l.0")
            result_knl = lp.tag_inames(result_knl, {"k": "g.0"})
            result_knl = lp.tag_array_axes(
                result_knl, "result", result_axis_tags)
            return result_knl
コード例 #20
0
ファイル: test_transform.py プロジェクト: connorjward/loopy
def test_vectorize(ctx_factory):
    """Compare a kernel whose "i_inner" iname is tagged "vec" against the
    same kernel with "i_inner" tagged "unr", using ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    base = lp.make_kernel(
        "{[i]: 0<=i<n}", """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    base = lp.add_and_infer_dtypes(base, {"b": np.float32})
    base = lp.set_array_axis_names(base, "a,b", "i")
    base = lp.split_array_dim(
        base, [("a", 0), ("b", 0)], 4,
        split_kwargs={"slabs": (0, 1)})
    base = lp.tag_array_axes(base, "a,b", "c,vec")

    # Reference and test kernel differ only in the tag on "i_inner".
    ref_knl = lp.tag_inames(base, {"i_inner": "unr"})
    vec_knl = lp.tag_inames(base, {"i_inner": "vec"})

    vec_knl = lp.preprocess_kernel(vec_knl)
    _code, _info = lp.generate_code(vec_knl)

    lp.auto_test_vs_ref(ref_knl, ctx, vec_knl, parameters={"n": 30})
コード例 #21
0
ファイル: p2p.py プロジェクト: jdoherty7/sumpy
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel is named ``self.name`` and loops over target boxes
        (``itgt_box``), the source boxes listed for each target box
        (``isrc_box``), and the targets/sources inside those boxes
        (``itgt``/``isrc``).  For each kernel index it accumulates a scaled
        sum over ``isrc`` of ``pair_result_<index>`` into
        ``result[<index>, itgt]``.

        When ``self.exclude_self`` is true, each pair expression is wrapped so
        that it evaluates to 0 where ``is_self`` holds, and an extra
        ``target_to_source`` argument is declared.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        from pymbolic import var
        # One expression per result name: the result variable times the
        # strength entry selected by self.strength_usage for source isrc.
        exprs = [
            var(name) * var("strength").index(
                (self.strength_usage[i], var("isrc")))
            for i, name in enumerate(result_names)
        ]

        if self.exclude_self:
            from pymbolic.primitives import If, Variable
            # Replace each expression by 0 wherever is_self is true.
            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
            [
                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
                "{[itgt,isrc,idim]: \
                        itgt_start<=itgt<itgt_end and \
                        isrc_start<=isrc<isrc_end and \
                        0<=idim<dim }",
            ],
            # The instruction list is assembled from fragments; the textual
            # "for"/"end" nesting spans several of the string pieces below,
            # so their order matters.
            self.get_kernel_scaling_assignments() + [
                """
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    <> isrc_box_start = source_box_starts[itgt_box]
                    <> isrc_box_end = source_box_starts[itgt_box+1]

                    for isrc_box
                        <> src_ibox = source_box_lists[isrc_box]
                        <> isrc_start = box_source_starts[src_ibox]
                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                        for itgt
                            for isrc
                                <> d[idim] = \
                                        targets[idim,itgt] - sources[idim,isrc] \
                                        {dup=idim}
                                """
            ] + [
                """
                                <> is_self = (isrc == target_to_source[itgt])
                                """ if self.exclude_self else ""
            ] + [] + loopy_insns + [
                lp.Assignment(id=None,
                              assignee="pair_result_%d" % i,
                              expression=expr,
                              temp_var_type=lp.auto)
                for i, expr in enumerate(exprs)
            ] + [
                """
                            end
                            """
            ] + [
                # One accumulation statement per kernel index; KNLIDX is a
                # textual placeholder substituted with that index.
                """
                            result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
                                knl_KNLIDX_scaling \
                                * simul_reduce(sum, isrc, pair_result_KNLIDX)
                            """.replace("KNLIDX", str(iknl))
                for iknl in range(len(exprs))
            ] + [
                """
                        end
                    end
                end
                """
            ],
            [
                lp.GlobalArg(
                    "box_target_starts,box_target_counts_nonchild,"
                    "box_source_starts,box_source_counts_nonchild,",
                    None,
                    shape=None),
                lp.GlobalArg(
                    "source_box_starts, source_box_lists,", None, shape=None),
                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                lp.GlobalArg("result",
                             None,
                             shape="nkernels,ntargets",
                             dim_tags="sep,c"),
                lp.GlobalArg(
                    "targets", None, shape="dim,ntargets", dim_tags="sep,c"),
                lp.GlobalArg(
                    "sources", None, shape="dim,nsources", dim_tags="sep,c"),
                lp.ValueArg("nsources", np.int32),
                lp.ValueArg("ntargets", np.int32),
                "...",
            ] + ([
                lp.GlobalArg(
                    "target_to_source", np.int32, shape=("ntargets", ))
            ] if self.exclude_self else []) +
            gather_loopy_source_arguments(self.kernels),
            name=self.name,
            assumptions="ntgt_boxes>=1",
            fixed_parameters=dict(dim=self.dim,
                                  nstrengths=self.strength_count,
                                  nkernels=len(self.kernels)))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

        # Give every kernel object the chance to adjust the loopy kernel.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
コード例 #22
0
def set_D_storage_format(kernel):
    """Return *kernel* with both axes of array "D" tagged "f"."""
    array_name = "D"
    axis_tags = "f,f"
    return lp.tag_array_axes(kernel, array_name, axis_tags)
コード例 #23
0
def set_D_storage_format(kernel):
    """Tag the array "D" in *kernel* with the axis tags "f,f" and return the
    resulting kernel.
    """
    tagged_kernel = lp.tag_array_axes(kernel, "D", "f,f")
    return tagged_kernel
コード例 #24
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Transform the fused horizontal strong-volume kernels and compare the
    result against the untransformed reference.

    The Fortran source "strongVolumeKernels.f90" is read from the current
    directory, the kernels whose names contain "KernelR" / "KernelS" are
    parsed and fused, and a sequence of loopy transformations is applied.
    *opt_level* selects which intermediate stage ("tap") of that sequence is
    actually tested; op and memory statistics are printed, and the selected
    kernel is checked with ``lp.auto_test_vs_ref`` for ``elements=300``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: values greater than 1 enable the "ilp" split of "k".
    :arg Nq: value fixed for the kernel parameter ``Nq``.
    :arg opt_level: index of the transformation stage to test.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    # Exactly two matching kernels are expected (unpacking fails otherwise).
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Everything below is compared against this state of the kernel.
    ref_hsv = hsv

    # NOTE(review): tap_hsv is only bound by the opt_level checks below
    # (== 0..7 or >= 8); any other value would leave it unbound at
    # "hsv = tap_hsv" -- confirm callers only pass non-negative integers.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        # NOTE(review): the split factor is the literal 2, not ilp_multiple
        # -- confirm this is intended.
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created for the r and s kernels; each
    # list is passed to lp.alias_temporaries further down.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel is expected to read the
            # flux variable.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Derive a per-flux name for the "n" iname inside the reader.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        # Variables starting with "Jinv" or containing "_s" are skipped.
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on, only the stage selected by opt_level is used.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
コード例 #25
0
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

# Example script: double a float32 array with a loopy kernel whose inner
# iname is tagged "vec" and whose arrays carry a trailing "vec" axis.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Input: 0, 1, ..., n-1 as float32 on the device.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

# Build the kernel and turn on the ``write_code`` option in one expression.
knl = lp.set_options(
    lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]"),
    write_code=True,
)

# Split iname "i" by 4 (inner part tagged "vec"), split axis 0 of both
# arrays by the same count, then tag the resulting two axes as "C,vec".
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

# The input is handed over reshaped to rows of 4 entries.
knl(queue, a=a.reshape(-1, 4), n=n)
コード例 #26
0
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel loops over target boxes (``itgt_box``).  For each one it
        walks the entries of ``source_box_lists`` between
        ``source_box_starts[itgt_box]`` and ``source_box_starts[itgt_box+1]``
        and, for every target/source pair in the two boxes, evaluates
        ``d[idim] = targets[idim,itgt] - sources[idim,isrc]`` followed by the
        instructions obtained from ``get_loopy_insns_and_result_names``.
        For each result index ``k``, the sum over ``isrc`` of
        ``pair_result_k`` is multiplied by ``knl_k_scaling`` and added to
        ``result[k, itgt]``.

        After construction, ``dim``, ``nstrengths`` and ``nkernels`` are
        fixed, inames matching ``idim*`` are tagged ``unr``, the axes of
        ``strength`` are tagged ``"sep,C"``, and every entry of
        ``self.kernels`` post-processes the kernel through its
        ``prepare_loopy_kernel`` method.

        :returns: the kernel produced by the steps above.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        from pymbolic import var
        # One expression per result name:
        # <name> * strength[self.strength_usage[i], isrc].
        exprs = [
                var(name)
                * var("strength").index((self.strength_usage[i], var("isrc")))
                for i, name in enumerate(result_names)]

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
            # Loop domains: target boxes, source boxes of the current target
            # box, and the target/source/dimension indices inside the boxes.
            [
                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}",
                "{[itgt,isrc,idim]: \
                        itgt_start<=itgt<itgt_end and \
                        isrc_start<=isrc<isrc_end and \
                        0<=idim<dim }",
                ],
            # Instructions: the scaling assignments come first, then the
            # nested loop text, which is stitched together from string
            # fragments and instruction objects.
            self.get_kernel_scaling_assignments()
            + [
                """
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    <> isrc_box_start = source_box_starts[itgt_box]
                    <> isrc_box_end = source_box_starts[itgt_box+1]

                    for isrc_box
                        <> src_ibox = source_box_lists[isrc_box]
                        <> isrc_start = box_source_starts[src_ibox]
                        <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                        for itgt
                            for isrc
                                <> d[idim] = \
                                        targets[idim,itgt] - sources[idim,isrc] \
                                        {dup=idim}
                                """
                                ] + loopy_insns + [
                                # pair_result_<i> holds the i-th expression
                                # for the current (itgt, isrc) pair.
                                lp.Assignment(id=None,
                                    assignee="pair_result_%d" % i, expression=expr,
                                    temp_var_type=lp.auto)
                                for i, expr in enumerate(exprs)
                                ] + [
                                """
                            end
                            """] + ["""
                            result[KNLIDX, itgt] = result[KNLIDX, itgt] + \
                                knl_KNLIDX_scaling \
                                * simul_reduce(sum, isrc, pair_result_KNLIDX)
                            """.replace("KNLIDX", str(iknl))
                            # One accumulation statement per expression; the
                            # KNLIDX placeholder becomes the expression index.
                            for iknl in range(len(exprs))] + ["""
                        end
                    end
                end
                """],
            # Kernel arguments; "..." is passed through to loopy alongside
            # the explicitly declared ones.
            [
                lp.GlobalArg("box_target_starts,box_target_counts_nonchild,"
                    "box_source_starts,box_source_counts_nonchild,",
                    None, shape=None),
                lp.GlobalArg("source_box_starts, source_box_lists,",
                    None, shape=None),
                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                lp.GlobalArg("result", None,
                    shape="nkernels,ntargets", dim_tags="sep,c"),
                lp.GlobalArg("targets", None,
                    shape="dim,ntargets", dim_tags="sep,c"),
                lp.GlobalArg("sources", None,
                    shape="dim,nsources", dim_tags="sep,c"),
                lp.ValueArg("nsources", np.int32),
                lp.ValueArg("ntargets", np.int32),
                "...",
            ] + gather_loopy_source_arguments(self.kernels),
            name=self.name, assumptions="ntgt_boxes>=1")

        # Replace the symbolic sizes by their concrete values.
        loopy_knl = lp.fix_parameters(
                loopy_knl,
                dim=self.dim,
                nstrengths=self.strength_count,
                nkernels=len(self.kernels))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

        # Let every kernel in self.kernels apply its own preparation step.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
コード例 #27
0
    def get_kernel(self):
        """Assemble and return the loopy kernel for ``self.expansions``.

        Steps, in order:

        * create symbolic vectors ``a`` and ``b`` of length ``self.dim``;
        * call ``expand`` once per expansion, collecting the assignments in a
          ``SymbolicAssignmentCollection`` and remembering the result names;
        * run global CSE on the collection, substitute ``tau = 0`` into every
          assignment and pass the list through ``kill_trivial_assignments``
          (result names are retained);
        * convert the assignments to loopy instructions;
        * build a kernel over ``itgt``/``isrc``/``idim`` that stores
          ``pair_result_<i>`` for each result and then appends the
          instructions from ``get_result_store_instructions``;
        * fix ``dim``, tag ``idim*`` inames ``unr``, let every expansion call
          ``prepare_loopy_kernel`` and tag the ``center`` axes ``"sep,C"``.

        :returns: the kernel produced by the steps above.
        """
        from sumpy.symbolic import make_sympy_vector

        a_vec = make_sympy_vector("a", self.dim)
        b_vec = make_sympy_vector("b", self.dim)

        from sumpy.assignment_collection import SymbolicAssignmentCollection
        collection = SymbolicAssignmentCollection()

        logger.info("compute expansion expressions: start")

        result_names = []
        for iexpn, expansion in enumerate(self.expansions):
            result_names.append(
                    expand(iexpn, collection, expansion, a_vec, b_vec))

        logger.info("compute expansion expressions: done")

        collection.run_global_cse()

        from sumpy.symbolic import kill_trivial_assignments
        # Every assignment with tau replaced by 0.
        tau_free = [
                (lhs, rhs.subs("tau", 0))
                for lhs, rhs in six.iteritems(collection.assignments)]
        assignments = kill_trivial_assignments(
                tau_free, retain_names=result_names)

        from sumpy.codegen import to_loopy_insns
        code_transformers = [
                expansion.kernel.get_code_transformer()
                for expansion in self.expansions]
        loopy_insns = to_loopy_insns(
                assignments,
                vector_names={"a", "b"},
                pymbolic_expr_maps=code_transformers,
                complex_dtype=np.complex128  # FIXME
                )

        # Each result variable times whatever get_strength_or_not returns
        # for the source index.
        src_index = var("isrc")
        weighted = [
                var(name) * self.get_strength_or_not(src_index, iresult)
                for iresult, name in enumerate(result_names)]

        from sumpy.tools import gather_loopy_source_arguments
        arguments = (
                self.get_src_tgt_arguments()
                + self.get_input_and_output_arguments()
                + gather_loopy_source_arguments(self.kernels))

        domain = (
                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
                "and 0<=idim<dim}")

        # Pieces of the instruction list, gathered in the order in which
        # they appear in the kernel.
        scaling_insns = self.get_kernel_scaling_assignments()
        a_and_b_insn = self.get_compute_a_and_b_vecs()
        # zip() keeps the pairing with self.value_dtypes; the dtype itself
        # is not used here.
        pair_result_insns = [
                lp.Assignment(
                    id=None,
                    assignee="pair_result_%d" % ipair,
                    expression=pair_expr,
                    temp_var_type=lp.auto)
                for ipair, (pair_expr, _)
                in enumerate(zip(weighted, self.value_dtypes))]
        store_insns = self.get_result_store_instructions()

        instructions = (
                scaling_insns
                + ["for itgt, isrc"]
                + [a_and_b_insn]
                + loopy_insns
                + pair_result_insns
                + ["end"]
                + store_insns)

        knl = lp.make_kernel(
                domain,
                instructions,
                arguments,
                name=self.name,
                assumptions="nsources>=1 and ntargets>=1",
                default_offset=lp.auto,
                silenced_warnings="write_race(write_lpot*)")

        knl = lp.fix_parameters(knl, dim=self.dim)
        knl = lp.tag_inames(knl, "idim*:unr")

        for expansion in self.expansions:
            knl = expansion.prepare_loopy_kernel(knl)

        return lp.tag_array_axes(knl, "center", "sep,C")
コード例 #28
0
ファイル: p2p.py プロジェクト: inducer/sumpy
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel loops over target boxes (``itgt_box``).  For each one it
        walks the entries of ``source_box_lists`` between
        ``source_box_starts[itgt_box]`` and ``source_box_starts[itgt_box+1]``
        and, for every target/source pair in the two boxes, evaluates
        ``d[idim] = targets[idim, itgt] - sources[idim, isrc]``, the
        instructions from ``get_loopy_insns_and_result_names`` and those from
        ``get_kernel_exprs``.  For each index ``i`` in
        ``range(len(self.kernels))``, the sum over ``isrc`` of
        ``pair_result_i`` is multiplied by ``knl_i_scaling`` and added to
        ``result[i, itgt]``.

        When ``self.exclude_self`` is true, an extra temporary
        ``is_self = (isrc == target_to_source[itgt])`` is emitted inside the
        innermost loop.

        :returns: the kernel, after dtypes for ``nsources``/``ntargets`` have
            been set, ``idim*`` inames tagged ``unr``, ``targets`` and
            ``sources`` tagged ``"sep,C"`` and every entry of
            ``self.kernels`` has applied ``prepare_loopy_kernel``.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # Default source/target arguments plus the box-structure arrays,
        # strengths and result; "..." is passed through to loopy as well.
        arguments = (
            self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("box_target_starts",
                    None, shape=None),
                lp.GlobalArg("box_target_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("box_source_starts",
                    None, shape=None),
                lp.GlobalArg("box_source_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("source_box_starts",
                    None, shape=None),
                lp.GlobalArg("source_box_lists",
                    None, shape=None),
                lp.GlobalArg("strength", None,
                    shape="nstrengths, nsources", dim_tags="sep,C"),
                lp.GlobalArg("result", None,
                    shape="nkernels, ntargets", dim_tags="sep,C"),
                "..."
            ])

        # Loop domains: target boxes, source boxes of the current target
        # box, and the target/source/dimension indices inside the boxes.
        loopy_knl = lp.make_kernel([
            "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
            "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
            "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
            ],
            # Instructions: scaling assignments, then the nested loop text
            # stitched together from string fragments and instruction lists.
            self.get_kernel_scaling_assignments()
            + ["""
                for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                    for isrc
                        <> d[idim] = \
                            targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """] + ["""
                        <> is_self = (isrc == target_to_source[itgt])
                    """ if self.exclude_self else ""]
            + loopy_insns + kernel_exprs
            # First of four "end" markers; the other three follow the
            # result accumulation below.
            + ["    end"]
            + ["""
                    result[{i}, itgt] = result[{i}, itgt] + \
                        knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                        {{id_prefix=write_csr}}
                """.format(i=iknl)
                # One accumulation statement per kernel index.
                for iknl in range(len(self.kernels))]
            + ["""
                    end
                end
                end
            """],
            arguments,
            assumptions="ntgt_boxes>=1",
            name=self.name,
            # The pattern matches the "write_csr" id prefix given to the
            # accumulation statements above.
            silenced_warnings="write_race(write_csr*)",
            fixed_parameters=dict(
                dim=self.dim,
                nstrengths=self.strength_count,
                nkernels=len(self.kernels)),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

        # Let every kernel in self.kernels apply its own preparation step.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
コード例 #29
0
ファイル: p2p.py プロジェクト: stjordanis/sumpy
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel loops over target boxes (``itgt_box``).  For each one it
        walks the entries of ``source_box_lists`` between
        ``source_box_starts[itgt_box]`` and ``source_box_starts[itgt_box+1]``
        and, for every target/source pair in the two boxes, evaluates
        ``d[idim] = targets[idim, itgt] - sources[idim, isrc]``, the
        instructions from ``get_loopy_insns_and_result_names`` and those from
        ``get_kernel_exprs``.  For each index ``i`` in
        ``range(len(self.kernels))``, the sum over ``isrc`` of
        ``pair_result_i`` is multiplied by ``knl_i_scaling`` and added to
        ``result[i, itgt]``.

        When ``self.exclude_self`` is true, an extra temporary
        ``is_self = (isrc == target_to_source[itgt])`` is emitted inside the
        innermost loop.

        :returns: the kernel, after dtypes for ``nsources``/``ntargets`` have
            been set, ``idim*`` inames tagged ``unr``, ``targets`` and
            ``sources`` tagged ``"sep,C"`` and every entry of
            ``self.kernels`` has applied ``prepare_loopy_kernel``.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        # Default source/target arguments plus the box-structure arrays,
        # strengths and result; "..." is passed through to loopy as well.
        arguments = (self.get_default_src_tgt_arguments() + [
            lp.GlobalArg("box_target_starts", None, shape=None),
            lp.GlobalArg("box_target_counts_nonchild", None, shape=None),
            lp.GlobalArg("box_source_starts", None, shape=None),
            lp.GlobalArg("box_source_counts_nonchild", None, shape=None),
            lp.GlobalArg("source_box_starts", None, shape=None),
            lp.GlobalArg("source_box_lists", None, shape=None),
            lp.GlobalArg("strength",
                         None,
                         shape="nstrengths, nsources",
                         dim_tags="sep,C"),
            lp.GlobalArg(
                "result", None, shape="nkernels, ntargets", dim_tags="sep,C"),
            "..."
        ])

        loopy_knl = lp.make_kernel(
            # Loop domains: target boxes, source boxes of the current target
            # box, and the target/source/dimension indices inside the boxes.
            [
                "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}",
                "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}",
                "{[itgt, isrc, idim]: \
                itgt_start <= itgt < itgt_end and \
                isrc_start <= isrc < isrc_end and \
                0 <= idim < dim}",
            ],
            # Instructions: scaling assignments, then the nested loop text
            # stitched together from string fragments and instruction lists.
            self.get_kernel_scaling_assignments() + [
                """
                for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox]

                <> isrc_box_start = source_box_starts[itgt_box]
                <> isrc_box_end = source_box_starts[itgt_box+1]

                for isrc_box
                    <> src_ibox = source_box_lists[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox]

                    for itgt
                    for isrc
                        <> d[idim] = \
                            targets[idim, itgt] - sources[idim, isrc] {dup=idim}
            """
            ] + [
                """
                        <> is_self = (isrc == target_to_source[itgt])
                    """ if self.exclude_self else ""
            # The "    end" entry is the first of four "end" markers; the
            # other three follow the result accumulation.
            ] + loopy_insns + kernel_exprs + ["    end"] + [
                # One accumulation statement per kernel index.
                """
                    result[{i}, itgt] = result[{i}, itgt] + \
                        knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \
                        {{id_prefix=write_csr}}
                """.format(i=iknl) for iknl in range(len(self.kernels))
            ] + [
                """
                    end
                end
                end
            """
            ],
            arguments,
            assumptions="ntgt_boxes>=1",
            name=self.name,
            # The pattern matches the "write_csr" id prefix given to the
            # accumulation statements above.
            silenced_warnings="write_race(write_csr*)",
            fixed_parameters=dict(dim=self.dim,
                                  nstrengths=self.strength_count,
                                  nkernels=len(self.kernels)),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.add_dtypes(loopy_knl,
                                  dict(nsources=np.int32, ntargets=np.int32))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C")
        loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C")

        # Let every kernel in self.kernels apply its own preparation step.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
コード例 #30
0
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel loops over every target ``itgt`` and every source
        ``isrc``.  Inside the inner loop it runs the instructions from
        ``get_loopy_insns_and_result_names``, computes
        ``d[idim] = targets[idim,itgt] - sources[idim,isrc]`` and stores one
        ``pair_result_<i>`` per result name.  For each result index ``k``,
        ``result[k, itgt]`` is then assigned ``knl_k_scaling`` times the sum
        over ``isrc`` of ``pair_result_k``.

        When ``self.exclude_self`` is true, every pair expression is wrapped
        so that it evaluates to 0 unless ``isrc != itgt``.

        :returns: the kernel, after ``dim``, ``nstrengths`` and ``nresults``
            have been fixed, ``idim*`` inames tagged ``unr``, every entry of
            ``self.kernels`` has applied ``prepare_loopy_kernel`` and the
            ``strength`` axes have been tagged ``"sep,C"``.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        from pymbolic import var
        # One expression per result name:
        # <name> * strength[self.strength_usage[i], isrc].
        exprs = [
                var(name)
                * var("strength").index((self.strength_usage[i], var("isrc")))
                for i, name in enumerate(result_names)]

        if self.exclude_self:
            from pymbolic.primitives import If, ComparisonOperator, Variable
            # Keep the contribution only where the source index differs from
            # the target index; otherwise use 0.
            exprs = [
                    If(
                        ComparisonOperator(Variable("isrc"), "!=", Variable("itgt")),
                        expr, 0)
                    for expr in exprs]

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
                "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
                        and 0<=idim<dim}",
                # Instructions: scaling assignments, then the nested loop
                # text stitched together from string fragments and
                # instruction objects.
                self.get_kernel_scaling_assignments()
                + ["""
                for itgt
                    for isrc
                        """] + loopy_insns + ["""
                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc] \
                        """]+[
                        # pair_result_<i> holds the i-th expression for the
                        # current (itgt, isrc) pair.
                        lp.Assignment(id=None,
                            assignee="pair_result_%d" % i, expression=expr,
                            temp_var_type=lp.auto)
                        for i, expr in enumerate(exprs)
                        ] + ["""
                    end
                    """] + ["""
                    result[KNLIDX, itgt] = knl_KNLIDX_scaling \
                            * simul_reduce(sum, isrc, pair_result_KNLIDX)
                    """.replace("KNLIDX", str(iknl))
                    # One store statement per expression; the KNLIDX
                    # placeholder becomes the expression index.
                    for iknl in range(len(exprs))] + [
                    ] + ["""
                end
                """],
                [
                    lp.GlobalArg("sources", None,
                        shape=(self.dim, "nsources")),
                    lp.GlobalArg("targets", None,
                        shape=(self.dim, "ntargets")),
                    lp.ValueArg("nsources", None),
                    lp.ValueArg("ntargets", None),
                    lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                    lp.GlobalArg("result", None,
                        shape="nresults,ntargets", dim_tags="sep,C")
                ] + gather_loopy_source_arguments(self.kernels),
                name=self.name,
                assumptions="nsources>=1 and ntargets>=1")

        # Replace the symbolic sizes by their concrete values.
        loopy_knl = lp.fix_parameters(
                loopy_knl,
                dim=self.dim,
                nstrengths=self.strength_count,
                nresults=len(self.kernels))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

        # Let every kernel in self.kernels apply its own preparation step.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

        return loopy_knl
コード例 #31
0
ファイル: vector-types.py プロジェクト: inducer/loopy
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

# Example script: double a float32 array with a loopy kernel whose inner
# iname is tagged "vec" and whose arrays carry a trailing "vec" axis.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Input: 0, 1, ..., n-1 as float32 on the device.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]")
knl = lp.set_options(knl, write_code=True)

# The same count is used for the iname split, the array-axis split and the
# reshape of the input below.
vec_width = 4
knl = lp.split_iname(
        knl, "i", vec_width, slabs=(0, 1), inner_tag="vec")
knl = lp.split_array_axis(
        knl, "a,out", axis_nr=0, count=vec_width)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

# Hand the input over as rows of ``vec_width`` entries.
a_rows = a.reshape(-1, vec_width)
knl(queue, a=a_rows, n=n)
コード例 #32
0
ファイル: p2p.py プロジェクト: jdoherty7/sumpy
    def get_kernel(self):
        """Build and return the loopy kernel for this object.

        The kernel loops over every target ``itgt`` and every source
        ``isrc``.  Inside the inner loop it runs the instructions from
        ``get_loopy_insns_and_result_names``, computes
        ``d[idim] = targets[idim,itgt] - sources[idim,isrc]`` and stores one
        ``pair_result_<i>`` per result name.  For each result index ``k``,
        ``result[k, itgt]`` is then assigned ``knl_k_scaling`` times the sum
        over ``isrc`` of ``pair_result_k``.

        When ``self.exclude_self`` is true, a temporary
        ``is_self = (isrc == target_to_source[itgt])`` is emitted, every pair
        expression evaluates to 0 where ``is_self`` holds, and an int32
        ``target_to_source`` argument of shape ``(ntargets,)`` is declared.

        :returns: the kernel, after ``idim*`` inames have been tagged
            ``unr``, every entry of ``self.kernels`` has applied
            ``prepare_loopy_kernel`` and the ``strength`` axes have been
            tagged ``"sep,C"``.
        """
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        from pymbolic import var
        # One expression per result name:
        # <name> * strength[self.strength_usage[i], isrc].
        exprs = [
            var(name) * var("strength").index(
                (self.strength_usage[i], var("isrc")))
            for i, name in enumerate(result_names)
        ]

        if self.exclude_self:
            from pymbolic.primitives import If, Variable
            # Contribute 0 where is_self is true, the expression otherwise.
            exprs = [If(Variable("is_self"), 0, expr) for expr in exprs]

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
            "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
                        and 0<=idim<dim}",
            # Instructions: scaling assignments, then the nested loop text
            # stitched together from string fragments and instruction
            # objects.
            self.get_kernel_scaling_assignments() + [
                """
                for itgt
                    for isrc
                        """
            ] + loopy_insns + [
                """
                        <> d[idim] = targets[idim,itgt] - sources[idim,isrc]
                        """
            ] + [
                """
                        <> is_self = (isrc == target_to_source[itgt])
                        """ if self.exclude_self else ""
            ] + [
                # pair_result_<i> holds the i-th expression for the current
                # (itgt, isrc) pair.
                lp.Assignment(id=None,
                              assignee="pair_result_%d" % i,
                              expression=expr,
                              temp_var_type=lp.auto)
                for i, expr in enumerate(exprs)
            ] + ["""
                    end
                    """] + [
                # One store statement per expression; the KNLIDX placeholder
                # becomes the expression index.
                """
                    result[KNLIDX, itgt] = knl_KNLIDX_scaling \
                            * simul_reduce(sum, isrc, pair_result_KNLIDX)
                    """.replace("KNLIDX", str(iknl))
                for iknl in range(len(exprs))
            ] + [] + ["""
                end
                """],
            [
                lp.GlobalArg("sources", None, shape=(self.dim, "nsources")),
                lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")),
                lp.ValueArg("nsources", None),
                lp.ValueArg("ntargets", None),
                lp.GlobalArg("strength", None, shape="nstrengths,nsources"),
                lp.GlobalArg("result",
                             None,
                             shape="nresults,ntargets",
                             dim_tags="sep,C")
            # target_to_source is only declared when it is referenced by the
            # is_self instruction above.
            ] + ([
                lp.GlobalArg(
                    "target_to_source", np.int32, shape=("ntargets", ))
            ] if self.exclude_self else []) +
            gather_loopy_source_arguments(self.kernels),
            name=self.name,
            assumptions="nsources>=1 and ntargets>=1",
            fixed_parameters=dict(dim=self.dim,
                                  nstrengths=self.strength_count,
                                  nresults=len(self.kernels)))

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

        # Let every kernel in self.kernels apply its own preparation step.
        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C")

        return loopy_knl
コード例 #33
0
    def get_kernel(self):
        """Assemble and return the loopy kernel for ``self.expansions``.

        Steps, in order:

        * create symbolic vectors ``a`` and ``b`` of length ``self.dim``;
        * call ``expand`` once per expansion, collecting the assignments in a
          ``SymbolicAssignmentCollection`` and remembering the result names;
        * run global CSE on the collection and convert its assignments to
          loopy instructions (result names are retained);
        * build a kernel over ``itgt``/``isrc``/``idim`` with ``dim`` fixed to
          ``self.dim`` that stores ``pair_result_<i>`` for each result and
          then appends the instructions from
          ``get_result_store_instructions``;
        * tag ``idim*`` inames ``unr``, let every expansion call
          ``prepare_loopy_kernel`` and tag the ``center`` axes ``"sep,C"``.

        :returns: the kernel produced by the steps above.
        """
        from sumpy.symbolic import make_sym_vector

        a_sym = make_sym_vector("a", self.dim)
        b_sym = make_sym_vector("b", self.dim)

        from sumpy.assignment_collection import SymbolicAssignmentCollection
        assignment_store = SymbolicAssignmentCollection()

        logger.info("compute expansion expressions: start")

        result_names = []
        for index, expansion in enumerate(self.expansions):
            result_names.append(
                expand(index, assignment_store, expansion, a_sym, b_sym))

        logger.info("compute expansion expressions: done")

        assignment_store.run_global_cse()

        from sumpy.codegen import to_loopy_insns
        # Create the item iterator before the code transformers so the
        # original evaluation order is kept.
        assignment_items = six.iteritems(assignment_store.assignments)
        expr_maps = [
            expansion.kernel.get_code_transformer()
            for expansion in self.expansions
        ]
        loopy_insns = to_loopy_insns(
            assignment_items,
            vector_names={"a", "b"},
            pymbolic_expr_maps=expr_maps,
            retain_names=result_names,
            complex_dtype=np.complex128  # FIXME
        )

        # Each result variable times whatever get_strength_or_not returns
        # for the source index.
        source_index = var("isrc")
        scaled_results = [
            var(result_name) * self.get_strength_or_not(source_index, index)
            for index, result_name in enumerate(result_names)
        ]

        from sumpy.tools import gather_loopy_source_arguments
        kernel_args = (self.get_src_tgt_arguments() +
                       self.get_input_and_output_arguments() +
                       gather_loopy_source_arguments(self.kernels))

        loop_domain = (
            "{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
            "and 0<=idim<dim}")

        # Pieces of the instruction list, gathered in the order in which
        # they appear in the kernel.
        scaling_part = self.get_kernel_scaling_assignments()
        vecs_part = self.get_compute_a_and_b_vecs()
        # zip() keeps the pairing with self.value_dtypes; the dtype itself
        # is not used here.
        pair_part = [
            lp.Assignment(id=None,
                          assignee="pair_result_%d" % index,
                          expression=scaled,
                          temp_var_type=lp.auto)
            for index, (scaled, _) in enumerate(
                zip(scaled_results, self.value_dtypes))
        ]
        store_part = self.get_result_store_instructions()

        kernel_body = (scaling_part + ["for itgt, isrc"] + [vecs_part] +
                       loopy_insns + pair_part + ["end"] + store_part)

        kernel = lp.make_kernel(
            loop_domain,
            kernel_body,
            kernel_args,
            name=self.name,
            assumptions="nsources>=1 and ntargets>=1",
            default_offset=lp.auto,
            silenced_warnings="write_race(write_lpot*)",
            fixed_parameters=dict(dim=self.dim))

        kernel = lp.tag_inames(kernel, "idim*:unr")

        for expansion in self.expansions:
            kernel = expansion.prepare_loopy_kernel(kernel)

        return lp.tag_array_axes(kernel, "center", "sep,C")