Python Block.extend Examples

Programming Language: Python

Namespace/Package Name: cgen

Class/Type: Block

Method/Function: extend

Examples at hotexamples.com: 19

Python Block.extend - 19 examples found. These are the top-rated real-world Python examples of cgen.Block.extend extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

Block(30)

extend(8)

append(4)

extend_log_block(3)

Example #1

Show file

File: __init__.py Project: rckirby/loopy

    def generate_body(self, kernel, codegen_state):
        """Assemble the cgen ``Block`` that forms the generated function body.

        The block consists of the declarators of all temporary variables of
        *kernel*, a blank line, and then the code returned by
        ``set_up_hw_parallel_loops``.

        :returns: a tuple ``(body, implemented_domains)`` where
            *implemented_domains* is taken from the generated loop code.
        """
        from cgen import Block, Line
        from loopy.codegen.loop import set_up_hw_parallel_loops

        body = Block()

        # {{{ declare temporaries

        declarators = []
        for temp_var in six.itervalues(kernel.temporary_variables):
            decl_infos = temp_var.decl_info(
                kernel.target, is_written=True, index_dtype=kernel.index_dtype)
            for decl in decl_infos:
                declarators.append(decl.cgen_declarator)

        body.extend(declarators)

        # }}}

        generated = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        # blank separator between the declarations and the loop code
        body.append(Line())

        inner_ast = generated.ast
        if not isinstance(inner_ast, Block):
            body.append(inner_ast)
        else:
            # splice the statements in directly instead of nesting a Block
            body.extend(inner_ast.contents)

        return body, generated.implemented_domains

Example #2

Show file

File: elementwise.py Project: inducer/codepy

def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    """Build a ``BoostPythonModule`` describing an element-wise kernel.

    The generated function is called *name*, takes ``codepy_length`` and an
    ``arg_struct`` named ``args``, and runs the C statement *operation* once
    for every ``i`` in ``[0, codepy_length)``.

    :arg arguments: iterable of argument descriptors; ``VectorArg`` entries
        get a ``numpy_array`` begin-iterator, ``ScalarArg`` entries get a
        plain local initialized from ``args``.
    :arg operation: text of the statement placed inside the loop.
    :arg name: name of the generated function.
    :returns: the populated ``BoostPythonModule``.
    """
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    module = BoostPythonModule()
    module.add_to_preamble([Include("pyublas/numpy.hpp")])
    module.add_to_module([
        Statement("namespace ublas = boost::numeric::ublas"),
        Statement("using namespace pyublas"),
        Line(),
        ])

    # locals for vector arguments: an iterator positioned at the array start
    vector_inits = []
    for varg in arguments:
        if isinstance(varg, VectorArg):
            iterator_type = ("numpy_array<%s >::iterator"
                    % dtype_to_ctype(varg.dtype))
            vector_inits.append(
                Initializer(
                    Value(iterator_type, varg.name),
                    "args.%s_ary.begin()" % varg.name))

    # locals for scalar arguments: copied straight out of the struct
    scalar_inits = []
    for sarg in arguments:
        if isinstance(sarg, ScalarArg):
            scalar_inits.append(
                Initializer(sarg.declarator(), "args." + sarg.name))

    body = Block(vector_inits + scalar_inits)

    loop_body = Block([Statement(operation)])
    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i", loop_body),
        ])

    struct_fields = [arg.declarator() for arg in arguments]
    module.add_struct(Struct("arg_struct", struct_fields), "ArgStruct")
    module.add_to_module([Line()])

    signature = FunctionDeclaration(
        Value("void", name),
        [POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args")])
    module.add_function(FunctionBody(signature, body))

    return module

Example #3

Show file

File: elementwise.py Project: inducer/codepy

def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    """Return a ``BoostPythonModule`` wrapping an element-wise loop.

    The module contains an ``arg_struct`` with one member per entry of
    *arguments* and a ``void`` function called *name* that executes
    *operation* for ``i`` from 0 up to (excluding) ``codepy_length``.

    :arg arguments: iterable of argument descriptors (``VectorArg`` and
        ``ScalarArg`` instances are unpacked into locals).
    :arg operation: text of the C statement forming the loop body.
    :arg name: name given to the generated function.
    """
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    def iterator_init(varg):
        # local iterator pointing at the first element of a vector argument
        iterator_type = "numpy_array<{} >::iterator".format(
            dtype_to_ctype(varg.dtype))
        return Initializer(
            Value(iterator_type, varg.name),
            f"args.{varg.name}_ary.begin()")

    def scalar_init(sarg):
        # local copy of a scalar argument
        return Initializer(sarg.declarator(), f"args.{sarg.name}")

    descriptor = BoostPythonModule()
    descriptor.add_to_preamble([Include("pyublas/numpy.hpp")])
    descriptor.add_to_module([
        Statement("namespace ublas = boost::numeric::ublas"),
        Statement("using namespace pyublas"),
        Line(),
    ])

    setup = [iterator_init(arg) for arg in arguments
             if isinstance(arg, VectorArg)]
    setup.extend(scalar_init(arg) for arg in arguments
                 if isinstance(arg, ScalarArg))
    body = Block(setup)

    loop = For(
        "unsigned i = 0",
        "i < codepy_length",
        "++i",
        Block([Statement(operation)]),
    )
    body.extend([Line(), loop])

    members = [arg.declarator() for arg in arguments]
    descriptor.add_struct(Struct("arg_struct", members), "ArgStruct")
    descriptor.add_to_module([Line()])

    parameters = [
        POD(numpy.uintp, "codepy_length"),
        Value("arg_struct", "args"),
    ]
    declaration = FunctionDeclaration(Value("void", name), parameters)
    descriptor.add_function(FunctionBody(declaration, body))

    return descriptor

Example #4

Show file

        def get_flux_code(flux_writer):
            """Return a Block handling one face pair.

            The block declares ``fpair``, ``a_index`` and ``b_index``,
            embeds whatever ``flux_writer()`` returns between two blank
            lines, and finally advances ``fpair_nr`` by ``CONCURRENT_FACES``.
            """
            flux_code = Block([])

            fpair_decl = Initializer(
                Pointer(Value("face_pair", "fpair")),
                "data.facepairs+fpair_nr")
            a_index_decl = Initializer(
                MaybeUnused(POD(numpy.uint32, "a_index")),
                "fpair->a_base + tex1Dfetch(tex_index_lists, "
                "fpair->a_ilist_index + FACEDOF_NR)")
            b_index_decl = Initializer(
                MaybeUnused(POD(numpy.uint32, "b_index")),
                "fpair->b_base + tex1Dfetch(tex_index_lists, "
                "fpair->b_ilist_index + FACEDOF_NR)")
            written_flux = flux_writer()
            advance = S("fpair_nr += CONCURRENT_FACES")

            flux_code.extend([
                fpair_decl,
                a_index_decl,
                b_index_decl,
                Line(),
                written_flux,
                Line(),
                advance,
            ])

            return flux_code

Example #5

Show file

File: fluxgather.py Project: felipeh/hedge

        def get_flux_code(flux_writer):
            """Build the code Block that processes a single face pair.

            Declarations for ``fpair``, ``a_index`` and ``b_index`` come
            first, followed by the result of calling *flux_writer* (set off
            by blank lines) and the ``fpair_nr`` increment.
            """
            flux_code = Block([])

            statements = []

            # pointer to the current face pair
            statements.append(
                Initializer(
                    Pointer(Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"))

            # index declarations for the two sides of the pair
            statements.append(
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"))
            statements.append(
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"))

            statements.append(Line())
            statements.append(flux_writer())
            statements.append(Line())
            statements.append(S("fpair_nr += CONCURRENT_FACES"))

            flux_code.extend(statements)

            return flux_code

Example #6

Show file

File: el_local_shared_segmat.py Project: allansnielsen/hedge

    def get_kernel(self, with_scaling, for_benchmark=False):
        """Generate and prepare the ``apply_el_local_mat_smem_mat`` kernel.

        A cgen ``Module`` is assembled from texture declarations,
        preprocessor defines, ``CudaShared``/``CudaConstant`` declarations
        and the kernel function body, handed to ``SourceModule``, and the
        resulting function is prepared with its block shape and textures.

        :arg with_scaling: if true, a ``scaling_tex`` texture is declared and
            every result is multiplied by a value fetched from it before it
            is written to ``out_vector``; otherwise the multiplier is ``1``.
        :arg for_benchmark: if true, the generated source is never written
            to a debug file, even when ``"cuda_dump_kernels"`` is in
            ``discr.debug``.
        :returns: a tuple ``(func, in_vector_texref, scaling_texref)``;
            *scaling_texref* is *None* when *with_scaling* is false.
        """
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        # NOTE(review): d and dims are not referenced again in this method.
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        # kernel signature: three pointers followed by one uint32
        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        # module preamble: helper header and the input-vector texture
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            # the scaling texture is only declared when it will be read
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        # preprocessor defines, shared buffers and constant lookup tables
        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Define("MB_EL_COUNT", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                # GLOBAL_MB_NR refers to seq_mb_number, the loop variable
                # declared by mat_mul_outer_loop below
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
                Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
                Define("MATRIX_SEGMENT_BYTES",
                    "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),

                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                    "MATRIX_SEGMENT_FLOATS")),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "dof_buffer"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "DOFS_PER_SEGMENT"),
                    ),
                CudaShared(POD(numpy.uint16, "segment_start_el")),
                CudaShared(POD(numpy.uint16, "segment_stop_el")),
                CudaShared(POD(numpy.uint16, "segment_el_count")),
                Line(),
                # per-segment table: element index of the segment's first DOF
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [(chk*self.plan.segment_size)//given.dofs_per_el()
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                # per-segment table: one past the element index of the
                # segment's last DOF, capped at the microblock element count
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [min(given.microblock.elements,
                            (chk*self.plan.segment_size+self.plan.segment_size-1)
                                //given.dofs_per_el()+1)
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            # only the thread with THREAD_NUM==0 fills in the segment_*
            # variables; the __syncthreads() follows that write
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        # get_load_code's result is concatenated with a list, so it is
        # expected to return a list of statements
        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            """Return statements that stage input DOFs in ``dof_buffer``.

            For every ``segment_size``-wide slice of the preimage DOFs, one
            texture fetch per inline microblock is emitted between two
            ``__syncthreads()`` calls, followed by the multiply-accumulate
            statements for the DOFs of that slice.

            NOTE(review): *el_fetch_count* is not used in the body.
            """
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                        for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                        Line(),
                        ])

                # accumulate using the values just placed in dof_buffer; the
                # buffer index is the DOF's offset within the current slice
                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll
        def get_direct_tex_mat_mul_code():
            """Return statements that fetch every input DOF from the texture.

            Declares ``fof<inl>`` and ``lm``; the callable passed to
            ``unroll`` produces, for a column ``j``, one fetch per inline
            microblock, a read of the matrix entry into ``lm`` and one
            accumulation per inline microblock.
            """
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                        # the "row" key is not referenced by this format string
                        Assign("fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                            % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                        for inl in range(par.inline)
                        ]+[
                        Assign("lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]"
                            % {"j":j, "row": "SEGMENT_DOF"},
                            )
                        ]+[
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                        for inl in range(par.inline)
                        ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [Line()])

        def get_mat_mul_code(el_fetch_count):
            """Pick the batched variant for a count of 1, else the direct one."""
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            """Return the ``For`` loop over ``seq_mb_number``.

            Each iteration zero-initializes ``result<inl>``, runs the
            multiplication code selected by *fetch_count* and writes the
            scaled results to ``out_vector`` under ``write_condition``.
            """
            if with_scaling:
                # template; %(inl)d is filled in per inline microblock below
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                # contains no placeholder, so the %-formatting below leaves
                # it unchanged
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[Line()]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

        if self.plan.use_prefetch_branch:
            # one branch per possible segment_el_count value, each with the
            # loop generated for that count
            from cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            # a count of 0 makes get_mat_mul_code choose the direct variant
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            # Python 2 print statement
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        # "PPPI" has one character per kernel argument in f_decl (three
        # pointers, one uint32); presumably struct-style type codes -- confirm
        # against the pycuda version in use. The block shape matches
        # SEGMENT_DOF=threadIdx.x and PAR_MB_NR=threadIdx.y.
        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref

Example #7

Show file

File: diff_shared_fld.py Project: gimac/hedge

    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        """Generate and prepare the ``apply_diff_mat_smem`` kernel.

        A cgen ``Module`` is assembled (texture declaration, preprocessor
        defines, a ``CudaShared`` field buffer and the kernel body), handed
        to ``SourceModule``, and the differentiation matrices returned by
        ``self.gpu_diffmats`` are bound to the ``diff_rst_mat_tex`` texture.

        :arg diff_op: passed through to ``self.gpu_diffmats``.
        :arg elgroup: NOTE(review): overwritten below by the single entry of
            ``discr.element_groups``, so the value passed in is ignored.
        :arg for_benchmark: if true, the generated source is never written
            to a debug file.
        :returns: a tuple ``(block, func)`` of the thread-block shape and the
            prepared kernel function.
        :raises ValueError: if the plan's float type is neither
            ``numpy.float64`` nor ``numpy.float32``.
        """
        from cgen import (
            Pointer,
            POD,
            Value,
            ArrayOf,
            Const,
            Module,
            FunctionDeclaration,
            FunctionBody,
            Block,
            Comment,
            Line,
            Define,
            Include,
            Initializer,
            If,
            For,
            Statement,
            Assign,
        )

        # NOTE(review): dtype_to_ctype is not used anywhere in this method.
        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        # single-target unpacking: fails unless there is exactly one group
        elgroup, = discr.element_groups
        float_type = given.float_type

        # kernel signature: debugbuf, field, then one output pointer per axis
        f_decl = CudaGlobal(
            FunctionDeclaration(
                Value("void", "apply_diff_mat_smem"),
                [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field"))]
                + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims],
            )
        )

        par = plan.parallelism

        cmod = Module([Include("pycuda-helpers.hpp")])

        # the texture element type depends on the float type; rst_channels
        # is only defined on the float32 path and only read on that path
        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        # preprocessor defines and the shared field buffer
        cmod.extend(
            [
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                # GLOBAL_MB_NR refers to seq_mb_number, the loop variable of
                # the For loop added to f_body below
                Define(
                    "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"
                ),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(ArrayOf(POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB",
                    )
                ),
                Line(),
            ]
        )

        S = Statement
        f_body = Block([Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line()])

        # ---------------------------------------------------------------------
        def get_load_code():
            """Return a Block copying ``field`` values into ``smem_field``.

            All loads into ``tmp<n>`` temporaries are emitted first, then a
            blank line, then all stores. Loads and stores of a load block
            that reaches or passes the aligned preimage size are wrapped in
            a bounds check.
            """
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            # ceiling division: number of image-sized blocks needed to cover
            # the preimage DOFs
            preimg_dofs_over_dofs = (mb_preimg_dofs + mb_img_dofs - 1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(
                        var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr),
                    )
                    store_instr = Assign("smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var)
                    # this block may extend past the preimage DOFs, so guard
                    # both the load and the store
                    if (load_block + 1) * mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            """Return the statement list forming one ``seq_mb_number`` pass.

            The list zero-initializes the ``d<inl>rst<axis>`` accumulators,
            loads the field via ``get_load_code`` between two
            ``__syncthreads()`` calls, and ends with an ``If`` holding the
            unrolled accumulation followed by the stores to the
            ``drst<axis>_global`` arrays.
            """
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            # component names used to pick an axis out of dmat_entries
            tex_channels = ["x", "y", "z", "w"]

            # built here, but appended only at the end of the guarded block
            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                            "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        )
                    )

            from hedge.backends.cuda.tools import unroll

            code.extend(
                [
                    Comment("everybody needs to be done with the old data"),
                    S("__syncthreads()"),
                    Line(),
                    get_load_code(),
                    Line(),
                    Comment("all the new data must be loaded"),
                    S("__syncthreads()"),
                    Line(),
                ]
            )

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([POD(float_type, "field_value%d" % inl) for inl in range(par.inline)] + [Line()])

            def unroll_body(j):
                """Return the statements for preimage DOF *j*.

                Reads one field value per inline microblock from
                ``smem_field`` and adds its contribution to every
                ``d<inl>rst<axis>`` accumulator.
                """
                result = [
                    Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                ]

                if float_type == numpy.float32:
                    # one texture fetch yields the entries for all axes
                    result.append(
                        Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)
                    )
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims
                    )
                elif float_type == numpy.float64:
                    # one texture fetch per (inl, axis) pair
                    # NOTE(review): j is formatted with %d here but with %s
                    # above -- confirm unroll always passes an integer.
                    result.extend(
                        S(
                            "d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {"inl": inl, "axis": axis, "j": j}
                        )
                        for inl in range(par.inline)
                        for axis in dims
                    )
                else:
                    assert False

                return result

            code.append(
                If(
                    "IMAGE_MB_DOF < IMAGE_DOFS_PER_MB",
                    Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) + [store_code]),
                )
            )

            return code

        f_body.extend(
            [
                For(
                    "unsigned short seq_mb_number = 0",
                    "seq_mb_number < SEQ_MB_COUNT",
                    "++seq_mb_number",
                    Block(get_scalar_diff_code()),
                )
            ]
        )

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file

            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            # options=["--maxrregcount=16"]
        )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            # Python 2 print statement
            print "diff: lmem=%d smem=%d regs=%d" % (func.local_size_bytes, func.shared_size_bytes, func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        # binding mirrors the texture declaration chosen above
        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True)
        else:
            assert False

        # block shape follows the CHUNK_DOF (x), PAR_MB_NR (y) and
        # CHUNK_NR (z) defines
        assert given.microblock.aligned_floats % chunk_size == 0
        block = (chunk_size, plan.parallelism.parallel, given.microblock.aligned_floats // chunk_size)

        # argument spec: "PP" for debugbuf and field, then one "P" per
        # drst<i>_global pointer in f_decl
        func.prepare(["PP"] + discr.dimensions * ["P"], texrefs=[diff_rst_mat_texref])

        return block, func

Example #8

Show file

File: fluxgather.py Project: felipeh/hedge

    def get_kernel(self, fdata, ilist_data, for_benchmark):
        """Generate, compile and prepare the CUDA ``apply_flux`` kernel.

        The kernel source is assembled as a cgen ``Module``, compiled through
        ``SourceModule``, its textures are looked up and the resulting
        function is prepared for launching.

        :arg fdata: supplies ``block_bytes`` (emitted as ``DATA_BLOCK_SIZE``)
            and ``struct`` (emitted into the module as-is).
        :arg ilist_data: supplies ``code`` (a list of module-level code
            items) as well as ``device_memory``, ``bytes`` and ``type``,
            which are used to bind the ``tex_index_lists`` texture.
        :arg for_benchmark: forwarded to :meth:`write_boundary_flux_code`;
            when true, the kernel source is never dumped to a debug file.
        :returns: a tuple ``(block, func, expr_to_texture_map)`` of the
            thread block shape, the prepared kernel function, and a dict
            mapping each entry of ``self.all_deps`` to its texture reference.
        """
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan

        # The unpacking doubles as a check: it fails unless the
        # discretization has exactly one element group.
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        # one texture declaration per field the fluxes depend on
        for dep_expr in self.all_deps:
            cmod.extend([
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])

        # Without direct store, fluxes are staged in a shared array and
        # copied out to the gmem_fluxes_on_faces* arrays at the end.
        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")
                    ),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line() ])

        def get_flux_code(flux_writer):
            # Body of one face-pair loop: look up the face pair and its
            # two DOF indices, emit the flux code produced by
            # *flux_writer*, then advance to the next face pair.
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))
                ),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))
                ),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))
                ),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])

        if not fplan.direct_store:
            f_body.extend([
                Line(),
                S("__syncthreads()"),
                Line()
                ])

            f_body.extend_log_block("store fluxes", [
                    For("unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([Assign(
                            "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                            "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))]
                        )
                    )
                    ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            dump_file = open_unique_debug_file("flux_gather", ".cu")
            # close explicitly so the dump is flushed and the handle released
            try:
                dump_file.write(str(cmod))
            finally:
                dump_file.close()

        mod = SourceModule(
                cmod,
                keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
                (2+len(self.fluxes))*"P",
                # list(): on Python 3, dict.values() is a view that cannot
                # be concatenated with a list
                texrefs=list(expr_to_texture_map.values())
                + [index_list_texref])

        if "cuda_flux" in discr.debug:
            # single parenthesized argument: valid on Python 2 and 3 alike
            print("flux: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs))

        return block, func, expr_to_texture_map

Example #9

Show file

File: fluxgather.py Project: felipeh/hedge

    def write_interior_flux_code(self, is_twosided):
        """Return a ``Block`` of code computing interior-face fluxes.

        The block fetches both sides ("a" and "b") of every field in
        ``self.interior_deps``, declares the common subexpressions collected
        while translating the fluxes, and then holds one sub-block per entry
        of ``self.fluxes`` that accumulates and stores that flux.

        :arg is_twosided: if true, a flipped "b" flux is accumulated and
            stored in addition to the "a" flux.
        """
        float_type = self.plan.given.float_type

        # (accumulator prefix, is_flipped) for each side that gets a flux
        accumulator_decls = [Initializer(POD(float_type, "a_flux"), 0)]
        sides = [("a", False)]
        if is_twosided:
            accumulator_decls.append(Initializer(POD(float_type, "b_flux"), 0))
            sides.append(("b", True))

        # field value fetches, always for both sides of the face
        prologue = [Line()]
        for dep in self.interior_deps:
            prologue.append(Comment(str(dep)))
            field_nr = self.dep_to_index[dep]
            prologue.extend(
                    Initializer(
                        MaybeUnused(POD(float_type, "val_%s_field%d"
                            % (side, field_nr))),
                        "fp_tex1Dfetch(field%d_tex, %s_index)"
                        % (field_nr, side))
                    for side in ("a", "b"))

        mapper = FluxToCodeMapper(float_type)

        # The per-flux blocks have to be generated before the CSE
        # declarations below: the mapper's cse_name_list is read only
        # after all flux_to_code() calls have gone through it.
        per_flux_blocks = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            flux_block = Block(accumulator_decls)

            flux_block.extend(
                    Statement("%s_flux += %s"
                        % (prefix,
                            flux_to_code(mapper, is_flipped,
                                int_rec.field_expr,
                                int_rec.field_expr,
                                self.dep_to_index,
                                int_rec.flux_expr, PREC_NONE),
                            ))
                    for int_rec in wdflux.interiors
                    for prefix, is_flipped in sides)

            flux_block.append(Line())
            flux_block.append(
                    self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                        "fpair->face_jacobian*a_flux"))

            if is_twosided:
                flux_block.append(
                        self.gen_store(flux_nr,
                            "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                            "fpair->b_write_ilist_index + FACEDOF_NR)",
                            "fpair->face_jacobian*b_flux"))

            per_flux_blocks.append(flux_block)

        flux_write_code = Block(prologue)

        if mapper.cse_name_list:
            flux_write_code.append(Line())

        flux_write_code.extend(
                Initializer(Value("value_type", cse_name), cse_str)
                for cse_name, cse_str in mapper.cse_name_list)
        flux_write_code.extend(per_flux_blocks)

        return flux_write_code

Example #10

Show file

File: fluxgather.py Project: felipeh/hedge

    def write_boundary_flux_code(self, for_benchmark):
        """Return a ``Block`` of code computing boundary-face fluxes.

        One ``flux<N>`` accumulator is declared per entry of ``self.fluxes``.
        Boundary fluxes are grouped by boundary number; each group becomes an
        ``If`` on the matching bit of ``fpair->boundary_bitmap`` that fetches
        the needed field values and adds to the accumulators. Finally every
        accumulator is stored through ``self.gen_store``.

        :arg for_benchmark: if true, every boundary flux is filed under
            boundary number 0 instead of being looked up in
            ``self.executor.boundary_tag_to_number``.
        """
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                            bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(
                MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                0)
            for flux_nr in range(len(self.fluxes))])

        # loop-invariant, so imported once rather than on every iteration
        from pytools import set_sum

        # .items() instead of .iteritems(): same pairs in the same order,
        # but available on both Python 2 and Python 3
        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.items():
            bblock = []

            int_deps = set_sum(flux_rec.int_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)

            # interior dependencies are fetched at a_index ...
            for dep in int_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(POD(given.float_type, "val_a_field%d"
                            % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%d_tex, a_index)" % self.dep_to_index[dep])
                    ])
            # ... exterior dependencies at b_index
            for dep in ext_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(POD(given.float_type, "val_b_field%d"
                            % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%s_tex, b_index)" % self.dep_to_index[dep])
                    ])

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                        Statement(("flux%d += " % flux_nr) +
                            flux_to_code(f2cm, is_flipped=False,
                                int_field_expr=flux_rec.bpair.field,
                                ext_field_expr=flux_rec.bpair.bfield,
                                dep_to_index=self.dep_to_index,
                                flux=flux_rec.flux_expr, prec=PREC_NONE)))

            # f2cm.cse_name_list is read only after all flux_to_code()
            # calls above; its declarations go ahead of comp_code.
            if f2cm.cse_name_list:
                bblock.append(Line())

            bblock.extend(
                    Initializer(
                        Value("value_type", cse_name), cse_str)
                    for cse_name, cse_str in f2cm.cse_name_list)

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                    Block(bblock+comp_code)),
                ])

        flux_write_code.extend([Line(),]
            +[
            self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                "fpair->face_jacobian * flux%d" % flux_nr)
            for flux_nr in range(len(self.fluxes))
            ]
            )

        return flux_write_code

Example #11

Show file

File: diff_shared_fld.py Project: yangzilongdmgy/hedge

    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        """Generate, compile and prepare the ``apply_diff_mat_smem`` kernel.

        The kernel source is assembled as a cgen ``Module``, compiled through
        ``SourceModule``, the differentiation matrices are bound to the
        ``diff_rst_mat_tex`` texture, and the function is prepared for
        launching.

        :arg diff_op: passed on to ``self.gpu_diffmats``.
        :arg elgroup: NOTE: the value passed in is discarded; it is replaced
            by the single element group of ``self.discr`` before use.
        :arg for_benchmark: when true, the kernel source is never dumped to
            a debug file.
        :returns: a tuple ``(block, func)`` of the thread block shape and
            the prepared kernel function.
        :raises ValueError: if the plan's float type is neither
            ``numpy.float32`` nor ``numpy.float64``.
        """
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        # The unpacking fails unless the discretization has exactly one
        # element group; it also overrides the *elgroup* argument.
        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ]
            + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        par = plan.parallelism

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                ])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",
                    "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            # rst_channels is only defined (and only used) on float32 paths
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"
                    % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),

                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_field"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        # ---------------------------------------------------------------------
        def get_load_code():
            # Emits the copy of "field" into "smem_field": all loads into
            # temporaries first, then all stores.
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            # ceiling division: load blocks needed to cover the preimage
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    # a load block that reaches or passes the end of the
                    # preimage gets a bounds guard
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            # Body of the seq_mb_number loop: zero the accumulators, reload
            # shared memory, accumulate the derivatives, write them out.
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                # code for preimage DOF number j of the current element
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    # one fetch yields the matrix entries for all axes,
                    # one per texture channel
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    # one fetch per axis
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            dump_file = open_unique_debug_file("diff", ".cu")
            # close explicitly so the dump is flushed and the handle released
            try:
                dump_file.write(str(cmod))
            finally:
                dump_file.close()

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=16"]
                )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            # num_regs belongs to the same attribute family as
            # local_size_bytes/shared_size_bytes and matches the flux
            # kernel's debug output; single parenthesized argument keeps
            # the print valid on Python 2 and 3.
            print("diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs))

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                given.microblock.aligned_floats//chunk_size)

        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                texrefs=[diff_rst_mat_texref])

        return block, func

Example #12

Show file

File: __init__.py Project: navjotk/loopy

    def generate_body(self, kernel, codegen_state):
        """Build the kernel body: temporary declarations plus generated code.

        Temporaries without ``base_storage`` are declared directly.
        Temporaries with ``base_storage`` become pointers into a shared
        ``np.int8`` array named after that storage; the array is sized by the
        largest ``nbytes`` among its users and aligned to their largest
        alignment requirement.

        :arg kernel: provides ``temporary_variables`` and ``index_dtype``.
        :arg codegen_state: passed on to ``set_up_hw_parallel_loops``.
        :returns: a tuple ``(body, implemented_domains)`` of the assembled
            ``cgen.Block`` and ``gen_code.implemented_domains``.
        """
        from cgen import Block
        body = Block()

        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_is_local = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
        from loopy.codegen import POD  # uses the correct complex type
        # loop-invariant imports, hoisted out of the loops below
        from loopy.kernel.array import VectorArrayDimTag
        from pytools import product, single_valued

        class ConstRestrictPointer(Pointer):
            def get_decl_pair(self):
                sub_tp, sub_decl = self.subdecl.get_decl_pair()
                return sub_tp, ("*const restrict %s" % sub_decl)

        # sorted by name so the declaration order is deterministic
        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    if idi.shape:
                        temp_var_decl = ArrayOf(temp_var_decl,
                                " * ".join(str(s) for s in idi.shape))

                    temp_decls.append(
                            self.wrap_temporary_decl(temp_var_decl, tv.is_local))

            else:
                # every temporary starts at offset 0 of its base storage
                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_is_local.setdefault(tv.base_storage, []).append(
                        tv.is_local)

                # alignment: item size times the length of each vector axis
                align_size = tv.dtype.itemsize

                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.is_local)

                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?

                    cast_decl = ConstRestrictPointer(cast_decl)
                    temp_var_decl = ConstRestrictPointer(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    offset += idi.dtype.itemsize * product(idi.shape)

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = POD(self, np.int8, bs_name)
            # wrap_temporary_decl receives a single is_local flag everywhere
            # else in this method. Passing the collected list here would
            # always be truthy, whatever the individual flags are.
            # single_valued() checks that all users of this storage agree
            # and returns their common flag.
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl,
                    single_valued(base_storage_to_is_local[bs_name]))
            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            body.append(bs_var_decl)

        # base storage arrays come first, then the declarations using them
        body.extend(temp_decls)

        # }}}

        from loopy.codegen.loop import set_up_hw_parallel_loops
        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        from cgen import Line
        body.append(Line())

        # splice a generated Block in flat rather than nesting it
        if isinstance(gen_code.ast, Block):
            body.extend(gen_code.ast.contents)
        else:
            body.append(gen_code.ast)

        return body, gen_code.implemented_domains

Example #13

Show file

    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map

Example #14

Show file

File: ispc.py Project: navjotk/loopy

    def generate_code(self, kernel, codegen_state, impl_arg_info):
        """Generate ISPC source text for *kernel*.

        Two functions are emitted: a task named
        ``"lp_ispc_inner_" + kernel.name`` that holds the kernel body, and an
        exported wrapper named ``kernel.name`` that asserts ``programCount``
        against the first local size and launches the task over the global
        grid sizes.

        :arg kernel: the kernel to generate code for; its target supplies
            the body via ``kernel.target.generate_body``.
        :arg codegen_state: passed through to ``generate_body`` and to the
            expression-to-code mapper.
        :arg impl_arg_info: iterable of objects providing ``cgen_declarator``
            and ``name``; they become the arguments of both functions.
        :returns: a tuple ``(source, implemented_domains)`` where *source* is
            the stringified module and *implemented_domains* is passed
            through unchanged from ``generate_body``.
        :raises LoopyError: if any local axis other than axis 0 has a length
            different from 1.
        """
        from cgen import (FunctionBody, FunctionDeclaration, Value, Module,
                Block, Line, Statement as S)
        from cgen.ispc import ISPCExport, ISPCTask

        knl_body, implemented_domains = kernel.target.generate_body(
                kernel, codegen_state)

        inner_name = "lp_ispc_inner_"+kernel.name
        arg_decls = [iai.cgen_declarator for iai in impl_arg_info]
        arg_names = [iai.name for iai in impl_arg_info]

        # {{{ occa compatibility hackery

        if self.occa_mode:
            from cgen import ArrayOf, Const
            from cgen.ispc import ISPCUniform

            # In occa mode four extra leading arguments are prepended to both
            # the declarations and the names forwarded by the wrapper.
            arg_decls = [
                    Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
                    Const(ISPCUniform(Value("int", "o1"))),
                    Const(ISPCUniform(Value("int", "o2"))),
                    Const(ISPCUniform(Value("int", "o3"))),
                    ] + arg_decls
            arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names

        # }}}

        knl_fbody = FunctionBody(
                ISPCTask(
                    FunctionDeclaration(
                        Value("void", inner_name),
                        arg_decls)),
                knl_body)

        # {{{ generate wrapper

        wrapper_body = Block()

        gsize, lsize = kernel.get_grid_sizes_as_exprs()

        # Only local axis 0 may have a length other than 1. Enumerate from 1
        # so that the reported number is the real (0-based) axis index rather
        # than an offset into the slice or the offending length itself.
        for axis, ls_i in enumerate(lsize[1:], start=1):
            if ls_i != 1:
                raise LoopyError("local axis %d (0-based) "
                        "has length > 1, which is unsupported "
                        "by ISPC" % axis)

        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
        ccm = self.get_expression_to_code_mapper(codegen_state)

        wrapper_body.extend([
                S("assert(programCount == %s)"
                    % ccm(lsize[0], PREC_COMPARISON)),
                S("launch[%s] %s(%s)"
                    % (
                        ", ".join(
                            ccm(gs_i, PREC_NONE)
                            for gs_i in gsize),
                        inner_name,
                        ", ".join(arg_names)
                        ))
                ])

        wrapper_fbody = FunctionBody(
                ISPCExport(
                    FunctionDeclaration(
                        Value("void", kernel.name),
                        arg_decls)),
                wrapper_body)

        # }}}

        mod = Module([
            knl_fbody,
            Line(),
            wrapper_fbody,
            ])

        return str(mod), implemented_domains

Example #15

Show file

    def write_boundary_flux_code(self, for_benchmark):
        """Assemble the code block that accumulates and stores boundary fluxes.

        One ``flux<N>`` accumulator is declared per entry of ``self.fluxes``.
        The boundary flux records are grouped by boundary number; each group
        becomes an ``If`` testing the matching bit of
        ``fpair->boundary_bitmap`` whose body fetches the required fields,
        declares common subexpressions and adds to the accumulators.
        Finally every accumulator is handed to ``self.gen_store``.

        :arg for_benchmark: if true, every boundary flux is filed under
            boundary number 0 instead of looking its tag up in
            ``self.executor.boundary_tag_to_number``; it is also what the
            caller passes when generating code for benchmarking.
        :returns: the assembled :class:`Block`.
        """
        float_type = self.plan.given.float_type
        flux_count = len(self.fluxes)

        # Collect (flux number, boundary flux record) pairs per boundary.
        grouped = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                bdry_number = (
                    0 if for_benchmark
                    else self.executor.boundary_tag_to_number[
                        bflux_info.bpair.tag])
                grouped.setdefault(bdry_number, []).append(
                    (flux_nr, bflux_info))

        def fetch_decls(deps, var_fmt, fetch_fmt):
            # One comment plus one texture-fetch initializer per dependency.
            decls = []
            for dep in deps:
                decls.append(Comment(str(dep)))
                decls.append(
                    Initializer(
                        MaybeUnused(
                            POD(float_type,
                                var_fmt % self.dep_to_index[dep])),
                        fetch_fmt % self.dep_to_index[dep]))
            return decls

        result = Block()
        result.extend([
            Initializer(MaybeUnused(POD(float_type, "flux%d" % nr)), 0)
            for nr in range(flux_count)
        ])

        for bdry_number, nrs_and_fluxes in grouped.iteritems():
            from pytools import set_sum
            int_deps = set_sum(rec.int_dependencies
                               for _, rec in nrs_and_fluxes)
            ext_deps = set_sum(rec.ext_dependencies
                               for _, rec in nrs_and_fluxes)

            guarded = fetch_decls(
                int_deps, "val_a_field%d",
                "fp_tex1Dfetch(field%d_tex, a_index)")
            guarded += fetch_decls(
                ext_deps, "val_b_field%d",
                "fp_tex1Dfetch(field%s_tex, b_index)")

            f2cm = FluxToCodeMapper(float_type)

            # The flux expressions must be translated before the mapper's
            # common-subexpression list is read below.
            updates = [Line()]
            updates += [
                Statement(
                    ("flux%d += " % flux_nr) +
                    flux_to_code(f2cm,
                                 is_flipped=False,
                                 int_field_expr=flux_rec.bpair.field,
                                 ext_field_expr=flux_rec.bpair.bfield,
                                 dep_to_index=self.dep_to_index,
                                 flux=flux_rec.flux_expr,
                                 prec=PREC_NONE))
                for flux_nr, flux_rec in nrs_and_fluxes
            ]

            if f2cm.cse_name_list:
                guarded.append(Line())

            for cse_name, cse_str in f2cm.cse_name_list:
                guarded.append(
                    Initializer(Value("value_type", cse_name), cse_str))

            first_record = nrs_and_fluxes[0][1]
            result.extend([
                Line(),
                Comment(first_record.bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                   Block(guarded + updates)),
            ])

        stores = [
            self.gen_store(nr, "fpair->a_dest+FACEDOF_NR",
                           "fpair->face_jacobian * flux%d" % nr)
            for nr in range(flux_count)
        ]
        result.extend([Line()] + stores)

        return result

Example #16

Show file

    def write_interior_flux_code(self, is_twosided):
        """Assemble the code block that computes and stores interior fluxes.

        The returned block first fetches the ``a`` and ``b`` side value of
        every dependency in ``self.interior_deps``, then declares the common
        subexpressions collected while translating the flux expressions, and
        finally contains one sub-block per entry of ``self.fluxes`` that
        accumulates the flux and passes it to ``self.gen_store``.

        :arg is_twosided: if true, a second accumulator ``b_flux`` is
            computed with ``is_flipped=True`` and stored as well; otherwise
            only ``a_flux`` is computed and stored.
        :returns: the assembled :class:`Block`.
        """
        given = self.plan.given

        # (accumulator prefix, is_flipped) for every side that gets a flux.
        sides = [("a", False)]
        if is_twosided:
            sides.append(("b", True))

        flux_var_decl = [
            Initializer(POD(given.float_type, "%s_flux" % prefix), 0)
            for prefix, _ in sides
        ]

        flux_write_code = Block([])
        flux_write_code.append(Line())

        # Both sides of every dependency are fetched regardless of
        # is_twosided; MaybeUnused covers values a flux does not reference.
        for dep in self.interior_deps:
            flux_write_code.append(Comment(str(dep)))

            for side in ["a", "b"]:
                flux_write_code.append(
                    Initializer(
                        MaybeUnused(
                            POD(
                                given.float_type, "val_%s_field%d" %
                                (side, self.dep_to_index[dep]))),
                        "fp_tex1Dfetch(field%d_tex, %s_index)" %
                        (self.dep_to_index[dep], side)))

        f2cm = FluxToCodeMapper(given.float_type)

        flux_sub_codes = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            # Hand every block its own copy of the declaration list so the
            # per-flux blocks never share a mutable list.
            my_flux_block = Block(list(flux_var_decl))

            for int_rec in wdflux.interiors:
                for prefix, is_flipped in sides:
                    my_flux_block.append(
                        Statement("%s_flux += %s" % (
                            prefix,
                            flux_to_code(f2cm, is_flipped, int_rec.field_expr,
                                         int_rec.field_expr, self.dep_to_index,
                                         int_rec.flux_expr, PREC_NONE),
                        )))

            my_flux_block.append(Line())

            my_flux_block.append(
                self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                               "fpair->face_jacobian*a_flux"))

            if is_twosided:
                my_flux_block.append(
                    self.gen_store(
                        flux_nr, "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                        "fpair->b_write_ilist_index + FACEDOF_NR)",
                        "fpair->face_jacobian*b_flux"))

            flux_sub_codes.append(my_flux_block)

        # The common-subexpression list is only complete once every flux has
        # been translated above, but its declarations are placed ahead of
        # the per-flux blocks that use them.
        if f2cm.cse_name_list:
            flux_write_code.append(Line())

        flux_write_code.extend(
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list)

        flux_write_code.extend(flux_sub_codes)

        return flux_write_code

Example #17

Show file

File: diff_shared_segmat.py Project: allansnielsen/hedge

    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        """Generate, compile and prepare the ``apply_diff_mat`` kernel.

        A cgen ``Module`` is assembled from a texture declaration, a set of
        preprocessor defines, a ``CudaShared`` array ``smem_diff_rst_mat``
        and the ``apply_diff_mat`` function, and is then handed to
        ``SourceModule``.

        :arg diff_op_cls: passed to ``self.gpu_diffmats`` together with
            *elgroup* to obtain the differentiation matrix data.
        :arg elgroup: element group passed to ``self.gpu_diffmats``.
        :arg for_benchmark: if true, the generated source is never written
            to a debug file, even when ``"cuda_dump_kernels"`` is in
            ``discr.debug``.
        :returns: a tuple ``(func, field_texref)`` of the prepared kernel
            function and the texture reference for ``field_tex``.
        """
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        # Rebinds the *elgroup* parameter; the unpacking fails unless
        # discr.element_groups holds exactly one item.
        elgroup, = discr.element_groups

        float_type = given.float_type

        # Kernel signature: one uint8 pointer followed by one float_type
        # pointer per dimension.
        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                #Pointer(POD(float_type, "debugbuf")),
                ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        # NOTE(review): rst_channels is not referenced again in this function.
        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ELS_PER_MB", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Line(),
                Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)"
                     % given.float_size()),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        # Load code for one DIFFMAT_SEGMENT_BYTES-sized segment into
        # smem_diff_rst_mat, followed by a __syncthreads() statement.
        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment")
            +[S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            """Return the statement list forming the body of the
            ``seq_mb_number`` loop."""
            code = []
            # One zero-initialized accumulator d<inl>rst<axis> per
            # (inl, axis) pair.
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                # C expression indexing smem_diff_rst_mat at (row, axis, col).
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row":row, "col":col, "axis":axis}
                        )

            # NOTE(review): tex_channels is not referenced again in this
            # function.
            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            # For every j the lambda yields the texture fetches into
            # field_value<inl> followed by the accumulator updates.
            # Presumably unroll() expands this over dofs_per_el() values of
            # j, limited by max_unroll -- confirm in hedge.backends.cuda.tools.
            code.extend(
                    [POD(float_type, "field_value%d" % inl)
                        for inl in range(par.inline)]
                    +[Line()]
                    +unroll(lambda j: [
                        Assign("field_value%d" % inl,
                            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                            "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                            )
                        for inl in range(par.inline)]
                        +[Line()]
                        +[S("d%drst%d += %s * field_value%d"
                            % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                        for axis in dims
                        for inl in range(par.inline)]
                        +[Line()],
                        given.dofs_per_el(), self.plan.max_unroll)
                    )

            # Copy every accumulator to its drst<axis>_global output array.
            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis),
                        ))

            # The stores are emitted inside an If on MB_DOF.
            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code()))
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        # Optionally dump the generated source to a debug file.
        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=10"]
                )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        # NOTE(review): f_decl lists the uint8 pointer first and then one
        # pointer per dimension, whereas this type list is d float_type
        # entries followed by "P" -- confirm this is what prepare() expects.
        func.prepare(
                discr.dimensions*[float_type] + ["P"],
                block=(self.plan.segment_size, par.parallel, 1),
                texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return func, field_texref

Example #18

Show file

    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        """Generate, compile and prepare the ``apply_diff_mat`` kernel.

        A cgen ``Module`` is assembled from a texture declaration, a set of
        preprocessor defines, a ``CudaShared`` array ``smem_diff_rst_mat``
        and the ``apply_diff_mat`` function, and is then handed to
        ``SourceModule``.

        :arg diff_op_cls: passed to ``self.gpu_diffmats`` together with
            *elgroup* to obtain the differentiation matrix data.
        :arg elgroup: element group passed to ``self.gpu_diffmats``.
        :arg for_benchmark: if true, the generated source is never written
            to a debug file, even when ``"cuda_dump_kernels"`` is in
            ``discr.debug``.
        :returns: a tuple ``(func, field_texref)`` of the prepared kernel
            function and the texture reference for ``field_tex``.
        """
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        # Rebinds the *elgroup* parameter; the unpacking fails unless
        # discr.element_groups holds exactly one item.
        elgroup, = discr.element_groups

        float_type = given.float_type

        # Kernel signature: one uint8 pointer followed by one float_type
        # pointer per dimension.
        f_decl = CudaGlobal(
            FunctionDeclaration(
                Value("void", "apply_diff_mat"),
                [
                    Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                    #Pointer(POD(float_type, "debugbuf")),
                ] +
                [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]))

        # NOTE(review): rst_channels is not referenced again in this function.
        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "field_tex"),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ELS_PER_MB", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Line(),
            Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
            Define("DIFFMAT_SEGMENT_BYTES",
                   "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()),
            Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                        "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
            Line(),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"),
        ])

        # Load code for one DIFFMAT_SEGMENT_BYTES-sized segment into
        # smem_diff_rst_mat, followed by a __syncthreads() statement.
        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment") +
            [S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            """Return the statement list forming the body of the
            ``seq_mb_number`` loop."""
            code = []
            # One zero-initialized accumulator d<inl>rst<axis> per
            # (inl, axis) pair.
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                # C expression indexing smem_diff_rst_mat at (row, axis, col).
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            # NOTE(review): tex_channels is not referenced again in this
            # function.
            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            # For every j the lambda yields the texture fetches into
            # field_value<inl> followed by the accumulator updates.
            # Presumably unroll() expands this over dofs_per_el() values of
            # j, limited by max_unroll -- confirm in hedge.backends.cuda.tools.
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            # Copy every accumulator to its drst<axis>_global output array.
            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            # The stores are emitted inside an If on MB_DOF.
            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block(get_scalar_diff_code()))
        ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        # Optionally dump the generated source to a debug file.
        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=10"]
        )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        # NOTE(review): f_decl lists the uint8 pointer first and then one
        # pointer per dimension, whereas this type list is d float_type
        # entries followed by "P" -- confirm this is what prepare() expects.
        func.prepare(discr.dimensions * [float_type] + ["P"],
                     block=(self.plan.segment_size, par.parallel, 1),
                     texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return func, field_texref

Example #19

Show file

    def get_kernel(self, with_scaling, for_benchmark=False):
        """Generate and prepare the ``apply_el_local_mat_smem_mat`` CUDA kernel.

        The kernel source is assembled as a :mod:`cgen` module, handed to
        ``SourceModule`` and the resulting function is prepared with the
        argument format ``"PPPI"`` and a thread block of shape
        ``(segment_size, parallelism.parallel, 1)``.

        :arg with_scaling: if true, a ``scaling_tex`` texture is declared and
            every result is multiplied by a value fetched from it before it
            is stored to ``out_vector``; otherwise the multiplier is ``1``.
        :arg for_benchmark: if true, the generated source is never dumped to
            a debug file, even when ``"cuda_dump_kernels"`` is in
            ``discr.debug``.
        :returns: a tuple ``(func, in_vector_texref, scaling_texref)``.
            ``scaling_texref`` is ``None`` when *with_scaling* is false.
        """
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        given = self.plan.given

        float_type = given.float_type

        # Kernel signature; the argument order must agree with the "PPPI"
        # format passed to func.prepare() at the end of this method.
        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
            ]))

        # module preamble: include and texture declarations ------------------
        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "in_vector_tex"),
        ])
        if with_scaling:
            cmod.append(
                Value(
                    "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type), "scaling_tex"))

        par = self.plan.parallelism

        # preprocessor constants, shared buffers and lookup tables -----------
        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                   self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            # GLOBAL_MB_NR refers to seq_mb_number, the loop variable of the
            # For loop emitted by mat_mul_outer_loop() below.
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE",
                   "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS",
                   self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                   "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_matrix"),
                        "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT")),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            # Per-segment element ranges, precomputed on the host: the index
            # of the first element a segment touches ...
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [(chk * self.plan.segment_size) // given.dofs_per_el()
                 for chk in range(self.plan.segments_per_microblock())]),
            # ... and one past the last element it touches, clamped to the
            # number of elements in a microblock.
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [
                    min(given.microblock.elements,
                        (chk * self.plan.segment_size +
                         self.plan.segment_size - 1) // given.dofs_per_el() +
                        1)
                    for chk in range(self.plan.segments_per_microblock())
                ]),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block(
            "calculate this dof's element",
            [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

        if self.plan.use_prefetch_branch:
            # Thread 0 fills in the shared segment_* variables; the barrier
            # that follows is emitted for all threads.
            f_body.extend_log_block("calculate segment responsibility data", [
                If(
                    "THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el",
                               "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el",
                               "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count",
                               "segment_stop_el-segment_start_el"),
                    ])),
                S("__syncthreads()")
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(dest="smem_matrix",
                          base=(
                              "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                          bytes="MATRIX_SEGMENT_BYTES",
                          descr="load matrix segment") +
            [S("__syncthreads()")])

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            """Return statements that stage preimage DOFs in ``dof_buffer``
            one batch of ``segment_size`` at a time and accumulate into
            ``result<inl>``.

            *el_fetch_count* is accepted for symmetry with the caller but is
            not used here.
            """
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                # The texture fetches into dof_buffer are bracketed by
                # barriers on both sides.
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                # One multiply-add per (dof, inline microblock) pair of the
                # current batch; the buffer index is relative to the batch.
                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            """Return statements that fetch each preimage DOF straight from
            ``in_vector_tex`` and accumulate into ``result<inl>``, with the
            loop over DOFs produced by ``unroll``.
            """
            return (
                [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
                [POD(float_type, "lm"), Line()] + unroll(
                    lambda j: [
                        Assign(
                            "fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                                "j": j,
                                "inl": inl,
                            },
                        ) for inl in range(par.inline)
                    ] + [
                        Assign(
                            "lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                                "j": j,
                                "row": "SEGMENT_DOF"
                            },
                        )
                    ] + [
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                        for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll) + [Line()])

        def get_mat_mul_code(el_fetch_count):
            """Select the batched variant for a fetch count of exactly one,
            and the direct texture variant for every other value.
            """
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            """Return the ``For`` loop over ``seq_mb_number`` that zeroes the
            accumulators, runs the matrix multiply and conditionally stores
            the results to ``out_vector``.
            """
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                # Contains no format specifier, so the %-substitution below
                # leaves it unchanged.
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))

        if self.plan.use_prefetch_branch:
            # One branch per possible value of segment_el_count, from 1 up to
            # and including max_elements_touched_by_segment().
            from cgen import make_multiple_ifs
            f_body.append(
                make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                     mat_mul_outer_loop(fetch_count)) for fetch_count in range(
                         1,
                         self.plan.max_elements_touched_by_segment() + 1)
                ]))
        else:
            # A fetch count of 0 makes get_mat_mul_code() pick the direct
            # texture variant.
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            # Close the dump explicitly instead of leaving the flush to
            # garbage collection of the temporary file object.
            dump_file = open_unique_debug_file(self.plan.debug_name, ".cu")
            try:
                dump_file.write(str(cmod))
            finally:
                dump_file.close()

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
        )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            # Single parenthesized argument: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print("%s: lmem=%d smem=%d regs=%d" % (
                self.plan.debug_name, func.local_size_bytes,
                func.shared_size_bytes, func.num_regs))

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare("PPPI",
                     block=(self.plan.segment_size,
                            self.plan.parallelism.parallel, 1),
                     texrefs=texrefs)

        return func, in_vector_texref, scaling_texref