Example #1
        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))
Example #2
    def make_codepy_module(self, toolchain, dtype):
        from codepy.libraries import add_codepy
        toolchain = toolchain.copy()
        add_codepy(toolchain)

        from cgen import (Value, Include, Statement,
                Typedef, FunctionBody, FunctionDeclaration, Block, Const,
                Line, POD, Initializer, CustomLoop)
        S = Statement

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        mod.add_to_preamble([
            Include("vector"),
            Include("algorithm"),
            Include("hedge/base.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            Include("boost/numeric/ublas/io.hpp"),
            ]+self.get_cpu_extra_includes())

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            ])

        mod.add_function(FunctionBody(
            FunctionDeclaration(Value("void", "process_elements"), [
                Const(Value("uniform_element_ranges", "ers")),
                Const(Value("numpy_vector<value_type>", "field")),
                Value("numpy_vector<value_type>", "result"),
                ]+self.get_cpu_extra_parameter_declarators()),
            Block([
                Typedef(Value("numpy_vector<value_type>::iterator",
                    "it_type")),
                Typedef(Value("numpy_vector<value_type>::const_iterator",
                    "cit_type")),
                Line(),
                Initializer(Value("it_type", "result_it"),
                    "result.begin()"),
                Initializer(Value("cit_type", "field_it"),
                    "field.begin()"),
                Line() ]+self.get_cpu_extra_preamble()+[ Line(),
                CustomLoop(
                    "BOOST_FOREACH(const element_range er, ers)",
                    Block(self.get_cpu_per_element_code())
                    )
                ])))

        #print mod.generate()
        #toolchain = toolchain.copy()
        #toolchain.enable_debugging
        return mod.compile(toolchain)
Example #3
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
        ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
        ])

    body = Block([
        Initializer(
            Value("numpy_array<%s >::iterator"
                % dtype_to_ctype(varg.dtype),
                varg.name),
            "args.%s_ary.begin()" % varg.name)
        for varg in arguments if isinstance(varg, VectorArg)]
        +[Initializer(
            sarg.declarator(), "args." + sarg.name)
        for sarg in arguments if isinstance(sarg, ScalarArg)]
        )

    body.extend([
        Line(),
        For("unsigned i = 0",
            "i < codepy_length",
            "++i",
            Block([S(operation)])
            )
        ])

    arg_struct = Struct("arg_struct", 
            [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
            FunctionBody(
                FunctionDeclaration(
                    Value("void", name),
                    [POD(numpy.uintp, "codepy_length"),
                        Value("arg_struct", "args")]),
                body))

    return mod
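A possible way to use the descriptor above, shown only as a hedged sketch: the argument list is assumed to be built from codepy.elementwise's VectorArg/ScalarArg (the classes the isinstance checks refer to), and the generated C++ source is merely printed rather than compiled, since compiling requires pyublas and Boost.Python.

# Hedged usage sketch; VectorArg/ScalarArg are assumed to come from
# codepy.elementwise, matching the isinstance checks above.
import numpy
from codepy.elementwise import ScalarArg, VectorArg

args = [VectorArg(numpy.float64, "x"), ScalarArg(numpy.float64, "a")]
mod = get_elwise_module_descriptor(args, "x[i] = a*x[i]", name="scale")
print(mod.generate())  # inspect the generated Boost.Python/C++ source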
Example #4
    def generate_body(self, kernel, codegen_state):
        from cgen import Block

        body = Block()

        # {{{ declare temporaries

        body.extend(
            idi.cgen_declarator
            for tv in six.itervalues(kernel.temporary_variables)
            for idi in tv.decl_info(kernel.target, is_written=True, index_dtype=kernel.index_dtype)
        )

        # }}}

        from loopy.codegen.loop import set_up_hw_parallel_loops

        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        from cgen import Line

        body.append(Line())

        if isinstance(gen_code.ast, Block):
            body.extend(gen_code.ast.contents)
        else:
            body.append(gen_code.ast)

        return body, gen_code.implemented_domains
Example #5
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code
Example #6
def test_cgen():
    s = Struct(
        "yuck",
        [
            POD(
                np.float32,
                "h",
            ),
            POD(np.float32, "order"),
            POD(np.float32, "face_jacobian"),
            ArrayOf(POD(np.float32, "normal"), 17),
            POD(np.uint16, "a_base"),
            POD(np.uint16, "b_base"),
            #CudaGlobal(POD(np.uint8, "a_ilist_number")),
            POD(np.uint8, "b_ilist_number"),
            POD(np.uint8, "bdry_flux_number"),  # 0 if not on boundary
            POD(np.uint8, "reserved"),
            POD(np.uint32, "b_global_base"),
        ])
    f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [
        POD(np.uint8, "reserved"),
        POD(np.uint32, "b_global_base"),
    ])
    f_body = FunctionBody(
        f_decl,
        Block([
            POD(np.uint32, "i"),
            For(
                "i = 0",
                "i < 17",
                "++i",
                If(
                    "a > b",
                    Assign("a", "b"),
                    Block([
                        Assign("a", "b-1"),
                        #Break(),
                    ])),
            ),
            #BlankLine(),
            Comment("all done"),
        ]))
    t_decl = Template(
        'typename T',
        FunctionDeclaration(
            Value('CUdeviceptr', 'scan'),
            [Value('CUdeviceptr', 'inputPtr'),
             Value('int', 'length')]))

    print(s)
    print(f_body)
    print(t_decl)
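Each cgen node also stringifies to C source on its own; a minimal follow-on sketch:

# Minimal sketch: str() of any cgen node is its C rendering.
from cgen import Assign, If
print(If("a > b", Assign("a", "b")))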
Example #7
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement  # noqa: N806

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value(
                "numpy_array<{} >::iterator".format(dtype_to_ctype(
                    varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()")
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), f"args.{sarg.name}")
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)]))
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name), [
                POD(numpy.uintp, "codepy_length"),
                Value("arg_struct", "args")
            ]), body))

    return mod
Example #8
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row":row, "col":col, "axis":axis}
                        )

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend(
                    [POD(float_type, "field_value%d" % inl)
                        for inl in range(par.inline)]
                    +[Line()]
                    +unroll(lambda j: [
                        Assign("field_value%d" % inl,
                            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                            "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                            )
                        for inl in range(par.inline)]
                        +[Line()]
                        +[S("d%drst%d += %s * field_value%d"
                            % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                        for axis in dims
                        for inl in range(par.inline)]
                        +[Line()],
                        given.dofs_per_el(), self.plan.max_unroll)
                    )

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code
Example #9
        def get_matmul_code():
            from hedge.backends.cuda.tools import unroll

            index_check_condition = "GLOBAL_MB_NR < microblock_count"

            def if_(conditions, then):
                final_cond = " && ".join(cond for cond in conditions if cond)
                if final_cond:
                    return If(final_cond, then)
                else:
                    return then

            result = Block([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"), Line(),
                ]+[If(index_check_condition, get_load_code())]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
                ]+[
                Line(),
                POD(float_type, "mat_entry"),
                Line(),
                ])

            result.append(if_(["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition],
                Block(unroll(lambda j:
                    [Assign("mat_entry", "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)]
                    +[
                    S("result%d += mat_entry "
                    "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                    % (inl, inl, j))
                    for inl in range(par.inline)
                    ],
                    total_number=plan.preimage_dofs_per_el)
                    +[Line()]
                    +[Assign(
                        "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl,
                        "result%d" % inl)
                    for inl in range(par.inline)]
                    )))

            return result
Example #10
    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        ecm = self.get_expression_to_code_mapper(codegen_state)

        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
        result = []
        from cgen import Statement as S, Block
        if lsize:
            result.append(
                    S("assert(programCount == %s)"
                        % ecm(lsize[0], PREC_COMPARISON)))

        if gsize:
            launch_spec = "[%s]" % ", ".join(
                                ecm(gs_i, PREC_NONE)
                                for gs_i in gsize)
        else:
            launch_spec = ""

        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)

        result.append(S(
            "launch%s %s(%s)" % (
                launch_spec,
                name,
                ", ".join(arg_names)
                )))

        return Block(result)
Example #11
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)
Example #12
def get_load_code(dest, base, bytes, word_type=numpy.uint32,
        descr=None):
    from cgen import (
            Pointer, POD,
            Comment, Block, Line,
            Constant, For, Statement)

    from cgen import dtype_to_ctype
    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr*sizeof(int) < (%s)" % bytes,
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code
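A rough usage sketch of the helper above, mirroring how the diff kernels further down call it; THREAD_NUM and COALESCING_THREAD_COUNT are assumed to be #defined by the surrounding kernel:

# Rough sketch: render the generated copy loop as C source. The macro
# names are assumed to be provided by the enclosing kernel's #defines.
from cgen import Block

copy_code = get_load_code(
    dest="smem_diff_rst_mat",
    base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
    bytes="DIFFMAT_SEGMENT_BYTES",
    descr="load diff mat segment")
print(Block(copy_code))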
Example #13
def unroll(body_gen, total_number, max_unroll=None, start=0):
    from cgen import For, Line, Block
    from pytools import flatten

    if max_unroll is None:
        max_unroll = total_number

    result = []

    if total_number > max_unroll:
        loop_items = (total_number // max_unroll) * max_unroll

        result.extend([
                For("unsigned j = 0",
                    "j < %d" % loop_items,
                    "j += %d" % max_unroll,
                    Block(list(flatten(
                        body_gen("(j+%d)" % i)
                        for i in range(max_unroll))))
                    ),
                Line()
                ])
        start += loop_items

    result.extend(flatten(
        body_gen(i) for i in range(start, total_number)))

    return result
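A quick illustrative sketch of what unroll produces: with a trivial body generator it emits a partially unrolled For loop followed by a flat scalar remainder.

# Illustrative only: unroll a simple accumulation 10 times,
# at most 4 copies per loop iteration; the remainder is emitted flat.
from cgen import Block, Statement

body = unroll(lambda i: [Statement("acc += src[%s]" % i)],
              total_number=10, max_unroll=4)
print(Block(body))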
Example #14
    def add_struct(self,
            struct, py_name=None, py_member_name_transform=lambda x: x,
            by_value_members=None):
        if by_value_members is None:
            by_value_members = set()

        from cgen import Block, Line, Statement, Typedef, Value

        if py_name is None:
            py_name = struct.tpname

        self.mod_body.append(struct)

        member_defs = []
        for f in struct.fields:
            py_f_name = py_member_name_transform(f.name)
            tp_lines, declarator = f.get_decl_pair()
            if f.name in by_value_members or tp_lines[0].startswith("numpy_"):
                member_defs.append(
                        ".def(pyublas::by_value_rw_member"
                        f'("{py_f_name}", &cl::{f.name}))')
            else:
                member_defs.append(
                        f'.def_readwrite("{py_f_name}", &cl::{f.name})'
                        )

        self.init_body.append(
            Block([
                Typedef(Value(struct.tpname, "cl")),
                Line(),
                Statement(
                    'boost::python::class_<cl>("{}"){}'.format(
                        py_name, "".join(member_defs))),
                ]))
Example #15
    def generate(self):
        """Generate (i.e. yield) the source code of the
        module line-by-line.
        """

        from cgen import Block, Module, Include, Line, Define, \
                PrivateNamespace

        body = []

        if self.max_arity is not None:
            body.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

        if self.use_private_namespace:
            mod_body = [PrivateNamespace(self.mod_body)]
        else:
            mod_body = self.mod_body

        body += ([Include("boost/python.hpp")]
                + self.preamble + [Line()]
                + mod_body
                + [Line(), Line(f"BOOST_PYTHON_MODULE({self.name})")]
                + [Block(self.init_body)])

        return Module(body)
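A minimal sketch of inspecting the result: the cgen Module returned by generate() stringifies to the complete Boost.Python source that compile() hands to the toolchain.

# Minimal sketch: print the generated C++ source without compiling it.
from codepy.bpl import BoostPythonModule

mod = BoostPythonModule()
print(mod.generate())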
Example #16
def generate_c_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    if codegen_state.vectorization_info is not None:
        raise Unvectorizable("C instructions cannot be vectorized")

    body = []

    from loopy.target.c import POD
    from cgen import Initializer, Block, Line

    from pymbolic.primitives import Variable
    for name, iname_expr in insn.iname_exprs:
        if (isinstance(iname_expr, Variable)
                and name not in codegen_state.var_subst_map):
            # No need, the bare symbol will work
            continue

        body.append(
            Initializer(
                POD(codegen_state.ast_builder, kernel.index_dtype, name),
                codegen_state.expression_to_code_mapper(iname_expr,
                                                        prec=PREC_NONE,
                                                        type_context="i")))

    if body:
        body.append(Line())

    body.extend(Line(line) for line in insn.code.split("\n"))

    return Block(body)
Example #17
        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code
Example #18
        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code
Example #19
    def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound,
                             ubound, inner):
        from cgen import Pragma, Block
        loop = super().emit_sequential_loop(codegen_state, iname, iname_dtype,
                                            lbound, ubound, inner)

        pragma = self.target.iname_pragma_map.get(iname)
        if pragma:
            return Block(contents=[
                Pragma(pragma),
                loop,
            ])
        return loop
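For reference, the Pragma-over-loop wrapping used above looks like this as a standalone cgen construction; the pragma text and loop body here are made up for illustration:

# Standalone illustration of wrapping a loop in a pragma; the
# "unroll" pragma and the loop body are hypothetical examples.
from cgen import Block, For, Pragma, Statement

loop = For("int i = 0", "i < n", "++i",
           Block([Statement("out[i] = in[i]")]))
print(Block(contents=[Pragma("unroll"), loop]))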
Example #20
def make_greet_mod(greeting):
    from cgen import FunctionBody, FunctionDeclaration, Block, \
            Const, Pointer, Value, Statement
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule()

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []),
            Block([Statement('return "%s"' % greeting)])))

    from codepy.toolchain import guess_toolchain
    return mod.compile(guess_toolchain(), wait_on_error=True)
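A hedged usage sketch, assuming Boost.Python headers and libraries are available to the guessed toolchain: the compiled module exposes the wrapped function under its C name.

# Hedged sketch; compilation requires a working Boost.Python install.
greeter = make_greet_mod("hello from codepy")
print(greeter.greet())  # prints: hello from codepy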
Example #21
    def expose_vector_type(self, name, py_name=None):
        self.add_codepy_include()

        if py_name is None:
            py_name = name

        from cgen import (Block, Typedef, Line, Statement, Value)

        self.init_body.append(
            Block([
                Typedef(Value(name, "cl")),
                Line(),
                Statement(
                    f'boost::python::class_<cl>("{py_name}")'
                    ".def(codepy::no_compare_indexing_suite<cl>())"),
                ]))
Example #22
    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While,
                Comment, Block, For, Line, Pointer)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "el_modes"), "field_it+er.first"),

                Line(),
                Comment("zero out reduced_modes"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < max_degree+1",
                    "++mode_idx",
                    S("reduced_modes[mode_idx] = 0")),

                Line(),
                Comment("gather modes by degree"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("reduced_modes[mode_degrees_iterator[mode_idx]]"
                        " += el_modes[mode_idx]")),

                Line(),
                Comment("perform skyline procedure"),
                Initializer(Pointer(Value("value_type", "start")),
                    "reduced_modes.get()"),
                Initializer(Pointer(Value("value_type", "end")),
                    "start+max_degree+1"),
                Initializer(Value("value_type", "cur_max"),
                    "std::max(*(end-1), *(end-2))"),

                Line(),
                While("end != start", Block([
                    S("--end"),
                    S("*end = std::max(cur_max, *end)"),
                    ])),

                Line(),
                Comment("scatter modes by degree"),
                Initializer(Value("it_type", "tgt_base"), "result_it+er.first"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("tgt_base[mode_idx] = "
                        "reduced_modes[mode_degrees_iterator[mode_idx]]")),
                ]
Example #23
    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        ecm = self.get_expression_to_code_mapper(codegen_state)

        from pymbolic.mapper.stringifier import PREC_NONE
        result = []
        from cgen import Statement as S, Block
        if lsize:
            result.append(
                S("assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE)))

        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)

        from cgen.ispc import ISPCLaunch
        result.append(
            ISPCLaunch(tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize),
                       "%s(%s)" % (name, ", ".join(arg_names))))

        return Block(result)
Example #24
    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While, Block)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "start"), "field_it+er.first"),
                Initializer(Value("cit_type", "end"), "field_it+er.second"),
                Initializer(Value("it_type", "tgt"), "result_it+er.first"),

                Initializer(Value("cit_type", "cur"), "start"),
                While("cur != end",
                    Block([
                        Initializer(Value("cit_type", "avg_start"),
                            "std::max(start, cur-1)"),
                        Initializer(Value("cit_type", "avg_end"),
                            "std::min(end, cur+2)"),

                        S("*tgt++ = std::accumulate(avg_start, avg_end, value_type(0))"
                            "/std::distance(avg_start, avg_end)"),
                        S("++cur"),
                        ])
                    )
                ]
Example #25
    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ]
            + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        par = plan.parallelism

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                ])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",
                    "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"
                    % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),

                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_field"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        # ---------------------------------------------------------------------
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=16"]
                )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                given.microblock.aligned_floats//chunk_size)

        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                texrefs=[diff_rst_mat_texref])

        return block, func
Example #26
    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        from cgen import (
            Pointer,
            POD,
            Value,
            ArrayOf,
            Const,
            Module,
            FunctionDeclaration,
            FunctionBody,
            Block,
            Comment,
            Line,
            Define,
            Include,
            Initializer,
            If,
            For,
            Statement,
            Assign,
        )

        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(
                Value("void", "apply_diff_mat_smem"),
                [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field"))]
                + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims],
            )
        )

        par = plan.parallelism

        cmod = Module([Include("pycuda-helpers.hpp")])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend(
            [
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define(
                    "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"
                ),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(ArrayOf(POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB",
                    )
                ),
                Line(),
            ]
        )

        S = Statement
        f_body = Block([Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line()])

        # ---------------------------------------------------------------------
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs + mb_img_dofs - 1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(
                        var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr),
                    )
                    store_instr = Assign("smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var)
                    if (load_block + 1) * mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                            "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        )
                    )

            from hedge.backends.cuda.tools import unroll

            code.extend(
                [
                    Comment("everybody needs to be done with the old data"),
                    S("__syncthreads()"),
                    Line(),
                    get_load_code(),
                    Line(),
                    Comment("all the new data must be loaded"),
                    S("__syncthreads()"),
                    Line(),
                ]
            )

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([POD(float_type, "field_value%d" % inl) for inl in range(par.inline)] + [Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                ]

                if float_type == numpy.float32:
                    result.append(
                        Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)
                    )
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims
                    )
                elif float_type == numpy.float64:
                    result.extend(
                        S(
                            "d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {"inl": inl, "axis": axis, "j": j}
                        )
                        for inl in range(par.inline)
                        for axis in dims
                    )
                else:
                    assert False

                return result

            code.append(
                If(
                    "IMAGE_MB_DOF < IMAGE_DOFS_PER_MB",
                    Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) + [store_code]),
                )
            )

            return code

        f_body.extend(
            [
                For(
                    "unsigned short seq_mb_number = 0",
                    "seq_mb_number < SEQ_MB_COUNT",
                    "++seq_mb_number",
                    Block(get_scalar_diff_code()),
                )
            ]
        )

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file

            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            # options=["--maxrregcount=16"]
        )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (func.local_size_bytes, func.shared_size_bytes, func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (chunk_size, plan.parallelism.parallel, given.microblock.aligned_floats // chunk_size)

        func.prepare(["PP"] + discr.dimensions * ["P"], texrefs=[diff_rst_mat_texref])

        return block, func
Example #27
from cgen import (FunctionBody, FunctionDeclaration, POD, Value,
        Pointer, Module, Block, Initializer, Assign)
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
                                ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" %
                (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu, block=(block_size, 1, 1), grid=(macroblock_count, 1))

c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0
Example #28
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                #Pointer(POD(float_type, "debugbuf")),
                ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ELS_PER_MB", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Line(),
                Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)"
                     % given.float_size()),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment")
            +[S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row":row, "col":col, "axis":axis}
                        )

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend(
                    [POD(float_type, "field_value%d" % inl)
                        for inl in range(par.inline)]
                    +[Line()]
                    +unroll(lambda j: [
                        Assign("field_value%d" % inl,
                            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                            "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                            )
                        for inl in range(par.inline)]
                        +[Line()]
                        +[S("d%drst%d += %s * field_value%d"
                            % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                        for axis in dims
                        for inl in range(par.inline)]
                        +[Line()],
                        given.dofs_per_el(), self.plan.max_unroll)
                    )

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code()))
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=10"]
                )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(
                discr.dimensions*[float_type] + ["P"],
                block=(self.plan.segment_size, par.parallel, 1),
                texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return func, field_texref
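
The fetch/accumulate statements above are replicated by hedge.backends.cuda.tools.unroll. As a rough, hypothetical stand-in only (hedge's real helper also supports partial unrolling via the max_unroll argument, which this sketch omits), the snippet below shows what such a generator-driven unroll boils down to and prints the C it would emit for a toy body.

from cgen import Assign, Line, Statement

def unroll_sketch(body_gen, total_number):
    # naive full unroll: emit the statements generated for every j, in order
    result = []
    for j in range(total_number):
        result.extend(body_gen(j))
    return result

# toy body mirroring the fetch/accumulate pattern of get_scalar_diff_code above
toy = unroll_sketch(
    lambda j: [
        Assign("field_value0",
            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE"
            " + mb_el*DOFS_PER_EL + %d)" % j),
        Statement("d0rst0 += smem_diff_rst_mat["
            "SEGMENT_DOF*DIFFMAT_COLUMNS + %d] * field_value0" % j),
        Line(),
        ],
    total_number=4)

print("\n".join(str(stmt) for stmt in toy))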
Example #29
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=list(expr_to_texture_map.values()) + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map
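
COALESCING_THREAD_COUNT above rounds the available thread count down to a multiple of 16 once at least 16 threads exist, so the flux store loop strides by a half-warp-aligned count. A quick plain-Python check of that ternary expression with made-up thread counts:

def coalescing_thread_count(thread_count):
    # mirrors (THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)
    return thread_count if thread_count < 0x10 else thread_count & ~0xf

for tc in [7, 16, 24, 70, 96]:
    print(tc, "->", coalescing_thread_count(tc))
# 7 -> 7, 16 -> 16, 24 -> 16, 70 -> 64, 96 -> 96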
Example #30
def generate_assignment_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper

    from loopy.expression import dtype_to_type_context, VectorizabilityChecker

    # {{{ vectorization handling

    if codegen_state.vectorization_info:
        if insn.atomicity:
            raise Unvectorizable("atomic operation")

        vinfo = codegen_state.vectorization_info
        vcheck = VectorizabilityChecker(kernel, vinfo.iname, vinfo.length)
        lhs_is_vector = vcheck(insn.assignee)
        rhs_is_vector = vcheck(insn.expression)

        if not lhs_is_vector and rhs_is_vector:
            raise Unvectorizable("LHS is scalar, RHS is vector, cannot assign")

        is_vector = lhs_is_vector

        del lhs_is_vector
        del rhs_is_vector

    # }}}

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript

    lhs = insn.assignee
    if isinstance(lhs, Variable):
        assignee_var_name = lhs.name
        assignee_indices = ()

    elif isinstance(lhs, Subscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = lhs.index_tuple

    elif isinstance(lhs, LinearSubscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = (lhs.index, )

    else:
        raise RuntimeError("invalid lvalue '%s'" % lhs)

    lhs_var = kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity is not None:
        lhs_atomicity = [
            a for a in insn.atomicity if a.var_name == assignee_var_name
        ]
        assert len(lhs_atomicity) <= 1
        if lhs_atomicity:
            lhs_atomicity, = lhs_atomicity
        else:
            lhs_atomicity = None
    else:
        lhs_atomicity = None

    from loopy.kernel.data import AtomicInit, AtomicUpdate

    lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    if lhs_atomicity is None:
        result = codegen_state.ast_builder.emit_assignment(
            codegen_state, lhs_code,
            ecm(insn.expression,
                prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype))

    elif isinstance(lhs_atomicity, AtomicInit):
        raise NotImplementedError("atomic init")

    elif isinstance(lhs_atomicity, AtomicUpdate):
        codegen_state.seen_atomic_dtypes.add(lhs_dtype)
        result = codegen_state.ast_builder.generate_atomic_update(
            kernel, codegen_state, lhs_atomicity, lhs_var, insn.assignee,
            insn.expression, lhs_dtype, rhs_type_context)

    else:
        raise ValueError("unexpected lhs atomicity type: %s" %
                         type(lhs_atomicity).__name__)

    # {{{ tracing

    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
        if codegen_state.vectorization_info and is_vector:
            raise Unvectorizable("tracing does not support vectorization")

        from cgen import Statement as S  # noqa

        gs, ls = kernel.get_grid_size_upper_bounds()

        printf_format = "%s.%s[%s][%s]: %s" % (kernel.name, insn.id, ", ".join(
            "gid%d=%%d" % i for i in range(len(gs))), ", ".join(
                "lid%d=%%d" % i for i in range(len(ls))), assignee_var_name)

        printf_args = (["gid(%d)" % i for i in range(len(gs))] +
                       ["lid(%d)" % i for i in range(len(ls))])

        if assignee_indices:
            printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])
            printf_args.extend(
                ecm(i, prec=PREC_NONE, type_context="i")
                for i in assignee_indices)

        if kernel.options.trace_assignment_values:
            if lhs_dtype.numpy_dtype.kind == "i":
                printf_format += " = %d"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "f":
                printf_format += " = %g"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "c":
                printf_format += " = %g + %gj"
                printf_args.extend(["(%s).x" % lhs_code, "(%s).y" % lhs_code])

        if printf_args:
            printf_args_str = ", " + ", ".join(printf_args)
        else:
            printf_args_str = ""

        printf_insn = S("printf(\"%s\\n\"%s)" %
                        (printf_format, printf_args_str))

        from cgen import Block
        if kernel.options.trace_assignment_values:
            result = Block([result, printf_insn])
        else:
            # print first, execute later -> helps find segfaults
            result = Block([printf_insn, result])

    # }}}

    return result
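
For illustration only, the trace format string assembled above can be reproduced standalone. With hypothetical names (kernel "rot", instruction "insn0", two group axes, one local axis, assignee "out" indexed by "i") it comes out as shown below; this sketches just the string assembly, not loopy's actual code path.

kernel_name, insn_id, assignee_var_name = "rot", "insn0", "out"
gs, ls = ("g0", "g1"), ("l0",)          # hypothetical grid/local axis tuples
assignee_indices = ("i",)

printf_format = "%s.%s[%s][%s]: %s" % (
    kernel_name, insn_id,
    ", ".join("gid%d=%%d" % i for i in range(len(gs))),
    ", ".join("lid%d=%%d" % i for i in range(len(ls))),
    assignee_var_name)
printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])

print(printf_format)
# rot.insn0[gid0=%d, gid1=%d][lid0=%d]: out[%d]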
Example #31
    def generate_code(self, kernel, codegen_state, impl_arg_info):
        from cgen import (FunctionBody, FunctionDeclaration, Value, Module,
                Block, Line, Statement as S)
        from cgen.ispc import ISPCExport, ISPCTask

        knl_body, implemented_domains = kernel.target.generate_body(
                kernel, codegen_state)

        inner_name = "lp_ispc_inner_"+kernel.name
        arg_decls = [iai.cgen_declarator for iai in impl_arg_info]
        arg_names = [iai.name for iai in impl_arg_info]

        # {{{ occa compatibility hackery

        if self.occa_mode:
            from cgen import ArrayOf, Const
            from cgen.ispc import ISPCUniform

            arg_decls = [
                    Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
                    Const(ISPCUniform(Value("int", "o1"))),
                    Const(ISPCUniform(Value("int", "o2"))),
                    Const(ISPCUniform(Value("int", "o3"))),
                    ] + arg_decls
            arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names

        # }}}

        knl_fbody = FunctionBody(
                ISPCTask(
                    FunctionDeclaration(
                        Value("void", inner_name),
                        arg_decls)),
                knl_body)

        # {{{ generate wrapper

        wrapper_body = Block()

        gsize, lsize = kernel.get_grid_sizes_as_exprs()
        if len(lsize) > 1:
            for i, ls_i in enumerate(lsize[1:]):
                if ls_i != 1:
                    raise LoopyError("local axis %d (0-based) "
                            "has length > 1, which is unsupported "
                            "by ISPC" % ls_i)

        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
        ccm = self.get_expression_to_code_mapper(codegen_state)

        wrapper_body.extend([
                S("assert(programCount == %s)"
                    % ccm(lsize[0], PREC_COMPARISON)),
                S("launch[%s] %s(%s)"
                    % (
                        ", ".join(
                            ccm(gs_i, PREC_NONE)
                            for gs_i in gsize),
                        inner_name,
                        ", ".join(arg_names)
                        ))
                ])

        wrapper_fbody = FunctionBody(
                ISPCExport(
                    FunctionDeclaration(
                        Value("void", kernel.name),
                        arg_decls)),
                wrapper_body)

        # }}}

        mod = Module([
            knl_fbody,
            Line(),
            wrapper_fbody,
            ])

        return str(mod), implemented_domains
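
The generated wrapper body above reduces to an assert on programCount plus a single launch statement. A sketch of how that launch line is put together, with hypothetical kernel, grid-size, and argument names:

inner_name = "lp_ispc_inner_axpy"       # hypothetical inner task name
gsize = ("n / 16",)                     # hypothetical grid size expressions
arg_names = ["n", "a", "x", "y"]        # hypothetical kernel arguments

launch = "launch[%s] %s(%s)" % (
    ", ".join(gsize), inner_name, ", ".join(arg_names))
print(launch)
# launch[n / 16] lp_ispc_inner_axpy(n, a, x, y)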
Example #32
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code
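
Every statement in the float64 branch above is just that %-template filled in with concrete (inl, axis, j) values. Expanding one instance by hand (hypothetical inl=0, axis=1, j=3) shows the C line that lands in the unrolled loop:

stmt = ("d%(inl)drst%(axis)d += "
        "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
        "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
        "* field_value%(inl)d" % {"inl": 0, "axis": 1, "j": 3})
print(stmt)
# d0rst1 += fp_tex1Dfetch(diff_rst_mat_tex, 1 + DIMENSIONS*(IMAGE_EL_DOF + 3*IMAGE_DOFS_PER_EL))* field_value0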
Example #33
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):

        from pymbolic.primitives import Sum
        from cgen import Statement
        from pymbolic.mapper.stringifier import PREC_NONE

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            # atomicAdd
            if isinstance(rhs_expr, Sum):
                ecm = self.get_expression_to_code_mapper(codegen_state)

                new_rhs_expr = Sum(
                    tuple(c for c in rhs_expr.children if c != lhs_expr))
                lhs_expr_code = ecm(lhs_expr)
                rhs_expr_code = ecm(new_rhs_expr)

                return Statement("atomicAdd(&{}, {})".format(
                    lhs_expr_code, rhs_expr_code))
            else:
                from cgen import Block, DoWhile, Assign
                from loopy.target.c import POD
                old_val_var = codegen_state.var_name_generator("loopy_old_val")
                new_val_var = codegen_state.var_name_generator("loopy_new_val")

                from loopy.kernel.data import TemporaryVariable
                ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                    })

                lhs_expr_code = ecm(lhs_expr,
                                    prec=PREC_NONE,
                                    type_context=None)

                from pymbolic.mapper.substitutor import make_subst_func
                from pymbolic import var
                from loopy.symbolic import SubstitutionMapper

                subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
                rhs_expr_code = ecm(subst(rhs_expr),
                                    prec=PREC_NONE,
                                    type_context=rhs_type_context,
                                    needed_dtype=lhs_dtype)

                cast_str = ""
                old_val = old_val_var
                new_val = new_val_var

                if lhs_dtype.numpy_dtype.kind == "f":
                    if lhs_dtype.numpy_dtype == np.float32:
                        ctype = "int"
                    elif lhs_dtype.numpy_dtype == np.float64:
                        ctype = "long"
                    else:
                        raise AssertionError()

                    old_val = "*(%s *) &" % ctype + old_val
                    new_val = "*(%s *) &" % ctype + new_val
                    cast_str = "(%s *) " % (ctype)

                return Block([
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        old_val_var),
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        new_val_var),
                    DoWhile(
                        "atomicCAS("
                        "%(cast_str)s&(%(lhs_expr)s), "
                        "%(old_val)s, "
                        "%(new_val)s"
                        ") != %(old_val)s" % {
                            "cast_str": cast_str,
                            "lhs_expr": lhs_expr_code,
                            "old_val": old_val,
                            "new_val": new_val,
                        },
                        Block([
                            Assign(old_val_var, lhs_expr_code),
                            Assign(new_val_var, rhs_expr_code),
                        ]))
                ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
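
The non-atomicAdd fallback above wraps the update in a do-while loop around atomicCAS, reinterpreting the float bits as integers for the compare-and-swap. A standalone sketch with made-up names (a hypothetical float32 update out[i] += value) builds the same cgen structure and prints the C it generates:

import numpy as np
from cgen import Assign, Block, DoWhile, POD

old_val, new_val = "loopy_old_val", "loopy_new_val"
cas_loop = Block([
    POD(np.float32, old_val),
    POD(np.float32, new_val),
    DoWhile(
        # retry until the CAS observes the value we last read
        "atomicCAS((int *) &(out[i]), *(int *) &%s, *(int *) &%s)"
        " != *(int *) &%s" % (old_val, new_val, old_val),
        Block([
            Assign(old_val, "out[i]"),
            Assign(new_val, "%s + value" % old_val),
            ])),
    ])
print(cas_loop)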
Example #34
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Define("MB_EL_COUNT", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
                Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
                Define("MATRIX_SEGMENT_BYTES",
                    "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),

                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                    "MATRIX_SEGMENT_FLOATS")),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "dof_buffer"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "DOFS_PER_SEGMENT"),
                    ),
                CudaShared(POD(numpy.uint16, "segment_start_el")),
                CudaShared(POD(numpy.uint16, "segment_stop_el")),
                CudaShared(POD(numpy.uint16, "segment_el_count")),
                Line(),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [(chk*self.plan.segment_size)//given.dofs_per_el()
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [min(given.microblock.elements,
                            (chk*self.plan.segment_size+self.plan.segment_size-1)
                                //given.dofs_per_el()+1)
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                        for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                        Line(),
                        ])

                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll
        def get_direct_tex_mat_mul_code():
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                        Assign("fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                            % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                        for inl in range(par.inline)
                        ]+[
                        Assign("lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]"
                            % {"j":j, "row": "SEGMENT_DOF"},
                            )
                        ]+[
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                        for inl in range(par.inline)
                        ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[Line()]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
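
The constant segment_start_el_lookup / segment_stop_el_lookup tables above record, per DOF segment, the half-open range of elements that segment touches. Re-evaluating the same comprehensions with small hypothetical sizes makes the mapping concrete:

segment_size, dofs_per_el = 16, 10      # hypothetical plan/given values
elements, segments_per_mb = 6, 4

start = [(chk*segment_size)//dofs_per_el for chk in range(segments_per_mb)]
stop = [min(elements,
            (chk*segment_size + segment_size - 1)//dofs_per_el + 1)
        for chk in range(segments_per_mb)]

print(start)  # [0, 1, 3, 4]
print(stop)   # [2, 4, 5, 6]
# e.g. segment 0 covers DOFs 0..15 and therefore elements 0..1;
# segment 3 covers DOFs 48..63, i.e. elements 4..5 (plus padding)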
Example #35
    def get_kernel(self, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Include, \
                Define, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_field"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(float_type, "in_vector")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "mat_tex"),
                ])

        plan = self.plan
        par = plan.parallelism

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    plan.aligned_preimage_dofs_per_microblock),
                Line(),
                Define("MB_EL_COUNT", plan.elements_per_microblock),
                Line(),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*MB_EL_COUNT)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_in_vector"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "in_vector[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_matmul_code():
            from hedge.backends.cuda.tools import unroll

            index_check_condition = "GLOBAL_MB_NR < microblock_count"

            def if_(conditions, then):
                final_cond = " && ".join(cond for cond in conditions if cond)
                if final_cond:
                    return If(final_cond, then)
                else:
                    return then

            result = Block([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"), Line(),
                ]+[If(index_check_condition, get_load_code())]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
                ]+[
                Line(),
                POD(float_type, "mat_entry"),
                Line(),
                ])

            result.append(if_(["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition],
                Block(unroll(lambda j:
                    [Assign("mat_entry", "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)]
                    +[
                    S("result%d += mat_entry "
                    "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                    % (inl, inl, j))
                    for inl in range(par.inline)
                    ],
                    total_number=plan.preimage_dofs_per_el)
                    +[Line()]
                    +[Assign(
                        "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl,
                        "result%d" % inl)
                    for inl in range(par.inline)]
                    )))

            return result

        f_body.append(For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number", get_matmul_code()))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_field")

        if plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        mat_texref = mod.get_texref("mat_tex")
        texrefs = [mat_texref]

        func.prepare(
                "PPPI",
                texrefs=texrefs)
        assert plan.aligned_image_dofs_per_microblock % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                plan.aligned_image_dofs_per_microblock
                //chunk_size)

        return func, block, mat_texref
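
The launch configuration above uses a 3D thread block (CHUNK_SIZE, PAR_MB_COUNT, chunks per microblock), which is only valid because the aligned image DOF count is a multiple of the hard-coded chunk size of 16. A quick check with hypothetical plan numbers:

chunk_size = 16
aligned_image_dofs_per_microblock = 128   # hypothetical
parallel = 2                              # hypothetical PAR_MB_COUNT

assert aligned_image_dofs_per_microblock % chunk_size == 0
block = (chunk_size, parallel,
         aligned_image_dofs_per_microblock // chunk_size)
print(block, "->", block[0]*block[1]*block[2], "threads per block")
# (16, 2, 8) -> 256 threads per block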
Example #36
    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                        bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                        0) for flux_nr in range(len(self.fluxes))
        ])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.items():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_a_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%d_tex, a_index)" %
                        self.dep_to_index[dep])
                ])
            for dep in ext_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_b_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%s_tex, b_index)" %
                        self.dep_to_index[dep])
                ])

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                    Statement(
                        ("flux%d += " % flux_nr) +
                        flux_to_code(f2cm,
                                     is_flipped=False,
                                     int_field_expr=flux_rec.bpair.field,
                                     ext_field_expr=flux_rec.bpair.bfield,
                                     dep_to_index=self.dep_to_index,
                                     flux=flux_rec.flux_expr,
                                     prec=PREC_NONE)))

            if f2cm.cse_name_list:
                bblock.append(Line())

            bblock.extend(
                Initializer(Value("value_type", cse_name), cse_str)
                for cse_name, cse_str in f2cm.cse_name_list)

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                   Block(bblock + comp_code)),
            ])

        flux_write_code.extend(
            [
                Line(),
            ] + [
                self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                               "fpair->face_jacobian * flux%d" % flux_nr)
                for flux_nr in range(len(self.fluxes))
            ]
            #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
        )

        return flux_write_code
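
Each boundary's contribution above is guarded by a bit test on fpair->boundary_bitmap, so a single face pair can carry several boundary tags at once. The test is plain bit arithmetic; a sketch with hypothetical boundary numbers:

def on_boundary(boundary_bitmap, bdry_number):
    # mirrors (fpair->boundary_bitmap) & (1 << bdry_number)
    return bool(boundary_bitmap & (1 << bdry_number))

bitmap = (1 << 0) | (1 << 3)    # hypothetical: face lies on boundaries 0 and 3
print([on_boundary(bitmap, n) for n in range(4)])
# [True, False, False, True]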
Example #37
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")
                    ),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line() ])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))
                ),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))
                ),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))
                ),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])

        if not fplan.direct_store:
            f_body.extend([
                Line(),
                S("__syncthreads()"),
                Line()
                ])

            f_body.extend_log_block("store fluxes", [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For("unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([Assign(
                            "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                            "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))]
                           #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                               #Block([
                                   #Assign("debugbuf[blockIdx.x]", "word_nr"),
                                   #])
                               #)
                            #for flux_nr in range(len(self.fluxes))]
                        )
                    )
                    ])
        if False:
            f_body.extend([
                    Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"),
                    Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"),
                    ]
                    )

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
                #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
                cmod,
                keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
                (2+len(self.fluxes))*"P",
                texrefs=expr_to_texture_map.values()
                + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return block, func, expr_to_texture_map
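
The method above assembles the entire flux-gather kernel as a cgen AST and only then renders it to CUDA C for pycuda's SourceModule. As a minimal, self-contained sketch of that same pattern (the kernel name, body, and launch parameters below are invented for illustration, not taken from the example):

import numpy
from cgen import (Module, Define, Line, FunctionBody, FunctionDeclaration,
        Value, Pointer, POD, Block, For, Assign)
from cgen.cuda import CudaGlobal

cmod = Module([
    Define("BLOCK_SIZE", 256),
    Line(),
    FunctionBody(
        CudaGlobal(FunctionDeclaration(Value("void", "scale_kernel"), [
            Pointer(POD(numpy.float32, "out")),
            Pointer(POD(numpy.float32, "in")),
            POD(numpy.uint32, "n"),
            ])),
        Block([
            For("unsigned i = blockIdx.x*BLOCK_SIZE + threadIdx.x",
                "i < n", "i += gridDim.x*BLOCK_SIZE",
                Block([Assign("out[i]", "2*in[i]")])),
            ])),
    ])

print(cmod)  # rendered .cu source, ready to hand to pycuda.compiler.SourceModule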
Example #38
0
    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                            bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(
                MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                0)
            for flux_nr in range(len(self.fluxes))])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                    for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(POD(given.float_type, "val_a_field%d"
                            % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%d_tex, a_index)" % self.dep_to_index[dep])
                    ])
            for dep in ext_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(POD(given.float_type, "val_b_field%d"
                            % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%s_tex, b_index)" % self.dep_to_index[dep])
                    ])

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                        Statement(("flux%d += " % flux_nr) +
                            flux_to_code(f2cm, is_flipped=False,
                                int_field_expr=flux_rec.bpair.field,
                                ext_field_expr=flux_rec.bpair.bfield,
                                dep_to_index=self.dep_to_index,
                                flux=flux_rec.flux_expr, prec=PREC_NONE)))

            if f2cm.cse_name_list:
                bblock.append(Line())

            bblock.extend(
                    Initializer(
                        Value("value_type", cse_name), cse_str)
                    for cse_name, cse_str in f2cm.cse_name_list)

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                    Block(bblock+comp_code)),
                ])

        flux_write_code.extend([Line(),]
            +[
            self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                "fpair->face_jacobian * flux%d" % flux_nr)
            for flux_nr in range(len(self.fluxes))
            ]
            #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
            )

        return flux_write_code
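
write_boundary_flux_code follows a bucket-then-emit pattern: contributions are grouped by boundary number with setdefault, then each bucket is wrapped in an If that tests the face pair's boundary bitmap. A small stand-alone sketch of that shape (the flux/boundary numbers and the "bdry_contribution" expression are made up):

from cgen import Block, If, Line, Statement

flux_terms = [(0, 2), (1, 2), (0, 5)]          # (flux_nr, bdry_number), invented
by_bdry = {}
for flux_nr, bdry_number in flux_terms:
    by_bdry.setdefault(bdry_number, []).append(flux_nr)

code = Block()
for bdry_number, flux_nrs in sorted(by_bdry.items()):
    code.extend([
        Line(),
        If("(fpair->boundary_bitmap) & (1 << %d)" % bdry_number,
            Block([Statement("flux%d += bdry_contribution" % nr)
                for nr in flux_nrs])),
        ])

print(code)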
Example #39
0
    def write_interior_flux_code(self, is_twosided):
        given = self.plan.given

        def get_field(flux_rec, is_interior, flipped):
            if is_interior ^ flipped:
                prefix = "a"
            else:
                prefix = "b"

            return ("val_%s_field%d" % (prefix, self.dep_to_index[flux_rec.field_expr]))

        flux_write_code = Block([])

        flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)]

        if is_twosided:
            flux_var_decl.append(Initializer(POD(given.float_type, "b_flux"), 0))
            prefixes = ["a", "b"]
            flip_values = [False, True]
        else:
            prefixes = ["a"]
            flip_values = [False]

        flux_write_code.append(Line())

        for dep in self.interior_deps:
            flux_write_code.append(Comment(str(dep)))

            for side in ["a", "b"]:
                flux_write_code.append(
                        Initializer(
                            MaybeUnused(POD(given.float_type, "val_%s_field%d"
                                % (side, self.dep_to_index[dep]))),
                            "fp_tex1Dfetch(field%d_tex, %s_index)"
                            % (self.dep_to_index[dep], side)))

        f2cm = FluxToCodeMapper(given.float_type)

        flux_sub_codes = []
        for flux_nr, wdflux in enumerate(self.fluxes):
            my_flux_block = Block(flux_var_decl)

            for int_rec in wdflux.interiors:
                for prefix, is_flipped in zip(prefixes, flip_values):
                    my_flux_block.append(
                            Statement("%s_flux += %s"
                                % (prefix,
                                    flux_to_code(f2cm, is_flipped,
                                        int_rec.field_expr,
                                        int_rec.field_expr,
                                        self.dep_to_index,
                                        int_rec.flux_expr, PREC_NONE),
                                    )))

            my_flux_block.append(Line())

            my_flux_block.append(
                    self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                        "fpair->face_jacobian*a_flux"))

            #my_flux_block.append(
                    #Statement("if(isnan(val_b_field5)) debugbuf[blockIdx.x] = 1"),
                    #)

            if is_twosided:
                my_flux_block.append(
                        self.gen_store(flux_nr,
                            "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                            "fpair->b_write_ilist_index + FACEDOF_NR)",
                            "fpair->face_jacobian*b_flux"))

                #my_flux_block.append(
                        #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"),
                        #)

            flux_sub_codes.append(my_flux_block)

        if f2cm.cse_name_list:
            flux_write_code.append(Line())

        flux_write_code.extend(
                Initializer(
                    Value("value_type", cse_name), cse_str)
                for cse_name, cse_str in f2cm.cse_name_list)

        flux_write_code.extend(flux_sub_codes)

        return flux_write_code
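
Both flux writers wrap the fetched field values in MaybeUnused, since a given flux expression need not reference every field that was loaded for its face pair; the attribute keeps the compiler from warning about unreferenced locals. A quick look at what such a declaration renders to (the output comment is a rough expectation, not captured output):

import numpy
from cgen import Initializer, MaybeUnused, POD

decl = Initializer(
    MaybeUnused(POD(numpy.float32, "val_a_field0")),
    "fp_tex1Dfetch(field0_tex, a_index)")
print(decl)
# roughly: float val_a_field0 __attribute__ ((unused)) = fp_tex1Dfetch(field0_tex, a_index);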
Example #40
0
    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, AddressSpace
            ecm = codegen_state.expression_to_code_mapper.with_assignments({
                old_val_var:
                TemporaryVariable(old_val_var, lhs_dtype),
                new_val_var:
                TemporaryVariable(new_val_var, lhs_dtype),
            })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr),
                                prec=PREC_NONE,
                                type_context=rhs_type_context,
                                needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import (TemporaryVariable, ArrayArg)
                if (isinstance(lhs_var, ArrayArg)
                        and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                elif (isinstance(lhs_var, ArrayArg)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                                     "atomic operation: " %
                                     (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s %s *) " % (var_kind, ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s" % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                    },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                    ]))
            ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
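
For floating-point types, emit_atomic_update bit-casts through a same-size integer type and spins on atomic_cmpxchg (or atom_cmpxchg for 64-bit) until the store succeeds. A stripped-down sketch of the compare-and-swap loop it emits for a float32 update of a __global location; the lhs expression "out[i]" and the "+ 1.0f" update are placeholders:

import numpy as np
from cgen import Assign, Block, DoWhile, POD

old_val, new_val, lhs = "loopy_old_val", "loopy_new_val", "out[i]"
cas_loop = Block([
    POD(np.float32, old_val),
    POD(np.float32, new_val),
    DoWhile(
        "atomic_cmpxchg((__global int *) &(%(lhs)s), "
        "*(int *) &%(old)s, *(int *) &%(new)s) != *(int *) &%(old)s"
        % {"lhs": lhs, "old": old_val, "new": new_val},
        Block([
            Assign(old_val, lhs),
            Assign(new_val, "%s + 1.0f" % old_val),
            ])),
    ])
print(cas_loop)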
Example #41
0
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                            "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        )
                    )

            from hedge.backends.cuda.tools import unroll

            code.extend(
                [
                    Comment("everybody needs to be done with the old data"),
                    S("__syncthreads()"),
                    Line(),
                    get_load_code(),
                    Line(),
                    Comment("all the new data must be loaded"),
                    S("__syncthreads()"),
                    Line(),
                ]
            )

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([POD(float_type, "field_value%d" % inl) for inl in range(par.inline)] + [Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                ]

                if float_type == numpy.float32:
                    result.append(
                        Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)
                    )
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims
                    )
                elif float_type == numpy.float64:
                    result.extend(
                        S(
                            "d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {"inl": inl, "axis": axis, "j": j}
                        )
                        for inl in range(par.inline)
                        for axis in dims
                    )
                else:
                    assert False

                return result

            code.append(
                If(
                    "IMAGE_MB_DOF < IMAGE_DOFS_PER_MB",
                    Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) + [store_code]),
                )
            )

            return code
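
get_scalar_diff_code leans on hedge's unroll helper to repeat unroll_body once per preimage DOF. A simplified stand-in (not hedge's actual implementation, which can also emit partially unrolled loops) showing the contract the call site expects:

def unroll_fully(body_gen, total_number):
    # call body_gen once per iteration, passing the loop index so it can be
    # baked into the generated statements as a literal
    result = []
    for j in range(total_number):
        result.extend(body_gen(j))
    return result

# e.g. unroll_fully(unroll_body, plan.preimage_dofs_per_el) would produce a
# fully unrolled matrix-multiply body comparable to the unroll(...) call above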
Example #42
0
    def generate_body(self, kernel, codegen_state):
        from cgen import Block
        body = Block()

        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_is_local = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
        from loopy.codegen import POD  # uses the correct complex type

        class ConstRestrictPointer(Pointer):
            def get_decl_pair(self):
                sub_tp, sub_decl = self.subdecl.get_decl_pair()
                return sub_tp, ("*const restrict %s" % sub_decl)

        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    if idi.shape:
                        temp_var_decl = ArrayOf(temp_var_decl,
                                " * ".join(str(s) for s in idi.shape))

                    temp_decls.append(
                            self.wrap_temporary_decl(temp_var_decl, tv.is_local))

            else:
                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_is_local.setdefault(tv.base_storage, []).append(
                        tv.is_local)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.is_local)

                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?

                    cast_decl = ConstRestrictPointer(cast_decl)
                    temp_var_decl = ConstRestrictPointer(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = POD(self, np.int8, bs_name)
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, base_storage_to_is_local[bs_name])
            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            body.append(bs_var_decl)

        body.extend(temp_decls)

        # }}}

        from loopy.codegen.loop import set_up_hw_parallel_loops
        gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

        from cgen import Line
        body.append(Line())

        if isinstance(gen_code.ast, Block):
            body.extend(gen_code.ast.contents)
        else:
            body.append(gen_code.ast)

        return body, gen_code.implemented_domains
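
The base-storage declarations appended near the end of generate_body are plain byte arrays sized to the largest aliased temporary and carrying an alignment attribute; the aliased temporaries are then initialized as const-restrict pointers into that array. A quick sketch of how the three cgen wrappers compose (size, name, and alignment invented; the output comment is approximate):

import numpy as np
from cgen import AlignedAttribute, ArrayOf, POD

bs_decl = AlignedAttribute(16, ArrayOf(POD(np.int8, "base_storage0"), 1024))
print(bs_decl)
# roughly: signed char base_storage0[1024] __attribute__ ((aligned (16)));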
Example #43
0
def _cusp_solver(M, parameters):
    cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'],
                              p['ksp_atol'], p['ksp_max_it'],
                              p['ksp_gmres_restart'], p['ksp_monitor'])
    module = _cusp_cache.get(cache_key(M.ctype, parameters))
    if module:
        return module

    import codepy.toolchain
    from cgen import FunctionBody, FunctionDeclaration
    from cgen import Block, Statement, Include, Value
    from codepy.bpl import BoostPythonModule
    from codepy.cuda import CudaModule
    gcc_toolchain = codepy.toolchain.guess_toolchain()
    nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()
    if 'CUSP_HOME' in os.environ:
        nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], [])
    host_mod = BoostPythonModule()
    nvcc_mod = CudaModule(host_mod)
    nvcc_includes = [
        'thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h',
        'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h',
        'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h',
        'cusp/precond/ainv.h', 'string'
    ]
    nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes])
    nvcc_mod.add_to_preamble([Statement('using namespace std')])

    # We're translating PETSc preconditioner types to CUSP
    diag = Statement(
        'cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)')
    ainv = Statement(
        'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)'
    )
    amg = Statement(
        'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)'
    )
    none = Statement(
        'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)'
    )
    preconditioners = {
        'diagonal': diag,
        'jacobi': diag,
        'ainv': ainv,
        'ainvcusp': ainv,
        'amg': amg,
        'hypre': amg,
        'none': none,
        None: none
    }
    try:
        precond_call = preconditioners[parameters['pc_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support preconditioner type %s" %
                           parameters['pc_type'])
    solvers = {
        'cg':
        Statement('cusp::krylov::cg(A, x, b, monitor, M)'),
        'bicgstab':
        Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'),
        'gmres':
        Statement(
            'cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' %
            parameters)
    }
    try:
        solve_call = solvers[parameters['ksp_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support solver type %s" %
                           parameters['ksp_type'])
    monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters

    nvcc_function = FunctionBody(
        FunctionDeclaration(Value('void', '__cusp_solve'), [
            Value('CUdeviceptr', '_rowptr'),
            Value('CUdeviceptr', '_colidx'),
            Value('CUdeviceptr', '_csrdata'),
            Value('CUdeviceptr', '_b'),
            Value('CUdeviceptr', '_x'),
            Value('int', 'nrows'),
            Value('int', 'ncols'),
            Value('int', 'nnz')
        ]),
        Block([
            Statement('typedef int IndexType'),
            Statement('typedef %s ValueType' % M.ctype),
            Statement(
                'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices'
            ),
            Statement(
                'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values'
            ),
            Statement(
                'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix'
            ),
            Statement(
                'thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)'
            ),
            Statement(
                'thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)'
            ),
            Statement(
                'thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)'
            ),
            Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'),
            Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'),
            Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'),
            Statement('indices column_indices(colidx, colidx + nnz)'),
            Statement('values matrix_values(csrdata, csrdata + nnz)'),
            Statement('values b(d_b, d_b + nrows)'),
            Statement('values x(d_x, d_x + ncols)'),
            Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'),
            Statement(
                'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)'
            ),
            Statement('cusp::%s_monitor< ValueType > %s' %
                      ('verbose' if parameters['ksp_monitor'] else 'default',
                       monitor)), precond_call, solve_call
        ]))

    host_mod.add_to_preamble(
        [Include('boost/python/extract.hpp'),
         Include('string')])
    host_mod.add_to_preamble([Statement('using namespace boost::python')])
    host_mod.add_to_preamble([Statement('using namespace std')])

    nvcc_mod.add_function(nvcc_function)

    host_mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value('void', 'solve'), [
                Value('object', '_rowptr'),
                Value('object', '_colidx'),
                Value('object', '_csrdata'),
                Value('object', '_b'),
                Value('object', '_x'),
                Value('object', '_nrows'),
                Value('object', '_ncols'),
                Value('object', '_nnz')
            ]),
            Block([
                Statement(
                    'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))'
                ),
                Statement('int nrows = extract<int>(_nrows)'),
                Statement('int ncols = extract<int>(_ncols)'),
                Statement('int nnz = extract<int>(_nnz)'),
                Statement(
                    '__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)'
                )
            ])))

    nvcc_toolchain.cflags.append('-arch')
    nvcc_toolchain.cflags.append('sm_20')
    nvcc_toolchain.cflags.append('-O3')
    module = nvcc_mod.compile(gcc_toolchain,
                              nvcc_toolchain,
                              debug=configuration["debug"])

    _cusp_cache[cache_key(M.ctype, parameters)] = module
    return module
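
_cusp_solver memoizes the compiled module on the matrix ctype plus the solver-relevant PETSc parameters, so the nvcc/gcc round trip only happens once per configuration. A self-contained sketch of that memoization pattern (the build callable stands in for the CudaModule compile):

_cache = {}

def cached_build(ctype, parameters, build):
    key = (ctype, parameters['ksp_type'], parameters['pc_type'],
           parameters['ksp_rtol'], parameters['ksp_atol'],
           parameters['ksp_max_it'], parameters['ksp_gmres_restart'],
           parameters['ksp_monitor'])
    module = _cache.get(key)
    if module is None:
        module = build()   # the expensive nvcc + host compile
        _cache[key] = module
    return module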
Example #44
0
def generate_assignment_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper

    from loopy.expression import VectorizabilityChecker

    # {{{ vectorization handling

    if codegen_state.vectorization_info:
        if insn.atomicity:
            raise Unvectorizable("atomic operation")

        vinfo = codegen_state.vectorization_info
        vcheck = VectorizabilityChecker(kernel, vinfo.iname, vinfo.length)
        lhs_is_vector = vcheck(insn.assignee)
        rhs_is_vector = vcheck(insn.expression)

        if not lhs_is_vector and rhs_is_vector:
            raise Unvectorizable("LHS is scalar, RHS is vector, cannot assign")

        is_vector = lhs_is_vector

        del lhs_is_vector
        del rhs_is_vector

    # }}}

    from pymbolic.primitives import Variable, Subscript, Lookup
    from loopy.symbolic import LinearSubscript

    lhs = insn.assignee
    if isinstance(lhs, Lookup):
        lhs = lhs.aggregate

    if isinstance(lhs, Variable):
        assignee_var_name = lhs.name
        assignee_indices = ()

    elif isinstance(lhs, Subscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = lhs.index_tuple

    elif isinstance(lhs, LinearSubscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = (lhs.index, )

    else:
        raise RuntimeError("invalid lvalue '%s'" % lhs)

    del lhs

    result = codegen_state.ast_builder.emit_assignment(codegen_state, insn)

    # {{{ tracing

    lhs_dtype = codegen_state.kernel.get_var_descriptor(
        assignee_var_name).dtype

    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
        if codegen_state.vectorization_info and is_vector:
            raise Unvectorizable("tracing does not support vectorization")

        from pymbolic.mapper.stringifier import PREC_NONE
        lhs_code = codegen_state.expression_to_code_mapper(
            insn.assignee, PREC_NONE)

        from cgen import Statement as S  # noqa

        gs, ls = kernel.get_grid_size_upper_bounds()

        printf_format = "{}.{}[{}][{}]: {}".format(
            kernel.name, insn.id, ", ".join("gid%d=%%d" % i
                                            for i in range(len(gs))),
            ", ".join("lid%d=%%d" % i for i in range(len(ls))),
            assignee_var_name)

        printf_args = (["gid(%d)" % i for i in range(len(gs))] +
                       ["lid(%d)" % i for i in range(len(ls))])

        if assignee_indices:
            printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])
            printf_args.extend(
                ecm(i, prec=PREC_NONE, type_context="i")
                for i in assignee_indices)

        if kernel.options.trace_assignment_values:
            if lhs_dtype.numpy_dtype.kind == "i":
                printf_format += " = %d"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "f":
                printf_format += " = %g"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "c":
                printf_format += " = %g + %gj"
                printf_args.extend(["(%s).x" % lhs_code, "(%s).y" % lhs_code])

        if printf_args:
            printf_args_str = ", " + ", ".join(str(v) for v in printf_args)
        else:
            printf_args_str = ""

        printf_insn = S('printf("{}\\n"{})'.format(printf_format,
                                                   printf_args_str))

        from cgen import Block
        if kernel.options.trace_assignment_values:
            result = Block([result, printf_insn])
        else:
            # print first, execute later -> helps find segfaults
            result = Block([printf_insn, result])

    # }}}

    return result
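
Tracing a hypothetical 1D kernel "mykernel" (one group axis, one local axis) whose instruction "insn_0" assigns to a float array a[i], the format/argument assembly above boils down to:

printf_format = "mykernel.insn_0[gid0=%d][lid0=%d]: a[%d] = %g"
printf_args = ["gid(0)", "lid(0)", "i", "a[i]"]
printf_args_str = ", " + ", ".join(printf_args)
print('printf("{}\\n"{})'.format(printf_format, printf_args_str))
# printf("mykernel.insn_0[gid0=%d][lid0=%d]: a[%d] = %g\n", gid(0), lid(0), i, a[i])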
Example #45
0
mod = Module([
    FunctionBody(
        CLKernel(
            CLRequiredWorkGroupSize(
                (local_size, ),
                FunctionDeclaration(Value("void", "add"),
                                    arg_decls=[
                                        CLGlobal(
                                            Pointer(Const(POD(dtype, name))))
                                        for name in ["tgt", "op1", "op2"]
                                    ]))),
        Block([
            Initializer(
                POD(numpy.int32,
                    "idx"), "get_local_id(0) + %d * get_group_id(0)" %
                (local_size * thread_strides))
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * local_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * local_size, o * local_size))
            for o in range(thread_strides)
        ]))
])

knl = cl.Program(ctx, str(mod)).build().add

knl(queue, (local_size * macroblock_count, ), (local_size, ), c_buf, a_buf,
    b_buf)

c = numpy.empty_like(a)
cl.enqueue_copy(queue, c, c_buf).wait()
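
The snippet assumes earlier setup along these lines (a hedged sketch, not the original preamble): a pyopencl context and queue, host arrays, device buffers, and the unrolling parameters that size the generated kernel.

import numpy
import pyopencl as cl

local_size = 256
thread_strides = 4
macroblock_count = 32
dtype = numpy.float32

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = local_size * thread_strides * macroblock_count
a = numpy.random.rand(n).astype(dtype)
b = numpy.random.rand(n).astype(dtype)

mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, a.nbytes)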