Python Block.Block 예제들, cgen.Block.Block Python 예제들

예제 #1

0

파일 보기

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))

예제 #2

0

파일 보기

파일: bad_cell.py 프로젝트: yangzilongdmgy/hedge

    def make_codepy_module(self, toolchain, dtype):
        from codepy.libraries import add_codepy
        toolchain = toolchain.copy()
        add_codepy(toolchain)

        from cgen import (Value, Include, Statement,
                Typedef, FunctionBody, FunctionDeclaration, Block, Const,
                Line, POD, Initializer, CustomLoop)
        S = Statement

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        mod.add_to_preamble([
            Include("vector"),
            Include("algorithm"),
            Include("hedge/base.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            Include("boost/numeric/ublas/io.hpp"),
            ]+self.get_cpu_extra_includes())

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            ])

        mod.add_function(FunctionBody(
            FunctionDeclaration(Value("void", "process_elements"), [
                Const(Value("uniform_element_ranges", "ers")),
                Const(Value("numpy_vector<value_type>", "field")),
                Value("numpy_vector<value_type>", "result"),
                ]+self.get_cpu_extra_parameter_declarators()),
            Block([
                Typedef(Value("numpy_vector<value_type>::iterator",
                    "it_type")),
                Typedef(Value("numpy_vector<value_type>::const_iterator",
                    "cit_type")),
                Line(),
                Initializer(Value("it_type", "result_it"),
                    "result.begin()"),
                Initializer(Value("cit_type", "field_it"),
                    "field.begin()"),
                Line() ]+self.get_cpu_extra_preamble()+[ Line(),
                CustomLoop(
                    "BOOST_FOREACH(const element_range er, ers)",
                    Block(self.get_cpu_per_element_code())
                    )
                ])))

        #print mod.generate()
        #toolchain = toolchain.copy()
        #toolchain.enable_debugging
        return mod.compile(toolchain)

예제 #3

0

파일 보기

def test_cgen():
    s = Struct(
        "yuck",
        [
            POD(
                np.float32,
                "h",
            ),
            POD(np.float32, "order"),
            POD(np.float32, "face_jacobian"),
            ArrayOf(POD(np.float32, "normal"), 17),
            POD(np.uint16, "a_base"),
            POD(np.uint16, "b_base"),
            #CudaGlobal(POD(np.uint8, "a_ilist_number")),
            POD(np.uint8, "b_ilist_number"),
            POD(np.uint8, "bdry_flux_number"),  # 0 if not on boundary
            POD(np.uint8, "reserved"),
            POD(np.uint32, "b_global_base"),
        ])
    f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [
        POD(np.uint8, "reserved"),
        POD(np.uint32, "b_global_base"),
    ])
    f_body = FunctionBody(
        f_decl,
        Block([
            POD(np.uint32, "i"),
            For(
                "i = 0",
                "i < 17",
                "++i",
                If(
                    "a > b",
                    Assign("a", "b"),
                    Block([
                        Assign("a", "b-1"),
                        #Break(),
                    ])),
            ),
            #BlankLine(),
            Comment("all done"),
        ]))
    t_decl = Template(
        'typename T',
        FunctionDeclaration(
            Value('CUdeviceptr', 'scan'),
            [Value('CUdeviceptr', 'inputPtr'),
             Value('int', 'length')]))

    print(s)
    print(f_body)
    print(t_decl)

예제 #4

0

파일 보기

파일: elementwise.py 프로젝트: inducer/codepy

def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement  # noqa: N806

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value(
                "numpy_array<{} >::iterator".format(dtype_to_ctype(
                    varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()")
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), f"args.{sarg.name}")
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)]))
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name), [
                POD(numpy.uintp, "codepy_length"),
                Value("arg_struct", "args")
            ]), body))

    return mod

예제 #5

0

파일 보기

    def generate(self):
        """Generate (i.e. yield) the source code of the
        module line-by-line.
        """

        from cgen import Block, Module, Include, Line, Define, \
                PrivateNamespace

        body = []

        if self.max_arity is not None:
            body.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

        if self.use_private_namespace:
            mod_body = [PrivateNamespace(self.mod_body)]
        else:
            mod_body = self.mod_body

        body += ([Include("boost/python.hpp")]
                + self.preamble + [Line()]
                + mod_body
                + [Line(), Line(f"BOOST_PYTHON_MODULE({self.name})")]
                + [Block(self.init_body)])

        return Module(body)

예제 #6

0

파일 보기

파일: tools.py 프로젝트: yangzilongdmgy/hedge

def unroll(body_gen, total_number, max_unroll=None, start=0):
    from cgen import For, Line, Block
    from pytools import flatten

    if max_unroll is None:
        max_unroll = total_number

    result = []

    if total_number > max_unroll:
        loop_items = (total_number // max_unroll) * max_unroll

        result.extend([
                For("unsigned j = 0",
                    "j < %d" % loop_items,
                    "j += %d" % max_unroll,
                    Block(list(flatten(
                        body_gen("(j+%d)" % i)
                        for i in range(max_unroll))))
                    ),
                Line()
                ])
        start += loop_items

    result.extend(flatten(
        body_gen(i) for i in range(start, total_number)))

    return result

예제 #7

0

파일 보기

    def add_struct(self,
            struct, py_name=None, py_member_name_transform=lambda x: x,
            by_value_members=None):
        if by_value_members is None:
            by_value_members = set()

        from cgen import Block, Line, Statement, Typedef, Value

        if py_name is None:
            py_name = struct.tpname

        self.mod_body.append(struct)

        member_defs = []
        for f in struct.fields:
            py_f_name = py_member_name_transform(f.name)
            tp_lines, declarator = f.get_decl_pair()
            if f.name in by_value_members or tp_lines[0].startswith("numpy_"):
                member_defs.append(
                        ".def(pyublas::by_value_rw_member"
                        f'("{py_f_name}", &cl::{f.name}))')
            else:
                member_defs.append(
                        f'.def_readwrite("{py_f_name}", &cl::{f.name})'
                        )

        self.init_body.append(
            Block([
                Typedef(Value(struct.tpname, "cl")),
                Line(),
                Statement(
                    'boost::python::class_<cl>("{}"){}'.format(
                        py_name, "".join(member_defs))),
                ]))

예제 #8

0

파일 보기

파일: diff_shared_fld.py 프로젝트: yangzilongdmgy/hedge

        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

예제 #9

0

파일 보기

    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        ecm = self.get_expression_to_code_mapper(codegen_state)

        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
        result = []
        from cgen import Statement as S, Block
        if lsize:
            result.append(
                    S("assert(programCount == %s)"
                        % ecm(lsize[0], PREC_COMPARISON)))

        if gsize:
            launch_spec = "[%s]" % ", ".join(
                                ecm(gs_i, PREC_NONE)
                                for gs_i in gsize)
        else:
            launch_spec = ""

        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)

        result.append(S(
            "launch%s %s(%s)" % (
                launch_spec,
                name,
                ", ".join(arg_names)
                )))

        return Block(result)

예제 #10

0

파일 보기

def generate_c_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    if codegen_state.vectorization_info is not None:
        raise Unvectorizable("C instructions cannot be vectorized")

    body = []

    from loopy.target.c import POD
    from cgen import Initializer, Block, Line

    from pymbolic.primitives import Variable
    for name, iname_expr in insn.iname_exprs:
        if (isinstance(iname_expr, Variable)
                and name not in codegen_state.var_subst_map):
            # No need, the bare symbol will work
            continue

        body.append(
            Initializer(
                POD(codegen_state.ast_builder, kernel.index_dtype, name),
                codegen_state.expression_to_code_mapper(iname_expr,
                                                        prec=PREC_NONE,
                                                        type_context="i")))

    if body:
        body.append(Line())

    body.extend(Line(line) for line in insn.code.split("\n"))

    return Block(body)

예제 #11

0

파일 보기

파일: tools.py 프로젝트: yangzilongdmgy/hedge

def get_load_code(dest, base, bytes, word_type=numpy.uint32,
        descr=None):
    from cgen import (
            Pointer, POD, 
            Comment, Block, Line, \
            Constant, For, Statement)

    from cgen import dtype_to_ctype
    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr*sizeof(int) < (%s)" % bytes,
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code

예제 #12

0

파일 보기

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

예제 #13

0

파일 보기

파일: compiler.py 프로젝트: walterroach/tvb-hpc

    def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound,
                             ubound, inner):
        from cgen import Pragma, Block
        loop = super().emit_sequential_loop(codegen_state, iname, iname_dtype,
                                            lbound, ubound, inner)

        pragma = self.target.iname_pragma_map.get(iname)
        if pragma:
            return Block(contents=[
                Pragma(pragma),
                loop,
            ])
        return loop

예제 #14

0

파일 보기

def make_greet_mod(greeting):
    from cgen import FunctionBody, FunctionDeclaration, Block, \
            Const, Pointer, Value, Statement
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule()

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []),
            Block([Statement('return "%s"' % greeting)])))

    from codepy.toolchain import guess_toolchain
    return mod.compile(guess_toolchain(), wait_on_error=True)

예제 #15

0

파일 보기

    def expose_vector_type(self, name, py_name=None):
        self.add_codepy_include()

        if py_name is None:
            py_name = name

        from cgen import (Block, Typedef, Line, Statement, Value)

        self.init_body.append(
            Block([
                Typedef(Value(name, "cl")),
                Line(),
                Statement(
                    f'boost::python::class_<cl>("{py_name}")'
                    ".def(codepy::no_compare_indexing_suite<cl>())"),
                ]))

예제 #16

0

파일 보기

파일: bad_cell.py 프로젝트: yangzilongdmgy/hedge

    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While,
                Comment, Block, For, Line, Pointer)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "el_modes"), "field_it+er.first"),

                Line(),
                Comment("zero out reduced_modes"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < max_degree+1",
                    "++mode_idx",
                    S("reduced_modes[mode_idx] = 0")),

                Line(),
                Comment("gather modes by degree"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("reduced_modes[mode_degrees_iterator[mode_idx]]"
                        " += el_modes[mode_idx]")),

                Line(),
                Comment("perform skyline procedure"),
                Initializer(Pointer(Value("value_type", "start")),
                    "reduced_modes.get()"),
                Initializer(Pointer(Value("value_type", "end")),
                    "start+max_degree+1"),
                Initializer(Value("value_type", "cur_max"),
                    "std::max(*(end-1), *(end-2))"),

                Line(),
                While("end != start", Block([
                    S("--end"),
                    S("*end = std::max(cur_max, *end)"),
                    ])),

                Line(),
                Comment("scatter modes by degree"),
                Initializer(Value("it_type", "tgt_base"), "result_it+er.first"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("tgt_base[mode_idx] = "
                        "reduced_modes[mode_degrees_iterator[mode_idx]]")),
                ]

예제 #17

0

파일 보기

    def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
        ecm = self.get_expression_to_code_mapper(codegen_state)

        from pymbolic.mapper.stringifier import PREC_NONE
        result = []
        from cgen import Statement as S, Block
        if lsize:
            result.append(
                S("assert(programCount == (%s))" % ecm(lsize[0], PREC_NONE)))

        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)

        from cgen.ispc import ISPCLaunch
        result.append(
            ISPCLaunch(tuple(ecm(gs_i, PREC_NONE) for gs_i in gsize),
                       "%s(%s)" % (name, ", ".join(arg_names))))

        return Block(result)

예제 #18

0

파일 보기

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

예제 #19

0

파일 보기

파일: bad_cell.py 프로젝트: yangzilongdmgy/hedge

    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While, Block)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "start"), "field_it+er.first"),
                Initializer(Value("cit_type", "end"), "field_it+er.second"),
                Initializer(Value("it_type", "tgt"), "result_it+er.first"),

                Initializer(Value("cit_type", "cur"), "start"),
                While("cur != end",
                    Block([
                        Initializer(Value("cit_type", "avg_start"),
                            "std::max(start, cur-1)"),
                        Initializer(Value("cit_type", "avg_end"),
                            "std::min(end, cur+2)"),

                        S("*tgt++ = std::accumulate(avg_start, avg_end, value_type(0))"
                            "/std::distance(avg_start, avg_end)"),
                        S("++cur"),
                        ])
                    )
                ]

예제 #20

0

파일 보기

파일: diff_shared_fld.py 프로젝트: yangzilongdmgy/hedge

    def get_kernel(self, diff_op, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from pycuda.tools import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        plan = self.plan
        given = plan.given

        elgroup, = discr.element_groups
        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"),
            [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ]
            + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        par = plan.parallelism

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                ])

        if float_type == numpy.float64:
            cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>",
                    "diff_rst_mat_tex"))
        elif float_type == numpy.float32:
            rst_channels = given.devdata.make_valid_tex_channel_count(d)
            cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>"
                    % rst_channels, "diff_rst_mat_tex"))
        else:
            raise ValueError("unsupported float type: %s" % float_type)

        # only preimage size variation is supported here
        assert plan.image_dofs_per_el == given.dofs_per_el()
        assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),

                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock),
                Define("ELS_PER_MB", given.microblock.elements),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_field"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        # ---------------------------------------------------------------------
        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code())
                )
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=16"]
                )

        func = mod.get_function("apply_diff_mat_smem")

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.registers)

        diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
        gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

        if given.float_type == numpy.float32:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
        elif given.float_type == numpy.float64:
            gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                    allow_double_hack=True)
        else:
            assert False

        assert given.microblock.aligned_floats % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                given.microblock.aligned_floats//chunk_size)

        func.prepare(
                ["PP"] + discr.dimensions*["P"],
                texrefs=[diff_rst_mat_texref])

        return block, func

예제 #21

0

파일 보기

    def write_boundary_flux_code(self, for_benchmark):
        given = self.plan.given

        flux_write_code = Block()

        fluxes_by_bdry_number = {}
        for flux_nr, wdflux in enumerate(self.fluxes):
            for bflux_info in wdflux.boundaries:
                if for_benchmark:
                    bdry_number = 0
                else:
                    bdry_number = self.executor.boundary_tag_to_number[
                        bflux_info.bpair.tag]

                fluxes_by_bdry_number.setdefault(bdry_number, [])\
                        .append((flux_nr, bflux_info))

        flux_write_code.extend([
            Initializer(MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)),
                        0) for flux_nr in range(len(self.fluxes))
        ])

        for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems():
            bblock = []

            from pytools import set_sum
            int_deps = set_sum(flux_rec.int_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)
            ext_deps = set_sum(flux_rec.ext_dependencies
                               for flux_nr, flux_rec in nrs_and_fluxes)

            for dep in int_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_a_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%d_tex, a_index)" %
                        self.dep_to_index[dep])
                ])
            for dep in ext_deps:
                bblock.extend([
                    Comment(str(dep)),
                    Initializer(
                        MaybeUnused(
                            POD(given.float_type,
                                "val_b_field%d" % self.dep_to_index[dep])),
                        "fp_tex1Dfetch(field%s_tex, b_index)" %
                        self.dep_to_index[dep])
                ])

            f2cm = FluxToCodeMapper(given.float_type)

            comp_code = [Line()]
            for flux_nr, flux_rec in nrs_and_fluxes:
                comp_code.append(
                    Statement(
                        ("flux%d += " % flux_nr) +
                        flux_to_code(f2cm,
                                     is_flipped=False,
                                     int_field_expr=flux_rec.bpair.field,
                                     ext_field_expr=flux_rec.bpair.bfield,
                                     dep_to_index=self.dep_to_index,
                                     flux=flux_rec.flux_expr,
                                     prec=PREC_NONE)))

            if f2cm.cse_name_list:
                bblock.append(Line())

            bblock.extend(
                Initializer(Value("value_type", cse_name), cse_str)
                for cse_name, cse_str in f2cm.cse_name_list)

            flux_write_code.extend([
                Line(),
                Comment(nrs_and_fluxes[0][1].bpair.tag),
                If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number),
                   Block(bblock + comp_code)),
            ])

        flux_write_code.extend(
            [
                Line(),
            ] + [
                self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                               "fpair->face_jacobian * flux%d" % flux_nr)
                for flux_nr in range(len(self.fluxes))
            ]
            #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"),
        )

        return flux_write_code

예제 #22

0

파일 보기

파일: cuda.py 프로젝트: connorjward/loopy

    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):

        from pymbolic.primitives import Sum
        from cgen import Statement
        from pymbolic.mapper.stringifier import PREC_NONE

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            # atomicAdd
            if isinstance(rhs_expr, Sum):
                ecm = self.get_expression_to_code_mapper(codegen_state)

                new_rhs_expr = Sum(
                    tuple(c for c in rhs_expr.children if c != lhs_expr))
                lhs_expr_code = ecm(lhs_expr)
                rhs_expr_code = ecm(new_rhs_expr)

                return Statement("atomicAdd(&{}, {})".format(
                    lhs_expr_code, rhs_expr_code))
            else:
                from cgen import Block, DoWhile, Assign
                from loopy.target.c import POD
                old_val_var = codegen_state.var_name_generator("loopy_old_val")
                new_val_var = codegen_state.var_name_generator("loopy_new_val")

                from loopy.kernel.data import TemporaryVariable
                ecm = codegen_state.expression_to_code_mapper.with_assignments(
                    {
                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
                    })

                lhs_expr_code = ecm(lhs_expr,
                                    prec=PREC_NONE,
                                    type_context=None)

                from pymbolic.mapper.substitutor import make_subst_func
                from pymbolic import var
                from loopy.symbolic import SubstitutionMapper

                subst = SubstitutionMapper(
                    make_subst_func({lhs_expr: var(old_val_var)}))
                rhs_expr_code = ecm(subst(rhs_expr),
                                    prec=PREC_NONE,
                                    type_context=rhs_type_context,
                                    needed_dtype=lhs_dtype)

                cast_str = ""
                old_val = old_val_var
                new_val = new_val_var

                if lhs_dtype.numpy_dtype.kind == "f":
                    if lhs_dtype.numpy_dtype == np.float32:
                        ctype = "int"
                    elif lhs_dtype.numpy_dtype == np.float64:
                        ctype = "long"
                    else:
                        raise AssertionError()

                    old_val = "*(%s *) &" % ctype + old_val
                    new_val = "*(%s *) &" % ctype + new_val
                    cast_str = "(%s *) " % (ctype)

                return Block([
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        old_val_var),
                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                        new_val_var),
                    DoWhile(
                        "atomicCAS("
                        "%(cast_str)s&(%(lhs_expr)s), "
                        "%(old_val)s, "
                        "%(new_val)s"
                        ") != %(old_val)s" % {
                            "cast_str": cast_str,
                            "lhs_expr": lhs_expr_code,
                            "old_val": old_val,
                            "new_val": new_val,
                        },
                        Block([
                            Assign(old_val_var, lhs_expr_code),
                            Assign(new_val_var, rhs_expr_code),
                        ]))
                ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)

예제 #23

0

파일 보기

        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
                                ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" %
                (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu, block=(block_size, 1, 1), grid=(macroblock_count, 1))

c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0

예제 #24

0

파일 보기

파일: instruction.py 프로젝트: dokempf/loopy

def generate_assignment_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper

    from loopy.expression import dtype_to_type_context, VectorizabilityChecker

    # {{{ vectorization handling

    if codegen_state.vectorization_info:
        if insn.atomicity:
            raise Unvectorizable("atomic operation")

        vinfo = codegen_state.vectorization_info
        vcheck = VectorizabilityChecker(kernel, vinfo.iname, vinfo.length)
        lhs_is_vector = vcheck(insn.assignee)
        rhs_is_vector = vcheck(insn.expression)

        if not lhs_is_vector and rhs_is_vector:
            raise Unvectorizable("LHS is scalar, RHS is vector, cannot assign")

        is_vector = lhs_is_vector

        del lhs_is_vector
        del rhs_is_vector

    # }}}

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript

    lhs = insn.assignee
    if isinstance(lhs, Variable):
        assignee_var_name = lhs.name
        assignee_indices = ()

    elif isinstance(lhs, Subscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = lhs.index_tuple

    elif isinstance(lhs, LinearSubscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = (lhs.index, )

    else:
        raise RuntimeError("invalid lvalue '%s'" % lhs)

    lhs_var = kernel.get_var_descriptor(assignee_var_name)
    lhs_dtype = lhs_var.dtype

    if insn.atomicity is not None:
        lhs_atomicity = [
            a for a in insn.atomicity if a.var_name == assignee_var_name
        ]
        assert len(lhs_atomicity) <= 1
        if lhs_atomicity:
            lhs_atomicity, = lhs_atomicity
        else:
            lhs_atomicity = None
    else:
        lhs_atomicity = None

    from loopy.kernel.data import AtomicInit, AtomicUpdate

    lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
    if lhs_atomicity is None:
        result = codegen_state.ast_builder.emit_assignment(
            codegen_state, lhs_code,
            ecm(insn.expression,
                prec=PREC_NONE,
                type_context=rhs_type_context,
                needed_dtype=lhs_dtype))

    elif isinstance(lhs_atomicity, AtomicInit):
        raise NotImplementedError("atomic init")

    elif isinstance(lhs_atomicity, AtomicUpdate):
        codegen_state.seen_atomic_dtypes.add(lhs_dtype)
        result = codegen_state.ast_builder.generate_atomic_update(
            kernel, codegen_state, lhs_atomicity, lhs_var, insn.assignee,
            insn.expression, lhs_dtype, rhs_type_context)

    else:
        raise ValueError("unexpected lhs atomicity type: %s" %
                         type(lhs_atomicity).__name__)

    # {{{ tracing

    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
        if codegen_state.vectorization_info and is_vector:
            raise Unvectorizable("tracing does not support vectorization")

        from cgen import Statement as S  # noqa

        gs, ls = kernel.get_grid_size_upper_bounds()

        printf_format = "%s.%s[%s][%s]: %s" % (kernel.name, insn.id, ", ".join(
            "gid%d=%%d" % i for i in range(len(gs))), ", ".join(
                "lid%d=%%d" % i for i in range(len(ls))), assignee_var_name)

        printf_args = (["gid(%d)" % i for i in range(len(gs))] +
                       ["lid(%d)" % i for i in range(len(ls))])

        if assignee_indices:
            printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])
            printf_args.extend(
                ecm(i, prec=PREC_NONE, type_context="i")
                for i in assignee_indices)

        if kernel.options.trace_assignment_values:
            if lhs_dtype.numpy_dtype.kind == "i":
                printf_format += " = %d"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "f":
                printf_format += " = %g"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "c":
                printf_format += " = %g + %gj"
                printf_args.extend(["(%s).x" % lhs_code, "(%s).y" % lhs_code])

        if printf_args:
            printf_args_str = ", " + ", ".join(printf_args)
        else:
            printf_args_str = ""

        printf_insn = S("printf(\"%s\\n\"%s)" %
                        (printf_format, printf_args_str))

        from cgen import Block
        if kernel.options.trace_assignment_values:
            result = Block([result, printf_insn])
        else:
            # print first, execute later -> helps find segfaults
            result = Block([printf_insn, result])

    # }}}

    return result

예제 #25

0

파일 보기

    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map

예제 #26

0

파일 보기

파일: cuda.py 프로젝트: POETSII/PyOP2

def _cusp_solver(M, parameters):
    cache_key = lambda t, p: (t, p['ksp_type'], p['pc_type'], p['ksp_rtol'], p[
        'ksp_atol'], p['ksp_max_it'], p['ksp_gmres_restart'], p['ksp_monitor'])
    module = _cusp_cache.get(cache_key(M.ctype, parameters))
    if module:
        return module

    import codepy.toolchain
    from cgen import FunctionBody, FunctionDeclaration
    from cgen import Block, Statement, Include, Value
    from codepy.bpl import BoostPythonModule
    from codepy.cuda import CudaModule
    gcc_toolchain = codepy.toolchain.guess_toolchain()
    nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()
    if 'CUSP_HOME' in os.environ:
        nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], [])
    host_mod = BoostPythonModule()
    nvcc_mod = CudaModule(host_mod)
    nvcc_includes = [
        'thrust/device_vector.h', 'thrust/fill.h', 'cusp/csr_matrix.h',
        'cusp/krylov/cg.h', 'cusp/krylov/bicgstab.h', 'cusp/krylov/gmres.h',
        'cusp/precond/diagonal.h', 'cusp/precond/smoothed_aggregation.h',
        'cusp/precond/ainv.h', 'string'
    ]
    nvcc_mod.add_to_preamble([Include(s) for s in nvcc_includes])
    nvcc_mod.add_to_preamble([Statement('using namespace std')])

    # We're translating PETSc preconditioner types to CUSP
    diag = Statement(
        'cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)')
    ainv = Statement(
        'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)'
    )
    amg = Statement(
        'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)'
    )
    none = Statement(
        'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)'
    )
    preconditioners = {
        'diagonal': diag,
        'jacobi': diag,
        'ainv': ainv,
        'ainvcusp': ainv,
        'amg': amg,
        'hypre': amg,
        'none': none,
        None: none
    }
    try:
        precond_call = preconditioners[parameters['pc_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support preconditioner type %s" %
                           parameters['pc_type'])
    solvers = {
        'cg':
        Statement('cusp::krylov::cg(A, x, b, monitor, M)'),
        'bicgstab':
        Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'),
        'gmres':
        Statement(
            'cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)' %
            parameters)
    }
    try:
        solve_call = solvers[parameters['ksp_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support solver type %s" %
                           parameters['ksp_type'])
    monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters

    nvcc_function = FunctionBody(
        FunctionDeclaration(Value('void', '__cusp_solve'), [
            Value('CUdeviceptr', '_rowptr'),
            Value('CUdeviceptr', '_colidx'),
            Value('CUdeviceptr', '_csrdata'),
            Value('CUdeviceptr', '_b'),
            Value('CUdeviceptr', '_x'),
            Value('int', 'nrows'),
            Value('int', 'ncols'),
            Value('int', 'nnz')
        ]),
        Block([
            Statement('typedef int IndexType'),
            Statement('typedef %s ValueType' % M.ctype),
            Statement(
                'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices'
            ),
            Statement(
                'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values'
            ),
            Statement(
                'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix'
            ),
            Statement(
                'thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)'
            ),
            Statement(
                'thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)'
            ),
            Statement(
                'thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)'
            ),
            Statement('thrust::device_ptr< ValueType > d_b((ValueType *)_b)'),
            Statement('thrust::device_ptr< ValueType > d_x((ValueType *)_x)'),
            Statement('indices row_offsets(rowptr, rowptr + nrows + 1)'),
            Statement('indices column_indices(colidx, colidx + nnz)'),
            Statement('values matrix_values(csrdata, csrdata + nnz)'),
            Statement('values b(d_b, d_b + nrows)'),
            Statement('values x(d_x, d_x + ncols)'),
            Statement('thrust::fill(x.begin(), x.end(), (ValueType)0)'),
            Statement(
                'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)'
            ),
            Statement('cusp::%s_monitor< ValueType > %s' %
                      ('verbose' if parameters['ksp_monitor'] else 'default',
                       monitor)), precond_call, solve_call
        ]))

    host_mod.add_to_preamble(
        [Include('boost/python/extract.hpp'),
         Include('string')])
    host_mod.add_to_preamble([Statement('using namespace boost::python')])
    host_mod.add_to_preamble([Statement('using namespace std')])

    nvcc_mod.add_function(nvcc_function)

    host_mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value('void', 'solve'), [
                Value('object', '_rowptr'),
                Value('object', '_colidx'),
                Value('object', '_csrdata'),
                Value('object', '_b'),
                Value('object', '_x'),
                Value('object', '_nrows'),
                Value('object', '_ncols'),
                Value('object', '_nnz')
            ]),
            Block([
                Statement(
                    'CUdeviceptr rowptr = extract<CUdeviceptr>(_rowptr.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr colidx = extract<CUdeviceptr>(_colidx.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr csrdata = extract<CUdeviceptr>(_csrdata.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr b = extract<CUdeviceptr>(_b.attr("gpudata"))'
                ),
                Statement(
                    'CUdeviceptr x = extract<CUdeviceptr>(_x.attr("gpudata"))'
                ),
                Statement('int nrows = extract<int>(_nrows)'),
                Statement('int ncols = extract<int>(_ncols)'),
                Statement('int nnz = extract<int>(_nnz)'),
                Statement(
                    '__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)'
                )
            ])))

    nvcc_toolchain.cflags.append('-arch')
    nvcc_toolchain.cflags.append('sm_20')
    nvcc_toolchain.cflags.append('-O3')
    module = nvcc_mod.compile(gcc_toolchain,
                              nvcc_toolchain,
                              debug=configuration["debug"])

    _cusp_cache[cache_key(M.ctype, parameters)] = module
    return module

예제 #27

0

파일 보기

파일: opencl.py 프로젝트: ml-lab/loopy

    def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                           lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
        from pymbolic.mapper.stringifier import PREC_NONE

        # FIXME: Could detect operations, generate atomic_{add,...} when
        # appropriate.

        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                np.int32, np.int64, np.float32, np.float64
        ]:
            from cgen import Block, DoWhile, Assign
            from loopy.target.c import POD
            old_val_var = codegen_state.var_name_generator("loopy_old_val")
            new_val_var = codegen_state.var_name_generator("loopy_new_val")

            from loopy.kernel.data import TemporaryVariable, AddressSpace
            ecm = codegen_state.expression_to_code_mapper.with_assignments({
                old_val_var:
                TemporaryVariable(old_val_var, lhs_dtype),
                new_val_var:
                TemporaryVariable(new_val_var, lhs_dtype),
            })

            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)

            from pymbolic.mapper.substitutor import make_subst_func
            from pymbolic import var
            from loopy.symbolic import SubstitutionMapper

            subst = SubstitutionMapper(
                make_subst_func({lhs_expr: var(old_val_var)}))
            rhs_expr_code = ecm(subst(rhs_expr),
                                prec=PREC_NONE,
                                type_context=rhs_type_context,
                                needed_dtype=lhs_dtype)

            if lhs_dtype.numpy_dtype.itemsize == 4:
                func_name = "atomic_cmpxchg"
            elif lhs_dtype.numpy_dtype.itemsize == 8:
                func_name = "atom_cmpxchg"
            else:
                raise LoopyError("unexpected atomic size")

            cast_str = ""
            old_val = old_val_var
            new_val = new_val_var

            if lhs_dtype.numpy_dtype.kind == "f":
                if lhs_dtype.numpy_dtype == np.float32:
                    ctype = "int"
                elif lhs_dtype.numpy_dtype == np.float64:
                    ctype = "long"
                else:
                    assert False

                from loopy.kernel.data import (TemporaryVariable, ArrayArg)
                if (isinstance(lhs_var, ArrayArg)
                        and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                elif (isinstance(lhs_var, ArrayArg)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.LOCAL):
                    var_kind = "__local"
                elif (isinstance(lhs_var, TemporaryVariable)
                      and lhs_var.address_space == AddressSpace.GLOBAL):
                    var_kind = "__global"
                else:
                    raise LoopyError("unexpected kind of variable '%s' in "
                                     "atomic operation: " %
                                     (lhs_var.name, type(lhs_var).__name__))

                old_val = "*(%s *) &" % ctype + old_val
                new_val = "*(%s *) &" % ctype + new_val
                cast_str = "(%s %s *) " % (var_kind, ctype)

            return Block([
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    old_val_var),
                POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
                    new_val_var),
                DoWhile(
                    "%(func_name)s("
                    "%(cast_str)s&(%(lhs_expr)s), "
                    "%(old_val)s, "
                    "%(new_val)s"
                    ") != %(old_val)s" % {
                        "func_name": func_name,
                        "cast_str": cast_str,
                        "lhs_expr": lhs_expr_code,
                        "old_val": old_val,
                        "new_val": new_val,
                    },
                    Block([
                        Assign(old_val_var, lhs_expr_code),
                        Assign(new_val_var, rhs_expr_code),
                    ]))
            ])
        else:
            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)

예제 #28

0

파일 보기

파일: diff_shared_fld.py 프로젝트: yangzilongdmgy/hedge

        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            tex_channels = ["x", "y", "z", "w"]

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis)
                        ))

            from hedge.backends.cuda.tools import unroll
            code.extend([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"),
                Line(),
                get_load_code(),
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ])

            if float_type == numpy.float32:
                code.append(Value("float%d" % rst_channels, "dmat_entries"))

            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
                ]+[Line()])

            def unroll_body(j):
                result = [
                    Assign("field_value%d" % inl,
                        "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j))
                    for inl in range(par.inline)
                    ]

                if float_type == numpy.float32:
                    result.append(Assign("dmat_entries",
                        "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                    result.extend(
                        S("d%drst%d += dmat_entries.%s * field_value%d"
                            % (inl, axis, tex_channels[axis], inl))
                        for inl in range(par.inline)
                        for axis in dims)
                elif float_type == numpy.float64:
                    result.extend(
                        S("d%(inl)drst%(axis)d += "
                            "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                            "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                            "* field_value%(inl)d" % {
                            "inl": inl,
                            "axis": axis,
                            "j": j
                            })
                        for inl in range(par.inline)
                        for axis in dims)
                else:
                    assert False

                return result

            code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body,
                    total_number=plan.preimage_dofs_per_el)
                    +[store_code])))

            return code

예제 #29

0

파일 보기

def generate_assignment_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper

    from loopy.expression import VectorizabilityChecker

    # {{{ vectorization handling

    if codegen_state.vectorization_info:
        if insn.atomicity:
            raise Unvectorizable("atomic operation")

        vinfo = codegen_state.vectorization_info
        vcheck = VectorizabilityChecker(kernel, vinfo.iname, vinfo.length)
        lhs_is_vector = vcheck(insn.assignee)
        rhs_is_vector = vcheck(insn.expression)

        if not lhs_is_vector and rhs_is_vector:
            raise Unvectorizable("LHS is scalar, RHS is vector, cannot assign")

        is_vector = lhs_is_vector

        del lhs_is_vector
        del rhs_is_vector

    # }}}

    from pymbolic.primitives import Variable, Subscript, Lookup
    from loopy.symbolic import LinearSubscript

    lhs = insn.assignee
    if isinstance(lhs, Lookup):
        lhs = lhs.aggregate

    if isinstance(lhs, Variable):
        assignee_var_name = lhs.name
        assignee_indices = ()

    elif isinstance(lhs, Subscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = lhs.index_tuple

    elif isinstance(lhs, LinearSubscript):
        assignee_var_name = lhs.aggregate.name
        assignee_indices = (lhs.index, )

    else:
        raise RuntimeError("invalid lvalue '%s'" % lhs)

    del lhs

    result = codegen_state.ast_builder.emit_assignment(codegen_state, insn)

    # {{{ tracing

    lhs_dtype = codegen_state.kernel.get_var_descriptor(
        assignee_var_name).dtype

    if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
        if codegen_state.vectorization_info and is_vector:
            raise Unvectorizable("tracing does not support vectorization")

        from pymbolic.mapper.stringifier import PREC_NONE
        lhs_code = codegen_state.expression_to_code_mapper(
            insn.assignee, PREC_NONE)

        from cgen import Statement as S  # noqa

        gs, ls = kernel.get_grid_size_upper_bounds()

        printf_format = "{}.{}[{}][{}]: {}".format(
            kernel.name, insn.id, ", ".join("gid%d=%%d" % i
                                            for i in range(len(gs))),
            ", ".join("lid%d=%%d" % i for i in range(len(ls))),
            assignee_var_name)

        printf_args = (["gid(%d)" % i for i in range(len(gs))] +
                       ["lid(%d)" % i for i in range(len(ls))])

        if assignee_indices:
            printf_format += "[%s]" % ",".join(len(assignee_indices) * ["%d"])
            printf_args.extend(
                ecm(i, prec=PREC_NONE, type_context="i")
                for i in assignee_indices)

        if kernel.options.trace_assignment_values:
            if lhs_dtype.numpy_dtype.kind == "i":
                printf_format += " = %d"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "f":
                printf_format += " = %g"
                printf_args.append(lhs_code)
            elif lhs_dtype.numpy_dtype.kind == "c":
                printf_format += " = %g + %gj"
                printf_args.extend(["(%s).x" % lhs_code, "(%s).y" % lhs_code])

        if printf_args:
            printf_args_str = ", " + ", ".join(str(v) for v in printf_args)
        else:
            printf_args_str = ""

        printf_insn = S('printf("{}\\n"{})'.format(printf_format,
                                                   printf_args_str))

        from cgen import Block
        if kernel.options.trace_assignment_values:
            result = Block([result, printf_insn])
        else:
            # print first, execute later -> helps find segfaults
            result = Block([printf_insn, result])

    # }}}

    return result

예제 #30

0

파일 보기

    FunctionBody(
        CLKernel(
            CLRequiredWorkGroupSize(
                (local_size, ),
                FunctionDeclaration(Value("void", "add"),
                                    arg_decls=[
                                        CLGlobal(
                                            Pointer(Const(POD(dtype, name))))
                                        for name in ["tgt", "op1", "op2"]
                                    ]))),
        Block([
            Initializer(
                POD(numpy.int32,
                    "idx"), "get_local_id(0) + %d * get_group_id(0)" %
                (local_size * thread_strides))
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * local_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * local_size, o * local_size))
            for o in range(thread_strides)
        ]))
])

knl = cl.Program(ctx, str(mod)).build().add

knl(queue, (local_size * macroblock_count, ), (local_size, ), c_buf, a_buf,
    b_buf)

c = numpy.empty_like(a)
cl.enqueue_copy(queue, c, c_buf).wait()