Esempio n. 1
0
    def _arg_names_and_decls(self, codegen_state):
        implemented_data_info = codegen_state.implemented_data_info
        arg_names = [iai.name for iai in implemented_data_info]

        arg_decls = [
            self.idi_to_cgen_declarator(codegen_state.kernel, idi)
            for idi in implemented_data_info
        ]

        # {{{ occa compatibility hackery

        from cgen import Value
        if self.target.occa_mode:
            from cgen import ArrayOf, Const
            from cgen.ispc import ISPCUniform

            arg_decls = [
                Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))),
                Const(ISPCUniform(Value("int", "o1"))),
                Const(ISPCUniform(Value("int", "o2"))),
                Const(ISPCUniform(Value("int", "o3"))),
            ] + arg_decls
            arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names

        # }}}

        return arg_names, arg_decls
Esempio n. 2
0
 def _C_typedecl(self):
     fields = []
     for i, j in self.pfields:
         if i == self._field_flag:
             fields.append(Initializer(Value('volatile %s' % ctypes_to_cstr(j), i), 1))
         else:
             fields.append(Value(ctypes_to_cstr(j), i))
     return Struct(self.pname, fields)
Esempio n. 3
0
    def __init__(self, dims, mode, name):
        if mode == "r":
            spec = "__read_only"
        elif mode == "w":
            spec = "__write_only"
        else:
            raise ValueError("mode must be one of 'r' or 'w'")

        Value.__init__(self, "%s image%dd_t" % (spec, dims), name)
Esempio n. 4
0
    def __init__(self, dims, mode, name):
        if mode == "r":
            spec = "__read_only"
        elif mode == "w":
            spec = "__write_only"
        else:
            raise ValueError("mode must be one of 'r' or 'w'")

        Value.__init__(self, "%s image%dd_t" % (spec, dims), name)
Esempio n. 5
0
def test_cgen():
    s = Struct(
        "yuck",
        [
            POD(
                np.float32,
                "h",
            ),
            POD(np.float32, "order"),
            POD(np.float32, "face_jacobian"),
            ArrayOf(POD(np.float32, "normal"), 17),
            POD(np.uint16, "a_base"),
            POD(np.uint16, "b_base"),
            #CudaGlobal(POD(np.uint8, "a_ilist_number")),
            POD(np.uint8, "b_ilist_number"),
            POD(np.uint8, "bdry_flux_number"),  # 0 if not on boundary
            POD(np.uint8, "reserved"),
            POD(np.uint32, "b_global_base"),
        ])
    f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [
        POD(np.uint8, "reserved"),
        POD(np.uint32, "b_global_base"),
    ])
    f_body = FunctionBody(
        f_decl,
        Block([
            POD(np.uint32, "i"),
            For(
                "i = 0",
                "i < 17",
                "++i",
                If(
                    "a > b",
                    Assign("a", "b"),
                    Block([
                        Assign("a", "b-1"),
                        #Break(),
                    ])),
            ),
            #BlankLine(),
            Comment("all done"),
        ]))
    t_decl = Template(
        'typename T',
        FunctionDeclaration(
            Value('CUdeviceptr', 'scan'),
            [Value('CUdeviceptr', 'inputPtr'),
             Value('int', 'length')]))

    print(s)
    print(f_body)
    print(t_decl)
Esempio n. 6
0
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement  # noqa: N806

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value(
                "numpy_array<{} >::iterator".format(dtype_to_ctype(
                    varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()")
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), f"args.{sarg.name}")
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)]))
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name), [
                POD(numpy.uintp, "codepy_length"),
                Value("arg_struct", "args")
            ]), body))

    return mod
Esempio n. 7
0
    def get_function_declaration(self, codegen_state, codegen_result,
                                 schedule_index):
        name = codegen_result.current_program(codegen_state).name

        from cgen import (FunctionDeclaration, Value)
        from cgen.ispc import ISPCExport, ISPCTask

        arg_names, arg_decls = self._arg_names_and_decls(codegen_state)

        if codegen_state.is_generating_device_code:
            return ISPCTask(FunctionDeclaration(Value("void", name),
                                                arg_decls))
        else:
            return ISPCExport(
                FunctionDeclaration(Value("void", name), arg_decls))
Esempio n. 8
0
    def make_codepy_module(self, toolchain, dtype):
        from codepy.libraries import add_codepy
        toolchain = toolchain.copy()
        add_codepy(toolchain)

        from cgen import (Value, Include, Statement,
                Typedef, FunctionBody, FunctionDeclaration, Block, Const,
                Line, POD, Initializer, CustomLoop)
        S = Statement

        from codepy.bpl import BoostPythonModule
        mod = BoostPythonModule()

        mod.add_to_preamble([
            Include("vector"),
            Include("algorithm"),
            Include("hedge/base.hpp"),
            Include("hedge/volume_operators.hpp"),
            Include("boost/foreach.hpp"),
            Include("boost/numeric/ublas/io.hpp"),
            ]+self.get_cpu_extra_includes())

        mod.add_to_module([
            S("namespace ublas = boost::numeric::ublas"),
            S("using namespace hedge"),
            S("using namespace pyublas"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            ])

        mod.add_function(FunctionBody(
            FunctionDeclaration(Value("void", "process_elements"), [
                Const(Value("uniform_element_ranges", "ers")),
                Const(Value("numpy_vector<value_type>", "field")),
                Value("numpy_vector<value_type>", "result"),
                ]+self.get_cpu_extra_parameter_declarators()),
            Block([
                Typedef(Value("numpy_vector<value_type>::iterator",
                    "it_type")),
                Typedef(Value("numpy_vector<value_type>::const_iterator",
                    "cit_type")),
                Line(),
                Initializer(Value("it_type", "result_it"),
                    "result.begin()"),
                Initializer(Value("cit_type", "field_it"),
                    "field.begin()"),
                Line() ]+self.get_cpu_extra_preamble()+[ Line(),
                CustomLoop(
                    "BOOST_FOREACH(const element_range er, ers)",
                    Block(self.get_cpu_per_element_code())
                    )
                ])))

        #print mod.generate()
        #toolchain = toolchain.copy()
        #toolchain.enable_debugging
        return mod.compile(toolchain)
Esempio n. 9
0
 def cdef(self):
     """
     Return a :class:`cgen.Struct` representing the profiler data structure in C
     (a ``struct``).
     """
     return Struct('profiler',
                   [Value('double', i.name) for i in self._sections])
Esempio n. 10
0
 def _C_typedecl(self):
     if self._is_composite_dtype:
         return Struct(
             self.pname,
             [Value(ctypes_to_cstr(j), i) for i, j in self.pfields])
     else:
         return None
Esempio n. 11
0
    def add_struct(self,
            struct, py_name=None, py_member_name_transform=lambda x: x,
            by_value_members=None):
        if by_value_members is None:
            by_value_members = set()

        from cgen import Block, Line, Statement, Typedef, Value

        if py_name is None:
            py_name = struct.tpname

        self.mod_body.append(struct)

        member_defs = []
        for f in struct.fields:
            py_f_name = py_member_name_transform(f.name)
            tp_lines, declarator = f.get_decl_pair()
            if f.name in by_value_members or tp_lines[0].startswith("numpy_"):
                member_defs.append(
                        ".def(pyublas::by_value_rw_member"
                        f'("{py_f_name}", &cl::{f.name}))')
            else:
                member_defs.append(
                        f'.def_readwrite("{py_f_name}", &cl::{f.name})'
                        )

        self.init_body.append(
            Block([
                Typedef(Value(struct.tpname, "cl")),
                Line(),
                Statement(
                    'boost::python::class_<cl>("{}"){}'.format(
                        py_name, "".join(member_defs))),
                ]))
Esempio n. 12
0
    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While,
                Comment, Block, For, Line, Pointer)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "el_modes"), "field_it+er.first"),

                Line(),
                Comment("zero out reduced_modes"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < max_degree+1",
                    "++mode_idx",
                    S("reduced_modes[mode_idx] = 0")),

                Line(),
                Comment("gather modes by degree"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("reduced_modes[mode_degrees_iterator[mode_idx]]"
                        " += el_modes[mode_idx]")),

                Line(),
                Comment("perform skyline procedure"),
                Initializer(Pointer(Value("value_type", "start")),
                    "reduced_modes.get()"),
                Initializer(Pointer(Value("value_type", "end")),
                    "start+max_degree+1"),
                Initializer(Value("value_type", "cur_max"),
                    "std::max(*(end-1), *(end-2))"),

                Line(),
                While("end != start", Block([
                    S("--end"),
                    S("*end = std::max(cur_max, *end)"),
                    ])),

                Line(),
                Comment("scatter modes by degree"),
                Initializer(Value("it_type", "tgt_base"), "result_it+er.first"),
                For("npy_uint32 mode_idx = 0",
                    "mode_idx < mode_count",
                    "++mode_idx",
                    S("tgt_base[mode_idx] = "
                        "reduced_modes[mode_degrees_iterator[mode_idx]]")),
                ]
Esempio n. 13
0
    def get_function_declaration(self, codegen_state, codegen_result,
                                 schedule_index):
        from cgen import FunctionDeclaration, Value

        name = codegen_result.current_program(codegen_state).name
        if self.target.fortran_abi:
            name += "_"

        if codegen_state.is_entrypoint:
            name = Value("void", name)
        else:
            name = Value("static void", name)
        return FunctionDeclarationWrapper(
            FunctionDeclaration(name, [
                self.idi_to_cgen_declarator(codegen_state.kernel, idi)
                for idi in codegen_state.implemented_data_info
            ]))
Esempio n. 14
0
        def make_it(name, is_const=True, tpname="value_type"):
            if is_const:
                const = "const_"
            else:
                const = ""

            return Initializer(
                Value("numpy_array<%s>::%siterator" % (tpname, const),
                      name + "_it"), "%s.begin()" % name)
Esempio n. 15
0
 def _C_neighbours(self):
     """A :class:`ctypes.Struct` to access the neighborhood of a given rank."""
     entries = list(product(self.dimensions, [LEFT, RIGHT]))
     fields = [('%s%s' % (d, i), c_int) for d, i in entries]
     obj = CompositeObject('nb', 'neighbours', Structure, fields)
     for d, i in entries:
         setattr(obj.value._obj, '%s%s' % (d, i), self.neighbours[d][i])
     cdef = Struct('neighbours', [Value('int', i) for i, _ in fields])
     CNeighbours = namedtuple('CNeighbours', 'ctype cdef obj')
     return CNeighbours(obj.dtype, cdef, obj)
Esempio n. 16
0
 def get_cpu_extra_preamble(self):
     from cgen import Initializer, Value, POD, Statement
     return [
             Initializer(Value("numpy_array<npy_uint32>::const_iterator",
                 "mode_degrees_iterator"),
                 "mode_degrees.begin()"),
             Initializer(POD(numpy.uint32, "mode_count"),
                 "mode_degrees.size()"),
             Statement("boost::scoped_array<value_type> reduced_modes"
                 "(new value_type[max_degree+1])"),
             ]
Esempio n. 17
0
    def generate_top_of_body(self, codegen_state):
        from loopy.kernel.data import ImageArg
        if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args):
            from cgen import Value, Const, Initializer
            return [
                    Initializer(Const(Value("sampler_t", "loopy_sampler")),
                        "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP "
                        "| CLK_FILTER_NEAREST")
                    ]

        return []
Esempio n. 18
0
    def get_cuda_extra_preamble(self, discr, dtype, eg):
        from cgen import ArrayOf, Value, Initializer
        from cgen.cuda import CudaConstant

        ldis = eg.local_discretization
        mode_degrees = [sum(mode_indices) for mode_indices in
                ldis.generate_mode_identifiers()]

        return [Initializer(CudaConstant(
            ArrayOf(Value("unsigned", "mode_degrees"))),
            "{%s}" % ", ".join(str(i) for i in mode_degrees))
            ]
Esempio n. 19
0
    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[loc_fof_base+i]" % flux_idx,
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                ]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result
Esempio n. 20
0
def make_greet_mod(greeting):
    from cgen import FunctionBody, FunctionDeclaration, Block, \
            Const, Pointer, Value, Statement
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule()

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []),
            Block([Statement('return "%s"' % greeting)])))

    from codepy.toolchain import guess_toolchain
    return mod.compile(guess_toolchain(), wait_on_error=True)
Esempio n. 21
0
    def expose_vector_type(self, name, py_name=None):
        self.add_codepy_include()

        if py_name is None:
            py_name = name

        from cgen import (Block, Typedef, Line, Statement, Value)

        self.init_body.append(
            Block([
                Typedef(Value(name, "cl")),
                Line(),
                Statement(
                    f'boost::python::class_<cl>("{py_name}")'
                    ".def(codepy::no_compare_indexing_suite<cl>())"),
                ]))
Esempio n. 22
0
    def gen_flux_code():
        f2cm = FluxToCodeMapper()

        result = [
                Assign("fof%d_it[%s_fof_base+%s]" % (flux_idx, where, tgt_idx),
                    "uncomplex_type(fp.int_side.face_jacobian) * " +
                    flux_to_code(f2cm, is_flipped, flux_idx, fvi, flux.op.flux, PREC_PRODUCT))
                for flux_idx, flux in enumerate(fluxes)
                for where, is_flipped, tgt_idx in [
                    ("int_side", False, "i"),
                    ("ext_side", True, "ext_native_write_map[i]")
                    ]]

        return [
            Initializer(Value("value_type", cse_name), cse_str)
            for cse_name, cse_str in f2cm.cse_name_list] + result
 def _C_typedecl(self):
     # Overriding for better code readability
     #
     # Struct neighborhood                 Struct neighborhood
     # {                                   {
     #   int ll;                             int ll, lc, lr;
     #   int lc;                 VS          ...
     #   int lr;                             ...
     #   ...                                 ...
     # }                                   }
     #
     # With this override, we generate the one on the right
     groups = [list(g) for k, g in groupby(self.pfields, key=lambda x: x[0][0])]
     groups = [(j[0], i) for i, j in [zip(*g) for g in groups]]
     return Struct(self.pname, [Value(ctypes_to_cstr(i), ', '.join(j))
                                for i, j in groups])
Esempio n. 24
0
        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code
Esempio n. 25
0
    def get_cpu_per_element_code(self):
        from cgen import (Value, Statement, Initializer, While, Block)
        S = Statement
        return [
                # assumes there is more than one coefficient
                Initializer(Value("cit_type", "start"), "field_it+er.first"),
                Initializer(Value("cit_type", "end"), "field_it+er.second"),
                Initializer(Value("it_type", "tgt"), "result_it+er.first"),

                Initializer(Value("cit_type", "cur"), "start"),
                While("cur != end",
                    Block([
                        Initializer(Value("cit_type", "avg_start"),
                            "std::max(start, cur-1)"),
                        Initializer(Value("cit_type", "avg_end"),
                            "std::min(end, cur+2)"),

                        S("*tgt++ = std::accumulate(avg_start, avg_end, value_type(0))"
                            "/std::distance(avg_start, avg_end)"),
                        S("++cur"),
                        ])
                    )
                ]
Esempio n. 26
0
 def _C_typedecl(self):
     return Struct(self.pname, [Value(ctypes_to_cstr(j), i) for i, j in self.pfields])
Esempio n. 27
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        from loopy.kernel.data import temp_var_scope

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line

        for tv in sorted(six.itervalues(kernel.temporary_variables),
                         key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target,
                                     index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.scope != temp_var_scope.GLOBAL:
                        decl = self.wrap_temporary_decl(
                            self.get_temporary_decl(kernel, schedule_index, tv,
                                                    idi), tv.scope)

                        if tv.initializer is not None:
                            decl = Initializer(
                                decl,
                                generate_array_literal(codegen_state, tv,
                                                       tv.initializer))

                        temp_decls.append(decl)

            else:
                assert tv.initializer is None

                offset = 0
                base_storage_sizes.setdefault(tv.base_storage,
                                              []).append(tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage,
                                                 []).append(tv.scope)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage,
                                                       []).append(align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope)
                    temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.scope)

                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to the
                    # other. That counts, right?

                    cast_decl = _ConstRestrictPointer(cast_decl)
                    temp_var_decl = _ConstRestrictPointer(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                        temp_var_decl, "(%s %s) (%s + %s)" %
                        (" ".join(cast_tp), cast_d, tv.base_storage, offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (idi.dtype.itemsize *
                               product(si for si in idi.shape))

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, single_valued(base_storage_to_scope[bs_name]))
            bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Esempio n. 28
0
a = numpy.random.randn(total_size).astype(dtype)
b = numpy.random.randn(total_size).astype(dtype)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)
c_gpu = cuda.mem_alloc(a.nbytes)

from cgen import FunctionBody, \
        FunctionDeclaration, Typedef, POD, Value, \
        Pointer, Module, Block, Initializer, Assign
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
                                ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" %
                (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])
Esempio n. 29
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        from loopy.kernel.data import AddressSpace

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
        # Getting the temporary variables that are needed for the current
        # sub-kernel.
        from loopy.schedule.tools import (
                temporaries_read_in_subkernel,
                temporaries_written_in_subkernel)
        subkernel = kernel.schedule[schedule_index].kernel_name
        sub_knl_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) |
                temporaries_written_in_subkernel(kernel, subkernel))

        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.address_space != AddressSpace.GLOBAL and (
                            tv.name in sub_knl_temps):
                        decl = self.wrap_temporary_decl(
                                self.get_temporary_decl(
                                    codegen_state, schedule_index, tv, idi),
                                tv.address_space)

                        if tv.initializer is not None:
                            assert tv.read_only
                            decl = Initializer(decl, generate_array_literal(
                                codegen_state, tv, tv.initializer))

                        temp_decls.append(decl)

            else:
                assert tv.initializer is None

                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage, []).append(
                        tv.address_space)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.address_space)

                    if tv._base_storage_access_may_be_aliasing:
                        ptrtype = _ConstPointer
                    else:
                        # The 'restrict' part of this is a complete lie--of course
                        # all these temporaries are aliased. But we're promising to
                        # not use them to shovel data from one representation to the
                        # other. That counts, right?
                        ptrtype = _ConstRestrictPointer

                    cast_decl = ptrtype(cast_decl)
                    temp_var_decl = ptrtype(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        ecm = self.get_expression_to_code_mapper(codegen_state)

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

            # FIXME: Could try to use isl knowledge to simplify max.
            if all(isinstance(bs, int) for bs in bs_sizes):
                bs_size_max = max(bs_sizes)
            else:
                bs_size_max = p.Max(tuple(bs_sizes))

            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Esempio n. 30
0
    def make_cuda_kernel(self, discr, dtype, eg):
        given = discr.given
        ldis = eg.local_discretization

        microblocks_per_block = 1

        from cgen.cuda import CudaGlobal

        from cgen import (Module, Value, Include,
                Typedef, FunctionBody, FunctionDeclaration, Const,
                Line, POD, LiteralBlock,
                Define, Pointer)

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Typedef(POD(dtype, "value_type")),
            Line(),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("VERTICES_PER_EL", ldis.vertex_count()),
            Define("ELS_PER_MB", given.microblock.elements),
            Define("MBS_PER_BLOCK", microblocks_per_block),
            Line(),
            Define("DOF_IN_MB_IDX", "threadIdx.x"),
            Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"),
            Define("MB_IN_BLOCK_IDX", "threadIdx.y"),
            Define("BLOCK_IDX", "blockIdx.x"),
            Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"),
            Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")]
            + self.get_cuda_extra_preamble(discr, dtype, eg)
            + [FunctionBody(
            CudaGlobal(FunctionDeclaration(
                    Value("void", "elwise_kernel"), [
                    Pointer(Const(POD(dtype, "field"))),
                    Pointer(POD(dtype, "result")),
                    POD(numpy.uint32, "mb_count"),
                    ])),
                LiteralBlock("""
                int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL;

                if (MB_NUMBER >= mb_count)
                  return;

                int idx =  MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX;
                int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX +
                    (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL;
                int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL;

                __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1];
                int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX;
                BLOCK_DATA[idx_in_block] = field[idx];

                __syncthreads();

                %s

                result[idx] = node_result;
                """ % self.get_cuda_code(discr, dtype, eg)))
                ])


        if False:
            for i, l in enumerate(str(cmod).split("\n")):
                print i+1, l
            raw_input()

        from pycuda.compiler import SourceModule
        mod = SourceModule(
                cmod,
                keep="cuda_keep_kernels" in discr.debug,
                )
        func = mod.get_function("elwise_kernel")
        func.prepare(
            "PPI", block=(
                given.microblock.aligned_floats,
                microblocks_per_block, 1))

        mb_count = len(discr.blocks) * discr.given.microblocks_per_block
        grid_dim = (mb_count + microblocks_per_block - 1) \
                // microblocks_per_block

        from pytools import Record
        class KernelInfo(Record):
            pass

        return KernelInfo(
                func=func,
                grid_dim=grid_dim,
                mb_count=mb_count)
Esempio n. 31
0
 def get_cpu_extra_parameter_declarators(self):
     from cgen import Value, POD
     return [
             Value("numpy_array<npy_uint32>", "mode_degrees"),
             POD(numpy.uint32, "max_degree")]