Code example #1
File: visitors.py  Project: xiaocenxiaocen/devito
 def _args_decl(self, args):
     """Convert an iterable of :class:`Argument` into cgen format."""
     ret = []
     for i in args:
         if i.is_ScalarArgument:
             ret.append(
                 c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name))
         elif i.is_TensorArgument:
             ret.append(
                 c.Value(c.dtype_to_ctype(i.dtype),
                         '*restrict %s_vec' % i.name))
         else:
             ret.append(c.Value('void', '*_%s' % i.name))
     return ret
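For orientation, here is a minimal, self-contained sketch (not taken from devito; the dtype values and variable names are invented) of what cgen's dtype_to_ctype returns and how the declarations above are assembled:

import numpy as np
import cgen as c

# dtype_to_ctype maps a numpy dtype to the corresponding C type name,
# e.g. float64 -> "double", int32 -> "int".
print(c.dtype_to_ctype(np.float64))          # double
print(c.dtype_to_ctype(np.dtype('int32')))   # int

# A scalar argument as above renders as "const double a;"
scalar = c.Value('const %s' % c.dtype_to_ctype(np.float64), 'a')
# A tensor argument as above renders as "float *restrict u_vec;"
tensor = c.Value(c.dtype_to_ctype(np.float32), '*restrict u_vec')
print(scalar)
print(tensor)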
Code example #2
File: nodes.py  Project: ponykid/SNIST
 def __repr__(self):
     parameters = ",".join([
         'void*' if i.is_Object else c.dtype_to_ctype(i.dtype)
         for i in self.parameters
     ])
     return "%s[%s]<%s; %s>" % (self.__class__.__name__, self.name,
                                self.retval, parameters)
Code example #3
File: tools.py  Projects: yangzilongdmgy/hedge, paulcazeaux/hedge
def get_load_code(dest, base, bytes, word_type=numpy.uint32,
        descr=None):
    from cgen import (
            Pointer, POD,
            Comment, Block, Line,
            Constant, For, Statement)

    from cgen import dtype_to_ctype
    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr*sizeof(int) < (%s)" % bytes,
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code
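Roughly, this emits a block that reinterprets base as an array of word_type and copies it word by word into dest across the coalescing threads. The sketch below shows how the same cgen pieces compose and what the generated C looks like; src, dst and nbytes are hypothetical placeholders for base, dest and bytes, Initializer stands in for the Constant node used above, and the loop condition uses sizeof of the copy type where the original hard-codes sizeof(int).

import numpy
import cgen

copy_dtype = numpy.dtype(numpy.uint32)
copy_t = cgen.dtype_to_ctype(copy_dtype)   # typically "unsigned int"

block = cgen.Block([
    cgen.Initializer(
        cgen.Pointer(cgen.POD(copy_dtype, "load_base")),
        "(%s *) (src)" % copy_t),
    cgen.For(
        "unsigned word_nr = THREAD_NUM",
        "word_nr*sizeof(%s) < (nbytes)" % copy_t,
        "word_nr += COALESCING_THREAD_COUNT",
        cgen.Statement("((%s *) (dst))[word_nr] = load_base[word_nr]" % copy_t)),
])
print(block)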
Code example #4
 def __repr__(self):
     parameters = ",".join([
         'void*' if i.is_Object else c.dtype_to_ctype(i.dtype)
         for i in self.parameters
     ])
     body = "\n\t".join([str(s) for s in self.body])
     return "Function[%s]<%s; %s>::\n\t%s" % (self.name, self.retval,
                                              parameters, body)
Code example #5
 def _args_decl(self, args):
     """Generate cgen declarations from an iterable of symbols and expressions."""
     ret = []
     for i in args:
         if i.is_Object:
             ret.append(c.Value('void', '*_%s' % i.name))
         elif i.is_Scalar:
             ret.append(
                 c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name))
         elif i.is_Tensor:
             ret.append(
                 c.Value(c.dtype_to_ctype(i.dtype),
                         '*restrict %s_vec' % i.name))
         elif i.is_Dimension:
             ret.append(
                 c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name))
         else:
             ret.append(c.Value('void', '*_%s' % i.name))
     return ret
Code example #6
    def push_heap(self, obj):
        """
        Generate cgen objects to declare, allocate memory, and free memory for
        ``obj``, of type :class:`Array`.
        """
        if obj in self.heap:
            return

        decl = "(*%s)%s" % (obj.name, "".join("[%s]" % i for i in obj.symbolic_shape[1:]))
        decl = c.Value(c.dtype_to_ctype(obj.dtype), decl)

        shape = "".join("[%s]" % i for i in obj.symbolic_shape)
        alloc = "posix_memalign((void**)&%s, 64, sizeof(%s%s))"
        alloc = alloc % (obj.name, c.dtype_to_ctype(obj.dtype), shape)
        alloc = c.Statement(alloc)

        free = c.Statement('free(%s)' % obj.name)

        self.heap[obj] = (decl, alloc, free)
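To make the generated pieces concrete: for a hypothetical Array named u with dtype float32 and symbolic shape (nx, ny) (all names invented for illustration), the three cgen objects built above render roughly as follows.

import numpy
import cgen as c

# Stand-ins for obj.name, obj.dtype and obj.symbolic_shape.
name, ctype, shape = "u", c.dtype_to_ctype(numpy.float32), ["nx", "ny"]

decl = c.Value(ctype, "(*%s)%s" % (name, "".join("[%s]" % i for i in shape[1:])))
alloc = c.Statement("posix_memalign((void**)&%s, 64, sizeof(%s%s))"
                    % (name, ctype, "".join("[%s]" % i for i in shape)))
free = c.Statement("free(%s)" % name)

print(decl)   # float (*u)[ny];
print(alloc)  # posix_memalign((void**)&u, 64, sizeof(float[nx][ny]));
print(free)   # free(u);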
Code example #7
File: vector_expr.py  Project: yangzilongdmgy/hedge
    def get_kernel(self, vector_dtypes, scalar_dtypes):
        from pymbolic.mapper.stringifier import PREC_NONE
        from pymbolic.mapper.c_code import CCodeMapper

        elwise = self.elementwise_mod

        result_dtype = self.result_dtype_getter(
                dict(zip(self.vector_deps, vector_dtypes)),
                dict(zip(self.scalar_deps, scalar_dtypes)),
                self.constant_dtypes)

        args = [elwise.VectorArg(result_dtype, vei.name)
                for vei in self.vec_expr_info_list
                if not vei.do_not_return]

        def real_const_mapper(num):
            # Make sure we do not generate integers or doubles by accident.
            # Oh, C and your broken division semantics.

            r = repr(num)
            if "." not in r or result_dtype == numpy.float32:
                from pytools import to_uncomplex_dtype
                from cgen import dtype_to_ctype
                return "%s(%s)" % (dtype_to_ctype(
                        to_uncomplex_dtype(result_dtype)), r)
            else:
                return r

        code_mapper = CCodeMapper(constant_mapper=real_const_mapper)

        code_lines = []
        for vei in self.vec_expr_info_list:
            expr_code = code_mapper(vei.expr, PREC_NONE)
            if vei.do_not_return:
                from cgen import dtype_to_ctype
                code_lines.append(
                        "%s %s = %s;" % (
                            dtype_to_ctype(result_dtype), vei.name, expr_code))
            else:
                code_lines.append(
                        "%s[i] = %s;" % (vei.name, expr_code))

        # common subexpressions have been taken care of by the compiler
        assert not code_mapper.cse_names

        args.extend(
                elwise.VectorArg(dtype, name)
                for dtype, name in zip(vector_dtypes, self.vector_dep_names))
        args.extend(
                elwise.ScalarArg(dtype, name)
                for dtype, name in zip(scalar_dtypes, self.scalar_dep_names))

        return KernelRecord(
                kernel=self.make_kernel_internal(args, "\n".join(code_lines)),
                result_dtype=result_dtype)
Code example #8
File: elementwise.py  Project: inducer/codepy
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
        ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
        ])

    body = Block([
        Initializer(
            Value("numpy_array<%s >::iterator"
                % dtype_to_ctype(varg.dtype),
                varg.name),
            "args.%s_ary.begin()" % varg.name)
        for varg in arguments if isinstance(varg, VectorArg)]
        +[Initializer(
            sarg.declarator(), "args." + sarg.name)
        for sarg in arguments if isinstance(sarg, ScalarArg)]
        )

    body.extend([
        Line(),
        For("unsigned i = 0",
            "i < codepy_length",
            "++i",
            Block([S(operation)])
            )
        ])

    arg_struct = Struct("arg_struct", 
            [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
            FunctionBody(
                FunctionDeclaration(
                    Value("void", name),
                    [POD(numpy.uintp, "codepy_length"),
                        Value("arg_struct", "args")]),
                body))

    return mod
Code example #9
 def visit_ArrayCast(self, o):
     """
     Build cgen type casts for an :class:`AbstractFunction`.
     """
     f = o.function
     align = "__attribute__((aligned(64)))"
     shape = ''.join(["[%s]" % ccode(j) for j in f.symbolic_shape[1:]])
     lvalue = c.POD(f.dtype, '(*restrict %s)%s %s' % (f.name, shape, align))
     rvalue = '(%s (*)%s) %s' % (c.dtype_to_ctype(
         f.dtype), shape, '%s_vec' % f.name)
     return c.Initializer(lvalue, rvalue)
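The cast built above pairs a multidimensional array-pointer lvalue with a cast of the flat *_vec pointer. A minimal sketch with an invented function name u, dtype float32 and symbolic shape (nt, nx, ny):

import numpy
import cgen as c

name, dtype, shape = "u", numpy.float32, "[nx][ny]"  # shape drops the first dimension
align = "__attribute__((aligned(64)))"

lvalue = c.POD(dtype, '(*restrict %s)%s %s' % (name, shape, align))
rvalue = '(%s (*)%s) %s_vec' % (c.dtype_to_ctype(dtype), shape, name)
print(c.Initializer(lvalue, rvalue))
# float (*restrict u)[nx][ny] __attribute__((aligned(64))) = (float (*)[nx][ny]) u_vec;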
Code example #10
File: vector_expr.py  Project: allansnielsen/hedge
        def real_const_mapper(num):
            # Make sure we do not generate integers or doubles by accident.
            # Oh, C and your broken division semantics.

            r = repr(num)
            if "." not in r or result_dtype == numpy.float32:
                from pytools import to_uncomplex_dtype
                from cgen import dtype_to_ctype
                return "%s(%s)" % (dtype_to_ctype(
                    to_uncomplex_dtype(result_dtype)), r)
            else:
                return r
Code example #11
File: vector_expr.py  Project: haowu80s/hedge
        def real_const_mapper(num):
            # Make sure we do not generate integers or doubles by accident.
            # Oh, C and your broken division semantics.

            r = repr(num)
            if "." not in r or result_dtype == numpy.float32:
                from pytools import to_uncomplex_dtype
                from cgen import dtype_to_ctype
                return "%s(%s)" % (dtype_to_ctype(
                        to_uncomplex_dtype(result_dtype)), r)
            else:
                return r
Code example #12
File: cgen_utils.py  Project: kwinkunks/devito
    def push_stack(self, scope, obj):
        """
        Generate a cgen statement that allocates ``obj`` on the stack.
        """
        dtype = c.dtype_to_ctype(obj.dtype)
        shape = "".join("[%d]" % j for j in obj.shape)
        alignment = "__attribute__((aligned(64)))"

        item = c.POD(dtype, "%s%s %s" % (obj.name, shape, alignment))
        handle = self.stack.setdefault(scope, [])
        if item not in handle:
            handle.append(item)
Code example #13
File: nodes.py  Project: kwinkunks/devito
 def _cparameters(self):
     """Generate arguments signature."""
     cparameters = []
     for v in self.parameters:
         if isinstance(v, Dimension):
             cparameters.append(v.decl)
         elif v.is_ScalarFunction:
             cparameters.append(c.Value('const int', v.name))
         else:
             cparameters.append(
                 c.Value(c.dtype_to_ctype(v.dtype),
                         '*restrict %s_vec' % v.name))
     return cparameters
Code example #14
File: _altmin_gpu.py  Project: dsuess/pycsalgs
    def left_contractions(self, pos):
        """Generates the code computing the left-contraction part of the
        optimization matrix for site nr. `pos`

        :param pos: The local tensor to copy (should be `< len(X)`)
        :returns: List containing cgen Statements

        """
        if pos == 0:
            return [c.Statement('left_c[0] = 1')]

        result = self.copy_ltens_to_share(0)
        result += [c.Line()]

        contract_ltens_with_a = 'dgemv(blasNoTranspose, x_shared, current_row + {offset:d}, {dim_out:d}, {dim_in:d}, {target:})'
        src = contract_ltens_with_a.format(offset=0,
                                           dim_out=self._ranks[0],
                                           dim_in=self._dims[0],
                                           target='left_c')
        # We need to check this every time and can't simply return since
        # otherwise __syncthreads crashes
        result += [c.If('mid < %i' % self._meas, c.Statement(src))]

        for i in range(1, pos):
            result += self.copy_ltens_to_share(i)
            result += [c.Line()]

            # Since we assume A to consist of product measurements
            result += [
                c.If(
                    'mid < %i' % self._meas,
                    c.Block([
                        c.Statement(
                            contract_ltens_with_a.format(
                                offset=sum(self._dims[:i]),
                                dim_out=self._ranks[i - 1] * self._ranks[i],
                                dim_in=self._dims[i],
                                target='tmat_c')),
                        c.Statement(
                            'dgemv(blasTranspose, tmat_c, left_c, {rank_l}, {rank_r}, buf_c)'
                            .format(rank_l=self._ranks[i - 1],
                                    rank_r=self._ranks[i])),
                        c.Statement(
                            'memcpy(left_c, buf_c, sizeof({ctype}) * {rank_r})'
                            .format(ctype=c.dtype_to_ctype(self._dtype),
                                    rank_r=self._ranks[i]))
                    ])),
                c.Line()
            ]
        return result
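The guarded blocks above are ordinary cgen node compositions; dtype_to_ctype only supplies the element type inside the generated memcpy. A stripped-down sketch with invented sizes (rank 4, 128 measurements, float64 data) standing in for self._ranks[i], self._meas and self._dtype:

import numpy
import cgen as c

rank, meas = 4, 128
guarded = c.If(
    'mid < %i' % meas,
    c.Block([
        c.Statement('dgemv(blasTranspose, tmat_c, left_c, %d, %d, buf_c)'
                    % (rank, rank)),
        c.Statement('memcpy(left_c, buf_c, sizeof(%s) * %d)'
                    % (c.dtype_to_ctype(numpy.float64), rank)),
    ]))
print(guarded)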
Code example #15
File: _altmin_gpu.py  Project: dsuess/pycsalgs
    def right_contractions(self, pos):
        """Generates the code computing the right-contraction part of the
        optimization matrix for site nr. `pos`

        :param pos: The local tensor to copy (should be `< len(X)`)
        :returns: List containing cgen Statements

        """
        if pos == self._sites - 1:
            return [c.Statement('right_c[0] = 1')]

        result = self.copy_ltens_to_share(self._sites - 1)
        result += [c.Line()]

        contract_ltens_with_a = 'dgemv(blasNoTranspose, x_shared, current_row + {offset:d}, {dim_out:d}, {dim_in:d}, {target:})'
        src = contract_ltens_with_a.format(offset=sum(self._dims[:-1]),
                                           dim_out=self._ranks[-1],
                                           dim_in=self._dims[-1],
                                           target='right_c')
        result += [c.If('mid < %i' % self._meas, c.Statement(src))]

        for i in range(self._sites - 2, pos, -1):
            result += self.copy_ltens_to_share(i)
            result += [c.Line()]

            # Since we assume A to consist of product measurements
            result += [
                c.If(
                    'mid < %i' % self._meas,
                    c.Block([
                        c.Statement(
                            contract_ltens_with_a.format(
                                offset=sum(self._dims[:i]),
                                dim_out=self._ranks[i - 1] * self._ranks[i],
                                dim_in=self._dims[i],
                                target='tmat_c')),
                        c.Statement(
                            'dgemv(blasNoTranspose, tmat_c, right_c, {rank_l}, {rank_r}, buf_c)'
                            .format(rank_l=self._ranks[i - 1],
                                    rank_r=self._ranks[i])),
                        c.Statement(
                            'memcpy(right_c, buf_c, sizeof({ctype}) * {rank_l})'
                            .format(ctype=c.dtype_to_ctype(self._dtype),
                                    rank_l=self._ranks[i - 1])),
                    ])),
                c.Line()
            ]

        return result
Code example #16
File: elementwise.py  Project: inducer/codepy
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule

    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    S = Statement  # noqa: N806

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value(
                "numpy_array<{} >::iterator".format(dtype_to_ctype(
                    varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()")
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), f"args.{sarg.name}")
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)]))
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name), [
                POD(numpy.uintp, "codepy_length"),
                Value("arg_struct", "args")
            ]), body))

    return mod
Code example #17
 def _args_call(self, args):
     """Generate cgen function call arguments from an iterable of symbols and
     expressions."""
     ret = []
     for i in args:
         try:
             if i.is_Object:
                 ret.append('*_%s' % i.name)
             elif i.is_Array:
                 ret.append("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name))
             elif i.is_Symbol:
                 ret.append(i.name)
             elif i.is_TensorFunction:
                 ret.append('%s_vec' % i.name)
         except AttributeError:
             ret.append(ccode(i))
     return ret
Code example #18
File: nodes.py  Project: kwinkunks/devito
 def _ccasts(self):
     """Generate data casts."""
     alignment = "__attribute__((aligned(64)))"
     handle = [
         f for f in self.parameters
         if isinstance(f, (SymbolicData, TensorFunction))
     ]
     shapes = [(f, ''.join(["[%s]" % i.ccode for i in f.indices[1:]]))
               for f in handle]
     casts = [
         c.Initializer(
             c.POD(v.dtype,
                   '(*restrict %s)%s %s' % (v.name, shape, alignment)),
             '(%s (*)%s) %s' %
             (c.dtype_to_ctype(v.dtype), shape, '%s_vec' % v.name))
         for v, shape in shapes
     ]
     return casts
Code example #19
File: visitors.py  Project: xiaocenxiaocen/devito
 def _args_cast(self, args):
     """Build cgen type casts for an iterable of :class:`Argument`."""
     ret = []
     for i in args:
         if i.is_TensorArgument:
             align = "__attribute__((aligned(64)))"
             shape = ''.join(
                 ["[%s]" % ccode(j) for j in i.provider.symbolic_shape[1:]])
             lvalue = c.POD(i.dtype,
                            '(*restrict %s)%s %s' % (i.name, shape, align))
             rvalue = '(%s (*)%s) %s' % (c.dtype_to_ctype(
                 i.dtype), shape, '%s_vec' % i.name)
             ret.append(c.Initializer(lvalue, rvalue))
         elif i.is_PtrArgument:
             ctype = ctypes_to_C(i.dtype)
             lvalue = c.Pointer(c.Value(ctype, i.name))
             rvalue = '(%s*) %s' % (ctype, '_%s' % i.name)
             ret.append(c.Initializer(lvalue, rvalue))
     return ret
Code example #20
    def ccode(self):
        """Returns the C code generated by this kernel.

        This function generates the internal code block from Iteration
        and Expression objects, and adds the necessary template code
        around it.
        """
        header_vars = [
            c.Pointer(c.POD(v.dtype, '%s_vec' % v.name))
            for v in self.signature
        ]
        header = c.Extern(
            "C", c.FunctionDeclaration(c.Value('int', self.name), header_vars))
        cast_shapes = [(v, ''.join(['[%d]' % d for d in v.shape[1:]]))
                       for v in self.signature]
        casts = [
            c.Initializer(
                c.POD(v.dtype, '(*%s)%s' % (v.name, shape)), '(%s (*)%s) %s' %
                (c.dtype_to_ctype(v.dtype), shape, '%s_vec' % v.name))
            for v, shape in cast_shapes
        ]
        body = [e.ccode for e in self.expressions]
        ret = [c.Statement("return 0")]
        return c.FunctionBody(header, c.Block(casts + body + ret))
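Put together, this pattern yields an extern "C" function whose flat pointer parameters are cast back to array pointers before the expression code runs. Below is a trimmed sketch for a single float64 field u of shape (10, 11); all names and the trivial body statement are invented, and it prints the complete function.

import numpy as np
import cgen as c

dtype, name, shape = np.float64, "u", "[11]"  # cast shape drops the first dimension

header = c.Extern("C", c.FunctionDeclaration(
    c.Value('int', 'kernel'),
    [c.Pointer(c.POD(dtype, '%s_vec' % name))]))
cast = c.Initializer(
    c.POD(dtype, '(*%s)%s' % (name, shape)),
    '(%s (*)%s) %s_vec' % (c.dtype_to_ctype(dtype), shape, name))
body = [c.Statement('%s[0][0] = 0.0' % name)]   # stands in for the expression code
ret = [c.Statement('return 0')]
print(c.FunctionBody(header, c.Block([cast] + body + ret)))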
Code example #21
File: basic.py  Project: nw0/devito
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Add all definitely-required arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            for i in fsymbols:
                if i in not_required:
                    continue
                elif i.is_Array:
                    args.append(
                        ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorFunction:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_Scalar:
                    args.append((i.name, i))

            # Add all maybe-required arguments that turn out to be required
            maybe_required.update(
                set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                         for i in required])

            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name, Callable(name, free, 'void', handle, ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Code example #22
 def visit_LocalExpression(self, o):
     return c.Initializer(
         c.Value(c.dtype_to_ctype(o.dtype), ccode(o.expr.lhs,
                                                  dtype=o.dtype)),
         ccode(o.expr.rhs, dtype=o.dtype))
Code example #23
 def __call__(self, key):
     if key in self._e.keys():
         return self._e[key]
     else:
         return cgen.dtype_to_ctype(key)
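This wrapper consults a small override table first and falls back to cgen for everything else. A minimal sketch of the same pattern; the float16-to-"half" entry is an invented example override, not something cgen provides:

import numpy
import cgen

overrides = {numpy.dtype(numpy.float16): "half"}  # invented override entry

def to_ctype(key):
    key = numpy.dtype(key)
    if key in overrides:
        return overrides[key]
    return cgen.dtype_to_ctype(key)

print(to_ctype(numpy.float16))  # "half", from the override table
print(to_ctype(numpy.float64))  # "double", from cgen.dtype_to_ctype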
Code example #24
File: elementwise.py  Project: inducer/codepy
 def declarator(self):
     return Value("numpy_array<{} >".format(dtype_to_ctype(self.dtype)),
                  f"{self.name}_ary")
Code example #25
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat"),
            [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                #Pointer(POD(float_type, "debugbuf")),
                ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
            ))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "field_tex"),
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ELS_PER_MB", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Line(),
                Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
                Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)"
                     % given.float_size()),
                Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                    "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
                Line(),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"),
                "MB_DOF/DOFS_PER_EL"),
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment")
            +[S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {"row":row, "col":col, "axis":axis}
                        )

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend(
                    [POD(float_type, "field_value%d" % inl)
                        for inl in range(par.inline)]
                    +[Line()]
                    +unroll(lambda j: [
                        Assign("field_value%d" % inl,
                            "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                            "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)
                            )
                        for inl in range(par.inline)]
                        +[Line()]
                        +[S("d%drst%d += %s * field_value%d"
                            % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                        for axis in dims
                        for inl in range(par.inline)]
                        +[Line()],
                        given.dofs_per_el(), self.plan.max_unroll)
                    )

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(Assign(
                        "drst%d_global[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block(get_scalar_diff_code()))
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=10"]
                )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(
                discr.dimensions*[float_type] + ["P"],
                block=(self.plan.segment_size, par.parallel, 1),
                texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return func, field_texref
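Within these CUDA kernels, dtype_to_ctype mainly selects the fp_tex_* texture element type. A small sketch of just that declaration; fp_tex_float comes from pycuda-helpers.hpp on the device side, so this only shows the string assembly done in Python:

import numpy
from cgen import Value, dtype_to_ctype

float_type = numpy.float32  # stands in for given.float_type above
tex = Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
            % dtype_to_ctype(float_type), "field_tex")
print(tex)
# texture<fp_tex_float, 1, cudaReadModeElementType> field_tex;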
Code example #26
File: elementwise.py  Project: inducer/codepy
 def declarator(self):
     return Value("numpy_array<%s >" % dtype_to_ctype(self.dtype),
             self.name+"_ary")
Code example #27
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
            ]))

        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "in_vector_tex"),
        ])
        if with_scaling:
            cmod.append(
                Value(
                    "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type), "scaling_tex"), )

        par = self.plan.parallelism

        cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                   self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE",
                   "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS",
                   self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                   "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_matrix"),
                        "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [(chk * self.plan.segment_size) // given.dofs_per_el()
                 for chk in range(self.plan.segments_per_microblock())]),
            ArrayInitializer(
                CudaConstant(
                    ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                [
                    min(given.microblock.elements,
                        (chk * self.plan.segment_size +
                         self.plan.segment_size - 1) // given.dofs_per_el() +
                        1)
                    for chk in range(self.plan.segments_per_microblock())
                ]),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block(
            "calculate this dof's element",
            [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If(
                    "THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el",
                               "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el",
                               "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count",
                               "segment_stop_el-segment_start_el"),
                    ])),
                S("__syncthreads()")
            ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(dest="smem_matrix",
                          base=(
                              "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                          bytes="MATRIX_SEGMENT_BYTES",
                          descr="load matrix segment") +
            [S("__syncthreads()")])

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                            self.plan.segment_size):
                result.extend([S("__syncthreads()")] + [
                    Assign(
                        "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" %
                        inl, "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                        % (inl, load_segment_start))
                    for inl in range(par.inline)
                ] + [
                    S("__syncthreads()"),
                    Line(),
                ])

                for dof in dofs[load_segment_start:load_segment_start +
                                self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                            S("result%d += "
                              "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                              "*"
                              "dof_buffer[PAR_MB_NR][%d][%d]" %
                              (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll

        def get_direct_tex_mat_mul_code():
            return (
                [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] +
                [POD(float_type, "lm"), Line()] + unroll(
                    lambda j: [
                        Assign(
                            "fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                                "j": j,
                                "inl": inl,
                                "row": "SEGMENT_DOF"
                            },
                        ) for inl in range(par.inline)
                    ] + [
                        Assign(
                            "lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                                "j": j,
                                "row": "SEGMENT_DOF"
                            },
                        )
                    ] + [
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                        for inl in range(par.inline)
                    ],
                    total_number=self.plan.preimage_dofs_per_el,
                    max_unroll=self.plan.max_unroll) + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = (
                    "fp_tex1Dfetch(scaling_tex,"
                    "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For(
                "unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                ] + [Line()] + get_mat_mul_code(fetch_count) + [
                    If(
                        write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl, "result%d * %s" %
                                (inl, (inv_jac_multiplier % {
                                    "inl": inl
                                }))) for inl in range(par.inline)
                        ]))
                ]))

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(
                make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                     mat_mul_outer_loop(fetch_count)) for fetch_count in range(
                         1,
                         self.plan.max_elements_touched_by_segment() + 1)
                ]))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name,
                                   ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
        )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                self.plan.debug_name, func.local_size_bytes,
                func.shared_size_bytes, func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare("PPPI",
                     block=(self.plan.segment_size,
                            self.plan.parallelism.parallel, 1),
                     texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
Code example #28
 def declarator(self):
     return Value("numpy_array<%s >" % dtype_to_ctype(self.dtype),
                  self.name + "_ary")
Code example #29
    def get_kernel(self, with_scaling, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign, \
                ArrayInitializer

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "in_vector_tex"),
                ])
        if with_scaling:
            cmod.append(
                Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "scaling_tex"),
                )

        par = self.plan.parallelism

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("DOFS_PER_EL", given.dofs_per_el()),
                Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
                Line(),
                Define("SEGMENT_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Line(),
                Define("MB_SEGMENT", "blockIdx.x"),
                Define("MACROBLOCK_NR", "blockIdx.y"),
                Line(),
                Define("DOFS_PER_SEGMENT", self.plan.segment_size),
                Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
                Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    self.plan.aligned_preimage_dofs_per_microblock),
                Define("MB_EL_COUNT", given.microblock.elements),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
                Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
                Line(),
                Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
                Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
                Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
                Define("MATRIX_SEGMENT_BYTES",
                    "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),

                Line(),
                CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                    "MATRIX_SEGMENT_FLOATS")),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "dof_buffer"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "DOFS_PER_SEGMENT"),
                    ),
                CudaShared(POD(numpy.uint16, "segment_start_el")),
                CudaShared(POD(numpy.uint16, "segment_stop_el")),
                CudaShared(POD(numpy.uint16, "segment_el_count")),
                Line(),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [(chk*self.plan.segment_size)//given.dofs_per_el()
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ArrayInitializer(
                        CudaConstant(
                            ArrayOf(
                                POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                        [min(given.microblock.elements,
                            (chk*self.plan.segment_size+self.plan.segment_size-1)
                                //given.dofs_per_el()+1)
                            for chk in range(self.plan.segments_per_microblock())]
                        ),
                ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate this dof's element", [
            Initializer(POD(numpy.uint8, "mb_el"),
                "MB_DOF/DOFS_PER_EL") ])

        if self.plan.use_prefetch_branch:
            f_body.extend_log_block("calculate segment responsibility data", [
                If("THREAD_NUM==0",
                    Block([
                        Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"),
                        Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"),
                        Assign("segment_el_count", "segment_stop_el-segment_start_el"),
                        ])
                    ),
                S("__syncthreads()")
                ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_matrix",
                base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
                bytes="MATRIX_SEGMENT_BYTES",
                descr="load matrix segment")
            +[S("__syncthreads()")]
            )

        # ---------------------------------------------------------------------
        def get_batched_fetch_mat_mul_code(el_fetch_count):
            result = []
            dofs = range(self.plan.preimage_dofs_per_el)

            for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                    self.plan.segment_size):
                result.extend(
                        [S("__syncthreads()")]
                        +[Assign(
                            "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                            % (inl, load_segment_start)
                            )
                        for inl in range(par.inline)]
                        +[S("__syncthreads()"),
                        Line(),
                        ])

                for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]:
                    for inl in range(par.inline):
                        result.append(
                                S("result%d += "
                                    "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                                    "*"
                                    "dof_buffer[PAR_MB_NR][%d][%d]"
                                    % (inl, dof, inl, dof-load_segment_start))
                                )
                result.append(Line())
            return result

        from hedge.backends.cuda.tools import unroll
        def get_direct_tex_mat_mul_code():
            return (
                    [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
                    + [POD(float_type, "lm"), Line()]
                    + unroll(
                        lambda j: [
                        Assign("fof%d" % inl,
                            "fp_tex1Dfetch(in_vector_tex, "
                            "GLOBAL_MB_PREIMG_DOF_BASE"
                            " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                            " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                            % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},)
                        for inl in range(par.inline)
                        ]+[
                        Assign("lm",
                            "smem_matrix["
                            "%(row)s*MATRIX_COLUMNS + %(j)s]"
                            % {"j":j, "row": "SEGMENT_DOF"},
                            )
                        ]+[
                        S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl})
                        for inl in range(par.inline)
                        ],
                        total_number=self.plan.preimage_dofs_per_el,
                        max_unroll=self.plan.max_unroll)
                    + [Line()])

        def get_mat_mul_code(el_fetch_count):
            if el_fetch_count == 1:
                return get_batched_fetch_mat_mul_code(el_fetch_count)
            else:
                return get_direct_tex_mat_mul_code()

        def mat_mul_outer_loop(fetch_count):
            if with_scaling:
                inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex,"
                        "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
            else:
                inv_jac_multiplier = "1"

            write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
            if self.with_index_check:
                write_condition += " && GLOBAL_MB_NR < microblock_count"
            return For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT",
                "++seq_mb_number",
                Block([
                    Initializer(POD(float_type, "result%d" % inl), 0)
                    for inl in range(par.inline)
                    ]+[Line()]
                    +get_mat_mul_code(fetch_count)
                    +[
                    If(write_condition,
                        Block([
                            Assign(
                                "out_vector[GLOBAL_MB_DOF_BASE"
                                " + %d*ALIGNED_DOFS_PER_MB"
                                " + MB_DOF]" % inl,
                                "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl}))
                                )
                            for inl in range(par.inline)
                            ])
                        )
                    ])
                )

        if self.plan.use_prefetch_branch:
            from cgen import make_multiple_ifs
            f_body.append(make_multiple_ifs([
                    ("segment_el_count == %d" % fetch_count,
                        mat_mul_outer_loop(fetch_count))
                    for fetch_count in
                    range(1, self.plan.max_elements_touched_by_segment()+1)]
                    ))
        else:
            f_body.append(mat_mul_outer_loop(0))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_mat")

        if self.plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    self.plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        in_vector_texref = mod.get_texref("in_vector_tex")
        texrefs = [in_vector_texref]

        if with_scaling:
            scaling_texref = mod.get_texref("scaling_tex")
            texrefs.append(scaling_texref)
        else:
            scaling_texref = None

        func.prepare(
                "PPPI",
                block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
                texrefs=texrefs)

        return func, in_vector_texref, scaling_texref
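
The write-back loop built by mat_mul_outer_loop above is assembled from plain cgen nodes. The following stripped-down sketch is not part of the hedge source; the macro names are just the same symbolic placeholders used above, kept so the generated C looks familiar.

# Minimal sketch: compose a seq_mb loop with a guarded store, in the spirit of
# mat_mul_outer_loop above, and print the generated C.  All macro names
# (SEQ_MB_COUNT, MB_DOF, ...) are symbolic placeholders, not defined here.
import numpy
from cgen import Assign, Block, For, If, Initializer, POD

loop = For("unsigned short seq_mb_number = 0",
           "seq_mb_number < SEQ_MB_COUNT",
           "++seq_mb_number",
           Block([
               Initializer(POD(numpy.float32, "result0"), 0),
               If("MB_DOF < DOFS_PER_EL*MB_EL_COUNT",
                  Assign("out_vector[GLOBAL_MB_DOF_BASE + MB_DOF]", "result0")),
           ]))
print(loop)   # renders a C for-loop with the guarded assignment inside
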
Code Example #31
0
File: nodes.py Project: kwinkunks/devito
 def ccode(self):
     ctype = c.dtype_to_ctype(self.dtype)
     return c.Initializer(c.Value(ctype, ccode(self.expr.lhs)),
                          ccode(self.expr.rhs))
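
For reference, a minimal sketch (assuming only numpy and cgen; the variable name and expression strings are made up for illustration) of what the Value/Initializer/dtype_to_ctype combination used in ccode renders to:

# Hypothetical illustration of the cgen calls used in ccode above.
import numpy
import cgen as c

ctype = c.dtype_to_ctype(numpy.float32)        # "float"
decl = c.Initializer(c.Value(ctype, "u_tmp"),  # made-up LHS name
                     "u[i] + dt*v[i]")         # made-up RHS expression
print(decl)                                    # float u_tmp = u[i] + dt*v[i];
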
Code Example #32
0
    def get_kernel(self, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, Const, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Comment, Line, Include, \
                Define, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_el_local_mat_smem_field"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(float_type, "in_vector")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
                ]
            ))

        cmod = Module([
                Include("pycuda-helpers.hpp"),
                Line(),
                Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type),
                    "mat_tex"),
                ])

        plan = self.plan
        par = plan.parallelism

        # FIXME: aligned_image_dofs_per_microblock must be divisible
        # by this, therefore hardcoding for now.
        chunk_size = 16

        cmod.extend([
                Line(),
                Define("DIMENSIONS", discr.dimensions),
                Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
                Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
                Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock),
                Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                    plan.aligned_preimage_dofs_per_microblock),
                Line(),
                Define("MB_EL_COUNT", plan.elements_per_microblock),
                Line(),
                Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*MB_EL_COUNT)"),
                Line(),
                Define("CHUNK_SIZE", chunk_size),
                Define("CHUNK_DOF", "threadIdx.x"),
                Define("PAR_MB_NR", "threadIdx.y"),
                Define("CHUNK_NR", "threadIdx.z"),
                Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
                Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
                Line(),
                Define("MACROBLOCK_NR", "blockIdx.x"),
                Line(),
                Define("PAR_MB_COUNT", par.parallel),
                Define("INLINE_MB_COUNT", par.inline),
                Define("SEQ_MB_COUNT", par.serial),
                Line(),
                Define("GLOBAL_MB_NR_BASE",
                    "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
                Define("GLOBAL_MB_NR",
                    "(GLOBAL_MB_NR_BASE"
                    "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
                Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
                Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
                Line(),
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            ArrayOf(
                                POD(float_type, "smem_in_vector"),
                                "PAR_MB_COUNT"),
                            "INLINE_MB_COUNT"),
                        "ALIGNED_PREIMAGE_DOFS_PER_MB")),
                Line(),
                ])

        S = Statement
        f_body = Block([
            Initializer(Const(POD(numpy.uint16, "mb_el")),
                "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
            Line(),
            ])

        def get_load_code():
            mb_img_dofs = plan.aligned_image_dofs_per_microblock
            mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
            preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

            load_code = []
            store_code = []

            var_num = 0
            for load_block in range(preimg_dofs_over_dofs):
                for inl in range(par.inline):
                    # load and store are split for better pipelining
                    # compiler can't figure that out because of branch

                    var = "tmp%d" % var_num
                    var_num += 1
                    load_code.append(POD(float_type, var))

                    block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                    load_instr = Assign(var,
                        "in_vector[GLOBAL_MB_PREIMAGE_DOF_BASE"
                        " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + %s]" % (inl, block_addr))
                    store_instr = Assign(
                            "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                            var
                            )
                    if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                        cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                        load_instr = If(cond, load_instr)
                        store_instr = If(cond, store_instr)

                    load_code.append(load_instr)
                    store_code.append(store_instr)
            return Block(load_code + [Line()] + store_code)

        def get_matmul_code():
            from hedge.backends.cuda.tools import unroll

            index_check_condition = "GLOBAL_MB_NR < microblock_count"

            def if_(conditions, then):
                final_cond = " && ".join(cond for cond in conditions if cond)
                if final_cond:
                    return If(final_cond, then)
                else:
                    return then

            result = Block([
                Comment("everybody needs to be done with the old data"),
                S("__syncthreads()"), Line(),
                ]+[If(index_check_condition, get_load_code())]+[
                Line(),
                Comment("all the new data must be loaded"),
                S("__syncthreads()"),
                Line(),
                ]+[
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
                ]+[
                Line(),
                POD(float_type, "mat_entry"),
                Line(),
                ])

            result.append(if_(["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition],
                Block(unroll(lambda j:
                    [Assign("mat_entry", "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)]
                    +[
                    S("result%d += mat_entry "
                    "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                    % (inl, inl, j))
                    for inl in range(par.inline)
                    ],
                    total_number=plan.preimage_dofs_per_el)
                    +[Line()]
                    +[Assign(
                        "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + "
                        "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl,
                        "result%d" % inl)
                    for inl in range(par.inline)]
                    )))

            return result

        f_body.append(For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number", get_matmul_code()))

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file(plan.debug_name, ".cu").write(str(cmod))

        mod = SourceModule(cmod,
                keep="cuda_keep_kernels" in discr.debug,
                #options=["--maxrregcount=12"]
                )

        func = mod.get_function("apply_el_local_mat_smem_field")

        if plan.debug_name in discr.debug:
            print "%s: lmem=%d smem=%d regs=%d" % (
                    plan.debug_name,
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        mat_texref = mod.get_texref("mat_tex")
        texrefs = [mat_texref]

        func.prepare(
                "PPPI",
                texrefs=texrefs)
        assert plan.aligned_image_dofs_per_microblock % chunk_size == 0
        block = (
                chunk_size,
                plan.parallelism.parallel,
                plan.aligned_image_dofs_per_microblock
                //chunk_size)

        return func, block, mat_texref
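
A minimal sketch (requires cgen with its cgen.cuda module; the macro names are the same placeholders as above) of what the nested ArrayOf/CudaShared construction in this kernel renders to:

# Sketch only: reproduce the shared-memory declaration pattern used above.
# Note that the innermost ArrayOf count becomes the first array dimension,
# which is why the kernel indexes smem_in_vector[PAR_MB_NR][inl][dof].
import numpy
from cgen import ArrayOf, POD
from cgen.cuda import CudaShared

decl = CudaShared(
    ArrayOf(
        ArrayOf(
            ArrayOf(POD(numpy.float32, "smem_in_vector"), "PAR_MB_COUNT"),
            "INLINE_MB_COUNT"),
        "ALIGNED_PREIMAGE_DOFS_PER_MB"))
# prints: __shared__ float smem_in_vector[PAR_MB_COUNT][INLINE_MB_COUNT][ALIGNED_PREIMAGE_DOFS_PER_MB];
print(decl)
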
Code Example #33
0
    def create_native(self):
        from cgen import (ArrayOf, POD, Block, For, Statement, Struct)
        from cgen import dtype_to_ctype
        import numpy

        members = []
        code = []

        for pk, pv in config.parameters.iteritems():
            if isinstance(pv, int):
                members.append(POD(numpy.int, pk))
                code.append(
                    Statement("params.%s = extract<%s>(cppdict[\"%s\"])" %
                              (pk, dtype_to_ctype(numpy.int), pk)))
            elif isinstance(pv, float):
                members.append(POD(numpy.float64, pk))
                code.append(
                    Statement("params.%s = extract<%s>(cppdict[\"%s\"])" %
                              (pk, dtype_to_ctype(numpy.float64), pk)))
            elif isinstance(pv, list):
                if isinstance(pv[0], int):
                    members.append(ArrayOf(POD(numpy.int, pk), len(pv)))
                    code.append(
                        Block([
                            Statement("list v = extract<%s>(cppdict[\"%s\"])" %
                                      (list.__name__, pk)),
                            For(
                                "unsigned int i  = 0", "i<len(v)", "++i",
                                Statement("params.%s[i] = extract<%s>(v[i])" %
                                          (pk, dtype_to_ctype(numpy.int)))),
                        ]))
                elif isinstance(pv[0], float):
                    members.append(ArrayOf(POD(numpy.float64, pk), len(pv)))
                    code.append(
                        Block([
                            Statement("list v = extract<%s>(cppdict[\"%s\"])" %
                                      (list.__name__, pk)),
                            For(
                                "unsigned int i  = 0", "i < len(v)", "++i",
                                Block([
                                    Statement(
                                        "params.%s[i] = extract<%s>(v[i])" %
                                        (pk, dtype_to_ctype(numpy.float64))),
                                    Statement(
                                        "//std::cout << params.%s[i] << std::endl"
                                        % (pk))
                                ])),
                        ]))

        mystruct = Struct('Parameters', members)
        mycode = Block(code)

        # print mystruct
        # print mycode

        from jinja2 import Template

        tpl = Template("""
#include <boost/python.hpp>
#include <boost/python/object.hpp>
#include <boost/python/extract.hpp>
#include <boost/python/list.hpp>
#include <boost/python/dict.hpp>
#include <boost/python/str.hpp>
#include <stdexcept>
#include <iostream>

{{my_struct}}

Parameters params;

void CopyDictionary(boost::python::object pydict)
{
    using namespace boost::python;

    extract< dict > cppdict_ext(pydict);
    if(!cppdict_ext.check()){
        throw std::runtime_error(
                    "PassObj::pass_dict: type error: not a python dict.");
    }

    dict cppdict = cppdict_ext();
    list keylist = cppdict.keys();

    {{my_extractor}}


}

BOOST_PYTHON_MODULE({{my_module}})
{
   boost::python::def("copy_dict", &CopyDictionary);
}
        """)
        rendered_tpl = tpl.render(my_module="NativeParameters",
                                  my_extractor=mycode,
                                  my_struct=mystruct)

        # print rendered_tpl

        from codepy.toolchain import NVCCToolchain
        import codepy.toolchain

        kwargs = codepy.toolchain._guess_toolchain_kwargs_from_python_config()
        # print kwargs
        kwargs["cc"] = "nvcc"
        # kwargs["cflags"]=["-m64","-x","cu","-Xcompiler","-fPIC","-ccbin","/opt/local/bin/g++-mp-4.4"]
        kwargs["cflags"] = ["-m64", "-x", "cu", "-Xcompiler", "-fPIC"]
        kwargs["include_dirs"].append("/usr/local/cuda/include")
        kwargs["defines"] = []
        kwargs["ldflags"] = ["-shared"]
        # kwargs["libraries"]=["python2.7"]
        kwargs["libraries"] = ["python2.6"]
        print kwargs
        toolchain = NVCCToolchain(**kwargs)

        from codepy.libraries import add_boost_python
        add_boost_python(toolchain)

        from codepy.jit import extension_from_string
        mymod = extension_from_string(toolchain, "NativeParameters",
                                      rendered_tpl)

        mymod.copy_dict(config.parameters)
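
A minimal sketch (numpy and cgen only; the member names are invented, and numpy.int32 is used here in place of the now-removed numpy.int alias) of the dtype_to_ctype mapping and the Struct/POD rendering that create_native relies on:

# Hypothetical parameters; shows the C types cgen picks for numpy dtypes.
import numpy
from cgen import POD, Struct, dtype_to_ctype

print(dtype_to_ctype(numpy.int32))     # int
print(dtype_to_ctype(numpy.float64))   # double
print(Struct("Parameters", [POD(numpy.int32, "n_steps"),
                            POD(numpy.float64, "dt")]))
# renders a C struct declaration with one member line per POD
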
Code Example #34
0
    def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
        from cgen import \
                Pointer, POD, Value, ArrayOf, \
                Module, FunctionDeclaration, FunctionBody, Block, \
                Line, Define, Include, \
                Initializer, If, For, Statement, Assign

        from cgen import dtype_to_ctype
        from cgen.cuda import CudaShared, CudaGlobal

        discr = self.discr
        d = discr.dimensions
        dims = range(d)
        given = self.plan.given

        par = self.plan.parallelism

        diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(
                Value("void", "apply_diff_mat"),
                [
                    Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
                    #Pointer(POD(float_type, "debugbuf")),
                ] +
                [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]))

        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>" %
                dtype_to_ctype(float_type), "field_tex"),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ELS_PER_MB", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT",
                   "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define(
                "GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define(
                "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Line(),
            Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
            Define("DIFFMAT_SEGMENT_BYTES",
                   "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()),
            Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
            Line(),
            CudaShared(
                ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                        "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
            Line(),
        ])

        S = Statement
        f_body = Block()

        f_body.extend_log_block("calculate responsibility data", [
            Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"),
        ])

        from hedge.backends.cuda.tools import get_load_code
        f_body.extend(
            get_load_code(
                dest="smem_diff_rst_mat",
                base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
                bytes="DIFFMAT_SEGMENT_BYTES",
                descr="load diff mat segment") +
            [S("__syncthreads()"), Line()])

        # ---------------------------------------------------------------------
        def get_scalar_diff_code():
            code = []
            for inl in range(par.inline):
                for axis in dims:
                    code.append(
                        Initializer(POD(float_type, "d%drst%d" % (inl, axis)),
                                    0))

            code.append(Line())

            def get_mat_entry(row, col, axis):
                return ("smem_diff_rst_mat["
                        "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                        " + %(col)s"
                        "]" % {
                            "row": row,
                            "col": col,
                            "axis": axis
                        })

            tex_channels = ["x", "y", "z", "w"]
            from hedge.backends.cuda.tools import unroll
            code.extend([
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ] + [Line()] + unroll(
                lambda j: [
                    Assign(
                        "field_value%d" % inl,
                        "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                        "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                    for inl in range(par.inline)
                ] + [Line()] + [
                    S("d%drst%d += %s * field_value%d" %
                      (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                    for axis in dims for inl in range(par.inline)
                ] + [Line()], given.dofs_per_el(), self.plan.max_unroll))

            store_code = Block()
            for inl in range(par.inline):
                for rst_axis in dims:
                    store_code.append(
                        Assign(
                            "drst%d_global[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" %
                            (rst_axis, inl),
                            "d%drst%d" % (inl, rst_axis),
                        ))

            code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

            return code

        f_body.extend([
            For("unsigned short seq_mb_number = 0",
                "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number",
                Block(get_scalar_diff_code()))
        ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("diff", ".cu").write(str(cmod))

        mod = SourceModule(
            cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=10"]
        )

        field_texref = mod.get_texref("field_tex")

        func = mod.get_function("apply_diff_mat")
        func.prepare(discr.dimensions * [float_type] + ["P"],
                     block=(self.plan.segment_size, par.parallel, 1),
                     texrefs=[field_texref])

        if "cuda_diff" in discr.debug:
            print "diff: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return func, field_texref
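
The same Module/Define/Value preamble pattern appears in all of these kernels. A self-contained sketch of the rendering, with arbitrary illustration values standing in for the plan-derived quantities:

# Sketch: build a tiny CUDA-C preamble the way get_kernel above does and
# print it.  The numeric values are illustration values, not taken from a plan.
import numpy
from cgen import Define, Line, Module, Value, dtype_to_ctype

float_type = numpy.float32
cmod = Module([
    Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
          % dtype_to_ctype(float_type), "field_tex"),
    Line(),
    Define("DIMENSIONS", 3),
    Define("DOFS_PER_EL", 20),
    Define("SEGMENT_DOF", "threadIdx.x"),
])
print(cmod)   # the texture declaration followed by the #define lines
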