def _args_decl(self, args): """Convert an iterable of :class:`Argument` into cgen format.""" ret = [] for i in args: if i.is_ScalarArgument: ret.append( c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name)) elif i.is_TensorArgument: ret.append( c.Value(c.dtype_to_ctype(i.dtype), '*restrict %s_vec' % i.name)) else: ret.append(c.Value('void', '*_%s' % i.name)) return ret
def __repr__(self):
    parameters = ",".join(['void*' if i.is_Object else c.dtype_to_ctype(i.dtype)
                           for i in self.parameters])
    return "%s[%s]<%s; %s>" % (self.__class__.__name__, self.name,
                               self.retval, parameters)
def get_load_code(dest, base, bytes, word_type=numpy.uint32, descr=None):
    from cgen import (Pointer, POD, Comment, Block, Line,
                      Constant, For, Statement)
    from cgen import dtype_to_ctype

    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                     "(%s *) (%s)" % (copy_dtype_str, base)),
            For("unsigned word_nr = THREAD_NUM",
                # stride by the size of the copy dtype; the original
                # hardcoded sizeof(int), which is only correct for the
                # default uint32 word_type
                "word_nr*sizeof(%s) < (%s)" % (copy_dtype_str, bytes),
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                          % (copy_dtype_str, dest))),
        ]),
        Line(),
    ])

    return code
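# --- Hypothetical driver (not from the original source) ---
# Render the coalesced-copy loop for one segment to see the C that
# get_load_code emits; THREAD_NUM and COALESCING_THREAD_COUNT are assumed to
# be #defines in the surrounding CUDA module, as in the kernels further down.
for item in get_load_code(dest="smem_buf",
                          base="gmem_buf + seg*SEG_BYTES",
                          bytes="SEG_BYTES",
                          descr="load one segment"):
    print(item)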
def __repr__(self):
    parameters = ",".join(['void*' if i.is_Object else c.dtype_to_ctype(i.dtype)
                           for i in self.parameters])
    body = "\n\t".join([str(s) for s in self.body])
    return "Function[%s]<%s; %s>::\n\t%s" % (self.name, self.retval,
                                             parameters, body)
def _args_decl(self, args): """Generate cgen declarations from an iterable of symbols and expressions.""" ret = [] for i in args: if i.is_Object: ret.append(c.Value('void', '*_%s' % i.name)) elif i.is_Scalar: ret.append( c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name)) elif i.is_Tensor: ret.append( c.Value(c.dtype_to_ctype(i.dtype), '*restrict %s_vec' % i.name)) elif i.is_Dimension: ret.append( c.Value('const %s' % c.dtype_to_ctype(i.dtype), i.name)) else: ret.append(c.Value('void', '*_%s' % i.name)) return ret
def push_heap(self, obj):
    """
    Generate cgen objects to declare, allocate memory, and free memory for
    ``obj``, of type :class:`Array`.
    """
    if obj in self.heap:
        return

    decl = "(*%s)%s" % (obj.name,
                        "".join("[%s]" % i for i in obj.symbolic_shape[1:]))
    decl = c.Value(c.dtype_to_ctype(obj.dtype), decl)

    shape = "".join("[%s]" % i for i in obj.symbolic_shape)
    alloc = "posix_memalign((void**)&%s, 64, sizeof(%s%s))"
    alloc = alloc % (obj.name, c.dtype_to_ctype(obj.dtype), shape)
    alloc = c.Statement(alloc)

    free = c.Statement('free(%s)' % obj.name)

    self.heap[obj] = (decl, alloc, free)
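# --- Hypothetical driver (not from the original source) ---
# push_heap only touches ``self.heap`` and a few attributes of ``obj``, so a
# SimpleNamespace allocator and array suffice to inspect the generated
# declare/allocate/free triple.
from types import SimpleNamespace
import numpy as np

allocator = SimpleNamespace(heap={})
u = SimpleNamespace(name='u', dtype=np.float32, symbolic_shape=('nx', 'ny'))
push_heap(allocator, u)
for item in allocator.heap[u]:
    print(item)
# float (*u)[ny];
# posix_memalign((void**)&u, 64, sizeof(float[nx][ny]));
# free(u);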
def get_kernel(self, vector_dtypes, scalar_dtypes):
    from pymbolic.mapper.stringifier import PREC_NONE
    from pymbolic.mapper.c_code import CCodeMapper

    elwise = self.elementwise_mod

    result_dtype = self.result_dtype_getter(
        dict(zip(self.vector_deps, vector_dtypes)),
        dict(zip(self.scalar_deps, scalar_dtypes)),
        self.constant_dtypes)

    args = [elwise.VectorArg(result_dtype, vei.name)
            for vei in self.vec_expr_info_list
            if not vei.do_not_return]

    def real_const_mapper(num):
        # Make sure we do not generate integers or doubles by accident.
        # Oh, C and your broken division semantics.
        r = repr(num)
        if "." not in r or result_dtype == numpy.float32:
            from pytools import to_uncomplex_dtype
            from cgen import dtype_to_ctype
            return "%s(%s)" % (
                dtype_to_ctype(to_uncomplex_dtype(result_dtype)), r)
        else:
            return r

    code_mapper = CCodeMapper(constant_mapper=real_const_mapper)

    code_lines = []
    for vei in self.vec_expr_info_list:
        expr_code = code_mapper(vei.expr, PREC_NONE)
        if vei.do_not_return:
            from cgen import dtype_to_ctype
            code_lines.append(
                "%s %s = %s;" % (dtype_to_ctype(result_dtype),
                                 vei.name, expr_code))
        else:
            code_lines.append("%s[i] = %s;" % (vei.name, expr_code))

    # common subexpressions have been taken care of by the compiler
    assert not code_mapper.cse_names

    args.extend(elwise.VectorArg(dtype, name)
                for dtype, name in zip(vector_dtypes, self.vector_dep_names))
    args.extend(elwise.ScalarArg(dtype, name)
                for dtype, name in zip(scalar_dtypes, self.scalar_dep_names))

    return KernelRecord(
        kernel=self.make_kernel_internal(args, "\n".join(code_lines)),
        result_dtype=result_dtype)
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    from codepy.bpl import BoostPythonModule
    from cgen import (FunctionBody, FunctionDeclaration, Value, POD, Struct,
                      For, Initializer, Include, Statement, Line, Block,
                      dtype_to_ctype)

    S = Statement

    mod = BoostPythonModule()
    mod.add_to_preamble([
        Include("pyublas/numpy.hpp"),
    ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace pyublas"),
        Line(),
    ])

    body = Block([
        Initializer(
            Value("numpy_array<%s >::iterator" % dtype_to_ctype(varg.dtype),
                  varg.name),
            "args.%s_ary.begin()" % varg.name)
        for varg in arguments if isinstance(varg, VectorArg)
    ] + [
        Initializer(sarg.declarator(), "args." + sarg.name)
        for sarg in arguments if isinstance(sarg, ScalarArg)
    ])

    body.extend([
        Line(),
        For("unsigned i = 0", "i < codepy_length", "++i",
            Block([S(operation)])),
    ])

    arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments])
    mod.add_struct(arg_struct, "ArgStruct")
    mod.add_to_module([Line()])

    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("void", name),
                                [POD(numpy.uintp, "codepy_length"),
                                 Value("arg_struct", "args")]),
            body))

    return mod
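# --- Hypothetical usage sketch (not from the original source) ---
# Build the descriptor for a saxpy-style operation over pyublas arrays,
# assuming the VectorArg/ScalarArg classes from this module; the argument
# names and the operation string are made up for illustration.
args = [ScalarArg(numpy.float64, "a"),
        VectorArg(numpy.float64, "x"),
        VectorArg(numpy.float64, "y")]
mod = get_elwise_module_descriptor(args, "y[i] = a*x[i] + y[i]", name="saxpy")
print(mod.generate())  # emit the generated Boost.Python C++ source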
def visit_ArrayCast(self, o):
    """
    Build cgen type casts for an :class:`AbstractFunction`.
    """
    f = o.function
    align = "__attribute__((aligned(64)))"
    shape = ''.join(["[%s]" % ccode(j) for j in f.symbolic_shape[1:]])
    lvalue = c.POD(f.dtype, '(*restrict %s)%s %s' % (f.name, shape, align))
    rvalue = '(%s (*)%s) %s' % (c.dtype_to_ctype(f.dtype), shape,
                                '%s_vec' % f.name)
    return c.Initializer(lvalue, rvalue)
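# --- Standalone sketch (not from the original source) ---
# The cast built above, rendered with plain cgen for a hypothetical float32
# function u(t, x, y) whose leading (time) dimension is pointed through.
import cgen as c
import numpy as np

shape = "[x_size][y_size]"
align = "__attribute__((aligned(64)))"
lvalue = c.POD(np.float32, '(*restrict u)%s %s' % (shape, align))
rvalue = '(float (*)%s) u_vec' % shape
print(c.Initializer(lvalue, rvalue))
# float (*restrict u)[x_size][y_size] __attribute__((aligned(64)))
#     = (float (*)[x_size][y_size]) u_vec;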
def push_stack(self, scope, obj):
    """
    Generate a cgen statement that allocates ``obj`` on the stack.
    """
    dtype = c.dtype_to_ctype(obj.dtype)
    shape = "".join("[%d]" % j for j in obj.shape)
    alignment = "__attribute__((aligned(64)))"
    item = c.POD(dtype, "%s%s %s" % (obj.name, shape, alignment))
    handle = self.stack.setdefault(scope, [])
    if item not in handle:
        handle.append(item)
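# --- Hypothetical driver (not from the original source) ---
# Allocate a 16x16 scratch array in scope "kernel" and print the resulting
# stack declaration; float64 is chosen so the ctype string round-trips
# cleanly through c.POD.
from types import SimpleNamespace
import numpy as np

allocator = SimpleNamespace(stack={})
buf = SimpleNamespace(name='buf', dtype=np.float64, shape=(16, 16))
push_stack(allocator, 'kernel', buf)
print(allocator.stack['kernel'][0])
# double buf[16][16] __attribute__((aligned(64)));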
def _cparameters(self):
    """Generate the argument signature."""
    cparameters = []
    for v in self.parameters:
        if isinstance(v, Dimension):
            cparameters.append(v.decl)
        elif v.is_ScalarFunction:
            cparameters.append(c.Value('const int', v.name))
        else:
            cparameters.append(c.Value(c.dtype_to_ctype(v.dtype),
                                       '*restrict %s_vec' % v.name))
    return cparameters
def left_contractions(self, pos):
    """Generates the code computing the left-contraction part of the
    optimization matrix for site nr. `pos`

    :param pos: The site to compute contractions for (should be `< len(X)`)
    :returns: List containing cgen Statements

    """
    if pos == 0:
        return [c.Statement('left_c[0] = 1')]

    result = self.copy_ltens_to_share(0)
    result += [c.Line()]

    contract_ltens_with_a = ('dgemv(blasNoTranspose, x_shared, '
                             'current_row + {offset:d}, {dim_out:d}, '
                             '{dim_in:d}, {target:})')
    src = contract_ltens_with_a.format(offset=0, dim_out=self._ranks[0],
                                       dim_in=self._dims[0], target='left_c')
    # We need to check this every time and can't simply return since
    # otherwise __syncthreads crashes
    result += [c.If('mid < %i' % self._meas, c.Statement(src))]

    for i in range(1, pos):
        result += self.copy_ltens_to_share(i)
        result += [c.Line()]
        # Since we assume A to consist of product measurements
        result += [
            c.If('mid < %i' % self._meas, c.Block([
                c.Statement(contract_ltens_with_a.format(
                    offset=sum(self._dims[:i]),
                    dim_out=self._ranks[i - 1] * self._ranks[i],
                    dim_in=self._dims[i],
                    target='tmat_c')),
                c.Statement('dgemv(blasTranspose, tmat_c, left_c, '
                            '{rank_l}, {rank_r}, buf_c)'
                            .format(rank_l=self._ranks[i - 1],
                                    rank_r=self._ranks[i])),
                c.Statement('memcpy(left_c, buf_c, sizeof({ctype}) * {rank_r})'
                            .format(ctype=c.dtype_to_ctype(self._dtype),
                                    rank_r=self._ranks[i]))
            ])),
            c.Line()
        ]

    return result
def right_contractions(self, pos):
    """Generates the code computing the right-contraction part of the
    optimization matrix for site nr. `pos`

    :param pos: The site to compute contractions for (should be `< len(X)`)
    :returns: List containing cgen Statements

    """
    if pos == self._sites - 1:
        return [c.Statement('right_c[0] = 1')]

    result = self.copy_ltens_to_share(self._sites - 1)
    result += [c.Line()]

    contract_ltens_with_a = ('dgemv(blasNoTranspose, x_shared, '
                             'current_row + {offset:d}, {dim_out:d}, '
                             '{dim_in:d}, {target:})')
    src = contract_ltens_with_a.format(offset=sum(self._dims[:-1]),
                                       dim_out=self._ranks[-1],
                                       dim_in=self._dims[-1],
                                       target='right_c')
    result += [c.If('mid < %i' % self._meas, c.Statement(src))]

    for i in range(self._sites - 2, pos, -1):
        result += self.copy_ltens_to_share(i)
        result += [c.Line()]
        # Since we assume A to consist of product measurements
        result += [
            c.If('mid < %i' % self._meas, c.Block([
                c.Statement(contract_ltens_with_a.format(
                    offset=sum(self._dims[:i]),
                    dim_out=self._ranks[i - 1] * self._ranks[i],
                    dim_in=self._dims[i],
                    target='tmat_c')),
                c.Statement('dgemv(blasNoTranspose, tmat_c, right_c, '
                            '{rank_l}, {rank_r}, buf_c)'
                            .format(rank_l=self._ranks[i - 1],
                                    rank_r=self._ranks[i])),
                c.Statement('memcpy(right_c, buf_c, sizeof({ctype}) * {rank_l})'
                            .format(ctype=c.dtype_to_ctype(self._dtype),
                                    rank_l=self._ranks[i - 1])),
            ])),
            c.Line()
        ]

    return result
def _args_call(self, args):
    """Generate cgen function call arguments from an iterable of symbols
    and expressions."""
    ret = []
    for i in args:
        try:
            if i.is_Object:
                ret.append('*_%s' % i.name)
            elif i.is_Array:
                ret.append("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name))
            elif i.is_Symbol:
                ret.append(i.name)
            elif i.is_TensorFunction:
                ret.append('%s_vec' % i.name)
        except AttributeError:
            ret.append(ccode(i))
    return ret
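# --- Hypothetical driver (not from the original source) ---
# The dispatch only inspects ``is_*`` flags, so SimpleNamespace stand-ins
# show the call-site strings produced for each kind of argument (the
# AttributeError fallback never triggers here since every flag is present).
from types import SimpleNamespace
import numpy as np

obj = SimpleNamespace(is_Object=True, name='timers')
arr = SimpleNamespace(is_Object=False, is_Array=True,
                      dtype=np.float32, name='tmp')
fun = SimpleNamespace(is_Object=False, is_Array=False, is_Symbol=False,
                      is_TensorFunction=True, name='u')
print(_args_call(None, [obj, arr, fun]))
# ['*_timers', '(float*)tmp', 'u_vec']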
def _ccasts(self):
    """Generate data casts."""
    alignment = "__attribute__((aligned(64)))"
    handle = [f for f in self.parameters
              if isinstance(f, (SymbolicData, TensorFunction))]
    shapes = [(f, ''.join(["[%s]" % i.ccode for i in f.indices[1:]]))
              for f in handle]
    casts = [c.Initializer(c.POD(v.dtype, '(*restrict %s)%s %s'
                                 % (v.name, shape, alignment)),
                           '(%s (*)%s) %s' % (c.dtype_to_ctype(v.dtype),
                                              shape, '%s_vec' % v.name))
             for v, shape in shapes]
    return casts
def _args_cast(self, args):
    """Build cgen type casts for an iterable of :class:`Argument`."""
    ret = []
    for i in args:
        if i.is_TensorArgument:
            align = "__attribute__((aligned(64)))"
            shape = ''.join(["[%s]" % ccode(j)
                             for j in i.provider.symbolic_shape[1:]])
            lvalue = c.POD(i.dtype, '(*restrict %s)%s %s'
                           % (i.name, shape, align))
            rvalue = '(%s (*)%s) %s' % (c.dtype_to_ctype(i.dtype), shape,
                                        '%s_vec' % i.name)
            ret.append(c.Initializer(lvalue, rvalue))
        elif i.is_PtrArgument:
            ctype = ctypes_to_C(i.dtype)
            lvalue = c.Pointer(c.Value(ctype, i.name))
            rvalue = '(%s*) %s' % (ctype, '_%s' % i.name)
            ret.append(c.Initializer(lvalue, rvalue))
    return ret
def ccode(self): """Returns the C code generated by this kernel. This function generates the internal code block from Iteration and Expression objects, and adds the necessary template code around it. """ header_vars = [ c.Pointer(c.POD(v.dtype, '%s_vec' % v.name)) for v in self.signature ] header = c.Extern( "C", c.FunctionDeclaration(c.Value('int', self.name), header_vars)) cast_shapes = [(v, ''.join(['[%d]' % d for d in v.shape[1:]])) for v in self.signature] casts = [ c.Initializer( c.POD(v.dtype, '(*%s)%s' % (v.name, shape)), '(%s (*)%s) %s' % (c.dtype_to_ctype(v.dtype), shape, '%s_vec' % v.name)) for v, shape in cast_shapes ] body = [e.ccode for e in self.expressions] ret = [c.Statement("return 0")] return c.FunctionBody(header, c.Block(casts + body + ret))
def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into
    :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Elemental function arguments
        args = []  # Found so far (scalars, tensors)
        maybe_required = set()  # Scalars that *may* have to be passed in
        not_required = set()  # Elemental function locally declared scalars

        # Build a new Iteration/Expression tree with free bounds
        free = []
        for i in target:
            name, bounds = i.dim.name, i.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % name, dtype=np.int32)
            args.extend(zip([ccode(j) for j in bounds], (start, finish)))

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                     for j in range(len(i.uindices))]
            args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)]
            free.append(i._rebuild(limits=limits, offsets=None,
                                   uindices=uindices))
            not_required.update({i.dim}, set(j.index for j in i.uindices))

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict((zip(target, free)))).visit(root)
        expressions = FindNodes(Expression).visit(free)
        fsymbols = FindSymbols('symbolics').visit(free)

        # Add all definitely-required arguments
        not_required.update({i.output for i in expressions if i.is_scalar})
        for i in fsymbols:
            if i in not_required:
                continue
            elif i.is_Array:
                args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name),
                             i))
            elif i.is_TensorFunction:
                args.append(("%s_vec" % i.name, i))
            elif i.is_Scalar:
                args.append((i.name, i))

        # Add all maybe-required arguments that turn out to be required
        maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
        for i in fsymbols:
            not_required.update({as_symbol(i), i.indexify()})
            for j in i.symbolic_shape:
                maybe_required.update(j.free_symbols)
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                     for i in required])

        call, params = zip(*args)
        handle = flatten([p.rtargs for p in params])
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(name,
                             Callable(name, free, 'void', handle, ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def visit_LocalExpression(self, o):
    return c.Initializer(c.Value(c.dtype_to_ctype(o.dtype),
                                 ccode(o.expr.lhs, dtype=o.dtype)),
                         ccode(o.expr.rhs, dtype=o.dtype))
def __call__(self, key):
    if key in self._e:
        return self._e[key]
    else:
        return cgen.dtype_to_ctype(key)
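# --- Hypothetical harness (not from the original source) ---
# Wire the lookup above into a tiny class with an override table: known
# dtypes take the override, everything else falls through to
# cgen.dtype_to_ctype. The TypeMap name and the 'half' mapping are made up.
import numpy
import cgen

class TypeMap:
    def __init__(self, overrides):
        self._e = overrides

    def __call__(self, key):
        if key in self._e:
            return self._e[key]
        return cgen.dtype_to_ctype(key)

tmap = TypeMap({numpy.dtype(numpy.float16): 'half'})
print(tmap(numpy.dtype(numpy.float16)))  # half (from the override table)
print(tmap(numpy.dtype(numpy.float64)))  # double (from cgen)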
def declarator(self): return Value("numpy_array<{} >".format(dtype_to_ctype(self.dtype)), f"{self.name}_ary")
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
    from cgen import (Pointer, POD, Value, ArrayOf, Module,
                      FunctionDeclaration, FunctionBody, Block, Line,
                      Define, Include, Initializer, If, For, Statement,
                      Assign)
    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    par = self.plan.parallelism

    diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
    elgroup, = discr.element_groups

    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_diff_mat"),
        [Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
         #Pointer(POD(float_type, "debugbuf")),
         ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]))

    rst_channels = given.devdata.make_valid_tex_channel_count(d)
    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
              % dtype_to_ctype(float_type),
              "field_tex"),
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("DOFS_PER_EL", given.dofs_per_el()),
        Line(),
        Define("SEGMENT_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Line(),
        Define("MB_SEGMENT", "blockIdx.x"),
        Define("MACROBLOCK_NR", "blockIdx.y"),
        Line(),
        Define("DOFS_PER_SEGMENT", self.plan.segment_size),
        Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
        Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
        Define("ELS_PER_MB", given.microblock.elements),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
        Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
        Line(),
        Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
        Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
        Define("GLOBAL_MB_NR_BASE",
               "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        Define("GLOBAL_MB_NR",
               "(GLOBAL_MB_NR_BASE"
               "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
        Line(),
        Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
        Define("DIFFMAT_SEGMENT_BYTES",
               "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()),
        Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
        Line(),
        CudaShared(ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                           "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
        Line(),
    ])

    S = Statement
    f_body = Block()

    f_body.extend_log_block("calculate responsibility data", [
        Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"),
    ])

    from hedge.backends.cuda.tools import get_load_code
    f_body.extend(get_load_code(
        dest="smem_diff_rst_mat",
        base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
        bytes="DIFFMAT_SEGMENT_BYTES",
        descr="load diff mat segment")
        + [S("__syncthreads()"), Line()])

    # -----------------------------------------------------------------
    def get_scalar_diff_code():
        code = []
        for inl in range(par.inline):
            for axis in dims:
                code.append(
                    Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

        code.append(Line())

        def get_mat_entry(row, col, axis):
            return ("smem_diff_rst_mat["
                    "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                    " + %(col)s"
                    "]" % {"row": row, "col": col, "axis": axis})

        tex_channels = ["x", "y", "z", "w"]

        from hedge.backends.cuda.tools import unroll
        code.extend(
            [POD(float_type, "field_value%d" % inl)
             for inl in range(par.inline)]
            + [Line()]
            + unroll(lambda j: [
                Assign("field_value%d" % inl,
                       "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE"
                       " + %d*ALIGNED_DOFS_PER_MB "
                       "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                for inl in range(par.inline)]
                + [Line()]
                + [S("d%drst%d += %s * field_value%d"
                     % (inl, axis,
                        get_mat_entry("SEGMENT_DOF", j, axis), inl))
                   for axis in dims
                   for inl in range(par.inline)]
                + [Line()],
                given.dofs_per_el(), self.plan.max_unroll))

        store_code = Block()
        for inl in range(par.inline):
            for rst_axis in dims:
                store_code.append(Assign(
                    "drst%d_global[GLOBAL_MB_DOF_BASE"
                    " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                    % (rst_axis, inl),
                    "d%drst%d" % (inl, rst_axis),
                ))

        code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

        return code

    f_body.extend([
        For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number",
            Block(get_scalar_diff_code()))
    ])

    # finish off ------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file("diff", ".cu").write(str(cmod))

    mod = SourceModule(cmod,
                       keep="cuda_keep_kernels" in discr.debug,
                       #options=["--maxrregcount=10"]
                       )

    field_texref = mod.get_texref("field_tex")

    func = mod.get_function("apply_diff_mat")
    func.prepare(discr.dimensions*[float_type] + ["P"],
                 block=(self.plan.segment_size, par.parallel, 1),
                 texrefs=[field_texref])

    if "cuda_diff" in discr.debug:
        print("diff: lmem=%d smem=%d regs=%d" % (
            func.local_size_bytes, func.shared_size_bytes, func.num_regs))

    return func, field_texref
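# --- Standalone sketch (not from the original source) ---
# The module header assembled above is ordinary cgen; a few of its pieces
# can be rendered without any of the hedge machinery, showing how
# dtype_to_ctype selects the texture's element type.
import numpy
from cgen import Module, Include, Line, Value, Define
from cgen import dtype_to_ctype

float_type = numpy.float32
print(Module([
    Include("pycuda-helpers.hpp"),
    Line(),
    Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
          % dtype_to_ctype(float_type), "field_tex"),
    Define("DOFS_PER_SEGMENT", 64),
]))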
def declarator(self): return Value("numpy_array<%s >" % dtype_to_ctype(self.dtype), self.name+"_ary")
def get_kernel(self, with_scaling, for_benchmark=False):
    from cgen import (Pointer, POD, Value, ArrayOf, Module,
                      FunctionDeclaration, FunctionBody, Block, Line,
                      Define, Include, Initializer, If, For, Statement,
                      Assign, ArrayInitializer)
    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_el_local_mat_smem_mat"), [
            Pointer(POD(float_type, "out_vector")),
            Pointer(POD(numpy.uint8, "gmem_matrix")),
            Pointer(POD(float_type, "debugbuf")),
            POD(numpy.uint32, "microblock_count"),
        ]))

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
              % dtype_to_ctype(float_type), "in_vector_tex"),
    ])
    if with_scaling:
        cmod.append(
            Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                  % dtype_to_ctype(float_type), "scaling_tex"),
        )

    par = self.plan.parallelism

    cmod.extend([
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("DOFS_PER_EL", given.dofs_per_el()),
        Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
        Line(),
        Define("SEGMENT_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Line(),
        Define("MB_SEGMENT", "blockIdx.x"),
        Define("MACROBLOCK_NR", "blockIdx.y"),
        Line(),
        Define("DOFS_PER_SEGMENT", self.plan.segment_size),
        Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
        Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
        Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
               self.plan.aligned_preimage_dofs_per_microblock),
        Define("MB_EL_COUNT", given.microblock.elements),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
        Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
        Line(),
        Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
        Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
        Define("GLOBAL_MB_NR_BASE",
               "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        Define("GLOBAL_MB_NR",
               "(GLOBAL_MB_NR_BASE"
               "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
        Define("GLOBAL_MB_PREIMG_DOF_BASE",
               "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
        Line(),
        Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
        Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
        Define("MATRIX_SEGMENT_BYTES",
               "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
        Line(),
        CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                           "MATRIX_SEGMENT_FLOATS")),
        CudaShared(
            ArrayOf(
                ArrayOf(
                    ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                    "INLINE_MB_COUNT"),
                "DOFS_PER_SEGMENT"),
        ),
        CudaShared(POD(numpy.uint16, "segment_start_el")),
        CudaShared(POD(numpy.uint16, "segment_stop_el")),
        CudaShared(POD(numpy.uint16, "segment_el_count")),
        Line(),
        ArrayInitializer(
            CudaConstant(
                ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                        "SEGMENTS_PER_MB")),
            [(chk * self.plan.segment_size) // given.dofs_per_el()
             for chk in range(self.plan.segments_per_microblock())]),
        ArrayInitializer(
            CudaConstant(
                ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                        "SEGMENTS_PER_MB")),
            [min(given.microblock.elements,
                 (chk * self.plan.segment_size + self.plan.segment_size - 1)
                 // given.dofs_per_el() + 1)
             for chk in range(self.plan.segments_per_microblock())]),
    ])

    S = Statement
    f_body = Block()

    f_body.extend_log_block(
        "calculate this dof's element",
        [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

    if self.plan.use_prefetch_branch:
        f_body.extend_log_block("calculate segment responsibility data", [
            If("THREAD_NUM==0",
               Block([
                   Assign("segment_start_el",
                          "segment_start_el_lookup[MB_SEGMENT]"),
                   Assign("segment_stop_el",
                          "segment_stop_el_lookup[MB_SEGMENT]"),
                   Assign("segment_el_count",
                          "segment_stop_el-segment_start_el"),
               ])),
            S("__syncthreads()")
        ])

    from hedge.backends.cuda.tools import get_load_code
    f_body.extend(
        get_load_code(dest="smem_matrix",
                      base="gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES",
                      bytes="MATRIX_SEGMENT_BYTES",
                      descr="load matrix segment")
        + [S("__syncthreads()")])

    # -----------------------------------------------------------------
    def get_batched_fetch_mat_mul_code(el_fetch_count):
        result = []
        dofs = range(self.plan.preimage_dofs_per_el)

        for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
                                        self.plan.segment_size):
            result.extend([S("__syncthreads()")] + [
                Assign(
                    "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                    "fp_tex1Dfetch(in_vector_tex, "
                    "GLOBAL_MB_PREIMG_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + (segment_start_el)*PREIMAGE_DOFS_PER_EL"
                    " + %d + SEGMENT_DOF)"
                    % (inl, load_segment_start))
                for inl in range(par.inline)
            ] + [
                S("__syncthreads()"),
                Line(),
            ])

            for dof in dofs[load_segment_start:load_segment_start
                            + self.plan.segment_size]:
                for inl in range(par.inline):
                    result.append(
                        S("result%d += "
                          "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                          "*"
                          "dof_buffer[PAR_MB_NR][%d][%d]"
                          % (inl, dof, inl, dof - load_segment_start)))
                result.append(Line())
        return result

    from hedge.backends.cuda.tools import unroll

    def get_direct_tex_mat_mul_code():
        return (
            [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
            + [POD(float_type, "lm"), Line()]
            + unroll(
                lambda j: [
                    Assign(
                        "fof%d" % inl,
                        "fp_tex1Dfetch(in_vector_tex, "
                        "GLOBAL_MB_PREIMG_DOF_BASE"
                        " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                        " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                        % {"j": j, "inl": inl, "row": "SEGMENT_DOF"},
                    ) for inl in range(par.inline)
                ] + [
                    Assign(
                        "lm",
                        "smem_matrix["
                        "%(row)s*MATRIX_COLUMNS + %(j)s]"
                        % {"j": j, "row": "SEGMENT_DOF"},
                    )
                ] + [
                    S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                    for inl in range(par.inline)
                ],
                total_number=self.plan.preimage_dofs_per_el,
                max_unroll=self.plan.max_unroll)
            + [Line()])

    def get_mat_mul_code(el_fetch_count):
        if el_fetch_count == 1:
            return get_batched_fetch_mat_mul_code(el_fetch_count)
        else:
            return get_direct_tex_mat_mul_code()

    def mat_mul_outer_loop(fetch_count):
        if with_scaling:
            inv_jac_multiplier = (
                "fp_tex1Dfetch(scaling_tex,"
                "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
        else:
            inv_jac_multiplier = "1"

        write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
        if self.with_index_check:
            write_condition += " && GLOBAL_MB_NR < microblock_count"
        return For(
            "unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number",
            Block([
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
            ] + [Line()] + get_mat_mul_code(fetch_count) + [
                If(write_condition,
                   Block([
                       Assign(
                           "out_vector[GLOBAL_MB_DOF_BASE"
                           " + %d*ALIGNED_DOFS_PER_MB"
                           " + MB_DOF]" % inl,
                           "result%d * %s"
                           % (inl, (inv_jac_multiplier % {"inl": inl})))
                       for inl in range(par.inline)
                   ]))
            ]))

    if self.plan.use_prefetch_branch:
        from cgen import make_multiple_ifs
        f_body.append(
            make_multiple_ifs([
                ("segment_el_count == %d" % fetch_count,
                 mat_mul_outer_loop(fetch_count))
                for fetch_count in range(
                    1, self.plan.max_elements_touched_by_segment() + 1)
            ]))
    else:
        f_body.append(mat_mul_outer_loop(0))

    # finish off ------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

    mod = SourceModule(cmod,
                       keep="cuda_keep_kernels" in discr.debug,
                       #options=["--maxrregcount=12"]
                       )

    func = mod.get_function("apply_el_local_mat_smem_mat")

    if self.plan.debug_name in discr.debug:
        print("%s: lmem=%d smem=%d regs=%d" % (
            self.plan.debug_name, func.local_size_bytes,
            func.shared_size_bytes, func.num_regs))

    in_vector_texref = mod.get_texref("in_vector_tex")
    texrefs = [in_vector_texref]

    if with_scaling:
        scaling_texref = mod.get_texref("scaling_tex")
        texrefs.append(scaling_texref)
    else:
        scaling_texref = None

    func.prepare("PPPI",
                 block=(self.plan.segment_size,
                        self.plan.parallelism.parallel, 1),
                 texrefs=texrefs)

    return func, in_vector_texref, scaling_texref
def ccode(self):
    ctype = c.dtype_to_ctype(self.dtype)
    return c.Initializer(c.Value(ctype, ccode(self.expr.lhs)),
                         ccode(self.expr.rhs))
def get_kernel(self, for_benchmark=False):
    from cgen import (Pointer, POD, Value, ArrayOf, Const, Module,
                      FunctionDeclaration, FunctionBody, Block, Comment,
                      Line, Include, Define, Initializer, If, For,
                      Statement, Assign)
    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_el_local_mat_smem_field"),
        [
            Pointer(POD(float_type, "out_vector")),
            Pointer(POD(float_type, "in_vector")),
            Pointer(POD(float_type, "debugbuf")),
            POD(numpy.uint32, "microblock_count"),
        ]))

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
              % dtype_to_ctype(float_type),
              "mat_tex"),
    ])

    plan = self.plan
    par = plan.parallelism

    # FIXME: aligned_image_dofs_per_microblock must be divisible
    # by this, therefore hardcoding for now.
    chunk_size = 16

    cmod.extend([
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
        Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
        Define("ALIGNED_IMAGE_DOFS_PER_MB",
               plan.aligned_image_dofs_per_microblock),
        Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
               plan.aligned_preimage_dofs_per_microblock),
        Line(),
        Define("MB_EL_COUNT", plan.elements_per_microblock),
        Line(),
        Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*MB_EL_COUNT)"),
        Line(),
        Define("CHUNK_SIZE", chunk_size),
        Define("CHUNK_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Define("CHUNK_NR", "threadIdx.z"),
        Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
        Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
        Line(),
        Define("MACROBLOCK_NR", "blockIdx.x"),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("GLOBAL_MB_NR_BASE",
               "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        Define("GLOBAL_MB_NR",
               "(GLOBAL_MB_NR_BASE"
               "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_IMAGE_DOF_BASE",
               "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
        Define("GLOBAL_MB_PREIMAGE_DOF_BASE",
               "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
        Line(),
        CudaShared(
            ArrayOf(
                ArrayOf(
                    ArrayOf(POD(float_type, "smem_in_vector"),
                            "PAR_MB_COUNT"),
                    "INLINE_MB_COUNT"),
                "ALIGNED_PREIMAGE_DOFS_PER_MB")),
        Line(),
    ])

    S = Statement
    f_body = Block([
        Initializer(Const(POD(numpy.uint16, "mb_el")),
                    "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
        Line(),
    ])

    def get_load_code():
        mb_img_dofs = plan.aligned_image_dofs_per_microblock
        mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
        preimg_dofs_over_dofs = (mb_preimg_dofs + mb_img_dofs - 1) \
            // mb_img_dofs

        load_code = []
        store_code = []

        var_num = 0
        for load_block in range(preimg_dofs_over_dofs):
            for inl in range(par.inline):
                # load and store are split for better pipelining
                # compiler can't figure that out because of branch
                var = "tmp%d" % var_num
                var_num += 1
                load_code.append(POD(float_type, var))

                block_addr = ("%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF"
                              % load_block)
                load_instr = Assign(
                    var,
                    "in_vector[GLOBAL_MB_PREIMAGE_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + %s]" % (inl, block_addr))
                store_instr = Assign(
                    "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                    var)
                if (load_block + 1) * mb_img_dofs >= mb_preimg_dofs:
                    cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                    load_instr = If(cond, load_instr)
                    store_instr = If(cond, store_instr)

                load_code.append(load_instr)
                store_code.append(store_instr)

        return Block(load_code + [Line()] + store_code)

    def get_matmul_code():
        from hedge.backends.cuda.tools import unroll

        index_check_condition = "GLOBAL_MB_NR < microblock_count"

        def if_(conditions, then):
            final_cond = " && ".join(cond for cond in conditions if cond)
            if final_cond:
                return If(final_cond, then)
            else:
                return then

        result = Block([
            Comment("everybody needs to be done with the old data"),
            S("__syncthreads()"),
            Line(),
        ] + [If(index_check_condition, get_load_code())] + [
            Line(),
            Comment("all the new data must be loaded"),
            S("__syncthreads()"),
            Line(),
        ] + [
            Initializer(POD(float_type, "result%d" % inl), 0)
            for inl in range(par.inline)
        ] + [
            Line(),
            POD(float_type, "mat_entry"),
            Line(),
        ])

        result.append(if_(
            ["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition],
            Block(unroll(
                lambda j: [Assign("mat_entry",
                                  "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)]
                + [S("result%d += mat_entry "
                     "* smem_in_vector[PAR_MB_NR][%d]"
                     "[mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                     % (inl, inl, j))
                   for inl in range(par.inline)],
                total_number=plan.preimage_dofs_per_el)
                + [Line()]
                + [Assign(
                    "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + "
                    "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl,
                    "result%d" % inl)
                   for inl in range(par.inline)])))

        return result

    f_body.append(For("unsigned short seq_mb_number = 0",
                      "seq_mb_number < SEQ_MB_COUNT",
                      "++seq_mb_number",
                      get_matmul_code()))

    # finish off ------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file(plan.debug_name, ".cu").write(str(cmod))

    mod = SourceModule(cmod,
                       keep="cuda_keep_kernels" in discr.debug,
                       #options=["--maxrregcount=12"]
                       )

    func = mod.get_function("apply_el_local_mat_smem_field")

    if plan.debug_name in discr.debug:
        print("%s: lmem=%d smem=%d regs=%d" % (
            plan.debug_name, func.local_size_bytes,
            func.shared_size_bytes, func.num_regs))

    mat_texref = mod.get_texref("mat_tex")
    texrefs = [mat_texref]

    func.prepare("PPPI", texrefs=texrefs)
    assert plan.aligned_image_dofs_per_microblock % chunk_size == 0
    block = (chunk_size,
             plan.parallelism.parallel,
             plan.aligned_image_dofs_per_microblock // chunk_size)

    return func, block, mat_texref
def create_native(self):
    from cgen import (ArrayOf, POD, Block, For, Statement, Struct)
    from cgen import dtype_to_ctype
    import numpy

    members = []
    code = []

    for pk, pv in config.parameters.items():
        if isinstance(pv, int):
            # numpy.int_ replaces the long-removed numpy.int alias
            members.append(POD(numpy.int_, pk))
            code.append(Statement(
                "params.%s = extract<%s>(cppdict[\"%s\"])"
                % (pk, dtype_to_ctype(numpy.int_), pk)))
        elif isinstance(pv, float):
            members.append(POD(numpy.float64, pk))
            code.append(Statement(
                "params.%s = extract<%s>(cppdict[\"%s\"])"
                % (pk, dtype_to_ctype(numpy.float64), pk)))
        elif isinstance(pv, list):
            if isinstance(pv[0], int):
                members.append(ArrayOf(POD(numpy.int_, pk), len(pv)))
                code.append(Block([
                    Statement("list v = extract<%s>(cppdict[\"%s\"])"
                              % (list.__name__, pk)),
                    For("unsigned int i = 0", "i<len(v)", "++i",
                        Statement("params.%s[i] = extract<%s>(v[i])"
                                  % (pk, dtype_to_ctype(numpy.int_)))),
                ]))
            elif isinstance(pv[0], float):
                members.append(ArrayOf(POD(numpy.float64, pk), len(pv)))
                code.append(Block([
                    Statement("list v = extract<%s>(cppdict[\"%s\"])"
                              % (list.__name__, pk)),
                    For("unsigned int i = 0", "i < len(v)", "++i",
                        Block([
                            Statement("params.%s[i] = extract<%s>(v[i])"
                                      % (pk, dtype_to_ctype(numpy.float64))),
                            Statement("//std::cout << params.%s[i] "
                                      "<< std::endl" % (pk))
                        ])),
                ]))

    mystruct = Struct('Parameters', members)
    mycode = Block(code)

    # print mystruct
    # print mycode

    from jinja2 import Template

    tpl = Template("""
    #include <boost/python.hpp>
    #include <boost/python/object.hpp>
    #include <boost/python/extract.hpp>
    #include <boost/python/list.hpp>
    #include <boost/python/dict.hpp>
    #include <boost/python/str.hpp>
    #include <stdexcept>
    #include <iostream>

    {{my_struct}}

    Parameters params;

    void CopyDictionary(boost::python::object pydict)
    {
        using namespace boost::python;
        extract< dict > cppdict_ext(pydict);
        if(!cppdict_ext.check()){
            throw std::runtime_error(
                "PassObj::pass_dict: type error: not a python dict.");
        }

        dict cppdict = cppdict_ext();
        list keylist = cppdict.keys();

        {{my_extractor}}
    }

    BOOST_PYTHON_MODULE({{my_module}})
    {
        boost::python::def("copy_dict", &CopyDictionary);
    }
    """)

    rendered_tpl = tpl.render(my_module="NativeParameters",
                              my_extractor=mycode,
                              my_struct=mystruct)
    # print rendered_tpl

    from codepy.toolchain import NVCCToolchain
    import codepy.toolchain
    kwargs = codepy.toolchain._guess_toolchain_kwargs_from_python_config()
    # print kwargs
    kwargs["cc"] = "nvcc"
    # kwargs["cflags"] = ["-m64", "-x", "cu", "-Xcompiler", "-fPIC",
    #                     "-ccbin", "/opt/local/bin/g++-mp-4.4"]
    kwargs["cflags"] = ["-m64", "-x", "cu", "-Xcompiler", "-fPIC"]
    kwargs["include_dirs"].append("/usr/local/cuda/include")
    kwargs["defines"] = []
    kwargs["ldflags"] = ["-shared"]
    # kwargs["libraries"] = ["python2.7"]
    kwargs["libraries"] = ["python2.6"]
    print(kwargs)
    toolchain = NVCCToolchain(**kwargs)

    from codepy.libraries import add_boost_python
    add_boost_python(toolchain)

    from codepy.jit import extension_from_string
    mymod = extension_from_string(toolchain, "NativeParameters",
                                  rendered_tpl)

    mymod.copy_dict(config.parameters)