def generate(self):
    """Generate and return the :class:`cgen.Module` holding the source
    code of the Boost.Python module.
    """
    from cgen import Block, Module, Include, Line, Define, \
            PrivateNamespace

    body = []

    if self.max_arity is not None:
        body.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

    if self.use_private_namespace:
        mod_body = [PrivateNamespace(self.mod_body)]
    else:
        mod_body = self.mod_body

    body += ([Include("boost/python.hpp")]
            + self.preamble + [Line()]
            + mod_body
            + [Line(), Line(f"BOOST_PYTHON_MODULE({self.name})")]
            + [Block(self.init_body)])

    return Module(body)
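# Hedged usage sketch (not part of the method above): how generate() is
# typically consumed.  The module name "_example" and the function "answer"
# are made-up illustrations; generate() returns a cgen.Module, and printing
# it renders the C++ source, including the BOOST_PYTHON_MODULE wrapper
# around init_body.
def _example_generate_usage():
    from cgen import Block, FunctionBody, FunctionDeclaration, Statement, Value
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule(name="_example")
    mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value("int", "answer"), []),
            Block([Statement("return 42")])))

    print(mod.generate())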
def get_direct_tex_mat_mul_code():
    return (
        [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
        + [POD(float_type, "lm"), Line()]
        + unroll(
            lambda j: [
                Assign(
                    "fof%d" % inl,
                    "fp_tex1Dfetch(in_vector_tex, "
                    "GLOBAL_MB_PREIMG_DOF_BASE"
                    " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)"
                    % {"j": j, "inl": inl, "row": "SEGMENT_DOF"},
                    )
                for inl in range(par.inline)
            ] + [
                Assign(
                    "lm",
                    "smem_matrix["
                    "%(row)s*MATRIX_COLUMNS + %(j)s]"
                    % {"j": j, "row": "SEGMENT_DOF"},
                    )
            ] + [
                S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                for inl in range(par.inline)
            ],
            total_number=self.plan.preimage_dofs_per_el,
            max_unroll=self.plan.max_unroll)
        + [Line()])
def get_batched_fetch_mat_mul_code(el_fetch_count):
    result = []
    dofs = range(self.plan.preimage_dofs_per_el)

    for load_segment_start in range(0, self.plan.preimage_dofs_per_el,
            self.plan.segment_size):
        result.extend(
            [S("__syncthreads()")]
            + [
                Assign(
                    "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                    "fp_tex1Dfetch(in_vector_tex, "
                    "GLOBAL_MB_PREIMG_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                    % (inl, load_segment_start))
                for inl in range(par.inline)
            ]
            + [
                S("__syncthreads()"),
                Line(),
            ])

        for dof in dofs[load_segment_start:load_segment_start
                + self.plan.segment_size]:
            for inl in range(par.inline):
                result.append(
                    S("result%d += "
                        "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                        "*"
                        "dof_buffer[PAR_MB_NR][%d][%d]"
                        % (inl, dof, inl, dof - load_segment_start)))
        result.append(Line())

    return result
def generate_c_instruction_code(codegen_state, insn):
    kernel = codegen_state.kernel

    if codegen_state.vectorization_info is not None:
        raise Unvectorizable("C instructions cannot be vectorized")

    body = []

    from loopy.target.c import POD
    from cgen import Initializer, Block, Line

    from pymbolic.primitives import Variable
    for name, iname_expr in insn.iname_exprs:
        if (isinstance(iname_expr, Variable)
                and name not in codegen_state.var_subst_map):
            # No need, the bare symbol will work
            continue

        body.append(
            Initializer(
                POD(codegen_state.ast_builder, kernel.index_dtype, name),
                codegen_state.expression_to_code_mapper(
                    iname_expr, prec=PREC_NONE, type_context="i")))

    if body:
        body.append(Line())

    body.extend(Line(line) for line in insn.code.split("\n"))

    return Block(body)
def make_codepy_module(self, toolchain, dtype): from codepy.libraries import add_codepy toolchain = toolchain.copy() add_codepy(toolchain) from cgen import (Value, Include, Statement, Typedef, FunctionBody, FunctionDeclaration, Block, Const, Line, POD, Initializer, CustomLoop) S = Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("vector"), Include("algorithm"), Include("hedge/base.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), Include("boost/numeric/ublas/io.hpp"), ]+self.get_cpu_extra_includes()) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Line(), ]) mod.add_function(FunctionBody( FunctionDeclaration(Value("void", "process_elements"), [ Const(Value("uniform_element_ranges", "ers")), Const(Value("numpy_vector<value_type>", "field")), Value("numpy_vector<value_type>", "result"), ]+self.get_cpu_extra_parameter_declarators()), Block([ Typedef(Value("numpy_vector<value_type>::iterator", "it_type")), Typedef(Value("numpy_vector<value_type>::const_iterator", "cit_type")), Line(), Initializer(Value("it_type", "result_it"), "result.begin()"), Initializer(Value("cit_type", "field_it"), "field.begin()"), Line() ]+self.get_cpu_extra_preamble()+[ Line(), CustomLoop( "BOOST_FOREACH(const element_range er, ers)", Block(self.get_cpu_per_element_code()) ) ]))) #print mod.generate() #toolchain = toolchain.copy() #toolchain.enable_debugging return mod.compile(toolchain)
def get_scalar_diff_code():
    code = []
    for inl in range(par.inline):
        for axis in dims:
            code.append(
                Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

    code.append(Line())

    def get_mat_entry(row, col, axis):
        return ("smem_diff_rst_mat["
                "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                " + %(col)s"
                "]" % {"row": row, "col": col, "axis": axis})

    tex_channels = ["x", "y", "z", "w"]

    from hedge.backends.cuda.tools import unroll
    code.extend(
        [POD(float_type, "field_value%d" % inl)
            for inl in range(par.inline)]
        + [Line()]
        + unroll(
            lambda j: [
                Assign(
                    "field_value%d" % inl,
                    "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                    "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                for inl in range(par.inline)
            ]
            + [Line()]
            + [
                S("d%drst%d += %s * field_value%d"
                    % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                for axis in dims
                for inl in range(par.inline)
            ]
            + [Line()],
            given.dofs_per_el(), self.plan.max_unroll))

    store_code = Block()
    for inl in range(par.inline):
        for rst_axis in dims:
            store_code.append(
                Assign(
                    "drst%d_global[GLOBAL_MB_DOF_BASE"
                    " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                    % (rst_axis, inl),
                    "d%drst%d" % (inl, rst_axis),
                    ))

    code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

    return code
def get_elwise_module_descriptor(arguments, operation, name="kernel"): from codepy.bpl import BoostPythonModule from cgen import FunctionBody, FunctionDeclaration, \ Value, POD, Struct, For, Initializer, Include, Statement, \ Line, Block S = Statement # noqa: N806 mod = BoostPythonModule() mod.add_to_preamble([ Include("pyublas/numpy.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace pyublas"), Line(), ]) body = Block([ Initializer( Value( "numpy_array<{} >::iterator".format(dtype_to_ctype( varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()") for varg in arguments if isinstance(varg, VectorArg) ] + [ Initializer(sarg.declarator(), f"args.{sarg.name}") for sarg in arguments if isinstance(sarg, ScalarArg) ]) body.extend([ Line(), For("unsigned i = 0", "i < codepy_length", "++i", Block([S(operation)])) ]) arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) mod.add_function( FunctionBody( FunctionDeclaration(Value("void", name), [ POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args") ]), body)) return mod
def mat_mul_outer_loop(fetch_count):
    if with_scaling:
        inv_jac_multiplier = (
            "fp_tex1Dfetch(scaling_tex,"
            "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
    else:
        inv_jac_multiplier = "1"

    write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
    if self.with_index_check:
        write_condition += " && GLOBAL_MB_NR < microblock_count"

    return For(
        "unsigned short seq_mb_number = 0",
        "seq_mb_number < SEQ_MB_COUNT",
        "++seq_mb_number",
        Block([
            Initializer(POD(float_type, "result%d" % inl), 0)
            for inl in range(par.inline)
        ]
        + [Line()]
        + get_mat_mul_code(fetch_count)
        + [
            If(write_condition,
                Block([
                    Assign(
                        "out_vector[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB"
                        " + MB_DOF]" % inl,
                        "result%d * %s"
                        % (inl, (inv_jac_multiplier % {"inl": inl})))
                    for inl in range(par.inline)
                ]))
        ]))
def get_load_code(dest, base, bytes, word_type=numpy.uint32, descr=None):
    from cgen import (
            Pointer, POD, Comment, Block, Line,
            Constant, For, Statement)
    from cgen import dtype_to_ctype

    copy_dtype = numpy.dtype(word_type)
    copy_dtype_str = dtype_to_ctype(copy_dtype)

    code = []
    if descr is not None:
        code.append(Comment(descr))

    code.extend([
        Block([
            Constant(Pointer(POD(copy_dtype, "load_base")),
                ("(%s *) (%s)" % (copy_dtype_str, base))),
            For("unsigned word_nr = THREAD_NUM",
                "word_nr*sizeof(int) < (%s)" % bytes,
                "word_nr += COALESCING_THREAD_COUNT",
                Statement("((%s *) (%s))[word_nr] = load_base[word_nr]"
                    % (copy_dtype_str, dest))
                ),
            ]),
        Line(),
        ])

    return code
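# Illustrative sketch, not from the original source: rendering the CUDA-side
# cooperative copy loop for a hypothetical 256-byte block copied into a shared
# buffer named "smem_data".  Only get_load_code() from above (and numpy) is
# assumed; the destination/base strings are placeholders.
def _example_load_code_render():
    from cgen import Module
    print(Module(get_load_code(
        dest="smem_data",
        base="gmem_data + blockIdx.x*256",
        bytes="256",
        descr="cooperatively copy one data block into shared memory")))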
def add_struct(self, struct, py_name=None,
        py_member_name_transform=lambda x: x, by_value_members=None):
    if by_value_members is None:
        by_value_members = set()

    from cgen import Block, Line, Statement, Typedef, Value

    if py_name is None:
        py_name = struct.tpname

    self.mod_body.append(struct)

    member_defs = []
    for f in struct.fields:
        py_f_name = py_member_name_transform(f.name)
        tp_lines, declarator = f.get_decl_pair()
        if f.name in by_value_members or tp_lines[0].startswith("numpy_"):
            member_defs.append(
                ".def(pyublas::by_value_rw_member"
                f'("{py_f_name}", &cl::{f.name}))')
        else:
            member_defs.append(
                f'.def_readwrite("{py_f_name}", &cl::{f.name})')

    self.init_body.append(
        Block([
            Typedef(Value(struct.tpname, "cl")),
            Line(),
            Statement(
                'boost::python::class_<cl>("{}"){}'.format(
                    py_name, "".join(member_defs))),
            ]))
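# Hypothetical example for the method above (struct name and fields are
# invented): wrap a small plain-old-data struct so that its members become
# readable/writable attributes on the Python side.  `mod` is assumed to be a
# BoostPythonModule instance.
def _example_add_struct(mod):
    import numpy
    from cgen import POD, Struct

    mod.add_struct(
        Struct("point2d", [
            POD(numpy.float64, "x"),
            POD(numpy.float64, "y"),
            ]),
        py_name="Point2D")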
def unroll(body_gen, total_number, max_unroll=None, start=0):
    from cgen import For, Line, Block
    from pytools import flatten

    if max_unroll is None:
        max_unroll = total_number

    result = []

    if total_number > max_unroll:
        loop_items = (total_number // max_unroll) * max_unroll

        result.extend([
            For("unsigned j = 0",
                "j < %d" % loop_items,
                "j += %d" % max_unroll,
                Block(list(flatten(
                    body_gen("(j+%d)" % i)
                    for i in range(max_unroll))))
                ),
            Line()
            ])
        start += loop_items

    result.extend(flatten(
        body_gen(i) for i in range(start, total_number)))

    return result
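# A small sketch of how unroll() behaves; the accumulator "acc" and array
# "src" are placeholders.  With total_number=10 and max_unroll=4, the helper
# emits one C for-loop covering indices 0..7 (unrolled four-fold per trip,
# using the symbolic index "(j+i)"), followed by literal statements for the
# remainder indices 8 and 9.
def _example_unroll():
    from cgen import Statement

    for node in unroll(
            lambda idx: [Statement("acc += src[%s]" % idx)],
            total_number=10, max_unroll=4):
        print(node)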
def get_load_code():
    mb_img_dofs = plan.aligned_image_dofs_per_microblock
    mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
    preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

    load_code = []
    store_code = []

    var_num = 0
    for load_block in range(preimg_dofs_over_dofs):
        for inl in range(par.inline):
            # load and store are split for better pipelining
            # compiler can't figure that out because of branch
            var = "tmp%d" % var_num
            var_num += 1

            load_code.append(POD(float_type, var))

            block_addr = ("%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF"
                    % load_block)
            load_instr = Assign(var,
                    "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + %s]" % (inl, block_addr))
            store_instr = Assign(
                    "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                    var
                    )
            if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                load_instr = If(cond, load_instr)
                store_instr = If(cond, store_instr)

            load_code.append(load_instr)
            store_code.append(store_instr)

    return Block(load_code + [Line()] + store_code)
def get_cpu_per_element_code(self):
    from cgen import (Value, Statement, Initializer, While,
            Comment, Block, For, Line, Pointer)
    S = Statement
    return [
        # assumes there is more than one coefficient
        Initializer(Value("cit_type", "el_modes"), "field_it+er.first"),
        Line(),

        Comment("zero out reduced_modes"),
        For("npy_uint32 mode_idx = 0", "mode_idx < max_degree+1",
            "++mode_idx",
            S("reduced_modes[mode_idx] = 0")),
        Line(),

        Comment("gather modes by degree"),
        For("npy_uint32 mode_idx = 0", "mode_idx < mode_count",
            "++mode_idx",
            S("reduced_modes[mode_degrees_iterator[mode_idx]]"
                " += el_modes[mode_idx]")),
        Line(),

        Comment("perform skyline procedure"),
        Initializer(Pointer(Value("value_type", "start")),
            "reduced_modes.get()"),
        Initializer(Pointer(Value("value_type", "end")),
            "start+max_degree+1"),
        Initializer(Value("value_type", "cur_max"),
            "std::max(*(end-1), *(end-2))"),
        Line(),

        While("end != start", Block([
            S("--end"),
            S("*end = std::max(cur_max, *end)"),
            ])),
        Line(),

        Comment("scatter modes by degree"),
        Initializer(Value("it_type", "tgt_base"), "result_it+er.first"),
        For("npy_uint32 mode_idx = 0", "mode_idx < mode_count",
            "++mode_idx",
            S("tgt_base[mode_idx] = "
                "reduced_modes[mode_degrees_iterator[mode_idx]]")),
        ]
def get_flux_code(flux_writer):
    flux_code = Block([])

    flux_code.extend([
        Initializer(Pointer(Value("face_pair", "fpair")),
            "data.facepairs+fpair_nr"),
        Initializer(
            MaybeUnused(POD(numpy.uint32, "a_index")),
            "fpair->a_base + tex1Dfetch(tex_index_lists, "
            "fpair->a_ilist_index + FACEDOF_NR)"),
        Initializer(
            MaybeUnused(POD(numpy.uint32, "b_index")),
            "fpair->b_base + tex1Dfetch(tex_index_lists, "
            "fpair->b_ilist_index + FACEDOF_NR)"),
        Line(),
        flux_writer(),
        Line(),
        S("fpair_nr += CONCURRENT_FACES")
        ])

    return flux_code
def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): kernel = codegen_state.kernel from cgen import ( FunctionBody, # Post-mid-2016 cgens have 'Collection', too. Module as Collection, Initializer, Line) result = [] from loopy.kernel.data import AddressSpace from loopy.schedule import CallKernel # We only need to write declarations for global variables with # the first device program. `is_first_dev_prog` determines # whether this is the first device program in the schedule. is_first_dev_prog = codegen_state.is_generating_device_code for i in range(schedule_index): if isinstance(kernel.schedule[i], CallKernel): is_first_dev_prog = False break if is_first_dev_prog: for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): if tv.address_space == AddressSpace.GLOBAL and ( tv.initializer is not None): assert tv.read_only decl_info, = tv.decl_info(self.target, index_dtype=kernel.index_dtype) decl = self.wrap_global_constant( self.get_temporary_decl( codegen_state, schedule_index, tv, decl_info)) if tv.initializer is not None: decl = Initializer(decl, generate_array_literal( codegen_state, tv, tv.initializer)) result.append(decl) fbody = FunctionBody(function_decl, function_body) if not result: return fbody else: return Collection(result+[Line(), fbody])
def expose_vector_type(self, name, py_name=None):
    self.add_codepy_include()

    if py_name is None:
        py_name = name

    from cgen import (Block, Typedef, Line, Statement, Value)

    self.init_body.append(
        Block([
            Typedef(Value(name, "cl")),
            Line(),
            Statement(
                f'boost::python::class_<cl>("{py_name}")'
                ".def(codepy::no_compare_indexing_suite<cl>())"),
            ]))
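# Hedged usage sketch for the helper above (the C++ type and Python-side
# names are illustrative): expose std::vector<double> as an indexable Python
# class.  `mod` is assumed to be a BoostPythonModule instance.
def _example_expose_vector_type(mod):
    from cgen import Include, Line, Typedef, Value

    mod.add_to_preamble([Include("vector"), Line()])
    mod.add_to_module([Typedef(Value("std::vector<double>", "dbl_vector"))])
    mod.expose_vector_type("dbl_vector", py_name="DoubleVector")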
def get_function_definition(self, codegen_state, codegen_result,
        schedule_index,
        function_decl, function_body):
    kernel = codegen_state.kernel

    from cgen import (
            FunctionBody,

            # Post-mid-2016 cgens have 'Collection', too.
            Module as Collection,
            Initializer,
            Line)

    result = []

    from loopy.kernel.data import temp_var_scope

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        if tv.scope == temp_var_scope.GLOBAL and tv.initializer is not None:
            assert tv.read_only

            decl_info, = tv.decl_info(self.target,
                    index_dtype=kernel.index_dtype)
            decl = self.wrap_global_constant(
                    self.get_temporary_decl(
                        codegen_state, schedule_index, tv,
                        decl_info))

            if tv.initializer is not None:
                decl = Initializer(decl, generate_array_literal(
                    codegen_state, tv, tv.initializer))

            result.append(decl)

    fbody = FunctionBody(function_decl, function_body)
    if not result:
        return fbody
    else:
        return Collection(result+[Line(), fbody])
def get_kernel(self, diff_op, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, Const, \ Module, FunctionDeclaration, FunctionBody, Block, \ Comment, Line, Define, Include, \ Initializer, If, For, Statement, Assign from pycuda.tools import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) plan = self.plan given = plan.given elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_diff_mat_smem"), [Pointer(POD(float_type, "debugbuf")), Pointer(POD(float_type, "field")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims] )) par = plan.parallelism cmod = Module([ Include("pycuda-helpers.hpp"), ]) if float_type == numpy.float64: cmod.append(Value("texture<fp_tex_double, 1, cudaReadModeElementType>", "diff_rst_mat_tex")) elif float_type == numpy.float32: rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod.append(Value("texture<float%d, 1, cudaReadModeElementType>" % rst_channels, "diff_rst_mat_tex")) else: raise ValueError("unsupported float type: %s" % float_type) # only preimage size variation is supported here assert plan.image_dofs_per_el == given.dofs_per_el() assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats # FIXME: aligned_image_dofs_per_microblock must be divisible # by this, therefore hardcoding for now. chunk_size = 16 cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el), Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el), Define("ALIGNED_IMAGE_DOFS_PER_MB", plan.aligned_image_dofs_per_microblock), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", plan.aligned_preimage_dofs_per_microblock), Define("ELS_PER_MB", given.microblock.elements), Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"), Line(), Define("CHUNK_SIZE", chunk_size), Define("CHUNK_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Define("CHUNK_NR", "threadIdx.z"), Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"), Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"), Line(), Define("MACROBLOCK_NR", "blockIdx.x"), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define("GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_IMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMAGE_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), CudaShared( ArrayOf( ArrayOf( ArrayOf( POD(float_type, "smem_field"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "ALIGNED_PREIMAGE_DOFS_PER_MB")), Line(), ]) S = Statement f_body = Block([ Initializer(Const(POD(numpy.uint16, "mb_el")), "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"), Line(), ]) # --------------------------------------------------------------------- def get_load_code(): mb_img_dofs = plan.aligned_image_dofs_per_microblock mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs load_code = [] store_code = [] var_num = 0 for load_block in range(preimg_dofs_over_dofs): for inl in range(par.inline): # load and store are split for better pipelining # compiler can't figure that out because of branch var = "tmp%d" % var_num var_num += 1 
load_code.append(POD(float_type, var)) block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block load_instr = Assign(var, "field[GLOBAL_MB_PREIMAGE_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + %s]" % (inl, block_addr)) store_instr = Assign( "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr), var ) if (load_block+1)*mb_img_dofs >= mb_preimg_dofs: cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr load_instr = If(cond, load_instr) store_instr = If(cond, store_instr) load_code.append(load_instr) store_code.append(store_instr) return Block(load_code + [Line()] + store_code) def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis) )) from hedge.backends.cuda.tools import unroll code.extend([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ]+[Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append(Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims) elif float_type == numpy.float64: result.extend( S("d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % { "inl": inl, "axis": axis, "j": j }) for inl in range(par.inline) for axis in dims) else: assert False return result code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) +[store_code]))) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code()) ) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule(cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=16"] ) func = mod.get_function("apply_diff_mat_smem") if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.registers) diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex") gpu_diffmats = self.gpu_diffmats(diff_op, elgroup) if given.float_type == numpy.float32: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels) elif given.float_type == numpy.float64: gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, allow_double_hack=True) else: assert False assert given.microblock.aligned_floats % chunk_size == 0 block = ( chunk_size, 
plan.parallelism.parallel, given.microblock.aligned_floats//chunk_size) func.prepare( ["PP"] + discr.dimensions*["P"], texrefs=[diff_rst_mat_texref]) return block, func
def make_cuda_kernel(self, discr, dtype, eg): given = discr.given ldis = eg.local_discretization microblocks_per_block = 1 from cgen.cuda import CudaGlobal from cgen import (Module, Value, Include, Typedef, FunctionBody, FunctionDeclaration, Const, Line, POD, LiteralBlock, Define, Pointer) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Typedef(POD(dtype, "value_type")), Line(), Define("DOFS_PER_EL", given.dofs_per_el()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("VERTICES_PER_EL", ldis.vertex_count()), Define("ELS_PER_MB", given.microblock.elements), Define("MBS_PER_BLOCK", microblocks_per_block), Line(), Define("DOF_IN_MB_IDX", "threadIdx.x"), Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"), Define("MB_IN_BLOCK_IDX", "threadIdx.y"), Define("BLOCK_IDX", "blockIdx.x"), Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"), Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")] + self.get_cuda_extra_preamble(discr, dtype, eg) + [FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "elwise_kernel"), [ Pointer(Const(POD(dtype, "field"))), Pointer(POD(dtype, "result")), POD(numpy.uint32, "mb_count"), ])), LiteralBlock(""" int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL; if (MB_NUMBER >= mb_count) return; int idx = MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX; int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL; int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL; __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1]; int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX; BLOCK_DATA[idx_in_block] = field[idx]; __syncthreads(); %s result[idx] = node_result; """ % self.get_cuda_code(discr, dtype, eg))) ]) if False: for i, l in enumerate(str(cmod).split("\n")): print i+1, l raw_input() from pycuda.compiler import SourceModule mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, ) func = mod.get_function("elwise_kernel") func.prepare( "PPI", block=( given.microblock.aligned_floats, microblocks_per_block, 1)) mb_count = len(discr.blocks) * discr.given.microblocks_per_block grid_dim = (mb_count + microblocks_per_block - 1) \ // microblocks_per_block from pytools import Record class KernelInfo(Record): pass return KernelInfo( func=func, grid_dim=grid_dim, mb_count=mb_count)
def get_kernel(self, with_scaling, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign, \ ArrayInitializer from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaConstant, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(numpy.uint8, "gmem_matrix")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ])) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "in_vector_tex"), ]) if with_scaling: cmod.append( Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "scaling_tex"), ) par = self.plan.parallelism cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", self.plan.aligned_preimage_dofs_per_microblock), Define("MB_EL_COUNT", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()), Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()), Define("MATRIX_SEGMENT_BYTES", "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()), Line(), CudaShared( ArrayOf(POD(float_type, "smem_matrix"), "MATRIX_SEGMENT_FLOATS")), CudaShared( ArrayOf( ArrayOf( ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ), CudaShared(POD(numpy.uint16, "segment_start_el")), CudaShared(POD(numpy.uint16, "segment_stop_el")), CudaShared(POD(numpy.uint16, "segment_el_count")), Line(), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"), "SEGMENTS_PER_MB")), [(chk * self.plan.segment_size) // given.dofs_per_el() for chk in range(self.plan.segments_per_microblock())]), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"), "SEGMENTS_PER_MB")), [ min(given.microblock.elements, (chk * self.plan.segment_size + self.plan.segment_size - 1) // given.dofs_per_el() + 1) for chk in range(self.plan.segments_per_microblock()) ]), ]) S = Statement f_body = Block() 
f_body.extend_log_block( "calculate this dof's element", [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If( "THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ])), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="smem_matrix", base=( "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") + [S("__syncthreads()")]) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend([S("__syncthreads()")] + [ Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start)) for inl in range(par.inline) ] + [ S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start + self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof - load_segment_start))) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign( "fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % { "j": j, "inl": inl, "row": "SEGMENT_DOF" }, ) for inl in range(par.inline) ] + [ Assign( "lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % { "j": j, "row": "SEGMENT_DOF" }, ) ] + [ S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ( "fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For( "unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ] + [Line()] + get_mat_mul_code(fetch_count) + [ If( write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % { "inl": inl }))) for inl in range(par.inline) ])) ])) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append( make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range( 1, 
self.plan.max_elements_touched_by_segment() + 1) ])) else: f_body.append(mat_mul_outer_loop(0)) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=12"] ) func = mod.get_function("apply_el_local_mat_smem_mat") if self.plan.debug_name in discr.debug: print "%s: lmem=%d smem=%d regs=%d" % ( self.plan.debug_name, func.local_size_bytes, func.shared_size_bytes, func.num_regs) in_vector_texref = mod.get_texref("in_vector_tex") texrefs = [in_vector_texref] if with_scaling: scaling_texref = mod.get_texref("scaling_tex") texrefs.append(scaling_texref) else: scaling_texref = None func.prepare("PPPI", block=(self.plan.segment_size, self.plan.parallelism.parallel, 1), texrefs=texrefs) return func, in_vector_texref, scaling_texref
def emit_multiple_assignment(self, codegen_state, insn): ecm = codegen_state.expression_to_code_mapper from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE func_id = insn.expression.function parameters = insn.expression.parameters if isinstance(func_id, Variable): func_id = func_id.name assignee_var_descriptors = [ codegen_state.kernel.get_var_descriptor(a) for a in insn.assignee_var_names() ] par_dtypes = tuple(ecm.infer_type(par) for par in parameters) str_parameters = None mangle_result = codegen_state.kernel.mangle_function( func_id, par_dtypes) if mangle_result is None: raise RuntimeError( "function '%s' unknown--" "maybe you need to register a function mangler?" % func_id) assert mangle_result.arg_dtypes is not None from loopy.expression import dtype_to_type_context str_parameters = [ ecm(par, PREC_NONE, dtype_to_type_context(self.target, tgt_dtype), tgt_dtype) for par, par_dtype, tgt_dtype in zip( parameters, par_dtypes, mangle_result.arg_dtypes) ] from loopy.codegen import SeenFunction codegen_state.seen_functions.add( SeenFunction(func_id, mangle_result.target_name, mangle_result.arg_dtypes)) for i, (a, tgt_dtype) in enumerate( zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): if tgt_dtype != ecm.infer_type(a): raise LoopyError("type mismatch in %d'th (1-based) left-hand " "side of instruction '%s'" % (i + 1, insn.id)) str_parameters.append( "&(%s)" % ecm(a, PREC_NONE, dtype_to_type_context( self.target, tgt_dtype), tgt_dtype)) result = "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) # In case of no assignees, we are done if len(mangle_result.result_dtypes) == 0: from cgen import Line return Line(result + ';') result = ecm.wrap_in_typecast(mangle_result.result_dtypes[0], assignee_var_descriptors[0].dtype, result) lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) from cgen import Assign return Assign(lhs_code, result)
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given par = self.plan.parallelism diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration( Value("void", "apply_diff_mat"), [ Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")), #Pointer(POD(float_type, "debugbuf")), ] + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims])) rst_channels = given.devdata.make_valid_tex_channel_count(d) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "field_tex"), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ELS_PER_MB", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Line(), Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats), Define("DIFFMAT_SEGMENT_BYTES", "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()), Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns), Line(), CudaShared( ArrayOf(POD(float_type, "smem_diff_rst_mat"), "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")), Line(), ]) S = Statement f_body = Block() f_body.extend_log_block("calculate responsibility data", [ Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"), ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_diff_rst_mat", base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES", bytes="DIFFMAT_SEGMENT_BYTES", descr="load diff mat segment") + [S("__syncthreads()"), Line()]) # --------------------------------------------------------------------- def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) def get_mat_entry(row, col, axis): return ("smem_diff_rst_mat[" "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL" " + %(col)s" "]" % { "row": row, "col": col, "axis": axis }) tex_channels = ["x", "y", "z", "w"] from hedge.backends.cuda.tools import unroll code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ] + [Line()] + unroll( lambda j: [ Assign( "field_value%d" % inl, "fp_tex1Dfetch(field_tex, 
GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB " "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)) for inl in range(par.inline) ] + [Line()] + [ S("d%drst%d += %s * field_value%d" % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl)) for axis in dims for inl in range(par.inline) ] + [Line()], given.dofs_per_el(), self.plan.max_unroll)) store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append( Assign( "drst%d_global[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), )) code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code)) return code f_body.extend([ For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block(get_scalar_diff_code())) ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("diff", ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=10"] ) field_texref = mod.get_texref("field_tex") func = mod.get_function("apply_diff_mat") func.prepare(discr.dimensions * [float_type] + ["P"], block=(self.plan.segment_size, par.parallel, 1), texrefs=[field_texref]) if "cuda_diff" in discr.debug: print "diff: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return func, field_texref
def emit_blank_line(self):
    from cgen import Line
    return Line()
def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import AddressSpace kernel = codegen_state.kernel base_storage_decls = [] temp_decls = [] # {{{ declare temporaries base_storage_sizes = {} base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line # Getting the temporary variables that are needed for the current # sub-kernel. from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( temporaries_read_in_subkernel(kernel, subkernel) | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations if tv.address_space != AddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), tv.address_space) if tv.initializer is not None: assert tv.read_only decl = Initializer(decl, generate_array_literal( codegen_state, tv, tv.initializer)) temp_decls.append(decl) else: assert tv.initializer is None offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append( tv.address_space) align_size = tv.dtype.itemsize from loopy.kernel.array import VectorArrayDimTag for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): if isinstance(dim_tag, VectorArrayDimTag): align_size *= axis_len base_storage_to_align_bytes.setdefault(tv.base_storage, []).append( align_size) for idi in decl_info: cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.address_space) if tv._base_storage_access_may_be_aliasing: ptrtype = _ConstPointer else: # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? ptrtype = _ConstRestrictPointer cast_decl = ptrtype(cast_decl) temp_var_decl = ptrtype(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, "(%s %s) (%s + %s)" % ( " ".join(cast_tp), cast_d, tv.base_storage, offset)) temp_decls.append(temp_var_decl) from pytools import product offset += ( idi.dtype.itemsize * product(si for si in idi.shape)) ecm = self.get_expression_to_code_mapper(codegen_state) for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( bs_var_decl, single_valued(base_storage_to_scope[bs_name])) # FIXME: Could try to use isl knowledge to simplify max. if all(isinstance(bs, int) for bs in bs_sizes): bs_size_max = max(bs_sizes) else: bs_size_max = p.Max(tuple(bs_sizes)) bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max)) alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) base_storage_decls.append(bs_var_decl) # }}} result = base_storage_decls + temp_decls if result: result.append(Line()) return result
def write_interior_flux_code(self, is_twosided): given = self.plan.given def get_field(flux_rec, is_interior, flipped): if is_interior ^ flipped: prefix = "a" else: prefix = "b" return ("val_%s_field%d" % (prefix, self.dep_to_index[flux_rec.field_expr])) flux_write_code = Block([]) flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)] if is_twosided: flux_var_decl.append( Initializer(POD(given.float_type, "b_flux"), 0)) prefixes = ["a", "b"] flip_values = [False, True] else: prefixes = ["a"] flip_values = [False] flux_write_code.append(Line()) for dep in self.interior_deps: flux_write_code.append(Comment(str(dep))) for side in ["a", "b"]: flux_write_code.append( Initializer( MaybeUnused( POD( given.float_type, "val_%s_field%d" % (side, self.dep_to_index[dep]))), "fp_tex1Dfetch(field%d_tex, %s_index)" % (self.dep_to_index[dep], side))) f2cm = FluxToCodeMapper(given.float_type) flux_sub_codes = [] for flux_nr, wdflux in enumerate(self.fluxes): my_flux_block = Block(flux_var_decl) for int_rec in wdflux.interiors: for prefix, is_flipped in zip(prefixes, flip_values): my_flux_block.append( Statement("%s_flux += %s" % ( prefix, flux_to_code(f2cm, is_flipped, int_rec.field_expr, int_rec.field_expr, self.dep_to_index, int_rec.flux_expr, PREC_NONE), ))) my_flux_block.append(Line()) my_flux_block.append( self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR", "fpair->face_jacobian*a_flux")) #my_flux_block.append( #Statement("if(isnan(val_b_field5)) debugbuf[blockIdx.x] = 1"), #) if is_twosided: my_flux_block.append( self.gen_store( flux_nr, "fpair->b_dest+tex1Dfetch(tex_index_lists, " "fpair->b_write_ilist_index + FACEDOF_NR)", "fpair->face_jacobian*b_flux")) #my_flux_block.append( #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"), #) flux_sub_codes.append(my_flux_block) if f2cm.cse_name_list: flux_write_code.append(Line()) flux_write_code.extend( Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list) flux_write_code.extend(flux_sub_codes) return flux_write_code
def write_boundary_flux_code(self, for_benchmark): given = self.plan.given flux_write_code = Block() fluxes_by_bdry_number = {} for flux_nr, wdflux in enumerate(self.fluxes): for bflux_info in wdflux.boundaries: if for_benchmark: bdry_number = 0 else: bdry_number = self.executor.boundary_tag_to_number[ bflux_info.bpair.tag] fluxes_by_bdry_number.setdefault(bdry_number, [])\ .append((flux_nr, bflux_info)) flux_write_code.extend([ Initializer(MaybeUnused(POD(given.float_type, "flux%d" % flux_nr)), 0) for flux_nr in range(len(self.fluxes)) ]) for bdry_number, nrs_and_fluxes in fluxes_by_bdry_number.iteritems(): bblock = [] from pytools import set_sum int_deps = set_sum(flux_rec.int_dependencies for flux_nr, flux_rec in nrs_and_fluxes) ext_deps = set_sum(flux_rec.ext_dependencies for flux_nr, flux_rec in nrs_and_fluxes) for dep in int_deps: bblock.extend([ Comment(str(dep)), Initializer( MaybeUnused( POD(given.float_type, "val_a_field%d" % self.dep_to_index[dep])), "fp_tex1Dfetch(field%d_tex, a_index)" % self.dep_to_index[dep]) ]) for dep in ext_deps: bblock.extend([ Comment(str(dep)), Initializer( MaybeUnused( POD(given.float_type, "val_b_field%d" % self.dep_to_index[dep])), "fp_tex1Dfetch(field%s_tex, b_index)" % self.dep_to_index[dep]) ]) f2cm = FluxToCodeMapper(given.float_type) comp_code = [Line()] for flux_nr, flux_rec in nrs_and_fluxes: comp_code.append( Statement( ("flux%d += " % flux_nr) + flux_to_code(f2cm, is_flipped=False, int_field_expr=flux_rec.bpair.field, ext_field_expr=flux_rec.bpair.bfield, dep_to_index=self.dep_to_index, flux=flux_rec.flux_expr, prec=PREC_NONE))) if f2cm.cse_name_list: bblock.append(Line()) bblock.extend( Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list) flux_write_code.extend([ Line(), Comment(nrs_and_fluxes[0][1].bpair.tag), If("(fpair->boundary_bitmap) & (1 << %d)" % (bdry_number), Block(bblock + comp_code)), ]) flux_write_code.extend( [ Line(), ] + [ self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR", "fpair->face_jacobian * flux%d" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #Assign("debugbuf[blockIdx.x*96+fpair_nr]", "10000+fpair->a_dest"), ) return flux_write_code
def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope kernel = codegen_state.kernel base_storage_decls = [] temp_decls = [] # {{{ declare temporaries base_storage_sizes = {} base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line for tv in sorted(six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations if tv.scope != temp_var_scope.GLOBAL: decl = self.wrap_temporary_decl( self.get_temporary_decl(kernel, schedule_index, tv, idi), tv.scope) if tv.initializer is not None: decl = Initializer( decl, generate_array_literal(codegen_state, tv, tv.initializer)) temp_decls.append(decl) else: assert tv.initializer is None offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append(tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append(tv.scope) align_size = tv.dtype.itemsize from loopy.kernel.array import VectorArrayDimTag for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): if isinstance(dim_tag, VectorArrayDimTag): align_size *= axis_len base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(align_size) for idi in decl_info: cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.scope) # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? cast_decl = _ConstRestrictPointer(cast_decl) temp_var_decl = _ConstRestrictPointer(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, "(%s %s) (%s + %s)" % (" ".join(cast_tp), cast_d, tv.base_storage, offset)) temp_decls.append(temp_var_decl) from pytools import product offset += (idi.dtype.itemsize * product(si for si in idi.shape)) for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( bs_var_decl, single_valued(base_storage_to_scope[bs_name])) bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) base_storage_decls.append(bs_var_decl) # }}} result = base_storage_decls + temp_decls if result: result.append(Line()) return result
def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) tex_channels = ["x", "y", "z", "w"] store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append(Assign( "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + " "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis) )) from hedge.backends.cuda.tools import unroll code.extend([ Comment("everybody needs to be done with the old data"), S("__syncthreads()"), Line(), get_load_code(), Line(), Comment("all the new data must be loaded"), S("__syncthreads()"), Line(), ]) if float_type == numpy.float32: code.append(Value("float%d" % rst_channels, "dmat_entries")) code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ]+[Line()]) def unroll_body(j): result = [ Assign("field_value%d" % inl, "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]" % (inl, j)) for inl in range(par.inline) ] if float_type == numpy.float32: result.append(Assign("dmat_entries", "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j)) result.extend( S("d%drst%d += dmat_entries.%s * field_value%d" % (inl, axis, tex_channels[axis], inl)) for inl in range(par.inline) for axis in dims) elif float_type == numpy.float64: result.extend( S("d%(inl)drst%(axis)d += " "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d " "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))" "* field_value%(inl)d" % { "inl": inl, "axis": axis, "j": j }) for inl in range(par.inline) for axis in dims) else: assert False return result code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el) +[store_code]))) return code
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? 
THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in 
discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
def get_boundary_flux_mod(fluxes, fvi, discr, dtype): from cgen import \ FunctionDeclaration, FunctionBody, Typedef, Struct, \ Const, Reference, Value, POD, MaybeUnused, \ Statement, Include, Line, Block, Initializer, Assign, \ CustomLoop, For from pytools import to_uncomplex_dtype, flatten from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("cstdlib"), Include("algorithm"), Line(), Include("boost/foreach.hpp"), Line(), Include("hedge/face_operators.hpp"), ]) S = Statement mod.add_to_module([ S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")), ]) arg_struct = Struct("arg_struct", [ Value("numpy_array<value_type>", "flux%d_on_faces" % i) for i in range(len(fluxes)) ]+[ Value("numpy_array<value_type>", arg_name) for arg_name in fvi.arg_names ]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) fdecl = FunctionDeclaration( Value("void", "gather_flux"), [ Const(Reference(Value("face_group<face_pair<straight_face> >" , "fg"))), Reference(Value("arg_struct", "args")) ]) from pymbolic.mapper.stringifier import PREC_PRODUCT def gen_flux_code(): f2cm = FluxToCodeMapper() result = [ Assign("fof%d_it[loc_fof_base+i]" % flux_idx, "uncomplex_type(fp.int_side.face_jacobian) * " + flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT)) for flux_idx, flux in enumerate(fluxes) ] return [ Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list] + result fbody = Block([ Initializer( Const(Value("numpy_array<value_type>::iterator", "fof%d_it" % i)), "args.flux%d_on_faces.begin()" % i) for i in range(len(fluxes)) ]+[ Initializer( Const(Value("numpy_array<value_type>::const_iterator", "%s_it" % arg_name)), "args.%s.begin()" % arg_name) for arg_name in fvi.arg_names ]+[ Line(), CustomLoop("BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)", Block( list(flatten([ Initializer(Value("node_number_t", "%s_ebi" % where), "fp.%s.el_base_index" % where), Initializer(Value("index_lists_t::const_iterator", "%s_idx_list" % where), "fg.index_list(fp.%s.face_index_list_number)" % where), Line(), ] for where in ["int_side", "ext_side"] ))+[ Line(), Initializer(Value("node_number_t", "loc_fof_base"), "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count" " + fp.%(where)s.face_id)" % {"where": "int_side"}), Line(), For( "unsigned i = 0", "i < fg.face_length()", "++i", Block( [ Initializer(MaybeUnused( Value("node_number_t", "%s_idx" % where)), "%(where)s_ebi + %(where)s_idx_list[i]" % {"where": where}) for where in ["int_side", "ext_side"] ]+gen_flux_code() ) ) ])) ]) mod.add_function(FunctionBody(fdecl, fbody)) #print "----------------------------------------------------------------" #print mod.generate() #raw_input("[Enter]") return mod.compile(get_flux_toolchain(discr, fluxes))