def _arg_names_and_decls(self, codegen_state): implemented_data_info = codegen_state.implemented_data_info arg_names = [iai.name for iai in implemented_data_info] arg_decls = [ self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in implemented_data_info ] # {{{ occa compatibility hackery from cgen import Value if self.target.occa_mode: from cgen import ArrayOf, Const from cgen.ispc import ISPCUniform arg_decls = [ Const(ISPCUniform(ArrayOf(Value("int", "loopy_dims")))), Const(ISPCUniform(Value("int", "o1"))), Const(ISPCUniform(Value("int", "o2"))), Const(ISPCUniform(Value("int", "o3"))), ] + arg_decls arg_names = ["loopy_dims", "o1", "o2", "o3"] + arg_names # }}} return arg_names, arg_decls
def _C_typedecl(self): fields = [] for i, j in self.pfields: if i == self._field_flag: fields.append(Initializer(Value('volatile %s' % ctypes_to_cstr(j), i), 1)) else: fields.append(Value(ctypes_to_cstr(j), i)) return Struct(self.pname, fields)
def __init__(self, dims, mode, name): if mode == "r": spec = "__read_only" elif mode == "w": spec = "__write_only" else: raise ValueError("mode must be one of 'r' or 'w'") Value.__init__(self, "%s image%dd_t" % (spec, dims), name)
def test_cgen(): s = Struct( "yuck", [ POD( np.float32, "h", ), POD(np.float32, "order"), POD(np.float32, "face_jacobian"), ArrayOf(POD(np.float32, "normal"), 17), POD(np.uint16, "a_base"), POD(np.uint16, "b_base"), #CudaGlobal(POD(np.uint8, "a_ilist_number")), POD(np.uint8, "b_ilist_number"), POD(np.uint8, "bdry_flux_number"), # 0 if not on boundary POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_decl = FunctionDeclaration(POD(np.uint16, "get_num"), [ POD(np.uint8, "reserved"), POD(np.uint32, "b_global_base"), ]) f_body = FunctionBody( f_decl, Block([ POD(np.uint32, "i"), For( "i = 0", "i < 17", "++i", If( "a > b", Assign("a", "b"), Block([ Assign("a", "b-1"), #Break(), ])), ), #BlankLine(), Comment("all done"), ])) t_decl = Template( 'typename T', FunctionDeclaration( Value('CUdeviceptr', 'scan'), [Value('CUdeviceptr', 'inputPtr'), Value('int', 'length')])) print(s) print(f_body) print(t_decl)
def get_elwise_module_descriptor(arguments, operation, name="kernel"): from codepy.bpl import BoostPythonModule from cgen import FunctionBody, FunctionDeclaration, \ Value, POD, Struct, For, Initializer, Include, Statement, \ Line, Block S = Statement # noqa: N806 mod = BoostPythonModule() mod.add_to_preamble([ Include("pyublas/numpy.hpp"), ]) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace pyublas"), Line(), ]) body = Block([ Initializer( Value( "numpy_array<{} >::iterator".format(dtype_to_ctype( varg.dtype)), varg.name), f"args.{varg.name}_ary.begin()") for varg in arguments if isinstance(varg, VectorArg) ] + [ Initializer(sarg.declarator(), f"args.{sarg.name}") for sarg in arguments if isinstance(sarg, ScalarArg) ]) body.extend([ Line(), For("unsigned i = 0", "i < codepy_length", "++i", Block([S(operation)])) ]) arg_struct = Struct("arg_struct", [arg.declarator() for arg in arguments]) mod.add_struct(arg_struct, "ArgStruct") mod.add_to_module([Line()]) mod.add_function( FunctionBody( FunctionDeclaration(Value("void", name), [ POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args") ]), body)) return mod
def get_function_declaration(self, codegen_state, codegen_result, schedule_index): name = codegen_result.current_program(codegen_state).name from cgen import (FunctionDeclaration, Value) from cgen.ispc import ISPCExport, ISPCTask arg_names, arg_decls = self._arg_names_and_decls(codegen_state) if codegen_state.is_generating_device_code: return ISPCTask(FunctionDeclaration(Value("void", name), arg_decls)) else: return ISPCExport( FunctionDeclaration(Value("void", name), arg_decls))
def make_codepy_module(self, toolchain, dtype): from codepy.libraries import add_codepy toolchain = toolchain.copy() add_codepy(toolchain) from cgen import (Value, Include, Statement, Typedef, FunctionBody, FunctionDeclaration, Block, Const, Line, POD, Initializer, CustomLoop) S = Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_to_preamble([ Include("vector"), Include("algorithm"), Include("hedge/base.hpp"), Include("hedge/volume_operators.hpp"), Include("boost/foreach.hpp"), Include("boost/numeric/ublas/io.hpp"), ]+self.get_cpu_extra_includes()) mod.add_to_module([ S("namespace ublas = boost::numeric::ublas"), S("using namespace hedge"), S("using namespace pyublas"), Line(), Typedef(POD(dtype, "value_type")), Line(), ]) mod.add_function(FunctionBody( FunctionDeclaration(Value("void", "process_elements"), [ Const(Value("uniform_element_ranges", "ers")), Const(Value("numpy_vector<value_type>", "field")), Value("numpy_vector<value_type>", "result"), ]+self.get_cpu_extra_parameter_declarators()), Block([ Typedef(Value("numpy_vector<value_type>::iterator", "it_type")), Typedef(Value("numpy_vector<value_type>::const_iterator", "cit_type")), Line(), Initializer(Value("it_type", "result_it"), "result.begin()"), Initializer(Value("cit_type", "field_it"), "field.begin()"), Line() ]+self.get_cpu_extra_preamble()+[ Line(), CustomLoop( "BOOST_FOREACH(const element_range er, ers)", Block(self.get_cpu_per_element_code()) ) ]))) #print mod.generate() #toolchain = toolchain.copy() #toolchain.enable_debugging return mod.compile(toolchain)
def cdef(self): """ Return a :class:`cgen.Struct` representing the profiler data structure in C (a ``struct``). """ return Struct('profiler', [Value('double', i.name) for i in self._sections])
def _C_typedecl(self): if self._is_composite_dtype: return Struct( self.pname, [Value(ctypes_to_cstr(j), i) for i, j in self.pfields]) else: return None
def add_struct(self, struct, py_name=None, py_member_name_transform=lambda x: x, by_value_members=None): if by_value_members is None: by_value_members = set() from cgen import Block, Line, Statement, Typedef, Value if py_name is None: py_name = struct.tpname self.mod_body.append(struct) member_defs = [] for f in struct.fields: py_f_name = py_member_name_transform(f.name) tp_lines, declarator = f.get_decl_pair() if f.name in by_value_members or tp_lines[0].startswith("numpy_"): member_defs.append( ".def(pyublas::by_value_rw_member" f'("{py_f_name}", &cl::{f.name}))') else: member_defs.append( f'.def_readwrite("{py_f_name}", &cl::{f.name})' ) self.init_body.append( Block([ Typedef(Value(struct.tpname, "cl")), Line(), Statement( 'boost::python::class_<cl>("{}"){}'.format( py_name, "".join(member_defs))), ]))
def get_cpu_per_element_code(self): from cgen import (Value, Statement, Initializer, While, Comment, Block, For, Line, Pointer) S = Statement return [ # assumes there is more than one coefficient Initializer(Value("cit_type", "el_modes"), "field_it+er.first"), Line(), Comment("zero out reduced_modes"), For("npy_uint32 mode_idx = 0", "mode_idx < max_degree+1", "++mode_idx", S("reduced_modes[mode_idx] = 0")), Line(), Comment("gather modes by degree"), For("npy_uint32 mode_idx = 0", "mode_idx < mode_count", "++mode_idx", S("reduced_modes[mode_degrees_iterator[mode_idx]]" " += el_modes[mode_idx]")), Line(), Comment("perform skyline procedure"), Initializer(Pointer(Value("value_type", "start")), "reduced_modes.get()"), Initializer(Pointer(Value("value_type", "end")), "start+max_degree+1"), Initializer(Value("value_type", "cur_max"), "std::max(*(end-1), *(end-2))"), Line(), While("end != start", Block([ S("--end"), S("*end = std::max(cur_max, *end)"), ])), Line(), Comment("scatter modes by degree"), Initializer(Value("it_type", "tgt_base"), "result_it+er.first"), For("npy_uint32 mode_idx = 0", "mode_idx < mode_count", "++mode_idx", S("tgt_base[mode_idx] = " "reduced_modes[mode_degrees_iterator[mode_idx]]")), ]
def get_function_declaration(self, codegen_state, codegen_result, schedule_index): from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name if self.target.fortran_abi: name += "_" if codegen_state.is_entrypoint: name = Value("void", name) else: name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration(name, [ self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info ]))
def make_it(name, is_const=True, tpname="value_type"): if is_const: const = "const_" else: const = "" return Initializer( Value("numpy_array<%s>::%siterator" % (tpname, const), name + "_it"), "%s.begin()" % name)
def _C_neighbours(self): """A :class:`ctypes.Struct` to access the neighborhood of a given rank.""" entries = list(product(self.dimensions, [LEFT, RIGHT])) fields = [('%s%s' % (d, i), c_int) for d, i in entries] obj = CompositeObject('nb', 'neighbours', Structure, fields) for d, i in entries: setattr(obj.value._obj, '%s%s' % (d, i), self.neighbours[d][i]) cdef = Struct('neighbours', [Value('int', i) for i, _ in fields]) CNeighbours = namedtuple('CNeighbours', 'ctype cdef obj') return CNeighbours(obj.dtype, cdef, obj)
def get_cpu_extra_preamble(self): from cgen import Initializer, Value, POD, Statement return [ Initializer(Value("numpy_array<npy_uint32>::const_iterator", "mode_degrees_iterator"), "mode_degrees.begin()"), Initializer(POD(numpy.uint32, "mode_count"), "mode_degrees.size()"), Statement("boost::scoped_array<value_type> reduced_modes" "(new value_type[max_degree+1])"), ]
def generate_top_of_body(self, codegen_state): from loopy.kernel.data import ImageArg if any(isinstance(arg, ImageArg) for arg in codegen_state.kernel.args): from cgen import Value, Const, Initializer return [ Initializer(Const(Value("sampler_t", "loopy_sampler")), "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " "| CLK_FILTER_NEAREST") ] return []
def get_cuda_extra_preamble(self, discr, dtype, eg): from cgen import ArrayOf, Value, Initializer from cgen.cuda import CudaConstant ldis = eg.local_discretization mode_degrees = [sum(mode_indices) for mode_indices in ldis.generate_mode_identifiers()] return [Initializer(CudaConstant( ArrayOf(Value("unsigned", "mode_degrees"))), "{%s}" % ", ".join(str(i) for i in mode_degrees)) ]
def gen_flux_code(): f2cm = FluxToCodeMapper() result = [ Assign("fof%d_it[loc_fof_base+i]" % flux_idx, "uncomplex_type(fp.int_side.face_jacobian) * " + flux_to_code(f2cm, False, flux_idx, fvi, flux.op.flux, PREC_PRODUCT)) for flux_idx, flux in enumerate(fluxes) ] return [ Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list] + result
def make_greet_mod(greeting): from cgen import FunctionBody, FunctionDeclaration, Block, \ Const, Pointer, Value, Statement from codepy.bpl import BoostPythonModule mod = BoostPythonModule() mod.add_function( FunctionBody( FunctionDeclaration(Const(Pointer(Value("char", "greet"))), []), Block([Statement('return "%s"' % greeting)]))) from codepy.toolchain import guess_toolchain return mod.compile(guess_toolchain(), wait_on_error=True)
def expose_vector_type(self, name, py_name=None): self.add_codepy_include() if py_name is None: py_name = name from cgen import (Block, Typedef, Line, Statement, Value) self.init_body.append( Block([ Typedef(Value(name, "cl")), Line(), Statement( f'boost::python::class_<cl>("{py_name}")' ".def(codepy::no_compare_indexing_suite<cl>())"), ]))
def gen_flux_code(): f2cm = FluxToCodeMapper() result = [ Assign("fof%d_it[%s_fof_base+%s]" % (flux_idx, where, tgt_idx), "uncomplex_type(fp.int_side.face_jacobian) * " + flux_to_code(f2cm, is_flipped, flux_idx, fvi, flux.op.flux, PREC_PRODUCT)) for flux_idx, flux in enumerate(fluxes) for where, is_flipped, tgt_idx in [ ("int_side", False, "i"), ("ext_side", True, "ext_native_write_map[i]") ]] return [ Initializer(Value("value_type", cse_name), cse_str) for cse_name, cse_str in f2cm.cse_name_list] + result
def _C_typedecl(self): # Overriding for better code readability # # Struct neighborhood Struct neighborhood # { { # int ll; int ll, lc, lr; # int lc; VS ... # int lr; ... # ... ... # } } # # With this override, we generate the one on the right groups = [list(g) for k, g in groupby(self.pfields, key=lambda x: x[0][0])] groups = [(j[0], i) for i, j in [zip(*g) for g in groups]] return Struct(self.pname, [Value(ctypes_to_cstr(i), ', '.join(j)) for i, j in groups])
def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code
def get_cpu_per_element_code(self): from cgen import (Value, Statement, Initializer, While, Block) S = Statement return [ # assumes there is more than one coefficient Initializer(Value("cit_type", "start"), "field_it+er.first"), Initializer(Value("cit_type", "end"), "field_it+er.second"), Initializer(Value("it_type", "tgt"), "result_it+er.first"), Initializer(Value("cit_type", "cur"), "start"), While("cur != end", Block([ Initializer(Value("cit_type", "avg_start"), "std::max(start, cur-1)"), Initializer(Value("cit_type", "avg_end"), "std::min(end, cur+2)"), S("*tgt++ = std::accumulate(avg_start, avg_end, value_type(0))" "/std::distance(avg_start, avg_end)"), S("++cur"), ]) ) ]
def _C_typedecl(self): return Struct(self.pname, [Value(ctypes_to_cstr(j), i) for i, j in self.pfields])
def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope kernel = codegen_state.kernel base_storage_decls = [] temp_decls = [] # {{{ declare temporaries base_storage_sizes = {} base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line for tv in sorted(six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations if tv.scope != temp_var_scope.GLOBAL: decl = self.wrap_temporary_decl( self.get_temporary_decl(kernel, schedule_index, tv, idi), tv.scope) if tv.initializer is not None: decl = Initializer( decl, generate_array_literal(codegen_state, tv, tv.initializer)) temp_decls.append(decl) else: assert tv.initializer is None offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append(tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append(tv.scope) align_size = tv.dtype.itemsize from loopy.kernel.array import VectorArrayDimTag for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): if isinstance(dim_tag, VectorArrayDimTag): align_size *= axis_len base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(align_size) for idi in decl_info: cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.scope) # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? cast_decl = _ConstRestrictPointer(cast_decl) temp_var_decl = _ConstRestrictPointer(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, "(%s %s) (%s + %s)" % (" ".join(cast_tp), cast_d, tv.base_storage, offset)) temp_decls.append(temp_var_decl) from pytools import product offset += (idi.dtype.itemsize * product(si for si in idi.shape)) for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( bs_var_decl, single_valued(base_storage_to_scope[bs_name])) bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) base_storage_decls.append(bs_var_decl) # }}} result = base_storage_decls + temp_decls if result: result.append(Line()) return result
a = numpy.random.randn(total_size).astype(dtype) b = numpy.random.randn(total_size).astype(dtype) a_gpu = cuda.to_device(a) b_gpu = cuda.to_device(b) c_gpu = cuda.mem_alloc(a.nbytes) from cgen import FunctionBody, \ FunctionDeclaration, Typedef, POD, Value, \ Pointer, Module, Block, Initializer, Assign from cgen.cuda import CudaGlobal mod = Module([ FunctionBody( CudaGlobal( FunctionDeclaration(Value("void", "add"), arg_decls=[ Pointer(POD(dtype, name)) for name in ["tgt", "op1", "op2"] ])), Block([ Initializer( POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" % (block_size * thread_strides)), ] + [ Assign( "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" % (o * block_size, o * block_size)) for o in range(thread_strides) ])) ])
def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import AddressSpace kernel = codegen_state.kernel base_storage_decls = [] temp_decls = [] # {{{ declare temporaries base_storage_sizes = {} base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line # Getting the temporary variables that are needed for the current # sub-kernel. from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( temporaries_read_in_subkernel(kernel, subkernel) | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( six.itervalues(kernel.temporary_variables), key=lambda tv: tv.name): decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype) if not tv.base_storage: for idi in decl_info: # global temp vars are mapped to arguments or global declarations if tv.address_space != AddressSpace.GLOBAL and ( tv.name in sub_knl_temps): decl = self.wrap_temporary_decl( self.get_temporary_decl( codegen_state, schedule_index, tv, idi), tv.address_space) if tv.initializer is not None: assert tv.read_only decl = Initializer(decl, generate_array_literal( codegen_state, tv, tv.initializer)) temp_decls.append(decl) else: assert tv.initializer is None offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) base_storage_to_scope.setdefault(tv.base_storage, []).append( tv.address_space) align_size = tv.dtype.itemsize from loopy.kernel.array import VectorArrayDimTag for dim_tag, axis_len in zip(tv.dim_tags, tv.shape): if isinstance(dim_tag, VectorArrayDimTag): align_size *= axis_len base_storage_to_align_bytes.setdefault(tv.base_storage, []).append( align_size) for idi in decl_info: cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space) temp_var_decl = self.wrap_temporary_decl( temp_var_decl, tv.address_space) if tv._base_storage_access_may_be_aliasing: ptrtype = _ConstPointer else: # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to # not use them to shovel data from one representation to the # other. That counts, right? ptrtype = _ConstRestrictPointer cast_decl = ptrtype(cast_decl) temp_var_decl = ptrtype(temp_var_decl) cast_tp, cast_d = cast_decl.get_decl_pair() temp_var_decl = Initializer( temp_var_decl, "(%s %s) (%s + %s)" % ( " ".join(cast_tp), cast_d, tv.base_storage, offset)) temp_decls.append(temp_var_decl) from pytools import product offset += ( idi.dtype.itemsize * product(si for si in idi.shape)) ecm = self.get_expression_to_code_mapper(codegen_state) for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) from pytools import single_valued bs_var_decl = self.wrap_temporary_decl( bs_var_decl, single_valued(base_storage_to_scope[bs_name])) # FIXME: Could try to use isl knowledge to simplify max. if all(isinstance(bs, int) for bs in bs_sizes): bs_size_max = max(bs_sizes) else: bs_size_max = p.Max(tuple(bs_sizes)) bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max)) alignment = max(base_storage_to_align_bytes[bs_name]) bs_var_decl = AlignedAttribute(alignment, bs_var_decl) base_storage_decls.append(bs_var_decl) # }}} result = base_storage_decls + temp_decls if result: result.append(Line()) return result
def make_cuda_kernel(self, discr, dtype, eg): given = discr.given ldis = eg.local_discretization microblocks_per_block = 1 from cgen.cuda import CudaGlobal from cgen import (Module, Value, Include, Typedef, FunctionBody, FunctionDeclaration, Const, Line, POD, LiteralBlock, Define, Pointer) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Typedef(POD(dtype, "value_type")), Line(), Define("DOFS_PER_EL", given.dofs_per_el()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("VERTICES_PER_EL", ldis.vertex_count()), Define("ELS_PER_MB", given.microblock.elements), Define("MBS_PER_BLOCK", microblocks_per_block), Line(), Define("DOF_IN_MB_IDX", "threadIdx.x"), Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"), Define("MB_IN_BLOCK_IDX", "threadIdx.y"), Define("BLOCK_IDX", "blockIdx.x"), Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"), Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")] + self.get_cuda_extra_preamble(discr, dtype, eg) + [FunctionBody( CudaGlobal(FunctionDeclaration( Value("void", "elwise_kernel"), [ Pointer(Const(POD(dtype, "field"))), Pointer(POD(dtype, "result")), POD(numpy.uint32, "mb_count"), ])), LiteralBlock(""" int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL; if (MB_NUMBER >= mb_count) return; int idx = MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX; int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL; int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL; __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1]; int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX; BLOCK_DATA[idx_in_block] = field[idx]; __syncthreads(); %s result[idx] = node_result; """ % self.get_cuda_code(discr, dtype, eg))) ]) if False: for i, l in enumerate(str(cmod).split("\n")): print i+1, l raw_input() from pycuda.compiler import SourceModule mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, ) func = mod.get_function("elwise_kernel") func.prepare( "PPI", block=( given.microblock.aligned_floats, microblocks_per_block, 1)) mb_count = len(discr.blocks) * discr.given.microblocks_per_block grid_dim = (mb_count + microblocks_per_block - 1) \ // microblocks_per_block from pytools import Record class KernelInfo(Record): pass return KernelInfo( func=func, grid_dim=grid_dim, mb_count=mb_count)
def get_cpu_extra_parameter_declarators(self): from cgen import Value, POD return [ Value("numpy_array<npy_uint32>", "mode_degrees"), POD(numpy.uint32, "max_degree")]