def generate(self):
    """Assemble the Boost.Python extension source as a cgen ``Module``.

    The returned module contains, in order: an optional
    ``BOOST_PYTHON_MAX_ARITY`` define (when ``self.max_arity`` is set),
    the ``boost/python.hpp`` include, the preamble, the module body
    (wrapped in a private namespace when ``self.use_private_namespace``
    is true) and finally the ``BOOST_PYTHON_MODULE`` block holding
    ``self.init_body``.
    """
    from cgen import Block, Define, Include, Line, Module, PrivateNamespace

    pieces = []
    if self.max_arity is not None:
        pieces.append(Define("BOOST_PYTHON_MAX_ARITY", self.max_arity))

    pieces.append(Include("boost/python.hpp"))
    pieces.extend(self.preamble)
    pieces.append(Line())

    if self.use_private_namespace:
        pieces.append(PrivateNamespace(self.mod_body))
    else:
        pieces.extend(self.mod_body)

    pieces.append(Line())
    pieces.append(Line(f"BOOST_PYTHON_MODULE({self.name})"))
    pieces.append(Block(self.init_body))

    return Module(pieces)
def add_raw_function_include(self):
    """Add ``boost/python/raw_function.hpp`` to the preamble, at most once.

    ``self.has_raw_function_include`` records whether the include has
    already been added; later calls are no-ops.
    """
    if not self.has_raw_function_include:
        from cgen import Include

        header = Include("boost/python/raw_function.hpp")
        self.add_to_preamble([header])
        self.has_raw_function_include = True
def add_codepy_include(self):
    """Add ``codepy/bpl.hpp`` to the preamble, at most once.

    ``self.has_codepy_include`` records whether the include has already
    been added; later calls are no-ops.
    """
    if not self.has_codepy_include:
        from cgen import Include

        header = Include("codepy/bpl.hpp")
        self.add_to_preamble([header])
        self.has_codepy_include = True
def make_codepy_module(self, toolchain, dtype):
    """Generate and compile the Boost.Python module for the CPU code path.

    The module exposes one function, ``process_elements(ers, field,
    result, ...)``, whose body iterates over ``ers`` with
    ``BOOST_FOREACH`` and runs the statements returned by
    ``self.get_cpu_per_element_code()``.  Extra includes, parameters and
    preamble statements come from the ``self.get_cpu_extra_*`` hooks.

    :arg toolchain: codepy toolchain; a copy is extended with the codepy
        library settings, the argument itself is not modified here.
    :arg dtype: scalar type used for the ``value_type`` typedef.
    :returns: whatever ``BoostPythonModule.compile`` returns.
    """
    from codepy.libraries import add_codepy

    toolchain = toolchain.copy()
    add_codepy(toolchain)

    from cgen import (Value, Include, Statement, Typedef, FunctionBody,
            FunctionDeclaration, Block, Const, Line, POD, Initializer,
            CustomLoop)
    from codepy.bpl import BoostPythonModule

    mod = BoostPythonModule()

    fixed_headers = (
        "vector",
        "algorithm",
        "hedge/base.hpp",
        "hedge/volume_operators.hpp",
        "boost/foreach.hpp",
        "boost/numeric/ublas/io.hpp",
    )
    mod.add_to_preamble(
        [Include(header) for header in fixed_headers]
        + self.get_cpu_extra_includes())

    mod.add_to_module([
        Statement("namespace ublas = boost::numeric::ublas"),
        Statement("using namespace hedge"),
        Statement("using namespace pyublas"),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Line(),
    ])

    vector_type = "numpy_vector<value_type>"

    # Signature of process_elements: fixed arguments first, then whatever
    # the subclass hook contributes.
    parameters = [
        Const(Value("uniform_element_ranges", "ers")),
        Const(Value(vector_type, "field")),
        Value(vector_type, "result"),
    ] + self.get_cpu_extra_parameter_declarators()
    declaration = FunctionDeclaration(
        Value("void", "process_elements"), parameters)

    # Iterator typedefs and iterators over the result/field vectors.
    statements = [
        Typedef(Value(vector_type + "::iterator", "it_type")),
        Typedef(Value(vector_type + "::const_iterator", "cit_type")),
        Line(),
        Initializer(Value("it_type", "result_it"), "result.begin()"),
        Initializer(Value("cit_type", "field_it"), "field.begin()"),
        Line(),
    ]
    statements = statements + self.get_cpu_extra_preamble()
    statements.append(Line())
    statements.append(CustomLoop(
        "BOOST_FOREACH(const element_range er, ers)",
        Block(self.get_cpu_per_element_code())))

    mod.add_function(FunctionBody(declaration, Block(statements)))

    return mod.compile(toolchain)
def get_elwise_module_descriptor(arguments, operation, name="kernel"):
    """Describe a Boost.Python module applying *operation* element-wise.

    The generated function ``name(codepy_length, args)`` first binds one
    local per argument (an iterator for every ``VectorArg``, a plain
    value for every ``ScalarArg``) and then runs *operation* inside a
    ``for`` loop over ``i`` in ``[0, codepy_length)``.

    :arg arguments: sequence of argument descriptors; each must provide
        ``declarator()`` and ``name``, vector arguments also ``dtype``.
    :arg operation: C++ statement text (without trailing semicolon)
        executed once per index ``i``.
    :arg name: name of the generated C++ function.
    :returns: the (uncompiled) ``BoostPythonModule``.
    """
    from codepy.bpl import BoostPythonModule
    from cgen import FunctionBody, FunctionDeclaration, \
            Value, POD, Struct, For, Initializer, Include, Statement, \
            Line, Block

    mod = BoostPythonModule()
    mod.add_to_preamble([Include("pyublas/numpy.hpp")])
    mod.add_to_module([
        Statement("namespace ublas = boost::numeric::ublas"),
        Statement("using namespace pyublas"),
        Line(),
    ])

    # Vector arguments become iterators into the corresponding array
    # member of the argument struct.
    vector_bindings = []
    for arg in arguments:
        if isinstance(arg, VectorArg):
            ctype = dtype_to_ctype(arg.dtype)
            vector_bindings.append(Initializer(
                Value(f"numpy_array<{ctype} >::iterator", arg.name),
                f"args.{arg.name}_ary.begin()"))

    # Scalar arguments are copied out of the argument struct.
    scalar_bindings = []
    for arg in arguments:
        if isinstance(arg, ScalarArg):
            scalar_bindings.append(
                Initializer(arg.declarator(), f"args.{arg.name}"))

    loop = For("unsigned i = 0", "i < codepy_length", "++i",
               Block([Statement(operation)]))
    body = Block(vector_bindings + scalar_bindings + [Line(), loop])

    members = [arg.declarator() for arg in arguments]
    mod.add_struct(Struct("arg_struct", members), "ArgStruct")
    mod.add_to_module([Line()])

    declaration = FunctionDeclaration(
        Value("void", name),
        [POD(numpy.uintp, "codepy_length"), Value("arg_struct", "args")])
    mod.add_function(FunctionBody(declaration, body))

    return mod
def get_cpu_extra_includes(self):
    """Return the additional ``#include`` directives for the CPU module.

    :returns: a list with a single cgen ``Include`` for
        ``boost/scoped_array.hpp``.
    """
    from cgen import Include

    extra_headers = ("boost/scoped_array.hpp",)
    return [Include(header) for header in extra_headers]
def make_cuda_kernel(self, discr, dtype, eg):
    """Generate, compile and prepare the element-wise CUDA kernel.

    The kernel ``elwise_kernel(field, result, mb_count)`` copies one
    value per thread from ``field`` into the shared array
    ``whole_block``, synchronizes, runs the code returned by
    ``self.get_cuda_code(discr, dtype, eg)`` (which must leave its
    result in ``node_result``) and stores ``node_result`` to ``result``.

    :arg discr: discretization; ``given``, ``blocks`` and ``debug`` are
        read from it.
    :arg dtype: scalar type of ``field``/``result`` (``value_type``).
    :arg eg: element group; only ``local_discretization`` is read here.
    :returns: a ``pytools.Record`` with attributes ``func`` (the
        prepared kernel), ``grid_dim`` and ``mb_count``.
    """
    given = discr.given
    ldis = eg.local_discretization

    microblocks_per_block = 1

    from cgen.cuda import CudaGlobal
    from cgen import (Module, Value, Include, Typedef, FunctionBody,
            FunctionDeclaration, Const, Line, POD, LiteralBlock,
            Define, Pointer)

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Line(),
        Define("DOFS_PER_EL", given.dofs_per_el()),
        Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
        Define("VERTICES_PER_EL", ldis.vertex_count()),
        Define("ELS_PER_MB", given.microblock.elements),
        Define("MBS_PER_BLOCK", microblocks_per_block),
        Line(),
        Define("DOF_IN_MB_IDX", "threadIdx.x"),
        Define("DOF_IN_EL_IDX", "(DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL)"),
        Define("MB_IN_BLOCK_IDX", "threadIdx.y"),
        Define("BLOCK_IDX", "blockIdx.x"),
        Define("MB_NUMBER", "(BLOCK_IDX * MBS_PER_BLOCK + MB_IN_BLOCK_IDX)"),
        Define("BLOCK_DATA", "whole_block[MB_IN_BLOCK_IDX]")]
        + self.get_cuda_extra_preamble(discr, dtype, eg)
        + [FunctionBody(
            CudaGlobal(FunctionDeclaration(
                Value("void", "elwise_kernel"),
                [
                    Pointer(Const(POD(dtype, "field"))),
                    Pointer(POD(dtype, "result")),
                    POD(numpy.uint32, "mb_count"),
                ])),
            LiteralBlock("""
                int el_idx_in_mb = DOF_IN_MB_IDX / DOFS_PER_EL;

                if (MB_NUMBER >= mb_count)
                  return;

                int idx = MB_NUMBER * ALIGNED_DOFS_PER_MB + DOF_IN_MB_IDX;
                int element_base_idx = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX +
                    (DOF_IN_MB_IDX / DOFS_PER_EL) * DOFS_PER_EL;
                int dof_in_element = DOF_IN_MB_IDX-el_idx_in_mb*DOFS_PER_EL;

                __shared__ value_type whole_block[MBS_PER_BLOCK][ALIGNED_DOFS_PER_MB+1];
                int idx_in_block = ALIGNED_DOFS_PER_MB * MB_IN_BLOCK_IDX + DOF_IN_MB_IDX;
                BLOCK_DATA[idx_in_block] = field[idx];

                __syncthreads();

                %s

                result[idx] = node_result;
                """ % self.get_cuda_code(discr, dtype, eg)))
        ])

    # Debugging aid, disabled by default: dump the generated source with
    # line numbers and wait for <Enter>.  Written with function-call
    # syntax so the module also parses on Python 3.
    if False:
        for line_no, line in enumerate(str(cmod).split("\n"), start=1):
            print("%d %s" % (line_no, line))
        input()

    from pycuda.compiler import SourceModule
    mod = SourceModule(
        cmod,
        keep="cuda_keep_kernels" in discr.debug,
    )

    func = mod.get_function("elwise_kernel")
    # Two pointers and one unsigned int, matching the declaration above.
    func.prepare(
        "PPI",
        block=(
            given.microblock.aligned_floats,
            microblocks_per_block, 1))

    mb_count = len(discr.blocks) * discr.given.microblocks_per_block
    # Ceiling division: enough blocks to cover every microblock.
    grid_dim = (mb_count + microblocks_per_block - 1) \
        // microblocks_per_block

    from pytools import Record

    class KernelInfo(Record):
        pass

    return KernelInfo(
        func=func,
        grid_dim=grid_dim,
        mb_count=mb_count)
def _cusp_solver(M, parameters):
    """Return a compiled CUSP linear-solver module for *M* and *parameters*.

    Modules are cached in ``_cusp_cache`` under a key made of
    ``M.ctype`` and the solver options read below, so the code
    generation and compilation only happen on a cache miss.

    :arg M: matrix-like object; only ``M.ctype`` (the C value type name)
        is used here.
    :arg parameters: mapping with the keys ``ksp_type``, ``pc_type``,
        ``ksp_rtol``, ``ksp_atol``, ``ksp_max_it``, ``ksp_gmres_restart``
        and ``ksp_monitor``.
    :raises RuntimeError: if ``pc_type`` or ``ksp_type`` has no CUSP
        counterpart in the tables below.
    """
    option_names = ('ksp_type', 'pc_type', 'ksp_rtol', 'ksp_atol',
                    'ksp_max_it', 'ksp_gmres_restart', 'ksp_monitor')
    key = (M.ctype,) + tuple(parameters[opt] for opt in option_names)

    cached = _cusp_cache.get(key)
    if cached:
        return cached

    import codepy.toolchain
    from cgen import FunctionBody, FunctionDeclaration
    from cgen import Block, Statement, Include, Value
    from codepy.bpl import BoostPythonModule
    from codepy.cuda import CudaModule

    gcc_toolchain = codepy.toolchain.guess_toolchain()
    nvcc_toolchain = codepy.toolchain.guess_nvcc_toolchain()
    if 'CUSP_HOME' in os.environ:
        nvcc_toolchain.add_library('cusp', [os.environ['CUSP_HOME']], [], [])

    host_mod = BoostPythonModule()
    nvcc_mod = CudaModule(host_mod)

    device_headers = (
        'thrust/device_vector.h',
        'thrust/fill.h',
        'cusp/csr_matrix.h',
        'cusp/krylov/cg.h',
        'cusp/krylov/bicgstab.h',
        'cusp/krylov/gmres.h',
        'cusp/precond/diagonal.h',
        'cusp/precond/smoothed_aggregation.h',
        'cusp/precond/ainv.h',
        'string',
    )
    nvcc_mod.add_to_preamble([Include(header) for header in device_headers])
    nvcc_mod.add_to_preamble([Statement('using namespace std')])

    # PETSc preconditioner names mapped onto CUSP preconditioner objects.
    diag = Statement(
        'cusp::precond::diagonal< ValueType, cusp::device_memory >M(A)')
    ainv = Statement(
        'cusp::precond::scaled_bridson_ainv< ValueType, cusp::device_memory >M(A)')
    amg = Statement(
        'cusp::precond::smoothed_aggregation< IndexType, ValueType, cusp::device_memory >M(A)')
    none = Statement(
        'cusp::identity_operator< ValueType, cusp::device_memory >M(nrows, ncols)')
    preconditioners = {
        'diagonal': diag,
        'jacobi': diag,
        'ainv': ainv,
        'ainvcusp': ainv,
        'amg': amg,
        'hypre': amg,
        'none': none,
        None: none,
    }
    try:
        precond_call = preconditioners[parameters['pc_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support preconditioner type %s" %
                           parameters['pc_type'])

    # Krylov solver names mapped onto CUSP solver calls.
    solvers = {
        'cg': Statement('cusp::krylov::cg(A, x, b, monitor, M)'),
        'bicgstab': Statement('cusp::krylov::bicgstab(A, x, b, monitor, M)'),
        'gmres': Statement(
            'cusp::krylov::gmres(A, x, b, %(ksp_gmres_restart)d, monitor, M)'
            % parameters),
    }
    try:
        solve_call = solvers[parameters['ksp_type']]
    except KeyError:
        raise RuntimeError("Cusp does not support solver type %s" %
                           parameters['ksp_type'])

    monitor = 'monitor(b, %(ksp_max_it)d, %(ksp_rtol)g, %(ksp_atol)g)' % parameters
    monitor_kind = 'verbose' if parameters['ksp_monitor'] else 'default'

    # Device-side entry point: wrap the raw pointers in CUSP views, zero
    # the solution vector and run the selected preconditioner and solver.
    pointer_args = ('_rowptr', '_colidx', '_csrdata', '_b', '_x')
    size_args = ('nrows', 'ncols', 'nnz')
    device_params = [Value('CUdeviceptr', arg) for arg in pointer_args]
    device_params += [Value('int', arg) for arg in size_args]

    setup_lines = (
        'typedef int IndexType',
        'typedef %s ValueType' % M.ctype,
        'typedef typename cusp::array1d_view< thrust::device_ptr<IndexType> > indices',
        'typedef typename cusp::array1d_view< thrust::device_ptr<ValueType> > values',
        'typedef cusp::csr_matrix_view< indices, indices, values, IndexType, ValueType, cusp::device_memory > matrix',
        'thrust::device_ptr< IndexType > rowptr((IndexType *)_rowptr)',
        'thrust::device_ptr< IndexType > colidx((IndexType *)_colidx)',
        'thrust::device_ptr< ValueType > csrdata((ValueType *)_csrdata)',
        'thrust::device_ptr< ValueType > d_b((ValueType *)_b)',
        'thrust::device_ptr< ValueType > d_x((ValueType *)_x)',
        'indices row_offsets(rowptr, rowptr + nrows + 1)',
        'indices column_indices(colidx, colidx + nnz)',
        'values matrix_values(csrdata, csrdata + nnz)',
        'values b(d_b, d_b + nrows)',
        'values x(d_x, d_x + ncols)',
        'thrust::fill(x.begin(), x.end(), (ValueType)0)',
        'matrix A(nrows, ncols, nnz, row_offsets, column_indices, matrix_values)',
        'cusp::%s_monitor< ValueType > %s' % (monitor_kind, monitor),
    )
    device_body = [Statement(line) for line in setup_lines]
    device_body += [precond_call, solve_call]

    nvcc_function = FunctionBody(
        FunctionDeclaration(Value('void', '__cusp_solve'), device_params),
        Block(device_body))

    host_mod.add_to_preamble(
        [Include('boost/python/extract.hpp'), Include('string')])
    host_mod.add_to_preamble([Statement('using namespace boost::python')])
    host_mod.add_to_preamble([Statement('using namespace std')])

    nvcc_mod.add_function(nvcc_function)

    # Host-side wrapper: pull the device pointers (the ``gpudata``
    # attribute) and the sizes out of the Python objects and forward them.
    pointer_names = ('rowptr', 'colidx', 'csrdata', 'b', 'x')
    host_params = [
        Value('object', '_' + arg) for arg in pointer_names + size_args]
    host_body = [
        Statement('CUdeviceptr %s = extract<CUdeviceptr>(_%s.attr("gpudata"))'
                  % (arg, arg))
        for arg in pointer_names]
    host_body += [
        Statement('int %s = extract<int>(_%s)' % (arg, arg))
        for arg in size_args]
    host_body.append(Statement(
        '__cusp_solve(rowptr, colidx, csrdata, b, x, nrows, ncols, nnz)'))

    host_mod.add_function(
        FunctionBody(
            FunctionDeclaration(Value('void', 'solve'), host_params),
            Block(host_body)))

    nvcc_toolchain.cflags.extend(['-arch', 'sm_20', '-O3'])
    module = nvcc_mod.compile(gcc_toolchain, nvcc_toolchain,
                              debug=configuration["debug"])

    _cusp_cache[key] = module
    return module
def get_kernel(self, diff_op, elgroup, for_benchmark=False):
    """Generate, compile and prepare the shared-memory differentiation kernel.

    The kernel ``apply_diff_mat_smem`` loads field data into the shared
    array ``smem_field``, multiplies it with the differentiation matrix
    entries fetched from the texture ``diff_rst_mat_tex`` and writes one
    result per reference axis to the ``drst<axis>_global`` arrays.

    :arg diff_op: differentiation operator, passed on to
        ``self.gpu_diffmats``.
    :arg elgroup: ignored on input; it is rebound below to the single
        element group of the discretization.
    :arg for_benchmark: if true, the generated source is never dumped to
        a debug file.
    :returns: a tuple ``(block, func)`` of the thread-block shape and
        the prepared kernel function.
    :raises ValueError: if ``given.float_type`` is neither float32 nor
        float64.
    """
    from cgen import \
            Pointer, POD, Value, ArrayOf, Const, \
            Module, FunctionDeclaration, FunctionBody, Block, \
            Comment, Line, Define, Include, \
            Initializer, If, For, Statement, Assign

    from cgen.cuda import CudaShared, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    plan = self.plan
    given = plan.given

    # Unpacking fails unless there is exactly one element group.
    elgroup, = discr.element_groups
    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_diff_mat_smem"),
        [
            Pointer(POD(float_type, "debugbuf")),
            Pointer(POD(float_type, "field")),
        ]
        + [Pointer(POD(float_type, "drst%d_global" % i)) for i in dims]
        ))

    par = plan.parallelism

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        ])

    if float_type == numpy.float64:
        cmod.append(Value(
            "texture<fp_tex_double, 1, cudaReadModeElementType>",
            "diff_rst_mat_tex"))
    elif float_type == numpy.float32:
        # rst_channels is only defined (and only used) for float32.
        rst_channels = given.devdata.make_valid_tex_channel_count(d)
        cmod.append(Value(
            "texture<float%d, 1, cudaReadModeElementType>" % rst_channels,
            "diff_rst_mat_tex"))
    else:
        raise ValueError("unsupported float type: %s" % float_type)

    # only preimage size variation is supported here
    assert plan.image_dofs_per_el == given.dofs_per_el()
    assert plan.aligned_image_dofs_per_microblock == given.microblock.aligned_floats

    # FIXME: aligned_image_dofs_per_microblock must be divisible
    # by this, therefore hardcoding for now.
    chunk_size = 16

    cmod.extend([
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
        Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
        Define("ALIGNED_IMAGE_DOFS_PER_MB",
               plan.aligned_image_dofs_per_microblock),
        Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
               plan.aligned_preimage_dofs_per_microblock),
        Define("ELS_PER_MB", given.microblock.elements),
        Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*ELS_PER_MB)"),
        Line(),
        Define("CHUNK_SIZE", chunk_size),
        Define("CHUNK_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Define("CHUNK_NR", "threadIdx.z"),
        Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
        Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
        Line(),
        Define("MACROBLOCK_NR", "blockIdx.x"),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("GLOBAL_MB_NR_BASE",
               "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        Define("GLOBAL_MB_NR",
               "(GLOBAL_MB_NR_BASE"
               "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_IMAGE_DOF_BASE",
               "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
        Define("GLOBAL_MB_PREIMAGE_DOF_BASE",
               "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
        Line(),
        CudaShared(
            ArrayOf(
                ArrayOf(
                    ArrayOf(
                        POD(float_type, "smem_field"),
                        "PAR_MB_COUNT"),
                    "INLINE_MB_COUNT"),
                "ALIGNED_PREIMAGE_DOFS_PER_MB")),
        Line(),
        ])

    S = Statement
    f_body = Block([
        Initializer(Const(POD(numpy.uint16, "mb_el")),
                    "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
        Line(),
        ])

    # ---------------------------------------------------------------------
    def get_load_code():
        """Return the block copying field data from ``field`` to ``smem_field``."""
        mb_img_dofs = plan.aligned_image_dofs_per_microblock
        mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
        # Ceiling division: number of passes needed to cover the preimage.
        preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

        load_code = []
        store_code = []

        var_num = 0
        for load_block in range(preimg_dofs_over_dofs):
            for inl in range(par.inline):
                # load and store are split for better pipelining
                # compiler can't figure that out because of branch

                var = "tmp%d" % var_num
                var_num += 1
                load_code.append(POD(float_type, var))

                block_addr = "%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF" % load_block
                load_instr = Assign(
                    var,
                    "field[GLOBAL_MB_PREIMAGE_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + %s]" % (inl, block_addr))
                store_instr = Assign(
                    "smem_field[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                    var
                    )
                # Only the last pass can run past the end of the
                # preimage, so only that one gets a bounds check.
                if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                    cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                    load_instr = If(cond, load_instr)
                    store_instr = If(cond, store_instr)

                load_code.append(load_instr)
                store_code.append(store_instr)

        return Block(load_code + [Line()] + store_code)

    def get_scalar_diff_code():
        """Return the statements forming the body of the ``seq_mb_number`` loop."""
        code = []
        for inl in range(par.inline):
            for axis in dims:
                code.append(
                    Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

        code.append(Line())

        tex_channels = ["x", "y", "z", "w"]

        store_code = Block()
        for inl in range(par.inline):
            for rst_axis in dims:
                store_code.append(Assign(
                    "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                    "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                    % (rst_axis, inl),
                    "d%drst%d" % (inl, rst_axis)
                    ))

        from hedge.backends.cuda.tools import unroll
        code.extend([
            Comment("everybody needs to be done with the old data"),
            S("__syncthreads()"),
            Line(),
            get_load_code(),
            Line(),
            Comment("all the new data must be loaded"),
            S("__syncthreads()"),
            Line(),
            ])

        if float_type == numpy.float32:
            code.append(Value("float%d" % rst_channels, "dmat_entries"))

        code.extend([
            POD(float_type, "field_value%d" % inl)
            for inl in range(par.inline)
            ]+[Line()])

        def unroll_body(j):
            result = [
                Assign("field_value%d" % inl,
                       "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]"
                       % (inl, j))
                for inl in range(par.inline)
                ]

            if float_type == numpy.float32:
                # One texture fetch delivers the entries for all axes.
                result.append(Assign(
                    "dmat_entries",
                    "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)" % j))
                result.extend(
                    S("d%drst%d += dmat_entries.%s * field_value%d"
                      % (inl, axis, tex_channels[axis], inl))
                    for inl in range(par.inline)
                    for axis in dims)
            elif float_type == numpy.float64:
                result.extend(
                    S("d%(inl)drst%(axis)d += "
                      "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                      "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                      "* field_value%(inl)d" % {
                          "inl": inl,
                          "axis": axis,
                          "j": j
                      })
                    for inl in range(par.inline)
                    for axis in dims)
            else:
                assert False

            return result

        code.append(If(
            "IMAGE_MB_DOF < IMAGE_DOFS_PER_MB",
            Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el)
                  + [store_code])))

        return code

    f_body.extend([
        For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number",
            Block(get_scalar_diff_code())
            )
        ])

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file("diff", ".cu").write(str(cmod))

    mod = SourceModule(
        cmod,
        keep="cuda_keep_kernels" in discr.debug,
        #options=["--maxrregcount=16"]
        )

    func = mod.get_function("apply_diff_mat_smem")

    if "cuda_diff" in discr.debug:
        # Function-call form of print with a single pre-formatted
        # argument; register count read via ``num_regs`` to match the
        # other new-style attribute names used here.
        print("diff: lmem=%d smem=%d regs=%d" % (
            func.local_size_bytes,
            func.shared_size_bytes,
            func.num_regs))

    diff_rst_mat_texref = mod.get_texref("diff_rst_mat_tex")
    gpu_diffmats = self.gpu_diffmats(diff_op, elgroup)

    if given.float_type == numpy.float32:
        gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref, rst_channels)
    elif given.float_type == numpy.float64:
        gpu_diffmats.bind_to_texref_ext(diff_rst_mat_texref,
                                        allow_double_hack=True)
    else:
        assert False

    assert given.microblock.aligned_floats % chunk_size == 0
    block = (
        chunk_size,
        plan.parallelism.parallel,
        given.microblock.aligned_floats//chunk_size)

    # debugbuf and field pointers, then one pointer per dimension.
    func.prepare(
        ["PP"] + discr.dimensions*["P"],
        texrefs=[diff_rst_mat_texref])

    return block, func
def _init_lr_correction_libs(self):
    """Generate and build the C++ library for the ``lr_self_interaction`` routine.

    The C++ source is assembled from generated fragments (the expansion
    assignments, the per-offset constants and the per-offset energy
    terms) plus a fixed template, wrapped in a header of ``#define``
    constants and a trailer of matching ``#undef`` lines.

    Sets the following attributes:

    * ``self._lr_si_lib``: the ``lr_self_interaction`` entry of the
      library returned by ``lib.build.simple_lib_creator``.
    * ``self.lib_sl_source``: the full generated source.
    * ``self.lib_sl_parameters`` / ``self.lib_sl_call``: parameter
      declarations and call text for
      ``LR_SI::lr_self_interaction_inner``; both are reset to empty when
      ``self.boundary_condition`` is not ``BCType.PBC``.
    """
    sph_gen = self.sph_gen

    def _re_lm(l, m):
        # Linear index of the (l, m) coefficient in the "real" half of a
        # coefficient vector; the "imaginary" half uses the same index
        # plus self.L**2 (see the offsets below).
        return l**2 + l + m

    # Body of the generated vector_diff(): accumulate charge * radius**l
    # times the symbolic expressions from sph_gen into MULTIPOLE/DOT_VEC.
    assign_gen = 'double rhol = charge;\n'
    for lx in range(self.L):
        for mx in range(-lx, lx+1):

            res, ims = sph_gen.get_y_sym(lx, -mx)
            offset = _re_lm(lx, mx)

            assign_gen += ''.join(['MULTIPOLE[{}] += {} * rhol;\n'.format(*args) for args in (
                    (offset, str(res)),
                    (offset + self.L**2, str(ims))
                )
            ])

            res, ims = sph_gen.get_y_sym(lx, mx)
            # note the sign flip on the second ("imaginary") component
            assign_gen += ''.join(['DOT_VEC[{}] += {} * rhol;\n'.format(*args) for args in (
                    (offset, str(res)),
                    (offset + self.L**2, '-1.0 * ' + str(ims))
                )
            ])

        # rhol holds charge * radius**lx; advance it once per lx
        assign_gen += 'rhol *= radius;\n'

    # Offsets taken from the solver's interaction lists.
    # NOTE(review): presumably the 27 nearest periodic images (the names
    # below say "27") -- confirm against self.solver.il.
    co = self.solver.il[1]
    new27direct = 0.0
    ex = self.domain.extent
    for ox in co:
        # image of old pos
        dox = np.array((ex[0] * ox[0], ex[1] * ox[1], ex[2] * ox[2]))
        # skip the zero offset, whose distance would be 0
        if ox != (0,0,0):
            new27direct -= 1.0 / np.linalg.norm(dox)

    # Per-offset C++ fragments: the offset vector scaled by the domain
    # extent (EX/EY/EZ) and the inverse distance between the offset old
    # position and the new position, summed into energy27.
    offset_consts = ''
    bc27 = ''

    for oxi, ox in enumerate(co):

        offset_consts += '''
        const REAL dox{oxi} = EX * {OX};
        const REAL doy{oxi} = EY * {OY};
        const REAL doz{oxi} = EZ * {OZ};
        '''.format(
            oxi=str(oxi),
            OX=str(ox[0]),
            OY=str(ox[1]),
            OZ=str(ox[2]),
        )

        bc27 += '''
        const REAL dpx{oxi} = dox{oxi} + opx;
        const REAL dpy{oxi} = doy{oxi} + opy;
        const REAL dpz{oxi} = doz{oxi} + opz;
        const REAL ddx{oxi} = dpx{oxi} - npx;
        const REAL ddy{oxi} = dpy{oxi} - npy;
        const REAL ddz{oxi} = dpz{oxi} - npz;
        const REAL o_bbp{oxi} = 1.0 / sqrt(ddx{oxi}*ddx{oxi} + ddy{oxi}*ddy{oxi} + ddz{oxi}*ddz{oxi});
        energy27 += o_bbp{oxi};
        '''.format(
            oxi=str(oxi)
        )

    # C++ template. Literal braces are doubled because the text goes
    # through str.format(); the {SPH_GEN}, {ASSIGN_GEN}, {OFFSET_CONSTS}
    # and {BC27} placeholders receive the fragments generated above.
    src = r'''
    namespace LR_SI {{

        static inline REAL apply_dipole_correction_split(
            const REAL * RESTRICT M,
            const REAL * RESTRICT E
        ){{
            REAL tmp = 0.0;

            tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1P1];
            tmp += (DIPOLE_SX * M[RE_1P1]) * E[RE_1N1];

            tmp -= (DIPOLE_SY * M[IM_1P1]) * E[IM_1P1];
            tmp += (DIPOLE_SY * M[IM_1P1]) * E[IM_1N1];

            tmp += (DIPOLE_SZ * M[RE_1_0]) * E[RE_1_0];

            return tmp;
        }}

        static inline REAL linop_csr_both(
            const REAL * RESTRICT linop_data,
            const INT64 * RESTRICT linop_indptr,
            const INT64 * RESTRICT linop_indices,
            const REAL * RESTRICT x1,
            const REAL * RESTRICT E
        ){{
            INT64 data_ind = 0;
            REAL dot_tmp = 0.0;

            for(INT64 row=0 ; row<HALF_NCOMP ; row++){{

                REAL row_tmp_1 = 0.0;
                REAL row_tmp_2 = 0.0;

                for(INT64 col_ind=linop_indptr[row] ; col_ind<linop_indptr[row+1] ; col_ind++){{
                    const INT64 col = linop_indices[data_ind];
                    const REAL data = linop_data[data_ind];
                    data_ind++;
                    row_tmp_1 += data * x1[col];
                    row_tmp_2 += data * x1[col + HALF_NCOMP];
                }}

                dot_tmp += row_tmp_1 * E[row] + row_tmp_2 * E[row + HALF_NCOMP];
            }}

            return dot_tmp;
        }}

        static inline void vector_diff(
            const REAL dx,
            const REAL dy,
            const REAL dz,
            const REAL charge,
            REAL * MULTIPOLE,
            REAL * DOT_VEC
        ){{
            const double xy2 = dx * dx + dy * dy;
            const double radius = sqrt(xy2 + dz * dz);
            const double theta = atan2(sqrt(xy2), dz);
            const double phi = atan2(dy, dx);

            {SPH_GEN}
            {ASSIGN_GEN}
        }}

        static inline REAL lr_energy_diff(
            const INT64 accept_flag,
            const REAL * RESTRICT old_position,
            const REAL * RESTRICT new_position,
            const REAL charge,
            const REAL old_energy,
            REAL * RESTRICT existing_multipole,
            REAL * RESTRICT existing_evector,
            const REAL * RESTRICT linop_data,
            const INT64 * RESTRICT linop_indptr,
            const INT64 * RESTRICT linop_indices
        ){{

            REAL mvector[NCOMP];
            REAL evector[NCOMP];

            // copy the existing vectors
            for(int ix=0 ; ix<NCOMP ; ix++){{
                mvector[ix] = existing_multipole[ix];
                evector[ix] = existing_evector[ix];
            }}

            // remove the old contribution
            vector_diff(old_position[0], old_position[1], old_position[2], -1.0 * charge, mvector, evector);

            // add the new contribution
            vector_diff(new_position[0], new_position[1], new_position[2], charge, mvector, evector);

            // cheap way to reuse this code for accepts
            if (accept_flag > 0){{
                for(int ix=0 ; ix<NCOMP ; ix++){{
                    existing_multipole[ix] = mvector[ix];
                    existing_evector[ix] = evector[ix];
                }}
            }}

            // apply the long range linear operator and get the new energy (minus dipole correction)
            REAL new_energy = 0.5 * linop_csr_both(
                linop_data, linop_indptr, linop_indices,
                mvector,
                evector
            );

            // add the dipole correction
            new_energy += 0.5 * apply_dipole_correction_split(
                mvector,
                evector
            );

            return new_energy - old_energy;
        }}

        static inline REAL self_contributon(
            const REAL * RESTRICT old_position,
            const REAL * RESTRICT new_position,
            const REAL charge
        ){{
            const REAL opx = old_position[0];
            const REAL opy = old_position[1];
            const REAL opz = old_position[2];

            const REAL npx = new_position[0];
            const REAL npy = new_position[1];
            const REAL npz = new_position[2];

            {OFFSET_CONSTS}

            REAL energy27 = (DOMAIN_27_ENERGY);

            {BC27}

            return energy27 * charge * charge;
        }}

        int lr_self_interaction_inner(
            const INT64 accept_flag,
            const REAL * RESTRICT old_position,
            const REAL * RESTRICT new_position,
            const REAL charge,
            const REAL old_energy,
            REAL * RESTRICT existing_multipole,
            REAL * RESTRICT existing_evector,
            const REAL * RESTRICT linop_data,
            const INT64 * RESTRICT linop_indptr,
            const INT64 * RESTRICT linop_indices,
            REAL * RESTRICT return_energy,
            REAL * RESTRICT TIME_TAKEN
        ){{
            std::chrono::high_resolution_clock::time_point _loop_timer_t0 = std::chrono::high_resolution_clock::now();

            REAL tmpu0 = lr_energy_diff(accept_flag, old_position, new_position, charge, old_energy,
                existing_multipole, existing_evector, linop_data, linop_indptr, linop_indices);

            REAL tmpu1 = (accept_flag < 1) ? self_contributon(old_position, new_position, charge) : 0.0 ;

            *return_energy = -1.0 * tmpu0 + tmpu1;

            std::chrono::high_resolution_clock::time_point _loop_timer_t1 = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double> _loop_timer_res = _loop_timer_t1 - _loop_timer_t0;
            *TIME_TAKEN = (double) _loop_timer_res.count();

            return 0;
        }}

    }}

    extern "C" int lr_self_interaction(
        const INT64 accept_flag,
        const REAL * RESTRICT old_position,
        const REAL * RESTRICT new_position,
        const REAL charge,
        const REAL old_energy,
        REAL * RESTRICT existing_multipole,
        REAL * RESTRICT existing_evector,
        const REAL * RESTRICT linop_data,
        const INT64 * RESTRICT linop_indptr,
        const INT64 * RESTRICT linop_indices,
        REAL * RESTRICT return_energy,
        REAL * RESTRICT TIME_TAKEN
    ) {{
        return LR_SI::lr_self_interaction_inner(
            accept_flag,
            old_position,
            new_position,
            charge,
            old_energy,
            existing_multipole,
            existing_evector,
            linop_data,
            linop_indptr,
            linop_indices,
            return_energy,
            TIME_TAKEN
        );
    }}
    '''.format(
        SPH_GEN=str(sph_gen.module),
        ASSIGN_GEN=str(assign_gen),
        OFFSET_CONSTS=str(offset_consts),
        BC27=str(bc27)
    )

    # Includes plus the compile-time constants the template refers to.
    # Every Define below has a matching #undef in header_post.
    header = str(
        Module((
            Include('omp.h'),
            Include('stdio.h'),
            Include('math.h'),
            Include('chrono'),
            Define('INT64', 'int64_t'),
            Define('REAL', 'double'),
            Define('NCOMP', str(self.ncomp)),
            Define('HALF_NCOMP', str(self.L**2)),
            Define('DIPOLE_SX', str(self.lrc.dipole_correction[0])),
            Define('DIPOLE_SY', str(self.lrc.dipole_correction[1])),
            Define('DIPOLE_SZ', str(self.lrc.dipole_correction[2])),
            Define('RE_1P1', str(_re_lm(1, 1))),
            Define('RE_1_0', str(_re_lm(1, 0))),
            Define('RE_1N1', str(_re_lm(1,-1))),
            Define('IM_1P1', str(_re_lm(1, 1) + self.L**2)),
            Define('IM_1_0', str(_re_lm(1, 0) + self.L**2)),
            Define('DOMAIN_27_ENERGY', str(new27direct)),
            Define('IM_1N1', str(_re_lm(1,-1) + self.L**2)),
            Define('EX', self.domain.extent[0]),
            Define('EY', self.domain.extent[1]),
            Define('EZ', self.domain.extent[2]),
        ))
    )

    # INT64 and REAL are deliberately not undefined here: the parameter
    # declarations in lib_sl_parameters below still use them.
    header_post = '''
    #undef NCOMP
    #undef HALF_NCOMP
    #undef DIPOLE_SX
    #undef DIPOLE_SY
    #undef DIPOLE_SZ
    #undef RE_1P1
    #undef RE_1_0
    #undef RE_1N1
    #undef IM_1P1
    #undef IM_1_0
    #undef DOMAIN_27_ENERGY
    #undef IM_1N1
    #undef EX
    #undef EY
    #undef EZ
    '''

    src = header + src + header_post

    self._lr_si_lib = lib.build.simple_lib_creator(header_code='', src_code=src)['lr_self_interaction']

    # Source, parameter declarations and call text exposed as attributes.
    # NOTE(review): presumably consumed by code that inlines this routine
    # into another generated library -- confirm against the users of
    # lib_sl_source / lib_sl_parameters / lib_sl_call.
    self.lib_sl_source = src
    self.lib_sl_parameters = [
        'const INT64 LR_SI_accept_flag',
        'const REAL * RESTRICT LR_SI_old_position',
        'const REAL * RESTRICT LR_SI_new_position',
        'const REAL LR_SI_charge',
        'const REAL LR_SI_old_energy',
        ' REAL * RESTRICT LR_SI_existing_multipole',
        ' REAL * RESTRICT LR_SI_existing_evector',
        'const REAL * RESTRICT LR_SI_linop_data',
        'const INT64 * RESTRICT LR_SI_linop_indptr',
        'const INT64 * RESTRICT LR_SI_linop_indices',
        ' REAL * RESTRICT LR_SI_return_energy',
        ' REAL * RESTRICT TIME_TAKEN',
    ]
    self.lib_sl_call = '''
    LR_SI::lr_self_interaction_inner(
        LR_SI_accept_flag,
        LR_SI_old_position,
        LR_SI_new_position,
        LR_SI_charge,
        LR_SI_old_energy,
        LR_SI_existing_multipole,
        LR_SI_existing_evector,
        LR_SI_linop_data,
        LR_SI_linop_indptr,
        LR_SI_linop_indices,
        LR_SI_return_energy,
        TIME_TAKEN
    );
    '''

    # Without periodic boundary conditions no parameters or call text are
    # exposed (lib_sl_source is left in place).
    if not (self.boundary_condition == BCType.PBC):
        self.lib_sl_parameters = []
        self.lib_sl_call = ''
def get_kernel(self, fdata, ilist_data, for_benchmark):
    """Generate, compile and prepare the ``apply_flux`` CUDA kernel.

    The kernel loads a block of face-pair data into the shared variable
    ``data``, evaluates the flux code for same-block interior,
    cross-block interior and boundary face pairs and, unless the plan
    uses direct store, copies the fluxes from ``smem_fluxes_on_faces``
    to the ``gmem_fluxes_on_faces<n>`` arrays.

    :arg fdata: face data description; ``block_bytes`` and ``struct``
        are used.
    :arg ilist_data: index list description; ``code``,
        ``device_memory``, ``bytes`` and ``type`` are used.
    :arg for_benchmark: passed to ``write_boundary_flux_code``; if true
        the generated source is also never dumped to a debug file.
    :returns: a tuple ``(block, func, expr_to_texture_map)`` of the
        thread-block shape, the prepared kernel and a dict mapping each
        dependency expression to its texture reference.
    """
    from cgen.cuda import CudaShared, CudaGlobal
    from pycuda.tools import dtype_to_ctype

    discr = self.discr
    given = self.plan.given
    fplan = self.plan

    # Unpacking fails unless there is exactly one element group.
    elgroup, = discr.element_groups

    float_type = given.float_type

    f_decl = CudaGlobal(
        FunctionDeclaration(Value("void", "apply_flux"), [
            Pointer(POD(float_type, "debugbuf")),
            Pointer(POD(numpy.uint8, "gmem_facedata")),
        ] + [
            Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
            for flux_nr in range(len(self.fluxes))
        ]))

    cmod = Module()
    cmod.append(Include("pycuda-helpers.hpp"))

    # One texture per field the fluxes depend on.
    for dep_expr in self.all_deps:
        cmod.extend([
            Value(
                "texture<%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                "field%d_tex" % self.dep_to_index[dep_expr])
        ])

    if fplan.flux_count != len(self.fluxes):
        from warnings import warn
        warn(
            "Flux count in flux execution plan different from actual flux count.\n"
            "You may want to specify the tune_for= kwarg in the Discretization\n"
            "constructor.")

    cmod.extend([
        Line(),
        Typedef(POD(float_type, "value_type")),
        Line(),
        flux_header_struct(float_type, discr.dimensions),
        Line(),
        face_pair_struct(float_type, discr.dimensions),
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("DOFS_PER_FACE", fplan.dofs_per_face),
        Define("THREADS_PER_FACE", fplan.threads_per_face()),
        Line(),
        Define("CONCURRENT_FACES", fplan.parallel_faces),
        Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
        Line(),
        Define("FACEDOF_NR", "threadIdx.x"),
        Define("BLOCK_FACE", "threadIdx.y"),
        Line(),
        Define("FLUX_COUNT", len(self.fluxes)),
        Line(),
        Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
        Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
        Define(
            "COALESCING_THREAD_COUNT",
            "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
        Line(),
        Define("DATA_BLOCK_SIZE", fdata.block_bytes),
        Define("ALIGNED_FACE_DOFS_PER_MB",
               fplan.aligned_face_dofs_per_microblock()),
        Define("ALIGNED_FACE_DOFS_PER_BLOCK",
               "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
        Line(),
        Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
        Line(),
    ] + ilist_data.code + [
        Line(),
        Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
              "tex_index_lists"),
        Line(),
        fdata.struct,
        Line(),
        CudaShared(Value("flux_data", "data")),
    ])

    if not fplan.direct_store:
        cmod.extend([
            CudaShared(
                ArrayOf(
                    ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                    "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
            Line(),
        ])

    S = Statement
    f_body = Block()

    from hedge.backends.cuda.tools import get_load_code

    f_body.extend(
        get_load_code(dest="&data",
                      base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                      bytes="sizeof(flux_data)",
                      descr="load face_pair data")
        + [S("__syncthreads()"), Line()])

    def get_flux_code(flux_writer):
        """Return one face-pair loop body built around ``flux_writer()``."""
        flux_code = Block([])

        flux_code.extend([
            Initializer(Pointer(Value("face_pair", "fpair")),
                        "data.facepairs+fpair_nr"),
            Initializer(
                MaybeUnused(POD(numpy.uint32, "a_index")),
                "fpair->a_base + tex1Dfetch(tex_index_lists, "
                "fpair->a_ilist_index + FACEDOF_NR)"),
            Initializer(
                MaybeUnused(POD(numpy.uint32, "b_index")),
                "fpair->b_base + tex1Dfetch(tex_index_lists, "
                "fpair->b_ilist_index + FACEDOF_NR)"),
            Line(),
            flux_writer(),
            Line(),
            S("fpair_nr += CONCURRENT_FACES")
        ])

        return flux_code

    flux_computation = Block([
        Comment("fluxes for dual-sided (intra-block) interior face pairs"),
        While("fpair_nr < data.header.same_facepairs_end",
              get_flux_code(lambda: self.write_interior_flux_code(True))),
        Line(),
        Comment("work around nvcc assertion failure"),
        S("fpair_nr+=1"),
        S("fpair_nr-=1"),
        Line(),
        Comment(
            "fluxes for single-sided (inter-block) interior face pairs"),
        While("fpair_nr < data.header.diff_facepairs_end",
              get_flux_code(lambda: self.write_interior_flux_code(False))),
        Line(),
        Comment("fluxes for single-sided boundary face pairs"),
        While(
            "fpair_nr < data.header.bdry_facepairs_end",
            get_flux_code(
                lambda: self.write_boundary_flux_code(for_benchmark))),
    ])

    f_body.extend_log_block("compute the fluxes", [
        Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
        If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
    ])

    if not fplan.direct_store:
        f_body.extend([Line(), S("__syncthreads()"), Line()])

        # Copy the staged fluxes from shared to global memory.
        f_body.extend_log_block(
            "store fluxes", [
                For(
                    "unsigned word_nr = THREAD_NUM",
                    "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                    "word_nr += COALESCING_THREAD_COUNT",
                    Block([
                        Assign(
                            "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                            % flux_nr,
                            "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                        for flux_nr in range(len(self.fluxes))
                    ]))
            ])

    # Debugging aid, disabled by default: record loop state in debugbuf.
    if False:
        f_body.extend([
            Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                   "fpair_nr"),
            Assign("debugbuf[blockIdx.x*96+16]",
                   "data.header.same_facepairs_end"),
            Assign("debugbuf[blockIdx.x*96+17]",
                   "data.header.diff_facepairs_end"),
            Assign("debugbuf[blockIdx.x*96+18]",
                   "data.header.bdry_facepairs_end"),
        ])

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

    mod = SourceModule(
        cmod,
        keep="cuda_keep_kernels" in discr.debug)

    expr_to_texture_map = dict(
        (dep_expr,
         mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
        for dep_expr in self.all_deps)

    index_list_texref = mod.get_texref("tex_index_lists")
    index_list_texref.set_address(ilist_data.device_memory,
                                  ilist_data.bytes)
    index_list_texref.set_format(
        cuda.dtype_to_array_format(ilist_data.type), 1)
    index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

    func = mod.get_function("apply_flux")
    block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
    # dict.values() is a view on Python 3 and cannot be concatenated with
    # a list directly; materialize it first.
    func.prepare(
        (2 + len(self.fluxes)) * "P",
        texrefs=list(expr_to_texture_map.values()) + [index_list_texref])

    if "cuda_flux" in discr.debug:
        # Function-call form of print with a single pre-formatted argument.
        print("flux: lmem=%d smem=%d regs=%d" % (
            func.local_size_bytes,
            func.shared_size_bytes,
            func.num_regs))

    return block, func, expr_to_texture_map
def get_boundary_flux_mod(fluxes, fvi, discr, dtype):
    """Assemble and compile the Boost.Python module providing ``gather_flux``.

    The generated C++ function iterates over ``fg.face_pairs`` and, for each
    face node, assigns one value per entry of *fluxes* through an iterator
    obtained from the matching ``flux<N>_on_faces`` member of its
    ``arg_struct`` argument.

    :param fluxes: sequence of flux objects; ``flux.op.flux`` of each entry
        is converted to a C++ expression string by ``flux_to_code``.
    :param fvi: object whose ``arg_names`` lists the additional array
        members of the argument struct.
    :param discr: forwarded to ``get_flux_toolchain``.
    :param dtype: dtype behind the generated ``value_type`` typedef.
    :returns: the result of ``mod.compile`` for the assembled module.
    """
    from cgen import (
        Assign, Block, Const, CustomLoop, For, FunctionBody,
        FunctionDeclaration, Include, Initializer, Line, MaybeUnused, POD,
        Reference, Statement, Struct, Typedef, Value)
    from pytools import to_uncomplex_dtype
    from codepy.bpl import BoostPythonModule

    flux_numbers = range(len(fluxes))
    sides = ("int_side", "ext_side")
    array_type = "numpy_array<value_type>"

    module = BoostPythonModule()

    module.add_to_preamble([
        Include("cstdlib"),
        Include("algorithm"),
        Line(),
        Include("boost/foreach.hpp"),
        Line(),
        Include("hedge/face_operators.hpp"),
    ])

    module.add_to_module([
        Statement("using namespace hedge"),
        Statement("using namespace pyublas"),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
    ])

    # Argument struct: one array per flux output, then one per extra argument.
    struct_fields = [
        Value(array_type, "flux%d_on_faces" % nr) for nr in flux_numbers]
    struct_fields.extend(
        Value(array_type, name) for name in fvi.arg_names)

    module.add_struct(Struct("arg_struct", struct_fields), "ArgStruct")
    module.add_to_module([Line()])

    signature = FunctionDeclaration(
        Value("void", "gather_flux"),
        [
            Const(Reference(
                Value("face_group<face_pair<straight_face> >", "fg"))),
            Reference(Value("arg_struct", "args")),
        ])

    from pymbolic.mapper.stringifier import PREC_PRODUCT

    # Per-node flux assignments. All fluxes share one mapper; its
    # cse_name_list is read only after every flux has been mapped.
    mapper = FluxToCodeMapper()
    flux_stores = []
    for flux_nr, flux in enumerate(fluxes):
        target = "fof%d_it[loc_fof_base+i]" % flux_nr
        flux_expr = flux_to_code(
            mapper, False, flux_nr, fvi, flux.op.flux, PREC_PRODUCT)
        flux_stores.append(Assign(
            target,
            "uncomplex_type(fp.int_side.face_jacobian) * " + flux_expr))

    cse_inits = [
        Initializer(Value("value_type", name), expr)
        for name, expr in mapper.cse_name_list]

    # Iterator set-up emitted at the top of the function body.
    setup = []
    for nr in flux_numbers:
        setup.append(Initializer(
            Const(Value(
                "numpy_array<value_type>::iterator", "fof%d_it" % nr)),
            "args.flux%d_on_faces.begin()" % nr))
    for name in fvi.arg_names:
        setup.append(Initializer(
            Const(Value(
                "numpy_array<value_type>::const_iterator", "%s_it" % name)),
            "args.%s.begin()" % name))

    # Statements executed once per face node.
    per_node = [
        Initializer(
            MaybeUnused(Value("node_number_t", "%s_idx" % side)),
            "%(where)s_ebi + %(where)s_idx_list[i]" % {"where": side})
        for side in sides]
    per_node = per_node + cse_inits + flux_stores

    # Statements executed once per face pair.
    per_pair = []
    for side in sides:
        per_pair.append(Initializer(
            Value("node_number_t", "%s_ebi" % side),
            "fp.%s.el_base_index" % side))
        per_pair.append(Initializer(
            Value("index_lists_t::const_iterator", "%s_idx_list" % side),
            "fg.index_list(fp.%s.face_index_list_number)" % side))
        per_pair.append(Line())

    per_pair.append(Line())
    per_pair.append(Initializer(
        Value("node_number_t", "loc_fof_base"),
        "fg.face_length()*(fp.%(where)s.local_el_number*fg.face_count"
        " + fp.%(where)s.face_id)" % {"where": "int_side"}))
    per_pair.append(Line())
    per_pair.append(For(
        "unsigned i = 0",
        "i < fg.face_length()",
        "++i",
        Block(per_node)))

    pair_loop = CustomLoop(
        "BOOST_FOREACH(const face_pair<straight_face> &fp, fg.face_pairs)",
        Block(per_pair))

    body = Block(setup + [Line(), pair_loop])

    module.add_function(FunctionBody(signature, body))

    return module.compile(get_flux_toolchain(discr, fluxes))
def get_kernel(self, diff_op_cls, elgroup, for_benchmark=False):
    """Generate, compile and prepare the ``apply_diff_mat`` CUDA kernel.

    :param diff_op_cls: forwarded to ``self.gpu_diffmats``.
    :param elgroup: forwarded to ``self.gpu_diffmats``.
    :param for_benchmark: when true, the generated source is never written
        to a debug file, even if ``"cuda_dump_kernels"`` is in
        ``discr.debug``.
    :returns: ``(func, field_texref)``: the prepared kernel function and
        the texture reference obtained for ``field_tex``.
    """
    from cgen import \
            Pointer, POD, Value, ArrayOf, \
            Module, FunctionDeclaration, FunctionBody, Block, \
            Line, Define, Include, \
            Initializer, If, For, Statement, Assign

    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    par = self.plan.parallelism

    diffmat_data = self.gpu_diffmats(diff_op_cls, elgroup)
    # Tuple unpacking: raises unless discr has exactly one element group.
    elgroup, = discr.element_groups

    float_type = given.float_type

    f_decl = CudaGlobal(
        FunctionDeclaration(
            Value("void", "apply_diff_mat"),
            [
                Pointer(POD(numpy.uint8, "gmem_diff_rst_mat")),
            ] + [
                Pointer(POD(float_type, "drst%d_global" % i))
                for i in dims
            ]))

    # NOTE(review): the result is never used below; the call is kept in
    # case it validates `d` -- confirm and remove if it has no effect.
    rst_channels = given.devdata.make_valid_tex_channel_count(d)

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Value(
            "texture<fp_tex_%s, 1, cudaReadModeElementType>"
            % dtype_to_ctype(float_type),
            "field_tex"),
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("DOFS_PER_EL", given.dofs_per_el()),
        Line(),
        Define("SEGMENT_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Line(),
        Define("MB_SEGMENT", "blockIdx.x"),
        Define("MACROBLOCK_NR", "blockIdx.y"),
        Line(),
        Define("DOFS_PER_SEGMENT", self.plan.segment_size),
        Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
        Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
        Define("ELS_PER_MB", given.microblock.elements),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
        Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
        Line(),
        Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
        Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
        Define(
            "GLOBAL_MB_NR_BASE",
            "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        # seq_mb_number is the loop variable of the For emitted further down
        Define(
            "GLOBAL_MB_NR",
            "(GLOBAL_MB_NR_BASE"
            "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
        Line(),
        Define("DIFFMAT_SEGMENT_FLOATS", diffmat_data.block_floats),
        Define("DIFFMAT_SEGMENT_BYTES",
            "(DIFFMAT_SEGMENT_FLOATS*%d)" % given.float_size()),
        Define("DIFFMAT_COLUMNS", diffmat_data.matrix_columns),
        Line(),
        CudaShared(
            ArrayOf(POD(float_type, "smem_diff_rst_mat"),
                "DIFFMAT_COLUMNS*DOFS_PER_SEGMENT")),
        Line(),
    ])

    S = Statement
    f_body = Block()

    f_body.extend_log_block("calculate responsibility data", [
        Initializer(POD(numpy.uint16, "mb_el"), "MB_DOF/DOFS_PER_EL"),
    ])

    from hedge.backends.cuda.tools import get_load_code
    f_body.extend(
        get_load_code(
            dest="smem_diff_rst_mat",
            base="gmem_diff_rst_mat + MB_SEGMENT*DIFFMAT_SEGMENT_BYTES",
            bytes="DIFFMAT_SEGMENT_BYTES",
            descr="load diff mat segment")
        + [S("__syncthreads()"), Line()])

    # ---------------------------------------------------------------------
    def get_scalar_diff_code():
        """Return the statement list forming the body of the
        ``seq_mb_number`` loop.
        """
        code = []
        # one zero-initialized accumulator per (inlined microblock, axis)
        for inl in range(par.inline):
            for axis in dims:
                code.append(
                    Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

        code.append(Line())

        def get_mat_entry(row, col, axis):
            """Return the C expression indexing ``smem_diff_rst_mat``."""
            return ("smem_diff_rst_mat["
                    "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL"
                    " + %(col)s"
                    "]" % {"row": row, "col": col, "axis": axis})

        def per_column_code(j):
            """Fetch the field value(s) for column *j* and accumulate."""
            return [
                Assign(
                    "field_value%d" % inl,
                    "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB "
                    "+ mb_el*DOFS_PER_EL + %s)" % (inl, j))
                for inl in range(par.inline)
            ] + [Line()] + [
                S("d%drst%d += %s * field_value%d" % (
                    inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl))
                for axis in dims
                for inl in range(par.inline)
            ] + [Line()]

        from hedge.backends.cuda.tools import unroll
        code.extend(
            [
                POD(float_type, "field_value%d" % inl)
                for inl in range(par.inline)
            ]
            + [Line()]
            + unroll(
                per_column_code,
                given.dofs_per_el(),
                self.plan.max_unroll))

        store_code = Block()
        for inl in range(par.inline):
            for rst_axis in dims:
                store_code.append(
                    Assign(
                        "drst%d_global[GLOBAL_MB_DOF_BASE"
                        " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]"
                        % (rst_axis, inl),
                        "d%drst%d" % (inl, rst_axis),
                    ))

        # results are only stored under this generated guard
        code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code))

        return code

    f_body.extend([
        For("unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number",
            Block(get_scalar_diff_code()))
    ])

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file("diff", ".cu").write(str(cmod))

    mod = SourceModule(
        cmod,
        keep="cuda_keep_kernels" in discr.debug,
        #options=["--maxrregcount=10"]
    )

    field_texref = mod.get_texref("field_tex")

    func = mod.get_function("apply_diff_mat")
    # NOTE(review): f_decl declares 1 + d pointer parameters, while this
    # argument spec lists d float_type entries followed by one "P" --
    # confirm this is intended.
    func.prepare(
        discr.dimensions * [float_type] + ["P"],
        block=(self.plan.segment_size, par.parallel, 1),
        texrefs=[field_texref])

    if "cuda_diff" in discr.debug:
        # Parenthesized single-argument form: a function call on Python 3
        # and an equivalent print statement on Python 2.
        print("diff: lmem=%d smem=%d regs=%d" % (
            func.local_size_bytes,
            func.shared_size_bytes,
            func.num_regs))

    return func, field_texref
def get_kernel(self, with_scaling, for_benchmark=False):
    """Generate, compile and prepare the ``apply_el_local_mat_smem_mat``
    CUDA kernel.

    :param with_scaling: when true, a second texture ``scaling_tex`` is
        declared and each stored result is multiplied by a value fetched
        from it; otherwise the multiplier is the literal ``1``.
    :param for_benchmark: when true, the generated source is never written
        to a debug file, even if ``"cuda_dump_kernels"`` is in
        ``discr.debug``.
    :returns: ``(func, in_vector_texref, scaling_texref)``;
        ``scaling_texref`` is ``None`` when *with_scaling* is false.
    """
    from cgen import \
            Pointer, POD, Value, ArrayOf, \
            Module, FunctionDeclaration, FunctionBody, Block, \
            Line, Define, Include, \
            Initializer, If, For, Statement, Assign, \
            ArrayInitializer

    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

    discr = self.discr
    given = self.plan.given

    float_type = given.float_type

    f_decl = CudaGlobal(
        FunctionDeclaration(
            Value("void", "apply_el_local_mat_smem_mat"),
            [
                Pointer(POD(float_type, "out_vector")),
                Pointer(POD(numpy.uint8, "gmem_matrix")),
                Pointer(POD(float_type, "debugbuf")),
                POD(numpy.uint32, "microblock_count"),
            ]))

    cmod = Module([
        Include("pycuda-helpers.hpp"),
        Line(),
        Value(
            "texture<fp_tex_%s, 1, cudaReadModeElementType>"
            % dtype_to_ctype(float_type),
            "in_vector_tex"),
    ])
    if with_scaling:
        cmod.append(
            Value(
                "texture<fp_tex_%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type),
                "scaling_tex"),
        )

    par = self.plan.parallelism

    cmod.extend([
        Line(),
        Define("DIMENSIONS", discr.dimensions),
        Define("DOFS_PER_EL", given.dofs_per_el()),
        Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
        Line(),
        Define("SEGMENT_DOF", "threadIdx.x"),
        Define("PAR_MB_NR", "threadIdx.y"),
        Line(),
        Define("MB_SEGMENT", "blockIdx.x"),
        Define("MACROBLOCK_NR", "blockIdx.y"),
        Line(),
        Define("DOFS_PER_SEGMENT", self.plan.segment_size),
        Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
        Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
        Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
            self.plan.aligned_preimage_dofs_per_microblock),
        Define("MB_EL_COUNT", given.microblock.elements),
        Line(),
        Define("PAR_MB_COUNT", par.parallel),
        Define("INLINE_MB_COUNT", par.inline),
        Define("SEQ_MB_COUNT", par.serial),
        Line(),
        Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
        Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
        Line(),
        Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
        Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
        Define(
            "GLOBAL_MB_NR_BASE",
            "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
        # seq_mb_number is the loop variable of the For emitted by
        # mat_mul_outer_loop below
        Define(
            "GLOBAL_MB_NR",
            "(GLOBAL_MB_NR_BASE"
            "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
        Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
        Define("GLOBAL_MB_PREIMG_DOF_BASE",
            "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
        Line(),
        Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
        Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
        Define("MATRIX_SEGMENT_BYTES",
            "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
        Line(),
        CudaShared(
            ArrayOf(POD(float_type, "smem_matrix"),
                "MATRIX_SEGMENT_FLOATS")),
        CudaShared(
            ArrayOf(
                ArrayOf(
                    ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"),
                    "INLINE_MB_COUNT"),
                "DOFS_PER_SEGMENT"),
        ),
        CudaShared(POD(numpy.uint16, "segment_start_el")),
        CudaShared(POD(numpy.uint16, "segment_stop_el")),
        CudaShared(POD(numpy.uint16, "segment_el_count")),
        Line(),
        # lookup tables: first element index and one-past-last element
        # index touched by each segment of a microblock
        ArrayInitializer(
            CudaConstant(
                ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"),
                    "SEGMENTS_PER_MB")),
            [(chk * self.plan.segment_size) // given.dofs_per_el()
                for chk in range(self.plan.segments_per_microblock())]),
        ArrayInitializer(
            CudaConstant(
                ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"),
                    "SEGMENTS_PER_MB")),
            [
                min(given.microblock.elements,
                    (chk * self.plan.segment_size
                        + self.plan.segment_size - 1)
                    // given.dofs_per_el() + 1)
                for chk in range(self.plan.segments_per_microblock())
            ]),
    ])

    S = Statement
    f_body = Block()

    f_body.extend_log_block(
        "calculate this dof's element",
        [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")])

    if self.plan.use_prefetch_branch:
        f_body.extend_log_block("calculate segment responsibility data", [
            If(
                "THREAD_NUM==0",
                Block([
                    Assign("segment_start_el",
                        "segment_start_el_lookup[MB_SEGMENT]"),
                    Assign("segment_stop_el",
                        "segment_stop_el_lookup[MB_SEGMENT]"),
                    Assign("segment_el_count",
                        "segment_stop_el-segment_start_el"),
                ])),
            S("__syncthreads()")
        ])

    from hedge.backends.cuda.tools import get_load_code
    f_body.extend(
        get_load_code(
            dest="smem_matrix",
            base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"),
            bytes="MATRIX_SEGMENT_BYTES",
            descr="load matrix segment")
        + [S("__syncthreads()")])

    # ---------------------------------------------------------------------
    def get_batched_fetch_mat_mul_code(el_fetch_count):
        """Return statements that stage input values in ``dof_buffer``
        one segment-sized batch at a time and accumulate from there.

        *el_fetch_count* is accepted but not read.
        """
        result = []
        dofs = range(self.plan.preimage_dofs_per_el)

        for load_segment_start in range(
                0, self.plan.preimage_dofs_per_el, self.plan.segment_size):
            result.extend([S("__syncthreads()")] + [
                Assign(
                    "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl,
                    "fp_tex1Dfetch(in_vector_tex, "
                    "GLOBAL_MB_PREIMG_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)"
                    % (inl, load_segment_start))
                for inl in range(par.inline)
            ] + [
                S("__syncthreads()"),
                Line(),
            ])

            batch_end = load_segment_start + self.plan.segment_size
            for dof in dofs[load_segment_start:batch_end]:
                for inl in range(par.inline):
                    result.append(
                        S("result%d += "
                          "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]"
                          "*"
                          "dof_buffer[PAR_MB_NR][%d][%d]"
                          % (inl, dof, inl, dof - load_segment_start)))
            result.append(Line())

        return result

    from hedge.backends.cuda.tools import unroll

    def get_direct_tex_mat_mul_code():
        """Return statements that fetch every input value directly via
        ``fp_tex1Dfetch`` and accumulate it.
        """
        def per_column_code(j):
            return [
                Assign(
                    "fof%d" % inl,
                    "fp_tex1Dfetch(in_vector_tex, "
                    "GLOBAL_MB_PREIMG_DOF_BASE"
                    " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {
                        "j": j,
                        "inl": inl,
                        "row": "SEGMENT_DOF"
                    },
                )
                for inl in range(par.inline)
            ] + [
                Assign(
                    "lm",
                    "smem_matrix["
                    "%(row)s*MATRIX_COLUMNS + %(j)s]" % {
                        "j": j,
                        "row": "SEGMENT_DOF"
                    },
                )
            ] + [
                S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl})
                for inl in range(par.inline)
            ]

        return (
            [POD(float_type, "fof%d" % inl) for inl in range(par.inline)]
            + [POD(float_type, "lm"), Line()]
            + unroll(
                per_column_code,
                total_number=self.plan.preimage_dofs_per_el,
                max_unroll=self.plan.max_unroll)
            + [Line()])

    def get_mat_mul_code(el_fetch_count):
        """Select the batched variant for a fetch count of exactly 1 and
        the direct-texture variant for everything else.
        """
        if el_fetch_count == 1:
            return get_batched_fetch_mat_mul_code(el_fetch_count)
        else:
            return get_direct_tex_mat_mul_code()

    def mat_mul_outer_loop(fetch_count):
        """Return the ``seq_mb_number`` loop: accumulator set-up,
        multiplication code and the guarded store to ``out_vector``.
        """
        if with_scaling:
            inv_jac_multiplier = (
                "fp_tex1Dfetch(scaling_tex,"
                "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)")
        else:
            # contains no format fields, so the %-substitution below
            # leaves it unchanged
            inv_jac_multiplier = "1"

        write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT"
        if self.with_index_check:
            write_condition += " && GLOBAL_MB_NR < microblock_count"

        return For(
            "unsigned short seq_mb_number = 0",
            "seq_mb_number < SEQ_MB_COUNT",
            "++seq_mb_number",
            Block([
                Initializer(POD(float_type, "result%d" % inl), 0)
                for inl in range(par.inline)
            ] + [Line()] + get_mat_mul_code(fetch_count) + [
                If(
                    write_condition,
                    Block([
                        Assign(
                            "out_vector[GLOBAL_MB_DOF_BASE"
                            " + %d*ALIGNED_DOFS_PER_MB"
                            " + MB_DOF]" % inl,
                            "result%d * %s" % (
                                inl,
                                (inv_jac_multiplier % {"inl": inl})))
                        for inl in range(par.inline)
                    ]))
            ]))

    if self.plan.use_prefetch_branch:
        # one specialised loop per possible value of segment_el_count
        from cgen import make_multiple_ifs
        f_body.append(
            make_multiple_ifs([
                ("segment_el_count == %d" % fetch_count,
                    mat_mul_outer_loop(fetch_count))
                for fetch_count in range(
                    1, self.plan.max_elements_touched_by_segment() + 1)
            ]))
    else:
        f_body.append(mat_mul_outer_loop(0))

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

    mod = SourceModule(
        cmod,
        keep="cuda_keep_kernels" in discr.debug,
        #options=["--maxrregcount=12"]
    )

    func = mod.get_function("apply_el_local_mat_smem_mat")

    if self.plan.debug_name in discr.debug:
        # Parenthesized single-argument form: a function call on Python 3
        # and an equivalent print statement on Python 2.
        print("%s: lmem=%d smem=%d regs=%d" % (
            self.plan.debug_name,
            func.local_size_bytes,
            func.shared_size_bytes,
            func.num_regs))

    in_vector_texref = mod.get_texref("in_vector_tex")
    texrefs = [in_vector_texref]

    if with_scaling:
        scaling_texref = mod.get_texref("scaling_tex")
        texrefs.append(scaling_texref)
    else:
        scaling_texref = None

    # argument spec for the four kernel parameters declared in f_decl
    func.prepare(
        "PPPI",
        block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
        texrefs=texrefs)

    return func, in_vector_texref, scaling_texref
def make_lift(self, fgroup, with_scale, dtype):
    """Build the C++ ``lift`` function, compile it and return it.

    For each element of the face group, the generated code multiplies
    ``matrix`` with a slice of ``field`` and writes the products to
    ``result``. With *with_scale* true, the function takes one more
    argument, ``elwise_post_scaling``, whose current entry multiplies every
    product of an element before it is stored.

    :param fgroup: face group; ``fgroup.ldis_loc`` supplies the node and
        face counts baked into the code as ``DOFS_PER_EL`` and
        ``FACES_PER_EL``.
    :param with_scale: whether to generate the scaling variant.
    :param dtype: dtype behind the generated ``value_type`` typedef.
    :returns: the ``lift`` attribute of the compiled module.
    """
    discr = self.discr

    from cgen import (FunctionDeclaration, FunctionBody, Typedef, Const,
                      Reference, Value, POD, Statement, Include, Line, Block,
                      Initializer, Assign, For, If, Define)
    from pytools import to_uncomplex_dtype
    from codepy.bpl import BoostPythonModule

    module = BoostPythonModule()

    module.add_to_preamble([
        Include("hedge/face_operators.hpp"),
        Include("hedge/volume_operators.hpp"),
        Include("boost/foreach.hpp"),
    ])

    module.add_to_module([
        Statement("namespace ublas = boost::numeric::ublas"),
        Statement("using namespace hedge"),
        Statement("using namespace pyublas"),
        Line(),
        Define("DOFS_PER_EL", fgroup.ldis_loc.node_count()),
        Define("FACES_PER_EL", fgroup.ldis_loc.face_count()),
        Define("DIMENSIONS", discr.dimensions),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
    ])

    def begin_iterator(name, is_const=True, tpname="value_type"):
        """Declare ``<name>_it`` initialized with ``<name>.begin()``."""
        prefix = "const_" if is_const else ""
        return Initializer(
            Value("numpy_array<%s>::%siterator" % (tpname, prefix),
                  name + "_it"),
            "%s.begin()" % name)

    # --- signature -------------------------------------------------------
    params = [
        Const(Reference(
            Value("face_group<face_pair<straight_face> >", "fg"))),
        Value("ublas::matrix<uncomplex_type>", "matrix"),
        Value("numpy_array<value_type>", "field"),
        Value("numpy_array<value_type>", "result"),
    ]
    if with_scale:
        params.append(Const(Reference(
            Value("numpy_array<double>", "elwise_post_scaling"))))

    signature = FunctionDeclaration(Value("void", "lift"), params)

    # --- innermost: statements for one output node ------------------------
    if with_scale:
        store = Assign(
            "result_it[dest_el_base+i]",
            "tmp * value_type(*elwise_post_scaling_it)")
    else:
        store = Assign("result_it[dest_el_base+i]", "tmp")

    node_stmts = [
        Initializer(Value("value_type", "tmp"), 0),
        Line(),
        For(
            "unsigned j = 0",
            "j < FACES_PER_EL*fg.face_length()",
            "++j",
            Statement("tmp += matrix(i, j)*field_it[src_el_base+j]")),
        Line(),
        store,
    ]

    # --- statements for one element ---------------------------------------
    element_stmts = [
        Initializer(Value("node_number_t", "dest_el_base"),
                    "fg.local_el_write_base[fg_el_nr]"),
        Initializer(Value("node_number_t", "src_el_base"),
                    "FACES_PER_EL*fg.face_length()*fg_el_nr"),
        Line(),
        For("unsigned i = 0", "i < DOFS_PER_EL", "++i", Block(node_stmts)),
    ]
    if with_scale:
        # the scaling iterator advances once per element
        element_stmts.append(Statement("elwise_post_scaling_it++"))

    # --- whole function body ----------------------------------------------
    body_stmts = [
        begin_iterator("field"),
        begin_iterator("result", is_const=False),
    ]
    if with_scale:
        body_stmts.append(
            begin_iterator("elwise_post_scaling", tpname="double"))
    body_stmts.append(Line())
    body_stmts.append(For(
        "unsigned fg_el_nr = 0",
        "fg_el_nr < fg.element_count()",
        "++fg_el_nr",
        Block(element_stmts)))

    module.add_function(FunctionBody(signature, Block(body_stmts)))

    return module.compile(self.discr.toolchain).lift
def make_diff(self, elgroup, dtype, shape):
    """Build the C++ ``diff`` function, compile it and return it.

    The generated function takes ``from_ers`` and ``to_ers`` element
    ranges, a ``field`` array, one ``diffmat_rst<N>`` matrix per dimension
    and one ``result<N>`` array per dimension. For every element it applies
    each matrix to the element's slice of ``field`` and stores the outcome
    in the corresponding result array. Matrix sizes, element sizes and
    element counts are checked at run time and a ``std::runtime_error`` is
    thrown on mismatch.

    :param elgroup: element group; ``elgroup.ranges`` must be a
        ``UniformElementRanges`` instance.
    :param dtype: dtype behind the generated ``value_type`` typedef.
    :param shape: ``(rows, columns)`` of the differentiation matrices,
        baked into the code as ``ROW_COUNT`` and ``COL_COUNT``. The two
        need not be equal.
    :returns: the compiled ``diff`` function; wrapped by
        ``time_count_flop`` when ``self.discr.instrumented`` is true.
    """
    from hedge._internal import UniformElementRanges
    assert isinstance(elgroup.ranges, UniformElementRanges)

    ldis = elgroup.local_discretization
    discr = self.discr

    from cgen import (
            FunctionDeclaration, FunctionBody, Typedef,
            Const, Reference, Value, POD,
            Statement, Include, Line, Block, Initializer, Assign,
            For, If,
            Define)

    from pytools import to_uncomplex_dtype

    from codepy.bpl import BoostPythonModule
    mod = BoostPythonModule()

    # {{{ preamble

    S = Statement
    mod.add_to_preamble([
        Include("hedge/volume_operators.hpp"),
        Include("boost/foreach.hpp"),
        ])

    mod.add_to_module([
        S("namespace ublas = boost::numeric::ublas"),
        S("using namespace hedge"),
        S("using namespace pyublas"),
        Line(),
        Define("ROW_COUNT", shape[0]),
        Define("COL_COUNT", shape[1]),
        Define("DIMENSIONS", discr.dimensions),
        Line(),
        Typedef(POD(dtype, "value_type")),
        Typedef(POD(to_uncomplex_dtype(dtype), "uncomplex_type")),
        ])

    fdecl = FunctionDeclaration(
        Value("void", "diff"),
        [
            Const(Reference(Value("uniform_element_ranges", "from_ers"))),
            Const(Reference(Value("uniform_element_ranges", "to_ers"))),
            Value("numpy_array<value_type>", "field")
        ]+[
            Value("ublas::matrix<uncomplex_type>", "diffmat_rst%d" % rst)
            for rst in range(discr.dimensions)
        ]+[
            Value("numpy_array<value_type>", "result%d" % i)
            for i in range(discr.dimensions)
        ]
        )

    # }}}

    # {{{ set-up

    def make_it(name, is_const=True, tpname="value_type"):
        """Declare ``<name>_it`` initialized with ``<name>.begin()``."""
        if is_const:
            const = "const_"
        else:
            const = ""

        return Initializer(
            Value("numpy_array<%s>::%siterator" % (tpname, const),
                name+"_it"),
            "%s.begin()" % name)

    fbody = Block([
        # run-time sanity checks on every matrix argument
        If("ROW_COUNT != diffmat_rst%d.size1()" % i,
            S('throw(std::runtime_error("unexpected matrix size"))'))
        for i in range(discr.dimensions)
        ] + [
        If("COL_COUNT != diffmat_rst%d.size2()" % i,
            S('throw(std::runtime_error("unexpected matrix size"))'))
        for i in range(discr.dimensions)
        ]+[
        If("ROW_COUNT != to_ers.el_size()",
            S('throw(std::runtime_error("unsupported image element size"))')),
        If("COL_COUNT != from_ers.el_size()",
            S('throw(std::runtime_error("unsupported preimage element size"))')),
        If("from_ers.size() != to_ers.size()",
            S('throw(std::runtime_error("image and preimage element groups '
                'do not have the same element count"))')),
        Line(),
        make_it("field"),
        ]+[
        make_it("result%d" % i, is_const=False)
        for i in range(discr.dimensions)
        ]+[
        Line(),

    # }}}

    # {{{ computation

        For("element_number_t eg_el_nr = 0",
            "eg_el_nr < to_ers.size()",
            "++eg_el_nr",
            Block([
                Initializer(
                    Value("node_number_t", "from_el_base"),
                    "from_ers.start() + eg_el_nr*COL_COUNT"),
                Initializer(
                    Value("node_number_t", "to_el_base"),
                    "to_ers.start() + eg_el_nr*ROW_COUNT"),
                Line(),
                For("unsigned i = 0",
                    "i < ROW_COUNT",
                    "++i",
                    Block([
                        # one accumulator per dimension
                        Initializer(Value("value_type", "drst_%d" % rst), 0)
                        for rst in range(discr.dimensions)
                        ]+[
                        Line(),
                        ]+[
                        For("unsigned j = 0",
                            "j < COL_COUNT",
                            "++j",
                            Block([
                                S("drst_%(rst)d += "
                                    "diffmat_rst%(rst)d(i, j)*field_it[from_el_base+j]"
                                    % {"rst": rst})
                                for rst in range(discr.dimensions)
                                ])
                            ),
                        Line(),
                        ]+[
                        Assign("result%d_it[to_el_base+i]" % rst,
                            "drst_%d" % rst)
                        for rst in range(discr.dimensions)
                        ])
                    )
                ])
            )
        ])

    # }}}

    # {{{ compilation

    mod.add_function(FunctionBody(fdecl, fbody))

    compiled_func = mod.compile(self.discr.toolchain).diff

    if self.discr.instrumented:
        from hedge.tools import time_count_flop

        compiled_func = time_count_flop(compiled_func,
                discr.diff_timer, discr.diff_counter,
                discr.diff_flop_counter,
                flops=discr.dimensions*(
                    2  # mul+add
                    * ldis.node_count() * len(elgroup.members)
                    * ldis.node_count()
                    +
                    2 * discr.dimensions
                    * len(elgroup.members) * ldis.node_count()),
                increment=discr.dimensions)

    return compiled_func

    # }}}