def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.target.c import POD # uses the correct complex type from cgen import RestrictPointer, Const from cgen.cuda import CudaConstant arg_decl = RestrictPointer(POD(self, dtype, name)) if not is_written: arg_decl = Const(arg_decl) return CudaConstant(arg_decl)
def get_cuda_extra_preamble(self, discr, dtype, eg): from cgen import ArrayOf, Value, Initializer from cgen.cuda import CudaConstant ldis = eg.local_discretization mode_degrees = [sum(mode_indices) for mode_indices in ldis.generate_mode_identifiers()] return [Initializer(CudaConstant( ArrayOf(Value("unsigned", "mode_degrees"))), "{%s}" % ", ".join(str(i) for i in mode_degrees)) ]
def wrap_global_constant(self, decl): from cgen.cuda import CudaConstant return CudaConstant(decl)
def get_kernel(self, with_scaling, for_benchmark=False): from cgen import \ Pointer, POD, Value, ArrayOf, \ Module, FunctionDeclaration, FunctionBody, Block, \ Line, Define, Include, \ Initializer, If, For, Statement, Assign, \ ArrayInitializer from cgen import dtype_to_ctype from cgen.cuda import CudaShared, CudaConstant, CudaGlobal discr = self.discr d = discr.dimensions dims = range(d) given = self.plan.given float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_el_local_mat_smem_mat"), [ Pointer(POD(float_type, "out_vector")), Pointer(POD(numpy.uint8, "gmem_matrix")), Pointer(POD(float_type, "debugbuf")), POD(numpy.uint32, "microblock_count"), ])) cmod = Module([ Include("pycuda-helpers.hpp"), Line(), Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "in_vector_tex"), ]) if with_scaling: cmod.append( Value( "texture<fp_tex_%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type), "scaling_tex"), ) par = self.plan.parallelism cmod.extend([ Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_EL", given.dofs_per_el()), Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el), Line(), Define("SEGMENT_DOF", "threadIdx.x"), Define("PAR_MB_NR", "threadIdx.y"), Line(), Define("MB_SEGMENT", "blockIdx.x"), Define("MACROBLOCK_NR", "blockIdx.y"), Line(), Define("DOFS_PER_SEGMENT", self.plan.segment_size), Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()), Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats), Define("ALIGNED_PREIMAGE_DOFS_PER_MB", self.plan.aligned_preimage_dofs_per_microblock), Define("MB_EL_COUNT", given.microblock.elements), Line(), Define("PAR_MB_COUNT", par.parallel), Define("INLINE_MB_COUNT", par.inline), Define("SEQ_MB_COUNT", par.serial), Line(), Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"), Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"), Line(), Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"), Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"), Define( "GLOBAL_MB_NR_BASE", "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"), Define( "GLOBAL_MB_NR", "(GLOBAL_MB_NR_BASE" "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"), Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"), Define("GLOBAL_MB_PREIMG_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"), Line(), Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()), Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()), Define("MATRIX_SEGMENT_BYTES", "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()), Line(), CudaShared( ArrayOf(POD(float_type, "smem_matrix"), "MATRIX_SEGMENT_FLOATS")), CudaShared( ArrayOf( ArrayOf( ArrayOf(POD(float_type, "dof_buffer"), "PAR_MB_COUNT"), "INLINE_MB_COUNT"), "DOFS_PER_SEGMENT"), ), CudaShared(POD(numpy.uint16, "segment_start_el")), CudaShared(POD(numpy.uint16, "segment_stop_el")), CudaShared(POD(numpy.uint16, "segment_el_count")), Line(), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_start_el_lookup"), "SEGMENTS_PER_MB")), [(chk * self.plan.segment_size) // given.dofs_per_el() for chk in range(self.plan.segments_per_microblock())]), ArrayInitializer( CudaConstant( ArrayOf(POD(numpy.uint32, "segment_stop_el_lookup"), "SEGMENTS_PER_MB")), [ min(given.microblock.elements, (chk * self.plan.segment_size + self.plan.segment_size - 1) // given.dofs_per_el() + 1) for chk in range(self.plan.segments_per_microblock()) ]), ]) S = Statement f_body = Block() f_body.extend_log_block( "calculate this dof's element", [Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL")]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If( "THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ])), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="smem_matrix", base=( "gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") + [S("__syncthreads()")]) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend([S("__syncthreads()")] + [ Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start)) for inl in range(par.inline) ] + [ S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start + self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof - load_segment_start))) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign( "fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % { "j": j, "inl": inl, "row": "SEGMENT_DOF" }, ) for inl in range(par.inline) ] + [ Assign( "lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % { "j": j, "row": "SEGMENT_DOF" }, ) ] + [ S("result%(inl)d += fof%(inl)d*lm" % {"inl": inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ( "fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For( "unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ] + [Line()] + get_mat_mul_code(fetch_count) + [ If( write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % { "inl": inl }))) for inl in range(par.inline) ])) ])) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append( make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range( 1, self.plan.max_elements_touched_by_segment() + 1) ])) else: f_body.append(mat_mul_outer_loop(0)) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod)) mod = SourceModule( cmod, keep="cuda_keep_kernels" in discr.debug, #options=["--maxrregcount=12"] ) func = mod.get_function("apply_el_local_mat_smem_mat") if self.plan.debug_name in discr.debug: print "%s: lmem=%d smem=%d regs=%d" % ( self.plan.debug_name, func.local_size_bytes, func.shared_size_bytes, func.num_regs) in_vector_texref = mod.get_texref("in_vector_tex") texrefs = [in_vector_texref] if with_scaling: scaling_texref = mod.get_texref("scaling_tex") texrefs.append(scaling_texref) else: scaling_texref = None func.prepare("PPPI", block=(self.plan.segment_size, self.plan.parallelism.parallel, 1), texrefs=texrefs) return func, in_vector_texref, scaling_texref