def generate_body(self, kernel, codegen_state):
    from cgen import Block
    body = Block()

    # {{{ declare temporaries

    body.extend(
            idi.cgen_declarator
            for tv in six.itervalues(kernel.temporary_variables)
            for idi in tv.decl_info(
                kernel.target, is_written=True, index_dtype=kernel.index_dtype))

    # }}}

    from loopy.codegen.loop import set_up_hw_parallel_loops
    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

    from cgen import Line
    body.append(Line())

    if isinstance(gen_code.ast, Block):
        body.extend(gen_code.ast.contents)
    else:
        body.append(gen_code.ast)

    return body, gen_code.implemented_domains
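# For orientation: cgen nodes such as Block, Line, POD, and Initializer
# render to C source via str(). A minimal standalone sketch (toy
# declarations, not actual loopy output) of the append/extend pattern
# generate_body uses:

def cgen_block_sketch():
    import numpy as np
    from cgen import Block, Initializer, Line, POD, Statement

    body = Block()
    body.extend([
        Initializer(POD(np.float32, "acc"), 0),  # -> "float acc = 0;"
        Line(),
        Statement("acc += 1"),                   # -> "acc += 1;"
        ])
    return str(body)  # braces-delimited C block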
def get_scalar_diff_code(): code = [] for inl in range(par.inline): for axis in dims: code.append( Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0)) code.append(Line()) def get_mat_entry(row, col, axis): return ("smem_diff_rst_mat[" "%(row)s*DIFFMAT_COLUMNS + %(axis)s*DOFS_PER_EL" " + %(col)s" "]" % { "row": row, "col": col, "axis": axis }) tex_channels = ["x", "y", "z", "w"] from hedge.backends.cuda.tools import unroll code.extend([ POD(float_type, "field_value%d" % inl) for inl in range(par.inline) ] + [Line()] + unroll( lambda j: [ Assign( "field_value%d" % inl, "fp_tex1Dfetch(field_tex, GLOBAL_MB_DOF_BASE + %d*ALIGNED_DOFS_PER_MB " "+ mb_el*DOFS_PER_EL + %s)" % (inl, j)) for inl in range(par.inline) ] + [Line()] + [ S("d%drst%d += %s * field_value%d" % (inl, axis, get_mat_entry("SEGMENT_DOF", j, axis), inl)) for axis in dims for inl in range(par.inline) ] + [Line()], given.dofs_per_el(), self.plan.max_unroll)) store_code = Block() for inl in range(par.inline): for rst_axis in dims: store_code.append( Assign( "drst%d_global[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB + MB_DOF]" % (rst_axis, inl), "d%drst%d" % (inl, rst_axis), )) code.append(If("MB_DOF < DOFS_PER_EL*ELS_PER_MB", store_code)) return code
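# The `unroll` helper from hedge.backends.cuda.tools appears throughout
# these kernels. As a rough mental model only--a sketch under assumptions,
# not hedge's actual implementation--it expands a statement-generating
# callback over an index range, and falls back to an outer C loop once the
# trip count exceeds max_unroll:

def unroll_sketch(body_gen, total_number, max_unroll=None):
    from cgen import Block, For

    if max_unroll is None or total_number <= max_unroll:
        # Full unroll: hand the callback concrete integer indices.
        result = []
        for j in range(total_number):
            result.extend(body_gen(j))
        return result

    # Partial unroll: hand the callback C index *expressions* instead.
    # (The real helper also deals with remainder iterations, which this
    # sketch omits.)
    return [For("unsigned j_outer = 0",
            "j_outer < %d" % (total_number // max_unroll),
            "++j_outer",
            Block([stmt
                for j_inner in range(max_unroll)
                for stmt in body_gen(
                    "(j_outer*%d+%d)" % (max_unroll, j_inner))]))]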
def get_kernel(self, with_scaling, for_benchmark=False):
    from cgen import \
            Pointer, POD, Value, ArrayOf, \
            Module, FunctionDeclaration, FunctionBody, Block, \
            Line, Define, Include, \
            Initializer, If, For, Statement, Assign, \
            ArrayInitializer

    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaConstant, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_el_local_mat_smem_mat"),
        [
            Pointer(POD(float_type, "out_vector")),
            Pointer(POD(numpy.uint8, "gmem_matrix")),
            Pointer(POD(float_type, "debugbuf")),
            POD(numpy.uint32, "microblock_count"),
            ]
        ))

    cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type),
                "in_vector_tex"),
            ])
    if with_scaling:
        cmod.append(
            Value("texture<fp_tex_%s, 1, cudaReadModeElementType>"
                % dtype_to_ctype(float_type),
                "scaling_tex"),
            )

    par = self.plan.parallelism

    cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_EL", given.dofs_per_el()),
            Define("PREIMAGE_DOFS_PER_EL", self.plan.preimage_dofs_per_el),
            Line(),
            Define("SEGMENT_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Line(),
            Define("MB_SEGMENT", "blockIdx.x"),
            Define("MACROBLOCK_NR", "blockIdx.y"),
            Line(),
            Define("DOFS_PER_SEGMENT", self.plan.segment_size),
            Define("SEGMENTS_PER_MB", self.plan.segments_per_microblock()),
            Define("ALIGNED_DOFS_PER_MB", given.microblock.aligned_floats),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                self.plan.aligned_preimage_dofs_per_microblock),
            Define("MB_EL_COUNT", given.microblock.elements),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("THREAD_NUM", "(SEGMENT_DOF+PAR_MB_NR*DOFS_PER_SEGMENT)"),
            Define("COALESCING_THREAD_COUNT", "(PAR_MB_COUNT*DOFS_PER_SEGMENT)"),
            Line(),
            Define("MB_DOF_BASE", "(MB_SEGMENT*DOFS_PER_SEGMENT)"),
            Define("MB_DOF", "(MB_DOF_BASE+SEGMENT_DOF)"),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_DOF_BASE", "(GLOBAL_MB_NR*ALIGNED_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMG_DOF_BASE",
                "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            Define("MATRIX_COLUMNS", self.plan.gpu_matrix_columns()),
            Define("MATRIX_SEGMENT_FLOATS", self.plan.gpu_matrix_block_floats()),
            Define("MATRIX_SEGMENT_BYTES",
                "(MATRIX_SEGMENT_FLOATS*%d)" % given.float_size()),
            Line(),
            CudaShared(ArrayOf(POD(float_type, "smem_matrix"),
                "MATRIX_SEGMENT_FLOATS")),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "dof_buffer"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "DOFS_PER_SEGMENT"),
                ),
            CudaShared(POD(numpy.uint16, "segment_start_el")),
            CudaShared(POD(numpy.uint16, "segment_stop_el")),
            CudaShared(POD(numpy.uint16, "segment_el_count")),
            Line(),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_start_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [(chk*self.plan.segment_size)//given.dofs_per_el()
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ArrayInitializer(
                    CudaConstant(
                        ArrayOf(
                            POD(numpy.uint32, "segment_stop_el_lookup"),
                            "SEGMENTS_PER_MB")),
                    [min(given.microblock.elements,
                        (chk*self.plan.segment_size+self.plan.segment_size-1)
                            //given.dofs_per_el()+1)
                        for chk in range(self.plan.segments_per_microblock())]
                    ),
            ])

    S = Statement
    f_body = Block()
f_body.extend_log_block("calculate this dof's element", [ Initializer(POD(numpy.uint8, "mb_el"), "MB_DOF/DOFS_PER_EL") ]) if self.plan.use_prefetch_branch: f_body.extend_log_block("calculate segment responsibility data", [ If("THREAD_NUM==0", Block([ Assign("segment_start_el", "segment_start_el_lookup[MB_SEGMENT]"), Assign("segment_stop_el", "segment_stop_el_lookup[MB_SEGMENT]"), Assign("segment_el_count", "segment_stop_el-segment_start_el"), ]) ), S("__syncthreads()") ]) from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code( dest="smem_matrix", base=("gmem_matrix + MB_SEGMENT*MATRIX_SEGMENT_BYTES"), bytes="MATRIX_SEGMENT_BYTES", descr="load matrix segment") +[S("__syncthreads()")] ) # --------------------------------------------------------------------- def get_batched_fetch_mat_mul_code(el_fetch_count): result = [] dofs = range(self.plan.preimage_dofs_per_el) for load_segment_start in range(0, self.plan.preimage_dofs_per_el, self.plan.segment_size): result.extend( [S("__syncthreads()")] +[Assign( "dof_buffer[PAR_MB_NR][%d][SEGMENT_DOF]" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB" " + (segment_start_el)*PREIMAGE_DOFS_PER_EL + %d + SEGMENT_DOF)" % (inl, load_segment_start) ) for inl in range(par.inline)] +[S("__syncthreads()"), Line(), ]) for dof in dofs[load_segment_start:load_segment_start+self.plan.segment_size]: for inl in range(par.inline): result.append( S("result%d += " "smem_matrix[SEGMENT_DOF*MATRIX_COLUMNS + %d]" "*" "dof_buffer[PAR_MB_NR][%d][%d]" % (inl, dof, inl, dof-load_segment_start)) ) result.append(Line()) return result from hedge.backends.cuda.tools import unroll def get_direct_tex_mat_mul_code(): return ( [POD(float_type, "fof%d" % inl) for inl in range(par.inline)] + [POD(float_type, "lm"), Line()] + unroll( lambda j: [ Assign("fof%d" % inl, "fp_tex1Dfetch(in_vector_tex, " "GLOBAL_MB_PREIMG_DOF_BASE" " + %(inl)d * ALIGNED_PREIMAGE_DOFS_PER_MB" " + mb_el*PREIMAGE_DOFS_PER_EL+%(j)s)" % {"j":j, "inl":inl, "row": "SEGMENT_DOF"},) for inl in range(par.inline) ]+[ Assign("lm", "smem_matrix[" "%(row)s*MATRIX_COLUMNS + %(j)s]" % {"j":j, "row": "SEGMENT_DOF"}, ) ]+[ S("result%(inl)d += fof%(inl)d*lm" % {"inl":inl}) for inl in range(par.inline) ], total_number=self.plan.preimage_dofs_per_el, max_unroll=self.plan.max_unroll) + [Line()]) def get_mat_mul_code(el_fetch_count): if el_fetch_count == 1: return get_batched_fetch_mat_mul_code(el_fetch_count) else: return get_direct_tex_mat_mul_code() def mat_mul_outer_loop(fetch_count): if with_scaling: inv_jac_multiplier = ("fp_tex1Dfetch(scaling_tex," "(GLOBAL_MB_NR + %(inl)d)*MB_EL_COUNT + mb_el)") else: inv_jac_multiplier = "1" write_condition = "MB_DOF < DOFS_PER_EL*MB_EL_COUNT" if self.with_index_check: write_condition += " && GLOBAL_MB_NR < microblock_count" return For("unsigned short seq_mb_number = 0", "seq_mb_number < SEQ_MB_COUNT", "++seq_mb_number", Block([ Initializer(POD(float_type, "result%d" % inl), 0) for inl in range(par.inline) ]+[Line()] +get_mat_mul_code(fetch_count) +[ If(write_condition, Block([ Assign( "out_vector[GLOBAL_MB_DOF_BASE" " + %d*ALIGNED_DOFS_PER_MB" " + MB_DOF]" % inl, "result%d * %s" % (inl, (inv_jac_multiplier % {"inl":inl})) ) for inl in range(par.inline) ]) ) ]) ) if self.plan.use_prefetch_branch: from cgen import make_multiple_ifs f_body.append(make_multiple_ifs([ ("segment_el_count == %d" % fetch_count, mat_mul_outer_loop(fetch_count)) for fetch_count in range(1, self.plan.max_elements_touched_by_segment()+1)] 
                ))
    else:
        f_body.append(mat_mul_outer_loop(0))

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file(self.plan.debug_name, ".cu").write(str(cmod))

    mod = SourceModule(cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
            )

    func = mod.get_function("apply_el_local_mat_smem_mat")

    if self.plan.debug_name in discr.debug:
        print "%s: lmem=%d smem=%d regs=%d" % (
                self.plan.debug_name,
                func.local_size_bytes,
                func.shared_size_bytes,
                func.num_regs)

    in_vector_texref = mod.get_texref("in_vector_tex")
    texrefs = [in_vector_texref]

    if with_scaling:
        scaling_texref = mod.get_texref("scaling_tex")
        texrefs.append(scaling_texref)
    else:
        scaling_texref = None

    func.prepare(
            "PPPI",
            block=(self.plan.segment_size, self.plan.parallelism.parallel, 1),
            texrefs=texrefs)

    return func, in_vector_texref, scaling_texref
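# Usage sketch (every caller-side name below is an assumption for
# illustration, not part of this source; the real driver lives elsewhere
# in hedge). With this older PyCUDA prepare() style, the block size is
# fixed at preparation time, so prepared_call takes only the grid and the
# "PPPI" arguments: three device pointers and one uint32.

def launch_el_local_kernel_sketch(kernel_obj, in_vector, out_vec,
        matrix_gpu, debugbuf, mb_count, grid):
    func, in_vector_texref, scaling_texref = \
            kernel_obj.get_kernel(with_scaling=False)

    # Bind the input GPUArray to the texture the kernel fetches from
    # (allow_double_hack covers the fp_tex_double case).
    in_vector.bind_to_texref_ext(in_vector_texref, allow_double_hack=True)

    func.prepared_call(grid,
            out_vec.gpudata, matrix_gpu.gpudata, debugbuf.gpudata, mb_count)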
def get_scalar_diff_code():
    code = []
    for inl in range(par.inline):
        for axis in dims:
            code.append(
                Initializer(POD(float_type, "d%drst%d" % (inl, axis)), 0))

    code.append(Line())

    tex_channels = ["x", "y", "z", "w"]

    store_code = Block()
    for inl in range(par.inline):
        for rst_axis in dims:
            store_code.append(Assign(
                "drst%d_global[GLOBAL_MB_IMAGE_DOF_BASE + "
                "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]"
                % (rst_axis, inl),
                "d%drst%d" % (inl, rst_axis),
                ))

    from hedge.backends.cuda.tools import unroll
    code.extend([
        Comment("everybody needs to be done with the old data"),
        S("__syncthreads()"),
        Line(),
        get_load_code(),
        Line(),
        Comment("all the new data must be loaded"),
        S("__syncthreads()"),
        Line(),
        ])

    if float_type == numpy.float32:
        code.append(Value("float%d" % rst_channels, "dmat_entries"))

    code.extend([
        POD(float_type, "field_value%d" % inl)
        for inl in range(par.inline)
        ] + [Line()])

    def unroll_body(j):
        result = [
            Assign("field_value%d" % inl,
                "smem_field[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL+%s]"
                % (inl, j))
            for inl in range(par.inline)
            ]

        if float_type == numpy.float32:
            result.append(Assign("dmat_entries",
                "tex1Dfetch(diff_rst_mat_tex, IMAGE_EL_DOF + %s*IMAGE_DOFS_PER_EL)"
                % j))
            result.extend(
                S("d%drst%d += dmat_entries.%s * field_value%d"
                    % (inl, axis, tex_channels[axis], inl))
                for inl in range(par.inline)
                for axis in dims)
        elif float_type == numpy.float64:
            result.extend(
                S("d%(inl)drst%(axis)d += "
                    "fp_tex1Dfetch(diff_rst_mat_tex, %(axis)d "
                    "+ DIMENSIONS*(IMAGE_EL_DOF + %(j)d*IMAGE_DOFS_PER_EL))"
                    "* field_value%(inl)d"
                    % {"inl": inl, "axis": axis, "j": j})
                for inl in range(par.inline)
                for axis in dims)
        else:
            assert False

        return result

    code.append(If("IMAGE_MB_DOF < IMAGE_DOFS_PER_MB",
        Block(unroll(unroll_body, total_number=plan.preimage_dofs_per_el)
            + [store_code])))

    return code
def get_kernel(self, for_benchmark=False):
    from cgen import \
            Pointer, POD, Value, ArrayOf, Const, \
            Module, FunctionDeclaration, FunctionBody, Block, \
            Comment, Line, Include, \
            Define, \
            Initializer, If, For, Statement, Assign

    from cgen import dtype_to_ctype
    from cgen.cuda import CudaShared, CudaGlobal

    discr = self.discr
    d = discr.dimensions
    dims = range(d)
    given = self.plan.given

    float_type = given.float_type

    f_decl = CudaGlobal(FunctionDeclaration(
        Value("void", "apply_el_local_mat_smem_field"),
        [
            Pointer(POD(float_type, "out_vector")),
            Pointer(POD(float_type, "in_vector")),
            Pointer(POD(float_type, "debugbuf")),
            POD(numpy.uint32, "microblock_count"),
            ]
        ))

    cmod = Module([
            Include("pycuda-helpers.hpp"),
            Line(),
            Value("texture<fp_tex_%s, 2, cudaReadModeElementType>"
                % dtype_to_ctype(float_type),
                "mat_tex"),
            ])

    plan = self.plan
    par = plan.parallelism

    # FIXME: aligned_image_dofs_per_microblock must be divisible
    # by this, therefore hardcoding for now.
    chunk_size = 16

    cmod.extend([
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("IMAGE_DOFS_PER_EL", plan.image_dofs_per_el),
            Define("PREIMAGE_DOFS_PER_EL", plan.preimage_dofs_per_el),
            Define("ALIGNED_IMAGE_DOFS_PER_MB",
                plan.aligned_image_dofs_per_microblock),
            Define("ALIGNED_PREIMAGE_DOFS_PER_MB",
                plan.aligned_preimage_dofs_per_microblock),
            Line(),
            Define("MB_EL_COUNT", plan.elements_per_microblock),
            Line(),
            Define("IMAGE_DOFS_PER_MB", "(IMAGE_DOFS_PER_EL*MB_EL_COUNT)"),
            Line(),
            Define("CHUNK_SIZE", chunk_size),
            Define("CHUNK_DOF", "threadIdx.x"),
            Define("PAR_MB_NR", "threadIdx.y"),
            Define("CHUNK_NR", "threadIdx.z"),
            Define("IMAGE_MB_DOF", "(CHUNK_NR*CHUNK_SIZE+CHUNK_DOF)"),
            Define("IMAGE_EL_DOF", "(IMAGE_MB_DOF - mb_el*IMAGE_DOFS_PER_EL)"),
            Line(),
            Define("MACROBLOCK_NR", "blockIdx.x"),
            Line(),
            Define("PAR_MB_COUNT", par.parallel),
            Define("INLINE_MB_COUNT", par.inline),
            Define("SEQ_MB_COUNT", par.serial),
            Line(),
            Define("GLOBAL_MB_NR_BASE",
                "(MACROBLOCK_NR*PAR_MB_COUNT*INLINE_MB_COUNT*SEQ_MB_COUNT)"),
            Define("GLOBAL_MB_NR",
                "(GLOBAL_MB_NR_BASE"
                "+ (seq_mb_number*PAR_MB_COUNT + PAR_MB_NR)*INLINE_MB_COUNT)"),
            Define("GLOBAL_MB_IMAGE_DOF_BASE",
                "(GLOBAL_MB_NR*ALIGNED_IMAGE_DOFS_PER_MB)"),
            Define("GLOBAL_MB_PREIMAGE_DOF_BASE",
                "(GLOBAL_MB_NR*ALIGNED_PREIMAGE_DOFS_PER_MB)"),
            Line(),
            CudaShared(
                ArrayOf(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_in_vector"),
                            "PAR_MB_COUNT"),
                        "INLINE_MB_COUNT"),
                    "ALIGNED_PREIMAGE_DOFS_PER_MB")),
            Line(),
            ])

    S = Statement
    f_body = Block([
        Initializer(Const(POD(numpy.uint16, "mb_el")),
            "IMAGE_MB_DOF / IMAGE_DOFS_PER_EL"),
        Line(),
        ])

    def get_load_code():
        mb_img_dofs = plan.aligned_image_dofs_per_microblock
        mb_preimg_dofs = plan.aligned_preimage_dofs_per_microblock
        preimg_dofs_over_dofs = (mb_preimg_dofs+mb_img_dofs-1) // mb_img_dofs

        load_code = []
        store_code = []

        var_num = 0
        for load_block in range(preimg_dofs_over_dofs):
            for inl in range(par.inline):
                # load and store are split for better pipelining
                # compiler can't figure that out because of branch

                var = "tmp%d" % var_num
                var_num += 1
                load_code.append(POD(float_type, var))

                block_addr = ("%d * ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF"
                        % load_block)
                load_instr = Assign(var,
                    "in_vector[GLOBAL_MB_PREIMAGE_DOF_BASE"
                    " + %d*ALIGNED_PREIMAGE_DOFS_PER_MB"
                    " + %s]" % (inl, block_addr))
                store_instr = Assign(
                        "smem_in_vector[PAR_MB_NR][%d][%s]" % (inl, block_addr),
                        var
                        )
                if (load_block+1)*mb_img_dofs >= mb_preimg_dofs:
                    cond = "%s < ALIGNED_PREIMAGE_DOFS_PER_MB" % block_addr
                    load_instr = If(cond, load_instr)
                    store_instr = If(cond, store_instr)

                load_code.append(load_instr)
                store_code.append(store_instr)

        return Block(load_code + [Line()] + store_code)

    def get_matmul_code():
        from hedge.backends.cuda.tools import unroll

        index_check_condition = "GLOBAL_MB_NR < microblock_count"

        def if_(conditions, then):
            final_cond = " && ".join(cond for cond in conditions if cond)
            if final_cond:
                return If(final_cond, then)
            else:
                return then

        result = Block([
            Comment("everybody needs to be done with the old data"),
            S("__syncthreads()"),
            Line(),
            ] + [If(index_check_condition, get_load_code())] + [
            Line(),
            Comment("all the new data must be loaded"),
            S("__syncthreads()"),
            Line(),
            ] + [
            Initializer(POD(float_type, "result%d" % inl), 0)
            for inl in range(par.inline)
            ] + [
            Line(),
            POD(float_type, "mat_entry"),
            Line(),
            ])

        result.append(if_(
            ["IMAGE_MB_DOF < IMAGE_DOFS_PER_MB", index_check_condition],
            Block(unroll(lambda j: [
                Assign("mat_entry", "fp_tex2D(mat_tex, IMAGE_EL_DOF, %s)" % j)
                ] + [
                S("result%d += mat_entry "
                    "* smem_in_vector[PAR_MB_NR][%d][mb_el*PREIMAGE_DOFS_PER_EL + %s]"
                    % (inl, inl, j))
                for inl in range(par.inline)
                ],
                total_number=plan.preimage_dofs_per_el)
                + [Line()]
                + [Assign(
                    "out_vector[GLOBAL_MB_IMAGE_DOF_BASE + "
                    "%d*ALIGNED_IMAGE_DOFS_PER_MB + IMAGE_MB_DOF]" % inl,
                    "result%d" % inl)
                    for inl in range(par.inline)]
                )))

        return result

    f_body.append(For("unsigned short seq_mb_number = 0",
        "seq_mb_number < SEQ_MB_COUNT",
        "++seq_mb_number",
        get_matmul_code()))

    # finish off ----------------------------------------------------------
    cmod.append(FunctionBody(f_decl, f_body))

    if not for_benchmark and "cuda_dump_kernels" in discr.debug:
        from hedge.tools import open_unique_debug_file
        open_unique_debug_file(plan.debug_name, ".cu").write(str(cmod))

    mod = SourceModule(cmod,
            keep="cuda_keep_kernels" in discr.debug,
            #options=["--maxrregcount=12"]
            )

    func = mod.get_function("apply_el_local_mat_smem_field")

    if plan.debug_name in discr.debug:
        print "%s: lmem=%d smem=%d regs=%d" % (
                plan.debug_name,
                func.local_size_bytes,
                func.shared_size_bytes,
                func.num_regs)

    mat_texref = mod.get_texref("mat_tex")
    texrefs = [mat_texref]

    func.prepare("PPPI", texrefs=texrefs)
    assert plan.aligned_image_dofs_per_microblock % chunk_size == 0
    block = (
            chunk_size,
            plan.parallelism.parallel,
            plan.aligned_image_dofs_per_microblock//chunk_size)

    return func, block, mat_texref
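# For context, a sketch under assumptions (not hedge's actual driver code):
# the local matrix would be copied into the 2D texture and the kernel
# launched with the block shape computed above. Since prepare() here no
# longer fixes the block size, the newer prepared_call(grid, block, *args)
# form applies. All parameter names are hypothetical, and the sketch
# assumes float32 (the fp_tex_double path needs the pycuda-helpers
# double-as-int2 binding instead).

def launch_smem_field_kernel_sketch(kernel_obj, local_mat, in_vec, out_vec,
        debugbuf, mb_count, macroblock_count):
    import numpy
    import pycuda.driver as cuda

    func, block, mat_texref = kernel_obj.get_kernel()

    # Copy the (image dofs) x (preimage dofs) matrix into the 2D texture.
    cuda.matrix_to_texref(
            numpy.asarray(local_mat, dtype=numpy.float32, order="C"),
            mat_texref, order="C")

    func.prepared_call((macroblock_count, 1), block,
            out_vec.gpudata, in_vec.gpudata, debugbuf.gpudata, mb_count)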
def write_interior_flux_code(self, is_twosided):
    given = self.plan.given

    def get_field(flux_rec, is_interior, flipped):
        if is_interior ^ flipped:
            prefix = "a"
        else:
            prefix = "b"

        return ("val_%s_field%d"
                % (prefix, self.dep_to_index[flux_rec.field_expr]))

    flux_write_code = Block([])

    flux_var_decl = [Initializer(POD(given.float_type, "a_flux"), 0)]

    if is_twosided:
        flux_var_decl.append(Initializer(POD(given.float_type, "b_flux"), 0))
        prefixes = ["a", "b"]
        flip_values = [False, True]
    else:
        prefixes = ["a"]
        flip_values = [False]

    flux_write_code.append(Line())

    for dep in self.interior_deps:
        flux_write_code.append(Comment(str(dep)))

        for side in ["a", "b"]:
            flux_write_code.append(
                Initializer(
                    MaybeUnused(POD(given.float_type,
                        "val_%s_field%d" % (side, self.dep_to_index[dep]))),
                    "fp_tex1Dfetch(field%d_tex, %s_index)"
                    % (self.dep_to_index[dep], side)))

    f2cm = FluxToCodeMapper(given.float_type)

    flux_sub_codes = []
    for flux_nr, wdflux in enumerate(self.fluxes):
        my_flux_block = Block(flux_var_decl)

        for int_rec in wdflux.interiors:
            for prefix, is_flipped in zip(prefixes, flip_values):
                my_flux_block.append(
                    Statement("%s_flux += %s"
                        % (prefix,
                            flux_to_code(f2cm, is_flipped,
                                int_rec.field_expr,
                                int_rec.field_expr,
                                self.dep_to_index,
                                int_rec.flux_expr, PREC_NONE),
                            )))

        my_flux_block.append(Line())

        my_flux_block.append(
            self.gen_store(flux_nr, "fpair->a_dest+FACEDOF_NR",
                "fpair->face_jacobian*a_flux"))

        #my_flux_block.append(
            #Statement("if(isnan(val_b_field5)) debugbuf[blockIdx.x] = 1"),
            #)

        if is_twosided:
            my_flux_block.append(
                self.gen_store(flux_nr,
                    "fpair->b_dest+tex1Dfetch(tex_index_lists, "
                    "fpair->b_write_ilist_index + FACEDOF_NR)",
                    "fpair->face_jacobian*b_flux"))

            #my_flux_block.append(
                #Assign("debugbuf[blockIdx.x*96+fpair_nr+8]", "10000+fpair->b_dest"),
                #)

        flux_sub_codes.append(my_flux_block)

    if f2cm.cse_name_list:
        flux_write_code.append(Line())

    flux_write_code.extend(
        Initializer(
            Value("value_type", cse_name),
            cse_str)
        for cse_name, cse_str in f2cm.cse_name_list)

    flux_write_code.extend(flux_sub_codes)

    return flux_write_code
def generate_body(self, kernel, codegen_state):
    from cgen import Block
    body = Block()

    temp_decls = []

    # {{{ declare temporaries

    base_storage_sizes = {}
    base_storage_to_is_local = {}
    base_storage_to_align_bytes = {}

    from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
    from loopy.codegen import POD  # uses the correct complex type

    class ConstRestrictPointer(Pointer):
        def get_decl_pair(self):
            sub_tp, sub_decl = self.subdecl.get_decl_pair()
            return sub_tp, ("*const restrict %s" % sub_decl)

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        decl_info = tv.decl_info(self, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                temp_var_decl = POD(self, idi.dtype, idi.name)

                if idi.shape:
                    temp_var_decl = ArrayOf(temp_var_decl,
                            " * ".join(str(s) for s in idi.shape))

                temp_decls.append(
                        self.wrap_temporary_decl(temp_var_decl, tv.is_local))

        else:
            offset = 0
            base_storage_sizes.setdefault(tv.base_storage, []).append(
                    tv.nbytes)
            base_storage_to_is_local.setdefault(tv.base_storage, []).append(
                    tv.is_local)

            align_size = tv.dtype.itemsize

            from loopy.kernel.array import VectorArrayDimTag
            for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                if isinstance(dim_tag, VectorArrayDimTag):
                    align_size *= axis_len

            base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                    align_size)

            for idi in decl_info:
                cast_decl = POD(self, idi.dtype, "")
                temp_var_decl = POD(self, idi.dtype, idi.name)

                cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
                temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.is_local)

                # The 'restrict' part of this is a complete lie--of course
                # all these temporaries are aliased. But we're promising to
                # not use them to shovel data from one representation to the
                # other. That counts, right?

                cast_decl = ConstRestrictPointer(cast_decl)
                temp_var_decl = ConstRestrictPointer(temp_var_decl)

                cast_tp, cast_d = cast_decl.get_decl_pair()
                temp_var_decl = Initializer(
                        temp_var_decl,
                        "(%s %s) (%s + %s)" % (
                            " ".join(cast_tp), cast_d,
                            tv.base_storage, offset))

                temp_decls.append(temp_var_decl)

                from pytools import product
                offset += (
                        idi.dtype.itemsize
                        * product(si for si in idi.shape))

    for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
        bs_var_decl = POD(self, np.int8, bs_name)
        bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, base_storage_to_is_local[bs_name])
        bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        body.append(bs_var_decl)

    body.extend(temp_decls)

    # }}}

    from loopy.codegen.loop import set_up_hw_parallel_loops
    gen_code = set_up_hw_parallel_loops(kernel, 0, codegen_state)

    from cgen import Line
    body.append(Line())

    if isinstance(gen_code.ast, Block):
        body.extend(gen_code.ast.contents)
    else:
        body.append(gen_code.ast)

    return body, gen_code.implemented_domains
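# To make the aliasing scheme above concrete, a minimal standalone sketch
# (toy dtype, size, offset, and alignment; plain cgen.POD in place of
# loopy's target-aware POD) of the declarations the base_storage branch
# emits:

def base_storage_decl_sketch():
    import numpy as np
    from cgen import AlignedAttribute, ArrayOf, Initializer, POD, Pointer

    # One byte-typed backing array, aligned for the widest participant...
    base = AlignedAttribute(4, ArrayOf(POD(np.int8, "base_storage"), 256))

    # ...and a per-temporary pointer cast into it at its byte offset,
    # mirroring the Initializer built in the loop above. Renders roughly
    # as: float *tmp = (float *const restrict) (base_storage + 0);
    tmp = Initializer(
            Pointer(POD(np.float32, "tmp")),
            "(float *const restrict) (base_storage + 0)")

    return "%s\n%s" % (base, tmp)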