def vol_empty():
    """Allocate an uninitialized GPU vector padded to whole macroblocks.

    Uses the enclosing scope's ``given``, ``self`` and ``discr``; the
    returned array covers ``given.total_dofs()`` rounded up to a multiple
    of the plan's macroblock size.
    """
    from hedge.backends.cuda.tools import int_ceiling

    # Round the dof count up so the buffer spans complete macroblocks.
    padded_dofs = int_ceiling(
            given.total_dofs(), self.plan.dofs_per_macroblock())

    return gpuarray.empty(
            (padded_dofs,),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
def _find_microblock_size(self, allow_microblocking):
    """Choose how many elements to pack into one aligned microblock.

    Returns a ``MicroblockInfo`` record with fields ``align_size``
    (alignment granularity in floats), ``elements`` (elements per
    microblock) and ``aligned_floats`` (padded microblock length).

    If *allow_microblocking* is false, each microblock holds exactly one
    element, padded up to the alignment granularity. Otherwise the
    smallest aligned size whose padding overhead is at most 5% is used.

    :raises RuntimeError: if no acceptable microblock size exists within
        255 alignment chunks.
    """
    from hedge.backends.cuda.tools import int_ceiling
    align_size = self.devdata.align_words(self.float_size())

    from pytools import Record

    class MicroblockInfo(Record):
        pass

    if not allow_microblocking:
        return MicroblockInfo(
                align_size=align_size,
                elements=1,
                aligned_floats=int_ceiling(self.dofs_per_el(), align_size))

    for mb_align_chunks in range(1, 256):
        mb_aligned_floats = align_size * mb_align_chunks
        mb_elements = mb_aligned_floats // self.dofs_per_el()
        mb_floats = self.dofs_per_el() * mb_elements

        # Fraction of the aligned span wasted on padding (true division).
        overhead = (mb_aligned_floats - mb_floats) / mb_aligned_floats
        if overhead <= 0.05:
            return MicroblockInfo(
                    align_size=align_size,
                    elements=mb_elements,
                    aligned_floats=mb_aligned_floats)

    # Raise a real exception rather than 'assert False': asserts are
    # stripped under 'python -O', which would let execution fall off the
    # end and silently return None.
    raise RuntimeError("a valid microblock size was not found")
def __init__(self, discr, plan):
    """Store discretization and plan, and precompute the CUDA launch grid."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan

    # One grid slot per macroblock of dofs; second grid dimension unused.
    macroblock_count = int_ceiling(
            self.plan.given.total_dofs() / self.plan.dofs_per_macroblock())
    self.grid = (macroblock_count, 1)
def max_elements_touched_by_segment(self):
    """Upper bound on how many elements a single segment can overlap."""
    from hedge.backends.cuda.tools import int_ceiling

    given = self.given
    el_dofs = given.dofs_per_el()

    if el_dofs > self.segment_size:
        # The segment is shorter than one element: it can straddle at
        # most two adjacent elements.
        return 2

    # Otherwise it covers several whole elements, plus possibly one
    # partial element at the boundary.
    return int_ceiling(self.segment_size / el_dofs) + 1
def vol_empty():
    """Allocate an uninitialized GPU vector covering whole macroblocks.

    Reads ``given``, ``self`` and ``discr`` from the enclosing scope.
    """
    from hedge.backends.cuda.tools import int_ceiling
    import pycuda.gpuarray as gpuarray

    # Pad the dof count to a multiple of the macroblock size.
    padded_dofs = int_ceiling(
            given.total_dofs(), self.plan.dofs_per_macroblock())

    return gpuarray.empty(
            (padded_dofs,),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
def threads_per_face(self):
    """Thread count per face, padded to shared-memory granularity when
    the remainder is large enough to make padding worthwhile."""
    dpf = self.dofs_per_face
    granularity = self.given.devdata.smem_granularity

    if dpf % granularity < granularity // 2:
        # Small remainder: padding would cost more than it saves.
        return dpf

    from hedge.backends.cuda.tools import int_ceiling
    return int_ceiling(dpf, granularity)
def __init__(self, discr, plan):
    """Remember construction arguments and derive the 1D launch grid."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan

    total_dofs = self.plan.given.total_dofs()
    per_macroblock = self.plan.dofs_per_macroblock()
    self.grid = (int_ceiling(total_dofs / per_macroblock), 1)
def matmul_preimage_shape(self, matmul_plan):
    """Shape tuple of the preimage vector required by *matmul_plan*."""
    from hedge.backends.cuda.tools import int_ceiling

    raw_dofs = (
            self.block_count
            * self.microblocks_per_block
            * matmul_plan.aligned_preimage_dofs_per_microblock)

    # Pad up to a whole number of preimage macroblocks.
    padded_dofs = int_ceiling(
            raw_dofs, matmul_plan.preimage_dofs_per_macroblock())

    return (padded_dofs,)
def __init__(self, discr, plan):
    """Store references and compute the 2D kernel launch grid."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan

    given = self.plan.given
    macroblock_count = int_ceiling(
            given.total_dofs() / plan.dofs_per_macroblock())
    self.grid = (plan.segments_per_microblock(), macroblock_count)
def __init__(self, discr, plan, with_index_check):
    """Record arguments and precompute the (segments, macroblocks) grid."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan
    self.with_index_check = with_index_check

    dof_count = self.plan.given.total_dofs()
    self.grid = (
            plan.segments_per_microblock(),
            int_ceiling(dof_count / plan.dofs_per_macroblock()))
def __init__(self, discr, plan):
    """Keep discr/plan and derive a 1D grid over image macroblocks."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan

    total_image_dofs = (
            plan.microblock_count
            * plan.aligned_image_dofs_per_microblock)
    self.grid = (
            int_ceiling(
                total_image_dofs / self.plan.image_dofs_per_macroblock()),
            1)
def __init__(self, discr, plan):
    """Stash arguments; grid is (segments per microblock, macroblock count)."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan
    given = self.plan.given

    self.grid = (
            plan.segments_per_microblock(),
            int_ceiling(given.total_dofs() / plan.dofs_per_macroblock()))
def benchmark(self):
    """Time one flux-gather kernel launch, averaged over several runs.

    Returns the average per-launch time in seconds, or ``None`` if the
    kernel cannot be launched (grid exceeds the CUDA per-dimension block
    limit, or the launch fails).
    """
    discr = self.discr
    given = self.plan.given

    from hedge.backends.cuda.tools import int_ceiling
    block_count = int_ceiling(
            len(discr.mesh.elements)/self.plan.elements_per_block())

    # The grid-size limit is loop-invariant: check it once up front
    # instead of re-testing it on every iteration of the timing loop
    # (original behavior was to return None on the first iteration).
    if block_count >= 2**16:
        return None

    # Scratch output buffers, one per flux expression.
    all_fluxes_on_faces = [gpuarray.empty(
            (block_count
                * self.plan.microblocks_per_block()
                * given.aligned_face_dofs_per_microblock(),),
            dtype=given.float_type,
            allocator=discr.pool.allocate)
            for _ in range(len(self.fluxes))]
    field = gpuarray.empty(
            (self.plan.input_dofs_per_block() * block_count,),
            dtype=given.float_type,
            allocator=discr.pool.allocate)

    # Synthetic face/index data stand in for real mesh connectivity.
    fdata = self.fake_flux_face_data_block(block_count)
    ilist_data = self.fake_index_list_data()

    block, gather, texref_map = self.get_kernel(
            fdata, ilist_data, for_benchmark=True)

    for dep_expr in self.all_deps:
        field.bind_to_texref_ext(texref_map[dep_expr],
                allow_double_hack=True)

    if "cuda_fastbench" in discr.debug:
        count = 1
    else:
        count = 20

    start = cuda.Event()
    start.record()
    for _ in range(count):
        try:
            gather.prepared_call(
                    (block_count, 1),
                    block, 0,
                    fdata.device_memory,
                    *tuple(fof.gpudata for fof in all_fluxes_on_faces))
        except cuda.LaunchError:
            return None

    stop = cuda.Event()
    stop.record()
    stop.synchronize()

    # time_since() reports milliseconds; convert to seconds per launch.
    return 1e-3/count * stop.time_since(start)
def benchmark(self):
    """Time one flux-gather kernel launch, averaged over several runs.

    Returns the average per-launch time in seconds, or ``None`` when the
    kernel cannot be launched (grid too large for CUDA, or launch error).
    """
    discr = self.discr
    given = self.plan.given

    from hedge.backends.cuda.tools import int_ceiling
    block_count = int_ceiling(
            len(discr.mesh.elements) / self.plan.elements_per_block())

    # Loop-invariant grid-limit check hoisted out of the timing loop;
    # the original tested it on every iteration and returned None on the
    # first pass anyway.
    if block_count >= 2**16:
        return None

    # One scratch output buffer per flux expression.
    all_fluxes_on_faces = [
        gpuarray.empty((block_count * self.plan.microblocks_per_block() *
                        given.aligned_face_dofs_per_microblock(), ),
                       dtype=given.float_type,
                       allocator=discr.pool.allocate)
        for _ in range(len(self.fluxes))
    ]
    field = gpuarray.empty(
        (self.plan.input_dofs_per_block() * block_count, ),
        dtype=given.float_type,
        allocator=discr.pool.allocate)

    # Synthetic connectivity data stands in for the real mesh.
    fdata = self.fake_flux_face_data_block(block_count)
    ilist_data = self.fake_index_list_data()

    block, gather, texref_map = self.get_kernel(fdata, ilist_data,
                                                for_benchmark=True)

    for dep_expr in self.all_deps:
        field.bind_to_texref_ext(texref_map[dep_expr],
                                 allow_double_hack=True)

    if "cuda_fastbench" in discr.debug:
        count = 1
    else:
        count = 20

    start = cuda.Event()
    start.record()
    for _ in range(count):
        try:
            gather.prepared_call(
                (block_count, 1), block, 0, fdata.device_memory,
                *tuple(fof.gpudata for fof in all_fluxes_on_faces))
        except cuda.LaunchError:
            return None

    stop = cuda.Event()
    stop.record()
    stop.synchronize()

    # time_since() yields milliseconds; convert to seconds per launch.
    return 1e-3 / count * stop.time_since(start)
def __init__(self, discr, plan, with_index_check):
    """Save construction arguments and build the kernel launch grid."""
    from hedge.backends.cuda.tools import int_ceiling

    self.discr = discr
    self.plan = plan
    self.with_index_check = with_index_check

    macroblock_count = int_ceiling(
            self.plan.given.total_dofs() / plan.dofs_per_macroblock())
    self.grid = (plan.segments_per_microblock(), macroblock_count)
def segments_per_microblock(self):
    """Number of segments needed to cover one microblock's aligned floats."""
    from hedge.backends.cuda.tools import int_ceiling

    mb_floats = self.given.microblock.aligned_floats
    return int_ceiling(mb_floats / self.segment_size)