def prepare_matrix(self, matrix):
    """Upload *matrix*, replicated per microblock element, to the GPU.

    The per-element matrix is stacked once for every element of a
    microblock, zero-padded on the right up to the plan's GPU column
    count, cut into row segments of ``plan.segment_size``, and each
    segment is padded to the plan's block size before being joined and
    copied to device memory.

    :param matrix: numpy array of shape
        ``(given.dofs_per_el(), plan.preimage_dofs_per_el)``.
    :returns: a device allocation (from ``cuda.to_device``) holding the
        padded, segmented matrix data.
    """
    # NOTE: the original also bound self.discr to an unused local;
    # that dead assignment has been removed.
    given = self.plan.given
    assert matrix.shape == (
            given.dofs_per_el(), self.plan.preimage_dofs_per_el)

    columns = self.plan.gpu_matrix_columns()
    block_floats = self.plan.gpu_matrix_block_floats()

    # replicate the matrix for every element in a microblock
    vstacked_matrix = numpy.vstack(
            given.microblock.elements*(matrix,))

    # zero-pad on the right so each row has the full GPU column count
    if vstacked_matrix.shape[1] < columns:
        vstacked_matrix = numpy.hstack((
            vstacked_matrix,
            numpy.zeros((
                vstacked_matrix.shape[0],
                columns-vstacked_matrix.shape[1]))
            ))

    segments = [
            buffer(numpy.asarray(
                vstacked_matrix[
                    segment_start:segment_start+self.plan.segment_size],
                dtype=given.float_type,
                order="C"))
            for segment_start in range(
                0, given.microblock.elements*given.dofs_per_el(),
                self.plan.segment_size)
            ]

    from hedge.backends.cuda.tools import pad_and_join
    return cuda.to_device(
            pad_and_join(segments, block_floats*given.float_size()))
def make_superblocks(devdata, struct_name, single_item, multi_item,
        extra_fields=None):
    """Lay out per-block data into equally sized, aligned "superblocks".

    :param devdata: device data descriptor; only ``devdata.align()`` is used.
    :param struct_name: name for the generated C struct describing a block.
    :param single_item: list of ``([block1, block2, ...], decl)`` pairs --
        one fixed-size string per block, plus its struct member declaration.
    :param multi_item: list of ``([[item1, item2, ...], ...], decl)`` pairs --
        a variable-length list of item strings per block; each part is
        padded to the longest block's length.
    :param extra_fields: optional dict of extra attributes to attach to the
        returned record.  (Was a mutable ``{}`` default, which is shared
        across calls in Python -- fixed to use a ``None`` sentinel.)
    :returns: a record with ``struct``, ``device_memory``, ``block_bytes``
        and ``data`` attributes, plus any *extra_fields*.
    """
    if extra_fields is None:
        extra_fields = {}

    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    # join each multi-item part's items into one string per block
    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    # padded size of each multi-item part: longest block wins
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    # all single-item parts must agree on the number of blocks
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # all blocks of a single-item part must have identical length
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            # pad each variable-length part to the common part size
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
def make_superblocks(devdata, struct_name, single_item, multi_item,
        extra_fields=None):
    """Pack per-block data into aligned, uniformly sized "superblocks".

    :param devdata: device data descriptor; only ``devdata.align()`` is used.
    :param struct_name: name of the generated C struct for one block.
    :param single_item: list of ``([block1, block2, ...], decl)`` pairs --
        one fixed-size string per block with its member declaration.
    :param multi_item: list of ``([[item1, item2, ...], ...], decl)`` pairs --
        per-block variable-length item lists, padded to the longest block.
    :param extra_fields: optional extra attributes for the returned record.
        Fixed from a shared mutable ``{}`` default to a ``None`` sentinel.
    :returns: a record carrying ``struct``, ``device_memory``,
        ``block_bytes`` and ``data``, plus any *extra_fields*.
    """
    if extra_fields is None:
        extra_fields = {}

    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    # one joined string per block, for each multi-item part
    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    # pad target for each multi-item part
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    # every single-item part must describe the same number of blocks
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        # single-item blocks must all be the same length
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
def gpu_diffmats(self, diff_op_cls, elgroup):
    """Assemble the differentiation matrices of *diff_op_cls* for
    *elgroup* into one padded, segmented block of device memory.

    :returns: a record with attributes ``device_memory`` (the uploaded
        data), ``block_floats`` (floats per padded segment block) and
        ``matrix_columns`` (column count after any padding column).
    """
    given = self.plan.given
    discr = self.discr

    matrix_columns = given.dofs_per_el() * discr.dimensions

    # avoid smem fetch bank conflicts by ensuring odd col count
    padding_columns = 0
    if matrix_columns % 2 == 0:
        matrix_columns += 1
        padding_columns = 1

    block_floats = given.devdata.align_dtype(
            matrix_columns * self.plan.segment_size, given.float_size())

    # replicate each differentiation matrix for every microblock element
    stacked_mats = [
            numpy.vstack(given.microblock.elements * (mat,))
            for mat in diff_op_cls.matrices(elgroup)]

    from pytools import single_valued

    microblock_rows = given.microblock.elements * given.dofs_per_el()
    seg_size = self.plan.segment_size

    segments = []
    for row_start in range(0, microblock_rows, seg_size):
        pieces = [mat[row_start:row_start + seg_size]
                for mat in stacked_mats]

        # append the (possibly empty) padding column block
        seg_rows = single_valued(p.shape[0] for p in pieces)
        pieces.append(numpy.zeros((seg_rows, padding_columns)))

        segment_array = numpy.asarray(
                numpy.hstack(pieces),
                dtype=given.float_type, order="C")
        segments.append(buffer(segment_array))

    from hedge.backends.cuda.tools import pad_and_join
    from pytools import Record

    class GPUDifferentiationMatrices(Record):
        pass

    return GPUDifferentiationMatrices(
            device_memory=cuda.to_device(
                pad_and_join(segments, block_floats * given.float_size())),
            block_floats=block_floats,
            matrix_columns=matrix_columns)
def make_blocks(devdata, data):
    """Join each group of strings in *data* into one block, pad every
    block to a common aligned size, and upload the result to the device.

    :param devdata: device data descriptor; only ``devdata.align()`` is used.
    :param data: a sequence of groups, each a sequence of strings.
    :returns: a record with ``blocks`` (device memory), ``max_per_block``
        (largest group length) and ``block_size`` (aligned block size).
    """
    from hedge.backends.cuda.tools import pad_and_join

    joined_blocks = ["".join(group) for group in data]
    aligned_size = devdata.align(
            max(len(blk) for blk in joined_blocks))

    class BlockedDataStructure(Record):
        pass

    return BlockedDataStructure(
            blocks=cuda.to_device(
                pad_and_join(joined_blocks, aligned_size)),
            max_per_block=max(len(group) for group in data),
            block_size=aligned_size,
            )
def make_blocks(devdata, data):
    """Concatenate every group in *data* into a single block string,
    pad all blocks to one aligned size, and copy them to device memory.

    :param devdata: device data descriptor providing ``align()``.
    :param data: sequence of groups, each group a sequence of strings.
    :returns: a record with ``blocks``, ``max_per_block`` and
        ``block_size`` attributes.
    """
    from hedge.backends.cuda.tools import pad_and_join

    block_strings = ["".join(grp) for grp in data]
    padded_size = devdata.align(max(len(s) for s in block_strings))

    class BlockedDataStructure(Record):
        pass

    return BlockedDataStructure(
            blocks=cuda.to_device(
                pad_and_join(block_strings, padded_size)),
            max_per_block=max(len(grp) for grp in data),
            block_size=padded_size,
            )
def gpu_diffmats(self, diff_op_cls, elgroup):
    """Build a single segmented, padded device-memory image of all
    differentiation matrices of *diff_op_cls* for *elgroup*.

    :returns: a record exposing ``device_memory``, ``block_floats``
        (floats per padded segment block) and ``matrix_columns``
        (columns per row, including any bank-conflict padding column).
    """
    given = self.plan.given
    discr = self.discr

    n_cols = given.dofs_per_el() * discr.dimensions

    # avoid smem fetch bank conflicts by ensuring odd col count
    extra_cols = 0
    if n_cols % 2 == 0:
        n_cols += 1
        extra_cols = 1

    block_floats = given.devdata.align_dtype(
            n_cols * self.plan.segment_size, given.float_size())

    # stack each matrix once per microblock element
    replicated = [
            numpy.vstack(given.microblock.elements * (m,))
            for m in diff_op_cls.matrices(elgroup)]

    from pytools import single_valued

    total_rows = given.microblock.elements * given.dofs_per_el()
    seg_len = self.plan.segment_size

    segments = []
    for start in range(0, total_rows, seg_len):
        slices = [m[start:start + seg_len] for m in replicated]
        row_count = single_valued(s.shape[0] for s in slices)
        # zero-filled padding column(s), possibly of width 0
        slices.append(numpy.zeros((row_count, extra_cols)))

        segments.append(buffer(numpy.asarray(
                numpy.hstack(slices),
                dtype=given.float_type, order="C")))

    from hedge.backends.cuda.tools import pad_and_join
    from pytools import Record

    class GPUDifferentiationMatrices(Record):
        pass

    return GPUDifferentiationMatrices(
            device_memory=cuda.to_device(
                pad_and_join(segments, block_floats * given.float_size())),
            block_floats=block_floats,
            matrix_columns=n_cols)
def prepare_matrix(self, matrix):
    """Replicate *matrix* over a microblock, pad and segment it, and
    copy the result into GPU device memory.

    The matrix is stacked once per microblock element, zero-padded on
    the right to the plan's GPU column count, split into row segments
    of ``plan.segment_size``, and the segments are padded to the block
    size before upload.

    :param matrix: numpy array of shape
        ``(given.dofs_per_el(), plan.preimage_dofs_per_el)``.
    :returns: device memory (via ``cuda.to_device``) holding the
        segmented matrix layout.
    """
    # NOTE: removed the dead local ``discr = self.discr`` present in
    # the original; it was never used.
    given = self.plan.given
    assert matrix.shape == (
            given.dofs_per_el(), self.plan.preimage_dofs_per_el)

    columns = self.plan.gpu_matrix_columns()
    block_floats = self.plan.gpu_matrix_block_floats()

    # one copy of the matrix per element in a microblock
    vstacked_matrix = numpy.vstack(
            given.microblock.elements*(matrix,))

    # right-pad with zeros up to the GPU column count
    if vstacked_matrix.shape[1] < columns:
        vstacked_matrix = numpy.hstack((
            vstacked_matrix,
            numpy.zeros((
                vstacked_matrix.shape[0],
                columns-vstacked_matrix.shape[1]))
            ))

    segments = [
            buffer(numpy.asarray(
                vstacked_matrix[
                    segment_start:segment_start+self.plan.segment_size],
                dtype=given.float_type,
                order="C"))
            for segment_start in range(
                0, given.microblock.elements*given.dofs_per_el(),
                self.plan.segment_size)
            ]

    from hedge.backends.cuda.tools import pad_and_join
    return cuda.to_device(
            pad_and_join(segments, block_floats*given.float_size()))