def __init__(self, halo, grid, dimension, in_grid_name="in_grid", out_grid_name="out_grid", device=None): """ :param halo: the halo shape :param grid: the shape of the grid that stencil is to be applied to :param dimension: the dimension this kernel applies to :return: """ self.halo = tuple(halo) self.grid = grid self.shape = grid.shape self.in_grid_name = in_grid_name self.out_grid_name = out_grid_name # check for some pathologies and raise exception if any are present if len(halo) != len(self.shape): raise StencilException("halo {} can't apply to grid shape {}".format(self.halo, self.shape)) if dimension < 0 or dimension >= len(self.shape): raise StencilException("dimension {} too big for grid shape {}".format(dimension, self.shape)) # halo or grid to small if any([x < 1 or y < 1 for x, y in zip(self.halo, self.shape)]): raise StencilException( "halo {} can't be bigger than grid {} in any dimension".format(self.halo, self.shape) ) # no interior points in a dimension if any([s <= 2*h for h, s in zip(self.halo, self.shape)]): raise StencilException( "halo {} can't span grid shape {} in any dimension".format(self.halo, self.shape) ) self.dimension = dimension self.dimensions = len(grid.shape) self.device = device self.global_size, self.global_offset = self.compute_global_size() lcs = LocalSizeComputer(self.global_size, device=device) self.local_size = lcs.compute_local_size_thin() self.virtual_global_size = lcs.compute_virtual_global_size(self.local_size) self.kernel_name = OclBoundaryCopier.kernel_name(self.dimension)
def test_local_size_computer_bulky(self): test_grid_shape = [4, 128] lsc = LocalSizeComputer(test_grid_shape, MockIrisPro) local_size = lsc.compute_local_size_bulky() print("{} ls {}".format(test_grid_shape, local_size)) sizes = [ [[4], (4,), (4,)], [[5], (5,), (5,)], [[255], (255,), (255,)], [[1023], (1023,), (341,)], [[1024], (1024,), (512,)], [[1025], (205,), (205,)], [[1, 4], (1, 1), (1, 4)], [[4, 1], (4, 1), (4, 1)], [[4, 4], (4, 1), (4, 4)], [[4, 128], (4, 1), (4, 128)], [[128, 4], (128, 1), (32, 4)], [[128, 7], (128, 1), (32, 7)], [[128, 128], (128, 1), (16, 32)], [[4, 4, 4], (4, 1, 1), (4, 4, 4)], [[4, 4, 512], (4, 1, 1), (4, 4, 32)], [[512, 512, 4], (512, 1, 1), (8, 8, 4)], [[512, 512, 512], (512, 1, 1), (8, 8, 8)], [[3, 3, 666], (3, 1, 1), (3, 3, 37)], [[99, 99, 99], (99, 1, 1), (3, 11, 11)], ] for grid_shape, expected_cpu_local_size, expected_gpu_local_size in sizes: print("size {!s:16s}".format(grid_shape), end="") cpu_local_size = LocalSizeComputer(grid_shape, MockCPU).compute_local_size_bulky() gpu_local_size = LocalSizeComputer(grid_shape, MockIrisPro).compute_local_size_bulky() print(" d0 cpu local_size {!s:15s} gpu local_size {!s:15s}".format(cpu_local_size, gpu_local_size)) self.assertListEqual(list(cpu_local_size), list(expected_cpu_local_size)) self.assertListEqual(list(gpu_local_size), list(expected_gpu_local_size))
def visit_FunctionDecl(self, node): # This function grabs the input and output grid names which are used to self.local_block = SymbolRef.unique() # generate the proper array macros. arg_cfg = self.arg_cfg global_size = arg_cfg[0].shape if self.testing: local_size = (1, 1, 1) else: desired_device_number = -1 device = cl.clGetDeviceIDs()[desired_device_number] lcs = LocalSizeComputer(global_size, device) local_size = lcs.compute_local_size_bulky() virtual_global_size = lcs.compute_virtual_global_size(local_size) self.global_size = global_size self.local_size = local_size self.virtual_global_size = virtual_global_size super(StencilOclTransformer, self).visit_FunctionDecl(node) for index, param in enumerate(node.params[:-1]): # TODO: Transform numpy type to ctype param.type = ct.POINTER(ct.c_float)() param.set_global() param.set_const() node.set_kernel() node.params[-1].set_global() node.params[-1].type = ct.POINTER(ct.c_float)() node.params.append(SymbolRef(self.local_block.name, ct.POINTER(ct.c_float)())) node.params[-1].set_local() node.defn = node.defn[0] # if boundary handling is copy we have to generate a collection of # boundary kernels to handle the on-gpu boundary copy if self.is_copied: device = cl.clGetDeviceIDs()[-1] self.boundary_handlers = boundary_kernel_factory( self.ghost_depth, self.output_grid, node.params[0].name, node.params[-2].name, # second last parameter is output device ) boundary_kernels = [ FunctionDecl( name=boundary_handler.kernel_name, params=node.params, defn=boundary_handler.generate_ocl_kernel_body(), ) for boundary_handler in self.boundary_handlers ] self.project.files.append(OclFile('kernel', [node])) for dim, boundary_kernel in enumerate(boundary_kernels): boundary_kernel.set_kernel() self.project.files.append(OclFile(kernel_dim_name(dim), [boundary_kernel])) self.boundary_kernels = boundary_kernels # ctree.browser_show_ast(node) # import ctree # ctree.browser_show_ast(boundary_kernels[0]) else: self.project.files.append(OclFile('kernel', [node])) # print(self.project.files[0]) # print(self.project.files[-1]) defn = [ ArrayDef( SymbolRef('global', ct.c_ulong()), arg_cfg[0].ndim, [Constant(d) for d in self.virtual_global_size] ), ArrayDef( SymbolRef('local', ct.c_ulong()), arg_cfg[0].ndim, [Constant(s) for s in local_size] # [Constant(s) for s in [512, 512]] # use this line to force a # opencl local size error ), Assign(SymbolRef("error_code", ct.c_int()), Constant(0)), ] setargs = [clSetKernelArg( SymbolRef('kernel'), Constant(d), FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]), Ref(SymbolRef('buf%d' % d)) ) for d in range(len(arg_cfg) + 1)] from functools import reduce import operator local_mem_size = reduce( operator.mul, (size + 2 * self.kernel.ghost_depth[index] for index, size in enumerate(local_size)), ct.sizeof(cl.cl_float()) ) setargs.append( clSetKernelArg( 'kernel', len(arg_cfg) + 1, local_mem_size, NULL() ) ) defn.extend(setargs) enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [ SymbolRef('queue'), SymbolRef('kernel'), Constant(self.kernel.dim), NULL(), SymbolRef('global'), SymbolRef('local'), Constant(0), NULL(), NULL() ]) defn.extend(check_ocl_error(enqueue_call, "clEnqueueNDRangeKernel")) params = [ SymbolRef('queue', cl.cl_command_queue()), SymbolRef('kernel', cl.cl_kernel()) ] if self.is_copied: for dim, boundary_kernel in enumerate(self.boundary_kernels): defn.extend([ ArrayDef( SymbolRef(global_for_dim_name(dim), ct.c_ulong()), arg_cfg[0].ndim, [Constant(d) for d in self.boundary_handlers[dim].global_size] ), ArrayDef( SymbolRef(local_for_dim_name(dim), ct.c_ulong()), arg_cfg[0].ndim, [Constant(s) for s in self.boundary_handlers[dim].local_size] ) ]) setargs = [clSetKernelArg( SymbolRef(kernel_dim_name(dim)), Constant(d), FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]), Ref(SymbolRef('buf%d' % d)) ) for d in range(len(arg_cfg) + 1)] setargs.append( clSetKernelArg( SymbolRef(kernel_dim_name(dim)), len(arg_cfg) + 1, local_mem_size, NULL() ) ) defn.extend(setargs) enqueue_call = FunctionCall( SymbolRef('clEnqueueNDRangeKernel'), [ SymbolRef('queue'), SymbolRef(kernel_dim_name(dim)), Constant(self.kernel.dim), NULL(), SymbolRef(global_for_dim_name(dim)), SymbolRef(local_for_dim_name(dim)), Constant(0), NULL(), NULL() ] ) defn.append(enqueue_call) params.extend([ SymbolRef(kernel_dim_name(dim), cl.cl_kernel()) ]) # finish_call = FunctionCall(SymbolRef('clFinish'), # [SymbolRef('queue')]) # defn.append(finish_call) # finish_call = [ # Assign( # SymbolRef("error_code", ct.c_int()), # FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]) # ), # If( # NotEq(SymbolRef("error_code"), Constant(0)), # FunctionCall( # SymbolRef("printf"), # [ # String("OPENCL KERNEL RETURNED ERROR CODE %d"), # SymbolRef("error_code") # ] # ) # ) # ] finish_call = check_ocl_error( FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]), "clFinish" ) defn.extend(finish_call) defn.append(Return(SymbolRef("error_code"))) params.extend(SymbolRef('buf%d' % d, cl.cl_mem()) for d in range(len(arg_cfg) + 1)) control = FunctionDecl(ct.c_int32(), "stencil_control", params=params, defn=defn) return control