def visit_KernelCall(self, node): args = [] for i, arg in enumerate(node.args): size = SizeOf(SymbolRef(arg.name)) setter = clSetKernelArg(node.name, i, size, Ref(SymbolRef(arg.name))) setter.lift(params=arg._lift_params) args.append(setter) kernel_decl = SymbolRef(node.name, cl.cl_kernel()) kernel_symbol = kernel_decl.copy() call = clEnqueueNDRangeKernel(node.location.symbol.copy(), kernel_symbol, work_dim=Constant(1), global_size=node.global_size, local_size=node.local_size) kernel = RefConverter().visit(node.kernel) for param in kernel.params: param.type = param.type.ptr_type kernel.defn.insert(0, Assign(SymbolRef("i", c_int()), get_global_id(0))) kernel_src = kernel.codegen() call.body.append(CppComment(kernel_src)) context = node.location.queue.context kernel_ptr = cl.clCreateProgramWithSource(context, kernel_src).build()[node.name] call.lift(params=[(kernel_decl, kernel_ptr)]) return args + [call]
def visit_FunctionDecl(self, node): # This function grabs the input and output grid names which are used to self.local_block = SymbolRef.unique() # generate the proper array macros. arg_cfg = self.arg_cfg global_size = arg_cfg[0].shape if self.testing: local_size = (1, 1, 1) else: desired_device_number = -1 device = cl.clGetDeviceIDs()[desired_device_number] lcs = LocalSizeComputer(global_size, device) local_size = lcs.compute_local_size_bulky() virtual_global_size = lcs.compute_virtual_global_size(local_size) self.global_size = global_size self.local_size = local_size self.virtual_global_size = virtual_global_size super(StencilOclTransformer, self).visit_FunctionDecl(node) for index, param in enumerate(node.params[:-1]): # TODO: Transform numpy type to ctype param.type = ct.POINTER(ct.c_float)() param.set_global() param.set_const() node.set_kernel() node.params[-1].set_global() node.params[-1].type = ct.POINTER(ct.c_float)() node.params.append(SymbolRef(self.local_block.name, ct.POINTER(ct.c_float)())) node.params[-1].set_local() node.defn = node.defn[0] # if boundary handling is copy we have to generate a collection of # boundary kernels to handle the on-gpu boundary copy if self.is_copied: device = cl.clGetDeviceIDs()[-1] self.boundary_handlers = boundary_kernel_factory( self.ghost_depth, self.output_grid, node.params[0].name, node.params[-2].name, # second last parameter is output device ) boundary_kernels = [ FunctionDecl( name=boundary_handler.kernel_name, params=node.params, defn=boundary_handler.generate_ocl_kernel_body(), ) for boundary_handler in self.boundary_handlers ] self.project.files.append(OclFile('kernel', [node])) for dim, boundary_kernel in enumerate(boundary_kernels): boundary_kernel.set_kernel() self.project.files.append(OclFile(kernel_dim_name(dim), [boundary_kernel])) self.boundary_kernels = boundary_kernels # ctree.browser_show_ast(node) # import ctree # ctree.browser_show_ast(boundary_kernels[0]) else: self.project.files.append(OclFile('kernel', [node])) # print(self.project.files[0]) # print(self.project.files[-1]) defn = [ ArrayDef( SymbolRef('global', ct.c_ulong()), arg_cfg[0].ndim, [Constant(d) for d in self.virtual_global_size] ), ArrayDef( SymbolRef('local', ct.c_ulong()), arg_cfg[0].ndim, [Constant(s) for s in local_size] # [Constant(s) for s in [512, 512]] # use this line to force a # opencl local size error ), Assign(SymbolRef("error_code", ct.c_int()), Constant(0)), ] setargs = [clSetKernelArg( SymbolRef('kernel'), Constant(d), FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]), Ref(SymbolRef('buf%d' % d)) ) for d in range(len(arg_cfg) + 1)] from functools import reduce import operator local_mem_size = reduce( operator.mul, (size + 2 * self.kernel.ghost_depth[index] for index, size in enumerate(local_size)), ct.sizeof(cl.cl_float()) ) setargs.append( clSetKernelArg( 'kernel', len(arg_cfg) + 1, local_mem_size, NULL() ) ) defn.extend(setargs) enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [ SymbolRef('queue'), SymbolRef('kernel'), Constant(self.kernel.dim), NULL(), SymbolRef('global'), SymbolRef('local'), Constant(0), NULL(), NULL() ]) defn.extend(check_ocl_error(enqueue_call, "clEnqueueNDRangeKernel")) params = [ SymbolRef('queue', cl.cl_command_queue()), SymbolRef('kernel', cl.cl_kernel()) ] if self.is_copied: for dim, boundary_kernel in enumerate(self.boundary_kernels): defn.extend([ ArrayDef( SymbolRef(global_for_dim_name(dim), ct.c_ulong()), arg_cfg[0].ndim, [Constant(d) for d in self.boundary_handlers[dim].global_size] ), ArrayDef( SymbolRef(local_for_dim_name(dim), ct.c_ulong()), arg_cfg[0].ndim, [Constant(s) for s in self.boundary_handlers[dim].local_size] ) ]) setargs = [clSetKernelArg( SymbolRef(kernel_dim_name(dim)), Constant(d), FunctionCall(SymbolRef('sizeof'), [SymbolRef('cl_mem')]), Ref(SymbolRef('buf%d' % d)) ) for d in range(len(arg_cfg) + 1)] setargs.append( clSetKernelArg( SymbolRef(kernel_dim_name(dim)), len(arg_cfg) + 1, local_mem_size, NULL() ) ) defn.extend(setargs) enqueue_call = FunctionCall( SymbolRef('clEnqueueNDRangeKernel'), [ SymbolRef('queue'), SymbolRef(kernel_dim_name(dim)), Constant(self.kernel.dim), NULL(), SymbolRef(global_for_dim_name(dim)), SymbolRef(local_for_dim_name(dim)), Constant(0), NULL(), NULL() ] ) defn.append(enqueue_call) params.extend([ SymbolRef(kernel_dim_name(dim), cl.cl_kernel()) ]) # finish_call = FunctionCall(SymbolRef('clFinish'), # [SymbolRef('queue')]) # defn.append(finish_call) # finish_call = [ # Assign( # SymbolRef("error_code", ct.c_int()), # FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]) # ), # If( # NotEq(SymbolRef("error_code"), Constant(0)), # FunctionCall( # SymbolRef("printf"), # [ # String("OPENCL KERNEL RETURNED ERROR CODE %d"), # SymbolRef("error_code") # ] # ) # ) # ] finish_call = check_ocl_error( FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]), "clFinish" ) defn.extend(finish_call) defn.append(Return(SymbolRef("error_code"))) params.extend(SymbolRef('buf%d' % d, cl.cl_mem()) for d in range(len(arg_cfg) + 1)) control = FunctionDecl(ct.c_int32(), "stencil_control", params=params, defn=defn) return control
def visit_FunctionDecl(self, node): """ :param node: :type node: FunctionDef """ if node.kernel is True: return node for index, arg in enumerate(node.params): self.arg_cfg_dict[arg.name] = self.arg_cfg[index] if hasattr(self.arg_cfg[index], 'ndpointer'): arg.type = self.arg_cfg[index].ndpointer() else: arg.type = self.arg_cfg[index].ctype() self.params = node.params node.defn = list(filter(None, map(self.visit, node.defn))) params = [ SymbolRef('queue', cl.cl_command_queue()), SymbolRef('kernel', cl.cl_kernel()) ] params.extend(SymbolRef('buf%d' % d, cl.cl_mem()) for d in range(len(self.arg_cfg))) local_size = 4 defn = [ ArrayDef( SymbolRef('global', ct.c_ulong()), Constant(self.ndim), [Constant(d) for d in self.shape] ), ArrayDef( SymbolRef('local', ct.c_ulong()), Constant(self.ndim), [Constant(local_size) for _ in self.shape] ) ] setargs = [clSetKernelArg( SymbolRef('kernel'), Constant(index), Constant(ct.sizeof(arg.ctype)), Ref(SymbolRef('buf%d' % index)) ) for index, arg in enumerate(self.arg_cfg)] defn.extend(setargs) enqueue_call = FunctionCall(SymbolRef('clEnqueueNDRangeKernel'), [ SymbolRef('queue'), SymbolRef('kernel'), Constant(self.ndim), NULL(), SymbolRef('global'), SymbolRef('local'), Constant(0), NULL(), NULL() ]) finish_call = FunctionCall(SymbolRef('clFinish'), [SymbolRef('queue')]) defn.extend([enqueue_call, finish_call]) header = StringTemplate(""" #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif """) node.params = params node.defn = defn self.fusable_nodes.append( KernelCall(node, self.project.files[-1].body[0], self.shape, defn[0], tuple(local_size for _ in self.shape), defn[1], enqueue_call, finish_call, setargs) ) return [header, node]