def shadow_kernel(self, *args):
    """
    Stand-in for the kernel method defined in a sub-class of StencilKernel.

    In pure python mode the kernel is executed directly in python.
    Otherwise the shapes of the arguments are compared against the shapes
    the cached specialized function was built for.  On a match the cached
    function is reused; on a miss (or when nothing is cached yet) a new
    SpecializedStencil is created from the arguments and stored before it
    is called.

    :param args: The arguments to our original kernel method.  The first
        one is used as the template for the output grid.
    :return: The output grid, allocated with ``np.zeros_like(args[0])``
        and handed to the kernel as its last data argument.
    """
    result = np.zeros_like(args[0])

    if self.pure_python:
        python_args = args + (result, )
        self.pure_python_kernel(*python_args)
        return result

    # Rebuild when nothing has been specialized yet or the shapes changed.
    stale = (not self.specialized_sizes or
             self.specialized_sizes != [item.shape for item in args])
    if stale:
        self.specialized = SpecializedStencil(
            self.model, args, result, self, self.testing
        )
        self.specialized_sizes = [item.shape for item in args]

    # The specialized function receives the output grid and a by-reference
    # float after the user arguments; that float is passed on to report().
    elapsed = c_float()
    call_args = args + (result, byref(elapsed))
    self.specialized(*call_args)
    self.specialized.report(time=elapsed)
    return result
def get_ir_nodes(self, args):
    """
    Build the IR for this kernel, specialized to ``args``.

    A deep copy of ``self.original_tree`` is run through
    PythonToStencilModel and then through ``self.backend``.  The first
    entry of the resulting OclFile body supplies the loop body and the
    parameter list; each parameter is given an ndpointer type derived
    from the matching sub-config argument (or the output array).

    :param args: The call arguments; ``args[0]`` is the template for the
        output array.
    :return: A single-element list containing the generated Loop node.
    """
    tree = copy.deepcopy(self.original_tree)
    arg_cfg = self.args_to_subconfig(args)
    output = np.zeros_like(args[0])
    shape = output.shape

    # One ndpointer type per sub-config argument, plus one for the output.
    param_types = []
    for item in arg_cfg + (output, ):
        param_types.append(
            np.ctypeslib.ndpointer(item.dtype, item.ndim, item.shape)
        )

    # Both transformers are constructed before either one visits the tree.
    transformers = [
        PythonToStencilModel(),
        self.backend(self.args, output, self.kernel,
                     arg_cfg=arg_cfg, fusable_nodes=None),
    ]
    for transformer in transformers:
        tree = transformer.visit(tree)

    ocl_file = tree.find(OclFile)
    entry = ocl_file.body[0]
    loop_body = entry.defn
    params = entry.params

    # NOTE(review): writes the first generated file to stdout on every
    # call - looks like leftover debugging output; confirm before removing.
    print(tree.files[0])

    for position, make_type in enumerate(param_types):
        params[position].type = make_type()

    # The last two params are passed separately from the leading ones.
    leading = params[:-2]
    second_to_last = [params[-2]]
    last = [params[-1]]
    return [Loop(shape, leading, second_to_last, param_types, loop_body,
                 last)]
def __call__(self, *args):
    """
    Run the compiled OpenCL stencil kernel(s) on ``args``.

    An output array is allocated to match ``args[0]`` (``empty_like`` for
    an hmarray, ``np.zeros_like`` otherwise).  hmarray arguments supply
    their existing ``ocl_buf``; every other argument is copied to a
    device buffer with a blocking ``buffer_from_ndarray`` call.  The C
    entry point is then invoked with the queue, the kernel(s) and all
    buffers.

    ``self.kernel`` may be a single kernel or a list of kernels; a list
    of any length is unpacked into the call, so kernel counts other than
    2, 3 or 4 are no longer skipped silently.

    :param *args: The input arrays for the stencil.
    :return: The output hmarray, or the ndarray read back from the last
        device buffer when the output is not an hmarray.
    :raises StencilException: If the C entry point returns an OpenCL
        status other than ``CL_SUCCESS``.
    """
    if isinstance(args[0], hmarray):
        output = empty_like(args[0])
    else:
        output = np.zeros_like(args[0])

    buffers = []
    events = []
    for arg in args + (output, ):
        if isinstance(arg, hmarray):
            buffers.append(arg.ocl_buf)
        else:
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
            events.append(evt)
            buffers.append(buf)

    # When every argument is an hmarray no transfer was issued; OpenCL
    # rejects a wait on an empty event list, so only wait if needed.
    if events:
        cl.clWaitForEvents(*events)

    if isinstance(self.kernel, list):
        kernels = list(self.kernel)
    else:
        kernels = [self.kernel]
    call_args = [self.queue] + kernels + buffers
    cl_error = self._c_function(*call_args)

    if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
        raise StencilException(
            "Error executing stencil kernel: opencl {} {}".format(
                cl_error, cl.cl_errnum(cl_error)
            )
        )

    if isinstance(output, hmarray):
        return output

    # The output buffer is always the last one appended above.
    buf, evt = buffer_to_ndarray(self.queue, buffers[-1], output)
    evt.wait()
    return buf
def python_kernel_wrapper(self, *args):
    """
    Allocate an output buffer shaped like the first argument, run the
    kernel into it and return it.

    When ``self.is_copied`` is set, every point yielded by
    ``self.halo_points`` is copied from the input grid into the output
    after the kernel has run.

    :param args: Kernel arguments; ``args[0]`` is the input grid.
    :return: The output buffer.
    """
    source = args[0]
    result = np.zeros_like(source)

    kernel_args = args + (result,)
    self.kernel(*kernel_args)

    if not self.is_copied:
        return result

    for halo_point in self.halo_points(source):
        result[halo_point] = source[halo_point]
    return result
def __call__(self, *args):
    """__call__

    Invoke the compiled C function on ``args``.  The output array and a
    by-reference float are appended after the caller's arguments.

    :param *args: Arguments to be passed to our C function, the types
                  should match the types specified by the `entry_type`
                  that was passed to :attr: `finalize`.
    :return: The output array: the pre-set ``self.output`` when there is
             one (it is cleared after use), otherwise a new zero-filled
             array shaped like ``args[0]``.
    """
    # TODO: provide stronger type checking to give users better error
    # messages.
    elapsed = c_float()
    pending = self.output
    if pending is not None:
        # Consume the pre-set output so it is used for one call only.
        result = pending
        self.output = None
    else:  # pragma no cover
        result = np.zeros_like(args[0])
    call_args = args + (result, byref(elapsed))
    self._c_function(*call_args)
    return result
def __call__(self, *args):
    """__call__

    Call the compiled C function with ``args`` followed by the output
    array and a by-reference float.

    :param *args: Arguments to be passed to our C function, the types
                  should match the types specified by the `entry_type`
                  that was passed to :attr: `finalize`.
    :return: The output array.  A pre-set ``self.output`` is used (and
             then reset to None) when available; otherwise a zero-filled
             array shaped like ``args[0]`` is allocated.
    """
    # TODO: provide stronger type checking to give users better error
    # messages.
    duration = c_float()

    output = self.output
    if output is None:
        output = np.zeros_like(args[0])
    else:
        # The pre-set output is only valid for a single call.
        self.output = None

    full_args = args + (output, byref(duration))
    self._c_function(*full_args)
    return output
def __call__(self, *args):
    """__call__

    Run the OpenCL kernel on ``args`` and return the result as an
    ndarray.

    Every argument plus the output array is copied to a device buffer and
    bound to the kernel; one extra local-memory argument is bound after
    them.  The kernel is enqueued over ``self.global_size`` and the
    output buffer is read back once it has finished.

    :param *args: The input arrays; ``args[0]`` determines the output
                  shape and the number of dimensions of the launch.
    :return: The array produced by reading the output buffer back.
    """
    pending = self.output
    if pending is not None:
        output = pending
        self.output = None
    else:
        output = np.zeros_like(args[0])

    # One cl_mem per array argument (inputs + output), then the local
    # memory argument.
    self.kernel.argtypes = (cl_mem, ) * (len(args) + 1) + (localmem, )

    bufs = []
    events = []
    for position, array in enumerate(args + (output, )):
        device_buf, copy_evt = buffer_from_ndarray(self.queue, array,
                                                   blocking=False)
        events.append(copy_evt)
        bufs.append(device_buf)
        self.kernel.setarg(position, device_buf, sizeof(cl_mem))
    # The copies above are non-blocking; wait for all of them here.
    cl.clWaitForEvents(*events)

    # Work-group edge length: 8 on GPU devices, 1 on anything else.
    on_gpu = self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU
    local = 8 if on_gpu else 1

    # sizeof(c_float) times (local + 2 * ghost_depth) per dimension.
    ndim = args[0].ndim
    localmem_size = sizeof(c_float)
    for _ in range(ndim):
        localmem_size = localmem_size * (local + (self.ghost_depth * 2))
    self.kernel.setarg(
        len(args) + 1, localmem(localmem_size), localmem_size)

    run_evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                     self.global_size, (local, ) * ndim)
    run_evt.wait()

    host_result, read_evt = buffer_to_ndarray(self.queue, bufs[-1], output)
    read_evt.wait()

    # NOTE(review): this only unbinds the loop variable; ``bufs`` still
    # references every buffer until this function returns.
    for mem in bufs:
        del mem
    return host_result
def pure_python(self, *args):
    """
    Run ``self.kernel`` in python on ``args``.

    :param args: Kernel arguments; ``args[0]`` is the template for the
        output array.
    :return: The zero-initialized output array after the kernel has been
        called with it as the last argument.
    """
    result = np.zeros_like(args[0])
    kernel_args = args + (result, )
    self.kernel(*kernel_args)
    return result
def generate_output(self, args):
    """
    Return the output array, allocating it on first use.

    :param args: Sequence of kernel arguments; ``args[0]`` is the
        template used when a new array has to be allocated.
    :return: ``self.output``.  When it was None it is first set to a
        zero-filled array shaped like ``args[0]``.
    """
    if self.output is None:
        self.output = np.zeros_like(args[0])
    return self.output