def __call__(self, input, u, v):
    """Launch the kernel on ``input``, ``u`` and ``v`` and return the result.

    The ``.data`` arrays of the three operands, followed by a zero-filled
    output array created with ``zeros_like(input.data)``, are handed to
    ``buffer_from_ndarray`` (non-blocking) and bound to kernel arguments
    0, 1, 2 and 3 respectively.  Once all transfer events have completed
    the kernel is enqueued over ``self.global_size`` and the output buffer
    is read back into the host array.

    :param input: operand whose ``.data`` is kernel argument 0; it is also
        the template for the output array.
    :param u: operand whose ``.data`` is kernel argument 1.
    :param v: operand whose ``.data`` is kernel argument 2.
    :returns: ``Array(unique_name(), output)`` wrapping the host output.
    """
    result = zeros_like(input.data)
    transfers = []

    def bind(slot, host_array):
        # Start the transfer for one host array and attach the resulting
        # buffer to the kernel at position `slot`.
        device_buf, transfer_evt = buffer_from_ndarray(
            self.queue, host_array, blocking=False)
        transfers.append(transfer_evt)
        self.kernel.setarg(slot, device_buf, sizeof(cl_mem))
        return device_buf

    # Keep a reference to every input buffer until the method returns so
    # none of them can be collected while the kernel still needs it.
    input_bufs = [bind(0, input.data), bind(1, u.data), bind(2, v.data)]
    result_buf = bind(3, result)

    clWaitForEvents(*transfers)
    launch_evt = clEnqueueNDRangeKernel(
        self.queue, self.kernel, self.global_size)
    launch_evt.wait()

    _, read_evt = buffer_to_ndarray(self.queue, result_buf, result)
    read_evt.wait()
    return Array(unique_name(), result)
def process_args(self, *args):
    """Turn call arguments into kernel arguments plus an output slot.

    Each ``np.ndarray`` is passed to ``cl.buffer_from_ndarray``
    (non-blocking); the resulting buffer replaces it in the argument list
    and a fresh ``empty_like_this()`` buffer becomes the current output.
    ``int`` values pass through unchanged.  A ``float`` passes through only
    while the output is still a ``ct.c_int``, in which case the output
    becomes a ``ct.c_float``.  If ``self.output`` is set, it overrides
    whatever output was derived from the arguments.

    :param *args: ndarrays, ints and floats to be passed to the kernel.
    :returns: ``(processed, output, out_like)`` where ``processed`` is the
        converted argument list with the output appended last, ``output``
        is the output object itself and ``out_like`` is the host array the
        output mirrors (``None`` if there is none).
    :raises NotImplementedError: for any argument that is not handled by
        the rules above.
    """
    converted = []
    pending = []
    output = ct.c_int()
    out_like = None

    for arg in args:
        if isinstance(arg, np.ndarray):
            device_buf, copy_evt = cl.buffer_from_ndarray(
                self.queue, arg, blocking=False)
            converted.append(device_buf)
            pending.append(copy_evt)
            output = device_buf.empty_like_this()
            out_like = arg
        elif isinstance(arg, int):
            converted.append(arg)
        elif isinstance(arg, float) and isinstance(output, ct.c_int):
            # The first float seen promotes a scalar int output to float.
            converted.append(arg)
            output = ct.c_float()
        else:
            raise NotImplementedError(
                "UnsupportedType: %s" % type(arg)
            )

    if self.output is not None:
        # A caller-supplied output array takes precedence.
        output, copy_evt = cl.buffer_from_ndarray(
            self.queue, self.output, blocking=False)
        out_like = self.output
        copy_evt.wait()

    # NOTE(review): if `ct` is the stdlib ctypes module, its instances have
    # no `byref` attribute (ctypes.byref is a module-level function) --
    # confirm what `ct` is before relying on the scalar path.
    converted.append(
        output if isinstance(output, cl.cl_mem) else output.byref)

    cl.clWaitForEvents(*pending)
    return converted, output, out_like
def __call__(self, *args):
    """Run the compiled stencil on ``args`` and return the output.

    The output is allocated to match ``args[0]``: ``empty_like`` when it is
    an ``hmarray``, ``np.zeros_like`` otherwise.  ``hmarray`` operands
    contribute their existing ``ocl_buf``; every other operand is passed
    through ``buffer_from_ndarray`` (blocking).  ``self._c_function`` is
    then invoked with the queue, the kernel (or every kernel in the list
    when ``self.kernel`` is a list) and all buffers, output last.

    :param *args: input arrays; ``args[0]`` determines the output's type
        and shape.
    :returns: the ``hmarray`` output, or the array produced by
        ``buffer_to_ndarray`` for the last buffer otherwise.
    :raises StencilException: if ``self._c_function`` returns anything
        other than ``CL_SUCCESS``.
    """
    if isinstance(args[0], hmarray):
        output = empty_like(args[0])
    else:
        output = np.zeros_like(args[0])

    buffers = []
    events = []
    for arg in args + (output, ):
        if isinstance(arg, hmarray):
            buffers.append(arg.ocl_buf)
        else:
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
            events.append(evt)
            buffers.append(buf)

    # When every operand is an hmarray no transfer was started; waiting on
    # an empty event list is invalid in OpenCL, so skip the wait.
    if events:
        cl.clWaitForEvents(*events)

    # Pass every kernel of a multi-kernel stencil, whatever the count.
    # Previously only lists of length 2-4 were launched; any other length
    # skipped the launch yet still reported success.
    if isinstance(self.kernel, list):
        kernel_args = self.kernel
    else:
        kernel_args = [self.kernel]
    cl_error = self._c_function(self.queue, *(kernel_args + buffers))

    if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
        raise StencilException(
            "Error executing stencil kernel: opencl {} {}".format(
                cl_error, cl.cl_errnum(cl_error)
            )
        )

    if isinstance(output, hmarray):
        return output
    buf, evt = buffer_to_ndarray(
        self.queue, buffers[-1], output
    )
    evt.wait()
    return buf
def process_inputs(self, *args):
    """Create a buffer for every argument and bind it to the kernel.

    The kernel's ``argtypes`` is set to one ``cl_mem`` per argument.  Each
    argument (unwrapped to its ``.data`` when it is a
    ``types.common.Array``) is passed to ``buffer_from_ndarray``
    (non-blocking) and the resulting buffer is set as the kernel argument
    at the same position.  The method returns after waiting on all the
    events those calls produced.

    :param *args: arrays, or ``types.common.Array`` wrappers around them.
    :returns: list of the created buffers, in argument order.
    """
    self.kernel.argtypes = (cl_mem,) * len(args)

    device_bufs = []
    pending = []
    for position, host in enumerate(args):
        if isinstance(host, types.common.Array):
            host = host.data
        device_buf, copy_evt = buffer_from_ndarray(
            self.queue, host, blocking=False)
        device_bufs.append(device_buf)
        pending.append(copy_evt)
        self.kernel.setarg(position, device_buf, sizeof(cl_mem))

    clWaitForEvents(*pending)
    return device_bufs
def fn(*args, **kwargs):
    """Bind inputs, build kernels on first use, launch them, return sinks.

    Positional arguments are stored in ``self.symbol_table`` under the
    names of ``filtered_sources``.  If ``kernels`` is still empty, it is
    populated from ``block``/``block_params``: an op for which
    ``self.is_not_device_level(op)`` is true is appended to the current
    ``Kernel``.  A new ``Kernel`` is started when there is none yet, the
    previous entry is not a ``Kernel``, the first launch parameter
    differs, or ``launch_params[1]`` is truthy.  Every other op gets its
    own launcher from ``self.get_launcher``.  All kernels are then
    compiled and recorded in ``self.kernels``.

    Each kernel is launched with the events of earlier launches that
    wrote any of its sources.  On an OpenCL backend the events of the
    last launch are waited on before returning.

    :param *args: values for ``filtered_sources``, in order.
    :param **kwargs: accepted but unused.
    :returns: the symbol-table value of the single filtered sink, or a
        tuple of values when the number of sinks is not exactly one.
    """
    for source, arg in zip(filtered_sources, args):
        self.symbol_table[source.name] = arg

    if len(kernels) == 0:
        for op, params in zip(block, block_params):
            _sinks, _sources = params
            if self.is_not_device_level(op):
                launch_params = self.get_launch_params(
                    op, _sources, _sinks)
                if len(kernels) == 0 or \
                        not isinstance(kernels[-1], Kernel) or \
                        kernels[-1].launch_parameters[0] != launch_params[0] \
                        or len(launch_params) > 1 and launch_params[1]:
                    kernels.append(Kernel(launch_params))
                kernels[-1].append_body(
                    self.get_emit(op, _sources, _sinks))
                # Only hmarray-backed symbols are tracked as kernel
                # sources/sinks.
                for source in _sources:
                    if isinstance(self.symbol_table[source.name], hmarray):
                        kernels[-1].sources.add(source)
                for sink in _sinks:
                    if isinstance(self.symbol_table[sink.name], hmarray):
                        kernels[-1].sinks.add(sink)
            else:
                kernels.append(self.get_launcher(op, _sources, _sinks))
        for kernel in kernels:
            kernel.compile()
            self.kernels.append(kernel)

    # Maps a sink name to the events returned by the launch that wrote it.
    kernel_map = {}
    # Initialised here so the wait below is well-defined even when there
    # are no kernels to launch.
    evts = []
    for kernel in kernels:
        evts = []
        for source in kernel.sources:
            if source.name in kernel_map:
                evts.extend(kernel_map[source.name])
        evts = kernel.launch(self.symbol_table, evts)
        for sink in kernel.sinks:
            kernel_map[sink.name] = evts

    # Waiting on an empty event list is invalid in OpenCL, so only wait
    # when the last launch actually produced events.
    if evts and backend in {"ocl", "opencl", "OCL"}:
        cl.clWaitForEvents(*evts)

    ret = tuple(self.symbol_table[sink.name] for sink in filtered_sinks)
    if len(ret) == 1:
        return ret[0]
    return ret
def fn(*args, **kwargs):
    """Bind inputs, build kernels on first use, launch them, return sinks.

    Positional arguments are stored in ``self.symbol_table`` under the
    names of ``filtered_sources``.  If ``kernels`` is still empty, it is
    populated from ``block``/``block_params``: an op for which
    ``self.is_not_device_level(op)`` is true is appended to the current
    ``Kernel``.  A new ``Kernel`` is started when there is none yet, the
    previous entry is not a ``Kernel``, the first launch parameter
    differs, or ``launch_params[1]`` is truthy.  Every other op gets its
    own launcher from ``self.get_launcher``.  All kernels are then
    compiled and recorded in ``self.kernels``.

    Each kernel is launched with the events of earlier launches that
    wrote any of its sources.  On an OpenCL backend the events of the
    last launch are waited on before returning.

    :param *args: values for ``filtered_sources``, in order.
    :param **kwargs: accepted but unused.
    :returns: the symbol-table value of the single filtered sink, or a
        tuple of values when the number of sinks is not exactly one.
    """
    for source, arg in zip(filtered_sources, args):
        self.symbol_table[source.name] = arg

    if len(kernels) == 0:
        for op, params in zip(block, block_params):
            _sinks, _sources = params
            if self.is_not_device_level(op):
                launch_params = self.get_launch_params(
                    op, _sources, _sinks)
                if len(kernels) == 0 or \
                        not isinstance(kernels[-1], Kernel) or \
                        kernels[-1].launch_parameters[0] != launch_params[0] \
                        or len(launch_params) > 1 and launch_params[1]:
                    kernels.append(Kernel(launch_params))
                kernels[-1].append_body(
                    self.get_emit(op, _sources, _sinks)
                )
                # Only hmarray-backed symbols are tracked as kernel
                # sources/sinks.
                for source in _sources:
                    if isinstance(self.symbol_table[source.name], hmarray):
                        kernels[-1].sources.add(source)
                for sink in _sinks:
                    if isinstance(self.symbol_table[sink.name], hmarray):
                        kernels[-1].sinks.add(sink)
            else:
                kernels.append(self.get_launcher(op, _sources, _sinks))
        for kernel in kernels:
            kernel.compile()
            self.kernels.append(kernel)

    # Maps a sink name to the events returned by the launch that wrote it.
    kernel_map = {}
    # Initialised here so the wait below is well-defined even when there
    # are no kernels to launch.
    evts = []
    for kernel in kernels:
        evts = []
        for source in kernel.sources:
            if source.name in kernel_map:
                evts.extend(kernel_map[source.name])
        evts = kernel.launch(self.symbol_table, evts)
        for sink in kernel.sinks:
            kernel_map[sink.name] = evts

    # Waiting on an empty event list is invalid in OpenCL, so only wait
    # when the last launch actually produced events.
    if evts and backend in {"ocl", "opencl", "OCL"}:
        cl.clWaitForEvents(*evts)

    ret = tuple(self.symbol_table[sink.name] for sink in filtered_sinks)
    if len(ret) == 1:
        return ret[0]
    return ret
def __call__(self, *args):
    """Run the stencil kernel on ``args`` using a local-memory scratch area.

    The output array is ``self.output`` when one has been set (it is
    consumed: ``self.output`` is reset to ``None``), otherwise a zero array
    shaped like ``args[0]``.  Every input and the output are passed to
    ``buffer_from_ndarray`` (non-blocking) and bound to the kernel in
    order; the final kernel argument is a ``localmem`` block.  The
    work-group edge is 8 on a GPU device and 1 otherwise, and the local
    memory size is ``sizeof(c_float)`` multiplied by
    ``local + 2 * self.ghost_depth`` once per dimension of ``args[0]``.

    :param *args: input arrays; ``args[0]`` supplies the dimensionality
        and, when ``self.output`` is unset, the output's shape.
    :returns: the array produced by ``buffer_to_ndarray`` for the output
        buffer.
    """
    if self.output is not None:
        output = self.output
        self.output = None
    else:
        output = np.zeros_like(args[0])

    operands = args + (output, )
    self.kernel.argtypes = tuple(cl_mem for _ in operands) + (localmem, )

    bufs = []
    events = []
    for index, arg in enumerate(operands):
        buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
        events.append(evt)
        bufs.append(buf)
        self.kernel.setarg(index, buf, sizeof(cl_mem))
    cl.clWaitForEvents(*events)

    if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU:
        local = 8
    else:
        local = 1

    ndim = args[0].ndim
    # One padded tile edge per dimension, scaled by the element size.
    localmem_size = reduce(
        operator.mul,
        (local + (self.ghost_depth * 2) for _ in range(ndim)),
        sizeof(c_float))
    # The local-memory block follows the inputs and the output.
    self.kernel.setarg(
        len(args) + 1, localmem(localmem_size), localmem_size)

    evt = clEnqueueNDRangeKernel(self.queue, self.kernel,
                                 self.global_size,
                                 tuple(local for _ in range(ndim)))
    evt.wait()

    buf, evt = buffer_to_ndarray(self.queue, bufs[-1], output)
    evt.wait()
    # The former `for mem in bufs: del mem` loop only unbound the loop
    # variable and released nothing; `bufs` goes out of scope on return.
    return buf