def _do_stage(self, method): from pycuda.gpuarray import splay import pycuda.driver as drv # Call the appropriate kernels for either initialize/stage computation. call_info = self.helper.calls[method] py_call_info = self.helper.py_calls['py_' + method] dtype = np.float64 if self._use_double else np.float32 extra_args = [ np.asarray(self.t, dtype=dtype), np.asarray(self.dt, dtype=dtype) ] # Call the py_{method} for each destination. for name, (py_meth, dest) in py_call_info.items(): py_meth(dest, *extra_args) # Call the stage* method for each destination. for name, (call, args, dest) in call_info.items(): n = dest.get_number_of_particles(real=True) gs, ls = splay(n) gs, ls = int(gs[0]), int(ls[0]) num_blocks = (n + ls - 1) // ls num_tpb = ls # Compute the remaining arguments. args = [x() for x in args[3:]] call(*(args + extra_args), block=(num_tpb, 1, 1), grid=(num_blocks, 1))
def _get_workgroup_size(self, global_size): if self.backend == 'opencl': gs, ls = splay_cl(self.queue, global_size, self._max_work_group_size) elif self.backend == 'cuda': from pycuda.gpuarray import splay gs, ls = splay(global_size) return gs, ls
def __call__(self, *args, **kwargs): vectors = [] range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) stream = kwargs.pop("stream", None) if kwargs: raise TypeError( "invalid keyword arguments specified: " + ", ".join(kwargs.keys()) ) invocation_args = [] mod, func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None ) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: raise RuntimeError( "elementwise kernel cannot " "deal with non-contiguous arrays" ) vectors.append(arg) invocation_args.append(arg.gpudata) else: invocation_args.append(arg) repr_vec = vectors[0] if slice_ is not None: if range_ is not None: raise TypeError( "may not specify both range and slice " "keyword arguments" ) range_ = slice(*slice_.indices(repr_vec.size)) if range_ is not None: invocation_args.append(range_.start) invocation_args.append(range_.stop) if range_.step is None: invocation_args.append(1) else: invocation_args.append(range_.step) from pycuda.gpuarray import splay grid, block = splay(abs(range_.stop - range_.start) // range_.step) else: block = repr_vec._block grid = repr_vec._grid invocation_args.append(repr_vec.mem_size) func.prepared_async_call(grid, block, stream, *invocation_args)
def _call_kernel(self, info, extra_args): from pycuda.gpuarray import splay import pycuda.driver as drv nnps = self.nnps call = info.get('method') args = list(info.get('args')) dest = info['dest'] n = dest.get_number_of_particles(info.get('real', True)) # args is actually [queue, None, None, actual_meaningful_args] # we do not need the first 3 args on CUDA. args = [x() for x in args[3:]] # Argument for NP_MAX extra_args[-1][...] = n - 1 gs, ls = splay(n) gs, ls = int(gs[0]), int(ls[0]) num_blocks = (n + ls - 1) // ls #num_blocks = int((gs + ls - 1) / ls) num_tpb = ls if info.get('loop'): if self._use_local_memory: # FIXME: Fix local memory for CUDA nnps.set_context(info['src_idx'], info['dst_idx']) nnps_args, gs_ls = self.nnps.get_kernel_args('float') args[1] = gs_ls[0] args[2] = gs_ls[1] # No need for the guard variable for the local memory code. args = args + extra_args[:-1] + nnps_args call(*args) else: # find block sizes nnps.set_context(info['src_idx'], info['dst_idx']) cache = nnps.current_cache cache.get_neighbors_gpu() args = args + [ cache._nbr_lengths_gpu.dev, cache._start_idx_gpu.dev, cache._neighbors_gpu.dev ] + extra_args event = drv.Event() call(*args, block=(num_tpb, 1, 1), grid=(num_blocks, 1)) event.record() event.synchronize() else: event = drv.Event() call(*(args + extra_args), block=(num_tpb, 1, 1), grid=(num_blocks, 1)) event.record() event.synchronize()
def __call__(self, *args, **kwargs): vectors = [] range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) stream = kwargs.pop("stream", None) if kwargs: raise TypeError("invalid keyword arguments specified: " + ", ".join(kwargs.iterkeys())) invocation_args = [] func, arguments = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None) for arg, arg_descr in zip(args, arguments): if isinstance(arg_descr, VectorArg): if not arg.flags.forc: raise RuntimeError("elementwise kernel cannot " "deal with non-contiguous arrays") vectors.append(arg) invocation_args.append(arg.gpudata) else: invocation_args.append(arg) repr_vec = vectors[0] if slice_ is not None: if range_ is not None: raise TypeError("may not specify both range and slice " "keyword arguments") range_ = slice(*slice_.indices(repr_vec.size)) if range_ is not None: invocation_args.append(range_.start) invocation_args.append(range_.stop) if range_.step is None: invocation_args.append(1) else: invocation_args.append(range_.step) from pycuda.gpuarray import splay grid, block = splay(abs(range_.stop - range_.start)//range_.step) else: block = repr_vec._block grid = repr_vec._grid invocation_args.append(repr_vec.mem_size) func.prepared_async_call(grid, block, stream, *invocation_args)
def __call__(self, *args, **kwargs): range_ = kwargs.pop("range", None) slice_ = kwargs.pop("slice", None) stream = kwargs.pop("stream", None) if kwargs: raise TypeError("invalid keyword arguments specified: " + ", ".join(elementwise.six.iterkeys(kwargs))) mod, func, arguments, call_info = self.generate_stride_kernel_and_types( range_ is not None or slice_ is not None, args) vectors, invocation_args = call_info repr_vec = vectors[0] if slice_ is not None: if range_ is not None: raise TypeError("may not specify both range and slice " "keyword arguments") range_ = slice(*slice_.indices(repr_vec.size)) if range_ is not None: invocation_args.append(range_.start) invocation_args.append(range_.stop) if range_.step is None: invocation_args.append(1) else: invocation_args.append(range_.step) from pycuda.gpuarray import splay grid, block = splay(abs(range_.stop - range_.start) // range_.step) else: block = repr_vec._block grid = repr_vec._grid invocation_args.append(repr_vec.mem_size) func.prepared_async_call(grid, block, stream, *invocation_args)
def axnpby(self, y, *xn): if any(y.traits != x.traits for x in xn): raise ValueError('Incompatible matrix types') nv, cnt = len(xn), y.leaddim * y.nrow # Render the kernel template src = self.backend.lookup.get_template('axnpby').render(n=nv) # Build kern = self._build_kernel('axnpby', src, [np.int32] + [np.intp, y.dtype] * (1 + nv)) # Compute a suitable block and grid grid, block = splay(cnt) class AxnpbyKernel(ComputeKernel): def run(self, queue, beta, *alphan): args = [i for axn in zip(xn, alphan) for i in axn] kern.prepared_async_call(grid, block, queue.cuda_stream_comp, cnt, y, beta, *args) return AxnpbyKernel()
def axnpby(self, y, *xn): if any(y.traits != x.traits for x in xn): raise ValueError('Incompatible matrix types') nv, cnt = len(xn), y.leaddim*y.nrow # Render the kernel template src = self.backend.lookup.get_template('axnpby').render(n=nv) # Build kern = self._build_kernel('axnpby', src, [np.int32] + [np.intp, y.dtype]*(1 + nv)) # Compute a suitable block and grid grid, block = splay(cnt) class AxnpbyKernel(ComputeKernel): def run(self, queue, beta, *alphan): args = [i for axn in zip(xn, alphan) for i in axn] kern.prepared_async_call(grid, block, queue.cuda_stream_comp, cnt, y, beta, *args) return AxnpbyKernel()