def __call__(self, *args, **kwargs):
    """Run the two-stage reduction over *args*.

    Keyword arguments:
        kernel_wrapper: optional callable applied to both stage kernels
            before launch (e.g. instrumentation).
        stream: optional stream to launch on (passed through to the
            kernel call; may be None).

    Returns a GPU array holding the reduction result; once the
    reduction collapses to a single block the result is scalar-shaped.
    """
    MAX_BLOCK_COUNT = 1024
    SMALL_SEQ_COUNT = 4

    stage1 = self.stage1_func
    stage2 = self.stage2_func

    wrapper = kwargs.get("kernel_wrapper")
    if wrapper is not None:
        stage1 = wrapper(stage1)
        stage2 = wrapper(stage2)

    stream = kwargs.get("stream")

    from gpuarray import empty

    func = stage1
    func_arg_types = self.stage1_arg_types

    while True:
        launch_args = []
        gpu_vectors = []
        for value, tp in zip(args, func_arg_types):
            if tp == "P":
                # Pointer argument: remember the array itself and hand
                # its device pointer to the kernel.
                gpu_vectors.append(value)
                launch_args.append(value.gpudata)
            else:
                launch_args.append(value)

        # The first array argument defines the problem size.
        representative = gpu_vectors[0]
        size = representative.size

        if size <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
            # Small input: a few sequential items per thread, as many
            # blocks as needed (ceiling division).
            total_block_size = SMALL_SEQ_COUNT * self.block_size
            block_count = (size + total_block_size - 1) // total_block_size
            seq_count = SMALL_SEQ_COUNT
        else:
            # Large input: cap the block count and raise the per-thread
            # sequential workload instead.
            block_count = MAX_BLOCK_COUNT
            macroblock_size = block_count * self.block_size
            seq_count = (size + macroblock_size - 1) // macroblock_size

        if block_count == 1:
            result = empty((), self.dtype_out, representative.allocator)
        else:
            result = empty((block_count,), self.dtype_out,
                           representative.allocator)

        func((block_count, 1), stream,
             *([result.gpudata] + launch_args + [seq_count, size]))

        if block_count == 1:
            return result

        # Several partial sums remain: feed them through stage 2.
        func = stage2
        func_arg_types = self.stage2_arg_types
        args = [result]
def __call__(self, *args, **kwargs):
    """Execute the reduction as repeated kernel launches.

    Stage 1 produces one partial result per block; while more than one
    block was used, the partials are reduced again with the stage-2
    kernel until a single (scalar-shaped) result remains.

    Recognized keyword arguments: ``kernel_wrapper`` (wraps both stage
    kernels before launch) and ``stream`` (launch stream, may be None).
    """
    MAX_BLOCK_COUNT = 1024
    SMALL_SEQ_COUNT = 4

    first_kernel, second_kernel = self.stage1_func, self.stage2_func

    kernel_wrapper = kwargs.get("kernel_wrapper")
    if kernel_wrapper is not None:
        first_kernel = kernel_wrapper(first_kernel)
        second_kernel = kernel_wrapper(second_kernel)

    stream = kwargs.get("stream")

    from gpuarray import empty

    kernel, types = first_kernel, self.stage1_arg_types

    while True:
        call_args, array_args = [], []
        for actual, code in zip(args, types):
            if code == "P":
                # "P" marks a device array: pass its raw pointer.
                array_args.append(actual)
                call_args.append(actual.gpudata)
            else:
                call_args.append(actual)

        # Problem size comes from the first device-array argument.
        n = array_args[0].size

        if n <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
            per_block = SMALL_SEQ_COUNT * self.block_size
            block_count = -(-n // per_block)  # ceiling division
            seq_count = SMALL_SEQ_COUNT
        else:
            block_count = MAX_BLOCK_COUNT
            macroblock = block_count * self.block_size
            seq_count = -(-n // macroblock)  # ceiling division

        shape = () if block_count == 1 else (block_count,)
        result = empty(shape, self.dtype_out, array_args[0].allocator)

        kernel((block_count, 1), stream,
               *([result.gpudata] + call_args + [seq_count, n]))

        if block_count == 1:
            return result

        # Reduce the per-block partials next time around.
        kernel, types = second_kernel, self.stage2_arg_types
        args = [result]
def elemwise2(a, op, b, ary, odtype=None, oper=None,
              op_tmpl="res[i] = (%(out_t)s)%(a)s %(op)s (%(out_t)s)%(b)s",
              broadcast=False):
    """Apply the binary operator *op* elementwise to *a* and *b*.

    *ary* supplies the GPU context (and, when no shape extension is
    needed, the template for the output array).  If *odtype* is None the
    common dtype of the operands is used.  A custom kernel expression
    may be given via *oper*; otherwise it is built from *op_tmpl*.

    Returns a new GPU array holding the result.
    """
    # Only when both operands are device arrays do we broadcast their
    # shapes ourselves; host operands are converted to numpy first.
    both_on_device = True
    if not isinstance(a, gpuarray.GpuArray):
        a = numpy.asarray(a)
        both_on_device = False
    if not isinstance(b, gpuarray.GpuArray):
        b = numpy.asarray(b)
        both_on_device = False

    if odtype is None:
        odtype = get_common_dtype(a, b, True)

    a_arg = as_argument(a, 'a')
    b_arg = as_argument(b, 'b')
    args = [ArrayArg(odtype, 'res'), a_arg, b_arg]

    if both_on_device:
        # Left-pad the lower-rank operand with length-1 axes, then take
        # the elementwise maximum of the shapes as the output shape.
        if a.ndim != b.ndim:
            nd = max(a.ndim, b.ndim)
            if a.ndim < nd:
                a = a.reshape((1,) * (nd - a.ndim) + a.shape)
            if b.ndim < nd:
                b = b.reshape((1,) * (nd - b.ndim) + b.shape)
        out_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape))
        res = gpuarray.empty(out_shape, dtype=odtype, context=ary.context,
                             cls=ary.__class__)
    else:
        res = ary._empty_like_me(dtype=odtype)

    if oper is None:
        oper = op_tmpl % {'a': a_arg.expr(),
                          'op': op,
                          'b': b_arg.expr(),
                          'out_t': dtype_to_ctype(odtype)}

    kernel = ElemwiseKernel(ary.context, args, oper)
    kernel(res, a, b, broadcast=broadcast)
    return res
def __call__(self, *args, **kwargs):
    """Perform the reduction over the axes selected by ``self.redux``.

    Positional arguments are the kernel operands (GPU arrays and
    scalars); the optional ``out`` keyword supplies a pre-allocated
    output array, which must match the expected shape and dtype.

    Returns the output GPU array.

    Raises:
        TypeError: on unexpected keyword arguments or a mismatched
            ``out`` array.
        ValueError: when the non-reduced extent exceeds the context's
            maximum grid size.
    """
    _, nd, dims, strs, offsets, contig = check_args(args, collapse=False,
                                                    broadcast=False)
    out = kwargs.pop('out', None)
    if kwargs:
        # BUG FIX: kwargs.keys()[0] raises on Python 3 (dict views are
        # not subscriptable); next(iter(...)) works on both 2 and 3.
        raise TypeError('Unexpected keyword argument: %s'
                        % next(iter(kwargs)))

    n = prod(dims)
    out_shape = tuple(d for i, d in enumerate(dims) if not self.redux[i])
    gs = prod(out_shape)
    if gs == 0:
        gs = 1
    # BUG FIX: use floor division.  Plain '/' yields a float on
    # Python 3, and a float element count would then be compared and
    # passed to the kernel.  gs divides n exactly (out_shape's dims are
    # a subset of dims), so the quotient is unchanged.
    n //= gs
    if gs > self.context.maxgsize:
        raise ValueError("Array too big to be reduced along the "
                         "selected axes")
    if out is None:
        out = gpuarray.empty(out_shape, context=self.context,
                             dtype=self.dtype_out)
    else:
        if out.shape != out_shape or out.dtype != self.dtype_out:
            raise TypeError(
                "Out array is not of expected type "
                "(expected %s %s, got %s %s)" % (out_shape, self.dtype_out,
                                                 out.shape, out.dtype))
    # Don't compile and cache for nothing for big sizes: clamp the
    # specialization size to init_local_size.
    if self.init_local_size < n:
        k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd)
    else:
        k, _, _, ls = self._get_basic_kernel(n, nd)

    kargs = [n, out]
    kargs.extend(dims)
    for i, arg in enumerate(args):
        kargs.append(arg)
        if isinstance(arg, gpuarray.GpuArray):
            # Device arrays also need their offset and strides.
            kargs.append(offsets[i])
            kargs.extend(strs[i])

    k(*kargs, ls=ls, gs=gs)
    return out
def __call__(self, *args, **kwargs):
    """Run the axis reduction defined by ``self.redux`` on *args*.

    Accepts one optional keyword, ``out``: a pre-allocated result array
    whose shape and dtype must match the reduction's output.

    Returns the output GPU array.

    Raises:
        TypeError: for unknown keyword arguments or a mismatched
            ``out`` array.
        ValueError: when the output grid size exceeds the context's
            ``maxgsize``.
    """
    _, nd, dims, strs, offsets, contig = check_args(args, collapse=False,
                                                    broadcast=False)
    out = kwargs.pop('out', None)
    if kwargs:
        # BUG FIX: kwargs.keys()[0] fails on Python 3 where keys() is a
        # non-indexable view; next(iter(...)) is portable.
        raise TypeError('Unexpected keyword argument: %s'
                        % next(iter(kwargs)))

    n = prod(dims)
    out_shape = tuple(d for i, d in enumerate(dims) if not self.redux[i])
    gs = prod(out_shape)
    if gs == 0:
        gs = 1
    # BUG FIX: floor division.  True division ('/') turns n into a
    # float on Python 3 before it is handed to the kernel.  The result
    # is exact either way, since gs is a product of a subset of dims.
    n //= gs
    if gs > self.context.maxgsize:
        raise ValueError("Array too big to be reduced along the "
                         "selected axes")
    if out is None:
        out = gpuarray.empty(out_shape, context=self.context,
                             dtype=self.dtype_out)
    else:
        if out.shape != out_shape or out.dtype != self.dtype_out:
            raise TypeError("Out array is not of expected type "
                            "(expected %s %s, got %s %s)" % (
                                out_shape, self.dtype_out,
                                out.shape, out.dtype))
    # Don't compile and cache for nothing for big sizes: cap the kernel
    # specialization at init_local_size.
    if self.init_local_size < n:
        k, _, _, ls = self._get_basic_kernel(self.init_local_size, nd)
    else:
        k, _, _, ls = self._get_basic_kernel(n, nd)

    kargs = [n, out]
    kargs.extend(dims)
    for i, arg in enumerate(args):
        kargs.append(arg)
        if isinstance(arg, gpuarray.GpuArray):
            # GPU operands additionally pass their offset and strides.
            kargs.append(offsets[i])
            kargs.extend(strs[i])

    k(*kargs, ls=ls, gs=gs)
    return out
def elemwise2(a, op, b, ary, odtype=None, oper=None,
              op_tmpl="res[i] = (%(out_t)s)%(a)s %(op)s (%(out_t)s)%(b)s",
              broadcast=False):
    """Compute ``a op b`` elementwise on the GPU.

    *ary* provides the context (and the output template when one of the
    operands lives on the host).  *odtype* overrides the inferred output
    dtype; *oper* overrides the generated kernel expression.

    Returns a freshly allocated GPU array with the result.
    """
    extend_dims = True  # broadcast shapes only for two device arrays
    if not isinstance(a, gpuarray.GpuArray):
        a = numpy.asarray(a)
        extend_dims = False
    if not isinstance(b, gpuarray.GpuArray):
        b = numpy.asarray(b)
        extend_dims = False

    if odtype is None:
        odtype = get_common_dtype(a, b, True)

    a_arg = as_argument(a, 'a')
    b_arg = as_argument(b, 'b')
    args = [ArrayArg(odtype, 'res'), a_arg, b_arg]

    if extend_dims:
        nd = max(a.ndim, b.ndim)
        # Prepend length-1 axes so both operands have equal rank...
        if a.ndim < nd:
            a = a.reshape((1,) * (nd - a.ndim) + a.shape)
        if b.ndim < nd:
            b = b.reshape((1,) * (nd - b.ndim) + b.shape)
        # ...then broadcast: each output axis takes the larger extent.
        out_shape = tuple(max(sa, sb)
                          for sa, sb in zip(a.shape, b.shape))
        res = gpuarray.empty(out_shape, dtype=odtype,
                             context=ary.context, cls=ary.__class__)
    else:
        res = ary._empty_like_me(dtype=odtype)

    if oper is None:
        oper = op_tmpl % {'a': a_arg.expr(), 'op': op,
                          'b': b_arg.expr(),
                          'out_t': dtype_to_ctype(odtype)}

    ElemwiseKernel(ary.context, args, oper)(res, a, b,
                                            broadcast=broadcast)
    return res
def __call__(self, *args, **kwargs):
    """Run the two-stage reduction, requiring contiguous array inputs.

    Keyword arguments:
        kernel_wrapper: optional callable wrapping both stage kernels.
        stream: optional launch stream (may be None).

    Stage 2 is re-invoked on the partial results (together with the
    original stage-1 arguments) until a single block suffices; the
    scalar-shaped result array is returned.

    Raises RuntimeError for non-contiguous array arguments.
    """
    MAX_BLOCK_COUNT = 1024
    SMALL_SEQ_COUNT = 4

    stage1 = self.stage1_func
    stage2 = self.stage2_func

    wrapper = kwargs.get("kernel_wrapper")
    if wrapper is not None:
        stage1 = wrapper(stage1)
        stage2 = wrapper(stage2)

    stream = kwargs.get("stream")

    from gpuarray import empty

    func = stage1
    func_arg_types = self.stage1_arg_types
    stage1_args = args  # kept so stage 2 can see the original operands

    while True:
        launch_args = []
        gpu_arrays = []
        for value, tp in zip(args, func_arg_types):
            if tp == "P":
                # Only C- or F-contiguous arrays can be reduced here.
                if not value.flags.forc:
                    raise RuntimeError(
                        "ReductionKernel cannot deal with "
                        "non-contiguous arrays")
                gpu_arrays.append(value)
                launch_args.append(value.gpudata)
            else:
                launch_args.append(value)

        # The first array argument fixes the problem size.
        representative = gpu_arrays[0]
        size = representative.size

        if size <= self.block_size * SMALL_SEQ_COUNT * MAX_BLOCK_COUNT:
            total_block_size = SMALL_SEQ_COUNT * self.block_size
            block_count = (size + total_block_size - 1) // total_block_size
            seq_count = SMALL_SEQ_COUNT
        else:
            block_count = MAX_BLOCK_COUNT
            macroblock_size = block_count * self.block_size
            seq_count = (size + macroblock_size - 1) // macroblock_size

        if block_count == 1:
            result = empty((), self.dtype_out, representative.allocator)
        else:
            result = empty((block_count,), self.dtype_out,
                           representative.allocator)

        # One shared-memory slot per thread for the in-block tree
        # reduction.
        launch_kwargs = dict(
            shared_size=self.block_size * self.dtype_out.itemsize)

        func((block_count, 1), (self.block_size, 1, 1), stream,
             *([result.gpudata] + launch_args + [seq_count, size]),
             **launch_kwargs)

        if block_count == 1:
            return result

        # Reduce the partial results; stage 2 also receives the
        # original arguments after the partials.
        func = stage2
        func_arg_types = self.stage2_arg_types
        args = (result,) + stage1_args