def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
    """Run the reduction over ``a`` and return the result array.

    Args:
        a (cupy.ndarray): Input array to reduce.
        axis: Axis or axes along which to reduce. ``None`` reduces over
            all axes (resolved by ``_get_axis``).
        dtype: Requested dtype used by ``_guess_routine`` to select the
            kernel routine; ``None`` lets the input dtype decide.
        out (cupy.ndarray): Optional preallocated output array.
        keepdims (bool): If ``True``, reduced axes are retained with
            size one in the output shape.

    Returns:
        cupy.ndarray: The reduction result (a tuple if the operation
        declares multiple outputs).

    Raises:
        TypeError: If ``a`` is not a :class:`cupy.ndarray`.
        ValueError: If ``a`` is empty and the reduction has no identity.
    """
    if not isinstance(a, cupy.ndarray):
        raise TypeError('Input type must be cupy.ndarray')
    if self.identity is None and a.size == 0:
        # Previously an ``assert``, which is silently stripped under
        # ``python -O``; an empty reduction with no identity element has
        # no defined result, so fail loudly instead.
        raise ValueError(
            'zero-size array to reduction operation %s '
            'which has no identity' % self.name)

    in_args = [a]
    out_args = [] if out is None else [out]
    internal.check_args_device(in_args + out_args)

    in_types, out_types, routine = self._guess_routine(in_args, dtype)

    axis = _get_axis(axis, a.ndim)
    out_shape = _get_out_shape(a.shape, axis, keepdims)
    out_args = elementwise._get_out_args(
        in_args, out_args, out_types, out_shape)
    in_args, in_shape = _get_trans_args(in_args, axis, in_args[0].shape)

    in_indexer = cindexer.Indexer(in_shape)
    out_indexer = cindexer.Indexer(out_shape)
    # Output size rounded up to the next power of two; used below to
    # decide how much shared memory the launch requests.
    out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

    inout_args, is_ndarray = _get_inout_args(
        in_args, out_args, in_indexer, out_indexer, out_clp2_size,
        self._params, True)
    param_types = elementwise._get_kernel_param_types(inout_args)
    params = elementwise._get_kernel_params(
        self._params, is_ndarray, param_types)

    block_size = 512
    reduce_type = routine[3]
    if reduce_type is None:
        # Fall back to the output C type when the routine does not name
        # an explicit accumulator type.
        reduce_type = elementwise._get_typename(out_types[0])

    type_preamble = (
        'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
            elementwise._get_typename(in_args[0].dtype),
            elementwise._get_typename(out_args[0].dtype)))

    kern = _make_reduction_function_kernel(
        self.name, block_size, reduce_type, params, self.identity,
        routine[0], routine[1], routine[2], type_preamble,
        self._input_expr, self._output_expr, self._preamble)
    shared_mem = 32 * block_size
    if out_clp2_size > 256:
        shared_mem = 0  # TODO(okuta) set actual size
    kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                       shared_mem=shared_mem,
                       block_max_size=block_size)

    if len(out_args) == 1:
        return out_args[0]
    return tuple(out_args)
def _get_reduction_kernel(params, is_ndarray, param_types, types):
    """Assemble the C source fragments for a reduction kernel.

    Args:
        params: Sequence of parameter descriptors (each with ``ctype``,
            ``name``, ``const`` and ``raw`` attributes).
        is_ndarray: Flags, parallel to ``params``, marking which
            arguments are ndarrays.
        param_types: Concrete types of the kernel arguments.
        types: ``(alias, dtype)`` pairs to emit as C ``typedef`` lines.

    Returns:
        tuple: ``(kernel_params, type_preamble, input_expr,
        output_expr)`` source snippets.
    """
    kernel_params = elementwise._get_kernel_params(
        params, is_ndarray, param_types)
    type_preamble = '\n'.join(
        'typedef {} {};'.format(elementwise._get_typename(dtype), alias)
        for alias, dtype in types)
    # Const, non-raw ndarray params are loaded from their raw buffers.
    input_expr = '\n'.join(
        'const {0} {1} = _raw_{1}[_j];'.format(desc.ctype, desc.name)
        for flag, desc in six.moves.zip(is_ndarray, params)
        if flag and desc.const and not desc.raw)
    # Mutable, non-raw ndarray params become references into the output.
    output_expr = '\n'.join(
        '{0} &{1} = _raw_{1}[_i];'.format(desc.ctype, desc.name)
        for flag, desc in six.moves.zip(is_ndarray, params)
        if flag and not desc.const and not desc.raw)
    return kernel_params, type_preamble, input_expr, output_expr
def _get_reduction_kernel(params, is_ndarray, param_types, types):
    """Build the parameter string and C snippets for a reduction kernel.

    Args:
        params: Parameter descriptors exposing ``ctype``, ``name``,
            ``const`` and ``raw``.
        is_ndarray: Per-parameter flags marking ndarray arguments.
        param_types: Concrete kernel argument types.
        types: Iterable of ``(alias, dtype)`` pairs rendered as
            ``typedef`` lines.

    Returns:
        tuple: ``(kernel_params, type_preamble, input_expr,
        output_expr)``.
    """
    kernel_params = elementwise._get_kernel_params(
        params, is_ndarray, param_types)

    typedefs = []
    for alias, dtype in types:
        typedefs.append(
            'typedef {} {};'.format(elementwise._get_typename(dtype), alias))
    type_preamble = '\n'.join(typedefs)

    # Split the non-raw ndarray parameters into kernel-side loads
    # (const inputs) and reference bindings (mutable outputs).
    input_lines = []
    output_lines = []
    for flag, p in six.moves.zip(is_ndarray, params):
        if not flag or p.raw:
            continue
        if p.const:
            input_lines.append(
                'const {0} {1} = _raw_{1}[_j];'.format(p.ctype, p.name))
        else:
            output_lines.append(
                '{0} &{1} = _raw_{1}[_i];'.format(p.ctype, p.name))

    return (kernel_params, type_preamble,
            '\n'.join(input_lines), '\n'.join(output_lines))
def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
    """Apply the reduction to ``a``.

    Args:
        a (cupy.ndarray): Array to reduce.
        axis: Axis or axes to reduce over; ``None`` means all axes.
        dtype: Hint passed to ``_guess_routine`` for routine selection.
        out (cupy.ndarray): Optional output array to write into.
        keepdims (bool): Keep reduced dimensions with extent one.

    Returns:
        cupy.ndarray: The result (a tuple when the reduction declares
        multiple outputs).

    Raises:
        TypeError: If ``a`` is not a :class:`cupy.ndarray`.
        ValueError: If ``a`` is empty and the reduction has no identity.
    """
    if not isinstance(a, cupy.ndarray):
        raise TypeError('Input type must be cupy.ndarray')
    if self.identity is None and a.size == 0:
        # Replaces an ``assert`` (dropped under ``python -O``): reducing
        # an empty array is only meaningful when an identity exists.
        raise ValueError(
            'zero-size array to reduction operation %s '
            'which has no identity' % self.name)

    in_args = [a]
    out_args = [] if out is None else [out]
    internal.check_args_device(in_args + out_args)

    in_types, out_types, routine = self._guess_routine(in_args, dtype)

    axis = _get_axis(axis, a.ndim)
    out_shape = _get_out_shape(a.shape, axis, keepdims)
    out_args = elementwise._get_out_args(
        in_args, out_args, out_types, out_shape)
    in_args, in_shape = _get_trans_args(in_args, axis, in_args[0].shape)

    in_indexer = cindexer.Indexer(in_shape)
    out_indexer = cindexer.Indexer(out_shape)
    # Next power of two >= output size; drives the shared-memory choice
    # at launch time below.
    out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

    inout_args, is_ndarray = _get_inout_args(
        in_args, out_args, in_indexer, out_indexer, out_clp2_size,
        self._params, True)
    param_types = elementwise._get_kernel_param_types(inout_args)
    params = elementwise._get_kernel_params(
        self._params, is_ndarray, param_types)

    block_size = 512
    reduce_type = routine[3]
    if reduce_type is None:
        # No explicit accumulator type in the routine: use the output's.
        reduce_type = elementwise._get_typename(out_types[0])

    type_preamble = (
        'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
            elementwise._get_typename(in_args[0].dtype),
            elementwise._get_typename(out_args[0].dtype)))

    kern = _make_reduction_function_kernel(
        self.name, block_size, reduce_type, params, self.identity,
        routine[0], routine[1], routine[2], type_preamble,
        self._input_expr, self._output_expr, self._preamble)
    shared_mem = 32 * block_size
    if out_clp2_size > 256:
        shared_mem = 0  # TODO(okuta) set actual size
    kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                       shared_mem=shared_mem,
                       block_max_size=block_size)

    if len(out_args) == 1:
        return out_args[0]
    return tuple(out_args)