Example #1
0
    def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
        """Run the reduction over ``a`` and return the result.

        Args:
            a (cupy.ndarray): Array to reduce.
            axis: Axis or axes to reduce over; ``None`` reduces all axes.
            dtype: Data-type specifier passed to the routine guesser.
            out (cupy.ndarray): Optional preallocated output array.
            keepdims (bool): If ``True``, reduced axes are kept in the
                result as dimensions of size one.

        Returns:
            The single output array, or a tuple of output arrays when the
            kernel produces more than one.

        Raises:
            TypeError: If ``a`` is not a ``cupy.ndarray``.
            ValueError: If ``a`` is empty and the reduction has no identity.
        """
        if not isinstance(a, cupy.ndarray):
            raise TypeError('Input type must be cupy.ndarray')

        # A reduction with no identity element is undefined on an empty
        # array.  Raise an explicit error instead of using ``assert`` so
        # the check also runs under ``python -O`` and callers get a
        # catchable exception type.
        if self.identity is None and a.size == 0:
            raise ValueError('zero-size array to reduction operation '
                             '{} which has no identity'.format(self.name))
        in_args = [a]
        if out is None:
            out_args = []
        else:
            out_args = [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        axis = _get_axis(axis, a.ndim)
        out_shape = _get_out_shape(a.shape, axis, keepdims)
        out_args = elementwise._get_out_args(in_args, out_args, out_types,
                                             out_shape)
        in_args, in_shape = _get_trans_args(in_args, axis, in_args[0].shape)

        in_indexer = cindexer.Indexer(in_shape)
        out_indexer = cindexer.Indexer(out_shape)
        # Smallest power of two >= out_indexer.size.
        out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

        inout_args, is_ndarray = _get_inout_args(in_args, out_args, in_indexer,
                                                 out_indexer, out_clp2_size,
                                                 self._params, True)
        param_types = elementwise._get_kernel_param_types(inout_args)
        params = elementwise._get_kernel_params(self._params, is_ndarray,
                                                param_types)

        block_size = 512
        # routine[3] optionally overrides the accumulator type; fall back
        # to the first output type's name.
        reduce_type = routine[3]
        if reduce_type is None:
            reduce_type = elementwise._get_typename(out_types[0])

        type_preamble = (
            'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
                elementwise._get_typename(in_args[0].dtype),
                elementwise._get_typename(out_args[0].dtype)))

        kern = _make_reduction_function_kernel(
            self.name, block_size, reduce_type, params, self.identity,
            routine[0], routine[1], routine[2], type_preamble,
            self._input_expr, self._output_expr, self._preamble)
        shared_mem = 32 * block_size
        # NOTE(review): shared memory is disabled for larger outputs —
        # presumably the kernel takes a different path then; confirm.
        if out_clp2_size > 256:
            shared_mem = 0
        # TODO(okuta) set actual size
        kern.linear_launch(max(out_indexer.size, block_size),
                           inout_args,
                           shared_mem=shared_mem,
                           block_max_size=block_size)

        if len(out_args) == 1:
            return out_args[0]
        return tuple(out_args)
Example #2
0
def _get_reduction_kernel(params, is_ndarray, param_types, types):
    """Assemble kernel parameter declarations and code snippets.

    Returns a tuple ``(kernel_params, type_preamble, input_expr,
    output_expr)`` built from the parameter descriptors: a typedef
    preamble for each ``(name, dtype)`` in ``types``, plus load/store
    expressions for every non-raw ndarray parameter.
    """
    kernel_params = elementwise._get_kernel_params(
        params, is_ndarray, param_types)

    typedef_lines = []
    for alias, dtype in types:
        typedef_lines.append(
            'typedef {} {};'.format(elementwise._get_typename(dtype), alias))
    type_preamble = '\n'.join(typedef_lines)

    load_lines = []
    store_lines = []
    for nd_flag, param in six.moves.zip(is_ndarray, params):
        # Only non-raw ndarray parameters get generated accessors.
        if not nd_flag or param.raw:
            continue
        if param.const:
            load_lines.append(
                'const {0} {1} = _raw_{1}[_j];'.format(
                    param.ctype, param.name))
        else:
            store_lines.append(
                '{0} &{1} = _raw_{1}[_i];'.format(param.ctype, param.name))
    input_expr = '\n'.join(load_lines)
    output_expr = '\n'.join(store_lines)
    return kernel_params, type_preamble, input_expr, output_expr
Example #3
0
def _get_reduction_kernel(
        params, is_ndarray, param_types, types):
    """Build the kernel parameter list and source-code fragments.

    Produces ``(kernel_params, type_preamble, input_expr, output_expr)``:
    typedefs for the entries of ``types`` and element load/store lines
    for each non-raw ndarray parameter.
    """
    kernel_params = elementwise._get_kernel_params(
        params, is_ndarray, param_types)
    type_preamble = '\n'.join(
        'typedef {} {};'.format(elementwise._get_typename(dtype), alias)
        for alias, dtype in types)
    # Pre-select the parameters that need generated accessors: ndarray
    # arguments that are not marked raw.
    active = [p for flag, p in six.moves.zip(is_ndarray, params)
              if flag and not p.raw]
    input_expr = '\n'.join(
        'const {0} {1} = _raw_{1}[_j];'.format(p.ctype, p.name)
        for p in active if p.const)
    output_expr = '\n'.join(
        '{0} &{1} = _raw_{1}[_i];'.format(p.ctype, p.name)
        for p in active if not p.const)
    return kernel_params, type_preamble, input_expr, output_expr
Example #4
0
    def __call__(self, a, axis=None, dtype=None, out=None, keepdims=False):
        """Run the reduction over ``a`` and return the result.

        Args:
            a (cupy.ndarray): Array to reduce.
            axis: Axis or axes to reduce over; ``None`` reduces all axes.
            dtype: Data-type specifier passed to the routine guesser.
            out (cupy.ndarray): Optional preallocated output array.
            keepdims (bool): If ``True``, reduced axes are kept in the
                result as dimensions of size one.

        Returns:
            The single output array, or a tuple of output arrays when the
            kernel produces more than one.

        Raises:
            TypeError: If ``a`` is not a ``cupy.ndarray``.
            ValueError: If ``a`` is empty and the reduction has no identity.
        """
        if not isinstance(a, cupy.ndarray):
            raise TypeError('Input type must be cupy.ndarray')

        # A reduction with no identity element is undefined on an empty
        # array.  Raise an explicit error instead of using ``assert`` so
        # the check also runs under ``python -O`` and callers get a
        # catchable exception type.
        if self.identity is None and a.size == 0:
            raise ValueError('zero-size array to reduction operation '
                             '{} which has no identity'.format(self.name))
        in_args = [a]
        if out is None:
            out_args = []
        else:
            out_args = [out]
        internal.check_args_device(in_args + out_args)

        in_types, out_types, routine = self._guess_routine(in_args, dtype)

        axis = _get_axis(axis, a.ndim)
        out_shape = _get_out_shape(a.shape, axis, keepdims)
        out_args = elementwise._get_out_args(
            in_args, out_args, out_types, out_shape)
        in_args, in_shape = _get_trans_args(
            in_args, axis, in_args[0].shape)

        in_indexer = cindexer.Indexer(in_shape)
        out_indexer = cindexer.Indexer(out_shape)
        # Smallest power of two >= out_indexer.size.
        out_clp2_size = 2 ** int.bit_length(int(out_indexer.size - 1))

        inout_args, is_ndarray = _get_inout_args(
            in_args, out_args, in_indexer, out_indexer, out_clp2_size,
            self._params, True)
        param_types = elementwise._get_kernel_param_types(inout_args)
        params = elementwise._get_kernel_params(
            self._params, is_ndarray, param_types)

        block_size = 512
        # routine[3] optionally overrides the accumulator type; fall back
        # to the first output type's name.
        reduce_type = routine[3]
        if reduce_type is None:
            reduce_type = elementwise._get_typename(out_types[0])

        type_preamble = (
            'typedef {} type_in0_raw; typedef {} type_out0_raw;'.format(
                elementwise._get_typename(in_args[0].dtype),
                elementwise._get_typename(out_args[0].dtype)))

        kern = _make_reduction_function_kernel(
            self.name,
            block_size,
            reduce_type,
            params,
            self.identity,
            routine[0], routine[1], routine[2],
            type_preamble, self._input_expr, self._output_expr,
            self._preamble)
        shared_mem = 32 * block_size
        # NOTE(review): shared memory is disabled for larger outputs —
        # presumably the kernel takes a different path then; confirm.
        if out_clp2_size > 256:
            shared_mem = 0
        # TODO(okuta) set actual size
        kern.linear_launch(max(out_indexer.size, block_size), inout_args,
                           shared_mem=shared_mem,
                           block_max_size=block_size)

        if len(out_args) == 1:
            return out_args[0]
        return tuple(out_args)