Exemple #1
0
def parallel_sort(keys, values=None):
    N = keys.shape[0]

    num_stages = 0
    p = 1
    while p < N:
        k = p
        while k >= 1:
            invocations = int((N - k - k % p) / (2 * k)) + 1
            if values is None:
                sort_stage(keys, 0, keys, N, p, k, invocations)
            else:
                sort_stage(keys, 1, values, N, p, k, invocations)
            num_stages += 1
            sync()
            k = int(k / 2)
        p = int(p * 2)
    print(num_stages)
Exemple #2
0
    def run_benchmark():
        compile_time = time.time()
        _func(*args)  # compile the kernel first
        sync()
        compile_time = time.time() - compile_time
        stat_write('compilation_time', compile_time)
        codegen_stat = _ti_core.stat()
        for line in codegen_stat.split('\n'):
            try:
                a, b = line.strip().split(':')
            except:
                continue
            a = a.strip()
            b = int(float(b))
            if a == 'codegen_kernel_statements':
                stat_write('compiled_inst', b)
            if a == 'codegen_offloaded_tasks':
                stat_write('compiled_tasks', b)
            elif a == 'launched_tasks':
                stat_write('launched_tasks', b)

        # Use 3 initial iterations to warm up
        # instruction/data caches. Discussion:
        # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
        for _ in range(3):
            _func(*args)
            sync()
        clear_kernel_profile_info()
        t = time.time()
        for _ in range(repeat):
            _func(*args)
            sync()
        elapsed = time.time() - t
        avg = elapsed / repeat
        stat_write('wall_clk_t', avg)
        device_time = kernel_profiler_total_time()
        avg_device_time = device_time / repeat
        stat_write('exec_t', avg_device_time)
Exemple #3
0
        def func__(*args):
            assert len(args) == len(
                self.argument_annotations
            ), f'{len(self.argument_annotations)} arguments needed but {len(args)} provided'

            tmps = []
            callbacks = []
            has_external_arrays = False
            has_torch = has_pytorch()
            ndarray_use_torch = impl.get_runtime().ndarray_use_torch

            actual_argument_slot = 0
            launch_ctx = t_kernel.make_launch_context()
            for i, v in enumerate(args):
                needed = self.argument_annotations[i]
                if isinstance(needed, template):
                    continue
                provided = type(v)
                # Note: do not use sth like "needed == f32". That would be slow.
                if id(needed) in primitive_types.real_type_ids:
                    if not isinstance(v, (float, int)):
                        raise TaichiRuntimeTypeError(i, needed.to_string(),
                                                     provided)
                    launch_ctx.set_arg_float(actual_argument_slot, float(v))
                elif id(needed) in primitive_types.integer_type_ids:
                    if not isinstance(v, int):
                        raise TaichiRuntimeTypeError(i, needed.to_string(),
                                                     provided)
                    launch_ctx.set_arg_int(actual_argument_slot, int(v))
                elif isinstance(needed, sparse_matrix_builder):
                    # Pass only the base pointer of the ti.linalg.sparse_matrix_builder() argument
                    launch_ctx.set_arg_int(actual_argument_slot, v._get_addr())
                elif isinstance(needed, any_arr) and isinstance(
                        v, taichi.lang._ndarray.Ndarray):
                    has_external_arrays = True
                    v = v.arr
                    if ndarray_use_torch:
                        is_ndarray = True
                        tmp, torch_callbacks = self.get_torch_callbacks(
                            v, has_torch, is_ndarray)
                        callbacks += torch_callbacks
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp.data_ptr()),
                            tmp.element_size() * tmp.nelement(), v.shape)
                    else:
                        launch_ctx.set_arg_ndarray(actual_argument_slot, v)
                elif isinstance(needed, any_arr) and (self.match_ext_arr(v)):
                    has_external_arrays = True
                    is_numpy = isinstance(v, np.ndarray)
                    if is_numpy:
                        tmp = np.ascontiguousarray(v)
                        # Purpose: DO NOT GC |tmp|!
                        tmps.append(tmp)
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp.ctypes.data),
                            tmp.nbytes, v.shape)
                    else:
                        is_ndarray = False
                        tmp, torch_callbacks = self.get_torch_callbacks(
                            v, has_torch, is_ndarray)
                        callbacks += torch_callbacks
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp.data_ptr()),
                            tmp.element_size() * tmp.nelement(), v.shape)

                elif isinstance(needed, MatrixType):
                    if id(needed.dtype) in primitive_types.real_type_ids:
                        for a in range(needed.n):
                            for b in range(needed.m):
                                if not isinstance(v[a, b], (int, float)):
                                    raise TaichiRuntimeTypeError(
                                        i, needed.dtype.to_string(),
                                        type(v[a, b]))
                                launch_ctx.set_arg_float(
                                    actual_argument_slot, float(v[a, b]))
                                actual_argument_slot += 1
                    elif id(needed.dtype) in primitive_types.integer_type_ids:
                        for a in range(needed.n):
                            for b in range(needed.m):
                                if not isinstance(v[a, b], int):
                                    raise TaichiRuntimeTypeError(
                                        i, needed.dtype.to_string(),
                                        type(v[a, b]))
                                launch_ctx.set_arg_int(actual_argument_slot,
                                                       int(v[a, b]))
                                actual_argument_slot += 1
                    else:
                        raise ValueError(
                            f'Matrix dtype {needed.dtype} is not integer type or real type.'
                        )
                    continue
                else:
                    raise ValueError(
                        f'Argument type mismatch. Expecting {needed}, got {type(v)}.'
                    )
                actual_argument_slot += 1
            # Both the class kernels and the plain-function kernels are unified now.
            # In both cases, |self.grad| is another Kernel instance that computes the
            # gradient. For class kernels, args[0] is always the kernel owner.
            if not self.is_grad and self.runtime.target_tape and not self.runtime.grad_replaced:
                self.runtime.target_tape.insert(self, args)

            t_kernel(launch_ctx)

            ret = None
            ret_dt = self.return_type
            has_ret = ret_dt is not None

            if has_ret or (impl.current_cfg().async_mode
                           and has_external_arrays):
                runtime_ops.sync()

            if has_ret:
                if id(ret_dt) in primitive_types.integer_type_ids:
                    ret = t_kernel.get_ret_int(0)
                else:
                    ret = t_kernel.get_ret_float(0)

            if callbacks:
                for c in callbacks:
                    c()

            return ret
Exemple #4
0
        def func__(*args):
            assert len(args) == len(
                self.arguments
            ), f'{len(self.arguments)} arguments needed but {len(args)} provided'

            tmps = []
            callbacks = []
            has_external_arrays = False
            has_torch = has_pytorch()
            has_pp = has_paddle()

            actual_argument_slot = 0
            launch_ctx = t_kernel.make_launch_context()
            for i, v in enumerate(args):
                needed = self.arguments[i].annotation
                if isinstance(needed, template):
                    continue
                provided = type(v)
                # Note: do not use sth like "needed == f32". That would be slow.
                if id(needed) in primitive_types.real_type_ids:
                    if not isinstance(v, (float, int)):
                        raise TaichiRuntimeTypeError.get(
                            i, needed.to_string(), provided)
                    launch_ctx.set_arg_float(actual_argument_slot, float(v))
                elif id(needed) in primitive_types.integer_type_ids:
                    if not isinstance(v, int):
                        raise TaichiRuntimeTypeError.get(
                            i, needed.to_string(), provided)
                    launch_ctx.set_arg_int(actual_argument_slot, int(v))
                elif isinstance(needed, sparse_matrix_builder):
                    # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument
                    launch_ctx.set_arg_int(actual_argument_slot, v._get_addr())
                elif isinstance(needed,
                                ndarray_type.NdarrayType) and isinstance(
                                    v, taichi.lang._ndarray.Ndarray):
                    has_external_arrays = True
                    v = v.arr
                    launch_ctx.set_arg_ndarray(actual_argument_slot, v)
                elif isinstance(needed,
                                texture_type.TextureType) and isinstance(
                                    v, taichi.lang._texture.Texture):
                    has_external_arrays = True
                    v = v.tex
                    launch_ctx.set_arg_texture(actual_argument_slot, v)
                elif isinstance(needed,
                                texture_type.RWTextureType) and isinstance(
                                    v, taichi.lang._texture.Texture):
                    has_external_arrays = True
                    v = v.tex
                    launch_ctx.set_arg_rw_texture(actual_argument_slot, v)
                elif isinstance(
                        needed,
                        ndarray_type.NdarrayType) and (self.match_ext_arr(v)):
                    has_external_arrays = True
                    is_numpy = isinstance(v, np.ndarray)
                    is_torch = isinstance(v,
                                          torch.Tensor) if has_torch else False

                    # Element shapes are already spcialized in Taichi codegen.
                    # The shape information for element dims are no longer needed.
                    # Therefore we strip the element shapes from the shape vector,
                    # so that it only holds "real" array shapes.
                    is_soa = needed.layout == Layout.SOA
                    array_shape = v.shape
                    element_dim = needed.element_dim
                    if element_dim:
                        array_shape = v.shape[
                            element_dim:] if is_soa else v.shape[:-element_dim]
                    if is_numpy:
                        tmp = np.ascontiguousarray(v)
                        # Purpose: DO NOT GC |tmp|!
                        tmps.append(tmp)
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp.ctypes.data),
                            tmp.nbytes, array_shape)
                    elif is_torch:
                        is_ndarray = False
                        tmp, torch_callbacks = self.get_torch_callbacks(
                            v, has_torch, is_ndarray)
                        callbacks += torch_callbacks
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp.data_ptr()),
                            tmp.element_size() * tmp.nelement(), array_shape)
                    else:
                        # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch
                        tmp, paddle_callbacks = self.get_paddle_callbacks(
                            v, has_pp)
                        callbacks += paddle_callbacks
                        launch_ctx.set_arg_external_array_with_shape(
                            actual_argument_slot, int(tmp._ptr()),
                            v.element_size() * v.size, array_shape)

                elif isinstance(needed, MatrixType):
                    if id(needed.dtype) in primitive_types.real_type_ids:
                        for a in range(needed.n):
                            for b in range(needed.m):
                                if not isinstance(v[a, b], (int, float)):
                                    raise TaichiRuntimeTypeError.get(
                                        i, needed.dtype.to_string(),
                                        type(v[a, b]))
                                launch_ctx.set_arg_float(
                                    actual_argument_slot, float(v[a, b]))
                                actual_argument_slot += 1
                    elif id(needed.dtype) in primitive_types.integer_type_ids:
                        for a in range(needed.n):
                            for b in range(needed.m):
                                if not isinstance(v[a, b], int):
                                    raise TaichiRuntimeTypeError.get(
                                        i, needed.dtype.to_string(),
                                        type(v[a, b]))
                                launch_ctx.set_arg_int(actual_argument_slot,
                                                       int(v[a, b]))
                                actual_argument_slot += 1
                    else:
                        raise ValueError(
                            f'Matrix dtype {needed.dtype} is not integer type or real type.'
                        )
                    continue
                else:
                    raise ValueError(
                        f'Argument type mismatch. Expecting {needed}, got {type(v)}.'
                    )
                actual_argument_slot += 1
            # Both the class kernels and the plain-function kernels are unified now.
            # In both cases, |self.grad| is another Kernel instance that computes the
            # gradient. For class kernels, args[0] is always the kernel owner.
            if self.autodiff_mode == AutodiffMode.NONE and self.runtime.target_tape and not self.runtime.grad_replaced:
                self.runtime.target_tape.insert(self, args)

            if actual_argument_slot > 8 and (
                    impl.current_cfg().arch == _ti_core.opengl
                    or impl.current_cfg().arch == _ti_core.cc):
                raise TaichiRuntimeError(
                    f"The number of elements in kernel arguments is too big! Do not exceed 8 on {_ti_core.arch_name(impl.current_cfg().arch)} backend."
                )

            if actual_argument_slot > 64 and (
                (impl.current_cfg().arch != _ti_core.opengl
                 and impl.current_cfg().arch != _ti_core.cc)):
                raise TaichiRuntimeError(
                    f"The number of elements in kernel arguments is too big! Do not exceed 64 on {_ti_core.arch_name(impl.current_cfg().arch)} backend."
                )

            try:
                t_kernel(launch_ctx)
            except Exception as e:
                e = handle_exception_from_cpp(e)
                raise e from None

            ret = None
            ret_dt = self.return_type
            has_ret = ret_dt is not None

            if has_ret:
                runtime_ops.sync()

            if has_ret:
                if id(ret_dt) in primitive_types.integer_type_ids:
                    ret = t_kernel.get_ret_int(0)
                elif id(ret_dt) in primitive_types.real_type_ids:
                    ret = t_kernel.get_ret_float(0)
                elif id(ret_dt.dtype) in primitive_types.integer_type_ids:
                    it = iter(t_kernel.get_ret_int_tensor(0))
                    ret = Matrix([[next(it) for _ in range(ret_dt.m)]
                                  for _ in range(ret_dt.n)])
                else:
                    it = iter(t_kernel.get_ret_float_tensor(0))
                    ret = Matrix([[next(it) for _ in range(ret_dt.m)]
                                  for _ in range(ret_dt.n)])
            if callbacks:
                for c in callbacks:
                    c()

            return ret