Example #1
def gen_tests_for_cvt_reinterpret(opts, tt, t, operator):
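    # Emits one standalone C++ test that round-trips a0 through the
    # operator ({typ} -> {totyp} -> {typ}) on each backend (CUDA, ROCm,
    # oneAPI, scalar CPU) and compares the reference against the SPMD
    # kernel's output. Relies on module-level names from the surrounding
    # build script: os, common, nsimd_tests, gpu_params, tpb and k_typ.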
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}_{}.cpp'.format(op_name, t, tt))
    if not common.can_create_filename(opts, filename):
        return

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/spmd.hpp>
        #include <nsimd/modules/memory_management.hpp>
        #include <nsimd/scalar_utilities.h>
        #include "../common.hpp"

        #if defined(NSIMD_CUDA)

        __global__ void kernel({typ} *dst, {typ} *a0, int n) {{
          int i = threadIdx.x + blockIdx.x * blockDim.x;
          if (i < n) {{
            dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                         {totyp}(), a0[i]));
          }}
        }}

        void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{
          kernel<<<{gpu_params}>>>(dst, a0, int(n));
        }}

        {cbprng_cuda}

        #elif defined(NSIMD_ROCM)

        __global__ void kernel({typ} *dst, {typ} *a0, size_t n) {{
          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
          if (i < n) {{
            dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                         {totyp}(), a0[i]));
          }}
        }}

        void compute_result({typ} *dst, {typ} *a0, size_t n) {{
          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n);
        }}

        {cbprng_hip}

        #elif defined(NSIMD_ONEAPI)

        inline void kernel({typ} *dst, {typ} *a0, const size_t n,
                           sycl::nd_item<1> item) {{
          const size_t i = item.get_global_id().get(0);
          if (i < n) {{
            dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                              {totyp}(), a0[i]));
          }}
        }}

        void compute_result({typ} *dst, {typ} *a0, size_t n) {{
          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
          sycl::queue q_ = nsimd::oneapi::default_queue();
          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                            sycl::range<1>({tpb})),
                                            [=](sycl::nd_item<1> item){{
            kernel(dst, a0, n, item);
          }}).wait_and_throw();
        }}

        {cbprng_oneapi}

        #else

        void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{
          for (unsigned int i = 0; i < n; i++) {{
            dst[i] = nsimd::scalar_{op_name}({typ}(), nsimd::scalar_{op_name}(
                         {totyp}(), a0[i]));
          }}
        }}

        {cbprng_cpu}

        #endif

        // clang-format off

        spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0)
          k_store(dst, k_{op_name}({k_typ}, k_{op_name}({k_totyp},
                  k_load(a0))));
        spmd_kernel_end

        // clang-format on

        int main() {{
          unsigned int n_[3] = {{ 10, 1001, 10001 }};
          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
            unsigned int n = n_[i];
            int ret = 0;
            {typ} *a0 = nsimd::device_calloc<{typ}>(n);
            random(a0, n, 0);
            {typ} *ref = nsimd::device_calloc<{typ}>(n);
            {typ} *out = nsimd::device_calloc<{typ}>(n);
            spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0);
            compute_result(ref, a0, n);
            if (!cmp(ref, out, n)) {{
              ret = -1;
            }}
            nsimd::device_free(a0);
            nsimd::device_free(ref);
            nsimd::device_free(out);
            if (ret != 0) {{
              return ret;
            }}
          }}
          return 0;
        }}
        '''.format(
            typ=t,
            totyp=tt,
            op_name=op_name,
            typnbits=t[1:],
            gpu_params=gpu_params,
            k_typ=k_typ[t[0]],
            tpb=tpb,
            cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
            cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda'),
            cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
            cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                             ['(int)n', str(tpb)]),
            k_totyp=k_typ[tt[0]]))

    common.clang_format(opts, filename, cuda=True)
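
A minimal driver sketch for the generator above. The Opts and Op stand-ins are hypothetical: in the real nsimd build scripts the options object, the operator objects and the module-level globals (gpu_params, tpb, k_typ) are defined elsewhere.

import collections

# Hypothetical stand-ins for the real options and operator objects;
# only the attributes used in this sketch are modeled.
Opts = collections.namedtuple('Opts', ['tests_dir'])
Op = collections.namedtuple('Op', ['name'])

opts = Opts(tests_dir='build/tests')

# One test file per (type, other-type) pair, e.g. an f32 <-> i32 round-trip.
for t, tt in [('f32', 'i32'), ('i32', 'f32')]:
    gen_tests_for_cvt_reinterpret(opts, tt, t, Op(name='cvt'))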
Example #2
def gen_tests_for(opts, t, operator):
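    # Same skeleton as the cvt/reinterpret generator, but for arbitrary
    # operators: argument lists, kernel bodies and the comparison are all
    # derived from operator.params ('v' = vector, 'l' = logical).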
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return

    arity = len(operator.params[1:])
    k_args = ', '.join(['{} *a{}'.format(t, i) for i in range(arity)])
    k_call_args = ', '.join(['a{}'.format(i) for i in range(arity)])

    fill_tabs = '\n'.join(['{typ} *a{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(a{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])

    free_tabs = '\n'.join(['nsimd::device_free(a{i});'.format(i=i)
                           for i in range(arity)])

    # spmd
    def get_cte_spmd(typ, cte):
        if typ == 'f16':
            return 'k_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def spmd_load_code(param, typ, i):
        if param == 'l':
            return 'k_lt(k_load(a{}), {})'.format(i, get_cte_spmd(typ, 4))
        if param == 'v':
            return 'k_load(a{})'.format(i)

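    # 'l' (logical) parameters are materialized as a comparison against
    # the constant 4; 'v' (vector) parameters load the input directly.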
    args = ', '.join([spmd_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = k_typ[t[0]] + ', ' + args
    if operator.params[0] == 'v':
        k_code = 'k_store(dst, k_{}({}));'.format(op_name, args)
    else:
        k_code = '''k_if (k_{}({}))
                      k_store(dst, 1);
                    k_else
                      k_store(dst, 0);
                    k_endif'''.format(op_name, args)

    # gpu
    def get_cte_gpu(typ, cte, target):
        if typ == 'f16' and target == 'cuda_rocm':
            return '__float2half((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def gpu_load_code(param, typ, i, target):
        if param == 'l':
            return 'nsimd::gpu_lt(a{}[i], {})'. \
                   format(i, get_cte_gpu(typ, 4, target))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args_cuda_rocm = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                              'cuda_rocm') \
                                              for i in range(arity)])
    args_oneapi = ', '.join([
        gpu_load_code(operator.params[i + 1], t, i, 'oneapi')
        for i in range(arity)
    ])
    if op_name == 'to_mask':
        args_cuda_rocm = t + '(), ' + args_cuda_rocm
        args_oneapi = t + '(), ' + args_oneapi
    if operator.params[0] == 'v':
        cuda_rocm_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                           format(op_name, args_cuda_rocm)
        oneapi_kernel = 'dst[i] = nsimd::gpu_{}({});'. \
                        format(op_name, args_oneapi)
    else:
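        # Two-stage formatting: .format(op_name) turns {{}} into {} and
        # {{{{ into {{, leaving slots that tmpl.format(...) fills below
        # with the condition and the two constants.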
        tmpl = '''if (nsimd::gpu_{}({{}})) {{{{
                    dst[i] = {{}};
                  }}}} else {{{{
                    dst[i] = {{}};
                  }}}}'''.format(op_name)
        cuda_rocm_kernel = tmpl.format(args_cuda_rocm,
                                       get_cte_gpu(t, 1, 'cuda_rocm'),
                                       get_cte_gpu(t, 0, 'cuda_rocm'))
        oneapi_kernel = tmpl.format(args_oneapi, get_cte_gpu(t, 1, 'oneapi'),
                                    get_cte_gpu(t, 0, 'oneapi'))

    # cpu
    def get_cte_cpu(typ, cte):
        if typ == 'f16':
            return 'nsimd_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def cpu_load_code(param, typ, i):
        if param == 'l':
            return 'nsimd::scalar_lt(a{}[i], {})'. \
                   format(i, get_cte_cpu(typ, 4))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args = ', '.join([cpu_load_code(operator.params[i + 1], t, i) \
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = t + '(), ' + args
    if operator.params[0] == 'v':
        cpu_kernel = 'dst[i] = nsimd::scalar_{}({});'.format(op_name, args)
    else:
        cpu_kernel = '''if (nsimd::scalar_{op_name}({args})) {{
                          dst[i] = {one};
                        }} else {{
                          dst[i] = {zero};
                        }}'''.format(op_name=op_name,
                                     args=args,
                                     one=get_cte_cpu(t, 1),
                                     zero=get_cte_cpu(t, 0))

    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/spmd.hpp>
        #include <nsimd/modules/memory_management.hpp>
        #include <nsimd/scalar_utilities.h>
        #include "../common.hpp"

        #if defined(NSIMD_CUDA)

        __global__ void kernel({typ} *dst, {k_args}, int n) {{
          int i = threadIdx.x + blockIdx.x * blockDim.x;
          if (i < n) {{
            {cuda_rocm_kernel}
          }}
        }}

        void compute_result({typ} *dst, {k_args}, unsigned int n) {{
          kernel<<<{gpu_params}>>>(dst, {k_call_args}, int(n));
        }}

        {cbprng_cuda}

        #elif defined(NSIMD_ROCM)

        __global__ void kernel({typ} *dst, {k_args}, size_t n) {{
          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
          if (i < n) {{
            {cuda_rocm_kernel}
          }}
        }}

        void compute_result({typ} *dst, {k_args}, size_t n) {{
          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {k_call_args},
                             n);
        }}

        {cbprng_hip}

        #elif defined(NSIMD_ONEAPI)

        inline void kernel({typ} *dst, {k_args}, const size_t n,
                           sycl::nd_item<1> item) {{
          const size_t i = item.get_global_id().get(0);
          if (i < n) {{
            {oneapi_kernel}
          }}
        }}

        void compute_result({typ} *dst, {k_args}, size_t n) {{
          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
          sycl::queue q_ = nsimd::oneapi::default_queue();
          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                            sycl::range<1>({tpb})),
                                            [=](sycl::nd_item<1> item){{
            kernel(dst, {k_call_args}, n, item);
          }}).wait_and_throw();
        }}

        {cbprng_oneapi}

        #else

        void compute_result({typ} *dst, {k_args}, unsigned int n) {{
          for (unsigned int i = 0; i < n; i++) {{
            {cpu_kernel}
          }}
        }}

        {cbprng_cpu}

        #endif

        // clang-format off

        spmd_kernel_1d(kernel, {typ} *dst, {k_args})
          {k_code}
        spmd_kernel_end

        // clang-format on

        #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)
        #define THREADS_PER_BLOCK 128
        #else
        #define THREADS_PER_BLOCK 1
        #endif

        int main() {{
          unsigned int n_[3] = {{ 10, 1001, 10001 }};
          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
            unsigned int n = n_[i];
            int ret = 0;
            {fill_tabs}
            {typ} *ref = nsimd::device_calloc<{typ}>(n);
            {typ} *out = nsimd::device_calloc<{typ}>(n);
            spmd_launch_kernel_1d(kernel, {typnbits}, THREADS_PER_BLOCK, n,
                                  out, {k_call_args});
            compute_result(ref, {k_call_args}, n);
            if ({comp}) {{
              ret = -1;
            }}
            nsimd::device_free(ref);
            nsimd::device_free(out);
            {free_tabs}
            if (ret != 0) {{
              return ret;
            }}
          }}
          return 0;
        }}
        '''.format(
            typ=t,
            free_tabs=free_tabs,
            fill_tabs=fill_tabs,
            k_code=k_code,
            k_call_args=k_call_args,
            k_args=k_args,
            cpu_kernel=cpu_kernel,
            comp=comp,
            cuda_rocm_kernel=cuda_rocm_kernel,
            oneapi_kernel=oneapi_kernel,
            cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
            cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
            cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
            cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                             ['(int)n', str(tpb)]),
            gpu_params=gpu_params,
            typnbits=t[1:],
            tpb=tpb))

    common.clang_format(opts, filename, cuda=True)
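
To make the string assembly above concrete: for a hypothetical binary operator named add with params ['v', 'v', 'v'] and t = 'f32', the derived pieces expand as follows (a standalone reconstruction, independent of the generator):

arity = 2
k_args = ', '.join('f32 *a{}'.format(i) for i in range(arity))
# -> 'f32 *a0, f32 *a1'
k_call_args = ', '.join('a{}'.format(i) for i in range(arity))
# -> 'a0, a1'
args = ', '.join('k_load(a{})'.format(i) for i in range(arity))
k_code = 'k_store(dst, k_{}({}));'.format('add', args)
print(k_code)  # k_store(dst, k_add(k_load(a0), k_load(a1)));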
Example #3
def gen_tests_for(opts, tt, t, operator):
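    # tet1d variant: emits expression-template code (tet1d::in/out)
    # instead of an SPMD kernel, with dedicated branches for cvt,
    # reinterpret and to_mask/to_logical round-trips.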
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(
        dirname, '{}.{}.cpp'.format(op_name,
                                    t if t == tt else '{}_{}'.format(t, tt)))
    if not common.can_create_filename(opts, filename):
        return

    arity = len(operator.params[1:])
    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i) \
                           for i in range(arity)])
    args_tabs_call = ', '.join(['tab{i}'.format(i=i) \
                                for i in range(arity)])
    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i) \
                                  for i in range(arity)])
    args_in_tabs_call = ', '.join(['tet1d::in(tab{i}, n)'. \
                                   format(i=i) \
                                   for i in range(arity)])

    fill_tabs = '\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'random(tab{i}, n, {i});'.format(typ=t, i=i) \
                           for i in range(arity)])

    free_tabs = '\n'.join(['nsimd::device_free(tab{i});'.format(i=i)
                           for i in range(arity)])

    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'
    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'
    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab0[i]) == 1.0f'
    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab1[i]) == 1.0f'

    if op_name == 'cvt':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>(
                                     tet1d::in(tab0, n)));'''. \
                                     format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt(
                            {tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name == 'reinterpret':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::reinterpret<{t}>(
                                     tet1d::reinterpret<{tt}>(tet1d::in(
                                         tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_reinterpret({t}(),
                            nsimd::{{p}}_reinterpret({tt}(),
                                tab0[i]));'''.format(t=t, tt=tt)
    elif op_name in ['to_mask', 'to_logical']:
        tet1d_code = \
            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in(
                                     tab0, n)));'''
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_to_mask({t}(),
                            nsimd::{{p}}_to_logical(tab0[i]));'''. \
                            format(t=t)
    elif operator.params == ['v'] * len(operator.params):
        compute_result_kernel = \
            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'. \
            format(op_name=op_name, args_tabs_i_call=args_tabs_i_call)
        if operator.cxx_operator is not None:
            if len(operator.params[1:]) == 1:
                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \
                             format(cxx_op=operator.cxx_operator)
            else:
                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \
                             'tet1d::in(tab1, n);'. \
                             format(cxx_op=operator.cxx_operator)
        else:
            tet1d_code = \
                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \
                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)
    elif operator.params == ['l', 'v', 'v']:
        if operator.cxx_operator is not None:
            cond = 'A {} B'.format(operator.cxx_operator)
        else:
            cond = 'tet1d::{}(A, B)'.format(op_name)
        tet1d_code = \
            '''TET1D_OUT({typ}) Z = tet1d::out(out);
               TET1D_IN({typ}) A = tet1d::in(tab0, n);
               TET1D_IN({typ}) B = tet1d::in(tab1, n);
               Z({cond}) = 1;'''.format(cond=cond, typ=t)
        compute_result_kernel = \
            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{
                 dst[i] = {one};
               }}}} else {{{{
                 dst[i] = {zero};
               }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero)
    elif operator.params == ['l'] * len(operator.params):
        if len(operator.params[1:]) == 1:
            if operator.cxx_operator is not None:
                cond = '{}(A == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1)
        elif len(operator.params[1:]) == 2:
            if operator.cxx_operator is not None:
                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   TET1D_IN({typ}) B = tet1d::in(tab1, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1},
                                              {comp_tab1_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, typ=t, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1,
                                  comp_tab1_to_1=comp_tab1_to_1)
    else:
        raise Exception('Unsupported operator: "{}"'.format(op_name))

    cpu_kernel = compute_result_kernel.format(p='scalar',
                                              f32_to_f16='nsimd_f32_to_f16',
                                              f16_to_f32='nsimd_f16_to_f32')
    cuda_rocm_kernel = compute_result_kernel.format(p='gpu',
                                                    f32_to_f16='__float2half',
                                                    f16_to_f32='__half2float')
    oneapi_kernel = compute_result_kernel.format(p='gpu',
                                                 f32_to_f16='(f16)',
                                                 f16_to_f32='(f32)')

    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes \
                                        else ', {}'.format(operator.ufp[t]))

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/tet1d.hpp>
        #include <nsimd/modules/memory_management.hpp>
        #include "../common.hpp"

        #if defined(NSIMD_CUDA)

        __global__ void kernel({typ} *dst, {args_tabs}, int n) {{
          int i = threadIdx.x + blockIdx.x * blockDim.x;
          if (i < n) {{
            {cuda_rocm_kernel}
          }}
        }}

        void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{
          kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n));
        }}

        {cbprng_cuda}

        #elif defined(NSIMD_ROCM)

        __global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{
          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
          if (i < n) {{
            {cuda_rocm_kernel}
          }}
        }}

        void compute_result({typ} *dst, {args_tabs}, size_t n) {{
          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call},
                             n);
        }}

        {cbprng_hip}

        #elif defined(NSIMD_ONEAPI)

        inline void kernel({typ} *dst, {args_tabs}, const size_t n,
                           sycl::nd_item<1> item) {{
          size_t i = item.get_global_id().get(0);
          if (i < n) {{
            {oneapi_kernel}
          }}
        }}

        void compute_result({typ} *dst, {args_tabs}, const size_t n) {{
          size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
          sycl::queue q_ = nsimd::oneapi::default_queue();
          q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                            sycl::range<1>({tpb})),
                                            [=](sycl::nd_item<1> item){{
            kernel(dst, {args_tabs_call}, n, item);
          }}).wait_and_throw();
        }}

        {cbprng_oneapi}

        #else

        void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{
          for (unsigned int i = 0; i < n; i++) {{
            {cpu_kernel}
          }}
        }}

        {cbprng_cpu}

        #endif

        int main() {{
          unsigned int n_[3] = {{ 10, 1001, 10001 }};
          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
            unsigned int n = n_[i];
            int ret = 0;
            {fill_tabs}
            {typ} *ref = nsimd::device_calloc<{typ}>(n);
            {typ} *out = nsimd::device_calloc<{typ}>(n);
            compute_result(ref, {args_tabs_call}, n);
            {tet1d_code}
            if ({comp}) {{
              ret = -1;
            }}
            nsimd::device_free(ref);
            nsimd::device_free(out);
            {free_tabs}
            if (ret != 0) {{
              return ret;
            }}
          }}
          return 0;
        }}
        '''.format(
            typ=t,
            args_tabs=args_tabs,
            fill_tabs=fill_tabs,
            args_tabs_call=args_tabs_call,
            gpu_params=gpu_params,
            free_tabs=free_tabs,
            tet1d_code=tet1d_code,
            comp=comp,
            cpu_kernel=cpu_kernel,
            tpb=tpb,
            cuda_rocm_kernel=cuda_rocm_kernel,
            oneapi_kernel=oneapi_kernel,
            cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
            cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
            cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
            cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                             ['(int)n', str(tpb)])))

    common.clang_format(opts, filename, cuda=True)
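
The {{p}} placeholders above rely on two-stage formatting: doubled braces survive the first .format() call and the resulting {p} slot is filled per backend afterwards. A standalone illustration for the cvt branch with t = 'f32' and tt = 'i32':

compute_result_kernel = \
    '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt(
                    {tt}(), tab0[i]));'''.format(t='f32', tt='i32')
# After the first .format(), {{p}} has become {p}; the second call
# picks the backend prefix:
cpu_kernel = compute_result_kernel.format(p='scalar')
gpu_kernel = compute_result_kernel.format(p='gpu')
print(cpu_kernel)
# dst[i] = nsimd::scalar_cvt(f32(), nsimd::scalar_cvt(
#                    i32(), tab0[i]));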