import os

import common
import nsimd_tests


# Generates the SPMD-module test for cvt/reinterpret: the test round-trips
# each value through the second type (t -> tt -> t) and checks that the
# original values are recovered.
def gen_tests_for_cvt_reinterpret(opts, tt, t, operator):
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}_{}.cpp'.format(op_name, t, tt))
    if not common.can_create_filename(opts, filename):
        return

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/spmd.hpp>
#include <nsimd/modules/memory_management.hpp>
#include <nsimd/scalar_utilities.h>
#include "../common.hpp"

#if defined(NSIMD_CUDA)

__global__ void kernel({typ} *dst, {typ} *a0, int n) {{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {{
    dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                 {totyp}(), a0[i]));
  }}
}}

void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{
  kernel<<<{gpu_params}>>>(dst, a0, int(n));
}}

{cbprng_cuda}

#elif defined(NSIMD_ROCM)

__global__ void kernel({typ} *dst, {typ} *a0, size_t n) {{
  size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  if (i < n) {{
    dst[i] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                 {totyp}(), a0[i]));
  }}
}}

void compute_result({typ} *dst, {typ} *a0, size_t n) {{
  hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, a0, n);
}}

{cbprng_hip}

#elif defined(NSIMD_ONEAPI)

inline void kernel({typ} *dst, {typ} *a0, const size_t n,
                   sycl::nd_item<1> item) {{
  const size_t ii = item.get_global_id().get(0);
  if (ii < n) {{
    dst[ii] = nsimd::gpu_{op_name}({typ}(), nsimd::gpu_{op_name}(
                  {totyp}(), a0[ii]));
  }}
}}

void compute_result({typ} *dst, {typ} *a0, size_t n) {{
  size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
  sycl::queue q_ = nsimd::oneapi::default_queue();
  q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                    sycl::range<1>({tpb})),
                  [=](sycl::nd_item<1> item) {{
                    kernel(dst, a0, n, item);
                  }}).wait_and_throw();
}}

{cbprng_oneapi}

#else

void compute_result({typ} *dst, {typ} *a0, unsigned int n) {{
  for (unsigned int i = 0; i < n; i++) {{
    dst[i] = nsimd::scalar_{op_name}({typ}(), nsimd::scalar_{op_name}(
                 {totyp}(), a0[i]));
  }}
}}

{cbprng_cpu}

#endif

// clang-format off

spmd_kernel_1d(kernel, {typ} *dst, {typ} *a0)
  k_store(dst, k_{op_name}({k_typ}, k_{op_name}({k_totyp}, k_load(a0))));
spmd_kernel_end

// clang-format on

int main() {{
  unsigned int n_[3] = {{ 10, 1001, 10001 }};
  for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
    unsigned int n = n_[i];
    int ret = 0;
    {typ} *a0 = nsimd::device_calloc<{typ}>(n);
    random(a0, n, 0);
    {typ} *ref = nsimd::device_calloc<{typ}>(n);
    {typ} *out = nsimd::device_calloc<{typ}>(n);
    spmd_launch_kernel_1d(kernel, {typnbits}, 1, n, out, a0);
    compute_result(ref, a0, n);
    if (!cmp(ref, out, n)) {{
      ret = -1;
    }}
    nsimd::device_free(a0);
    nsimd::device_free(ref);
    nsimd::device_free(out);
    if (ret != 0) {{
      return ret;
    }}
  }}
  return 0;
}}
'''.format(typ=t, totyp=tt, op_name=op_name, typnbits=t[1:],
           gpu_params=gpu_params, k_typ=k_typ[t[0]], tpb=tpb,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda'),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)]),
           k_totyp=k_typ[tt[0]]))
    common.clang_format(opts, filename, cuda=True)
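# NOTE: the generators in this section rely on three module-level globals,
# k_typ, tpb and gpu_params, defined elsewhere in the file. A minimal sketch
# of plausible definitions is given below for orientation only; the names
# are real but the values shown here are assumptions, not necessarily
# nsimd's actual ones:
#
#     k_typ = {'i': 'k_int', 'u': 'k_uint', 'f': 'k_float'}  # SPMD type tags
#     tpb = 128                                   # GPU threads per block
#     gpu_params = '(n + {0} - 1) / {0}, {0}'.format(tpb)  # <<<grid, block>>>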
# Generates the SPMD-module test for a generic operator: the SPMD kernel is
# checked against a straight scalar (or per-target GPU) loop. Input
# parameters of kind 'l' (logical) are synthesized with a lt-against-4
# comparison.
def gen_tests_for(opts, t, operator):
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'spmd')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, filename):
        return

    arity = len(operator.params[1:])
    k_args = ', '.join(['{} *a{}'.format(t, i) for i in range(arity)])
    k_call_args = ', '.join(['a{}'.format(i) for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *a{i} = nsimd::device_calloc<{typ}>(n);\n'
                           'random(a{i}, n, {i});'.format(typ=t, i=i)
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(a{i});'.format(i=i)
                           for i in range(arity)])

    # spmd
    def get_cte_spmd(typ, cte):
        if typ == 'f16':
            return 'k_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def spmd_load_code(param, typ, i):
        if param == 'l':
            return 'k_lt(k_load(a{}), {})'.format(i, get_cte_spmd(typ, 4))
        if param == 'v':
            return 'k_load(a{})'.format(i)

    args = ', '.join([spmd_load_code(operator.params[i + 1], t, i)
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = k_typ[t[0]] + ', ' + args
    if operator.params[0] == 'v':
        k_code = 'k_store(dst, k_{}({}));'.format(op_name, args)
    else:
        k_code = '''k_if (k_{}({}))
                      k_store(dst, 1);
                    k_else
                      k_store(dst, 0);
                    k_endif'''.format(op_name, args)

    # gpu
    def get_cte_gpu(typ, cte, target):
        if typ == 'f16' and target == 'cuda_rocm':
            return '__float2half((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def gpu_load_code(param, typ, i, target):
        if param == 'l':
            return 'nsimd::gpu_lt(a{}[i], {})'.format(
                       i, get_cte_gpu(typ, 4, target))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args_cuda_rocm = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                              'cuda_rocm')
                                for i in range(arity)])
    args_oneapi = ', '.join([gpu_load_code(operator.params[i + 1], t, i,
                                           'oneapi')
                             for i in range(arity)])
    if op_name == 'to_mask':
        args_cuda_rocm = t + '(), ' + args_cuda_rocm
        args_oneapi = t + '(), ' + args_oneapi
    if operator.params[0] == 'v':
        cuda_rocm_kernel = 'dst[i] = nsimd::gpu_{}({});'.format(
                               op_name, args_cuda_rocm)
        oneapi_kernel = 'dst[i] = nsimd::gpu_{}({});'.format(
                            op_name, args_oneapi)
    else:
        tmpl = '''if (nsimd::gpu_{}({{}})) {{{{
                    dst[i] = {{}};
                  }}}} else {{{{
                    dst[i] = {{}};
                  }}}}'''.format(op_name)
        cuda_rocm_kernel = tmpl.format(args_cuda_rocm,
                                       get_cte_gpu(t, 1, 'cuda_rocm'),
                                       get_cte_gpu(t, 0, 'cuda_rocm'))
        oneapi_kernel = tmpl.format(args_oneapi,
                                    get_cte_gpu(t, 1, 'oneapi'),
                                    get_cte_gpu(t, 0, 'oneapi'))

    # cpu
    def get_cte_cpu(typ, cte):
        if typ == 'f16':
            return 'nsimd_f32_to_f16((f32){})'.format(cte)
        else:
            return '({}){}'.format(typ, cte)

    def cpu_load_code(param, typ, i):
        if param == 'l':
            return 'nsimd::scalar_lt(a{}[i], {})'.format(
                       i, get_cte_cpu(typ, 4))
        if param == 'v':
            return 'a{}[i]'.format(i)

    args = ', '.join([cpu_load_code(operator.params[i + 1], t, i)
                      for i in range(arity)])
    if op_name == 'to_mask':
        args = t + '(), ' + args
    if operator.params[0] == 'v':
        cpu_kernel = 'dst[i] = nsimd::scalar_{}({});'.format(op_name, args)
    else:
        cpu_kernel = '''if (nsimd::scalar_{op_name}({args})) {{
                          dst[i] = {one};
                        }} else {{
                          dst[i] = {zero};
                        }}'''.format(op_name=op_name, args=args,
                                     one=get_cte_cpu(t, 1),
                                     zero=get_cte_cpu(t, 0))

    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes
                                        else ', {}'.format(operator.ufp[t]))

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/spmd.hpp>
#include <nsimd/modules/memory_management.hpp>
#include <nsimd/scalar_utilities.h>
#include "../common.hpp"

#if defined(NSIMD_CUDA)

__global__ void kernel({typ} *dst, {k_args}, int n) {{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {{
    {cuda_rocm_kernel}
  }}
}}

void compute_result({typ} *dst, {k_args}, unsigned int n) {{
  kernel<<<{gpu_params}>>>(dst, {k_call_args}, int(n));
}}

{cbprng_cuda}

#elif defined(NSIMD_ROCM)

__global__ void kernel({typ} *dst, {k_args}, size_t n) {{
  size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  if (i < n) {{
    {cuda_rocm_kernel}
  }}
}}

void compute_result({typ} *dst, {k_args}, size_t n) {{
  hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {k_call_args}, n);
}}

{cbprng_hip}

#elif defined(NSIMD_ONEAPI)

inline void kernel({typ} *dst, {k_args}, const size_t n,
                   sycl::nd_item<1> item) {{
  const size_t i = item.get_global_id().get(0);
  if (i < n) {{
    {oneapi_kernel}
  }}
}}

void compute_result({typ} *dst, {k_args}, size_t n) {{
  size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
  sycl::queue q_ = nsimd::oneapi::default_queue();
  q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                    sycl::range<1>({tpb})),
                  [=](sycl::nd_item<1> item) {{
                    kernel(dst, {k_call_args}, n, item);
                  }}).wait_and_throw();
}}

{cbprng_oneapi}

#else

void compute_result({typ} *dst, {k_args}, unsigned int n) {{
  for (unsigned int i = 0; i < n; i++) {{
    {cpu_kernel}
  }}
}}

{cbprng_cpu}

#endif

// clang-format off

spmd_kernel_1d(kernel, {typ} *dst, {k_args})
  {k_code}
spmd_kernel_end

// clang-format on

#if defined(NSIMD_CUDA) || defined(NSIMD_ROCM) || defined(NSIMD_ONEAPI)
#define THREADS_PER_BLOCK 128
#else
#define THREADS_PER_BLOCK 1
#endif

int main() {{
  unsigned int n_[3] = {{ 10, 1001, 10001 }};
  for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
    unsigned int n = n_[i];
    int ret = 0;
    {fill_tabs}
    {typ} *ref = nsimd::device_calloc<{typ}>(n);
    {typ} *out = nsimd::device_calloc<{typ}>(n);
    spmd_launch_kernel_1d(kernel, {typnbits}, THREADS_PER_BLOCK, n, out,
                          {k_call_args});
    compute_result(ref, {k_call_args}, n);
    if ({comp}) {{
      ret = -1;
    }}
    nsimd::device_free(ref);
    nsimd::device_free(out);
    {free_tabs}
    if (ret != 0) {{
      return ret;
    }}
  }}
  return 0;
}}
'''.format(typ=t, free_tabs=free_tabs, fill_tabs=fill_tabs, k_code=k_code,
           k_call_args=k_call_args, k_args=k_args, cpu_kernel=cpu_kernel,
           comp=comp, cuda_rocm_kernel=cuda_rocm_kernel,
           oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)]),
           gpu_params=gpu_params, typnbits=t[1:], tpb=tpb))
    common.clang_format(opts, filename, cuda=True)
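# For orientation, the k_code string built above expands as follows for two
# representative operators (assuming 'add' has params ['v', 'v', 'v'] and
# 'lt' has params ['l', 'v', 'v']; shown for illustration only):
#
#     add: k_store(dst, k_add(k_load(a0), k_load(a1)));
#     lt:  k_if (k_lt(k_load(a0), k_load(a1)))
#            k_store(dst, 1);
#          k_else
#            k_store(dst, 0);
#          k_endif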
# Generates the tet1d-module test for an operator. This is the tet1d
# counterpart of the SPMD generator above (it belongs to the tet1d test
# generator): the expression-template code is checked against a plain
# scalar/GPU loop. tt only differs from t for cvt/reinterpret, where the
# test round-trips t -> tt -> t.
def gen_tests_for(opts, tt, t, operator):
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, '{}.{}.cpp'.format(
        op_name, t if t == tt else '{}_{}'.format(t, tt)))
    if not common.can_create_filename(opts, filename):
        return

    arity = len(operator.params[1:])
    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i)
                           for i in range(arity)])
    args_tabs_call = ', '.join(['tab{i}'.format(i=i) for i in range(arity)])
    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i)
                                  for i in range(arity)])
    args_in_tabs_call = ', '.join(['tet1d::in(tab{i}, n)'.format(i=i)
                                   for i in range(arity)])
    fill_tabs = '\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\n'
                           'random(tab{i}, n, {i});'.format(typ=t, i=i)
                           for i in range(arity)])
    free_tabs = '\n'.join(['nsimd::device_free(tab{i});'.format(i=i)
                           for i in range(arity)])

    # Constants and comparisons; the {f32_to_f16}/{f16_to_f32} placeholders
    # are filled later with the per-target conversion functions.
    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'
    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'
    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab0[i]) == 1.0f'
    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab1[i]) == 1.0f'

    if op_name == 'cvt':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>(
                   tet1d::in(tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt(
                   {tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name == 'reinterpret':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::reinterpret<{t}>(
                   tet1d::reinterpret<{tt}>(tet1d::in(
                       tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_reinterpret({t}(),
                   nsimd::{{p}}_reinterpret({tt}(),
                       tab0[i]));'''.format(t=t, tt=tt)
    elif op_name in ['to_mask', 'to_logical']:
        tet1d_code = \
            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in(
                   tab0, n)));'''
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_to_mask({t}(),
                   nsimd::{{p}}_to_logical(tab0[i]));'''.format(t=t)
    elif operator.params == ['v'] * len(operator.params):
        compute_result_kernel = \
            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'.format(
                op_name=op_name, args_tabs_i_call=args_tabs_i_call)
        if operator.cxx_operator != None:
            if len(operator.params[1:]) == 1:
                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \
                             format(cxx_op=operator.cxx_operator)
            else:
                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \
                             'tet1d::in(tab1, n);'. \
                             format(cxx_op=operator.cxx_operator)
        else:
            tet1d_code = \
                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \
                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)
    elif operator.params == ['l', 'v', 'v']:
        if operator.cxx_operator != None:
            cond = 'A {} B'.format(operator.cxx_operator)
        else:
            cond = 'tet1d::{}(A, B)'.format(op_name)
        tet1d_code = \
            '''TET1D_OUT({typ}) Z = tet1d::out(out);
               TET1D_IN({typ}) A = tet1d::in(tab0, n);
               TET1D_IN({typ}) B = tet1d::in(tab1, n);
               Z({cond}) = 1;'''.format(cond=cond, typ=t)
        compute_result_kernel = \
            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{
                 dst[i] = {one};
               }}}} else {{{{
                 dst[i] = {zero};
               }}}}'''.format(op_name=op_name, one=one, zero=zero)
    elif operator.params == ['l'] * len(operator.params):
        if len(operator.params[1:]) == 1:
            if operator.cxx_operator != None:
                cond = '{}(A == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1)
        if len(operator.params[1:]) == 2:
            if operator.cxx_operator != None:
                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   TET1D_IN({typ}) B = tet1d::in(tab1, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1},
                                              {comp_tab1_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1,
                                  comp_tab1_to_1=comp_tab1_to_1)
    else:
        raise Exception('Unsupported operator: "{}"'.format(op_name))

    cpu_kernel = compute_result_kernel.format(p='scalar',
                                              f32_to_f16='nsimd_f32_to_f16',
                                              f16_to_f32='nsimd_f16_to_f32')
    cuda_rocm_kernel = compute_result_kernel.format(p='gpu',
                                                    f32_to_f16='__float2half',
                                                    f16_to_f32='__half2float')
    oneapi_kernel = compute_result_kernel.format(p='gpu',
                                                 f32_to_f16='(f16)',
                                                 f16_to_f32='(f32)')

    comp = '!cmp(ref, out, n{})'.format('' if t in common.iutypes
                                        else ', {}'.format(operator.ufp[t]))

    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/tet1d.hpp>
#include <nsimd/modules/memory_management.hpp>
#include "../common.hpp"

#if defined(NSIMD_CUDA)

__global__ void kernel({typ} *dst, {args_tabs}, int n) {{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {{
    {cuda_rocm_kernel}
  }}
}}

void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{
  kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n));
}}

{cbprng_cuda}

#elif defined(NSIMD_ROCM)

__global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{
  size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
  if (i < n) {{
    {cuda_rocm_kernel}
  }}
}}

void compute_result({typ} *dst, {args_tabs}, size_t n) {{
  hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call}, n);
}}

{cbprng_hip}

#elif defined(NSIMD_ONEAPI)

inline void kernel({typ} *dst, {args_tabs}, const size_t n,
                   sycl::nd_item<1> item) {{
  size_t i = item.get_global_id().get(0);
  if (i < n) {{
    {oneapi_kernel}
  }}
}}

void compute_result({typ} *dst, {args_tabs}, const size_t n) {{
  size_t total_num_threads = (size_t)nsimd_kernel_param((int)n, {tpb});
  sycl::queue q_ = nsimd::oneapi::default_queue();
  q_.parallel_for(sycl::nd_range<1>(sycl::range<1>(total_num_threads),
                                    sycl::range<1>({tpb})),
                  [=](sycl::nd_item<1> item) {{
                    kernel(dst, {args_tabs_call}, n, item);
                  }}).wait_and_throw();
}}

{cbprng_oneapi}

#else

void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{
  for (unsigned int i = 0; i < n; i++) {{
    {cpu_kernel}
  }}
}}

{cbprng_cpu}

#endif

int main() {{
  unsigned int n_[3] = {{ 10, 1001, 10001 }};
  for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
    unsigned int n = n_[i];
    int ret = 0;
    {fill_tabs}
    {typ} *ref = nsimd::device_calloc<{typ}>(n);
    {typ} *out = nsimd::device_calloc<{typ}>(n);
    compute_result(ref, {args_tabs_call}, n);
    {tet1d_code}
    if ({comp}) {{
      ret = -1;
    }}
    nsimd::device_free(ref);
    nsimd::device_free(out);
    {free_tabs}
    if (ret != 0) {{
      return ret;
    }}
  }}
  return 0;
}}
'''.format(typ=t, args_tabs=args_tabs, fill_tabs=fill_tabs,
           args_tabs_call=args_tabs_call, gpu_params=gpu_params,
           free_tabs=free_tabs, tet1d_code=tet1d_code, comp=comp,
           cpu_kernel=cpu_kernel, tpb=tpb,
           cuda_rocm_kernel=cuda_rocm_kernel, oneapi_kernel=oneapi_kernel,
           cbprng_cpu=nsimd_tests.cbprng(t, operator, 'cpu'),
           cbprng_cuda=nsimd_tests.cbprng(t, operator, 'cuda', gpu_params),
           cbprng_hip=nsimd_tests.cbprng(t, operator, 'hip', gpu_params),
           cbprng_oneapi=nsimd_tests.cbprng(t, operator, 'oneapi',
                                            ['(int)n', str(tpb)])))
    common.clang_format(opts, filename, cuda=True)
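# Sketch of how these generators are typically driven. In nsimd each module
# exposes a doit(opts)-style entry point; the 'operators' registry and the
# per-operator 'types' attribute used below are assumptions for
# illustration, not the actual API:
#
#     def doit(opts):
#         for operator in operators.operators.values():
#             for t in operator.types:
#                 if operator.name in ['cvt', 'reinterpret']:
#                     for tt in operator.types:
#                         gen_tests_for_cvt_reinterpret(opts, tt, t, operator)
#                 else:
#                     gen_tests_for(opts, t, operator)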