def doit(opts):
    """Drive the full documentation generation pipeline."""
    # Markdown generation steps.
    gen_overview(opts)
    gen_doc(opts)
    gen_modules_md(opts)
    gen_what_is_wrapped(opts)
    # Mirror the top-level GitHub markdown files into the doc tree.
    root_dir = os.path.join(opts.script_dir, '..')
    for github_md, doc_name in (('README.md', 'index'),
                                ('CONTRIBUTING.md', 'contribute')):
        copy_github_file_to_doc(opts, os.path.join(root_dir, github_md),
                                common.get_markdown_file(opts, doc_name))
    # HTML rendering consumes the markdown generated above,
    # so it must come last.
    gen_html(opts)
def gen_modules_md(opts):
    """Generate modules.md listing every available NSIMD module.

    Each module is rendered as a bullet linking to its overview page,
    followed by its (indented) description.
    """
    common.myprint(opts, 'Generating modules.md')
    mods = common.get_modules(opts)
    ndms = []
    for mod in mods:
        # getattr replaces the previous eval() on dynamically built
        # attribute strings: safer and clearer for the same lookup
        # mods[mod].<mod>.hatch.
        hatch = getattr(mods[mod], mod).hatch
        ndms.append([hatch.name(), hatch.desc(), mod])
    filename = common.get_markdown_file(opts, 'modules')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Modules

NSIMD comes with several additional modules. A module provides a set of
functionalities that are usually not at the same level as SIMD intrinsics
and/or that do not provide all C and C++ APIs. These functionalities are
given with the library because they make heavy use of NSIMD core which
abstract SIMD intrinsics. Below is the exhaustive list of modules.

''')
        for ndm in ndms:
            # Trailing two spaces force a markdown hard line break.
            fout.write('- [{}](module_{}_overview.md)  \n'.format(
                       ndm[0], ndm[2]))
            fout.write('\n'.join(['  {}'.format(line.strip()) \
                                  for line in ndm[1].split('\n')]))
            fout.write('\n\n')
def gen_doc_api(opts):
    """Generate the markdown API reference page of the SPMD module.

    Only operators having a scalar implementation are documented; they
    are grouped by category.
    """
    filename = common.get_markdown_file(opts, 'api', 'spmd')
    if not common.can_create_filename(opts, filename):
        return
    # Build tree for api.md: category -> list of operators.
    api = dict()
    for _, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for c in operator.categories:
            api.setdefault(c, []).append(operator)
    with common.open_utf8(opts, filename) as fout:
        fout.write(
'''# NSIMD SPMD API reference

This page contains the exhaustive API of the SPMD module. Note that most
operators names follow the simple naming `k_[NSIMD name]` and have the
same semantics. This page is light, you may use CTRL+F to find the
operator you are looking for.

For genericity on the base type you should use operator names instead of
infix operators, e.g. `k_add` instead of `+`. Indeed for `f16`'s NVIDIA
CUDA and NSIMD do not provide overloads and therefore code using `+` will
fail to compile.

Note that all operators accept literals and scalars. For example you may
write `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when
using infix operators. But note that literals or scalars must have the
same type as the other operands.

''')
        for c, ops in api.items():
            if not ops:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}`  \n'.format(get_signature(op)))
                # `is not None` instead of `!= None` (PEP 8).
                if op.cxx_operator is not None:
                    fout.write('  Infix operator: `{}` ' \
                               '(*for certain types only*)  \n'.\
                               format(op.cxx_operator))
                fout.write('  {}\n\n'.format(op.desc))
def gen_api(opts):
    """Generate the markdown API index page of the fixed_point module.

    Operators are grouped by fixed-point category; each entry links to
    the operator's own page.
    """
    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    # Consistency with the other generators of this file: honor the
    # "can this file be (re)created" filter before writing.
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for cat in fp_categories:
            ops = [op for op in fp_operators if cat in op.categories]
            if not ops:
                continue
            fout.write('\n## {}\n\n'.format(cat))
            for op in ops:
                fout.write(
                    '- [{} ({})](module_fixed_point_api_{}.md)\n'\
                    .format(op.full_name, op.name,
                            common.to_filename(op.name)))
def gen_api(opts, op_list):
    """Generate the fixed_point API page restricted to ``op_list``.

    Operators whose name appears in op_list are grouped by category and
    rendered as links to their individual pages.
    """
    # Group the selected operators by category.
    api = dict()
    for _, operator in operators.operators.items():
        if operator.name not in op_list:
            continue
        for c in operator.categories:
            api.setdefault(c, []).append(operator)
    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    # Consistency with the sibling generators: honor the creation filter.
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for c, ops in api.items():
            if not ops:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- [{} ({})](module_fixed_point_api_{}.md)\n'. \
                           format(op.full_name, op.name,
                                  common.to_filename(op.name)))
def gen_doc(opts):
    """Generate the markdown overview/API page of the random module."""
    # Build the API bullet list: one entry per
    # (word size, words count, rounds count) combination.
    chunks = []
    for func in rand_functions:
        for wsize, nwords_nrounds in func.wordsize_nwords_nrounds.items():
            for nwords, all_nrounds in nwords_nrounds.items():
                for nrounds in all_nrounds:
                    chunks.append('- `' + func.gen_signature(
                        nwords, wsize, nrounds) + '`;  \n')
                    chunks.append('  Returns a random number using the '
                                  '{func_name} generator\n\n'. \
                                  format(func_name=func.name))
    api = ''.join(chunks)
    res = '''
# NSIMD Random module overview

{desc}

Two different algorithms are proposed : threefry and philox. Both should
give high quality random number. Threefry is quicker on CPU, while philox
is best used on GPU. Both algorithms are counter based pseudorandom number
generator, meaning that they need two parameters:
- a key, each key will generate an unique sequence,
- a counter, which will give the different numbers in the sequence.

# NSIMD Random API reference

{api}
'''.format(desc=desc(), api=api)
    filename = common.get_markdown_file(opts, 'overview', 'random')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write(res)
def doit(opts):
    """Generate the markdown documentation of the memory_management module.

    Writes a single overview page describing the device malloc/calloc/
    free/memcpy helpers and the paired-pointers API. Typos of the emitted
    text are fixed ("managmenent", "to from", "ponit the a", ...).
    """
    common.myprint(opts, 'Generating module memory_management')
    if not opts.doc:
        return
    filename = common.get_markdown_file(opts, 'overview',
                                        'memory_management')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        # {br} expands to two trailing spaces: a markdown hard line break.
        fout.write('''# Overview

This module provides C-style memory management functions. Its purpose is
not to become a fully featured container library. It is to provide
portable malloc, memcpy and free functions with little helpers to copy
data from and to the devices.

# API reference

## Equivalents of malloc, calloc, memcpy and free for devices

Note that the below functions simply wraps the corresponding C functions
when targeting a CPU.

- `template <typename T> T *device_malloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device. On error NULL
  is returned.

- `template <typename T> T *device_calloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device and set the
  allocated memory to zero. On error NULL is returned.

- `template <typename T> void device_free(T *ptr)`{br}
  Free the memory pointed to by the given pointer.

- `template <typename T> void copy_to_device(T *device_ptr, T *host_ptr,
  size_t sz)`{br}
  Copy data from host to device.

- `template <typename T> void copy_to_host(T *host_ptr, T *device_ptr,
  size_t sz)`{br}
  Copy data from device to host.

- `#define nsimd_fill_dev_mem_func(func_name, expr)`{br}
  Create a device function that will fill data with `expr`. To call the
  created function one simply does `func_name(ptr, sz)`. The `expr`
  argument represents some simple C++ expression that can depend only on
  `i` the i-th element in the vector as shown in the example below.

```c++
nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768)

int main() {{
  prng(ptr, 1000);
  return 0;
}}
```

## Pairs of pointers

It is often useful to allocate a pair of data buffers: one on the host and
one on the devices to perform data transfers. The below functions provides
quick ways to malloc, calloc, free and memcpy pointers on host and devices
at once. Note that when targeting CPUs the pair of pointers is reduced to
one pointer that point to a single data buffer in which case memcpy's are
not performed. Note also that there is no implicit synchronization of data
between both data buffers. It is up to the programmer to trigger
memcpy's.

```c++
template <typename T> struct paired_pointers_t {{
  T *device_ptr, *host_ptr;
  size_t sz;
}};
```

Members of the above structure are not to be modified but can be passed as
arguments for reading/writing data from/to memory they point to.

- `template <typename T> paired_pointers_t<T> pair_malloc(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs both pointers are NULL.

- `template <typename T> paired_pointers_t<T>
  pair_malloc_or_exit(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs, prints an error message on stderr and exit(3).

- `template <typename T> paired_pointers_t<T> pair_calloc(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  Write both data buffers with zeros. If an error occurs both pointers are
  NULL.

- `template <typename T> paired_pointers_t<T>
  pair_calloc_or_exit(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  Write both data buffers with zeros. If an error occurs, prints an error
  message on stderr and exit(3).

- `template <typename T> void pair_free(paired_pointers_t<T> p)`{br}
  Free data buffers on the host and the device.

- `template <typename T> void copy_to_device(paired_pointers_t<T> p)`{br}
  Copy data from the host buffer to its corresponding device buffer.

- `template <typename T> void copy_to_host(paired_pointers_t<T> p)`{br}
  Copy data from the device buffer to its corresponding host buffer.
'''.format(br='  '))
def gen_overview(opts):
    """Generate the top-level overview.md of the NSIMD documentation.

    Documents scalar types, generic SIMD vector types, the C/C++ base
    APIs, the C11 advanced API and the C++ advanced API. Several typos of
    the emitted text are fixed ("floatting", "optionnally", "funtions",
    "operators are follow", and the C11 `nsimd::packl_*` spelling that
    should have been `nsimd_packl_*`).
    """
    filename = common.get_markdown_file(opts, 'overview')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## NSIMD scalar types

Their names follow the following pattern: `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integer or `f` for
  floating point number.
- `xx` is the number of bits taken to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''
## NSIMD generic SIMD vector types

In NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We
call SIMD extension a set of low-level functions and types provided by
hardware vendors to access SIMD units. Examples include SSE2, SSE42,
AVX, ... When compiling the generic SIMD vector types represents a SIMD
register of the target. Examples are a `__m128` for Intel SSE, `__m512`
for Intel AVX-512 or `svfloat32_t` for Arm SVE.

Their names follow the following pattern:

- C base API: `vSCALAR` where `SCALAR` is one of the scalar types listed
  above.
- C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is one of the scalar
  types listed above.
- C++ advanced API: `nsimd::pack<SCALAR>` where `SCALAR` is one of the
  scalar types listed above.

Full list of SIMD vector types:

| Base type | C base API | C advanced API | C++ advanced API |
|-----------|------------|----------------|------------------|
''')
        fout.write('\n'.join([
            '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \
            format(typ=typ) for typ in common.types]))
        fout.write('''

## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not*
need to include a header file for having a function.

Here is a list of supported platforms and their corresponding SIMD
extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each simd extension has its own set of SIMD types and functions. Types
follow the pattern: `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

There are also logical types associated to each SIMD vector type. These
types are used, for example, to represent the result of a comparison of
SIMD vectors. They are usually bit masks. Their name follow the pattern:
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

Note 1: Platform `cpu` is a 128 bits SIMD emulation fallback when no SIMD
extension has been specified or is supported on a given compilation
target.

Note 2: as all SIMD extensions of all platforms are different there is no
need to put the name of the platform in each identifier.

Function names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `FUNCNAME` is the name of a function e.g. `add` or `sub`.
- `SCALAR` is one of scalar types listed above.

### Generic identifier

In the base C API, genericity is achieved using macros.

- `vec(SCALAR)` is a type to represent a SIMD vector containing SCALAR
  elements. SCALAR must be one of scalar types listed above.
- `vecl(SCALAR)` is a type to represent a SIMD vector of logicals for
  SCALAR elements. SCALAR must be one of scalar types listed above.
- `vec_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing
  SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of
  scalar types listed above and SIMDEXT must be a valid SIMD extension.
- `vecl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of
  logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR
  must be one of scalar types listed above and SIMDEXT must be a valid
  SIMD extension.
- `vFUNCNAME` takes as input the above types to access the operator
  FUNCNAME e.g. `vadd`, `vsub`.

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type
  for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of
  logicals type for platform SIMDEXT containing SCALAR elements. SIMDEXT
  is one of SIMD extensions listed above, SCALAR is one of scalar type
  listed above.

In C++11 and beyond, type traits are still available but typedefs are also
provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

The C++20 API does not bring different types for SIMD registers nor other
way to access the other SIMD types. It only brings concepts instead of
usual `typename`s. For more information cf. <concepts.md>.

Note that all macro and functions available in plain C are still available
in C++.

### List of operators provided by the base APIs

In the documentation we use interchangeably the terms "function" and
"operator". For each operator FUNCNAME a C function (also available in
C++) named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR
type unless specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a`
is available and takes as its two last argument a SCALAR type and a
SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME`
is available. It takes as its last argument the SCALAR type and can
optionally take the SIMDEXT as its last last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are
the possibilities:

```c++
c = nsimd_add_avx_f32(a, b); // use AVX
c = nsimd::add(a, b, f32()); // use detected SIMDEXT
c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
c = vadd(a, b, f32); // use detected SIMDEXT
c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX
```

Here is a list of available FUNCNAME.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            func = operator.name
            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \
                              ' a' + str(count) for count, p in \
                              enumerate(operator.params[1:])])
            fout.write('- `{} {}({});`  \n'.format(return_typ, func, args))
            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))
        fout.write('''
## C advanced API (only available in C11)

The C advanced API takes advantage of the C11 `_Generic` keyword to
provide function overloading. Unlike the base API described above there
is no need to pass as arguments the base type of the SIMD extension. The
informations are contained in the types provided by this API.

- `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vectors containing SCALAR
  elements of SIMD extension SIMDEXT.
- `nsimd_packl_SCALAR_SIMDEXT` represents a SIMD vectors of logicals for
  SCALAR elements of SIMD extension SIMDEXT.

There are versions of the above type without SIMDEXT for which the
targeted SIMD extension is automatically chosen.

- `nsimd_pack_SCALAR` represents a SIMD vectors containing SCALAR
  elements.
- `nsimd_packl_SCALAR` represents a SIMD vectors of logicals for SCALAR
  elements.

Generic types are also available:

- `nsimd_pack(SCALAR)` is a type to represent a SIMD vector containing
  SCALAR elements. SCALAR must be one of scalar types listed above.
- `nsimd_packl(SCALAR)` is a type to represent a SIMD vector of logicals
  for SCALAR elements. SCALAR must be one of scalar types listed above.
- `nsimd_pack_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector
  containing SCALAR elements for the simd extension SIMDEXT. SCALAR must
  be one of scalar types listed above and SIMDEXT must be a valid SIMD
  extension.
- `nsimd_packl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector
  of logicals for SCALAR elements for the simd extension SIMDEXT. SCALAR
  must be one of scalar types listed above and SIMDEXT must be a valid
  SIMD extension.

Finally, operators follow the naming: `nsimd_FUNCNAME` e.g. `nsimd_add`,
`nsimd_sub`.

## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or
above but because it makes use of the particular implementation of ARM
SVE by ARM in their compiler. We do not know if GCC (and possibly MSVC in
the distant future) will use the same approach. Anyway the current
implementation allows us to put SVE SIMD vectors inside some kind of
structs that behave like standard structs. If you want to be sure to
write portable code do *not* use this API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors
  containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the
  third defaults to the detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of
  logical type containing SCALAR elements of SIMD extension SIMDEXT. You
  can specify only the first template argument. The second defaults to 1
  while the third defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is
particularly useful on ARM.

Functions that takes packs do not take any other argument unless
specified otherwise e.g. the load family of functions. It is impossible
to determine the kind of pack (unroll and SIMDEXT) from the type of a
pointer. Therefore in this case, the last argument must be a pack and
this same type will then return. Also some functions are available as C++
operators. They follow the naming: `nsimd::FUNCNAME`.
''')
def gen_doc(opts):
    """Generate api.md plus one markdown page per operator.

    api.md lists links to the general-topic pages and to every operator
    grouped by category; each operator page then details its generic and
    architecture-specific signatures.
    """
    common.myprint(opts, 'Generating doc for each function')

    # Build tree for api.md: category -> list of operators.
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            api.setdefault(c, []).append(operator)

    # api.md
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# General API\n\n')
            fout.write('- [Memory function](memory.md)\n')
            fout.write('- [Float16 related functions](fp16.md)\n')
            fout.write('- [Defines provided by NSIMD](defines.md)\n')
            fout.write('- [NSIMD pack and related functions](pack.md)\n\n')
            fout.write('- [NSIMD C++20 concepts](concepts.md)\n\n')
            fout.write('# SIMD operators\n')
            for c, ops in api.items():
                if not ops:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    Full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        Full_name, op.name, common.to_filename(op.name)))

    # Helper to normalize one signature or an iterable of signatures into
    # printable one-per-line strings.
    def to_string(var):
        sigs = [var] if isinstance(var, str) or \
               not hasattr(var, '__iter__') else list(var)
        return '\n'.join([re.sub(r'[ \n\t\r]+', ' ', sig) for sig in sigs])

    # Operators (one file per operator).
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc.
        if opts.match and not opts.match.match(op_name):
            continue
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        Full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(Full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('\n\n## C advanced API (generic, requires C11)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_adv')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_adv'). \
                                 values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'c_base',
                                                      simd_ext))
                    fout.write(';\n')
                fout.write('```')
            fout.write('\n\n## C++ base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'cxx_base',
                                                      simd_ext))
                    fout.write(';\n')
                fout.write('```')
def gen_overview(opts):
    """Generate overview.md for the base (non-generic) documentation.

    Documents scalar and SIMD vector types, the C/C++ base APIs and the
    C++ advanced API, then lists every operator with its domain and type
    restrictions. Typos of the emitted text are fixed ("Their names
    follows", "floatting", "SIDM", "optionnally", "funtions",
    "particularily").
    """
    filename = common.get_markdown_file(opts, 'overview')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## NSIMD scalar types

Their names follow the following pattern: `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integer and `f` for
  floating point number.
- `xx` is the number of bits taken to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''
## NSIMD SIMD vector types

Their names follow the following pattern: `vSCALAR` where `SCALAR` is one
of the scalar types listed above. For example `vi8` means a SIMD vector
containing `i8`'s.

Full list of SIMD vector types:

''')
        for t in common.types:
            fout.write('- `v{}`\n'.format(t))
        fout.write('''
## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not*
need to include a header file for having a function.

In NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We
call SIMD extension a set of low-level functions and types provided to
access a given SIMD extension. Examples include SSE2, SSE42, AVX, ...

Here is a list of supported platforms and their corresponding SIMD
extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each simd extension has its own set of SIMD types and functions. Types
follow the following pattern: `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

There are also logical types associated to each SIMD vector type. These
types are used to represent the result of a comparison of SIMD vectors.
They are usually bit masks. Their name follow the following pattern:
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

Note 1: Platform `cpu` is scalar fallback when no SIMD extension has been
specified.

Note 2: as all SIMD extensions of all platforms are different there is no
need to put the name of the platform in each identifier.

Function names follow the following pattern:
`nsimd_SIMDEXT_FUNCNAME_SCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `FUNCNAME` is the name of a function e.g. `add` or `sub`.
- `SCALAR` is one of scalar types listed above.

### Generic identifier

In C, genericity is achieved using macros.

- `vec(SCALAR)` represents the SIMD vector type containing SCALAR
  elements. SCALAR must be one of scalar types listed above.
- `vecl(SCALAR)` represents the SIMD vector of logicals type containing
  SCALAR elements. SCALAR must be one of scalar types listed above.
- `vec_e(SCALAR)` represents the SIMD vector type containing SCALAR
  elements. SCALAR must be one of scalar types listed above.
- `vecl_e(SCALAR)` represents the SIMD vector of logicals type containing
  SCALAR elements. SCALAR must be one of scalar types listed above.
- `vFUNCNAME` is the macro name to access the function FUNCNAME e.g.
  `vadd`, `vsub`.
- `vFUNCNAME_e` is the macro name to access the function FUNCNAME e.g.
  `vadd_e`, `vsub_e`.

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type
  for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of
  logicals type for platform SIMDEXT containing SCALAR elements. SIMDEXT
  is one of SIMD extensions listed above, SCALAR is one of scalar type
  listed above.

In C++11 and beyond, type traits are still available but typedefs are
also provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

Note that all macro and functions available in plain C are still
available in C++.

### List of functions available for manipulation of SIMD vectors

For each FUNCNAME a C function (also available in C++) named
`nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless
specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a`
is available and takes as its two last argument a SCALAR type and a
SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME`
is available. It takes as its last argument the SCALAR type and can
optionally take the SIMDEXT as its last last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are
the possibilities:

    c = nsimd_add_avx_f32(a, b); // use AVX
    c = nsimd::add(a, b, f32()); // use detected SIMDEXT
    c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
    c = vadd(a, b, f32); // use detected SIMDEXT
    c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX

Here is a list of available FUNCNAME.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            func = operator.name
            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \
                              ' a' + str(count) for count, p in \
                              enumerate(operator.params[1:])])
            fout.write('- `{} {}({});`\n'.format(return_typ, func, args))
            if operator.domain and len(operator.params[1:]) > 0:
                params = operator.params[1:]
                if len(params) == 1:
                    fout.write('  a0 ∈ {}\n'.format(operator.domain))
                else:
                    param = ', '.join(['a' + str(count) for count in \
                                       range(len(params))])
                    fout.write('  ({}) ∈ {}\n'.format(param,
                                                      operator.domain))
            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))
        fout.write('''
## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or
above but because it makes use of the particular implementation of ARM
SVE by ARM in their compiler. We do not know if GCC (and possibly MSVC in
the distant future) will use the same approach. Anyway the current
implementation allows us to put SVE SIMD vectors inside some kind of
structs that behave like standard structs. If you want to be sure to
write portable code do *not* use this API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors
  containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the
  third defaults to the detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of
  logical type containing SCALAR elements of SIMD extension SIMDEXT. You
  can specify only the first template argument. The second defaults to 1
  while the third defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is
particularly useful on ARM.

Functions that takes packs do not take any other argument unless
specified otherwise e.g. the load family of functions. It is impossible
to determine the kind of pack (unroll and SIMDEXT) from the type of a
pointer. Therefore in this case, the last argument must be a pack and
this same type will then return. Also some functions are available as C++
operators.

Here is the list of functions that act on packs.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_pack(operator.params[0], 1,
                                                  'N')
            func = operator.name
            args = ', '.join([common.get_one_type_pack(p, 0, 'N') + ' a' + \
                              str(count) for count, p in \
                              enumerate(operator.params[1:])])
            if 'v' not in operator.params[1:] and \
               'l' not in operator.params[1:]:
                args = args + ', pack<T, N, SimdExt> const&' if args != '' \
                       else 'pack<T, N, SimdExt> const&'
            fout.write('- `{} {}({});`\n'.format(return_typ, func, args))
            if operator.domain and len(operator.params[1:]) > 0:
                params = operator.params[1:]
                if len(params) == 1:
                    fout.write('  a0 ∈ {}\n'.format(operator.domain))
                else:
                    param = ', '.join(['a' + str(count) for count in \
                                       range(len(params))])
                    fout.write('  ({}) ∈ {}\n'.format(param,
                                                      operator.domain))
            if operator.cxx_operator:
                fout.write('  Available as `{}`\n'.format(
                    operator.cxx_operator))
            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))
def gen_doc(opts):
    """Generate the markdown documentation of the core NSIMD API.

    Writes two kinds of files:
    - ``api.md``: an index of all operators grouped by category, with a
      link to each operator's own page;
    - one ``api_<operator>.md`` page per operator, containing its
      description and its C/C++ signatures, both generic and per SIMD
      extension.

    Respects ``opts.match`` (only matching operators get a page) and the
    overwrite policy enforced by ``common.can_create_filename``.
    """
    sys.stdout.write('-- Generating doc for each function\n')

    # Build tree for api.md: category -> list of operators.
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    # api.md (index page)
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# API\n')
            for c, ops in api.items():
                if len(ops) == 0:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    # Capitalize only the first letter of the full name.
                    Full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        Full_name, op.name, common.to_filename(op.name)))

    # Helper turning one signature (or an iterable of signatures) into a
    # single string, collapsing whitespace runs so that each signature
    # occupies exactly one line in the emitted code block.
    def to_string(var):
        sigs = [var] if isinstance(var, str) or not hasattr(var, '__iter__') \
               else list(var)
        for i in range(0, len(sigs)):
            sigs[i] = re.sub('[ \n\t\r]+', ' ', sigs[i])
        return '\n'.join(sigs)

    # Operators (one file per operator)
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        Full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(Full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            # cxx_adv signatures come as a dict; only its values are
            # signatures.
            fout.write(to_string(operator.get_generic_signature('cxx_adv'). \
                                 values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'c_base',
                                                      simd_ext))
                    fout.write(';\n')
                fout.write('```')
            fout.write('\n\n## C++ base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                # BUG FIX: this section lists C++ signatures, so the fenced
                # code block must be tagged c++ (was '```c\n'), matching the
                # generic C++ sections above.
                fout.write('```c++\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'cxx_base',
                                                      simd_ext))
                    fout.write(';\n')
                fout.write('```')
def gen_overview(opts):
    """Generate the overview markdown page of the fixed_point module.

    The page content is static; it documents the `nsimd::fixed_point`
    C++ API (types, storage rules, pack redefinition) and shows a build
    and run example.
    """
    filename = common.get_markdown_file(opts, 'overview', 'fixed_point')
    # CONSISTENCY FIX: every other generator honors the overwrite policy
    # via can_create_filename before writing; this one did not.
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''
# NSIMD fixed point module

## Description

This module implements a fixed-point numbers support for the `nsimd` library.
Fixed-point numbers are integer types used to represent decimal numbers. A
number `lf` of bits are used to encode its integer part, and `rt` bits are
used to encode its fractional part.

The fixed_point module uses the templated type `nsimd::fixed_point::fp_t<lf,
rt>` to represent a fixed_point number. All the basic floating-point
arithmetic operaors have been defined, therefore fp_t elements can be
manipulated as normal numbers. The fixed_point module will use a `i8`,
`i16`, or `i32` integer type for storage, depending on the value of
`lf + 2 * rt`.

All the functions of the module are under the namespace
`nsimd::fixed_point`, and match the same interface than `nsimd` C++ .

The `fp_t` struct type is defined in `fixed.hpp`, and the associated simd
`fpsimd_t` struct type are defined in `simd.hpp`.

The modules redefines the `nsimd` pack type for fixed-point numbers,
templated with `lf` and `rt` :

```C++
namespace nsimd {
namespace fixed_point {
template <u8 lf, u8 rt> struct pack;
} // namespace fixed_point
} // namespace nsimd
```

Then, the pack can be manipulated as an `nsimd` pack like other scalar
types.

## Compatibility

The fixed point module is a C++ only API, compatible with the C++98
standard. It has the same compilers and hardware support than the main
`nsimd` API (see the [API index](index.md)).

## Example

Here is a minimal example([main.cpp](../../examples/module_fixed_point.cpp)):

@[INCLUDE_CODE:L21:L61](../../examples/module_fixed_point.cpp)

To test with avx2 run :
```bash
export NSIMD_ROOT=<path/to/nsimd>
g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp
./main
```

The console output will look like this :
```console
$>./main
1.35938 | -0.421875 | 0.9375
1.13281 | 1.19531 | 2.32812
1.64844 | -1.21094 | 0.4375
-0.660156 | 1.07422 | 0.414062
-0.890625 | 0.214844 | -0.675781
-0.0898438 | 0.515625 | 0.425781
-0.539062 | 0.0546875 | -0.484375
1.80859 | 1.66406 | 3.47266
```
''')
def gen_doc_overview(opts):
    """Generate the overview markdown page of the tet1d module.

    The page content is static prose describing expression templates and
    the tet1d API. The literal below is run through ``str.format`` (see the
    trailing ``.format(nl=...)``), hence the doubled braces ``{{``/``}}``
    in the embedded C++ snippets and the ``{nl}`` placeholders used to emit
    markdown hard line breaks.
    """
    filename = common.get_markdown_file(opts, 'overview', 'tet1d')
    # Honor the overwrite policy: bail out if the file must not be written.
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## What are expression templates?

Expression templates are a C++ template metaprogramming technique that
essentially allows high level programming for loop fusion. Take the
following exemple.

```c++
std::vector<float> operator+(std::vector<float> const &a,
                             std::vector<float> const &b) {{
  std::vector<float> ret(a.size());
  for (size_t i = 0; i < a.size(); i++) {{
    ret[i] = a[i] + b[i];
  }}
  return ret;
}}

int main() {{
  std::vector<float> a, b, c, d, sum;
  ...
  sum = a + b + c + d;
  ...
  return 0;
}}
```

The expression `a + b + c + d` involves three calls to `operator+` and at
least nine memory passes are necessary. This can be optimized as follows.

```c++
int main() {{
  std::vector<float> a, b, c, d, sum;
  ...
  for (size_t i = 0; i < a.size(); i++) {{
    ret[i] = a[i] + b[i] + c[i] + d[i];
  }}
  ...
  return 0;
}}
```

The rewriting above requires only four memory passes which is of course
better but as humans we prefer the writing `a + b + c + d`. Expression
templates solves exactly this problem and allows the programmer to write
`a + b + c + d` and the compiler to see the loop written above.

## Expressions templates with NSIMD

This module provides expression templates on top of NSIMD core. As a
consequence the loops seen by the compiler deduced from the high-level
expressions are optimized using SIMD instructions. Note also that NVIDIA and
AMD GPUs are supported through CUDA and ROCm/HIP.

The API for expression templates in NSIMD is C++98 compatible and is able
to work with any container as its only requirement for data is that it must
be contiguous. All inputs to an expression must be declared using
`tet1d::in` while the output must be declared using `tet1d::out`.

```c++
int main() {{
  std::vector<float> a, b, c;
  ...
  tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size());
  ...
  return 0;
}}
```

- `template <typename T, typename I> inline node in(const T *data, I sz);`{nl}
  Construct an input for expression templates starting at address `data`
  and containing `sz` elements. The return type of this functin `node` can
  be used with the help of the `TET1D_IN(T)` macro where `T` if the
  underlying type of data (ints, floats, doubles...).

- `template <typename T> node out(T *data);`{nl}
  Construct an output for expression templates starting at address `data`.
  Note that memory must be allocated by the user before passing it to the
  expression template engine. The output type can be used with the
  `TET1D_OUT(T)` where `T` is the underlying type (ints, floats,
  doubles...). Note that it is possible to pass parameters to the
  expression template engine to specify the number of threads per block for
  GPUs or the SIMD extension to use...

- `template <typename T, typename Pack> node out(T *data, int
  threads_per_block, void *stream);`{nl}
  Construct an output for expression templates starting at address `data`.
  Note that memory must be allocated by the user before passing it to the
  expression template engine. The `Pack` parameter is useful when compiling
  for CPUs. The type is `nsimd::pack<...>` allowing the developper to
  specify all details about the NSIMD packs that will be used by the
  expression template engine. The `threads_per_block` and `stream`
  arguments are used only when compiling for GPUs. Their meaning is
  contained in their names. The output type can be used with the
  `TET1D_OUT_EX(T, N, SimdExt)` where `T` is the underlying type (ints,
  floats, doubles...), `N` is the unroll factor and `SimdExt` the SIMD
  extension.

Moreover a MATLAB-like syntax is provided. One can select a subrange of
given input. Indexes are understood as for Python: -1 represents the last
element. The contant `tet1d::end = -1` allows one to write portable code.

```c++
int main() {{
  std::vector<float> a, b, c;
  ...
  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  tet1d::out(c) = va(10, tet1d::end - 10) + vb;
  ...
  return 0;
}}
```

One can also specify which elements of the output must be rewritten with
the following syntax.

```c++
int main() {{
  std::vector<float> a, b, c;
  ...
  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  TET1D_OUT(float) vc = tet1d::out(&c[0]);
  vc(va >= 10 && va < 20) = vb;
  ...
  return 0;
}}
```

In the exemple above, element `i` in `vc` is written only if `va[i] >= 10`
and `va[i] < 20`. The expression appearing in the parenthesis can contain
arbitrary expression templates as soon as the underlying type is `bool`.

## Warning using `auto`

Using auto can lead to surprising results. We advice you never to use auto
when dealing with expression templates. Indeed using `auto` will make the
variable an obscure type representing the computation tree of the
expression template. This implies that you won't be able to get data from
this variable i.e. get the `.data` member for exemple. Again this variable
or its type cannot be used in template arguments where you need it.
'''.format(nl='  '))
def gen_doc_api(opts):
    """Generate the API reference markdown page of the tet1d module.

    Lists, per category, every operator that has a scalar implementation,
    with a pseudo-signature built by the local ``get_signature`` helper,
    its infix operator (if any) and its description.
    """
    filename = common.get_markdown_file(opts, 'api', 'tet1d')
    # Honor the overwrite policy: bail out if the file must not be written.
    if not common.can_create_filename(opts, filename):
        return

    # Build tree for api.md: category -> list of operators. Operators
    # without a scalar implementation are not part of the tet1d API.
    api = dict()
    for _, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    # Build the pseudo C++ signature of an operator using the ExprNumber /
    # ExprBool / ToType pseudo types documented in the page preamble.
    def get_signature(op):
        # Map a parameter kind to its pseudo type ('p' = plain int,
        # 'v' = vector/number expression, 'l' = logical expression).
        # NOTE(review): other kinds fall through and return None — assumed
        # not to occur for operators with a scalar implementation; confirm.
        def get_type(typ):
            if typ == 'p':
                return 'int'
            elif typ == 'v':
                return 'ExprNumber'
            elif typ == 'l':
                return 'ExprBool'
        ret = get_type(op.params[0]) + ' ' + op.name + '('
        # Non-closed operators (output type differs from input types) take
        # an extra leading ToType parameter.
        if is_not_closed(op):
            ret += 'ToType' + (', ' if len(op.params[1:]) > 0 else '')
        # Two-stage formatting: the first .format(i=i) turns the doubled
        # braces into '{t} {inI}', the second substitutes the actual type
        # and the conventional argument name (common.in0..in3).
        ret += ', '.join(['{{t}} {{in{i}}}'.format(i=i). \
                          format(t=get_type(op.params[i + 1]),
                                 in0=common.in0, in1=common.in1,
                                 in2=common.in2, in3=common.in3) \
                          for i in range(len(op.params[1:]))])
        ret += ');'
        return ret

    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD TET1D API reference

This page contains the exhaustive API of the TET1D module. Note that most
operators names follow their NSIMD counterparts and have the same
semantics. This page is light, you may use CTRL+F to find the operator you
are looking for.

Note that all operators accept literals and scalars. For example you may
write `tet1d::add(a, 1)`. This also applies when using infix operators.
Note that literals or scalars of different types can be used with
expression involving other types. In all signature below the following
pseudo types are used for simplification:

- `ExprNumber` to designate an existing expression template on signed,
  unsigned integers of floatting point types or a scalar of signed,
  unsigned integers or floatting point types.
- `ExprBool` to designate an existing expression template over booleans or
  a boolean.
- `ToType` to designate a base type (signed, unsigned integers or floatting
  point types) and is used when a change in type is requested for example
  when converting data.

''')
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}`  \n'.format(get_signature(op)))
                if op.cxx_operator != None:
                    # op.cxx_operator is e.g. 'operator+'; strip the
                    # 8-character 'operator' prefix to show just '+'.
                    fout.write('  Infix operator: `{}`  \n'. \
                               format(op.cxx_operator[8:]))
                fout.write('  {}\n\n'.format(op.desc))
def gen_doc_overview(opts):
    """Generate the overview markdown page of the SPMD module.

    The page content is a single static markdown literal documenting the
    SPMD DSL (kernels, device functions, k_* keywords and advanced
    unmasked operations). No formatting is applied to the string, so the
    braces in the embedded C++ snippets are single.
    """
    filename = common.get_markdown_file(opts, 'overview', 'spmd')
    # Honor the overwrite policy: bail out if the file must not be written.
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## What is SPMD?

SPMD stands for *Single Program Multiple Data*. It is a programming
paradigm. It is used by NVIDIA CUDA. Its strengh lies in writing
computation kernels. Basically you concentrate your attention on the kernel
itself and not on how to run it. An example is worth more than a long
speech, let's take vector addition of `float`'s.

```c++
spmd_kernel_1d(add, float *dst, float *a, float *b)
  k_store(dst, k_load(a) + k_load(b));
spmd_kernel_end
```

It would be written as follows for CUDA (assuming that the vector lenghts
are multiples of block's sizes).

```c++
__global__ add(float *dst, float *a, float *b) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  dst[i] = a[i] + b[i];
}
```

NSIMD's SPMD is a small DSL in standard C++98 that can be used to write
computation kernels for GPUs (NVIDIA's and AMD's) and any SIMD units
supported by NSIMD. On a more technical side, the DSL keywords are macros
that:

- translates to C-ish keywords for GPUs and
- use masks for CPUs as Intel ISPC (<https://ispc.github.io/>).

The difference between NSIMD's SPMD is that a single code can be compiled
to target GPUs and CPUs whereas:

- NVIDIA CUDA only targets NVIDIA GPUs
- AMD HIP only targets NVIDIA and AMD GPUs
- INTEL ICP only targets Intel SIMD units and ARM NEON

## Writing kernels and device functions

As for CUDA kernels you can write templated and non-templated CUDA kernels.
Declaring a kernel function and launching it is straight forward:

```c++
spmd_kernel_1d(kernel_name, arguments)
  // kernel code
spmd_kernel_end

int main() {
  spmd_launch_kernel_1d(kernel_name, bit_width, param, vector_size,
                        arguments);
  return 0;
}
```

The `bit_width` argument indicates the types width in bits that will be
available inside kernels. The `param` argument indicates the unroll factor
for CPUs and the number of threads per block for GPUs. The `vector_size`
argument indicates the vectors length passed as arguments.

Device functions can also been implemented. They are functions that will
only run on the device. As for kernels, they have the same restrictions.

```c++
spmd_dev_func(k_float device_func, k_float a, k_float b)
  // Device function code
spmd_dev_func_end

spmd_kernel_1d(kernel, arguments)
  // ...
  spmd_call_dev_func(device_func, a, b);
  // ...
spmd_kernel_end
```

The caveat with `spmd_dev_func` is that its first argument must be the
return type followed by the device function name.

It is also possible to write templated kernels. Due to C++ `__VA_ARGS__`
limitations the number of template argument is limited to one of kind
`typename`. If more types or integers are to be passed to device kernels
or functions they have to be boxed inside a struct.

```c++
struct mul_t {
  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)
    return a * b;
  spmd_dev_func_end
};

struct add_t {
  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)
    return a + b;
  spmd_dev_func_end
};

// Op is the template argument (typename Op in C++ code)
spmd_tmpl_dev_func(k_float trampoline, Op, k_float a, k_float b)
  return Op::template spmd_call_dev_func(dev_impl, a, b);
spmd_dev_func_end

// Op is the template argument (typename Op in C++ code)
spmd_tmpl_kernel_1d(tmpl_kernel, Op, arguments)
  // ...
  spmd_call_tmpl_dev_func(trampoline, Op, a, b);
  // ...
spmd_kernel_end

int main() {
  // Kernel call for addition
  spmd_launch_tmpl_kernel_1d(tmpl_kernel, add_t, 32, 1, N, arguments);

  // Kernel call for multiplication
  spmd_launch_tmpl_kernel_1d(tmpl_kernel, mul_t, 32, 1, N, arguments);

  return 0;
}
```

## The NSIMD SPMD C++ DSL

The DSL is of course constraint by C++ syntax and constructs. This implies
some strange syntax and the impossibility to use infix operator `=`. For
now (2020/05/16) the NSIMD SPMD DSL does only supports `if`'s, while-loops
and `returns`. It seems that for-loops and do-while-loops cannot be nicely
proposed, i.e. with a nice syntax, the switch-case keywords cannot be
implemented with a good conformence to the semantic of their C++
counterparts. Goto's also cannot be implemented properly.

### Variables types available in kernels and device functions

The following self-explanatory variable types are available inside kernels
and devices functions:

- `k_int` for signed integers
- `k_uint` for unsigned integers
- `k_float` for floatting point numbers
- `k_bool` for booleans

As explained above the bit-width of the above types are determined by the
launch kernel function. Note that `k_float` does not exists for 8-bits
types.

### Load/store from/to memory

Given a pointer, the proper way to load data is to use `k_load(ptr)`. For
storing a value to memory `k_store` is to be used.

```c++
k_store(ptr, value);
k_store(ptr, expression);
```

As explained above, there is no need to compute the offset to apply to
pointers. This is hidden from the programmer.

### Assignment operator (`operator=`)

Due to C++ ADL (<https://en.cppreference.com/w/cpp/language/adl>) and the
need for keeping things simple for the compiler (which does not always mean
simple for the programmer) the use of infix operator `=` will not produce a
copmilation error but will give incorrect result. You should use `k_set`.

```c++
k_set(var, value);
k_set(var, expression);
```

As written above, `k_set` assign value or the result of an expression to a
variable.

### if, then, else

You should not use plan C++ `if`'s or `else`'s. This will not cause
compilation error but will produce incorrect results at runtime. You should
use `k_if`, `k_else`, `k_elseif` and `k_endif` instead. they have the same
semantic as their C++ counterparts.

```c++
spmd_kernel_1d(if_elseif_else, float *dst, float *a_ptr)

  k_float a, ret;
  k_set(a, k_load(a_ptr));

  k_if (a > 15.0f)
    k_set(ret, 15.0f);
  k_elseif ( a > 10.0f)
    k_set(ret, 10.0f);
  k_elseif ( a > 5.0f)
    k_set(ret, 5.0f);
  k_else
    k_set(ret, 0.0f);
  k_endif

  k_store(dst, ret);

spmd_kernel_end
```

### while loops

You should not use plan C++ `while`'s, `break`'s and `continue`'s. This
will not cause compilation error but will produce incorrect results at
runtime. You should use `k_while`, `k_break`, `k_continue` and `k_endif`
instead. They have the same semantic as their C++ counterparts.

```c++
spmd_kernel_1d(binpow, float *dst, float *a_ptr, int *p_ptr)

  k_float a, ret;
  k_set(a, k_load(a_ptr));
  k_set(ret, 1.0f);
  k_int p;
  k_set(p, k_load(p_ptr));

  k_while(p > 0)

    k_if ((p & 1) != 0)
      k_set(ret, ret * a);
    k_endif

    k_set(a, a * a);
    k_set(p, p >> 1);

  k_endwhile

  k_store(dst, ret);

spmd_kernel_end
```

### Returns

Returns cannot be implemented as macros overloading is not possible in a
standard way with an overload taking zero arguments. So returning has to be
done correctly. The `k_return` keyword has the same semantic as the C++
`return` keyword without arguments and can be used at will for kernels (as
kernels return type is always `void`) and for device functions returning
`void`. For device functions returning a value it is recommanded to proceed
this way:

1. Declare a variable, say `ret`, to store the return value.
2. Whereever you need to return, set the variable appropriately with
   `k_set` and return with `k_return`.
3. At the end of the function use `return ret;`.

```c++
spmd_dev_func(k_int func, k_int a)

  k_float ret;

  k_if (a == 0)
    k_set(ret, 0);
    k_return;
  k_endif

  k_if (a == 1)
    k_set(ret, -1);
    k_return;
  k_endif

  k_set(ret, a);
  return ret;

spmd_dev_func_end
```

## Advanced techniques and functions

This paragraph applies mainly when targeting CPUs. Using techniques
described below won't affect GPUs.

If you are familiar with the SIMD technique of masking to emulate loops and
if's you may know that `k_set` and `k_store` are implemented using
respectively `nsimd::if_else` and `nsimd::maskz_storeu` which may incur
performance penalties. When you know that a simple assignment or store is
sufficient you may use the unmasked variants:

- `k_unmasked_set` translates into a C++ assignment.
- `k_unmasked_store` translates into a C++ SIMD store.

Their arguments are exactly the same as `k_set` and `k_store`. Unmasked
operations can usually be used at the beginning of device functions and
also inside loops, on temporary variables, knowing that the result of the
latter won't be needed later.

You may also use C++ standard keywords and constructs. But be aware that
doing so will apply all the same treatment too all SIMD lanes. This can be
useful when the operations involved are independant of the processed data
as in the example below.

```c++
spmd_dev_func(k_float newton_raphson_sqrt, k_float a, k_float x0)
  k_float ret;
  for (int i = 0; i < 6; i++) {
    k_unmasked_set(ret, (ret + ret * a) / 2.0f);
  }
  return ret;
spmd_dev_func_end
```
''')