Example #1
def doit(opts):
    gen_overview(opts)
    gen_doc(opts)
    gen_modules_md(opts)
    gen_what_is_wrapped(opts)
    root_dir = os.path.join(opts.script_dir, '..')
    copy_github_file_to_doc(opts, os.path.join(root_dir, 'README.md'),
                            common.get_markdown_file(opts, 'index'))
    copy_github_file_to_doc(opts, os.path.join(root_dir, 'CONTRIBUTING.md'),
                            common.get_markdown_file(opts, 'contribute'))
    gen_html(opts)  # This must be last
Example #2
def gen_modules_md(opts):
    common.myprint(opts, 'Generating modules.md')
    mods = common.get_modules(opts)
    ndms = []
    for mod in mods:
        name = getattr(mods[mod], mod).hatch.name()
        desc = getattr(mods[mod], mod).hatch.desc()
        ndms.append([name, desc, mod])
    filename = common.get_markdown_file(opts, 'modules')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Modules

NSIMD comes with several additional modules. A module provides a set of
functionalities that are usually not at the same level as SIMD intrinsics
and/or that do not cover both the C and C++ APIs. These functionalities are
shipped with the library because they make heavy use of the NSIMD core, which
abstracts SIMD intrinsics. Below is the exhaustive list of modules.

''')
        for ndm in ndms:
            fout.write('- [{}](module_{}_overview.md)  \n'.format(
                ndm[0], ndm[2]))
            fout.write('\n'.join(['  {}'.format(line.strip()) \
                                  for line in ndm[1].split('\n')]))
            fout.write('\n\n')
Example #3
File: hatch.py Project: zoq/nsimd
def gen_doc_api(opts):
    filename = common.get_markdown_file(opts, 'api', 'spmd')
    if not common.can_create_filename(opts, filename):
        return

    # Build tree for api.md
    api = dict()
    for _, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    with common.open_utf8(opts, filename) as fout:
        fout.write(
'''# NSIMD SPMD API reference

This page contains the exhaustive API of the SPMD module. Note that most
operator names follow the simple naming `k_[NSIMD name]` and have the same
semantics. This page is intentionally terse; use CTRL+F to find the operator
you are looking for.

For genericity on the base type you should use operator names instead of
infix operators, e.g. `k_add` instead of `+`. Indeed, for `f16`s, NVIDIA CUDA
and NSIMD do not provide overloads, therefore code using `+` will fail to
compile.

Note that all operators accept literals and scalars. For example you may
write `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when
using infix operators. Note however that literals and scalars must have the
same type as the other operands.
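
As an illustration, here is a minimal (untested) sketch of a kernel written
with named operators; the kernel name `scale_add` and its arguments are
hypothetical:

```c++
spmd_kernel_1d(scale_add, float *dst, float *x, float *y)
  // k_mul/k_add work for all base types, including f16, unlike infix * and +
  k_store(dst, k_add(k_mul(k_load(x), 2.0f), k_load(y)));
spmd_kernel_end
```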

''')

        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}`  \n'.format(get_signature(op)))
                if op.cxx_operator is not None:
                    fout.write('  Infix operator: `{}` ' \
                               '(*for certain types only*)  \n'.\
                               format(op.cxx_operator))
                fout.write('  {}\n\n'.format(op.desc))
Example #4
def gen_api(opts):
    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for cat in fp_categories:
            ops = [op for op in fp_operators if cat in op.categories]
            if (len(ops) == 0):
                continue

            fout.write('\n## {}\n\n'.format(cat))

            for op in ops:
                fout.write(
                    '- [{} ({})](module_fixed_point_api_{}.md)\n'\
                           .format(op.full_name, op.name,
                                   common.to_filename(op.name)))
Example #5
def gen_api(opts, op_list):
    api = dict()
    for _, operator in operators.operators.items():
        if operator.name not in op_list:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    filename = common.get_markdown_file(opts, 'api', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD fixed point API\n''')
        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- [{} ({})](module_fixed_point_api_{}.md)\n'. \
                           format(op.full_name, op.name,
                                  common.to_filename(op.name)))
Example #6
def gen_doc(opts):
    api = ''
    for func in rand_functions:
        for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items():
            for nwords, list_nrounds in nwords_nrounds.items():
                for nrounds in list_nrounds:
                    api += '- `' + func.gen_signature(nwords, word_size,
                                                      nrounds) + '`;  \n'
                    api += '  Returns a random number using the ' \
                           '{func_name} generator\n\n'. \
                           format(func_name=func.name)

    res = '''
# NSIMD Random module overview

{desc}

Two different algorithms are provided: Threefry and Philox. Both should give
high quality random numbers. Threefry is quicker on CPU, while Philox is best
used on GPU.

Both algorithms are counter-based pseudorandom number generators, meaning that
they need two parameters:
- a key: each key generates a unique sequence,
- a counter: which gives the different numbers in the sequence.

# NSIMD Random API reference

{api}
'''.format(desc=desc(), api=api)

    filename = common.get_markdown_file(opts, 'overview', 'random')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write(res)
Example #7
File: hatch.py Project: zoq/nsimd
def doit(opts):
    common.myprint(opts, 'Generating module memory_management')
    if not opts.doc:
        return
    filename = common.get_markdown_file(opts, 'overview', 'memory_management')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

This module provides C-style memory management functions. Its purpose is not
to become a fully featured container library but to provide portable malloc,
memcpy and free functions together with small helpers to copy data from and
to devices.

# API reference

## Equivalents of malloc, calloc, memcpy and free for devices

Note that the functions below simply wrap the corresponding C functions
when targeting a CPU. A combined usage sketch is given at the end of this
section.

- `template <typename T> T *device_malloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device.
  On error NULL is returned.

- `template <typename T> T *device_calloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device and sets the
  allocated memory to zero.
  On error NULL is returned.

- `template <typename T> void device_free(T *ptr)`{br}
  Frees the memory pointed to by the given pointer.

- `template <typename T> void copy_to_device(T *device_ptr, T *host_ptr,
  size_t sz)`{br}
  Copies data from the host to the device.

- `template <typename T> void copy_to_host(T *host_ptr, T *device_ptr,
  size_t sz)`{br}
  Copies data from the device to the host.

- `#define nsimd_fill_dev_mem_func(func_name, expr)`{br}
  Creates a device function that fills data with `expr`. To call the created
  function one simply writes `func_name(ptr, sz)`. The `expr` argument is a
  simple C++ expression that may only depend on `i`, the index of the i-th
  element in the buffer, as shown in the example below.

  ```c++
  nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768)

  int main() {{
    prng(ptr, 1000);
    return 0;
  }}
  ```
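
As an illustration, here is a minimal (untested) sketch combining the
functions above; the element count `n` and the data are hypothetical:

```c++
float *host = (float *)malloc(n * sizeof(float));
float *dev = device_malloc<float>(n);
// ... fill host ...
copy_to_device(dev, host, n); // host -> device
// ... compute on dev ...
copy_to_host(host, dev, n);   // device -> host
device_free(dev);
free(host);
```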

## Pairs of pointers

It is often useful to allocate a pair of data buffers, one on the host and
one on the device, to perform data transfers. The functions below provide
quick ways to malloc, calloc, free and memcpy pointers on the host and the
device at once. Note that when targeting CPUs the pair of pointers is reduced
to a single pointer to a single data buffer, in which case no memcpy is
performed. Note also that there is no implicit synchronization of data
between the two data buffers: it is up to the programmer to trigger memcpy's.
A short usage sketch is given after the list below.

```c++
template <typename T>
struct paired_pointers_t {{
  T *device_ptr, *host_ptr;
  size_t sz;
}};
```

Members of the above structure must not be modified but can be passed as
arguments for reading/writing data from/to the memory they point to.

- `template <typename T> paired_pointers_t<T> pair_malloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs both pointers are NULL.

- `template <typename T> paired_pointers_t<T> pair_malloc_or_exit(size_t
  sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs, prints an error message on stderr and calls exit(3).

- `template <typename T> paired_pointers_t<T> pair_calloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the host and on the device
  and sets both data buffers to zero.
  If an error occurs both pointers are NULL.

- `template <typename T> paired_pointers_t<T> pair_calloc_or_exit(size_t
  sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the host and on the device
  and sets both data buffers to zero.
  If an error occurs, prints an error message on stderr and calls exit(3).

- `template <typename T> void pair_free(paired_pointers_t<T> p)`{br}
  Frees the data buffers on the host and on the device.

- `template <typename T> void copy_to_device(paired_pointers_t<T> p)`{br}
  Copies data from the host buffer to its corresponding device buffer.

- `template <typename T> void copy_to_host(paired_pointers_t<T> p)`{br}
  Copies data from the device buffer to its corresponding host buffer.
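
As an illustration, here is a minimal (untested) sketch using paired
pointers; the element count `n` and the computation are hypothetical:

```c++
paired_pointers_t<float> p = pair_malloc_or_exit<float>(n);
// ... write inputs into p.host_ptr ...
copy_to_device(p); // host buffer -> device buffer
// ... compute on p.device_ptr ...
copy_to_host(p);   // device buffer -> host buffer
pair_free(p);
```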
'''.format(br='  '))
Example #8
def gen_overview(opts):
    filename = common.get_markdown_file(opts, 'overview')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## NSIMD scalar types

Their names follow the pattern `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integers or `f` for
  floating point numbers.
- `xx` is the number of bits used to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''
## NSIMD generic SIMD vector types

In NSIMD, we call a platform an architecture, e.g. Intel, ARM, POWERPC. We
call a SIMD extension a set of low-level functions and types provided by
hardware vendors to access SIMD units. Examples include SSE2, SSE42, AVX, ...
When compiling, the generic SIMD vector types represent a SIMD register of
the target. Examples are `__m128` for Intel SSE, `__m512` for Intel AVX-512
or `svfloat32_t` for Arm SVE.

Their names follow the patterns below:

- C base API: `vSCALAR` where `SCALAR` is one of the scalar types listed above.
- C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is one of the scalar
  types listed above.
- C++ advanced API: `nsimd::pack<SCALAR>` where `SCALAR` is one of the scalar
  types listed above.

Full list of SIMD vector types:

| Base type | C base API | C advanced API | C++ advanced API |
|-----------|------------|----------------|------------------|
''')

        fout.write('\n'.join([
        '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \
        format(typ=typ) for typ in common.types]))

        fout.write('''

## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not* need
to include a separate header file for each function. Here is a list of
supported platforms and their corresponding SIMD extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each SIMD extension has its own set of SIMD types and functions. Types follow
the pattern `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extension.
- `SCALAR` is one of the scalar types listed above.

There are also logical types associated to each SIMD vector type. These types
are used, for example, to represent the result of a comparison of SIMD vectors.
They are usually bit masks. Their names follow the pattern
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extension.
- `SCALAR` is one of the scalar types listed above.

Note 1: Platform `cpu` is a 128-bit SIMD emulation fallback used when no SIMD
extension has been specified or none is supported on the compilation target.

Note 2: as all SIMD extensions of all platforms have distinct names, there is
no need to put the platform name in each identifier.

Function names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where

- `SIMDEXT` is the SIMD extension.
- `FUNCNAME` is the name of a function, e.g. `add` or `sub`.
- `SCALAR` is one of the scalar types listed above.

### Generic identifier

In the base C API, genericity is achieved using macros.

- `vec(SCALAR)` is a type representing a SIMD vector containing SCALAR
  elements. SCALAR must be one of the scalar types listed above.
- `vecl(SCALAR)` is a type representing a SIMD vector of logicals for SCALAR
  elements. SCALAR must be one of the scalar types listed above.
- `vec_a(SCALAR, SIMDEXT)` is a type representing a SIMD vector containing
  SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be one of the
  scalar types listed above and SIMDEXT must be a valid SIMD extension.
- `vecl_a(SCALAR, SIMDEXT)` is a type representing a SIMD vector of logicals
  for SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be one of
  the scalar types listed above and SIMDEXT must be a valid SIMD extension.
- `vFUNCNAME` takes as input the above types to access the operator FUNCNAME,
  e.g. `vadd`, `vsub`, as shown in the sketch after this list.
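
For instance, a minimal sketch using these macros; the contents of `a` and
`b` are assumed to have been loaded elsewhere:

```c++
vec(f32) a, b, c;    /* SIMD vectors of f32 for the detected SIMD extension */
vecl(f32) mask;      /* SIMD vector of logicals for f32 */
c = vadd(a, b, f32); /* generic addition, see the operator list below */
```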

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type for
  platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of logicals
  type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of
  SIMD extensions listed above, SCALAR is one of scalar type listed above.

In C++11 and beyond, type traits are still available but typedefs are also
provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

The C++20 API does not bring different types for SIMD registers nor any other
way to access SIMD types. It only brings concepts in place of plain
`typename`s. For more information see <concepts.md>.

Note that all macros and functions available in plain C are still available in
C++.

### List of operators provided by the base APIs

In this documentation we use the terms "function" and "operator"
interchangeably. For each operator FUNCNAME a C function (also available in
C++) named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type
unless specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a` is
available and takes as its two last arguments a SCALAR type and a SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is
available. It takes as its last argument the SCALAR type and can optionally
take the SIMDEXT as its very last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are the
possibilities:

```c++
c = nsimd_add_avx_f32(a, b); // use AVX
c = nsimd::add(a, b, f32()); // use detected SIMDEXT
c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
c = vadd(a, b, f32); // use detected SIMDEXT
c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX
```

Here is a list of available FUNCNAME.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            func = operator.name
            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \
                              ' a' + str(count) for count, p in \
                              enumerate(operator.params[1:])])
            fout.write('- `{} {}({});`  \n'.format(return_typ, func, args))
            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))

        fout.write('''

## C advanced API (only available in C11)

The C advanced API takes advantage of the C11 `_Generic` keyword to provide
function overloading. Unlike the base API described above, there is no need to
pass the base type or the SIMD extension as arguments; this information is
carried by the types provided by this API.

- `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vector containing
  SCALAR elements of SIMD extension SIMDEXT.
- `nsimd_packl_SCALAR_SIMDEXT` represents a SIMD vector of logicals
  for SCALAR elements of SIMD extension SIMDEXT.

There are versions of the above types without SIMDEXT for which the targeted
SIMD extension is chosen automatically.

- `nsimd_pack_SCALAR` represents a SIMD vector containing SCALAR elements.
- `nsimd_packl_SCALAR` represents a SIMD vector of logicals for SCALAR
  elements.

Generic types are also available:

- `nsimd_pack(SCALAR)` is a type representing a SIMD vector containing SCALAR
  elements. SCALAR must be one of the scalar types listed above.
- `nsimd_packl(SCALAR)` is a type representing a SIMD vector of logicals for
  SCALAR elements. SCALAR must be one of the scalar types listed above.
- `nsimd_pack_a(SCALAR, SIMDEXT)` is a type representing a SIMD vector
  containing SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be
  one of the scalar types listed above and SIMDEXT must be a valid SIMD
  extension.
- `nsimd_packl_a(SCALAR, SIMDEXT)` is a type representing a SIMD vector of
  logicals for SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be
  one of the scalar types listed above and SIMDEXT must be a valid SIMD
  extension.

Finally, operators follow the naming `nsimd_FUNCNAME`, e.g. `nsimd_add`,
`nsimd_sub`.

## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or above
but because it relies on the particular implementation of ARM SVE in ARM's
compiler. We do not know whether GCC (and possibly MSVC in the distant
future) will use the same approach. In any case, the current implementation
allows us to put SVE SIMD vectors inside structs that behave like standard
structs. If you want to be sure to write portable code, do *not* use this
API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors containing
  SCALAR elements of SIMD extension SIMDEXT. You can specify only the first
  template argument. The second defaults to 1 while the third defaults to the
  detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of logical
  type containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the third
  defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is particularly
useful on ARM.

Functions that take packs do not take any other arguments unless specified
otherwise, e.g. the load family of functions: it is impossible to determine
the kind of pack (unroll and SIMDEXT) from the type of a pointer, therefore
in this case the last argument must be a pack and this same type is then
returned. Some functions are also available as C++ operators. They follow the
naming `nsimd::FUNCNAME`.
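
For instance, a minimal (untested) sketch adding two float buffers with
packs; the `loadu`, `storeu` and `len` calls below follow the convention
described above (pack passed as last argument) but their exact signatures
should be checked against the pack documentation:

```c++
void add_buffers(float *dst, float *a, float *b, int n) {
  nsimd::pack<float> p;             // default unroll (N = 1), detected SIMDEXT
  int step = nsimd::len(p);         // assumed: number of elements per pack
  for (int i = 0; i + step <= n; i += step) {
    nsimd::pack<float> va = nsimd::loadu(&a[i], p); // pack selects return type
    nsimd::pack<float> vb = nsimd::loadu(&b[i], p);
    nsimd::storeu(&dst[i], va + vb);                // operator+ works on packs
  }
}
```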
''')
Example #9
def gen_doc(opts):
    common.myprint(opts, 'Generating doc for each function')

    # Build tree for api.md
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    # api.md
    # filename = os.path.join(opts.script_dir, '..','doc', 'markdown', 'api.md')
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# General API\n\n')
            fout.write('- [Memory function](memory.md)\n')
            fout.write('- [Float16 related functions](fp16.md)\n')
            fout.write('- [Defines provided by NSIMD](defines.md)\n')
            fout.write('- [NSIMD pack and related functions](pack.md)\n\n')
            fout.write('- [NSIMD C++20 concepts](concepts.md)\n\n')
            fout.write('# SIMD operators\n')
            for c, ops in api.items():
                if len(ops) == 0:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    Full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        Full_name, op.name, common.to_filename(op.name)))

    # helper to get list of function signatures
    def to_string(var):
        sigs = [var] if isinstance(var, str) or not hasattr(var, '__iter__') \
                     else list(var)
        for i in range(0, len(sigs)):
            sigs[i] = re.sub('[ \n\t\r]+', ' ', sigs[i])
        return '\n'.join(sigs)

    # Operators (one file per operator)
    # dirname = os.path.join(opts.script_dir, '..','doc', 'markdown')
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue
        # filename = os.path.join(dirname, 'api_{}.md'.format(common.to_filename(
        #                operator.name)))
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        Full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(Full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('\n\n## C advanced API (generic, requires C11)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_adv')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_adv'). \
                                 values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'c_base', simd_ext))
                    fout.write(';\n')
                fout.write('```')
            fout.write('\n\n## C++ base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(
                        operator.get_signature(typ, 'cxx_base', simd_ext))
                    fout.write(';\n')
                fout.write('```')
Example #10
def gen_overview(opts):
    # filename = os.path.join(opts.script_dir, '..', 'doc', 'markdown',
    #                         'overview.md')
    filename = common.get_markdown_file(opts, 'overview')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## NSIMD scalar types

Their names follow the pattern `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integers and `f` for
  floating point numbers.
- `xx` is the number of bits used to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''

## NSIMD SIMD vector types

Their names follow the pattern `vSCALAR` where `SCALAR` is one of the scalar
types listed above. For example `vi8` means a SIMD vector containing `i8`'s.

Full list of SIMD vector types:

''')
        for t in common.types:
            fout.write('- `v{}`\n'.format(t))
        fout.write('''

## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not* need
to include a separate header file for each function. In NSIMD, we call a
platform an architecture, e.g. Intel, ARM, POWERPC. We call a SIMD extension
a set of low-level functions and types provided to access a given SIMD unit.
Examples include SSE2, SSE42, AVX, ...

Here is a list of supported platforms and their corresponding SIMD extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each SIMD extension has its own set of SIMD types and functions. Types follow
the pattern `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extension.
- `SCALAR` is one of the scalar types listed above.

There are also logical types associated to each SIMD vector type. These types
are used to represent the result of a comparison of SIMD vectors. They are
usually bit masks. Their names follow the pattern
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extension.
- `SCALAR` is one of the scalar types listed above.

Note 1: Platform `cpu` is a scalar fallback used when no SIMD extension has
been specified.

Note 2: as all SIMD extensions of all platforms have distinct names, there is
no need to put the platform name in each identifier.

Function names follow the pattern `nsimd_SIMDEXT_FUNCNAME_SCALAR`
where

- `SIMDEXT` is the SIMD extension.
- `FUNCNAME` is the name of a function, e.g. `add` or `sub`.
- `SCALAR` is one of the scalar types listed above.

### Generic identifier

In C, genericity is achieved using macros.

- `vec(SCALAR)` represents the SIMD vector type containing SCALAR elements.
  SCALAR must be one of the scalar types listed above.
- `vecl(SCALAR)` represents the SIMD vector of logicals type containing SCALAR
  elements. SCALAR must be one of the scalar types listed above.
- `vec_e(SCALAR)` represents the SIMD vector type containing SCALAR elements.
  SCALAR must be one of the scalar types listed above.
- `vecl_e(SCALAR)` represents the SIMD vector of logicals type containing
  SCALAR elements. SCALAR must be one of the scalar types listed above.
- `vFUNCNAME` is the macro name to access the function FUNCNAME, e.g. `vadd`,
  `vsub`.
- `vFUNCNAME_e` is the macro name to access the function FUNCNAME, e.g.
  `vadd_e`, `vsub_e`.

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type for
  platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of logicals
  type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of
  SIMD extensions listed above, SCALAR is one of scalar type listed above.

In C++11 and beyond, type traits are still available but typedefs are also
provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

Note that all macros and functions available in plain C are still available in
C++.

### List of functions available for manipulation of SIMD vectors

For each FUNCNAME a C function (also available in C++)
named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless
specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_e` is
available and takes as its two last arguments a SCALAR type and a SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is
available. It takes as its last argument the SCALAR type and can optionally
take the SIMDEXT as its very last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are the
possibilities:

    c = nsimd_add_avx_f32(a, b); // use AVX
    c = nsimd::add(a, b, f32()); // use detected SIMDEXT
    c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
    c = vadd(a, b, f32); // use detected SIMDEXT
    c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX

Here is a list of available FUNCNAME.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            func = operator.name
            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \
                              ' a' + str(count) for count, p in \
                              enumerate(operator.params[1:])])
            fout.write('- `{} {}({});`\n'.format(return_typ, func, args))

            if operator.domain and len(operator.params[1:]) > 0:
                params = operator.params[1:]

                if len(params) == 1:
                    fout.write('  a0 ∈ {}\n'.format(operator.domain))
                else:
                    param = ', '.join(['a' + str(count) for count in \
                                       range(len(params))])
                    fout.write('  ({}) ∈ {}\n'.format(param, operator.domain))

            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))
        fout.write('''

## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or above
but because it relies on the particular implementation of ARM SVE in ARM's
compiler. We do not know whether GCC (and possibly MSVC in the distant
future) will use the same approach. In any case, the current implementation
allows us to put SVE SIMD vectors inside structs that behave like standard
structs. If you want to be sure to write portable code, do *not* use this
API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors containing
  SCALAR elements of SIMD extension SIMDEXT. You can specify only the first
  template argument. The second defaults to 1 while the third defaults to the
  detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of logical
  type containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the third
  defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is particularly
useful on ARM.

Functions that take packs do not take any other arguments unless specified
otherwise, e.g. the load family of functions: it is impossible to determine
the kind of pack (unroll and SIMDEXT) from the type of a pointer, therefore
in this case the last argument must be a pack and this same type is then
returned. Some functions are also available as C++ operators.

Here is the list of functions that act on packs.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_pack(operator.params[0], 1, 'N')
            func = operator.name
            args = ', '.join([common.get_one_type_pack(p, 0, 'N') + ' a' + \
                              str(count) for count, p in \
                              enumerate(operator.params[1:])])
            if 'v' not in operator.params[1:] and 'l' not in operator.params[
                    1:]:
                args = args + ', pack<T, N, SimdExt> const&' if args != '' \
                              else 'pack<T, N, SimdExt> const&'
            fout.write('- `{} {}({});`\n'.format(return_typ, func, args))

            if operator.domain and len(operator.params[1:]) > 0:
                params = operator.params[1:]
                if len(params) == 1:
                    fout.write('  a0 ∈ {}\n'.format(operator.domain))
                else:
                    param = ', '.join(['a'+str(count) for count in \
                                       range(len(params))])
                    fout.write('  ({}) ∈ {}\n'.format(param, operator.domain))

            if operator.cxx_operator:
                fout.write('  Available as `{}`\n'.format(
                    operator.cxx_operator))

            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))
Example #11
def gen_doc(opts):
    sys.stdout.write('-- Generating doc for each function\n')

    # Build tree for api.md
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    # helper to construct filename for operator
    # def to_filename(op_name):
    #     valid = string.ascii_letters + string.digits
    #     ret = ''
    #     for c in op_name:
    #         ret += '-' if c not in valid else c
    #     return ret

    # api.md
    # filename = os.path.join(opts.script_dir, '..','doc', 'markdown', 'api.md')
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# API\n')
            for c, ops in api.items():
                if len(ops) == 0:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    Full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        Full_name, op.name, common.to_filename(op.name)))

    # helper to get list of function signatures
    def to_string(var):
        sigs = [var] if isinstance(var, str) or not hasattr(var, '__iter__') \
                     else list(var)
        for i in range(0, len(sigs)):
            sigs[i] = re.sub('[ \n\t\r]+', ' ', sigs[i])
        return '\n'.join(sigs)

    # Operators (one file per operator)
    # dirname = os.path.join(opts.script_dir, '..','doc', 'markdown')
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue
        # filename = os.path.join(dirname, 'api_{}.md'.format(common.to_filename(
        #                operator.name)))
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        Full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(Full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_adv'). \
                                 values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(operator.get_signature(typ, 'c_base', simd_ext))
                    fout.write(';\n')
                fout.write('```')
            fout.write('\n\n## C++ base API (architecture specifics)')
            for simd_ext in opts.simd:
                fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
                fout.write('```c\n')
                for typ in operator.types:
                    fout.write(
                        operator.get_signature(typ, 'cxx_base', simd_ext))
                    fout.write(';\n')
                fout.write('```')
Example #12
def gen_overview(opts):
    filename = common.get_markdown_file(opts, 'overview', 'fixed_point')
    with common.open_utf8(opts, filename) as fout:
        fout.write('''
# NSIMD fixed point module

## Description

This module implements fixed-point number support for the `nsimd` library.
Fixed-point numbers are integer types used to represent decimal numbers: a
number `lf` of bits is used to encode the integer part, and `rt` bits are used
to encode the fractional part.

The fixed_point module uses the templated type `nsimd::fixed_point::fp_t<lf,
rt>` to represent a fixed-point number. All the basic floating-point arithmetic
operators have been defined, therefore `fp_t` elements can be manipulated as
normal numbers. The fixed_point module uses an `i8`, `i16`, or
`i32` integer type for storage, depending on the value of `lf + 2 * rt`.

All the functions of the module live in the namespace `nsimd::fixed_point`
and match the interface of the `nsimd` C++ API.

The `fp_t` struct type is defined in `fixed.hpp`, and the associated SIMD
`fpsimd_t` struct type is defined in `simd.hpp`.

The module redefines the `nsimd` pack type for fixed-point numbers, templated
with `lf` and `rt`:

```C++
namespace nsimd {
namespace fixed_point {
template <u8 lf, u8 rt>
struct pack;
} // namespace fixed_point
} // namespace nsimd
```

The pack can then be manipulated like an `nsimd` pack over any other scalar
type.

## Compatibility

The fixed point module is a C++-only API, compatible with the C++98 standard.
It has the same compiler and hardware support as the main `nsimd` API
(see the [API index](index.md)).

## Example

Here is a minimal example ([main.cpp](../../examples/module_fixed_point.cpp)):

@[INCLUDE_CODE:L21:L61](../../examples/module_fixed_point.cpp)

To test with AVX2, run:
```bash
export NSIMD_ROOT=<path/to/nsimd>
g++ -o main -I$NSIMD_ROOT/include -mavx2 -DNSIMD_AVX2 main.cpp
./main
```

The console output will look like this:
```console
$>./main
1.35938 | -0.421875 | 0.9375
1.13281 | 1.19531 | 2.32812
1.64844 | -1.21094 | 0.4375
-0.660156 | 1.07422 | 0.414062
-0.890625 | 0.214844 | -0.675781
-0.0898438 | 0.515625 | 0.425781
-0.539062 | 0.0546875 | -0.484375
1.80859 | 1.66406 | 3.47266
```
        ''')
Example #13
def gen_doc_overview(opts):
    filename = common.get_markdown_file(opts, 'overview', 'tet1d')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## What are expression templates?

Expression templates are a C++ template metaprogramming technique that
essentially allows high-level programming of loop fusion. Take the following
example.

```c++
std::vector<float> operator+(std::vector<float> const &a,
                             std::vector<float> const &b) {{
  std::vector<float> ret(a.size());
  for (size_t i = 0; i < a.size(); i++) {{
    ret[i] = a[i] + b[i];
  }}
  return ret;
}}

int main() {{
  std::vector<float> a, b, c, d, sum;

  ...

  sum = a + b + c + d;

  ...

  return 0;
}}
```

The expression `a + b + c + d` involves three calls to `operator+`, so at
least nine memory passes are necessary. This can be optimized as follows.

```c++
int main() {{
  std::vector<float> a, b, c, d, sum;

  ...

  for (size_t i = 0; i < a.size(); i++) {{
    sum[i] = a[i] + b[i] + c[i] + d[i];
  }}

  ...

  return 0;
}}
```

The rewriting above requires only four memory passes, which is of course
better, but as humans we prefer writing `a + b + c + d`. Expression templates
solve exactly this problem: they allow the programmer to write `a + b + c + d`
while the compiler sees the loop written above.

## Expressions templates with NSIMD

This module provides expression templates on top of the NSIMD core. As a
consequence, the loops deduced from the high-level expressions and seen by the
compiler are optimized using SIMD instructions. Note also that NVIDIA and
AMD GPUs are supported through CUDA and ROCm/HIP. The API for expression
templates in NSIMD is C++98 compatible and works with any container, as its
only requirement is that the data be contiguous.

All inputs to an expression must be declared using `tet1d::in` while the
output must be declared using `tet1d::out`.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size());

  ...

  return 0;
}}
```

- `template <typename T, typename I> inline node in(const T *data, I sz);`{nl}
  Constructs an input for expression templates starting at address `data` and
  containing `sz` elements. The return type `node` of this function can be
  handled with the help of the `TET1D_IN(T)` macro where `T` is the underlying
  type of the data (ints, floats, doubles...).

- `template <typename T> node out(T *data);`{nl}
  Constructs an output for expression templates starting at address `data`.
  Note that memory must be allocated by the user before passing it to the
  expression template engine. The output type can be handled with the
  `TET1D_OUT(T)` macro where `T` is the underlying type (ints, floats,
  doubles...).

Note that it is possible to pass parameters to the expression template engine
to specify the number of threads per block for GPUs or the SIMD extension to
use...

- `template <typename T, typename Pack> node out(T *data, int
  threads_per_block, void *stream);`{nl}
  Constructs an output for expression templates starting at address `data`.
  Note that memory must be allocated by the user before passing it to the
  expression template engine. The `Pack` parameter is useful when compiling
  for CPUs: the type is an `nsimd::pack<...>` allowing the developer to
  specify all details about the NSIMD packs that will be used by the
  expression template engine. The `threads_per_block` and `stream` arguments
  are used only when compiling for GPUs; their meaning is contained in their
  names. The output type can be handled with the `TET1D_OUT_EX(T, N, SimdExt)`
  macro where `T` is the underlying type (ints, floats, doubles...), `N` is
  the unroll factor and `SimdExt` the SIMD extension.

Moreover a MATLAB-like syntax is provided. One can select a subrange of a
given input. Indexes are understood as in Python: -1 represents the last
element. The constant `tet1d::end = -1` allows one to write portable code.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  tet1d::out(c) = va(10, tet1d::end - 10) + vb;

  ...

  return 0;
}}
```

One can also specify which elements of the output must be rewritten with
the following syntax.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  TET1D_OUT(float) vc = tet1d::out(&c[0]);
  vc(va >= 10 && va < 20) = vb;

  ...

  return 0;
}}
```

In the example above, element `i` of `vc` is written only if `va[i] >= 10` and
`va[i] < 20`. The expression appearing in the parentheses can contain
arbitrary expression templates as long as the underlying type is `bool`.

## Warning using `auto`

Using `auto` can lead to surprising results. We advise you to never use `auto`
when dealing with expression templates. Indeed, using `auto` gives the
variable an obscure type representing the computation tree of the expression
template. This implies that you won't be able to get data from this variable,
e.g. read its `.data` member, and that this variable or its type cannot be
used as a template argument where you would need it.
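
As an illustration, a short (untested) sketch; the container `a` is
hypothetical:

```c++
auto tree = tet1d::in(&a[0], a.size()) + 1.0f;   // obscure expression-tree type
TET1D_IN(float) va = tet1d::in(&a[0], a.size()); // named, usable type
```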
'''.format(nl='  '))
Example #14
def gen_doc_api(opts):
    filename = common.get_markdown_file(opts, 'api', 'tet1d')
    if not common.can_create_filename(opts, filename):
        return

    # Build tree for api.md
    api = dict()
    for _, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for c in operator.categories:
            if c not in api:
                api[c] = [operator]
            else:
                api[c].append(operator)

    def get_signature(op):
        def get_type(typ):
            if typ == 'p':
                return 'int'
            elif typ == 'v':
                return 'ExprNumber'
            elif typ == 'l':
                return 'ExprBool'

        ret = get_type(op.params[0]) + ' ' + op.name + '('
        if is_not_closed(op):
            ret += 'ToType' + (', ' if len(op.params[1:]) > 0 else '')
        ret += ', '.join(['{{t}} {{in{i}}}'.format(i=i). \
                          format(t=get_type(op.params[i + 1]), in0=common.in0,
                          in1=common.in1, in2=common.in2, in3=common.in3) \
                          for i in range(len(op.params[1:]))])
        ret += ');'
        return ret

    with common.open_utf8(opts, filename) as fout:
        fout.write('''# NSIMD TET1D API reference

This page contains the exhaustive API of the TET1D module. Note that most
operator names follow their NSIMD counterparts and have the same
semantics. This page is intentionally terse; use CTRL+F to find the operator
you are looking for.

Note that all operators accept literals and scalars. For example you may
write `tet1d::add(a, 1)`. This also applies when using infix operators. Note
that literals or scalars of different types can be used in expressions
involving other types. A short illustration is given after the list below.

In all signatures below the following pseudo-types are used for
simplification:
- `ExprNumber` designates an existing expression template over signed or
  unsigned integers or floating point types, or a scalar of one of these
  types.
- `ExprBool` designates an existing expression template over booleans, or a
  boolean.
- `ToType` designates a base type (signed or unsigned integers or floating
  point types); it is used when a change of type is requested, for example
  when converting data.
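
For instance, a minimal (untested) sketch; the buffer sizes and contents are
hypothetical:

```c++
std::vector<float> a(n), b(n);
TET1D_IN(float) va = tet1d::in(&a[0], a.size());
// the literal 1 is accepted alongside the float expression
tet1d::out(&b[0]) = tet1d::add(va, 1);
```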

''')

        for c, ops in api.items():
            if len(ops) == 0:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}`  \n'.format(get_signature(op)))
                if op.cxx_operator is not None:
                    fout.write('  Infix operator: `{}`  \n'. \
                               format(op.cxx_operator[8:]))
                fout.write('  {}\n\n'.format(op.desc))
Example #15
def gen_doc_overview(opts):
    filename = common.get_markdown_file(opts, 'overview', 'spmd')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## What is SPMD?

SPMD stands for *Single Program Multiple Data*. It is a programming paradigm
used for example by NVIDIA CUDA. Its strength lies in writing computation
kernels: you concentrate on the kernel itself and not on how to run it. An
example is worth more than a long speech, so let's take the vector addition
of `float`'s.

```c++
spmd_kernel_1d(add, float *dst, float *a, float *b)
  k_store(dst, k_load(a) + k_load(b));
spmd_kernel_end
```

It would be written as follows in CUDA (assuming that the vector lengths are
multiples of the block size).

```c++
__global__ void add(float *dst, float *a, float *b) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  dst[i] = a[i] + b[i];
}
```

NSIMD's SPMD is a small DSL in standard C++98 that can be used to write
computation kernels for GPUs (NVIDIA's and AMD's) and any SIMD units supported
by NSIMD. On the technical side, the DSL keywords are macros that:
- translate to C-ish keywords for GPUs and
- use masks for CPUs, as Intel ISPC does (<https://ispc.github.io/>).

What sets NSIMD's SPMD apart is that a single source can be compiled to
target both GPUs and CPUs, whereas:
- NVIDIA CUDA only targets NVIDIA GPUs,
- AMD HIP only targets NVIDIA and AMD GPUs,
- Intel ISPC only targets Intel SIMD units and ARM NEON.

## Writing kernels and device functions

As with CUDA, you can write templated and non-templated kernels.
Declaring a kernel function and launching it is straightforward:

```c++
spmd_kernel_1d(kernel_name, arguments)
  // kernel code
spmd_kernel_end

int main() {

  spmd_launch_kernel_1d(kernel_name, bit_width, param,
                        vector_size, arguments);

  return 0;
}
```

The `bit_width` argument indicates the width in bits of the types that will be
available inside the kernel. The `param` argument indicates the unroll factor
for CPUs and the number of threads per block for GPUs. The `vector_size`
argument indicates the length of the vectors passed as arguments. A concrete
launch is sketched below.
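
For instance, a minimal (untested) launch of the `add` kernel defined above;
`dst`, `a` and `b` are hypothetical buffers of `n` floats:

```c++
int main() {
  // 32 = bit_width, 1 = param (unroll factor / threads per block), n = vector_size
  spmd_launch_kernel_1d(add, 32, 1, n, dst, a, b);
  return 0;
}
```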

Device functions can also be implemented. They are functions that will
only run on the device and they have the same restrictions as kernels.

```c++
spmd_dev_func(k_float device_func, k_float a, k_float b)
  // Device function code
spmd_dev_func_end

spmd_kernel_1d(kernel, arguments)

  // ...

  spmd_call_dev_func(device_func, a, b);

  // ...

spmd_kernel_end
```

The caveat with `spmd_dev_func` is that its first argument must be the return
type followed by the device function name.

It is also possible to write templated kernels. Due to C++ `__VA_ARGS__`
limitations the number of template arguments is limited to one, of kind
`typename`. If more types or integers are to be passed to device kernels or
functions, they have to be boxed inside a struct.

```c++
struct mul_t {
  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)
    return a * b;
  spmd_dev_func_end
};

struct add_t {
  spmd_dev_func(static k_float dev_impl, k_float a, k_float b)
    return a + b;
  spmd_dev_func_end
};

// Op is the template argument (typename Op in C++ code)
spmd_tmpl_dev_func(k_float trampoline, Op, k_float a, k_float b)
  return Op::template spmd_call_dev_func(dev_impl, a, b);
spmd_dev_func_end

// Op is the template argument (typename Op in C++ code)
spmd_tmpl_kernel_1d(tmpl_kernel, Op, arguments)

  // ...

  spmd_call_tmpl_dev_func(trampoline, Op, a, b);

  // ...

spmd_kernel_end

int main() {

  // Kernel call for addition
  spmd_launch_tmpl_kernel_1d(tmpl_kernel, add_t, 32, 1, N, arguments);

  // Kernel call for multiplication
  spmd_launch_tmpl_kernel_1d(tmpl_kernel, mul_t, 32, 1, N, arguments);

  return 0;
}
```

## The NSIMD SPMD C++ DSL

The DSL is of course constrained by C++ syntax and constructs. This implies
some strange syntax and the impossibility of using the infix operator `=`.
For now (2020/05/16) the NSIMD SPMD DSL only supports `if`'s, while-loops
and `return`'s. It seems that for-loops and do-while-loops cannot be offered
with a nice syntax, that the switch-case keywords cannot be implemented with
good conformance to the semantics of their C++ counterparts, and that goto's
cannot be implemented properly either.

### Variables types available in kernels and device functions

The following self-explanatory variable types are available inside kernels
and device functions:

- `k_int` for signed integers
- `k_uint` for unsigned integers
- `k_float` for floating point numbers
- `k_bool` for booleans

As explained above, the bit-width of these types is determined by the
kernel launch function. Note that `k_float` does not exist for 8-bit types.

### Load/store from/to memory

Given a pointer, the proper way to load data is to use `k_load(ptr)`. For
storing a value to memory `k_store` is to be used.

```c++
k_store(ptr, value);
k_store(ptr, expression);
```

As explained above, there is no need to compute the offset to apply to
pointers. This is hidden from the programmer.

### Assignment operator (`operator=`)

Due to C++ ADL (<https://en.cppreference.com/w/cpp/language/adl>) and the
need to keep things simple for the compiler (which does not always mean
simple for the programmer), the use of the infix operator `=` will not produce
a compilation error but will give incorrect results. You should use `k_set`.

```c++
k_set(var, value);
k_set(var, expression);
```

As shown above, `k_set` assigns a value or the result of an expression to a
variable.

### if, then, else

You should not use plain C++ `if`'s or `else`'s. This will not cause a
compilation error but will produce incorrect results at runtime. You should
use `k_if`, `k_else`, `k_elseif` and `k_endif` instead. They have the same
semantics as their C++ counterparts.

```c++
spmd_kernel_1d(if_elseif_else, float *dst, float *a_ptr)

  k_float a, ret;
  k_set(a, k_load(a_ptr));

  k_if (a > 15.0f)

    k_set(ret, 15.0f);

  k_elseif ( a > 10.0f)

    k_set(ret, 10.0f);

  k_elseif ( a > 5.0f)

    k_set(ret, 5.0f);

  k_else

    k_set(ret, 0.0f);

  k_endif

  k_store(dst, ret);

spmd_kernel_end
```

### while loops

You should not use plain C++ `while`'s, `break`'s and `continue`'s. This will
not cause a compilation error but will produce incorrect results at runtime.
You should use `k_while`, `k_break`, `k_continue` and `k_endwhile` instead.
They have the same semantics as their C++ counterparts.

```c++
spmd_kernel_1d(binpow, float *dst, float *a_ptr, int *p_ptr)

  k_float a, ret;
  k_set(a, k_load(a_ptr));
  k_set(ret, 1.0f);
  k_int p;
  k_set(p, k_load(p_ptr));

  k_while(p > 0)

    k_if ((p & 1) != 0)

      k_set(ret, ret * a);

    k_endif

    k_set(a, a * a);
    k_set(p, p >> 1);

  k_endwhile

  k_store(dst, ret);

spmd_kernel_end
```

### Returns

Returns cannot be fully emulated because macro overloading, in particular an
overload taking zero arguments, is not possible in a standard way. So
returning has to be done carefully. The `k_return` keyword has the same
semantics as the C++ `return` keyword without arguments and can be used at
will in kernels (as a kernel's return type is always `void`) and in device
functions returning `void`.

For device functions returning a value it is recommended to proceed this way:

1. Declare a variable, say `ret`, to store the return value.
2. Wherever you need to return, set the variable appropriately with `k_set`
   and return with `k_return`.
3. At the end of the function use `return ret;`.

```c++
spmd_dev_func(k_int func, k_int a)

  k_int ret;

  k_if (a == 0)
    k_set(ret, 0);
    k_return;
  k_endif

  k_if (a == 1)
    k_set(ret, -1);
    k_return;
  k_endif

  k_set(ret, a);

  return ret;

spmd_dev_func_end
```

## Advanced techniques and functions

This paragraph applies mainly when targeting CPUs. Using the techniques
described below won't affect GPUs.

If you are familiar with the SIMD technique of masking to emulate loops and
if's, you may know that `k_set` and `k_store` are implemented using
`nsimd::if_else` and `nsimd::maskz_storeu` respectively, which may incur
performance penalties. When you know that a simple assignment or store is
sufficient you may use the unmasked variants:

- `k_unmasked_set` translates into a C++ assignment.
- `k_unmasked_store` translates into a C++ SIMD store.

Their arguments are exactly the same as those of `k_set` and `k_store`.
Unmasked operations can usually be used at the beginning of device functions
and also inside loops, on temporary variables whose results won't be needed
later.

You may also use standard C++ keywords and constructs. But be aware that doing
so will apply the same treatment to all SIMD lanes. This can be useful
when the operations involved are independent of the processed data, as in the
example below.

```c++
spmd_dev_func(k_float newton_raphson_sqrt, k_float a, k_float x0)
  k_float ret;
  k_unmasked_set(ret, x0);                       // start from the initial guess x0
  for (int i = 0; i < 6; i++) {
    k_unmasked_set(ret, (ret + a / ret) / 2.0f); // Newton-Raphson step for sqrt(a)
  }
  return ret;
spmd_dev_func_end
```
''')