Beispiel #1
0
def gen_archis_write_file(opts, op, platform, simd_ext, simd_dir):
    """Write the header `<simd_dir>/<op.name>.h` for a single operator.

    Nothing is written when `common.can_create_filename` refuses the path.
    Otherwise the header consists of an include guard, the `types.h`
    include for `platform`/`simd_ext`, the additional include reported by
    the platform module and the code returned by `get_simd_implementation`.
    The written file is finally handed to `common.clang_format`.
    """
    header_path = os.path.join(simd_dir, '{}.h'.format(op.name))
    if not common.can_create_filename(opts, header_path):
        return
    platform_module = opts.platforms[platform]
    template = '''#ifndef {guard}
                     #define {guard}

                     #include <nsimd/{platform}/{simd_ext}/types.h>
                     {additional_include}

                     {code}

                     {hbar}

                     #endif
                     '''
    with common.open_utf8(opts, header_path) as out:
        # Values are computed once the file is open, in the same order as
        # they are listed here.
        fields = dict(
            additional_include=platform_module.get_additional_include(
                op.name, platform, simd_ext),
            year=date.today().year,
            guard=op.get_header_guard(platform, simd_ext),
            platform=platform,
            simd_ext=simd_ext,
            func=op.name,
            hbar=common.hbar,
            code=get_simd_implementation(opts, op, platform_module, simd_ext),
        )
        out.write(template.format(**fields))
    common.clang_format(opts, header_path)
Beispiel #2
0
def gen_bench(f, simd, typ):
    """Generate and write the benchmark source for `f` on `simd` / `typ`.

    Returns early, writing nothing, when the target path may not be created
    or when `gen_code` returns `None`. The written file is passed to
    `common.clang_format`.
    """
    ## TODO
    target = gen_filename(f, simd, typ)
    ## Skip targets that must not be (re)created
    if not common.can_create_filename(_opts, target):
        return
    ## Code specific to this bench; None means there is nothing to generate
    category = common.nsimd_category(simd)
    specific = gen_code(f, simd, typ, category=category)
    if specific is None:
        return
    ## Assemble the parts in order: CPU references, specific code, unrolls
    ## and finally the references against other libraries
    #bench += gen_bench_asm_function(f, typ, category)
    body = ('' + gen_bench_against(f, simd, typ, f.bench_against_cpu())
            + specific
            + gen_bench_unrolls(f, simd, typ, category)
            + gen_bench_against(f, simd, typ, f.bench_against_libs()))
    ## Wrap the parts into the final source text
    source = gen_bench_from_code(f, typ, body)
    ## Write the file
    with common.open_utf8(target) as fout:
        fout.write(source)
    ## Clang-format it!
    common.clang_format(_opts, target)
Beispiel #3
0
def gen_modules_md(opts):
    """Generate the `modules` markdown page listing every module.

    For each entry `mod` returned by `common.get_modules(opts)`, the name and
    description are taken from `mods[mod].<mod>.hatch`. The page is written
    only when `common.can_create_filename` accepts the target path.
    """
    common.myprint(opts, 'Generating modules.md')
    mods = common.get_modules(opts)
    ndms = []
    for mod in mods:
        # Plain attribute lookup: no need to build and eval() source code.
        hatch = getattr(mods[mod], mod).hatch
        ndms.append((hatch.name(), hatch.desc(), mod))
    filename = common.get_markdown_file(opts, 'modules')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Modules

NSIMD comes with several additional modules. A module provides a set of
functionnalities that are usually not at the same level as SIMD intrinsics
and/or that do not provide all C and C++ APIs. These functionnalities are
given with the library because they make heavy use of NSIMD core which
abstract SIMD intrinsics. Below is the exhaustive list of modules.

''')
        for name, desc, mod in ndms:
            fout.write('- [{}](module_{}_overview.md)  \n'.format(name, mod))
            # Each description line is stripped and indented by two spaces
            # so that it stays attached to the list item above.
            fout.write('\n'.join('  {}'.format(line.strip())
                                 for line in desc.split('\n')))
            fout.write('\n\n')
Beispiel #4
0
def doit(opts):
    """Generate one `<op>_ulp_<type>.cpp` file per MPFR-tested operator.

    Operators without `tests_mpfr`, and 'gammaln', 'lgamma' and 'pow', are
    skipped. One file is produced for every type in `common.ftypes`, unless
    `common.can_create_filename` refuses the path; each written file is
    passed to `common.clang_format`.
    """

    def settings_for(typ):
        # Returns (random_generator, convert_to_type, convert_from_type,
        #          mantisse, size, mpfr_suffix) for a floating point type.
        if typ == 'f16':
            return (random_f16_generator, "nsimd_f32_to_f16",
                    "nsimd_f16_to_f32", 10, 0xffff, "flt")
        if typ == 'f32':
            # size was 0xffffffff at some point
            return (random_f32_generator, "(f32)", "", 23, 0x00ffffff, "flt")
        if typ == 'f64':
            return (random_f64_generator, "(f64)", "", 52, 0x00ffffff, "d")
        raise Exception('Unsupported type "{}"'.format(typ))

    common.myprint(opts, 'Generating ulps')
    common.mkdir_p(opts.ulps_dir)
    for op_name, operator in operators.operators.items():
        if not operator.tests_mpfr or \
           op_name in ('gammaln', 'lgamma', 'pow'):
            continue

        mpfr_func = operator.tests_mpfr_name()
        mpfr_rnd = ", MPFR_RNDN"

        for typ in common.ftypes:
            (random_generator, convert_to_type, convert_from_type,
             mantisse, size, mpfr_suffix) = settings_for(typ)

            basename = '{}_{}_{}.cpp'.format(op_name, "ulp", typ)
            filename = os.path.join(opts.ulps_dir, basename)
            if not common.can_create_filename(opts, filename):
                continue

            with common.open_utf8(opts, filename) as out:
                out.write(includes)
                out.write(gen_tests.relative_distance_cpp)
                body = code.format(typ=typ,
                                   nsimd_func=op_name,
                                   mpfr_func=mpfr_func,
                                   mpfr_rnd=mpfr_rnd,
                                   random_generator=random_generator,
                                   convert_from_type=convert_from_type,
                                   convert_to_type=convert_to_type,
                                   mantisse=mantisse,
                                   SIZE=size,
                                   mpfr_suffix=mpfr_suffix)
                out.write(body)

            common.clang_format(opts, filename)
def doit(opts):
    """Generate `friendly_but_not_optimized.hpp` in `opts.include_dir`.

    One implementation (from `get_impl`) is emitted for every operator that
    has a C++ operator, exactly three entries in `params`, and is neither
    'shl' nor 'shr'. Nothing is written when `common.can_create_filename`
    refuses the path; the written file is passed to `common.clang_format`.
    """
    common.myprint(opts, 'Generating friendly but not optimized advanced '
                   'C++ API')
    filename = os.path.join(opts.include_dir, 'friendly_but_not_optimized.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        # format() is what collapses the doubled braces here; the template
        # itself does not reference `year`.
        out.write('''#ifndef NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP
                     #define NSIMD_FRIENDLY_BUT_NOT_OPTIMIZED_HPP

                     #include <nsimd/nsimd.h>
                     #include <nsimd/cxx_adv_api.hpp>

                     namespace nsimd {{

                     '''.format(year=date.today().year))
        for _, operator in operators.operators.items():
            # Identity test against None (PEP 8) instead of `== None`.
            if operator.cxx_operator is None or \
               len(operator.params) != 3 or \
               operator.name in ['shl', 'shr']:
                continue
            out.write('''{hbar}

                         {code}

                         '''.format(hbar=common.hbar, code=get_impl(operator)))
        out.write('''{hbar}

                     }} // namespace nsimd

                     #endif'''.format(hbar=common.hbar))
    common.clang_format(opts, filename)
Beispiel #6
0
def doit(opts):
    """Generate `functions.h` (base APIs) in `opts.include_dir`.

    For every operator the header receives an `NSIMD_AUTO_INCLUDE` line
    followed by the output of `get_c_base_generic` and
    `get_cxx_base_generic`; the output of `get_put_decl` closes the file.
    Nothing is written when `common.can_create_filename` refuses the path.
    """
    print('-- Generating base APIs')
    common.mkdir_p(opts.include_dir)
    header_path = os.path.join(opts.include_dir, 'functions.h')
    if not common.can_create_filename(opts, header_path):
        return

    prologue = '''#ifndef NSIMD_FUNCTIONS_H
                     #define NSIMD_FUNCTIONS_H

                     '''
    per_operator = '''{}

                         #include NSIMD_AUTO_INCLUDE({}.h)

                         {}

                         {}

                         '''
    epilogue = '''{hbar}

                     {put_decl}

                     {hbar}

                     #endif'''

    with common.open_utf8(header_path) as out:
        out.write(prologue.format(year=date.today().year))
        for _, operator in operators.operators.items():
            section = per_operator.format(common.hbar, operator.name,
                                          get_c_base_generic(operator),
                                          get_cxx_base_generic(operator))
            out.write(section)
        out.write(epilogue.format(hbar=common.hbar,
                                  put_decl=get_put_decl()))
    common.clang_format(opts, header_path)
Beispiel #7
0
def doit(opts):
    """Generate `cxx_adv_api_functions.hpp` in `opts.include_dir`.

    Only operators with `autogen_cxx_adv` set contribute a section, produced
    by `get_cxx_advanced_generic`. Nothing is written when
    `common.can_create_filename` refuses the path.
    """
    print('-- Generating advanced C++ API')
    header_path = os.path.join(opts.include_dir, 'cxx_adv_api_functions.hpp')
    if not common.can_create_filename(opts, header_path):
        return

    prologue = '''#ifndef NSIMD_CXX_ADV_API_FUNCTIONS_HPP
                     #define NSIMD_CXX_ADV_API_FUNCTIONS_HPP

                     namespace nsimd {{

                     '''
    section = '''{hbar}

                         {code}

                         '''
    epilogue = '''{hbar}

                     }} // namespace nsimd

                     #endif'''

    with common.open_utf8(opts, header_path) as out:
        out.write(prologue.format(year=date.today().year))
        for _, operator in operators.operators.items():
            if operator.autogen_cxx_adv:
                out.write(section.format(
                    hbar=common.hbar,
                    code=get_cxx_advanced_generic(operator)))
        out.write(epilogue.format(hbar=common.hbar))
    common.clang_format(opts, header_path)
Beispiel #8
0
def get_filename(opts, op, lf, rt):
    """Return the path of the fixed-point test `<op>.fp_<lf>_<rt>.cpp`.

    The `modules/fixed_point` directory under `opts.tests_dir` is created,
    and an already existing file of that name is removed first. Returns
    `None` when `common.can_create_filename` refuses the path.
    """
    out_dir = os.path.join(opts.tests_dir, "modules/fixed_point")
    common.mkdir_p(out_dir)
    basename = '{}.fp_{}_{}.cpp'.format(op, lf, rt)
    path = os.path.join(out_dir, basename)
    if os.path.exists(path):
        os.remove(path)
    return path if common.can_create_filename(opts, path) else None
Beispiel #9
0
def write_cpp(opts, simd_ext, emulate_fp16):
    """Write `api_<simd_ext>.cpp` in `opts.src_dir`.

    The file holds a fixed preamble followed by `get_put_impl(simd_ext)`.
    `emulate_fp16` is accepted but not read by this function. Nothing is
    written when `common.can_create_filename` refuses the path.
    """
    source_path = os.path.join(opts.src_dir, 'api_{}.cpp'.format(simd_ext))
    if not common.can_create_filename(opts, source_path):
        return
    preamble = '''#define NSIMD_INSIDE
                     #include <nsimd/nsimd.h>
                     #include <nsimd/cxx_adv_api.hpp>

                     '''
    with common.open_utf8(opts, source_path) as out:
        out.write(preamble.format(year=date.today().year))
        out.write(get_put_impl(simd_ext))
    common.clang_format(opts, source_path)
Beispiel #10
0
def copy_github_file_to_doc(opts, github_filename, doc_filename):
    """Copy `github_filename` to `doc_filename`, rewriting `doc/` links.

    Every 'doc/markdown/' and then every remaining 'doc/' in the content is
    replaced by 'nsimd/'. Both files are handled as UTF-8. Nothing is read
    or written when `common.can_create_filename` refuses `doc_filename`.
    """
    message = 'Copying {} ---> {}'.format(github_filename, doc_filename)
    common.myprint(opts, message)
    if not common.can_create_filename(opts, doc_filename):
        return
    with io.open(github_filename, mode='r', encoding='utf-8') as fin:
        text = fin.read()
    # Links to doc/... become nsimd/...; the longer prefix must go first.
    text = text.replace('doc/markdown/', 'nsimd/').replace('doc/', 'nsimd/')
    # common.open_utf8 is deliberately not used here: per the original note,
    # the copyright is already part of the copied content.
    with io.open(doc_filename, mode='w', encoding='utf-8') as fout:
        fout.write(text)
Beispiel #11
0
def gen_archis_write_put(opts, platform, simd_ext, simd_dir):
    """Write `put.h` in `simd_dir` with the `put` declarations of `simd_ext`.

    The header declares `nsimd_put_<simd_ext>_<typ>` and a C++ `nsimd::put`
    overload for every type in `common.types`. The CPU `put.h` is included
    unless `simd_ext` is 'cpu'. Nothing is written when
    `common.can_create_filename` refuses the path.
    """
    header_path = os.path.join(simd_dir, 'put.h')
    if not common.can_create_filename(opts, header_path):
        return

    prologue = \
        '''#ifndef NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H
           #define NSIMD_{PLATFORM}_{SIMD_EXT}_PUT_H

           {include_cpu_put}#include <nsimd/{platform}/{simd_ext}/types.h>
           #include <stdio.h>

           {hbar}

           '''
    per_type = \
            '''#if NSIMD_CXX > 0
               extern "C" {{
               #endif

               NSIMD_DLLSPEC
               int nsimd_put_{simd_ext}_{typ}(FILE *, const char *,
                                              nsimd_{simd_ext}_v{typ});

               #if NSIMD_CXX > 0
               }} // extern "C"
               #endif

               #if NSIMD_CXX > 0
               namespace nsimd {{
               NSIMD_INLINE int put(FILE *out, const char *fmt,
                                    nsimd_{simd_ext}_v{typ} a0, {typ},
                                    {simd_ext}) {{
                 return nsimd_put_{simd_ext}_{typ}(out, fmt, a0);
               }}
               }} // namespace nsimd
               #endif

               {hbar}

               '''

    with common.open_utf8(header_path) as out:
        if simd_ext != 'cpu':
            cpu_include = '#include <nsimd/cpu/cpu/put.h>\n'
        else:
            cpu_include = ''
        out.write(prologue.format(year=date.today().year, hbar=common.hbar,
                                  simd_ext=simd_ext, platform=platform,
                                  PLATFORM=platform.upper(),
                                  SIMD_EXT=simd_ext.upper(),
                                  include_cpu_put=cpu_include))
        for typ in common.types:
            out.write(per_type.format(simd_ext=simd_ext, hbar=common.hbar,
                                      typ=typ))
        out.write('#endif')
    common.clang_format(opts, header_path)
Beispiel #12
0
def get_filename(opts, op, typ, lang):
    """Return the path of the test source for `op` and `typ` in API `lang`.

    The file lives in `<opts.tests_dir>/<lang>` (created if needed) and is
    named `<op.name>.<typ>.c` when `lang` is 'c_base', `.cpp` otherwise.
    Returns `None` when `common.can_create_filename` refuses the path.
    """
    tests_dir = os.path.join(opts.tests_dir, lang)
    common.mkdir_p(tests_dir)
    extension = 'c' if lang == 'c_base' else 'cpp'
    filename = os.path.join(
        tests_dir, '{}.{}.{}'.format(op.name, typ, extension))
    if common.can_create_filename(opts, filename):
        return filename
    return None
Beispiel #13
0
def gen_doc_api(opts):
    """Generate the SPMD module API reference markdown page.

    Operators with `has_scalar_impl` set are grouped by each of their
    categories; every group becomes a `##` section listing the signature
    (from `get_signature`), the infix operator when there is one, and the
    operator description. Nothing is written when
    `common.can_create_filename` refuses the path.
    """
    filename = common.get_markdown_file(opts, 'api', 'spmd')
    if not common.can_create_filename(opts, filename):
        return

    # Build tree for api.md: category -> operators, in insertion order
    api = dict()
    for _, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue
        for c in operator.categories:
            api.setdefault(c, []).append(operator)

    with common.open_utf8(opts, filename) as fout:
        fout.write(
'''# NSIMD SPMD API reference

This page contains the exhaustive API of the SPMD module. Note that most
operators names follow the simple naming `k_[NSIMD name]` and have the same
semantics. This page is light, you may use CTRL+F to find the operator you
are looking for.

For genericity on the base type you should use operator names instead of
infix operators, e.g. `k_add` instead of `+`. Indeed for `f16`'s NVIDIA CUDA
and NSIMD do not provide overloads and therefore code using `+` will fail to
compile.

Note that all operators accept literals and scalars. For example you may
write `k_add(a, 1)` or `float s; k_add(a, s);`. This also applies when
using infix operators. But note that literals or scalars must have the
same type as the other operands.

''')

        for c, ops in api.items():
            if not ops:
                continue
            fout.write('\n## {}\n\n'.format(c.title))
            for op in ops:
                fout.write('- `{}`  \n'.format(get_signature(op)))
                # Identity test against None (PEP 8) instead of `!= None`.
                if op.cxx_operator is not None:
                    fout.write('  Infix operator: `{}` ' \
                               '(*for certain types only*)  \n'.\
                               format(op.cxx_operator))
                fout.write('  {}\n\n'.format(op.desc))
Beispiel #14
0
def write_cpp(opts, simd_ext, emulate_fp16):
    """Write `api_<simd_ext>.cpp` in `opts.src_dir`.

    After a fixed preamble, every operator with a truthy `src` contributes
    an include of `nsimd/src/<name>.hpp` followed by
    `get_impl(operator, emulate_fp16, simd_ext)`. The file ends with
    `get_put_impl(simd_ext)`. Nothing is written when
    `common.can_create_filename` refuses the path.
    """
    source_path = os.path.join(opts.src_dir, 'api_{}.cpp'.format(simd_ext))
    if not common.can_create_filename(opts, source_path):
        return

    preamble = '''#define NSIMD_INSIDE
                     #include <nsimd/nsimd.h>
                     #include <nsimd/cxx_adv_api.hpp>

                     '''
    src_include = '''{hbar}

                             #include <nsimd/src/{name}.hpp>

                             '''

    with common.open_utf8(opts, source_path) as out:
        out.write(preamble.format(year=date.today().year))
        for _, operator in operators.operators.items():
            if not operator.src:
                continue
            out.write(src_include.format(name=operator.name,
                                         hbar=common.hbar))
            out.write(get_impl(operator, emulate_fp16, simd_ext))
        out.write(get_put_impl(simd_ext))

    common.clang_format(opts, source_path)
Beispiel #15
0
def doit(opts):
    """Generate `cxx_adv_api_functions.hpp` in `opts.include_dir`.

    Operators with `autogen_cxx_adv` set contribute a section from
    `get_cxx_advanced_generic`. When such an operator also has a C++
    operator and its `args` are ['v', 'v'] or ['v', 'p'], the output of
    `gen_assignment_operators` is appended. Nothing is written when
    `common.can_create_filename` refuses the path.
    """
    common.myprint(opts, 'Generating advanced C++ API')
    header_path = os.path.join(opts.include_dir, 'cxx_adv_api_functions.hpp')
    if not common.can_create_filename(opts, header_path):
        return

    # Written as-is (no format()), hence the single brace.
    prologue = '''#ifndef NSIMD_CXX_ADV_API_FUNCTIONS_HPP
                     #define NSIMD_CXX_ADV_API_FUNCTIONS_HPP

                     namespace nsimd {

                     '''
    section = '''{hbar}

                         {code}

                         '''
    epilogue = '''{hbar}

                     }} // namespace nsimd

                     #endif'''

    with common.open_utf8(opts, header_path) as out:
        out.write(prologue)

        for _, operator in operators.operators.items():
            if not operator.autogen_cxx_adv:
                continue

            out.write(section.format(
                hbar=common.hbar,
                code=get_cxx_advanced_generic(operator)))

            wants_assignment = operator.cxx_operator and \
                (operator.args in [['v', 'v'], ['v', 'p']])
            if wants_assignment:
                out.write('{hbar}\n{code}'.format(
                    hbar=common.hbar,
                    code=gen_assignment_operators(operator)))

        out.write(epilogue.format(hbar=common.hbar))
    common.clang_format(opts, header_path)
Beispiel #16
0
def gen_doc(opts):
    """Generate the overview markdown page of the random module.

    The API section lists one signature per combination of word size,
    number of words and number of rounds declared by each entry of
    `rand_functions`. Nothing is written when `common.can_create_filename`
    refuses the path.
    """
    entries = []
    for func in rand_functions:
        for word_size, nwords_nrounds in func.wordsize_nwords_nrounds.items():
            for nwords, list_nrounds in nwords_nrounds.items():
                for nrounds in list_nrounds:
                    signature = func.gen_signature(nwords, word_size, nrounds)
                    entries.append('- `' + signature + '`;  \n')
                    entries.append('  Returns a random number using the '
                                   '{func_name} generator\n\n'.format(
                                       func_name=func.name))
    api = ''.join(entries)

    res = '''
# NSIMD Random module overview

{desc}

Two different algorithms are proposed : threefry and philox. Both should give
high quality random number.
Threefry is quicker on CPU, while philox is best used on GPU.

Both algorithms are counter based pseudorandom number generator, meaning that
they need two parameters:
- a key, each key will generate an unique sequence,
- a counter, which will give the different numbers in the sequence.

# NSIMD Random API reference

{api}
'''.format(desc=desc(), api=api)

    target = common.get_markdown_file(opts, 'overview', 'random')
    if common.can_create_filename(opts, target):
        with common.open_utf8(opts, target) as fout:
            fout.write(res)
Beispiel #17
0
def gen_what_is_wrapped(opts):
    """Generate the "wrapped intrinsics" markdown pages and their index.

    After `build_exe_for_doc`, the `what_is_wrapped` executable is expected
    in the `doc` directory next to `opts.script_dir`; the function returns
    when it is missing. For each SIMD extension listed in the x86, Arm and
    PowerPC lists of `common`, a page is started and the executable is run
    four times through the shell, each run appending to that page. A run
    that fails is reported and the remaining runs for that extension are
    skipped. The index page links every extension, grouped by platform.
    """
    common.myprint(opts, 'Generating "which intrinsics are wrapped"')
    build_exe_for_doc(opts)
    if platform.system() == 'Windows':
        wrapped = 'what_is_wrapped.exe'
    else:
        wrapped = 'what_is_wrapped'
    doc_dir = os.path.join(opts.script_dir, '..', 'doc')
    full_path_wrapped = os.path.join(doc_dir, wrapped)
    if not os.path.isfile(full_path_wrapped):
        common.myprint(opts, '{} not found'.format(wrapped))
        return

    # Content of the index page built along the way
    index = '# Intrinsics that are wrapped\n'

    # Common prefix of every command line
    cmd0 = '{} {},{},{},{},{},{}'.format(full_path_wrapped, common.in0,
                                         common.in1, common.in2, common.in3,
                                         common.in4, common.in5)

    # (mode given to the executable, index of the operator bucket), in the
    # order the runs are made
    passes = (
        ('same', common.OUTPUT_TO_SAME_TYPE),
        ('same_size', common.OUTPUT_TO_SAME_SIZE_TYPES),
        ('bigger_size', common.OUTPUT_TO_UP_TYPES),
        ('lesser_size', common.OUTPUT_TO_DOWN_TYPES),
    )

    # For now we only list Intel, Arm and POWERPC intrinsics
    simd_exts = common.x86_simds + common.arm_simds + common.ppc_simds
    for p in common.get_platforms(opts):
        index_simds = ''
        for simd_ext in opts.platforms_list[p].get_simd_exts():
            if simd_ext not in simd_exts:
                continue
            md = os.path.join(common.get_markdown_dir(opts),
                              'wrapped_intrinsics_for_{}.md'.format(simd_ext))
            index_simds += '- [{}](wrapped_intrinsics_for_{}.md)\n'.format(
                simd_ext.upper(), simd_ext)

            # Operators without `src`, bucketed by their `output_to` value
            ops = [[], [], [], []]
            for op_name, operator in operators.items():
                if not operator.src:
                    c_src = os.path.join(opts.include_dir, p, simd_ext,
                                         '{}.h'.format(op_name))
                    ops[operator.output_to].append(
                        '{} "{}"'.format(op_name, c_src))

            if not common.can_create_filename(opts, md):
                continue
            with common.open_utf8(opts, md) as fout:
                fout.write('# Intrinsics wrapped for {}\n\n'.format(
                    simd_ext.upper()))
                fout.write('Notations are as follows:\n'
                           '- `T` for trick usually using other intrinsics\n'
                           '- `E` for scalar emulation\n'
                           '- `NOOP` for no operation\n'
                           '- `NA` means the operator does not exist for '
                           'the given type\n'
                           '- `intrinsic` for the actual wrapped intrinsic\n'
                           '\n')

            for mode, bucket in passes:
                cmd = '{} {} {} {} >> "{}"'.format(
                    cmd0, simd_ext, mode, ' '.join(ops[bucket]), md)
                if os.system(cmd) != 0:
                    common.myprint(
                        opts, 'Unable to generate markdown for '
                        '"{}"'.format(mode))
                    # Give up on this extension, move on to the next one
                    break

        if index_simds != '':
            index += '\n## Platform {}\n\n'.format(p)
            index += index_simds

    md = os.path.join(common.get_markdown_dir(opts), 'wrapped_intrinsics.md')
    if common.can_create_filename(opts, md):
        with common.open_utf8(opts, md) as fout:
            fout.write(index)
Beispiel #18
0
def gen_tests_for_shifts(opts, t, operator):
    """Write the tet1d test `<operator.name>.<t>.cpp` for a shift operator.

    The generated program compares `tet1d::<op>` against a reference
    (CUDA kernel, ROCm kernel or scalar loop depending on the build) for
    every shift amount from 0 up to, but excluding, the number taken from
    `t[1:]`. The file goes to `modules/tet1d` under `opts.tests_dir` and is
    formatted with `common.clang_format(..., cuda=True)`. Nothing is written
    when `common.can_create_filename` refuses the path.
    """
    op_name = operator.name
    out_dir = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(out_dir)
    test_path = os.path.join(out_dir, '{}.{}.cpp'.format(op_name, t))
    if not common.can_create_filename(opts, test_path):
        return

    template = '''#include <nsimd/modules/tet1d.hpp>
        #include <nsimd/modules/memory_management.hpp>
        #include "../common.hpp"

        #if defined(NSIMD_CUDA)

        __global__ void kernel({t} *dst, {t} *tab0, int n, int s) {{
          int i = threadIdx.x + blockIdx.x * blockDim.x;
          if (i < n) {{
            dst[i] = nsimd::gpu_{op_name}(tab0[i], s);
          }}
        }}

        void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{
          kernel<<<{gpu_params}>>>(dst, tab0, int(n), s);
        }}

        #elif defined(NSIMD_ROCM)

        __global__ void kernel({t} *dst, {t} *tab0, size_t n, int s) {{
          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
          if (i < n) {{
            dst[i] = nsimd::gpu_{op_name}(tab0[i], s);
          }}
        }}

        void compute_result({t} *dst, {t} *tab0, size_t n, int s) {{
          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, tab0, n, s);
        }}

        #else

        void compute_result({t} *dst, {t} *tab0, unsigned int n, int s) {{
          for (unsigned int i = 0; i < n; i++) {{
            dst[i] = nsimd_scalar_{op_name}_{t}(tab0[i], s);
          }}
        }}

        #endif

        nsimd_fill_dev_mem_func(prng5,
            1 + (((unsigned int)i * 69342380 + 414585) % 5))

        int main() {{
          unsigned int n_[3] = {{ 10, 1001, 10001 }};
          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
            unsigned int n = n_[i];
            for (int s = 0; s < {typnbits}; s++) {{
              int ret = 0;
              {t} *tab0 = nsimd::device_calloc<{t}>(n);
              prng5(tab0, n);
              {t} *ref = nsimd::device_calloc<{t}>(n);
              {t} *out = nsimd::device_calloc<{t}>(n);
              compute_result(ref, tab0, n, s);
              tet1d::out(out) = tet1d::{op_name}(tet1d::in(tab0, n), s);
              if (!cmp(ref, out, n)) {{
                ret = -1;
              }}
              nsimd::device_free(ref);
              nsimd::device_free(out);
              nsimd::device_free(tab0);
              if (ret != 0) {{
                return ret;
              }}
            }}
          }}
          return 0;
        }}
        '''

    with common.open_utf8(opts, test_path) as out:
        # t[1:] drops the first character of the type name and is used as
        # the exclusive upper bound of the shift loop.
        source = template.format(gpu_params=gpu_params, op_name=op_name,
                                 t=t, typnbits=t[1:])
        out.write(source)
    common.clang_format(opts, test_path, cuda=True)
Beispiel #19
0
def doit(opts):
    """Generate `scalar_utilities.h` in `opts.include_dir`.

    The header starts with forward declarations of the scalar (CPU) and GPU
    reinterpret functions, then contains, for every operator with
    `has_scalar_impl` set, a C implementation, a C++ wrapper in namespace
    `nsimd` and the output of `get_gpu_impl`. Nothing is written when
    `common.can_create_filename` refuses the path; the written file is
    passed to `common.clang_format`.
    """
    common.myprint(opts, 'Generating scalar implementation for CPU and GPU')
    filename = os.path.join(opts.include_dir, 'scalar_utilities.h')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        # we declare reinterprets now as we need them
        scalar_tmp = []
        gpu_tmp = []
        for t in operators.Reinterpret.types:
            for tt in common.get_output_types(t,
                                              operators.Reinterpret.output_to):
                scalar_tmp += [operators.Reinterpret(). \
                               get_scalar_signature('cpu', t, tt, 'c')]
                gpu_tmp += [operators.Reinterpret(). \
                            get_scalar_signature('gpu', t, tt, 'cxx')]
        # One declaration per line: signature prefixed by the inline keyword
        # and terminated by ';'
        scalar_reinterpret_decls = '\n'.join(['NSIMD_INLINE ' + sig + ';' \
                                              for sig in scalar_tmp])
        gpu_reinterpret_decls = '\n'.join(['inline ' + sig + ';' \
                                           for sig in gpu_tmp])
        out.write(
        '''#ifndef NSIMD_SCALAR_UTILITIES_H
           #define NSIMD_SCALAR_UTILITIES_H

           #if NSIMD_CXX > 0
           #include <cmath>
           #include <cstring>
           #else
           #include <math.h>
           #include <string.h>
           #endif

           #ifdef NSIMD_NATIVE_FP16
             #if defined(NSIMD_IS_GCC)
               #pragma GCC diagnostic push
               #pragma GCC diagnostic ignored "-Wdouble-promotion"
             #elif defined(NSIMD_IS_CLANG)
               #pragma clang diagnostic push
               #pragma clang diagnostic ignored "-Wdouble-promotion"
             #endif
           #endif

           {scalar_reinterpret_decls}

           #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)

           namespace nsimd {{

           {gpu_reinterpret_decls}

           }} // namespace nsimd

           #endif
           '''. \
           format(scalar_reinterpret_decls=scalar_reinterpret_decls,
                  gpu_reinterpret_decls=gpu_reinterpret_decls))
        # One section per operator that has a scalar implementation
        for op_name, operator in operators.operators.items():
            if not operator.has_scalar_impl:
                continue
            # Operators whose parameters are all 'l' get a single
            # implementation: signatures are requested with empty type
            # strings and the C function name carries no type suffix.
            # NOTE(review): `t` and `tt` are not assigned in this branch;
            # they hold whatever an earlier loop last bound them to.
            # Confirm the get_impl() helpers ignore them for these operators.
            if operator.params == ['l'] * len(operator.params):
                out.write('\n\n' + common.hbar + '\n\n')
                out.write(\
                '''NSIMD_INLINE {c_sig} {{
                  {scalar_impl}
                }}

                #if NSIMD_CXX > 0

                namespace nsimd {{

                NSIMD_INLINE {cxx_sig} {{
                  return nsimd_scalar_{op_name}({c_args});
                }}

                {gpu_impl}

                }} // namespace nsimd

                #endif'''.format(
                c_sig=operator.get_scalar_signature('cpu', '', '', 'c'),
                cxx_sig=operator.get_scalar_signature('cpu', '', '', 'cxx'),
                op_name=op_name,
                c_args=', '.join(['a{}'.format(i - 1) \
                               for i in range(1, len(operator.params))]),
                scalar_impl=scalar.get_impl(operator, tt, t),
                gpu_impl=get_gpu_impl(
                    operator.get_scalar_signature('gpu', t, tt, 'cxx'),
                    cuda.get_impl(operator, tt, t),
                    rocm_impl=rocm.get_impl(operator, tt, t))))
                continue
            # All other operators: one implementation per pair of input
            # type `t` and output type `tt`
            for t in operator.types:
                tts = common.get_output_types(t, operator.output_to)
                for tt in tts:
                    out.write('\n\n' + common.hbar + '\n\n')
                    # The C function suffix is `t` for closed operators and
                    # `<tt>_<t>` otherwise; the forwarded arguments are
                    # a0 .. a(len(params) - 2).
                    out.write(\
                    '''NSIMD_INLINE {c_sig} {{
                      {scalar_impl}
                    }}

                    #if NSIMD_CXX > 0

                    namespace nsimd {{

                    NSIMD_INLINE {cxx_sig} {{
                      return nsimd_scalar_{op_name}_{suffix}({c_args});
                    }}

                    {gpu_impl}

                    }} // namespace nsimd

                    #endif'''.format(
                    c_sig=operator.get_scalar_signature('cpu', t, tt, 'c'),
                    cxx_sig=operator.get_scalar_signature('cpu', t, tt, 'cxx'),
                    op_name=op_name,
                    suffix=t if operator.closed else '{}_{}'.format(tt, t),
                    c_args=', '.join(['a{}'.format(i - 1) \
                                   for i in range(1, len(operator.params))]),
                    scalar_impl=scalar.get_impl(operator, tt, t),
                    gpu_impl=get_gpu_impl(
                        operator.get_scalar_signature('gpu', t, tt, 'cxx'),
                        cuda.get_impl(operator, tt, t),
                        rocm_impl=rocm.get_impl(operator, tt, t))))

        # Close the diagnostic push opened in the prologue and the include
        # guard
        out.write('''

                  {hbar}

                  #ifdef NSIMD_NATIVE_FP16
                    #if defined(NSIMD_IS_GCC)
                      #pragma GCC diagnostic pop
                    #elif defined(NSIMD_IS_CLANG)
                      #pragma clang diagnostic pop
                    #endif
                  #endif

                  #endif'''.format(hbar=common.hbar))
    common.clang_format(opts, filename)
Beispiel #20
0
def gen_doc(opts):
    """Dump the JSON documentation nodes into `<script_dir>/../doc`.

    The following files are produced:

    - `root.json`: the root node of the documentation tree;
    - `<category>.json`: one node per entry of `categories`;
    - `c_base.json`, `cxx_base.json`, `cxx_adv.json`: one node per API;
    - `<operator>.json`: for every operator, a list made of one "withdoc"
      node carrying the description followed by one node per
      (API, SIMD extension, type) signature and one node per generic
      signature.

    Root, category and API files are only written when
    `common.can_create_filename` allows it. Operator files are filtered by
    `opts.match` (when set) and are otherwise always rewritten.

    Args:
        opts: generator options; this function reads `opts.script_dir` and
            `opts.match`, and forwards `opts` to
            `common.can_create_filename`.
    """
    sys.stdout.write('-- Generating doc for each functions\n')
    dirname = os.path.join(opts.script_dir, '..', 'doc')
    common.mkdir_p(dirname)

    def dump_json(path, payload):
        # Single place where a node (or list of nodes) is serialized.
        with io.open(path, mode='w', encoding='utf-8') as fout:
            fout.write(json.dumps(payload, ensure_ascii=False))

    def to_list(var):
        # Normalize a signature (one string or an iterable of strings) into
        # a list of strings in which every run of whitespace is collapsed
        # into a single space.
        if isinstance(var, str) or not hasattr(var, '__iter__'):
            items = [var]
        else:
            items = list(var)
        return [re.sub('[ \n\t\r]+', ' ', item) for item in items]

    # NOTE: the insertion order of the keys below is the order in which they
    # appear in the emitted JSON, so it is kept as it was.

    # Root node first
    filename = os.path.join(dirname, 'root.json')
    if common.can_create_filename(opts, filename):
        obj = collections.OrderedDict()
        obj['title'] = 'Root node'
        obj['sig'] = []
        obj['lang'] = ''
        obj['categories'] = []
        obj['desc'] = []
        obj['parent'] = ''
        obj['id'] = '/'
        obj['type'] = 'root'
        dump_json(filename, obj)

    # Categories
    for name, cat in categories.items():
        filename = os.path.join(dirname, '{}.json'.format(name))
        ## Check if we need to create the file
        if not common.can_create_filename(opts, filename):
            continue

        obj = collections.OrderedDict()
        obj['title'] = cat.title
        obj['sig'] = []
        obj['lang'] = ''
        obj['categories'] = []
        obj['desc'] = []
        obj['parent'] = '/'
        obj['id'] = '/{}'.format(name)
        obj['type'] = 'category'
        dump_json(filename, obj)

    # APIs
    api_titles = collections.OrderedDict([
        ('c_base', 'C API'),
        ('cxx_base', 'C++ base API'),
        ('cxx_adv', 'C++ advanced API'),
    ])
    for api, title in api_titles.items():
        filename = os.path.join(dirname, '{}.json'.format(api))
        if not common.can_create_filename(opts, filename):
            continue
        node = collections.OrderedDict()
        node['title'] = title
        node['id'] = '/{}'.format(api)
        node['parent'] = '/'
        node['sig'] = []
        node['type'] = ''
        node['desc'] = []
        node['categories'] = []
        # The API identifier of the C API is 'c_base'; comparing against 'c'
        # never matched and tagged the C API node as 'C++'.
        node['lang'] = 'C' if api == 'c_base' else 'C++'
        dump_json(filename, node)

    # Operators (one file per operator otherwise too much files)
    for op_name, operator in operators.items():
        ## Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue

        filename = os.path.join(dirname, '{}.json'.format(op_name))
        cats = ['/{}'.format(c.name) for c in operator.categories]
        withdoc_id = '/{}'.format(op_name)
        doc_blocks = []

        # All is withdoc'ed with this docblock which has no sig: it only
        # carries the description and the title of the operator.
        obj = collections.OrderedDict()
        obj['id'] = withdoc_id
        obj['desc'] = [operator.desc]
        obj['sig'] = []
        obj['parent'] = '/'
        obj['categories'] = cats
        obj['type'] = 'function'
        obj['title'] = operator.full_name
        obj['lang'] = ''
        doc_blocks.append(obj)

        # All base C/C++ functions (for each architecture and type)
        for api in ['c_base', 'cxx_base']:
            for simd_ext in common.simds:
                for typ in operator.types:
                    obj = collections.OrderedDict()
                    obj['id'] = '/{}-{}-{}-{}'.format(op_name, api, simd_ext,
                                                      typ)
                    obj['desc'] = []
                    obj['parent'] = '/{}'.format(api)
                    obj['categories'] = cats
                    obj['type'] = 'function'
                    obj['withdoc'] = withdoc_id
                    obj['sig'] = to_list(
                        operator.get_signature(typ, api, simd_ext))
                    obj['title'] = ''
                    obj['lang'] = common.ext_from_lang(api)
                    doc_blocks.append(obj)

        # C/C++ base/advanced generic functions
        for api in ['c_base', 'cxx_base', 'cxx_adv']:
            generic_sig = operator.get_generic_signature(api)
            if api == 'cxx_adv':
                # For the advanced API the signatures are the values of the
                # returned mapping.
                generic_sig = generic_sig.values()
            obj = collections.OrderedDict()
            obj['id'] = '/{}-{}'.format(op_name, api)
            obj['desc'] = []
            obj['parent'] = '/{}'.format(api)
            obj['categories'] = cats
            obj['type'] = 'function'
            obj['withdoc'] = withdoc_id
            obj['sig'] = to_list(generic_sig)
            obj['title'] = ''
            obj['lang'] = common.ext_from_lang(api)
            doc_blocks.append(obj)

        # Finally dump JSON
        dump_json(filename, doc_blocks)
Beispiel #21
0
def _write_operator_notes(fout, operator, with_cxx_operator=False):
    """Write the optional notes that follow an operator signature in DOC.md.

    The notes are, in this order: the domain of the arguments (when the
    operator has a domain and at least one argument), the C++ operator the
    function is available as (only when `with_cxx_operator` is true) and the
    list of supported types (when it is a strict subset of `common.types`).
    """
    nb_args = len(operator.params[1:])
    if operator.domain and nb_args > 0:
        if nb_args == 1:
            fout.write('  a0 ∈ {}\n'.format(operator.domain))
        else:
            names = ', '.join('a' + str(i) for i in range(nb_args))
            fout.write('  ({}) ∈ {}\n'.format(names, operator.domain))

    if with_cxx_operator and operator.cxx_operator:
        fout.write('  Available as {}\n'.format(operator.cxx_operator))

    if len(operator.types) < len(common.types):
        typs = ', '.join('{}'.format(t) for t in operator.types)
        fout.write('  Only available for {}\n'.format(typs))


def gen_readme(opts):
    """Generate `<script_dir>/../DOC.md`, the overview of the NSIMD APIs.

    The file lists the scalar and SIMD vector types, the platforms and their
    SIMD extensions, the naming conventions of the C/C++ base APIs with the
    list of available functions, and finally the C++ advanced API (packs)
    with the list of functions acting on packs.

    Nothing is written when `common.can_create_filename` refuses the file.

    Args:
        opts: generator options; this function reads `opts.script_dir` and
            forwards `opts` to `common.can_create_filename` and
            `common.get_platforms`.
    """
    print('-- Generating documentation in DOC.md')
    filename = os.path.join(opts.script_dir, '..', 'DOC.md')
    if not common.can_create_filename(opts, filename):
        return
    with io.open(filename, mode='w', encoding='utf-8') as fout:
        fout.write('''## NSIMD scalar types

Their names follow the following pattern: `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integers and `f` for
  floating point numbers.
- `xx` is the number of bits taken to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''

## NSIMD SIMD vector types

Their names follow the following pattern: `vSCALAR` where `SCALAR` is
one of the scalar types listed above. For example `vi8` means a SIMD vector
containing `i8`'s.

Full list of SIMD vector types:

''')
        for t in common.types:
            fout.write('- `v{}`\n'.format(t))
        fout.write('''

## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not* need
to include a header file for having a function. In NSIMD, we call a platform
an architecture e.g. Intel, ARM, POWERPC. We call SIMD extension a set of
low-level functions and types provided to access a given SIMD extension.
Examples include SSE2, SSE42, AVX, ...

Here is a list of supported platforms and their corresponding SIMD extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each simd extension has its own set of SIMD types and functions. Types follow
the following pattern: `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

There are also logical types associated to each SIMD vector type. These types
are used to represent the result of a comparison of SIMD vectors. They are
usually bit masks. Their names follow the following pattern:
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

Note 1: Platform `cpu` is scalar fallback when no SIMD extension has been
specified.

Note 2: as all SIMD extensions of all platforms are different there is no
need to put the name of the platform in each identifier.

Function names follow the following pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR`
where

- `SIMDEXT` is the SIMD extensions.
- `FUNCNAME` is the name of a function e.g. `add` or `sub`.
- `SCALAR` is one of scalar types listed above.

### Generic identifier

In C, genericity is achieved using macros.

- `vec(SCALAR)` represents the SIMD vector type containing SCALAR elements.
  SCALAR must be one of scalar types listed above.
- `vecl(SCALAR)` represents the SIMD vector of logicals type containing SCALAR
  elements. SCALAR must be one of scalar types listed above.
- `vec_e(SCALAR)` represents the SIMD vector type containing SCALAR elements.
  SCALAR must be one of scalar types listed above.
- `vecl_e(SCALAR)` represents the SIMD vector of logicals type containing
  SCALAR elements. SCALAR must be one of scalar types listed above.
- `vFUNCNAME` is the macro name to access the function FUNCNAME e.g. `vadd`,
  `vsub`.
- `vFUNCNAME_e` is the macro name to access the function FUNCNAME e.g.
  `vadd_e`, `vsub_e`.

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type for
  platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of logicals
  type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of
  SIMD extensions listed above, SCALAR is one of scalar type listed above.

In C++11 and beyond, type traits are still available but typedefs are also
provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

Note that all macro and functions available in plain C are still available in
C++.

### List of functions available for manipulation of SIMD vectors

For each FUNCNAME a C function (also available in C++)
named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless
specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_e` is
available and takes as its last two arguments a SCALAR type and a SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is
available. It takes as its last argument the SCALAR type and can optionally
take the SIMDEXT as its very last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are the
possibilities:

    c = nsimd_add_avx_f32(a, b); // use AVX
    c = nsimd::add(a, b, f32()); // use detected SIMDEXT
    c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
    c = vadd(a, b, f32); // use detected SIMDEXT
    c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX

Here is a list of available FUNCNAME.

''')
        # One bullet per operator: generic signature followed by its notes.
        for operator in operators.values():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            args = ', '.join(
                common.get_one_type_generic(p, 'SCALAR') + ' a' + str(count)
                for count, p in enumerate(operator.params[1:]))
            fout.write('- `{} {}({});`\n'.format(return_typ, operator.name,
                                                 args))
            _write_operator_notes(fout, operator)
        fout.write('''

## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or above
but because it makes use of the particular implementation of ARM SVE by ARM
in their compiler. We do not know if GCC (and possibly MSVC in the distant
future) will use the same approach. Anyway the current implementation allows
us to put SVE SIMD vectors inside some kind of structs that behave like
standard structs. If you want to be sure to write portable code do *not* use
this API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors containing
  SCALAR elements of SIMD extension SIMDEXT. You can specify only the first
  template argument. The second defaults to 1 while the third defaults to the
  detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of logical
  type containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the third
  defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is particularly
useful on ARM.

Functions that take packs do not take any other argument unless specified
otherwise e.g. the load family of functions. It is impossible to determine
the kind of pack (unroll and SIMDEXT) from the type of a pointer. Therefore
in this case, the last argument must be a pack and this same type will then
return. Also some functions are available as C++ operators.

Here is the list of functions that act on packs.

''')
        # One bullet per operator: pack signature followed by its notes.
        for operator in operators.values():
            in_params = operator.params[1:]
            return_typ = common.get_one_type_pack(operator.params[0], 1, 'N')
            args = ', '.join(
                common.get_one_type_pack(p, 0, 'N') + ' a' + str(count)
                for count, p in enumerate(in_params))
            # When no argument is a vector ('v') or a logical ('l'), an extra
            # trailing pack argument is added to the documented signature.
            if 'v' not in in_params and 'l' not in in_params:
                extra = 'pack<T, N, SimdExt> const&'
                args = args + ', ' + extra if args != '' else extra
            fout.write('- `{} {}({});`\n'.format(return_typ, operator.name,
                                                 args))
            _write_operator_notes(fout, operator, with_cxx_operator=True)
Beispiel #22
0
def gen_doc_overview(opts):
    """Write the Markdown overview page of the tet1d module.

    The destination is given by
    `common.get_markdown_file(opts, 'overview', 'tet1d')`; nothing is written
    when `common.can_create_filename` refuses it.

    The page is a single `str.format` template: literal C++ braces are
    therefore doubled (`{{` / `}}`) and `{nl}` expands to two spaces, i.e. a
    Markdown hard line break after the function prototypes.

    Args:
        opts: generator options, forwarded to the `common` helpers.
    """
    filename = common.get_markdown_file(opts, 'overview', 'tet1d')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## What are expression templates?

Expression templates are a C++ template metaprogramming technique that
essentially allows high level programming for loop fusion. Take the following
example.

```c++
std::vector<float> operator+(std::vector<float> const &a,
                             std::vector<float> const &b) {{
  std::vector<float> ret(a.size());
  for (size_t i = 0; i < a.size(); i++) {{
    ret[i] = a[i] + b[i];
  }}
  return ret;
}}

int main() {{
  std::vector<float> a, b, c, d, sum;

  ...

  sum = a + b + c + d;

  ...

  return 0;
}}
```

The expression `a + b + c + d` involves three calls to `operator+` and at least
nine memory passes are necessary. This can be optimized as follows.

```c++
int main() {{
  std::vector<float> a, b, c, d, sum;

  ...

  for (size_t i = 0; i < a.size(); i++) {{
    sum[i] = a[i] + b[i] + c[i] + d[i];
  }}

  ...

  return 0;
}}
```

The rewriting above requires only four memory passes which is of course better
but as humans we prefer the writing `a + b + c + d`. Expression templates
solve exactly this problem and allow the programmer to write `a + b + c + d`
and the compiler to see the loop written above.

## Expression templates with NSIMD

This module provides expression templates on top of NSIMD core. As a
consequence the loops seen by the compiler deduced from the high-level
expressions are optimized using SIMD instructions. Note also that NVIDIA and
AMD GPUs are supported through CUDA and ROCm/HIP. The API for expression
templates in NSIMD is C++98 compatible and is able to work with any container
as its only requirement for data is that it must be contiguous.

All inputs to an expression must be declared using `tet1d::in` while the
output must be declared using `tet1d::out`.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  tet1d::out(a) = tet1d::in(&a[0], a.size()) + tet1d::in(&b[0], b.size());

  ...

  return 0;
}}
```

- `template <typename T, typename I> inline node in(const T *data, I sz);`{nl}
  Construct an input for expression templates starting at address `data` and
  containing `sz` elements. The return type of this function `node` can be used
  with the help of the `TET1D_IN(T)` macro where `T` is the underlying type of
  data (ints, floats, doubles...).

- `template <typename T> node out(T *data);`{nl}
  Construct an output for expression templates starting at address `data`. Note
  that memory must be allocated by the user before passing it to the expression
  template engine. The output type can be used with the `TET1D_OUT(T)` where
  `T` is the underlying type (ints, floats, doubles...).

Note that it is possible to pass parameters to the expression template engine
to specify the number of threads per block for GPUs or the SIMD extension to
use...

- `template <typename T, typename Pack> node out(T *data, int
  threads_per_block, void *stream);`{nl}
  Construct an output for expression templates starting at address `data`. Note
  that memory must be allocated by the user before passing it to the expression
  template engine. The `Pack` parameter is useful when compiling for CPUs. The
  type is `nsimd::pack<...>` allowing the developer to specify all details
  about the NSIMD packs that will be used by the expression template engine.
  The `threads_per_block` and `stream` arguments are used only when compiling
  for GPUs. Their meaning is contained in their names. The output type can be
  used with the `TET1D_OUT_EX(T, N, SimdExt)` where `T` is the underlying type
  (ints, floats, doubles...), `N` is the unroll factor and `SimdExt` the SIMD
  extension.

Moreover a MATLAB-like syntax is provided. One can select a subrange of given
input. Indexes are understood as for Python: -1 represents the last element.
The constant `tet1d::end = -1` allows one to write portable code.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  tet1d::out(c) = va(10, tet1d::end - 10) + vb;

  ...

  return 0;
}}
```

One can also specify which elements of the output must be rewritten with
the following syntax.

```c++
int main() {{
  std::vector<float> a, b, c;

  ...

  TET1D_IN(float) va = tet1d::in(&a[0], a.size());
  TET1D_IN(float) vb = tet1d::in(&b[0], b.size());
  TET1D_OUT(float) vc = tet1d::out(&c[0]);
  vc(va >= 10 && va < 20) = vb;

  ...

  return 0;
}}
```

In the example above, element `i` in `vc` is written only if `va[i] >= 10` and
`va[i] < 20`. The expression appearing in the parenthesis can contain
arbitrary expression templates as soon as the underlying type is `bool`.

## Warning using `auto`

Using auto can lead to surprising results. We advise you never to use auto
when dealing with expression templates. Indeed using `auto` will make the
variable an obscure type representing the computation tree of the expression
template. This implies that you won't be able to get data from this variable
i.e. get the `.data` member for example. Again this variable or its type cannot
be used in template arguments where you need it.
'''.format(nl='  '))
Beispiel #23
0
def gen_tests_for(opts, t, tt, operator):
    """Generate the tet1d test source for one operator and one type pair.

    The file `<tests_dir>/modules/tet1d/<op>.<t>.cpp` (or `<op>.<t>_<tt>.cpp`
    when `t != tt`) compares the result of a tet1d expression against a
    reference computed element by element by a CUDA kernel, a ROCm kernel or
    a plain CPU loop. Nothing is written when `common.can_create_filename`
    refuses the file. The generated file is passed to `common.clang_format`.

    The reference kernels are built in two formatting passes: the first pass
    fills the operator specific parts, the second one fills `{p}`
    (`scalar`/`gpu`) and the f16 conversion function names. This is why the
    kernel templates use `{{p}}` for second-pass fields and `{{{{`/`}}}}`
    for literal C++ braces.

    Args:
        opts: generator options; this function reads `opts.tests_dir` and
            forwards `opts` to the `common` helpers.
        t: name of the element type of the input/output arrays.
        tt: name of the intermediate type, only used by `cvt` and
            `reinterpret` (and by the file name).
        operator: operator description; this function reads `name`,
            `params` and `cxx_operator`.

    Raises:
        Exception: when the parameter pattern of the operator is not handled.
    """
    op_name = operator.name
    dirname = os.path.join(opts.tests_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(
        dirname, '{}.{}.cpp'.format(op_name,
                                    t if t == tt else '{}_{}'.format(t, tt)))
    if not common.can_create_filename(opts, filename):
        return

    # params[0] is the return type, the remaining entries are the inputs.
    arity = len(operator.params[1:])
    args_tabs = ', '.join(['{typ} *tab{i}'.format(typ=t, i=i) \
                           for i in range(arity)])
    args_tabs_call = ', '.join(['tab{i}'.format(i=i) \
                                for i in range(arity)])
    args_tabs_i_call = ', '.join(['tab{i}[i]'.format(i=i) \
                                  for i in range(arity)])
    args_in_tabs_call = ', '.join(['tet1d::in(tab{i}, n)'. \
                                   format(i=i) \
                                   for i in range(arity)])

    fill_tabs = '\n'.join(['{typ} *tab{i} = nsimd::device_calloc<{typ}>(n);\n' \
                           'prng{ip5}(tab{i}, n);'. \
                           format(typ=t, i=i, ip5=i + 5) \
                           for i in range(arity)])

    free_tabs = '\n'.join(['nsimd::device_free(tab{i});'.format(i=i) \
                           for i in range(arity)])

    # For f16 the constants and comparisons go through conversion functions
    # whose names are only resolved by the second formatting pass.
    zero = '{}(0)'.format(t) if t != 'f16' else '{f32_to_f16}(0.0f)'
    one = '{}(1)'.format(t) if t != 'f16' else '{f32_to_f16}(1.0f)'
    comp_tab0_to_1 = 'tab0[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab0[i]) == 1.0f'
    comp_tab1_to_1 = 'tab1[i] == {}(1)'.format(t) if t != 'f16' else \
                     '{f16_to_f32}(tab1[i]) == 1.0f'

    if op_name == 'cvt':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::cvt<{t}>(tet1d::cvt<{tt}>(
                                     tet1d::in(tab0, n)));'''. \
                                     format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_cvt({t}(), nsimd::{{p}}_cvt(
                            {tt}(), tab0[i]));'''.format(t=t, tt=tt)
    elif op_name == 'reinterpret':
        tet1d_code = \
            '''tet1d::out(out) = tet1d::reinterpret<{t}>(
                                     tet1d::reinterpret<{tt}>(tet1d::in(
                                         tab0, n)));'''.format(t=t, tt=tt)
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_reinterpret({t}(),
                            nsimd::{{p}}_reinterpret({tt}(),
                                tab0[i]));'''.format(t=t, tt=tt)
    elif op_name in ['to_mask', 'to_logical']:
        tet1d_code = \
            '''tet1d::out(out) = tet1d::to_mask(tet1d::to_logical(tet1d::in(
                                     tab0, n)));'''
        compute_result_kernel = \
            '''dst[i] = nsimd::{{p}}_to_mask({t}(),
                            nsimd::{{p}}_to_logical(tab0[i]));'''. \
                            format(t=t)
    elif operator.params == ['v'] * len(operator.params):
        # Values in, value out.
        compute_result_kernel = \
            'dst[i] = nsimd::{{p}}_{op_name}({args_tabs_i_call});'. \
            format(op_name=op_name, args_tabs_i_call=args_tabs_i_call)
        if operator.cxx_operator is not None:
            if arity == 1:
                tet1d_code = 'tet1d::out(out) = {cxx_op}tet1d::in(tab0, n);'. \
                             format(cxx_op=operator.cxx_operator)
            else:
                tet1d_code = 'tet1d::out(out) = tet1d::in(tab0, n) {cxx_op} ' \
                             'tet1d::in(tab1, n);'. \
                             format(cxx_op=operator.cxx_operator)
        else:
            tet1d_code = \
                'tet1d::out(out) = tet1d::{op_name}({args_in_tabs_call});'. \
                format(op_name=op_name, args_in_tabs_call=args_in_tabs_call)
    elif operator.params == ['l', 'v', 'v']:
        # Comparison: two values in, logical out. The logical is observed
        # through a masked assignment of 1 into a zero-initialized output.
        if operator.cxx_operator is not None:
            cond = 'A {} B'.format(operator.cxx_operator)
        else:
            cond = 'tet1d::{}(A, B)'.format(op_name)
        tet1d_code = \
            '''TET1D_OUT({typ}) Z = tet1d::out(out);
               TET1D_IN({typ}) A = tet1d::in(tab0, n);
               TET1D_IN({typ}) B = tet1d::in(tab1, n);
               Z({cond}) = 1;'''.format(cond=cond, typ=t)
        compute_result_kernel = \
            '''if (nsimd::{{p}}_{op_name}(tab0[i], tab1[i])) {{{{
                 dst[i] = {one};
               }}}} else {{{{
                 dst[i] = {zero};
               }}}}'''.format(op_name=op_name, one=one, zero=zero)
    elif operator.params == ['l'] * len(operator.params):
        # Logicals in, logical out. Input logicals are obtained by comparing
        # the input arrays to 1.
        if arity == 1:
            if operator.cxx_operator is not None:
                cond = '{}(A == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1)
        elif arity == 2:
            if operator.cxx_operator is not None:
                cond = '(A == 1) {} (B == 1)'.format(operator.cxx_operator)
            else:
                cond = 'tet1d::{}(A == 1, B == 1)'.format(op_name)
            tet1d_code = \
                '''TET1D_OUT({typ}) Z = tet1d::out(out);
                   TET1D_IN({typ}) A = tet1d::in(tab0, n);
                   TET1D_IN({typ}) B = tet1d::in(tab1, n);
                   Z({cond}) = 1;'''.format(cond=cond, typ=t)
            compute_result_kernel = \
                '''if (nsimd::{{p}}_{op_name}({comp_tab0_to_1},
                                              {comp_tab1_to_1})) {{{{
                     dst[i] = {one};
                   }}}} else {{{{
                     dst[i] = {zero};
                   }}}}'''.format(op_name=op_name, one=one, zero=zero,
                                  comp_tab0_to_1=comp_tab0_to_1,
                                  comp_tab1_to_1=comp_tab1_to_1)
        else:
            # Without this branch the code below would fail on unbound
            # locals instead of reporting the unsupported operator.
            raise Exception('Unsupported operator: "{}"'.format(op_name))
    else:
        raise Exception('Unsupported operator: "{}"'.format(op_name))

    # Second formatting pass: specialize the reference kernel for CPU and GPU.
    cpu_kernel = compute_result_kernel.format(p='scalar',
                                              f32_to_f16='nsimd_f32_to_f16',
                                              f16_to_f32='nsimd_f16_to_f32')
    gpu_kernel = compute_result_kernel.format(p='gpu',
                                              f32_to_f16='__float2half',
                                              f16_to_f32='__half2float')

    # Approximate operators are compared with a tolerance.
    if op_name in ['rec11', 'rsqrt11']:
        comp = '!cmp(ref, out, n, .0009765625 /* = 2^-10 */)'
    elif op_name in ['rec8', 'rsqrt8']:
        comp = '!cmp(ref, out, n, .0078125 /* = 2^-7 */)'
    else:
        comp = '!cmp(ref, out, n)'

    # NOTE(review): `gpu_params` is not defined in this function; it is
    # presumably a module-level constant holding the kernel launch
    # configuration -- confirm.
    with common.open_utf8(opts, filename) as out:
        out.write('''#include <nsimd/modules/tet1d.hpp>
        #include <nsimd/modules/memory_management.hpp>
        #include "../common.hpp"

        #if defined(NSIMD_CUDA)

        __global__ void kernel({typ} *dst, {args_tabs}, int n) {{
          int i = threadIdx.x + blockIdx.x * blockDim.x;
          if (i < n) {{
            {gpu_kernel}
          }}
        }}

        void compute_result({typ} *dst, {args_tabs}, unsigned int n) {{
          kernel<<<{gpu_params}>>>(dst, {args_tabs_call}, int(n));
        }}

        #elif defined(NSIMD_ROCM)

        __global__ void kernel({typ} *dst, {args_tabs}, size_t n) {{
          size_t i = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
          if (i < n) {{
            {gpu_kernel}
          }}
        }}

        void compute_result({typ} *dst, {args_tabs}, size_t n) {{
          hipLaunchKernelGGL(kernel, {gpu_params}, 0, 0, dst, {args_tabs_call},
                             n);
        }}

        #else

        void compute_result({typ} *dst, {args_tabs},
                            unsigned int n) {{
          for (unsigned int i = 0; i < n; i++) {{
            {cpu_kernel}
          }}
        }}

        #endif

        nsimd_fill_dev_mem_func(prng5,
            1 + (((unsigned int)i * 69342380 + 414585) % 5))
        nsimd_fill_dev_mem_func(prng6,
            1 + (((unsigned int)i * 12528380 + 784535) % 6))
        nsimd_fill_dev_mem_func(prng7,
            1 + (((unsigned int)i * 22328380 + 644295) % 7))

        int main() {{
          unsigned int n_[3] = {{ 10, 1001, 10001 }};
          for (int i = 0; i < (int)(sizeof(n_) / sizeof(int)); i++) {{
            unsigned int n = n_[i];
            int ret = 0;
            {fill_tabs}
            {typ} *ref = nsimd::device_calloc<{typ}>(n);
            {typ} *out = nsimd::device_calloc<{typ}>(n);
            compute_result(ref, {args_tabs_call}, n);
            {tet1d_code}
            if ({comp}) {{
              ret = -1;
            }}
            nsimd::device_free(ref);
            nsimd::device_free(out);
            {free_tabs}
            if (ret != 0) {{
              return ret;
            }}
          }}
          return 0;
        }}
        '''.format(typ=t,
                   args_tabs=args_tabs,
                   fill_tabs=fill_tabs,
                   args_tabs_call=args_tabs_call,
                   gpu_params=gpu_params,
                   free_tabs=free_tabs,
                   tet1d_code=tet1d_code,
                   comp=comp,
                   cpu_kernel=cpu_kernel,
                   gpu_kernel=gpu_kernel))

    common.clang_format(opts, filename, cuda=True)
Beispiel #24
0
def gen_functions(opts):
    """Generate the tet1d expression-template header `functions.hpp`.

    For every operator of `operators.operators` whose `has_scalar_impl` is
    true, the generated C++ contains:

    - an empty tag struct `<op>_t`;
    - a `node<<op>_t, ...>` specialization that stores the operand nodes and
      exposes `size()` plus `gpu_get()` (CUDA/ROCm builds) or
      `scalar_get()` / `simd_get()` (all other builds);
    - a factory function `<op>(...)` that builds such a node;
    - overloads of the C++ operator when `operator.cxx_operator` is set and
      `operator.params` has 2 or 3 entries.

    The code is written to `<opts.include_dir>/modules/tet1d/functions.hpp`,
    unless `common.can_create_filename` refuses, and the file is then handed
    to `common.clang_format`.
    """
    functions = ''

    for op_name, operator in operators.operators.items():
        if not operator.has_scalar_impl:
            continue

        not_closed = is_not_closed(operator)
        not_closed_tmpl_args = 'typename ToType, ' if not_closed else ''
        not_closed_tmpl_params = 'ToType' if not_closed else 'none_t'

        # Select the C++ fragments that depend on the shape of the operator.
        # NOTE(review): there is no fallback branch. An operator matching none
        # of the cases below would silently reuse the fragments computed for
        # the previous operator (or fail on unbound names if it comes first).
        # Confirm that `params` always has 2, 3 or 4 entries at this point.
        if op_name in ('shl', 'shr', 'shra'):
            # Shifts carry a plain `int s` instead of a right-hand node.
            tmpl_args = 'typename Left'
            tmpl_params = 'Left, none_t, none_t'
            size = 'return left.size();'
            args = 'Left const &left, int s'
            members = 'Left left; int s;'
            members_assignment = 'ret.left = to_node(left); ret.s = s;'
            to_node_type = 'typename to_node_t<Left>::type, none_t, none_t'
        elif len(operator.params) == 2:
            tmpl_args = not_closed_tmpl_args + 'typename Left'
            tmpl_params = 'Left, none_t, ' + not_closed_tmpl_params
            size = 'return left.size();'
            args = 'Left const &left'
            members = 'Left left;'
            members_assignment = 'ret.left = to_node(left);'
            to_node_type = 'typename to_node_t<Left>::type, none_t, none_t'
        elif len(operator.params) == 3:
            tmpl_args = 'typename Left, typename Right'
            tmpl_params = 'Left, Right, none_t'
            size = 'return compute_size(left.size(), right.size());'
            args = 'Left const &left, Right const &right'
            members = 'Left left;\nRight right;'
            members_assignment = '''ret.left = to_node(left);
                                    ret.right = to_node(right);'''
            to_node_type = 'typename to_node_t<Left>::type, ' \
                           'typename to_node_t<Right>::type, none_t'
        elif len(operator.params) == 4:
            tmpl_args = 'typename Left, typename Right, typename Extra'
            tmpl_params = 'Left, Right, Extra'
            size = \
            'return compute_size(left.size(), right.size(), extra.size());'
            args = 'Left const &left, Right const &right, Extra const &extra'
            members = 'Left left;\nRight right;\nExtra extra;'
            members_assignment = '''ret.left = to_node(left);
                                    ret.right = to_node(right);
                                    ret.extra = to_node(extra);'''
            to_node_type = 'typename to_node_t<Left>::type, ' \
                           'typename to_node_t<Right>::type, ' \
                           'typename to_node_t<Extra>::type'

        # Operators returning a vector ('v') yield packs of values, every
        # other operator is treated as returning logicals.
        if operator.returns == 'v':
            to_pack = 'to_pack_t'
            return_type = 'out_type'
        else:
            to_pack = 'to_packl_t'
            return_type = 'bool'

        if not_closed:
            # The destination type must be passed explicitly: as a value for
            # the scalar/GPU functions, as a template argument for SIMD.
            to_typ_arg = 'out_type(), '
            to_typ_tmpl_arg = '<typename {to_pack}<out_type, Pack>::type>'. \
                              format(to_pack=to_pack)
            in_out_typedefs = '''typedef typename Left::out_type in_type;
                                 typedef ToType out_type;'''
            to_node_type = 'typename to_node_t<Left>::type, none_t, ToType'
        else:
            to_typ_arg = '' if op_name != 'to_mask' else 'out_type(), '
            to_typ_tmpl_arg = ''
            in_out_typedefs = '''typedef typename Left::out_type in_type;
                                 typedef typename Left::out_type out_type;'''

        # Arguments forwarded to the underlying nsimd call. The placeholders
        # are filled in once per backend below.
        impl_args = 'left.{cpu_gpu}_get{tmpl}(i)'
        nb_inputs = len(operator.params) - 1
        if nb_inputs >= 2:
            if operator.params[2] == 'p':
                impl_args += ', s'
            else:
                impl_args += ', right.{cpu_gpu}_get{tmpl}(i)'
        if nb_inputs >= 3:
            impl_args += ', extra.{cpu_gpu}_get{tmpl}(i)'

        impl_scalar = 'return nsimd::scalar_{}({}{});'.format(
            op_name, to_typ_arg, impl_args.format(cpu_gpu='scalar', tmpl=''))

        impl_gpu = 'return nsimd::gpu_{}({}{});'.format(
            op_name, to_typ_arg, impl_args.format(cpu_gpu='gpu', tmpl=''))

        # Expands to calls such as `left.template simd_get<Pack>(i)`.
        impl_simd = 'return nsimd::{}{}({});'.format(
            op_name, to_typ_tmpl_arg,
            impl_args.format(cpu_gpu='template simd', tmpl='<Pack>'))

        functions += \
        '''struct {op_name}_t {{}};

        template <{tmpl_args}>
        struct node<{op_name}_t, {tmpl_params}> {{
          {in_out_typedefs}

          {members}

          nsimd::nat size() const {{
            {size}
          }}

        #if defined(NSIMD_CUDA) || defined(NSIMD_ROCM)
          __device__ {return_type} gpu_get(nsimd::nat i) const {{
            {impl_gpu}
          }}
        #else
          {return_type} scalar_get(nsimd::nat i) const {{
            {impl_scalar}
          }}
          template <typename Pack> typename {to_pack}<out_type, Pack>::type
          simd_get(nsimd::nat i) const {{
            {impl_simd}
          }}
        #endif
        }};

        template<{tmpl_args}>
        node<{op_name}_t, {to_node_type}> {op_name}({args}) {{
          node<{op_name}_t, {to_node_type}> ret;
          {members_assignment}
          return ret;
        }}'''.format(op_name=op_name, tmpl_args=tmpl_args, size=size,
                     tmpl_params=tmpl_params, return_type=return_type,
                     args=args, to_pack=to_pack, to_node_type=to_node_type,
                     members=members, members_assignment=members_assignment,
                     in_out_typedefs=in_out_typedefs,
                     impl_gpu=impl_gpu,
                     impl_scalar=impl_scalar,
                     impl_simd=impl_simd)

        # C++ operator overloads: one overload for unary operators; for
        # binary operators node-with-literal, literal-with-node and
        # node-with-node.
        if operator.cxx_operator is not None:
            if len(operator.params) == 2:
                functions += \
                '''
            template <typename Op, typename Left, typename Right,
                      typename Extra>
            node<{op_name}_t, node<Op, Left, Right, Extra>, none_t, none_t>
            operator{cxx_operator}(node<Op, Left, Right, Extra> const &node) {{
              return tet1d::{op_name}(node);
            }}'''.format(op_name=op_name,
                         cxx_operator=operator.cxx_operator)
            elif len(operator.params) == 3:
                functions += '''

            template <typename Op, typename Left, typename Right,
                      typename Extra, typename T>
            node<{op_name}_t, node<Op, Left, Right, Extra>,
                 node<scalar_t, none_t, none_t,
                      typename node<Op, Left, Right, Extra>::in_type>, none_t>
            operator{cxx_operator}(node<Op, Left, Right, Extra> const &node, T a) {{
              typedef typename tet1d::node<Op, Left, Right, Extra>::in_type S;
              return tet1d::{op_name}(node, literal_to<S>::impl(a));
            }}

            template <typename T, typename Op, typename Left, typename Right,
                      typename Extra>
            node<{op_name}_t, node<scalar_t, none_t, none_t,
                              typename node<Op, Left, Right, Extra>::in_type>,
                 node<Op, Left, Right, Extra>, none_t>
            operator{cxx_operator}(T a, node<Op, Left, Right, Extra> const &node) {{
              typedef typename tet1d::node<Op, Left, Right, Extra>::in_type S;
              return tet1d::{op_name}(literal_to<S>::impl(a), node);
            }}

            template <typename LeftOp, typename LeftLeft, typename LeftRight,
                      typename LeftExtra, typename RightOp, typename RightLeft,
                      typename RightRight, typename RightExtra>
            node<{op_name}_t, node<LeftOp, LeftLeft, LeftRight, LeftExtra>,
                              node<RightOp, RightLeft, RightRight, RightExtra>,
                 none_t>
            operator{cxx_operator}(node<LeftOp, LeftLeft, LeftRight,
                                LeftExtra> const &left,
                           node<RightOp, RightLeft, RightRight,
                                RightExtra> const &right) {{
              return tet1d::{op_name}(left, right);
            }}'''.format(op_name=op_name, cxx_operator=operator.cxx_operator)

        functions += '\n\n{}\n\n'.format(common.hbar)

    # Write the code to file
    dirname = os.path.join(opts.include_dir, 'modules', 'tet1d')
    common.mkdir_p(dirname)
    filename = os.path.join(dirname, 'functions.hpp')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as out:
        out.write('#ifndef NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n')
        out.write('#define NSIMD_MODULES_TET1D_FUNCTIONS_HPP\n\n')
        out.write('namespace tet1d {\n\n')
        out.write('{}\n\n'.format(common.hbar))
        out.write(functions)
        out.write('} // namespace tet1d\n\n')
        out.write('#endif\n')
    common.clang_format(opts, filename)
Beispiel #25
0
def gen_doc(opts):
    """Generate the Markdown API index and one Markdown page per operator.

    `api.md` lists every operator grouped by category. Then, for each
    operator whose name matches `opts.match` (when it is set), an API page is
    written with the operator description, its generic C and C++ signatures
    and, for every SIMD extension in `opts.simd`, its architecture-specific
    signatures. A file is only written when `common.can_create_filename`
    allows it.
    """
    sys.stdout.write('-- Generating doc for each function\n')

    # Build tree for api.md: category -> operators of that category
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            api.setdefault(c, []).append(operator)

    # api.md
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# API\n')
            for c, ops in api.items():
                if not ops:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        full_name, op.name, common.to_filename(op.name)))

    def to_string(var):
        """Return the signature(s) in `var`, one per line, with every run of
        whitespace collapsed into a single space."""
        # isinstance (rather than an exact type comparison) keeps str
        # subclasses from being split into single characters by list().
        if isinstance(var, str) or not hasattr(var, '__iter__'):
            sigs = [var]
        else:
            sigs = list(var)
        return '\n'.join(re.sub('[ \n\t\r]+', ' ', sig) for sig in sigs)

    def write_arch_signatures(fout, operator, api_name, fence):
        """Write one fenced block of `api_name` signatures per SIMD
        extension of `opts.simd`."""
        for simd_ext in opts.simd:
            fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
            fout.write('```{}\n'.format(fence))
            for typ in operator.types:
                fout.write(operator.get_signature(typ, api_name, simd_ext))
                fout.write(';\n')
            fout.write('```')

    # Operators (one file per operator)
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            fout.write(to_string(
                operator.get_generic_signature('cxx_adv').values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            write_arch_signatures(fout, operator, 'c_base', 'c')
            fout.write('\n\n## C++ base API (architecture specifics)')
            # These are C++ signatures: tag the fence as C++ like the generic
            # C++ sections above.
            write_arch_signatures(fout, operator, 'cxx_base', 'c++')
Beispiel #26
0
def doit(opts):
    """Write the Markdown overview page of the memory_management module.

    A progress message is always printed through `common.myprint`. The page
    itself is only generated when `opts.doc` is set and
    `common.can_create_filename` allows writing the target file.
    """
    common.myprint(opts, 'Generating module memory_management')
    if not opts.doc:
        return
    filename = common.get_markdown_file(opts, 'overview', 'memory_management')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        # `{br}` expands to two trailing spaces, i.e. a Markdown hard line
        # break; literal braces of the C++ snippets are therefore doubled.
        fout.write('''# Overview

This module provides C-style memory management functions. Its purpose is not
to become a fully featured container library. It is to provide portable
malloc, memcpy and free functions with a few helpers to copy data from and
to the devices.

# API reference

## Equivalents of malloc, calloc, memcpy and free for devices

Note that the below functions simply wrap the corresponding C functions
when targeting a CPU.

- `template <typename T> T *device_malloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device.
  On error NULL is returned.

- `template <typename T> T *device_calloc(size_t sz)`{br}
  Allocates `sz * sizeof(T)` bytes of memory on the device and sets the
  allocated memory to zero.
  On error NULL is returned.

- `template <typename T> void device_free(T *ptr)`{br}
  Free the memory pointed to by the given pointer.

- `template <typename T> void copy_to_device(T *device_ptr, T *host_ptr,
  size_t sz)`{br}
  Copy data from host to device.

- `template <typename T> void copy_to_host(T *host_ptr, T *device_ptr,
  size_t sz)`{br}
  Copy data from device to host.

- `#define nsimd_fill_dev_mem_func(func_name, expr)`{br}
  Create a device function that will fill data with `expr`. To call the created
  function one simply does `func_name(ptr, sz)`. The `expr` argument represents
  some simple C++ expression that can depend only on `i` the i-th element in
  the vector as shown in the example below.

  ```c++
  nsimd_fill_dev_mem_func(prng, ((i * 1103515245 + 12345) / 65536) % 32768)

  int main() {{
    prng(ptr, 1000);
    return 0;
  }}
  ```

## Pairs of pointers

It is often useful to allocate a pair of data buffers: one on the host and
one on the device to perform data transfers. The below functions provide
quick ways to malloc, calloc, free and memcpy pointers on host and devices at
once. Note that when targeting CPUs the pair of pointers is reduced to one
pointer that points to a single data buffer in which case memcpy's are not
performed. Note also that there is no implicit synchronization of data
between both data buffers. It is up to the programmer to trigger memcpy's.

```c++
template <typename T>
struct paired_pointers_t {{
  T *device_ptr, *host_ptr;
  size_t sz;
}};
```

Members of the above structure are not to be modified but can be passed as
arguments for reading/writing data from/to memory they point to.

- `template <typename T> paired_pointers_t<T> pair_malloc(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs both pointers are NULL.

- `template <typename T> paired_pointers_t<T> pair_malloc_or_exit(size_t
  sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  If an error occurs, prints an error message on stderr and exit(3).

- `template <typename T> paired_pointers_t<T> pair_calloc(size_t sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  Write both data buffers with zeros.
  If an error occurs both pointers are NULL.

- `template <typename T> paired_pointers_t<T> pair_calloc_or_exit(size_t
  sz)`{br}
  Allocate `sz * sizeof(T)` bytes of memory on the host and on the device.
  Write both data buffers with zeros.
  If an error occurs, prints an error message on stderr and exit(3).

- `template <typename T> void pair_free(paired_pointers_t<T> p)`{br}
  Free data buffers on the host and the device.

- `template <typename T> void copy_to_device(paired_pointers_t<T> p)`{br}
  Copy data from the host buffer to its corresponding device buffer.

- `template <typename T> void copy_to_host(paired_pointers_t<T> p)`{br}
  Copy data from the device buffer to its corresponding host buffer.
'''.format(br='  '))
Beispiel #27
0
def gen_doc(opts):
    """Generate the Markdown API index and one Markdown page per operator.

    `api.md` starts with links to the general API pages and then lists every
    operator grouped by category. Then, for each operator whose name matches
    `opts.match` (when it is set), an API page is written with the operator
    description, its generic C and C++ signatures and, for every SIMD
    extension in `opts.simd`, its architecture-specific signatures. A file is
    only written when `common.can_create_filename` allows it.
    """
    common.myprint(opts, 'Generating doc for each function')

    # Build tree for api.md: category -> operators of that category
    api = dict()
    for _, operator in operators.items():
        for c in operator.categories:
            api.setdefault(c, []).append(operator)

    # api.md
    filename = common.get_markdown_file(opts, 'api')
    if common.can_create_filename(opts, filename):
        with common.open_utf8(opts, filename) as fout:
            fout.write('# General API\n\n')
            fout.write('- [Memory function](memory.md)\n')
            fout.write('- [Float16 related functions](fp16.md)\n')
            fout.write('- [Defines provided by NSIMD](defines.md)\n')
            fout.write('- [NSIMD pack and related functions](pack.md)\n\n')
            fout.write('- [NSIMD C++20 concepts](concepts.md)\n\n')
            fout.write('# SIMD operators\n')
            for c, ops in api.items():
                if not ops:
                    continue
                fout.write('\n## {}\n\n'.format(c.title))
                for op in ops:
                    full_name = op.full_name[0].upper() + op.full_name[1:]
                    fout.write('- [{} ({})](api_{}.md)\n'.format(
                        full_name, op.name, common.to_filename(op.name)))

    def to_string(var):
        """Return the signature(s) in `var`, one per line, with every run of
        whitespace collapsed into a single space."""
        # isinstance (rather than an exact type comparison) keeps str
        # subclasses from being split into single characters by list().
        if isinstance(var, str) or not hasattr(var, '__iter__'):
            sigs = [var]
        else:
            sigs = list(var)
        return '\n'.join(re.sub('[ \n\t\r]+', ' ', sig) for sig in sigs)

    def write_arch_signatures(fout, operator, api_name, fence):
        """Write one fenced block of `api_name` signatures per SIMD
        extension of `opts.simd`."""
        for simd_ext in opts.simd:
            fout.write('\n\n### {}\n\n'.format(simd_ext.upper()))
            fout.write('```{}\n'.format(fence))
            for typ in operator.types:
                fout.write(operator.get_signature(typ, api_name, simd_ext))
                fout.write(';\n')
            fout.write('```')

    # Operators (one file per operator)
    dirname = common.get_markdown_dir(opts)
    common.mkdir_p(dirname)
    for op_name, operator in operators.items():
        # Skip non-matching doc
        if opts.match and not opts.match.match(op_name):
            continue
        filename = common.get_markdown_api_file(opts, operator.name)
        if not common.can_create_filename(opts, filename):
            continue
        full_name = operator.full_name[0].upper() + operator.full_name[1:]
        with common.open_utf8(opts, filename) as fout:
            fout.write('# {}\n\n'.format(full_name))
            fout.write('## Description\n\n')
            fout.write(operator.desc)
            fout.write('\n\n## C base API (generic)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_base')))
            fout.write('\n```\n\n')
            fout.write('\n\n## C advanced API (generic, requires C11)\n\n')
            fout.write('```c\n')
            fout.write(to_string(operator.get_generic_signature('c_adv')))
            fout.write('\n```\n\n')
            fout.write('## C++ base API (generic)\n\n')
            fout.write('```c++\n')
            fout.write(to_string(operator.get_generic_signature('cxx_base')))
            fout.write('\n```\n\n')
            fout.write('## C++ advanced API\n\n')
            fout.write('```c++\n')
            fout.write(to_string(
                operator.get_generic_signature('cxx_adv').values()))
            fout.write('\n```\n\n')
            fout.write('## C base API (architecture specifics)')
            write_arch_signatures(fout, operator, 'c_base', 'c')
            fout.write('\n\n## C++ base API (architecture specifics)')
            # These are C++ signatures: tag the fence as C++ like the generic
            # C++ sections above.
            write_arch_signatures(fout, operator, 'cxx_base', 'c++')
Beispiel #28
0
def gen_archis_types(opts, simd_dir, platform, simd_ext):
    """Emit `types.h` for the SIMD extension `simd_ext` into `simd_dir`.

    The header holds, for every type of `common.types`, the C typedefs of the
    vector type, of the logical vector type and of the x2/x3/x4 variants,
    the `NSIMD_<simd_ext>_NB_REGISTERS` define and, under `NSIMD_CXX > 0`,
    the C++ `simd_traits` specializations. Nothing happens when
    `common.can_create_filename` refuses the target file; otherwise the
    written file is passed to `common.clang_format`.
    """
    filename = os.path.join(simd_dir, 'types.h')
    if not common.can_create_filename(opts, filename):
        return
    mod = opts.platforms[platform]

    # Vector typedefs first, then logical vector typedefs.
    vector_typedefs = []
    for t in common.types:
        vector_typedefs.append('typedef {} nsimd_{}_v{};'.format(
            mod.get_type(opts, simd_ext, t), simd_ext, t))
    logical_typedefs = []
    for t in common.types:
        logical_typedefs.append('typedef {} nsimd_{}_vl{};'.format(
            mod.get_logical_type(opts, simd_ext, t), simd_ext, t))
    c_code = '\n'.join(vector_typedefs)
    c_code += '\n\n'
    c_code += '\n'.join(logical_typedefs)

    if mod.has_compatible_SoA_types(simd_ext):
        # The platform module supplies the x2/x3/x4 types itself.
        for deg in range(2, 5):
            soa_typedefs = []
            for typ in common.types:
                soa_typedefs.append('typedef {} nsimd_{}_v{}x{};'.format(
                    mod.get_SoA_type(simd_ext, typ, deg), simd_ext, typ, deg))
            c_code += '\n'.join(soa_typedefs)
    else:
        # Otherwise plain structs of 2, 3 and 4 vectors are declared.
        struct_x2 = '''
                             typedef struct nsimd_{simd_ext}_v{typ}x2 {{
                               nsimd_{simd_ext}_v{typ} v0;
                               nsimd_{simd_ext}_v{typ} v1;
                             }} nsimd_{simd_ext}_v{typ}x2;
                             '''
        struct_x3 = '''
                             typedef struct nsimd_{simd_ext}_v{typ}x3 {{
                               nsimd_{simd_ext}_v{typ} v0;
                               nsimd_{simd_ext}_v{typ} v1;
                               nsimd_{simd_ext}_v{typ} v2;
                             }} nsimd_{simd_ext}_v{typ}x3;
                             '''
        struct_x4 = '''
                             typedef struct nsimd_{simd_ext}_v{typ}x4 {{
                               nsimd_{simd_ext}_v{typ} v0;
                               nsimd_{simd_ext}_v{typ} v1;
                               nsimd_{simd_ext}_v{typ} v2;
                               nsimd_{simd_ext}_v{typ} v3;
                             }} nsimd_{simd_ext}_v{typ}x4;
                             '''
        for struct_tmpl in (struct_x2, struct_x3, struct_x4):
            structs = []
            for typ in common.types:
                structs.append(struct_tmpl.format(simd_ext=simd_ext, typ=typ))
            c_code += '\n'.join(structs)
        c_code += '\n\n'

    # C++ side: one simd_traits specialization per base type.
    traits_tmpl = '''template <>
                               struct simd_traits<{typ}, {simd_ext}> {{
                                 typedef nsimd_{simd_ext}_v{typ} simd_vector;
                                 typedef nsimd_{simd_ext}_v{typ}x2 simd_vectorx2;
                                 typedef nsimd_{simd_ext}_v{typ}x3 simd_vectorx3;
                                 typedef nsimd_{simd_ext}_v{typ}x4 simd_vectorx4;
                                 typedef nsimd_{simd_ext}_vl{typ} simd_vectorl;
                               }};'''
    traits = []
    for t in common.types:
        traits.append(traits_tmpl.format(typ=t, simd_ext=simd_ext))
    cxx_code = '\n\n'.join(traits)

    header_tmpl = '''#ifndef NSIMD_{platform}_{SIMD_EXT}_TYPES_H
                     #define NSIMD_{platform}_{SIMD_EXT}_TYPES_H

                     {c_code}

                     #define NSIMD_{simd_ext}_NB_REGISTERS  {nb_registers}

                     #if NSIMD_CXX > 0
                     namespace nsimd {{

                     struct {simd_ext} {{}};

                     {cxx_code}

                     }} // namespace nsimd
                     #endif

                     #endif
                     '''
    with common.open_utf8(opts, filename) as out:
        # `year` is passed as before although the template has no `{year}`
        # placeholder.
        out.write(header_tmpl.format(
            year=date.today().year,
            platform=platform.upper(),
            SIMD_EXT=simd_ext.upper(),
            c_code=c_code,
            cxx_code=cxx_code,
            simd_ext=simd_ext,
            nb_registers=mod.get_nb_registers(simd_ext)))
    common.clang_format(opts, filename)
Beispiel #29
0
def gen_bench(f, simd, typ):
    """Generate the benchmark source file for `f`, `simd` and `typ`.

    Returns early, without writing anything, when
    `common.can_create_filename` refuses the target path or when `gen_code`
    yields `None`. Otherwise the benchmark parts are concatenated, wrapped by
    `gen_bench_from_code`, written to the path given by `gen_filename` and
    the file is passed to `common.clang_format`.
    """
    ## TODO
    path = gen_filename(f, simd, typ)
    ## Nothing to do if the file must not be (re)created
    if not common.can_create_filename(_opts, path):
        return
    ## Code specific to this bench
    category = common.nsimd_category(simd)
    code = gen_code(f, simd, typ, category=category)
    if code is None:
        return
    ## Aggregate the parts: CPU reference, the bench itself, its unrolled
    ## variants and the benches against other libraries
    #bench += gen_bench_asm_function(f, typ, category)
    bench = (gen_bench_against(f, 'cpu', typ, f.bench_against_cpu())
             + code
             + gen_bench_unrolls(f, simd, typ, category)
             + gen_bench_against(f, simd, typ, f.bench_against_libs()))
    ## Timestamped variant: still built here, but not handed over to
    ## gen_bench_from_code below
    timed_bench = 'std::map<std::string, std::pair<' + typ + ', double>> sums;\n'
    timed_bench += 'size_t const nb_runs = 10 * 1000;\n'
    timed_bench += gen_bench_against_with_timestamp(
        f, 'cpu', typ, f.bench_against_cpu())
    timed_bench += gen_bench_with_timestamp(f, simd, typ, category)
    timed_bench += gen_bench_unrolls_with_timestamp(f, simd, typ, category)
    timed_bench += gen_bench_against_with_timestamp(
        f, simd, typ, f.bench_against_libs())
    json_report = '''
                            std::string json = "";
                            json += "{{\\n";
                            json += "  \\"benchmarks\\": [\\n";

                            for (auto const & bench_name_sum_time : sums) {{
                              std::string const & bench_name = bench_name_sum_time.first;
                              {typ} const & sum = bench_name_sum_time.second.first;
                              double const & elapsed_time_ns = bench_name_sum_time.second.second;

                              json += "  {{" "\\n";
                              json += "    \\"name\\": \\"" + bench_name + "/{typ}\\"," + "\\n";
                              json += "    \\"real_time\\": " + std::to_string(elapsed_time_ns) + "," + "\\n";
                              json += "    \\"sum\\": " + std::string(std::isfinite(sum) ? "" : "\\"") + std::to_string(sum) + std::string(std::isfinite(sum) ? "" : "\\"") + "," + "\\n";
                              json += "    \\"time_unit\\": \\"ns\\"\\n";
                              json += "  }}";
                              if (&bench_name_sum_time != &*sums.rbegin()) {{
                                json += ",";
                              }}
                              json += "\\n";
                            }}

                            json += "  ]\\n";
                            json += "}}\\n";

                            std::cout << json << std::flush;
                            '''
    timed_bench += json_report.format(typ=typ)
    ## Finalize code (an empty string is passed in place of timed_bench)
    code = gen_bench_from_code(f, typ, bench, '')
    ## Write file; the handle gets its own name instead of reusing `f`
    with common.open_utf8(path) as out:
        out.write(code)
    ## Clang-format it!
    common.clang_format(_opts, path)
Beispiel #30
0
def gen_overview(opts):
    filename = common.get_markdown_file(opts, 'overview')
    if not common.can_create_filename(opts, filename):
        return
    with common.open_utf8(opts, filename) as fout:
        fout.write('''# Overview

## NSIMD scalar types

Their names follow the following pattern: `Sxx` where

- `S` is `i` for signed integers, `u` for unsigned integer or `f` for
  floatting point number.
- `xx` is the number of bits taken to represent the number.

Full list of scalar types:

''')
        for t in common.types:
            fout.write('- `{}`\n'.format(t))
        fout.write('''
## NSIMD generic SIMD vector types

In NSIMD, we call a platform an architecture e.g. Intel, ARM, POWERPC. We call
SIMD extension a set of low-level functions and types provided by hardware
vendors to access SIMD units. Examples include SSE2, SSE42, AVX, ...  When
compiling the generic SIMD vector types represents a SIMD register of the
target. Examples are a `__m128` for Intel SSE, `__m512` for Intel AVX-512 or
`svfloat32_t` for Arm SVE.

Their names follow the following pattern:

- C base API: `vSCALAR` where `SCALAR` is a one of scalar type listed above.
- C advanced API: `nsimd_pack_SCALAR` where `SCALAR` is a one of scalar type
  listed above.
- C++ advanced API: `nsimd::pack<SCALAR>` where `SCALAR` is a one of scalar
  type listed above.

Full list of SIMD vector types:

| Base type | C base API | C advanced API | C++ advanced API |
|-----------|------------|----------------|------------------|
''')

        fout.write('\n'.join([
        '| `{typ}` | `v{typ}` | `nsimd_pack_{typ}` | `nsimd::pack<{typ}>` |'. \
        format(typ=typ) for typ in common.types]))

        fout.write('''

## C/C++ base APIs

These come automatically when you include `nsimd/nsimd.h`. You do *not* need
to include a header file for having a function. Here is a list of supported
platforms and their corresponding SIMD extensions.

''')
        platforms = common.get_platforms(opts)
        for p in platforms:
            fout.write('- Platform `{}`\n'.format(p))
            for s in platforms[p].get_simd_exts():
                fout.write('  - `{}`\n'.format(s))
        fout.write('''
Each simd extension has its own set of SIMD types and functions. Types follow
the pattern: `nsimd_SIMDEXT_vSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

There are also logical types associated to each SIMD vector type. These types
are used, for example, to represent the result of a comparison of SIMD vectors.
They are usually bit masks. Their name follow the pattern:
`nsimd_SIMDEXT_vlSCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `SCALAR` is one of scalar types listed above.

Note 1: Platform `cpu` is a 128 bits SIMD emulation fallback when no SIMD
extension has been specified or is supported on a given compilation target.

Note 2: as all SIMD extensions of all platforms are different there is no
need to put the name of the platform in each identifier.

Function names follow the pattern: `nsimd_SIMDEXT_FUNCNAME_SCALAR` where

- `SIMDEXT` is the SIMD extensions.
- `FUNCNAME` is the name of a function e.g. `add` or `sub`.
- `SCALAR` is one of scalar types listed above.

### Generic identifier

In the base C API, genericity is achieved using macros.

- `vec(SCALAR)` is a type to represent a SIMD vector containing SCALAR
  elements.  SCALAR must be one of scalar types listed above.
- `vecl(SCALAR)` is a type to represent a SIMD vector of logicals for SCALAR
  elements. SCALAR must be one of scalar types listed above.
- `vec_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector containing
  SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of scalar
  types listed above and SIMDEXT must be a valid SIMD extension.
- `vecl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of logicals
  for SCALAR elements for the simd extension SIMDEXT. SCALAR must be one of
  scalar types listed above and SIMDEXT must be a valid SIMD extension.
- `vFUNCNAME` takes as input the above types to access the operator FUNCNAME
  e.g. `vadd`, `vsub`.

In C++98 and C++03, type traits are available.

- `nsimd::simd_traits<SCALAR, SIMDEXT>::vector` is the SIMD vector type for
  platform SIMDEXT containing SCALAR elements. SIMDEXT is one of SIMD
  extension listed above, SCALAR is one of scalar type listed above.
- `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl` is the SIMD vector of logicals
  type for platform SIMDEXT containing SCALAR elements. SIMDEXT is one of
  SIMD extensions listed above, SCALAR is one of scalar type listed above.

In C++11 and beyond, type traits are still available but typedefs are also
provided.

- `nsimd::vector<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vector`.
- `nsimd::vectorl<SCALAR, SIMDEXT>` is a typedef to
  `nsimd::simd_traits<SCALAR, SIMDEXT>::vectorl`.

The C++20 API does not bring different types for SIMD registers nor other
way to access the other SIMD types. It only brings concepts instead of usual
`typename`s. For more informations cf. <concepts.md>.

Note that all macro and functions available in plain C are still available in
C++.

### List of operators provided by the base APIs

In the documentation we use interchangeably the terms "function" and
"operator".  For each operator FUNCNAME a C function (also available in C++)
named `nsimd_SIMDEXT_FUNCNAME_SCALAR` is available for each SCALAR type unless
specified otherwise.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME` is
available and takes as its last argument a SCALAR type.

For each FUNCNAME, a C macro (also available in C++) named `vFUNCNAME_a` is
available and takes as its two last argument a SCALAR type and a SIMDEXT.

For each FUNCNAME, a C++ function in namespace `nsimd` named `FUNCNAME` is
available. It takes as its last argument the SCALAR type and can optionnally
take the SIMDEXT as its last last argument.

For example, for the addition of two SIMD vectors `a` and `b` here are the
possibilities:

```c++
c = nsimd_add_avx_f32(a, b); // use AVX
c = nsimd::add(a, b, f32()); // use detected SIMDEXT
c = nsimd::add(a, b, f32(), avx()); // force AVX even if detected SIMDEXT is not AVX
c = vadd(a, b, f32); // use detected SIMDEXT
c = vadd_e(a, b, f32, avx); // force AVX even if detected SIMDEXT is not AVX
```

Here is a list of available FUNCNAME.

''')
        for op_name, operator in operators.items():
            return_typ = common.get_one_type_generic(operator.params[0],
                                                     'SCALAR')
            func = operator.name
            args = ', '.join([common.get_one_type_generic(p, 'SCALAR') + \
                              ' a' + str(count) for count, p in \
                              enumerate(operator.params[1:])])
            fout.write('- `{} {}({});`  \n'.format(return_typ, func, args))
            if len(operator.types) < len(common.types):
                typs = ', '.join(['{}'.format(t) for t in operator.types])
                fout.write('  Only available for {}\n'.format(typs))

        fout.write('''

## C advanced API (only available in C11)

The C advanced API takes advantage of the C11 `_Generic` keyword to provide
function overloading. Unlike the base API described above there is no need to
pass the base type of the SIMD extension as an argument. This information is
contained in the types provided by this API.

- `nsimd_pack_SCALAR_SIMDEXT` represents a SIMD vector containing
  SCALAR elements of SIMD extension SIMDEXT.
- `nsimd_packl_SCALAR_SIMDEXT` represents a SIMD vector of logicals
  for SCALAR elements of SIMD extension SIMDEXT.

There are versions of the above types without SIMDEXT for which the targeted
SIMD extension is automatically chosen.

- `nsimd_pack_SCALAR` represents a SIMD vector containing SCALAR elements.
- `nsimd_packl_SCALAR` represents a SIMD vector of logicals for SCALAR
  elements.

Generic types are also available:

- `nsimd_pack(SCALAR)` is a type to represent a SIMD vector containing SCALAR
  elements. SCALAR must be one of the scalar types listed above.
- `nsimd_packl(SCALAR)` is a type to represent a SIMD vector of logicals for
  SCALAR elements. SCALAR must be one of the scalar types listed above.
- `nsimd_pack_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector
  containing SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be one
  of the scalar types listed above and SIMDEXT must be a valid SIMD extension.
- `nsimd_packl_a(SCALAR, SIMDEXT)` is a type to represent a SIMD vector of
  logicals for SCALAR elements for the SIMD extension SIMDEXT. SCALAR must be
  one of the scalar types listed above and SIMDEXT must be a valid SIMD
  extension.

Finally, operators follow the naming scheme `nsimd_FUNCNAME`, e.g.
`nsimd_add`, `nsimd_sub`.

## C++ advanced API

The C++ advanced API is called advanced not because it requires C++11 or above
but because it makes use of the particular implementation of ARM SVE by ARM
in their compiler. We do not know if GCC (and possibly MSVC in the distant
future) will use the same approach. Anyway the current implementation allows
us to put SVE SIMD vectors inside some kind of structs that behave like
standard structs. If you want to be sure to write portable code do *not* use
this API. Two new types are available.

- `nsimd::pack<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors containing
  SCALAR elements of SIMD extension SIMDEXT. You can specify only the first
  template argument. The second defaults to 1 while the third defaults to the
  detected SIMDEXT.
- `nsimd::packl<SCALAR, N, SIMDEXT>` represents `N` SIMD vectors of logical
  type containing SCALAR elements of SIMD extension SIMDEXT. You can specify
  only the first template argument. The second defaults to 1 while the third
  defaults to the detected SIMDEXT.

Use N > 1 when declaring packs to have an unroll of N. This is particularly
useful on ARM.

Functions that take packs do not take any other argument unless specified
otherwise, e.g. the load family of functions. It is impossible to determine
the kind of pack (unroll and SIMDEXT) from the type of a pointer. Therefore
in this case, the last argument must be a pack and a pack of this same type is
returned. Also some functions are available as C++ operators. They follow the
naming: `nsimd::FUNCNAME`.
''')