Example #1
class Intel64Rewriter(CPU64Rewriter):

    lang_intel_common = {
        'ignore-deps': cgen.Pragma('ivdep'),
        'ntstores': cgen.Pragma('vector nontemporal'),
        'storefence': cgen.Statement('_mm_sfence()'),
        'noinline': cgen.Pragma('noinline')
    }
    lang = {
        'IntelCompiler': lang_intel_common,
        'IntelKNLCompiler': lang_intel_common
    }
    """
    Collection of backend-compiler-specific pragmas.
    """
    @dle_pass
    def _avoid_denormals(self, iet):
        header = [
            cgen.Comment('Flush denormal numbers to zero in hardware'),
            cgen.Statement(
                '_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
            cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
        ]
        iet = List(header=header, body=iet)
        return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
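
The cgen nodes used above render directly to C source lines. A minimal sketch of their output, assuming only that the cgen package is installed:

import cgen

print(cgen.Pragma('ivdep'))                # -> #pragma ivdep
print(cgen.Pragma('vector nontemporal'))   # -> #pragma vector nontemporal
print(cgen.Statement('_mm_sfence()'))      # -> _mm_sfence();
print(cgen.Comment('Flush denormal numbers to zero in hardware'))
# -> /* Flush denormal numbers to zero in hardware */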
Example #2
    def execute_time_loop(self):
        statements = []
        statements.append(self.grid.time_stepping)
        if self.pluto:
            statements.append(
                cgen.Block([
                    cgen.Pragma("scop"), self.grid.stress_loop,
                    cgen.Pragma("endscop")
                ]))
        else:
            statements.append(self.grid.stress_loop)
        statements.append(self.grid.stress_bc)

        if self.pluto:
            statements.append(
                cgen.Block([
                    cgen.Pragma("scop"), self.grid.velocity_loop,
                    cgen.Pragma("endscop")
                ]))
        else:
            statements.append(self.grid.velocity_loop)
        statements.append(self.grid.velocity_bc)
        output_step = self.grid.output_step
        if output_step:
            statements.append(output_step)
        result = cgen.For(cgen.InlineInitializer(cgen.Value("int", "_ti"), 0),
                          "_ti < ntsteps", "_ti++", cgen.Block(statements))
        return result
Example #3
class OmpBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenMP',
        'header': 'omp.h',
        # Platform mapping
        AMDGPUX: None,
        NVIDIAX: None,
        # Runtime library
        'init': None,
        'thread-num': DefFunction('omp_get_thread_num'),
        'num-devices': lambda args:
            DefFunction('omp_get_num_devices', args),
        'set-device': lambda args:
            Call('omp_set_default_device', args),
        # Pragmas
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j:
            c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j, k:
            c.Pragma('omp target exit data map(release: %s%s)%s'
                     % (i, j, k)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('omp target exit data map(delete: %s%s)%s'
                     % (i, j, k)),
        'memcpy-to-device': lambda i, j, k:
            Call('omp_target_memcpy', [i, j, k, 0, 0,
                                       DefFunction('omp_get_device_num'),
                                       DefFunction('omp_get_initial_device')]),
        'memcpy-to-device-wait': lambda i, j, k, l:
            Call('omp_target_memcpy', [i, j, k, 0, 0,
                                       DefFunction('omp_get_device_num'),
                                       DefFunction('omp_get_initial_device')]),
        'device-get':
            'omp_get_default_device()',
        'device-alloc': lambda i, j:
            'omp_target_alloc(%s, %s)' % (i, j),
        'device-free': lambda i, j:
            'omp_target_free(%s, %s)' % (i, j)
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration
    DeviceIteration = DeviceOmpIteration
    Prodder = ThreadedProdder
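
A minimal sketch of how the pragma factories in a mapper like the one above expand; the symbol name 'u' and the section string '[0:nx][0:ny]' are illustrative placeholders, not values from the source:

import cgen as c

map_enter_to = lambda i, j: c.Pragma('omp target enter data map(to: %s%s)' % (i, j))
print(map_enter_to('u', '[0:nx][0:ny]'))
# -> #pragma omp target enter data map(to: u[0:nx][0:ny])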
Example #4
File: GPIO.py  Project: KrystianD/ksystem
    def emit_extern_global_variables(self, source_file):
        source_file.add(cgen.Pragma("""push_macro("_SFR_IO8")"""))
        source_file.add(cgen.Line("#undef _SFR_IO8"))
        source_file.add(cgen.Define("_SFR_IO8(x)", "x"))

        for name, definition in self.definitions:
            port = definition.port[0]
            assert port in ("A", "B", "C", "D", "E")
            pin = int(definition.port[1])

            source_file.add(
                cgen.Statement(f"static kGPIO<PORT{port},{pin}> {name}"))

        source_file.add(cgen.Pragma("""pop_macro("_SFR_IO8")"""))
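
A sketch of the kind of C the snippet above emits, built here with plain cgen nodes rather than the project's source_file API; the PORTB/pin/led values are illustrative:

import cgen

nodes = [
    cgen.Pragma('push_macro("_SFR_IO8")'),
    cgen.Line('#undef _SFR_IO8'),
    cgen.Define('_SFR_IO8(x)', 'x'),
    cgen.Statement('static kGPIO<PORTB,3> led'),
    cgen.Pragma('pop_macro("_SFR_IO8")'),
]
print('\n'.join(str(n) for n in nodes))
# #pragma push_macro("_SFR_IO8")
# #undef _SFR_IO8
# #define _SFR_IO8(x) x
# static kGPIO<PORTB,3> led;
# #pragma pop_macro("_SFR_IO8")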
Example #5
    def _generate_lib_outer_loop(self):

        block = cgen.Block([self._components['LIB_KERNEL_CALL']])

        i = self._components['LIB_PAIR_INDEX_0']

        shared = ''
        for sx in self._components['OMP_SHARED_SYMS']:
            shared += sx + ','
        shared = shared[:-1]

        pragma = cgen.Pragma('omp parallel default(none) shared(' + shared +
                             ')')

        parallel_region = cgen.Block((
            cgen.Value('int',
                       '_thread_start'), cgen.Value('int', '_thread_end'),
            cgen.Line(
                'get_thread_decomp((int)_N_LOCAL, &_thread_start, &_thread_end);'
            ),
            cgen.For('int ' + i + '= _thread_start', i + '< _thread_end',
                     i + '++', block)))

        loop = cgen.Module([
            cgen.Line('omp_set_num_threads(_NUM_THREADS);'), pragma,
            parallel_region
        ])

        self._components['LIB_OUTER_LOOP'] = loop
Example #6
    def _make_header(cls, **kwargs):
        construct = cls._make_construct(**kwargs)
        clauses = cls._make_clauses(**kwargs)

        header = ' '.join([construct] + clauses)

        return c.Pragma(header)
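
A sketch of the join performed by _make_header; the construct and clause strings are illustrative placeholders rather than values produced by _make_construct/_make_clauses:

import cgen as c

construct = 'omp parallel for'
clauses = ['collapse(3)', 'schedule(static,1)']
print(c.Pragma(' '.join([construct] + clauses)))
# -> #pragma omp parallel for collapse(3) schedule(static,1)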
Example #7
    def _generate_lib_outer_loop(self):

        block = cgen.Block([self._components['LIB_KERNEL_GATHER'],
                            self._components['LIB_INNER_LOOP'],
                            self._components['LIB_KERNEL_SCATTER']])

        i = self._components['LIB_PAIR_INDEX_0']

        shared = ''
        for sx in self._components['OMP_SHARED_SYMS']:
            shared += sx + ','
        shared = shared[:-1]
        pragma = cgen.Pragma('omp parallel for schedule(static) '
                             '// default(shared) shared(' + shared + ')')
        if runtime.OMP_NUM_THREADS is None:
            pragma = cgen.Comment(pragma)

        loop = cgen.Module([
            cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
            pragma,
            cgen.For('int ' + i + '=0',
                     i + '<_N_LOCAL',
                     i + '++',
                     block)
        ])

        self._components['LIB_OUTER_LOOP'] = loop
Example #8
    def _make_header(cls, **kwargs):
        kwargs.pop('pragmas', None)

        construct = cls._make_construct(**kwargs)
        clauses = cls._make_clauses(**kwargs)
        header = c.Pragma(' '.join([construct] + clauses))

        return (header, ), kwargs
Example #9
    def execute_parallel_block(self):
        statements = []
        if self.profiling:
            if self.numevents_papi > 0:
                statements += [self.grid.define_papi_events]
                statements.append(
                    cgen.Statement(
                        "opesci_papi_start_counters(numevents, events)"))
            else:
                statements.append(cgen.Value("float", "real_time"))
                statements.append(cgen.Value("float", "proc_time"))
                statements.append(cgen.Value("float", "mflops"))
                statements.append(cgen.Value("long long", "flpins"))
                statements.append(
                    cgen.Statement(
                        "opesci_flops(&real_time, &proc_time, &flpins, &mflops)"
                    ))
        statements.append(self.grid.initialise)

        statements.append(self.execute_time_loop())

        if self.profiling:
            if self.numevents_papi > 0:
                statements.append(
                    cgen.Statement(
                        "opesci_papi_read_counters(numevents, counters)"))
                statements.append(cgen.Pragma("omp critical"))
                statements.append(cgen.Block(self.grid.sum_papi_events()))
            else:
                statements.append(
                    cgen.Statement(
                        "opesci_flops(&real_time, &proc_time, &flpins, &mflops)"
                    ))
                statements.append(cgen.Pragma("omp critical"))
                critical_block = []
                critical_block.append(
                    cgen.Assign("profiling->g_rtime",
                                "fmax(profiling->g_rtime, real_time)"))
                critical_block.append(
                    cgen.Assign("profiling->g_ptime",
                                "fmax(profiling->g_ptime, proc_time)"))
                critical_block.append(
                    cgen.Statement("profiling->g_mflops += mflops;"))
                statements.append(cgen.Block(critical_block))
        return [cgen.Pragma("omp parallel"), cgen.Block(statements)]
Example #10
    def make_gpudirect(self, iet):
        mapper = {}
        for node in FindNodes((IsendCall, IrecvCall)).visit(iet):
            header = c.Pragma('omp target data use_device_ptr(%s)' %
                              node.arguments[0].name)
            mapper[node] = Block(header=header, body=node)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
Example #11
class DeviceAccizer(DeviceOmpizer):

    lang = dict(DeviceOmpizer.__base__.lang)
    lang.update({
        'atomic':
        c.Pragma('acc atomic update'),
        'map-enter-to':
        lambda i, j: c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-alloc':
        lambda i, j: c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present':
        lambda i, j: c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-update':
        lambda i, j: c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-release':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-pointers':
        lambda i: c.Pragma('acc host_data use_device(%s)' % i)
    })

    _Iteration = DeviceOpenACCIteration

    @classmethod
    def _map_present(cls, f):
        # TODO: currently this is unused, because we cannot yet distinguish between
        # "real" Arrays and Functions that "acts as Arrays", created by the compiler
        # to build support routines (e.g., the Sendrecv/Gather/Scatter MPI Callables).
        # We should only use "#pragma acc present" for *real* Arrays -- that is
        # temporaries that are born and die on the Device
        return cls.lang['map-present'](f.name,
                                       ''.join('[0:%s]' % i
                                               for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name,
                                           ''.join('[0:%s]' % i
                                                   for i in cls._map_data(f)))

    @classmethod
    def _map_pointers(cls, functions):
        return cls.lang['map-pointers'](','.join(f.name for f in functions))

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)

        metadata['includes'] = ['openacc.h']

        return iet, metadata
Example #12
File: nodes.py  Project: speglich/devito
    def _make_header(cls, **kwargs):
        construct = cls._make_construct(**kwargs)
        clauses = cls._make_clauses(**kwargs)
        header = c.Pragma(' '.join([construct] + clauses))

        # Extract the Iteration Properties
        properties = cls._process_properties(**kwargs)

        # Drop the unrecognised or unused kwargs
        kwargs = cls._process_kwargs(**kwargs)

        return (header, ), kwargs, properties
Example #13
File: rewriters.py  Project: cxz/devito
class BasicRewriter(AbstractRewriter):

    lang_intel_common = {
        'ignore-deps': cgen.Pragma('ivdep'),
        'ntstores': cgen.Pragma('vector nontemporal'),
        'storefence': cgen.Statement('_mm_sfence()'),
        'noinline': cgen.Pragma('noinline')
    }

    lang = {
        'IntelCompiler': lang_intel_common,
        'IntelKNLCompiler': lang_intel_common
    }
    """
    Collection of backend-compiler-specific pragmas.
    """
    def _pipeline(self, state):
        self._avoid_denormals(state)

    def _backend_compiler_pragma(self, name, default=None):
        key = configuration['compiler'].__class__.__name__
        return self.lang.get(key, {}).get(name, default)

    @dle_pass
    def _avoid_denormals(self, iet):
        """
        Introduce nodes in the Iteration/Expression tree that will expand to C
        macros telling the CPU to flush denormal numbers in hardware. Denormals
        are normally flushed when using SSE-based instruction sets, except when
        compiling shared objects.
        """
        header = [
            cgen.Comment('Flush denormal numbers to zero in hardware'),
            cgen.Statement(
                '_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
            cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
        ]
        iet = List(header=header, body=iet)
        return (iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')})
Example #14
def mpi_gpu_direct(iet, **kwargs):
    """
    Modify MPI Callables to enable multiple GPUs performing GPU-Direct communication.
    """
    mapper = {}
    for node in FindNodes((IsendCall, IrecvCall)).visit(iet):
        header = c.Pragma('omp target data use_device_ptr(%s)' %
                          node.arguments[0].name)
        mapper[node] = Block(header=header, body=node)

    iet = Transformer(mapper).visit(iet)

    return iet, {}
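
A sketch of the header attached to each Isend/Irecv call site by the pass above; the argument name 'bufs' is illustrative:

import cgen as c

print(c.Pragma('omp target data use_device_ptr(%s)' % 'bufs'))
# -> #pragma omp target data use_device_ptr(bufs)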
Example #15
    def _generate_lib_outer_loop(self):

        block = cgen.Block([
            self._components['LIB_KERNEL_GATHER'],
            self._components['LIB_INNER_LOOP'],
            self._components['LIB_KERNEL_SCATTER']
        ])

        cx = self._components['LIB_CELL_CX']
        cy = self._components['LIB_CELL_CY']
        cz = self._components['LIB_CELL_CZ']

        ncx = self._components['N_CELL_X']
        ncy = self._components['N_CELL_Y']
        ncz = self._components['N_CELL_Z']

        exec_count = self._components['EXEC_COUNT']
        red_exec_count = '_' + exec_count

        npad = self._components['N_CELL_PAD']

        shared = ''
        for sx in self._components['OMP_SHARED_SYMS']:
            shared += sx + ','
        shared = shared[:-1]
        pragma = cgen.Pragma('omp parallel for default(none) reduction(+:' + \
            red_exec_count +') schedule(dynamic) collapse(3) ' + \
            'shared(' + shared + ')')
        if runtime.OMP_NUM_THREADS is None:
            pragma = cgen.Comment(pragma)

        loop = cgen.Module([
            cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
            cgen.Line('INT64 ' + red_exec_count + ' = 0;'),
            pragma,
            # cellx loop
            cgen.For(
                'INT64 ' + cx + '=' + npad, cx + '<' + ncx + '-' + npad,
                cx + '++',
                cgen.Block([
                    cgen.For(
                        'INT64 ' + cy + '=' + npad,
                        cy + '<' + ncy + '-' + npad, cy + '++',
                        cgen.Block((cgen.For('INT64 ' + cz + '=' + npad,
                                             cz + '<' + ncz + '-' + npad,
                                             cz + '++', block), ))),
                ])),
            cgen.Line('*' + exec_count + ' += ' + red_exec_count + ';')
        ])

        self._components['LIB_OUTER_LOOP'] = loop
Example #16
class DeviceAccizer(DeviceOmpizer):

    lang = {
        'atomic':
        c.Pragma('acc atomic update'),
        'map-enter-to':
        lambda i, j: c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-update':
        lambda i, j: c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-release':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
    }

    _Iteration = DeviceOpenACCIteration

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)

        metadata['includes'] = ['openacc.h']

        return iet, metadata
Example #17
class Ompizer(object):

    COLLAPSE = 32
    """Use a collapse clause if the number of available physical cores is
    greater than this threshold."""

    lang = {
        'for': c.Pragma('omp for schedule(static)'),
        'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
        'par-region': lambda i: c.Pragma('omp parallel %s' % i),
        'par-for': c.Pragma('omp parallel for schedule(static)'),
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key):
        """
        :param key: A function returning True if an Iteration can be
                    parallelized, False otherwise.
        """
        self.key = key

    def _pragma_for(self, root, candidates):
        # Heuristic: if at least two parallel loops are available and the
        # physical core count is greater than COLLAPSE, then omp-collapse them
        nparallel = len(candidates)
        if (psutil.cpu_count(logical=False) < Ompizer.COLLAPSE
                or nparallel < 2
                or not IsPerfectIteration().visit(root)):
            return self.lang['for']
        else:
            return self.lang['collapse'](nparallel)

    def _make_parallel_tree(self, root, candidates):
        """
        Return a mapper to parallelize the :class:`Iteration`s within /root/.
        """
        parallel = self._pragma_for(root, candidates)

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            exprs = FindNodes(Expression).visit(root)
            subs = {i: List(header=self.lang['atomic'], body=i)
                    for i in exprs if i.is_increment}
            handle = Transformer(subs).visit(root)
            mapper[root] = handle._rebuild(pragmas=root.pragmas + (parallel,))
        else:
            mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

        return mapper

    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` triggering thread-level parallelism.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iteration go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in FindSymbols('symbolics').visit(root)
                                if i.is_Array and i._mem_stack])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private), body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        return Transformer(mapper).visit(iet)
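
A sketch of the pragmas assembled by _pragma_for and make_parallel above; the collapse depth and the private clause are illustrative:

import cgen as c

lang = {
    'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
    'par-region': lambda i: c.Pragma('omp parallel %s' % i),
}
print(lang['collapse'](2))                # -> #pragma omp for collapse(2) schedule(static)
print(lang['par-region']('private(r0)'))  # -> #pragma omp parallel private(r0)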
Example #18
class DeviceOmpizer(Ompizer):

    COLLAPSE_NCORES = 1
    """
    Always collapse when possible.
    """

    COLLAPSE_WORK = 1
    """
    Always collapse when possible.
    """

    lang = dict(Ompizer.lang)
    lang.update({
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('omp target exit data map(release: %s%s)' % (i, j)),
        'map-exit-delete': lambda i, j:
            c.Pragma('omp target exit data map(delete: %s%s)' % (i, j)),
    })

    _Iteration = DeviceOpenMPIteration

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f):
        return cls.lang['map-enter-to'](f.name, ''.join('[0:%s]' % i
                                                        for i in cls._map_data(f)))

    @classmethod
    def _map_alloc(cls, f):
        return cls.lang['map-enter-alloc'](f.name, ''.join('[0:%s]' % i
                                                           for i in cls._map_data(f)))

    @classmethod
    def _map_present(cls, f):
        raise NotImplementedError

    @classmethod
    def _map_update(cls, f):
        return cls.lang['map-update'](f.name, ''.join('[0:%s]' % i
                                                      for i in cls._map_data(f)))

    @classmethod
    def _map_release(cls, f):
        return cls.lang['map-release'](f.name, ''.join('[0:%s]' % i
                                                       for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name, ''.join('[0:%s]' % i
                                                           for i in cls._map_data(f)))

    @classmethod
    def _map_pointers(cls, f):
        raise NotImplementedError

    def _make_threaded_prodders(self, partree):
        # no-op for now
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for GPU offloading.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Build the ParallelTree
        body = self._Iteration(ncollapse=ncollapse, **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # no-op for now
        return partree

    def _make_guard(self, partree, *args):
        # no-op for now
        return partree

    def _make_nested_partree(self, partree):
        # no-op for now
        return partree
Example #19
class Ompizer(object):

    lang = {
        'simd-for':
        c.Pragma('omp simd'),
        'simd-for-aligned':
        lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic':
        c.Pragma('omp atomic update'),
        'thread-num':
        DefFunction('omp_get_thread_num')
    }
    """
    Shortcuts for the OpenMP language.
    """

    _Region = OpenMPRegion
    _Iteration = OpenMPIteration

    def __init__(self, sregistry, options, key=None):
        """
        Parameters
        ----------
        sregistry : SymbolRegistry
            The symbol registry, to quickly access the special symbols that may
            appear in the IET (e.g., `sregistry.threadid`, `sregistry.nthreads`).
        options : dict
             The optimization options. Accepted: ['par-collapse-ncores',
             'par-collapse-work', 'par-chunk-nonaffine', 'par-dynamic-work', 'par-nested']
             * 'par-collapse-ncores': use a collapse clause if the number of
               available physical cores is greater than this threshold.
             * 'par-collapse-work': use a collapse clause if the trip count of the
               collapsable Iterations is statically known to exceed this threshold.
             * 'par-chunk-nonaffine': coefficient to adjust the chunk size in
               non-affine parallel Iterations.
             * 'par-dynamic-work': use dynamic scheduling if the operation count per
               iteration exceeds this threshold. Otherwise, use static scheduling.
             * 'par-nested': nested parallelism if the number of hyperthreads per core
               is greater than this threshold.
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        self.sregistry = sregistry

        self.collapse_ncores = options['par-collapse-ncores']
        self.collapse_work = options['par-collapse-work']
        self.chunk_nonaffine = options['par-chunk-nonaffine']
        self.dynamic_work = options['par-dynamic-work']
        self.nested = options['par-nested']

        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorized

    @property
    def nthreads(self):
        return self.sregistry.nthreads

    @property
    def nthreads_nested(self):
        return self.sregistry.nthreads_nested

    @property
    def nthreads_nonaffine(self):
        return self.sregistry.nthreads_nonaffine

    @property
    def threadid(self):
        return self.sregistry.threadid

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.collapse_ncores:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols
                       for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n + 1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.collapse_work:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)
        return collapsable

    @classmethod
    def _make_tid(cls, tid):
        return c.Initializer(c.Value(tid._C_typedata, tid.name),
                             cls.lang['thread-num'])

    def _make_reductions(self, partree, collapsed):
        if not any(i.is_ParallelAtomic for i in collapsed):
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [
            i for i in exprs if i.is_Increment and not i.is_ForeignExpression
        ]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {
                i: List(header=self.lang['atomic'], body=i)
                for i in exprs
            }

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_threaded_prodders(self, partree):
        mapper = {
            i: ThreadedProdder(i)
            for i in FindNodes(Prodder).visit(partree)
        }
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.dynamic_work:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = OpenMPIteration(schedule=schedule,
                                       ncollapse=ncollapse,
                                       **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = OpenMPIteration(schedule=schedule,
                                       parallel=True,
                                       ncollapse=ncollapse,
                                       nthreads=nthreads,
                                       **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = OpenMPIteration(ncollapse=ncollapse,
                                   chunk_size=chunk_size,
                                   **root.args)

            niters = prod([root.symbolic_size] +
                          [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree, parrays):
        arrays = [i for i in FindSymbols().visit(partree) if i.is_Array]

        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_private = [i for i in arrays if i._mem_heap and i._mem_local]
        heap_globals = []
        for i in heap_private:
            if i in parrays:
                pi = parrays[i]
            else:
                pi = parrays.setdefault(
                    i,
                    PointerArray(name=self.sregistry.make_name(),
                                 dimensions=(self.threadid, ),
                                 array=i))
            heap_globals.append(Dereference(i, pi))
        if heap_globals:
            body = List(header=self._make_tid(self.threadid),
                        body=heap_globals + [partree],
                        footer=c.Line())
        else:
            body = partree

        return OpenMPRegion(body, partree.nthreads)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [
            CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)
        ]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= self.nested:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if self.key(i) and any(
                        is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(
                candidates, self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if root in mapper:
                continue

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree, parrays)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [
            i for i in FindSymbols().visit(iet)
            if isinstance(i, (NThreadsMixin))
        ]
        for n in FindNodes(Dereference).visit(iet):
            args.extend([(n.array, True), n.parray])

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
        """
        simd_reg_size = kwargs.pop('simd_reg_size')

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            candidates = [i for i in tree if i.is_Parallel]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(candidates) < 2:
                continue
            candidate = candidates[-1]

            # Construct OpenMP SIMD pragma
            aligned = [
                j for j in FindSymbols('symbolics').visit(candidate)
                if j.is_DiscreteFunction
            ]
            if aligned:
                simd = self.lang['simd-for-aligned']
                simd = as_tuple(
                    simd(','.join([j.name for j in aligned]), simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])
            pragmas = candidate.pragmas + simd

            # Add VECTORIZED property
            properties = list(candidate.properties) + [VECTORIZED]

            mapper[candidate] = candidate._rebuild(pragmas=pragmas,
                                                   properties=properties)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
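
A sketch of the SIMD pragma built in make_simd above when aligned DiscreteFunctions are found; the function names 'u,v' and the 64-byte register size are illustrative:

import cgen as c

simd_for_aligned = lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j))
print(simd_for_aligned('u,v', 64))  # -> #pragma omp simd aligned(u,v:64)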
Example #20
File: utils.py  Project: kwinkunks/devito
import cpuinfo
import numpy as np

import cgen as c

"""
A dictionary to quickly access standard OpenMP pragmas
"""
omplang = {
    'for': c.Pragma('omp for schedule(static)'),
    'collapse': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
    'par-region': c.Pragma('omp parallel'),
    'par-for': c.Pragma('omp parallel for schedule(static)'),
    'simd-for': c.Pragma('omp simd'),
    'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j))
}

"""
Compiler-specific language
"""
complang_ALL = {
    'IntelCompiler': {'ignore-deps': c.Pragma('ivdep'),
                      'ntstores': c.Pragma('vector nontemporal'),
                      'storefence': c.Statement('_mm_sfence()'),
                      'noinline': c.Pragma('noinline')}
}
complang_ALL['IntelKNLCompiler'] = complang_ALL['IntelCompiler']

"""
SIMD generic info
"""
Example #21
import cpuinfo
import numpy as np

import cgen as c
"""
Compiler-specific language
"""
complang_ALL = {
    'IntelCompiler': {
        'ignore-deps': c.Pragma('ivdep'),
        'ntstores': c.Pragma('vector nontemporal'),
        'storefence': c.Statement('_mm_sfence()'),
        'noinline': c.Pragma('noinline')
    }
}
complang_ALL['IntelKNLCompiler'] = complang_ALL['IntelCompiler']
"""
SIMD generic info
"""
simdinfo = {
    # Sizes in bytes of a vector register
    'sse': 16,
    'sse4_2': 16,
    'avx': 32,
    'avx2': 32,
    'avx512f': 64
}


def get_simd_flag():
    """Retrieve the best SIMD flag on the current architecture."""
Example #22
class Ompizer(object):

    NESTED = 2
    """
    Use nested parallelism if the number of hyperthreads per core is greater
    than this threshold.
    """

    COLLAPSE_NCORES = 4
    """
    Use a collapse clause if the number of available physical cores is greater
    than this threshold.
    """

    COLLAPSE_WORK = 100
    """
    Use a collapse clause if the trip count of the collapsable Iterations
    exceeds this threshold. Note however the trip count is rarely known at
    compilation time (e.g., this may happen when DefaultDimensions are used).
    """

    CHUNKSIZE_NONAFFINE = 3
    """
    Coefficient to adjust the chunk size in parallelized non-affine Iterations.
    """

    DYNAMIC_WORK = 10
    """
    Use dynamic scheduling if the operation count per iteration exceeds this
    threshold. Otherwise, use static scheduling.
    """

    lang = {
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            def key(i):
                if i.uindices:
                    # Iteration must be in OpenMP canonical form
                    return False
                return i.is_ParallelRelaxed and not i.is_Vectorized
            self.key = key
        self.nthreads = NThreads(aliases='nthreads0')
        self.nthreads_nested = NThreadsNested(aliases='nthreads1')
        self.nthreads_nonaffine = NThreadsNonaffine(aliases='nthreads2')

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.COLLAPSE_NCORES:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n+1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.COLLAPSE_WORK:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)
        return collapsable

    def _make_reductions(self, partree, collapsed):
        if not partree.is_ParallelAtomic:
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_threaded_prodders(self, partree):
        mapper = {i: ThreadedProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.DYNAMIC_WORK:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = ParallelIteration(schedule=schedule, ncollapse=ncollapse,
                                         **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = ParallelIteration(schedule=schedule, parallel=True,
                                         ncollapse=ncollapse, nthreads=nthreads,
                                         **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = ParallelIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                     **root.args)

            niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads*self.CHUNKSIZE_NONAFFINE), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # Build the `omp-parallel` region
        private = [i for i in FindSymbols().visit(partree)
                   if i.is_Array and i._mem_stack]
        private = sorted(set([i.name for i in private]))
        return ParallelRegion(partree, partree.nthreads, private)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= Ompizer.NESTED:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if any(is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(candidates, self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The used `nthreads` arguments
        args = [i for i in FindSymbols().visit(iet) if isinstance(i, (NThreadsMixin))]

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
        """
        simd_reg_size = kwargs.pop('simd_reg_size')

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            candidates = [i for i in tree if i.is_Parallel]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(candidates) < 2:
                continue
            candidate = candidates[-1]

            # Construct OpenMP SIMD pragma
            aligned = [j for j in FindSymbols('symbolics').visit(candidate)
                       if j.is_DiscreteFunction]
            if aligned:
                simd = self.lang['simd-for-aligned']
                simd = as_tuple(simd(','.join([j.name for j in aligned]),
                                simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])
            pragmas = candidate.pragmas + simd

            # Add VECTORIZED property
            properties = list(candidate.properties) + [VECTORIZED]

            mapper[candidate] = candidate._rebuild(pragmas=pragmas, properties=properties)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
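
In the non-affine branch of _make_partree above, the chunk size is simply the collapsed iteration count divided by nthreads*CHUNKSIZE_NONAFFINE, floored at 1. A worked sketch with illustrative numbers (not taken from the source):

niters = 1_000_000
nthreads = 8
CHUNKSIZE_NONAFFINE = 3
chunk_size = max(niters // (nthreads * CHUNKSIZE_NONAFFINE), 1)
print(chunk_size)  # -> 41666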
Example #23
    def __init__(self, is_header: bool):
        self.includes = []
        self.objects = []

        if is_header:
            self.add(cgen.Pragma("once"))
Example #24
    def _make_header(cls, nthreads, private):
        private = ('private(%s)' % ','.join(private)) if private else ''
        return c.Pragma('omp parallel num_threads(%s) %s' % (nthreads.name, private))
Example #25
class DeviceOmpizer(Ompizer):

    lang = dict(Ompizer.lang)
    lang.update({
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j:
            c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j:
            c.Pragma('omp target exit data map(release: %s%s)'
                     % (i, j)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('omp target exit data map(delete: %s%s)%s'
                     % (i, j, k)),
    })

    _Iteration = DeviceOpenMPIteration

    def __init__(self, sregistry, options, key=None):
        super().__init__(sregistry, options, key=key)
        self.gpu_fit = options['gpu-fit']
        self.par_disabled = options['par-disabled']

    @classmethod
    def _make_sections_from_imask(cls, f, imask):
        datasize = cls._map_data(f)
        if imask is None:
            imask = [FULL]*len(datasize)
        assert len(imask) == len(datasize)
        sections = []
        for i, j in zip(imask, datasize):
            if i is FULL:
                start, size = 0, j
            else:
                try:
                    start, size = i
                except TypeError:
                    start, size = i, 1
                start = ccode(start)
            sections.append('[%s:%s]' % (start, size))
        return ''.join(sections)

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-to'](f.name, sections)

    _map_to_wait = _map_to

    @classmethod
    def _map_alloc(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-alloc'](f.name, sections)

    @classmethod
    def _map_present(cls, f, imask=None):
        return

    @classmethod
    def _map_update(cls, f):
        return cls.lang['map-update'](f.name, ''.join('[0:%s]' % i
                                                      for i in cls._map_data(f)))

    @classmethod
    def _map_update_host(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-host'](f.name, sections)

    _map_update_wait_host = _map_update_host

    @classmethod
    def _map_update_device(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-device'](f.name, sections)

    _map_update_wait_device = _map_update_device

    @classmethod
    def _map_release(cls, f):
        return cls.lang['map-release'](f.name, ''.join('[0:%s]' % i
                                                       for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        # This ugly condition is to avoid a copy-back when, due to
        # domain decomposition, the local size of a Function is 0, which
        # would cause a crash
        cond = ' if(%s)' % ' && '.join('(%s != 0)' % i for i in cls._map_data(f))
        return cls.lang['map-exit-delete'](f.name, sections, cond)

    @classmethod
    def _map_pointers(cls, f):
        raise NotImplementedError

    def _make_threaded_prodders(self, partree):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_threaded_prodders(partree)

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for parallelism. In particular:

            * All parallel Iterations not *writing* to a host Function, that
              is a Function `f` such that ``is_on_device(f) == False``, are offloaded
              to the device.
            * The remaining ones, that is those writing to a host Function,
              are parallelized on the host.
        """
        assert candidates
        root = candidates[0]

        if is_on_device(root, self.gpu_fit, only_writes=True):
            # The typical case: all written Functions are device Functions, that is
            # they're mapped in the device memory. Then we offload `root` to the device

            # Get the collapsable Iterations
            collapsable = self._find_collapsable(root, candidates)
            ncollapse = 1 + len(collapsable)

            body = self._Iteration(gpu_fit=self.gpu_fit, ncollapse=ncollapse, **root.args)
            partree = ParallelTree([], body, nthreads=nthreads)
            collapsed = [partree] + collapsable

            return root, partree, collapsed
        elif not self.par_disabled:
            # Resort to host parallelism
            return super()._make_partree(candidates, nthreads)
        else:
            return root, None, None

    def _make_parregion(self, partree, *args):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_parregion(partree, *args)

    def _make_guard(self, parregion, *args):
        partrees = FindNodes(ParallelTree).visit(parregion)
        if any(isinstance(i.root, DeviceOpenMPIteration) for i in partrees):
            # no-op for now
            return parregion
        else:
            return super()._make_guard(parregion, *args)

    def _make_nested_partree(self, partree):
        if isinstance(partree.root, DeviceOpenMPIteration):
            # no-op for now
            return partree
        else:
            return super()._make_nested_partree(partree)
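
A minimal sketch (not part of the class above) of how the '[start:size]' sections and the 'map-enter-to' shortcut compose into a concrete pragma. Only the `cgen` package is assumed; the imask handling is reduced to plain tuples, and the name `u` and the sizes `nt`, `nx`, `ny` are placeholder values.

import cgen as c


def make_sections(imask):
    """Build a '[start:size]' section string from (start, size) pairs or scalars."""
    sections = []
    for i in imask:
        try:
            start, size = i
        except TypeError:
            # A scalar index maps to a size-1 section
            start, size = i, 1
        sections.append('[%s:%s]' % (start, size))
    return ''.join(sections)


sections = make_sections([(0, 'nt'), (0, 'nx'), (0, 'ny')])
print(c.Pragma('omp target enter data map(to: u%s)' % sections))
# -> #pragma omp target enter data map(to: u[0:nt][0:nx][0:ny])
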
Example #26
class Ompizer(object):

    COLLAPSE = 32
    """Use a collapse clause if the number of available physical cores is
    greater than this threshold."""

    lang = {
        'for':
        lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
        'par-region':
        lambda nt, i: c.Pragma('omp parallel num_threads(%s) %s' % (nt, i)),
        'simd-for':
        c.Pragma('omp simd'),
        'simd-for-aligned':
        lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic':
        c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """
    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorizable
        self.nthreads = NThreads(name='nthreads')

    def _ncollapse(self, root, candidates):
        # The OpenMP specification forbids collapsed loops to use iteration variables
        # in initializer expressions. For example, the following is forbidden:
        #
        # #pragma omp ... collapse(2)
        # for (int i = ... )
        #   for (int j = i ...)
        #     ...
        #
        # Below, we make sure this won't happen
        for n, i in enumerate(candidates):
            if any(j.dim in i.symbolic_min.free_symbols
                   for j in candidates[:n]):
                # Keep only the leading, independent Iterations
                candidates = candidates[:n]
                break
        # Heuristic: if at least two parallel loops are available and the
        # physical core count is greater than COLLAPSE, then omp-collapse them
        nparallel = len(candidates)
        isperfect = IsPerfectIteration().visit(root)
        if ncores() < Ompizer.COLLAPSE or nparallel < 2 or not isperfect:
            return 1
        else:
            return nparallel

    def _make_parallel_tree(self, root, candidates):
        """Parallelize the IET rooted in `root`."""
        ncollapse = self._ncollapse(root, candidates)
        parallel = self.lang['for'](ncollapse)

        pragmas = root.pragmas + (parallel, )
        properties = root.properties + (COLLAPSED(ncollapse), )

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            exprs = FindNodes(Expression).visit(root)
            subs = {
                i: List(header=self.lang['atomic'], body=i)
                for i in exprs if i.is_Increment
            }
            handle = Transformer(subs).visit(root)
            mapper[root] = handle._rebuild(pragmas=pragmas,
                                           properties=properties)
        else:
            mapper[root] = root._rebuild(pragmas=pragmas,
                                         properties=properties)

        root = Transformer(mapper).visit(root)

        return root

    def make_parallel(self, iet):
        """Transform ``iet`` by introducing shared-memory parallelism."""
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the first omp-parallelizable Iteration in `tree`
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                continue
            root = candidates[0]

            # Build the `omp-for` tree
            partree = self._make_parallel_tree(root, candidates)

            # Find out the thread-private and thread-shared variables
            private = [
                i for i in FindSymbols().visit(partree)
                if i.is_Array and i._mem_stack
            ]

            # Build the `omp-parallel` region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            partree = Block(header=self.lang['par-region'](self.nthreads.name,
                                                           private),
                            body=partree)

            # Do not enter the parallel region if the step increment might be 0; this
            # would raise a `Floating point exception (core dumped)` in some OpenMP
            # implementations. Note that using an OpenMP `if` clause won't work
            if isinstance(root.step, Symbol):
                cond = Conditional(CondEq(root.step, 0),
                                   Element(c.Statement('return')))
                partree = List(body=[cond, partree])

            mapper[root] = partree
        iet = Transformer(mapper).visit(iet)

        return iet, {'input': [self.nthreads] if mapper else []}
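
A quick check (outside the class above) of what the `lang` shortcuts render to; only the `cgen` package is assumed, and `nthreads`, `r0`, `r1` are placeholder names.

import cgen as c

lang = {
    'for': lambda i: c.Pragma('omp for collapse(%d) schedule(static)' % i),
    'par-region': lambda nt, i: c.Pragma('omp parallel num_threads(%s) %s' % (nt, i)),
}

# Pragma attached to the parallelized Iteration
print(lang['for'](2))
# -> #pragma omp for collapse(2) schedule(static)

# Header of the enclosing parallel region, with thread-private temporaries
print(lang['par-region']('nthreads', 'private(r0,r1)'))
# -> #pragma omp parallel num_threads(nthreads) private(r0,r1)
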
Example #27
class Ompizer(object):

    NESTED = 2
    """
    Use nested parallelism if the number of hyperthreads per core is greater
    than this threshold.
    """

    COLLAPSE = 32
    """
    Use a collapse clause if the number of available physical cores is greater
    than this threshold.
    """

    lang = {
        'for': lambda i: c.Pragma('omp for collapse(%d) schedule(static,1)' % i),
        'par-for': lambda i, j: c.Pragma('omp parallel for collapse(%d) '
                                         'schedule(static,1) num_threads(%d)' % (i, j)),
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update')
    }
    """
    Shortcuts for the OpenMP language.
    """

    def __init__(self, key=None):
        """
        Parameters
        ----------
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorizable
        self.nthreads = NThreads(name='nthreads')

    def _make_atomic_incs(self, partree):
        if not partree.is_ParallelAtomic:
            return partree
        # Introduce one `omp atomic` pragma for each increment
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]
        mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_atomic_prodders(self, partree):
        # Atomic-ize any single-thread Prodders in the parallel tree
        mapper = {i: SingleThreadProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, omp_pragma):
        """Parallelize `root` attaching a suitable OpenMP pragma."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = []
        if ncores() >= Ompizer.COLLAPSE and IsPerfectIteration().visit(root):
            for n, i in enumerate(candidates[1:], 1):
                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorizable:
                    break

                collapsable.append(i)

        # Attach an OpenMP pragma-for with a collapse clause
        ncollapse = 1 + len(collapsable)
        partree = root._rebuild(pragmas=root.pragmas + (omp_pragma(ncollapse),),
                                properties=root.properties + (COLLAPSED(ncollapse),))

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # Build the `omp-parallel` region
        private = [i for i in FindSymbols().visit(partree)
                   if i.is_Array and i._mem_stack]
        private = sorted(set([i.name for i in private]))
        return ParallelRegion(partree, self.nthreads, private)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= Ompizer.NESTED:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            index = tree.index(partree)
            outer = tree[index:index + partree.ncollapsed]
            inner = tree[index + partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if any(is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            omp_pragma = lambda i: self.lang['par-for'](i, nhyperthreads())
            subroot, subpartree, _ = self._make_partree(candidates, omp_pragma)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def make_parallel(self, iet):
        """Transform ``iet`` by introducing shared-memory parallelism."""
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates, self.lang['for'])

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Ensure increments are atomic
            partree = self._make_atomic_incs(partree)

            # Ensure single-thread prodders are atomic
            partree = self._make_atomic_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        return iet, {'args': [self.nthreads] if mapper else [],
                     'includes': ['omp.h']}
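
A sketch of the C emitted for an atomic increment, built with plain cgen; Devito's `List`/`Transformer` IET machinery is not assumed here, and the statement `sum[i] += f[i][j]` is an illustrative stand-in for an increment Expression.

import cgen as c

# The pragma placed immediately before each increment
atomic = c.Pragma('omp atomic update')
# A placeholder increment statement
increment = c.Statement('sum[i] += f[i][j]')

for node in (atomic, increment):
    print(node)
# -> #pragma omp atomic update
# -> sum[i] += f[i][j];
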
Example #28
class DeviceAccizer(DeviceOmpizer):

    lang = dict(DeviceOmpizer.__base__.lang)
    lang.update({
        'atomic':
        c.Pragma('acc atomic update'),
        'map-enter-to':
        lambda i, j: c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-to-wait':
        lambda i, j, k: (c.Pragma('acc enter data copyin(%s%s) async(%s)' %
                                  (i, j, k)), c.Pragma('acc wait(%s)' % k)),
        'map-enter-alloc':
        lambda i, j: c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present':
        lambda i, j: c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-update':
        lambda i, j: c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-update-host':
        lambda i, j: c.Pragma('acc update self(%s%s)' % (i, j)),
        'map-update-wait-host':
        lambda i, j, k: (c.Pragma('acc update self(%s%s) async(%s)' %
                                  (i, j, k)), c.Pragma('acc wait(%s)' % k)),
        'map-update-device':
        lambda i, j: c.Pragma('acc update device(%s%s)' % (i, j)),
        'map-update-wait-device':
        lambda i, j, k: (c.Pragma('acc update device(%s%s) async(%s)' %
                                  (i, j, k)), c.Pragma('acc wait(%s)' % k)),
        'map-release':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-exit-delete':
        lambda i, j: c.Pragma('acc exit data delete(%s%s)' % (i, j)),
        'map-pointers':
        lambda i: c.Pragma('acc host_data use_device(%s)' % i)
    })

    _Iteration = DeviceOpenACCIteration

    @classmethod
    def _map_to_wait(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-enter-to-wait'](f.name, sections, queueid)

    @classmethod
    def _map_present(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-present'](f.name, sections)

    @classmethod
    def _map_delete(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-exit-delete'](f.name, sections)

    @classmethod
    def _map_update_wait_host(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-wait-host'](f.name, sections, queueid)

    @classmethod
    def _map_update_wait_device(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.lang['map-update-wait-device'](f.name, sections, queueid)

    @classmethod
    def _map_pointers(cls, functions):
        return cls.lang['map-pointers'](','.join(f.name for f in functions))

    def _make_parallel(self, iet):
        iet, metadata = super(DeviceAccizer, self)._make_parallel(iet)

        metadata['includes'] = ['openacc.h']

        return iet, metadata
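
A sketch of what the 'map-update-wait-host' entry above evaluates to: a pair of pragmas, an asynchronous update plus a matching wait. Only `cgen` is assumed; the name `u`, the sections string, and the queue id `0` are arbitrary example values.

import cgen as c

map_update_wait_host = lambda i, j, k: (
    c.Pragma('acc update self(%s%s) async(%s)' % (i, j, k)),
    c.Pragma('acc wait(%s)' % k))

for p in map_update_wait_host('u', '[0:nt][0:nx]', 0):
    print(p)
# -> #pragma acc update self(u[0:nt][0:nx]) async(0)
# -> #pragma acc wait(0)
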
Example #29
class OffloadingOmpizer(Ompizer):

    COLLAPSE_NCORES = 1
    """
    Always collapse when possible.
    """

    COLLAPSE_WORK = 1
    """
    Always collapse when possible.
    """

    lang = dict(Ompizer.lang)
    lang.update({
        'par-for-teams':
        lambda i: c.Pragma(
            'omp target teams distribute parallel for collapse(%d)' % i),
        'map-enter-to':
        lambda i, j: c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc':
        lambda i, j: c.Pragma('omp target enter data map(alloc: %s%s)' %
                              (i, j)),
        'map-exit-from':
        lambda i, j: c.Pragma('omp target exit data map(from: %s%s)' % (i, j)),
        'map-exit-delete':
        lambda i, j: c.Pragma('omp target exit data map(delete: %s%s)' %
                              (i, j)),
    })

    def __init__(self, key=None):
        if key is None:
            key = lambda i: i.is_ParallelRelaxed
        super(OffloadingOmpizer, self).__init__(key=key)

    @classmethod
    def _map_data(cls, f):
        if f.is_Array:
            return f.symbolic_shape
        else:
            return tuple(f._C_get_field(FULL, d).size for d in f.dimensions)

    @classmethod
    def _map_to(cls, f):
        return cls.lang['map-enter-to'](f.name,
                                        ''.join('[0:%s]' % i
                                                for i in cls._map_data(f)))

    @classmethod
    def _map_alloc(cls, f):
        return cls.lang['map-enter-alloc'](f.name,
                                           ''.join('[0:%s]' % i
                                                   for i in cls._map_data(f)))

    @classmethod
    def _map_from(cls, f):
        return cls.lang['map-exit-from'](f.name,
                                         ''.join('[0:%s]' % i
                                                 for i in cls._map_data(f)))

    @classmethod
    def _map_delete(cls, f):
        return cls.lang['map-exit-delete'](f.name,
                                           ''.join('[0:%s]' % i
                                                   for i in cls._map_data(f)))

    def _make_threaded_prodders(self, partree):
        # no-op for now
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas
        for GPU offloading.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        omp_pragma = self.lang['par-for-teams'](ncollapse)

        # Create a ParallelTree
        body = root._rebuild(pragmas=root.pragmas + (omp_pragma, ),
                             properties=root.properties +
                             (COLLAPSED(ncollapse), ))
        partree = ParallelTree([], body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree):
        # no-op for now
        return partree

    def _make_guard(self, partree, *args):
        # no-op for now
        return partree

    def _make_nested_partree(self, partree):
        # no-op for now
        return partree
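
A sketch of the pragmas the class above produces: the data-mapping pragma that `_map_to` builds for a hypothetical 2D Function `u` with symbolic sizes `x_size` and `y_size`, and the offloading pragma from the 'par-for-teams' shortcut. Only `cgen` is assumed.

import cgen as c

name, shape = 'u', ('x_size', 'y_size')
sections = ''.join('[0:%s]' % i for i in shape)

# Copy the data to the device before the offloaded loop nest...
print(c.Pragma('omp target enter data map(to: %s%s)' % (name, sections)))
# -> #pragma omp target enter data map(to: u[0:x_size][0:y_size])

# ...and offload the (collapsed) loop nest itself
print(c.Pragma('omp target teams distribute parallel for collapse(%d)' % 2))
# -> #pragma omp target teams distribute parallel for collapse(2)
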
Example #30
    def _cgen(self):
        return cgen.Pragma(self.value)
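
A minimal, self-contained sketch of the kind of node this method fragment belongs to; the class name `PragmaNode` is hypothetical, and only the `value` attribute and the call to `cgen.Pragma` are taken from the fragment above.

import cgen


class PragmaNode(object):
    """Hypothetical IET node holding a raw pragma string in `value`."""

    def __init__(self, value):
        self.value = value

    def _cgen(self):
        return cgen.Pragma(self.value)


print(PragmaNode('omp simd')._cgen())
# -> #pragma omp simd
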