Example #1
class OmpBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenMP',
        'header': 'omp.h',
        # Platform mapping
        AMDGPUX: None,
        NVIDIAX: None,
        # Runtime library
        'init': None,
        'thread-num': DefFunction('omp_get_thread_num'),
        'num-devices': lambda args: DefFunction('omp_get_num_devices', args),
        'set-device': lambda args: Call('omp_set_default_device', args),
        # Pragmas
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)'
                                                  % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'map-enter-to': lambda i, j: c.Pragma('omp target enter data map(to: %s%s)'
                                              % (i, j)),
        'map-enter-alloc': lambda i, j: c.Pragma('omp target enter data map(alloc: %s%s)'
                                                 % (i, j)),
        'map-update': lambda i, j: c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j: c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j: c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j, k: c.Pragma('omp target exit data map(release: %s%s)%s'
                                                % (i, j, k)),
        'map-exit-delete': lambda i, j, k: c.Pragma('omp target exit data map(delete: %s%s)%s'
                                                    % (i, j, k)),
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration
    DeviceIteration = DeviceOmpIteration
    Prodder = ThreadedProdder
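
For illustration, a minimal sketch of how these mapper entries could be invoked directly; the function name `u` and the section string `[0:nx][0:ny]` are hypothetical:

# Illustrative only -- exercising two of the entries above
OmpBB.mapper['atomic']
# -> c.Pragma rendering as: #pragma omp atomic update
OmpBB.mapper['map-enter-to']('u', '[0:nx][0:ny]')
# -> c.Pragma rendering as: #pragma omp target enter data map(to: u[0:nx][0:ny])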
Example #2
    def __init__(self, prodder):
        # Atomic-ize any single-thread Prodders in the parallel tree
        condition = CondEq(DefFunction('omp_get_thread_num'), 0)

        # Prod within a while loop until all communications have completed
        # In other words, the thread delegated to prodding is entrapped for as long
        # as it's required
        prod_until = Not(DefFunction(prodder.name, [i.name for i in prodder.arguments]))
        then_body = List(header=c.Comment('Entrap thread until comms have completed'),
                         body=While(prod_until))

        Conditional.__init__(self, condition, then_body)
        Prodder.__init__(self, prodder.name, prodder.arguments, periodic=prodder.periodic)
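
As a hedged sketch, the node assembled above lowers to C along these lines, for a hypothetical prodder `prod0` with arguments `(a, b)`:

# Illustrative only -- the generated C, roughly:
#   if (omp_get_thread_num() == 0)
#   {
#     /* Entrap thread until comms have completed */
#     while (!prod0(a, b));
#   }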
Example #3
def _(f, szs, sregistry):
    assert len(szs) == len(f.dimensions) - 1

    pname = sregistry.make_name(prefix='%sL' % f.name)
    cbk = lambda i, pname=pname: FIndexed(i, pname)

    expr = sum([MacroArgument(d0.name)*szs[d1]
                for d0, d1 in zip(f.dimensions, f.dimensions[1:])])
    expr += MacroArgument(f.dimensions[-1].name)
    expr = Indexed(IndexedData(f.name, None, f), expr)
    define = DefFunction(pname, f.dimensions)
    header = (ccode(define), ccode(expr))

    return header, cbk
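
A hedged sketch of the return value for a hypothetical 3D Function `u(x, y, z)`, assuming `szs` maps each of the two trailing dimensions to its slice-size symbol (say `y_slc0` and `z_slc0`):

# Illustrative only:
#   header -> ('uL(x, y, z)', 'u[(x)*y_slc0 + (y)*z_slc0 + (z)]')
#   cbk(i) -> FIndexed(i, 'uL'), so an access such as u[x, y + 1, z]
#             later renders as uL(x, y + 1, z)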
Example #4
    def _(iet):
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)

        objcomm = None
        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                objcomm = i
                break

        deviceid = DeviceID()
        device_nvidia = Macro('acc_device_nvidia')
        if objcomm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = DefFunction('acc_get_num_devices', device_nvidia)
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
            asdn_else = Call('acc_set_device_num',
                             [rank % ngpus, device_nvidia])

            body = [
                Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1), asdn_then,
                    List(body=[rank_decl, rank_init, ngpus_init, asdn_else]))
            ]
        else:
            body = [
                Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    Call('acc_set_device_num', [deviceid, device_nvidia]))
            ]

        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet, {'args': deviceid}
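
For reference, a hedged sketch of the setup code this pass prepends in the MPI branch, using the same variable names as above:

# Illustrative only -- the generated C, roughly:
#   /* Begin of OpenACC+MPI setup */
#   acc_init(acc_device_nvidia);
#   if (deviceid != -1)
#     acc_set_device_num(deviceid, acc_device_nvidia);
#   else {
#     int rank = 0;
#     MPI_Comm_rank(comm, &rank);
#     int ngpus = acc_get_num_devices(acc_device_nvidia);
#     acc_set_device_num(rank % ngpus, acc_device_nvidia);
#   }
#   /* End of OpenACC+MPI setup */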
Example #5
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        if obj._mem_mapped:
            super()._alloc_array_on_high_bw_mem(site, obj, storage)
        else:
            # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
            # on the device as it never needs to be accessed on the host
            assert obj._mem_local

            deviceid = DefFunction(self.lang['device-get'].name)
            doalloc = self.lang['device-alloc']
            dofree = self.lang['device-free']

            size = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
            init = doalloc(size, deviceid, retobj=obj)

            free = dofree(obj._C_name, deviceid)

            storage.update(obj, site, allocs=init, frees=free)
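
Under OpenACC, for instance, the `device-alloc` and `device-free` entries of `AccBB.mapper` (Example #10 below) make this lower to roughly the following; the array name `a` and its shape are hypothetical:

# Illustrative only -- the generated C, roughly:
#   a = (float *) acc_malloc(sizeof(float)*nx*ny);
#   ...
#   acc_free(a);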
Example #6
    def _initialize(iet):
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)
        comm = None
        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                comm = i
                break

        device_nvidia = Macro('acc_device_nvidia')
        body = Call('acc_init', [device_nvidia])

        if comm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = DefFunction('acc_get_num_devices', device_nvidia)
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            devicenum = Symbol(name='devicenum')
            devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

            set_device_num = Call('acc_set_device_num',
                                  [devicenum, device_nvidia])

            body = [
                rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body
            ]

        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet
Example #7
def test_symbolics():
    a = Symbol('a')

    id = IntDiv(a, 3)
    pkl_id = pickle.dumps(id)
    new_id = pickle.loads(pkl_id)
    assert id == new_id

    ffp = CallFromPointer('foo', a, ['b', 'c'])
    pkl_ffp = pickle.dumps(ffp)
    new_ffp = pickle.loads(pkl_ffp)
    assert ffp == new_ffp

    li = ListInitializer(['a', 'b'])
    pkl_li = pickle.dumps(li)
    new_li = pickle.loads(pkl_li)
    assert li == new_li

    df = DefFunction('f', ['a', 1, 2])
    pkl_df = pickle.dumps(df)
    new_df = pickle.loads(pkl_df)
    assert df == new_df
    assert df.arguments == new_df.arguments
Example #8
class Ompizer(object):

    lang = {
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)'
                                                  % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'thread-num': DefFunction('omp_get_thread_num')
    }
    """
    Shortcuts for the OpenMP language.
    """

    _Region = OpenMPRegion
    _Iteration = OpenMPIteration

    def __init__(self, sregistry, options, key=None):
        """
        Parameters
        ----------
        sregistry : SymbolRegistry
            The symbol registry, to quickly access the special symbols that may
            appear in the IET (e.g., `sregistry.threadid`, `sregistry.nthreads`).
        options : dict
             The optimization options. Accepted: ['par-collapse-ncores',
             'par-collapse-work', 'par-chunk-nonaffine', 'par-dynamic-work', 'par-nested']
             * 'par-collapse-ncores': use a collapse clause if the number of
               available physical cores is greater than this threshold.
             * 'par-collapse-work': use a collapse clause if the trip count of the
               collapsable Iterations is statically known to exceed this threshold.
             * 'par-chunk-nonaffine': coefficient to adjust the chunk size in
               non-affine parallel Iterations.
             * 'par-dynamic-work': use dynamic scheduling if the operation count per
               iteration exceeds this threshold. Otherwise, use static scheduling.
             * 'par-nested': nested parallelism if the number of hyperthreads per core
               is greater than this threshold.
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        self.sregistry = sregistry

        self.collapse_ncores = options['par-collapse-ncores']
        self.collapse_work = options['par-collapse-work']
        self.chunk_nonaffine = options['par-chunk-nonaffine']
        self.dynamic_work = options['par-dynamic-work']
        self.nested = options['par-nested']

        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorized

    @property
    def nthreads(self):
        return self.sregistry.nthreads

    @property
    def nthreads_nested(self):
        return self.sregistry.nthreads_nested

    @property
    def nthreads_nonaffine(self):
        return self.sregistry.nthreads_nonaffine

    @property
    def threadid(self):
        return self.sregistry.threadid

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.collapse_ncores:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols
                       for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n + 1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.collapse_work:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)
        return collapsable

    @classmethod
    def _make_tid(cls, tid):
        return c.Initializer(c.Value(tid._C_typedata, tid.name),
                             cls.lang['thread-num'])

    def _make_reductions(self, partree, collapsed):
        if not any(i.is_ParallelAtomic for i in collapsed):
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [
            i for i in exprs if i.is_Increment and not i.is_ForeignExpression
        ]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {
                i: List(header=self.lang['atomic'], body=i)
                for i in exprs
            }

        partree = Transformer(mapper).visit(partree)

        return partree
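
    # Illustrative only: for an increment such as `s = s + f[x]`, the branch
    # rebuilding the root attaches a reduction clause, rendering as e.g.
    #   #pragma omp parallel for ... reduction(+:s)
    # whereas the fallback prepends `#pragma omp atomic update` to each
    # increment individually.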

    def _make_threaded_prodders(self, partree):
        mapper = {
            i: ThreadedProdder(i)
            for i in FindNodes(Prodder).visit(partree)
        }
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.dynamic_work:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = OpenMPIteration(schedule=schedule,
                                       ncollapse=ncollapse,
                                       **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = OpenMPIteration(schedule=schedule,
                                       parallel=True,
                                       ncollapse=ncollapse,
                                       nthreads=nthreads,
                                       **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = OpenMPIteration(ncollapse=ncollapse,
                                   chunk_size=chunk_size,
                                   **root.args)

            niters = prod([root.symbolic_size] +
                          [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree, parrays):
        arrays = [i for i in FindSymbols().visit(partree) if i.is_Array]

        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_private = [i for i in arrays if i._mem_heap and i._mem_local]
        heap_globals = []
        for i in heap_private:
            if i in parrays:
                pi = parrays[i]
            else:
                pi = parrays.setdefault(
                    i,
                    PointerArray(name=self.sregistry.make_name(),
                                 dimensions=(self.threadid, ),
                                 array=i))
            heap_globals.append(Dereference(i, pi))
        if heap_globals:
            body = List(header=self._make_tid(self.threadid),
                        body=heap_globals + [partree],
                        footer=c.Line())
        else:
            body = partree

        return OpenMPRegion(body, partree.nthreads)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [
            CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)
        ]
        cond = Or(*cond)
        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])
        return partree
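
    # Illustrative only: with a hypothetical symbolic step `x0_blk0_size`,
    # the guard built above lowers to roughly
    #   if (x0_blk0_size == 0) return;
    # emitted just before entering the parallel region.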

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= self.nested:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel Iteration
            # (i.e., the outer is a loop over blocks, while the nested is a loop
            # within a block)
            candidates = []
            for i in inner:
                if self.key(i) and any(
                        is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(
                candidates, self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if root in mapper:
                continue

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree, parrays)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [i for i in FindSymbols().visit(iet)
                if isinstance(i, NThreadsMixin)]
        for n in FindNodes(Dereference).visit(iet):
            args.extend([(n.array, True), n.parray])

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
        """
        simd_reg_size = kwargs.pop('simd_reg_size')

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            candidates = [i for i in tree if i.is_Parallel]

            # As long as there's an outer level of parallelism, the innermost
            # PARALLEL Iteration gets vectorized
            if len(candidates) < 2:
                continue
            candidate = candidates[-1]

            # Construct OpenMP SIMD pragma
            aligned = [
                j for j in FindSymbols('symbolics').visit(candidate)
                if j.is_DiscreteFunction
            ]
            if aligned:
                simd = self.lang['simd-for-aligned']
                simd = as_tuple(
                    simd(','.join([j.name for j in aligned]), simd_reg_size))
            else:
                simd = as_tuple(self.lang['simd-for'])
            pragmas = candidate.pragmas + simd

            # Add VECTORIZED property
            properties = list(candidate.properties) + [VECTORIZED]

            mapper[candidate] = candidate._rebuild(pragmas=pragmas,
                                                   properties=properties)

        iet = Transformer(mapper).visit(iet)

        return iet, {}
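
A hedged usage sketch with hypothetical option values (the real defaults live elsewhere in the compiler); `_make_parallel` is called directly here to sidestep the `@iet_pass` plumbing:

# Illustrative only
options = {'par-collapse-ncores': 4, 'par-collapse-work': 100,
           'par-chunk-nonaffine': 3, 'par-dynamic-work': 10,
           'par-nested': 2}
ompizer = Ompizer(sregistry, options)
iet, meta = ompizer._make_parallel(iet)  # meta['includes'] == ['omp.h']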
Example #9
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # We may have already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    iet = Transformer(mapper).visit(iet)
    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers
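
A hedged before/after sketch for a hypothetical TimeFunction `u(t, x, y, z)`; the symbol names follow the `*_fsz`/`*_slc` prefixes used above:

# Illustrative only -- the emitted C, roughly:
#   const int x_fsz0 = u_vec->size[1];
#   const int y_fsz0 = u_vec->size[2];
#   const int z_fsz0 = u_vec->size[3];
#   const int x_slc0 = x_fsz0*y_fsz0*z_fsz0;
#   const int y_slc0 = y_fsz0*z_fsz0;
#   const int z_slc0 = z_fsz0;
#   #define uL(t, x, y, z) u[(t)*x_slc0 + (x)*y_slc0 + (y)*z_slc0 + (z)]
# after which an access like u[t2, x + 8, y + 9, z + 7] is rewritten as
# uL(t2, x + 8, y + 9, z + 7).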
Example #10
class AccBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenACC',
        'header': 'openacc.h',
        # Platform mapping
        AMDGPUX: Macro('acc_device_radeon'),
        NVIDIAX: Macro('acc_device_nvidia'),
        # Runtime library
        'init': lambda args: Call('acc_init', args),
        'num-devices': lambda args: DefFunction('acc_get_num_devices', args),
        'set-device': lambda args: Call('acc_set_device_num', args),
        # Pragmas
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j: c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-to-wait': lambda i, j, k: (c.Pragma('acc enter data copyin(%s%s) async(%s)'
                                                       % (i, j, k)),
                                              c.Pragma('acc wait(%s)' % k)),
        'map-enter-alloc': lambda i, j: c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present': lambda i, j: c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-wait': lambda i: c.Pragma('acc wait(%s)' % i),
        'map-update': lambda i, j: c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-update-host': lambda i, j: c.Pragma('acc update self(%s%s)' % (i, j)),
        'map-update-host-async': lambda i, j, k: c.Pragma('acc update self(%s%s) async(%s)'
                                                          % (i, j, k)),
        'map-update-device': lambda i, j: c.Pragma('acc update device(%s%s)' % (i, j)),
        'map-update-device-async': lambda i, j, k: c.Pragma('acc update device(%s%s) async(%s)'
                                                            % (i, j, k)),
        'map-release': lambda i, j, k: c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'map-exit-delete': lambda i, j, k: c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'memcpy-to-device': lambda i, j, k: Call('acc_memcpy_to_device', [i, j, k]),
        'memcpy-to-device-wait': lambda i, j, k, l: List(body=[
            Call('acc_memcpy_to_device_async', [i, j, k, l]),
            Call('acc_wait', [l])
        ]),
        'device-get': Call('acc_get_device_num'),
        'device-alloc': lambda i, *a, retobj=None: Call('acc_malloc', (i,),
                                                        retobj=retobj, cast=True),
        'device-free': lambda i, *a: Call('acc_free', (i,))
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration  # Host parallelism still goes via OpenMP
    DeviceIteration = DeviceAccIteration

    @classmethod
    def _map_to_wait(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-enter-to-wait'](f.name, sections, queueid)

    @classmethod
    def _map_present(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-present'](f.name, sections)

    @classmethod
    def _map_delete(cls, f, imask=None, devicerm=None):
        sections = cls._make_sections_from_imask(f, imask)
        if devicerm is not None:
            cond = ' if(%s)' % devicerm.name
        else:
            cond = ''
        return cls.mapper['map-exit-delete'](f.name, sections, cond)

    @classmethod
    def _map_update_host_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-host-async'](f.name, sections, queueid)

    @classmethod
    def _map_update_device_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-device-async'](f.name, sections, queueid)
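
For illustration, hypothetical invocations of two of the classmethods above; `u` and `imask` are placeholders:

# Illustrative only:
#   AccBB._map_to_wait(u, imask, queueid=0) returns the pragma pair
#     #pragma acc enter data copyin(u[...]) async(0)
#     #pragma acc wait(0)
#   AccBB._map_update_host_async(u, imask, queueid=0) renders as
#     #pragma acc update self(u[...]) async(0)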