Example #1
    def _make_compute(self, hs, key, msgs, callpoke):
        if hs.body.is_Call:
            return None
        else:
            mapper = {i: List(body=[callpoke, i]) for i in
                      FindNodes(ExpressionBundle).visit(hs.body)}
            iet = Transformer(mapper).visit(hs.body)
            return make_efunc('compute%d' % key, iet, hs.arguments)
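The mapper-driven rewrite above is the standard IET substitution pattern: build a dict from existing nodes to replacement nodes, then let a visitor rebuild the tree. Below is a toy, pure-Python stand-in for that idea (not devito's actual Transformer, which operates on full IET nodes):

def transform(node, mapper):
    # Replace any subtree listed in `mapper`; otherwise recurse into children
    if node in mapper:
        return mapper[node]
    if isinstance(node, tuple):
        return tuple(transform(child, mapper) for child in node)
    return node

tree = ('root', ('expr0', 'expr1'))
mapper = {'expr1': ('callpoke', 'expr1')}
assert transform(tree, mapper) == ('root', ('expr0', ('callpoke', 'expr1')))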
Example #2
    def _build_casts(self, iet):
        """Introduce array and pointer casts at the top of the
        Iteration/Expression tree ``iet``."""
        casts = [
            ArrayCast(f) for f in self.input if f.is_Tensor and f._mem_external
        ]
        casts.append(PointerCast(Timer(self.profiler)))
        return List(body=casts + [iet])
Example #3
def iet_insert_decls(iet, external=None):
    """
    Transform the input IET inserting the necessary symbol declarations.
    Declarations are placed as close as possible to the first symbol occurrence.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    external : tuple, optional
        The symbols defined in some outer Callable, which therefore must not
        be re-defined.
    """
    iet = as_tuple(iet)

    # Classify and then schedule declarations to stack/heap
    allocator = Allocator()
    for k, v in MapExprStmts().visit(iet).items():
        if k.is_Expression:
            if k.is_definition:
                # On the stack
                site = v if v else iet
                allocator.push_scalar_on_stack(site[-1], k)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.arguments
        else:
            # Neither an Expression nor a Call; nothing to declare
            continue

        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v if v else iet
                    allocator.push_object_on_stack(site[-1], i)
                elif i.is_Array:
                    if i in as_tuple(external):
                        # The Array is defined in some other IET
                        continue
                    elif i._mem_stack:
                        # On the stack
                        allocator.push_object_on_stack(iet[0], i)
                    else:
                        # On the heap
                        allocator.push_array_on_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    mapper = dict(allocator.onstack)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
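The heap branch relies on `zip(*...)` to transpose a sequence of (decl, alloc, free) triples into three parallel tuples. A minimal sketch with placeholder strings:

onheap = [('decl_u', 'alloc_u', 'free_u'),
          ('decl_v', 'alloc_v', 'free_v')]
decls, allocs, frees = zip(*onheap)   # transpose rows into columns
assert decls == ('decl_u', 'decl_v')
assert allocs == ('alloc_u', 'alloc_v')
assert frees == ('free_u', 'free_v')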
Example #4
    def _make_sendrecv(self, f, hse, key, **kwargs):
        comm = f.grid.distributor._obj_comm

        buf_dims = [
            Dimension(name='buf_%s' % d.root) for d in f.dimensions
            if d not in hse.loc_indices
        ]
        bufg = Array(name='bufg',
                     dimensions=buf_dims,
                     dtype=f.dtype,
                     padding=0,
                     scope='heap')
        bufs = Array(name='bufs',
                     dimensions=buf_dims,
                     dtype=f.dtype,
                     padding=0,
                     scope='heap')

        ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
        ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

        fromrank = Symbol(name='fromrank')
        torank = Symbol(name='torank')

        gather = Call('gather_%s' % key,
                      [bufg] + list(bufg.shape) + [f] + ofsg)
        scatter = Call('scatter_%s' % key,
                       [bufs] + list(bufs.shape) + [f] + ofss)

        # The `gather` is unnecessary if sending to MPI.PROC_NULL
        gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)
        # The `scatter` must be guarded as we must not alter the halo values along
        # the domain boundary, where the sender is actually MPI.PROC_NULL
        scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')),
                              scatter)

        count = reduce(mul, bufs.shape, 1)
        rrecv = MPIRequestObject(name='rrecv')
        rsend = MPIRequestObject(name='rsend')
        recv = Call('MPI_Irecv', [
            bufs, count,
            Macro(dtype_to_mpitype(f.dtype)), fromrank,
            Integer(13), comm, rrecv
        ])
        send = Call('MPI_Isend', [
            bufg, count,
            Macro(dtype_to_mpitype(f.dtype)), torank,
            Integer(13), comm, rsend
        ])

        waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
        waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

        iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
        parameters = ([f] + list(bufs.shape) + ofsg + ofss +
                      [fromrank, torank, comm])
        return Callable('sendrecv_%s' % key, iet, 'void', parameters,
                        ('static', ))
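The `count` argument passed to `MPI_Irecv`/`MPI_Isend` is the number of buffer elements, i.e. the product of the buffer shape. The same `reduce(mul, ..., 1)` idiom in plain Python:

from functools import reduce
from operator import mul

shape = (4, 10, 10)              # e.g., a 4x10x10 halo buffer
count = reduce(mul, shape, 1)
assert count == 400
assert reduce(mul, (), 1) == 1   # the initializer `1` covers empty shapes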
Example #5
    def _avoid_denormals(self, nodes, state):
        """
        Introduce nodes in the Iteration/Expression tree that will expand to C
        macros telling the CPU to flush denormal numbers in hardware. Denormals
        are normally flushed when using SSE-based instruction sets, except when
        compiling shared objects.
        """
        return (List(body=(Denormals(), nodes)),
                {'includes': ('xmmintrin.h', 'pmmintrin.h')})
Example #6
    def _avoid_denormals(self, iet):
        header = [
            cgen.Comment('Flush denormal numbers to zero in hardware'),
            cgen.Statement(
                '_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
            cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
        ]
        iet = List(header=header, body=iet)
        return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
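For reference, the three `cgen` nodes above render to the following C lines. This sketch uses only the public `cgen` API, independently of the IET machinery:

import cgen

header = [
    cgen.Comment('Flush denormal numbers to zero in hardware'),
    cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
    cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'),
]
print('\n'.join(str(i) for i in header))
# /* Flush denormal numbers to zero in hardware */
# _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
# _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);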
Example #7
    def _(iet):
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)

        objcomm = None
        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                objcomm = i
                break

        deviceid = DeviceID()
        if objcomm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = Function('omp_get_num_devices')()
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            osdd_then = Call('omp_set_default_device', [deviceid])
            osdd_else = Call('omp_set_default_device', [rank % ngpus])

            body = [
                Conditional(
                    CondNe(deviceid, -1),
                    osdd_then,
                    List(body=[rank_decl, rank_init, ngpus_init, osdd_else]),
                )
            ]
        else:
            body = [
                Conditional(CondNe(deviceid, -1),
                            Call('omp_set_default_device', [deviceid]))
            ]

        init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))
        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet, {'args': deviceid}
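The conditional encodes a simple device-binding policy: honor an explicit `deviceid`, otherwise bind each MPI rank to GPU `rank % ngpus`. A pure-Python sketch of just the policy (names are illustrative):

def select_device(deviceid, rank, ngpus):
    # -1 means "no explicit device requested"
    return deviceid if deviceid != -1 else rank % ngpus

assert select_device(2, rank=5, ngpus=4) == 2    # explicit id wins
assert select_device(-1, rank=5, ngpus=4) == 1   # round-robin over GPUs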
Example #8
    def make(self, hs):
        """
        Construct Callables and Calls implementing distributed-memory halo
        exchange for the HaloSpot ``hs``.
        """
        # Sanity check
        assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

        for f, hse in hs.fmapper.items():
            # Build an MPIMsg, a data structure to be propagated across the
            # various halo exchange routines
            if (f, hse) not in self._msgs:
                key = self._gen_msgkey()
                msg = self._msgs.setdefault((f, hse),
                                            self._make_msg(f, hse, key))
            else:
                msg = self._msgs[(f, hse)]

            # Callables for send/recv/wait
            if (f.ndim, hse) not in self._cache_halo:
                self._make_all(f, hse, msg)

        msgs = [self._msgs[(f, hse)] for f, hse in hs.fmapper.items()]

        # Callable for poking the asynchronous progress engine
        key = self._gen_compkey()
        poke = self._make_poke(hs, key, msgs)
        if poke is not None:
            self._efuncs.append(poke)

        # Callable for compute over the CORE region
        callpoke = self._call_poke(poke)
        compute = self._make_compute(hs, key, msgs, callpoke)
        if compute is not None:
            self._efuncs.append(compute)

        # Callable for compute over the OWNED region
        region = self._make_region(hs, key)
        region = self._regions.setdefault(hs, region)
        callcompute = self._call_compute(hs, compute, msgs)
        remainder = self._make_remainder(hs, key, callcompute, region)
        if remainder is not None:
            self._efuncs.append(remainder)

        # Now build up the HaloSpot body, with explicit Calls to the constructed Callables
        body = [callcompute]
        for i, (f, hse) in enumerate(hs.fmapper.items()):
            msg = self._msgs[(f, hse)]
            haloupdate, halowait = self._cache_halo[(f.ndim, hse)]
            body.insert(i, self._call_haloupdate(haloupdate.name, f, hse, msg))
            if halowait is not None:
                body.append(self._call_halowait(halowait.name, f, hse, msg))
        if remainder is not None:
            body.append(self._call_remainder(remainder))

        return List(body=body)
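The final loop lays out the HaloSpot body so that all haloupdate Calls precede the CORE computation and all halowait Calls follow it. A sketch with string placeholders shows the resulting order:

body = ['callcompute']
for i, f in enumerate(['u', 'v']):
    body.insert(i, 'haloupdate_%s' % f)   # before the CORE compute
    body.append('halowait_%s' % f)        # after the CORE compute
body.append('remainder')                  # OWNED-region compute last
assert body == ['haloupdate_u', 'haloupdate_v', 'callcompute',
                'halowait_u', 'halowait_v', 'remainder']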
Example #9
    def _padding(self, nodes, state):
        """
        Introduce temporary buffers padded to the nearest multiple of the vector
        length, to maximize data alignment. At the bottom of the kernel, the
        values in the padded temporaries will be copied back into the input arrays.
        """
        mapper = OrderedDict()

        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(nodes)
        if not handle:
            return nodes, {}
        shape = max([i.shape for i in handle], key=len)
        if not shape:
            return nodes, {}
        candidates = [i for i in handle if i.shape[-1] == shape[-1]]
        if not candidates:
            return nodes, {}

        # Retrieve the maximum number of items in a SIMD register when processing
        # the expressions in ``nodes``
        exprs = FindNodes(Expression).visit(nodes)
        exprs = [e for e in exprs if e.write in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expectable padding, for AVX512 registers)
            # Integer division: `simd_items` is later used as a shape entry
            simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

        shapes = {
            k: k.shape[:-1] + (roundm(k.shape[-1], simd_items), )
            for k in candidates
        }
        mapper.update(
            OrderedDict([(k.indexed,
                          Array(name='p%s' % k.name,
                                shape=shapes[k],
                                dimensions=k.indices,
                                onstack=k._mem_stack).indexed)
                         for k in candidates]))

        # Substitute original arrays with padded buffers
        processed = SubstituteExpression(mapper).visit(nodes)

        # Build Iteration trees for initialization and copy-back of padded arrays
        mapper = OrderedDict([(k, v) for k, v in mapper.items()
                              if k.function.is_SymbolicFunction])
        init = copy_arrays(mapper, reverse=True)
        copyback = copy_arrays(mapper)

        processed = List(body=init + as_tuple(processed) + copyback)

        return processed, {}
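`roundm` is assumed here to round its first argument up to the nearest multiple of the second, which is what makes the padded trailing dimension span a whole number of SIMD registers. The arithmetic in isolation:

def roundm(x, m):
    # Round `x` up to the nearest multiple of `m`
    return ((x + m - 1) // m) * m

assert roundm(13, 8) == 16   # 13 floats padded to two 8-item registers
assert roundm(16, 8) == 16   # already aligned, left unchanged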
Example #10
    def instrument(self, iet):
        sections = FindNodes(Section).visit(iet)

        # Transform the Iteration/Expression tree introducing Advisor calls that
        # resume and stop data collection
        mapper = {i: List(body=[Call(self._api_resume), i, Call(self._api_pause)])
                  for i in sections}
        iet = Transformer(mapper).visit(iet)

        return iet
Example #11
    def _build_casts(self, nodes):
        """Introduce array and pointer casts at the top of the
        Iteration/Expression tree ``nodes``."""
        casts = [
            ArrayCast(f) for f in self.input if f.is_Tensor and f._mem_external
        ]
        profiler = Object(self.profiler.name, self.profiler.dtype,
                          self.profiler.new)
        casts.append(PointerCast(profiler))
        return List(body=casts + [nodes])
Example #12
def iet_insert_C_decls(iet, func_table):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: A mapper from callable names to :class:`Callable`s
                       called from within ``iet``.
    """
    # Resolve function calls first
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table[k.name]
            if func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        else:
            scopes.append((k, v))

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.write is None or k.write._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue
        elif k.write._mem_stack:
            # On the stack
            key = lambda i: not i.is_Parallel
            site = filter_iterations(v, key=key, stop='asap') or [iet]
            allocator.push_stack(site[-1], k.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.write)

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = NestedTransformer(mapper).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(
                Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
Example #13
    def _specialize_iet(self, iet, **kwargs):
        warning("The OPS backend is still work-in-progress")

        ops_init = Call(namespace['ops_init'], [0, 0, 2])
        ops_partition = Call(namespace['ops_partition'], Literal('""'))
        ops_exit = Call(namespace['ops_exit'])

        ops_block = OpsBlock('block')

        # Extract all symbols that need to be converted to ops_dat
        dims = []
        to_dat = set()
        for section, trees in find_affine_trees(iet).items():
            dims.append(len(trees[0].dimensions))
            symbols = set(FindSymbols('symbolics').visit(trees[0].root))
            symbols -= set(FindSymbols('defines').visit(trees[0].root))
            to_dat |= symbols

        # To ensure deterministic code generation we order the datasets to
        # be generated (since a set is an unordered collection)
        to_dat = filter_sorted(to_dat)

        name_to_ops_dat = {}
        pre_time_loop = []
        for f in to_dat:
            if f.is_Constant:
                continue

            pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

        for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
            pre_loop, ops_kernel = opsit(trees, n)

            pre_time_loop.extend(pre_loop)
            self._ops_kernels.append(ops_kernel)

        assert all(d == dims[0] for d in dims), \
            ("The OPS backend currently assumes that all kernels "
             "have the same number of dimensions")

        ops_block_init = Expression(
            ClusterizedEq(
                Eq(ops_block,
                   namespace['ops_decl_block'](dims[0], Literal('"block"')))))

        self._headers.append(namespace['ops_define_dimension'](dims[0]))
        self._includes.append('stdio.h')

        body = [
            ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            ops_exit
        ]

        return List(body=body)
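The `filter_sorted` step matters because Python sets have no guaranteed iteration order; sorting is what makes the generated code reproducible across runs. A tiny sketch:

to_dat = {'w', 'u', 'v'}                   # iteration order is arbitrary
assert sorted(to_dat) == ['u', 'v', 'w']   # a sorted list is deterministic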
Example #14
def update_halo(f, fixed):
    """
    Construct an IET performing a halo exchange for a :class:`TensorFunction`.
    """
    # Requirements
    assert f.is_Function
    assert f.grid is not None

    distributor = f.grid.distributor
    nb = distributor._C_neighbours.obj
    comm = distributor._C_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in fixed}

    mapper = get_views(f, fixed)

    body = []
    masks = []
    for d in f.dimensions:
        if d in fixed:
            continue

        rpeer = FieldFromPointer("%sright" % d, nb)
        lpeer = FieldFromPointer("%sleft" % d, nb)

        # Sending to left, receiving from right
        lsizes, loffsets = mapper[(d, LEFT, OWNED)]
        rsizes, roffsets = mapper[(d, RIGHT, HALO)]
        assert lsizes == rsizes
        sizes = lsizes
        parameters = ([f] + list(f.symbolic_shape) + sizes + loffsets +
                      roffsets + [rpeer, lpeer, comm])
        call = Call('sendrecv_%s' % f.name, parameters)
        mask = Symbol(name='m%sl' % d)
        body.append(Conditional(mask, call))
        masks.append(mask)

        # Sending to right, receiving from left
        rsizes, roffsets = mapper[(d, RIGHT, OWNED)]
        lsizes, loffsets = mapper[(d, LEFT, HALO)]
        assert rsizes == lsizes
        sizes = rsizes
        parameters = ([f] + list(f.symbolic_shape) + sizes + roffsets +
                      loffsets + [lpeer, rpeer, comm])
        call = Call('sendrecv_%s' % f.name, parameters)
        mask = Symbol(name='m%sr' % d)
        body.append(Conditional(mask, call))
        masks.append(mask)

    iet = List(body=body)
    parameters = ([f] + masks + [comm, nb] + list(fixed.values()) +
                  [d.symbolic_size for d in f.dimensions])
    return Callable('halo_exchange_%s' % f.name, iet, 'void', parameters,
                    ('static', ))
Example #15
    def process(self, iet):
        def key(s):
            # The SyncOps are to be processed in the following order
            return [WaitLock, WithLock, Delete, FetchWait,
                    FetchWaitPrefetch].index(s)

        callbacks = {
            WaitLock: self._make_waitlock,
            WithLock: self._make_withlock,
            FetchWait: self._make_fetchwait,
            FetchWaitPrefetch: self._make_fetchwaitprefetch,
            Delete: self._make_delete
        }

        sync_spots = FindNodes(SyncSpot).visit(iet)

        if not sync_spots:
            return iet, {}

        pieces = namedtuple('Pieces', 'init finalize funcs threads')([], [],
                                                                     [], [])

        subs = {}
        for n in sync_spots:
            mapper = as_mapper(n.sync_ops, type)
            for _type in sorted(mapper, key=key):
                subs[n] = callbacks[_type](subs.get(n, n), mapper[_type],
                                           pieces, iet)

        iet = Transformer(subs).visit(iet)

        # Add initialization and finalization code
        init = List(body=pieces.init, footer=c.Line())
        finalize = List(header=c.Line(), body=pieces.finalize)
        iet = iet._rebuild(body=(init, ) + iet.body + (finalize, ))

        return iet, {
            'efuncs': pieces.funcs,
            'includes': ['pthread.h'],
            'args': [i.size for i in pieces.threads if not is_integer(i.size)]
        }
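The `key` function implements a fixed processing priority by indexing into an ordered list of types. The same trick with plain strings standing in for the SyncOp classes:

order = ['WaitLock', 'WithLock', 'Delete', 'FetchWait', 'FetchWaitPrefetch']

def key(s):
    return order.index(s)

assert sorted(['FetchWait', 'Delete', 'WaitLock'], key=key) == \
    ['WaitLock', 'Delete', 'FetchWait']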
Example #16
    def _schedule_expressions(self, clusters):
        """Create an Iteartion/Expression tree given an iterable of
        :class:`Cluster` objects."""

        # Build the Iteration/Expression tree
        processed = []
        schedule = OrderedDict()
        for i in clusters:
            # Build the Expression objects to be inserted within an Iteration tree
            expressions = [
                Expression(v, np.int32 if i.trace.is_index(k) else self.dtype)
                for k, v in i.trace.items()
            ]

            if not i.stencil.empty:
                root = None
                entries = i.stencil.entries

                # Can I reuse any of the previously scheduled Iterations ?
                index = 0
                for j0, j1 in zip(entries, list(schedule)):
                    if j0 != j1 or j0.dim in clusters.atomics[i]:
                        break
                    root = schedule[j1]
                    index += 1
                needed = entries[index:]

                # Build and insert the required Iterations
                iters = [
                    Iteration([], j.dim, j.dim.limits, offsets=j.ofs)
                    for j in needed
                ]
                body, tree = compose_nodes(iters + [expressions],
                                           retrieve=True)
                scheduling = OrderedDict(zip(needed, tree))
                if root is None:
                    processed.append(body)
                    schedule = scheduling
                else:
                    nodes = list(root.nodes) + [body]
                    mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                    transformer = Transformer(mapper)
                    processed = list(transformer.visit(processed))
                    schedule = OrderedDict(
                        list(schedule.items())[:index] +
                        list(scheduling.items()))
                    for k, v in list(schedule.items()):
                        schedule[k] = transformer.rebuilt.get(v, v)
            else:
                # No Iterations are needed
                processed.extend(expressions)

        return List(body=processed)
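The reuse check walks the new stencil entries alongside the already-scheduled Iterations and stops at the first mismatch. The prefix-matching core, in isolation (ignoring the `atomics` guard):

def shared_prefix_len(entries, scheduled):
    index = 0
    for j0, j1 in zip(entries, scheduled):
        if j0 != j1:
            break
        index += 1
    return index

# Loops over `t` and `x` can be reused; the `y` loop must be rebuilt
assert shared_prefix_len(['t', 'x', 'y'], ['t', 'x', 'z']) == 2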
Example #17
    def _make_haloupdate(self, f, hse, key, **kwargs):
        distributor = f.grid.distributor
        nb = distributor._obj_neighborhood
        comm = distributor._obj_comm
        sendrecv = self._cache_dims[f.dimensions][0]

        fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

        # Build a mapper `(dim, side, region) -> (size, ofs)` for `f`. `size` and
        # `ofs` are symbolic objects. This mapper tells what data values should be
        # sent (OWNED) or received (HALO) given dimension and side
        mapper = {}
        for d0, side, region in product(f.dimensions, (LEFT, RIGHT), (OWNED, HALO)):
            if d0 in fixed:
                continue
            sizes = []
            ofs = []
            for d1 in f.dimensions:
                if d1 in fixed:
                    ofs.append(fixed[d1])
                else:
                    meta = f._C_get_field(region if d0 is d1 else NOPAD, d1, side)
                    ofs.append(meta.offset)
                    sizes.append(meta.size)
            mapper[(d0, side, region)] = (sizes, ofs)

        body = []
        for d in f.dimensions:
            if d in fixed:
                continue

            name = ''.join('r' if i is d else 'c' for i in distributor.dimensions)
            rpeer = FieldFromPointer(name, nb)
            name = ''.join('l' if i is d else 'c' for i in distributor.dimensions)
            lpeer = FieldFromPointer(name, nb)

            if (d, LEFT) in hse.halos:
                # Sending to left, receiving from right
                lsizes, lofs = mapper[(d, LEFT, OWNED)]
                rsizes, rofs = mapper[(d, RIGHT, HALO)]
                args = [f, lsizes, lofs, rofs, rpeer, lpeer, comm]
                body.append(self._call_sendrecv(sendrecv.name, *args, **kwargs))

            if (d, RIGHT) in hse.halos:
                # Sending to right, receiving from left
                rsizes, rofs = mapper[(d, RIGHT, OWNED)]
                lsizes, lofs = mapper[(d, LEFT, HALO)]
                args = [f, rsizes, rofs, lofs, lpeer, rpeer, comm]
                body.append(self._call_sendrecv(sendrecv.name, *args, **kwargs))

        iet = List(body=body)
        parameters = [f, comm, nb] + list(fixed.values())
        return HaloUpdate(key, iet, parameters)
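The neighbor names are built positionally: the dimension being exchanged contributes an 'l' or 'r', every other dimension a 'c' (presumably "center"). For a y-exchange over (x, y, z):

dimensions = ('x', 'y', 'z')
d = 'y'
rname = ''.join('r' if i == d else 'c' for i in dimensions)
lname = ''.join('l' if i == d else 'c' for i in dimensions)
assert (rname, lname) == ('crc', 'clc')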
Example #18
    def _make_withlock(self, iet, sync_ops, pieces, root):
        # Sorting for deterministic code gen
        locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

        # The `min` is used to pick the maximum possible degree of parallelism.
        # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
        # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
        # `u`, while `lock1` protects 2 entries of the Function `v`, then there
        # will never be more than 2 threads in flight concurrently
        npthreads = min(i.size for i in locks)

        preactions = [BlankLine]
        for s in sync_ops:
            imask = [
                s.handle.indices[d]
                if d.root in s.lock.locked_dimensions else FULL
                for d in s.target.dimensions
            ]
            update = PragmaTransfer(self.lang._map_update_host_async,
                                    s.target,
                                    imask=imask,
                                    queueid=SharedData._field_id)
            preactions.append(update)
        wait = self.lang._map_wait(SharedData._field_id)
        if wait is not None:
            preactions.append(Pragma(wait))
        preactions.extend([DummyExpr(s.handle, 1) for s in sync_ops])
        preactions.append(BlankLine)

        postactions = [BlankLine]
        postactions.extend([DummyExpr(s.handle, 2) for s in sync_ops])

        # Turn `iet` into a ThreadFunction so that it can be executed
        # asynchronously by a pthread in the `npthreads` pool
        name = self.sregistry.make_name(prefix='copy_device_to_host')
        body = List(body=tuple(preactions) + iet.body + tuple(postactions))
        tctx = make_thread_ctx(name, body, root, npthreads, sync_ops,
                               self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # Schedule computation to the first available thread
        iet = tctx.activate

        # Fire up the threads
        pieces.init.append(tctx.init)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        # Keep track of created objects
        pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

        return iet
Example #19
    def _make_fetchwait(self, iet, sync_ops, *args):
        # Construct fetches
        fetches = []
        for s in sync_ops:
            fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
            imask = [(fc, s.size) if d.root is s.dim.root else FULL for d in s.dimensions]
            fetches.append(self.lang._map_to(s.function, imask))

        # Glue together the new IET pieces
        iet = List(header=fetches, body=iet)

        return iet
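Each `imask` entry is either an (offset, size) pair, for the dimension being fetched, or the FULL marker for dimensions transferred whole. A simplified sketch (string comparison stands in for the `d.root is s.dim.root` test, and FULL is a placeholder):

FULL = 'full'   # placeholder for the real FULL marker

def make_imask(dimensions, fetch_dim, fc, size):
    return [(fc, size) if d == fetch_dim else FULL for d in dimensions]

assert make_imask(['time', 'x', 'y'], 'time', 0, 2) == [(0, 2), 'full', 'full']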
Example #20
    def _make_sendrecv(self, f, hse, key='', msg=None):
        comm = f.grid.distributor._obj_comm

        bufg = FieldFromPointer(msg._C_field_bufg, msg)
        bufs = FieldFromPointer(msg._C_field_bufs, msg)

        ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]

        fromrank = Symbol(name='fromrank')
        torank = Symbol(name='torank')

        sizes = [
            FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
            for i in range(len(f._dist_dimensions))
        ]
        gather = Call('gather%s' % key, [bufg] + sizes + [f] + ofsg)

        # The `gather` is unnecessary if sending to MPI.PROC_NULL
        gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

        count = reduce(mul, sizes, 1)
        rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
        rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
        recv = Call('MPI_Irecv', [
            bufs, count,
            Macro(dtype_to_mpitype(f.dtype)), fromrank,
            Integer(13), comm, rrecv
        ])
        send = Call('MPI_Isend', [
            bufg, count,
            Macro(dtype_to_mpitype(f.dtype)), torank,
            Integer(13), comm, rsend
        ])

        iet = List(body=[recv, gather, send])
        iet = List(body=iet_insert_C_decls(iet))
        parameters = ([f] + ofsg + [fromrank, torank, comm, msg])
        return Callable('sendrecv%s' % key, iet, 'void', parameters,
                        ('static', ))
Example #21
def avoid_denormals(iet):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.
    """
    header = (
        cgen.Comment('Flush denormal numbers to zero in hardware'),
        cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
        cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'))
    iet = iet._rebuild(body=(List(header=header), ) + iet.body)
    return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
Example #22
    def _make_waitprefetch(self, iet, sync_ops, pieces, *args):
        ff = SharedData._field_flag

        waits = []
        objs = filter_ordered(pieces.objs.get(s) for s in sync_ops)
        for sdata, threads in objs:
            wait = BusyWait(
                CondNe(FieldFromComposite(ff, sdata[threads.index]), 1))
            waits.append(wait)

        iet = List(header=c.Comment("Wait for the arrival of prefetched data"),
                   body=waits + [BlankLine, iet])

        return iet
Example #23
    def instrument(self, iet, timer):
        # Look for the presence of a time loop within the IET of the Operator
        mapper = {}
        for i in FindNodes(Iteration).visit(iet):
            if i.dim.is_Time:
                # The calls to Advisor's Collection Control API are only for Operators
                # with a time loop
                mapper[i] = List(header=c.Statement('%s()' % self._api_resume),
                                 body=i,
                                 footer=c.Statement('%s()' % self._api_pause))
                return Transformer(mapper).visit(iet)

        # Return the IET intact if no time loop is found
        return iet
Example #24
    def _build_casts(self, iet):
        iet = super(Operator, self)._build_casts(iet)

        # Add YASK solution pointer for use in C-land
        soln_obj = Object(namespace['code-soln-name'], namespace['type-solution'])

        # Add YASK user and local grids pointers for use in C-land
        grid_objs = [YaskGridObject(i.name) for i in self.input if i.from_YASK]
        grid_objs.extend([YaskGridObject(i) for i in self.yk_soln.local_grids])

        # Build pointer casts
        casts = [PointerCast(soln_obj)] + [PointerCast(i) for i in grid_objs]

        return List(body=casts + [iet])
Example #25
    def _generate_mpi(self, iet, **kwargs):
        if configuration['mpi'] is False:
            return iet

        halo_spots = FindNodes(HaloSpot).visit(iet)

        # For each MPI-distributed TensorFunction, generate all necessary
        # C-level routines to perform a halo update
        callables = OrderedDict()
        for hs in halo_spots:
            for f, v in hs.fmapper.items():
                callables[f] = [update_halo(f, v.loc_indices)]
                callables[f].append(sendrecv(f, v.loc_indices))
                callables[f].append(copy(f, v.loc_indices))
                callables[f].append(copy(f, v.loc_indices, True))
        callables = flatten(callables.values())

        # Replace HaloSpots with suitable calls performing the halo update
        mapper = {}
        for hs in halo_spots:
            for f, v in hs.fmapper.items():
                stencil = [int(i) for i in hs.mask[f].values()]
                comm = f.grid.distributor._C_comm
                nb = f.grid.distributor._C_neighbours.obj
                loc_indices = list(v.loc_indices.values())
                dsizes = [d.symbolic_size for d in f.dimensions]
                parameters = [f] + stencil + [comm, nb] + loc_indices + dsizes
                call = Call('halo_exchange_%s' % f.name, parameters)
                mapper.setdefault(hs, []).append(call)

        # Sorting is for deterministic code generation. However, in practice,
        # we don't expect `cstructs` to contain more than one element because
        # there should always be one grid per Operator (though we're not really
        # enforcing it)
        cstructs = {
            f.grid.distributor._C_neighbours.cdef
            for f in flatten(i.fmapper for i in halo_spots)
        }
        self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

        self._includes.append('mpi.h')

        self._func_table.update(
            OrderedDict([(i.name, MetaCall(i, True)) for i in callables]))

        # Add in the halo update calls
        mapper = {k: List(body=v + list(k.body)) for k, v in mapper.items()}
        iet = Transformer(mapper, nested=True).visit(iet)

        return iet
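The `mapper.setdefault(hs, []).append(call)` idiom groups one halo-exchange Call per (function, halo-scheme) pair under its enclosing HaloSpot. In isolation:

mapper = {}
for hs, call in [('hs0', 'call_u'), ('hs0', 'call_v'), ('hs1', 'call_u')]:
    mapper.setdefault(hs, []).append(call)
assert mapper == {'hs0': ['call_u', 'call_v'], 'hs1': ['call_u']}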
Example #26
def test_create_elemental_functions_simple(simple_function):
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i, const int i_size, const int j, const int j_size, const int k_size)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
Example #27
    def _build_casts(self, iet):
        iet = super(Operator, self)._build_casts(iet)

        # Add YASK solution pointer for use in C-land
        soln_objs = [YaskSolnObject(cname) for _, cname in self.yk_solns]

        # Add YASK user and local grids pointers for use in C-land
        grid_objs = [YaskGridObject(i.name) for i in self.input if i.from_YASK]
        grid_objs.extend([YaskGridObject(i) for i in self._local_grids])

        # Build pointer casts
        casts = [PointerCast(i)
                 for i in soln_objs] + [PointerCast(i) for i in grid_objs]

        return List(body=casts + [iet])
Example #28
    def _make_prefetchupdate(self, iet, sync_ops, pieces, root):
        fid = SharedData._field_id

        postactions = []
        for s in sync_ops:
            # `pcond` is not None, but we won't use it here because the condition
            # is actually already encoded in `iet` itself (it stems from the
            # originating Cluster's guards)
            assert s.pcond is not None

            imask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            prefetch = List(
                header=self.lang._map_update_wait_device(s.target, imask, fid))
            postactions.append(prefetch)

        # Turn prefetch IET into a ThreadFunction
        name = self.sregistry.make_name(prefix='prefetch_host_to_device')
        body = List(body=iet.body + tuple(postactions))
        tctx = make_thread_ctx(name, body, root, None, sync_ops,
                               self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # The IET degenerates to the threads activation logic
        iet = tctx.activate

        # Fire up the threads
        pieces.init.append(tctx.init)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        # Keep track of created objects
        pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

        return iet
Example #29
    def _make_delete(self, iet, sync_ops, *args):
        # Construct deletion clauses
        deletions = []
        for s in sync_ops:
            dimensions = s.dimensions
            fc = s.fetch

            imask = [(fc, s.size) if d.root is s.dim.root else FULL
                     for d in dimensions]
            deletions.append(self.lang._map_delete(s.function, imask))

        # Glue together the new IET pieces
        iet = List(header=c.Line(), body=iet, footer=[c.Line()] + deletions)

        return iet
Example #30
    def _initialize(iet):
        comm = None

        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                comm = i
                break

        if comm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = Function('omp_get_num_devices')()
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

            body = [rank_decl, rank_init, ngpus_init, devicenum_init]

            init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                        body=body,
                        footer=(c.Comment('End of OpenMP+MPI setup'),
                                c.Line()))
        else:
            devicenum_init = LocalExpression(DummyEq(devicenum, 0))
            body = [devicenum_init]

            init = List(header=c.Comment('Begin of OpenMP setup'),
                        body=body,
                        footer=(c.Comment('End of OpenMP setup'), c.Line()))

        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet