Example #1
0
    def inject(self, field, expr, offset=0):
        """
        Build the equations that add an arbitrary expression into a field.

        Parameters
        ----------
        field : Function
            Field receiving the injected values.
        expr : expr-like
            Expression to be injected.
        offset : int, optional
            Additional offset from the boundary. Accepted for interface
            compatibility; this implementation does not read it.

        Returns
        -------
        list
            A single-element list holding the Eq that increments the
            (re-indexed) field by the weighted expression.
        """
        source = indexify(expr)
        target = indexify(field)

        point, _ = self.gridpoints.indices

        # For each grid Dimension, create the Dimension spanning the stencil
        # and record both the index substitution and the matching coefficient
        substitutions = []
        weights = []
        for axis, dim in enumerate(self.grid.dimensions):
            rdim = DefaultDimension(name="r%s" % dim.name, default_value=self.r)
            shifted = INT(rdim + self.gridpoints[point, axis])
            substitutions.append((dim, shifted))
            weights.append(self.interpolation_coeffs[point, axis, rdim])

        weighted = prod(weights) * source
        target = target.subs(substitutions)
        return [Eq(target, target + weighted.subs(substitutions))]
Example #2
0
    def _make_symbolic_sections_from_imask(cls, f, imask):
        """
        Translate an index mask into a list of ``(start, size)`` sections.

        Parameters
        ----------
        f : object
            Object whose per-Dimension data sizes are given by
            ``cls._map_data(f)``.
        imask : sequence or None
            One entry per leading Dimension. An entry may be ``FULL`` (the
            whole Dimension), a ``(start, size)`` pair, or a single index
            (interpreted as a section of size 1). ``None`` means ``FULL``
            along every Dimension.

        Returns
        -------
        list of tuple
            The ``(start, size)`` sections. When `imask` is shorter than the
            data sizes, the trailing Dimensions are folded into the last
            section.
        """
        extents = cls._map_data(f)
        if imask is None:
            imask = [FULL]*len(extents)

        def as_section(entry, extent):
            # Normalize a single mask entry into a `(start, size)` pair
            if entry is FULL:
                return (0, extent)
            try:
                first, count = entry
            except TypeError:
                # Not unpackable -> a single index, hence one point
                first, count = entry, 1
            return (first, count)

        sections = [as_section(entry, extent) for entry, extent in zip(imask, extents)]

        if len(imask) >= len(extents):
            return sections

        # Unroll (or "flatten") the remaining Dimensions not captured by `imask`
        if sections:
            first, count = sections.pop()
        else:
            first, count = 0, 1
        tail = prod(extents[len(imask):])
        # The reason we may see a Wildcard is detailed in the `linearize_transfer`
        # pass, take a look there for more info. Basically, a Wildcard here means
        # that the symbol `first` is actually a temporary whose value already
        # represents the unrolled size
        if not isinstance(first, Wildcard):
            first *= tail
        count *= tail
        sections.append((first, count))

        return sections
Example #3
0
    def interpolate(self, expr, offset=0, increment=False, self_subs=None):
        """
        Generate equations interpolating an arbitrary expression into ``self``.

        Parameters
        ----------
        expr : expr-like
            Input expression to interpolate.
        offset : int, optional
            Additional offset from the boundary.
            NOTE(review): not read by this implementation.
        increment: bool, optional
            If True, generate increments (Inc) rather than assignments (Eq).
            NOTE(review): not read by this implementation; an Eq is always
            returned -- confirm whether this is intended.
        self_subs : dict, optional
            Substitutions applied to ``self`` to build the left-hand side of
            the generated equation (e.g. time symbol substitutions). Defaults
            to no substitution.

        Returns
        -------
        list
            A single-element list with the Eq accumulating the interpolated
            expression into the (substituted) ``self``.
        """
        # Use a fresh mapping per call rather than a mutable default argument
        # shared across all calls
        if self_subs is None:
            self_subs = {}

        expr = indexify(expr)

        p, _, _ = self.interpolation_coeffs.indices
        dim_subs = []
        coeffs = []
        for i, d in enumerate(self.grid.dimensions):
            # Dimension spanning the interpolation stencil along `d`
            rd = DefaultDimension(name="r%s" % d.name, default_value=self.r)
            dim_subs.append((d, INT(rd + self.gridpoints[p, i])))
            coeffs.append(self.interpolation_coeffs[p, i, rd])
        # Apply optional time symbol substitutions to lhs of assignment
        lhs = self.subs(self_subs)
        rhs = prod(coeffs) * expr.subs(dim_subs)

        return [Eq(lhs, lhs + rhs)]
Example #4
0
    def _make_guard(self, parregion, *args):
        """
        Wrap a parallel region in a Conditional protecting its execution.

        If none of the ParallelTrees in `parregion` is rooted in a
        ``self.DeviceIteration``, the parent implementation is used. Otherwise
        the conditions collected below are and-ed together; if there are none,
        `parregion` is returned unchanged.
        """
        partrees = FindNodes(ParallelTree).visit(parregion)
        on_device = any(isinstance(tree.root, self.DeviceIteration) for tree in partrees)
        if not on_device:
            return super()._make_guard(parregion, *args)

        conditions = []

        # There must be at least one iteration or potential crash
        if not parregion.is_Affine:
            nest = retrieve_iteration_tree(parregion.root)[0]
            for iteration in nest[:parregion.ncollapsed]:
                conditions.append(iteration.symbolic_size > 0)

        # SparseFunctions may occasionally degenerate to zero-size arrays. In such
        # a case, a copy-in produces a `nil` pointer on the device. To fire up a
        # parallel loop we must ensure none of the SparseFunction pointers are `nil`
        for symbol in FindSymbols().visit(parregion):
            if not symbol.is_SparseFunction:
                continue
            nelems = prod(symbol._C_get_field(FULL, d).size for d in symbol.dimensions)
            conditions.append(nelems > 0)

        if not conditions:
            return parregion

        # Combine all collected conditions into a single guard
        return List(body=[Conditional(And(*conditions), parregion)])
Example #5
0
def calculate_nblocks(tree, blockable):
    """
    Return the product, over the blocked Dimensions among the collapsed
    Iterations of `tree`, of the number of whole blocks along each Dimension.

    The collapsed Iterations are the first ``tree[0].ncollapsed`` entries of
    `tree` (just the first one if that value is falsy). Only Dimensions found
    in `blockable` contribute; with none, the result is the empty product.
    """
    width = tree[0].ncollapsed or 1

    per_dim = []
    for iteration in tree[:width]:
        dim = iteration.dim
        if dim not in blockable:
            continue
        lower = dim.root.symbolic_min
        upper = dim.root.symbolic_max
        # Iterations left over once the range is split in `dim.step`-sized blocks
        leftover = (upper - lower + 1) % dim.step
        last = upper - leftover
        per_dim.append((last - lower + 1) / dim.step)

    return prod(per_dim)
Example #6
0
def calculate_nblocks(tree, blockable):
    """
    Compute the total number of whole blocks spanned by the collapsed
    Iterations of `tree` whose Dimension is in `blockable`.

    The first ``tree[0].ncollapsed`` Iterations are inspected (one if that
    attribute is falsy). The per-Dimension block counts are multiplied
    together; the empty product is returned when nothing is blocked.
    """
    ncollapsed = tree[0].ncollapsed or 1
    blocked = [it.dim for it in tree[:ncollapsed] if it.dim in blockable]

    def count_blocks(dim):
        # Number of whole `dim.step`-sized blocks within the root range
        start = dim.root.symbolic_min
        stop = dim.root.symbolic_max
        remainder = (stop - start + 1) % dim.step
        return ((stop - remainder) - start + 1) / dim.step

    return prod([count_blocks(dim) for dim in blocked])
Example #7
0
    def _select_candidates(self, candidates):
        """
        Pick the Iteration to parallelize and the Iterations to collapse with it.

        Each candidate is tried as root; the Iterations following it are added
        to its collapsable list until one of the stop conditions below is hit.
        Every (root, collapsable) pair gets a score and the best-scoring pair
        is returned. If ``self.ncores < self.collapse_ncores``, the first
        candidate is returned with nothing to collapse.

        Returns
        -------
        tuple
            ``(root, collapsable)`` with `collapsable` a list.
        """
        assert candidates

        if self.ncores < self.collapse_ncores:
            return candidates[0], []

        scores = {}
        for pos, root in enumerate(candidates):

            collapsable = []
            for n in range(pos + 1, len(candidates)):
                inner = candidates[n]

                # The Iteration nest [root, ..., inner] must be perfect
                if not IsPerfectIteration(depth=inner).visit(root):
                    break

                # Loops are collapsable only if none of the iteration variables appear
                # in initializer expressions. For example, the following two loops
                # cannot be collapsed
                #
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                outer = candidates[pos:n]
                if any(o.dim in inner.symbolic_min.free_symbols for o in outer):
                    break

                # Also, we do not want to collapse SIMD-vectorized Iterations
                if inner.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                deeper = candidates[n+1:]
                if deeper:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in deeper])
                        if work < self.collapse_work:
                            break
                    except TypeError:
                        # Sizes not convertible to int: cannot estimate the work
                        pass

                collapsable.append(inner)

            # Give a score to this candidate, based on the number of fully-parallel
            # Iterations and their position (i.e. outermost to innermost) in the nest
            has_noatomic = any(i.is_ParallelNoAtomic for i in collapsable)
            has_relaxed = any(i.is_ParallelRelaxed for i in collapsable)
            scores[(root, tuple(collapsable))] = (
                int(root.is_ParallelNoAtomic),
                int(has_noatomic),
                int(has_relaxed),
                -(pos + 1)  # The outermost, the better
            )

        # Retrieve the candidates with highest score
        best_root, best_collapsable = max(scores, key=scores.get)

        return best_root, list(best_collapsable)
Example #8
0
    def _make_partree(self, candidates, omp_pragma=None):
        """
        Parallelize the first candidate by attaching a suitable OpenMP pragma.

        Parameters
        ----------
        candidates : list
            Non-empty list of Iterations; the first one is the root that gets
            parallelized, the following ones may be collapsed with it.
        omp_pragma : callable, optional
            Pragma template, called with the number of collapsed loops. If
            not given, it is picked from ``self.lang``.

        Returns
        -------
        tuple
            ``(root, partree, collapsed)`` where `partree` is the rebuilt root
            and `collapsed` is ``[partree] + collapsable``.
        """
        assert candidates
        root = candidates[0]

        # Pick up an omp-pragma template
        # Caller-provided -> stick to it
        # Affine -> ... schedule(static,1) ...
        # Non-affine -> ... schedule(static) ...
        if omp_pragma is None:
            affine = all(i.is_Affine for i in candidates)
            omp_pragma = self.lang['for-static-1' if affine else 'for-static']

        # Get the collapsable Iterations
        collapsable = []
        if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
            for n in range(1, len(candidates)):
                inner = candidates[n]

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                bounds_symbols = inner.symbolic_min.free_symbols
                if any(outer.dim in bounds_symbols for outer in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if inner.is_Vectorizable:
                    break

                # Would there be enough work per parallel iteration?
                try:
                    deeper = candidates[n + 1:]
                    work = prod([int(j.dim.symbolic_size) for j in deeper])
                    if work < Ompizer.COLLAPSE_WORK:
                        break
                except TypeError:
                    # Sizes not convertible to int: cannot estimate the work
                    pass

                collapsable.append(inner)

        # Attach an OpenMP pragma-for with a collapse clause
        ncollapse = len(collapsable) + 1
        pragmas = root.pragmas + (omp_pragma(ncollapse),)
        properties = root.properties + (COLLAPSED(ncollapse),)
        partree = root._rebuild(pragmas=pragmas, properties=properties)

        return root, partree, [partree] + collapsable
Example #9
0
def calculate_nblocks(tree, blockable):
    """
    Return the product of the number of whole blocks along each blocked
    Dimension of `tree`.

    The inspected Iterations start at the first one whose Dimension is in
    `blockable`. An IndexError is raised if there is no such Iteration.
    """
    positions = [n for n, it in enumerate(tree) if it.dim in blockable]
    first = positions[0]

    # NOTE(review): when `ncollapsed` is falsy the window is `first + 1`
    # Iterations wide, so it depends on the position of the first blocked
    # Iteration -- confirm this is intended
    width = tree[first].ncollapsed or first + 1

    per_dim = []
    for iteration in tree[first:first + width]:
        dim = iteration.dim
        if dim not in blockable:
            continue
        lower = dim.root.symbolic_min
        upper = dim.root.symbolic_max
        # Iterations left over once the range is split in `dim.step`-sized blocks
        leftover = (upper - lower + 1) % dim.step
        last = upper - leftover
        per_dim.append((last - lower + 1) / dim.step)

    return prod(per_dim)
Example #10
0
    def _dist_alltoall(self):
        """
        The metadata necessary to perform an ``MPI_Alltoallv`` distributing the
        sparse data values across the MPI ranks needing them.
        """
        ssparse, rsparse = self._dist_count

        # Per-rank shape of send/recv data
        sshape = []
        rshape = []
        for s, r in zip(ssparse, rsparse):
            handle = list(self.shape)
            handle[self._sparse_position] = s
            sshape.append(tuple(handle))

            handle = list(self.shape)
            handle[self._sparse_position] = r
            rshape.append(tuple(handle))

        # Per-rank count of send/recv data
        scount = tuple(prod(i) for i in sshape)
        rcount = tuple(prod(i) for i in rshape)

        # Per-rank displacement of send/recv data (it's actually all contiguous,
        # but the Alltoallv needs this information anyway)
        sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
        rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

        # Total shape of send/recv data
        sshape = list(self.shape)
        sshape[self._sparse_position] = sum(ssparse)
        rshape = list(self.shape)
        rshape[self._sparse_position] = sum(rsparse)

        # May have to swap axes, as `MPI_Alltoallv` expects contiguous data, and
        # the sparse dimension may not be the outermost
        sshape = tuple(sshape[i] for i in self._dist_reorder_mask)
        rshape = tuple(rshape[i] for i in self._dist_reorder_mask)

        return sshape, scount, sdisp, rshape, rcount, rdisp
Example #11
0
    def _dist_alltoall(self):
        """
        The metadata necessary to perform an ``MPI_Alltoallv`` distributing the
        sparse data values across the MPI ranks needing them.
        """
        ssparse, rsparse = self._dist_count

        # Per-rank shape of send/recv data
        sshape = []
        rshape = []
        for s, r in zip(ssparse, rsparse):
            handle = list(self.shape)
            handle[self._sparse_position] = s
            sshape.append(tuple(handle))

            handle = list(self.shape)
            handle[self._sparse_position] = r
            rshape.append(tuple(handle))

        # Per-rank count of send/recv data
        scount = tuple(prod(i) for i in sshape)
        rcount = tuple(prod(i) for i in rshape)

        # Per-rank displacement of send/recv data (it's actually all contiguous,
        # but the Alltoallv needs this information anyway)
        sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
        rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

        # Total shape of send/recv data
        sshape = list(self.shape)
        sshape[self._sparse_position] = sum(ssparse)
        rshape = list(self.shape)
        rshape[self._sparse_position] = sum(rsparse)

        # May have to swap axes, as `MPI_Alltoallv` expects contiguous data, and
        # the sparse dimension may not be the outermost
        sshape = tuple(sshape[i] for i in self._dist_reorder_mask)
        rshape = tuple(rshape[i] for i in self._dist_reorder_mask)

        return sshape, scount, sdisp, rshape, rcount, rdisp
Example #12
0
def make_calculate_parblocks(trees, blockable, nthreads):
    """
    For each (tree, nthreads entry) pair, return the number of whole blocks
    in the tree divided by that entry.

    Only trees having at least one Dimension in `blockable` are considered.
    Results are ordered as ``itertools.product(trees, nthreads)``.
    """
    def count_blocks(tree):
        # Product, over the blocked Dimensions among the collapsed Iterations,
        # of the number of whole `step`-sized blocks
        per_dim = []
        for iteration in tree[:tree[0].ncollapsed]:
            dim = iteration.dim
            if dim not in blockable:
                continue
            lower = dim.root.symbolic_min
            upper = dim.root.symbolic_max
            leftover = (upper - lower + 1) % dim.step
            last = upper - leftover
            per_dim.append((last - lower + 1) / dim.step)
        return prod(per_dim)

    relevant = [t for t in trees if any(d in t.dimensions for d in blockable)]
    return [count_blocks(tree) / nt for tree, nt in product(relevant, nthreads)]
Example #13
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.

        Parameters
        ----------
        candidates : list
            Non-empty list of Iterations; the first one is the root.
        nthreads : optional
            Number of threads. Must be None if any candidate is non-affine.

        Returns
        -------
        tuple
            ``(root, partree, collapsed)`` where `collapsed` is
            ``[partree] + collapsable``.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = len(collapsable) + 1

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            # The schedule depends on the operation count of the nested bundles
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            schedule = 'dynamic' if sops >= self.dynamic_work else 'static'

            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                options = {'schedule': schedule, 'ncollapse': ncollapse}
            else:
                # pragma omp parallel for ... schedule(..., 1)
                options = {'schedule': schedule, 'parallel': True,
                           'ncollapse': ncollapse, 'nthreads': nthreads}
            body = OpenMPIteration(**options, **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = OpenMPIteration(ncollapse=ncollapse,
                                   chunk_size=chunk_size,
                                   **root.args)

            # The chunk size is computed at runtime from the iteration count
            sizes = [root.symbolic_size]
            sizes.extend(j.symbolic_size for j in collapsable)
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        return root, partree, [partree] + collapsable
Example #14
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate an Array in the high bandwidth memory.

        Registers in `storage`, for `obj` at `site`, an ``acc_malloc``-based
        initializer and the matching ``acc_free`` statement.
        """
        ctype = obj._C_typedata

        # All Dimensions but the outermost appear in the pointer type
        inner_dims = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
        decl = c.Value(ctype, "(*%s)%s" % (obj.name, inner_dims))
        cast = "(%s (*)%s)" % (ctype, inner_dims)

        nelems = prod(obj.symbolic_shape)
        alloc = "%s acc_malloc(sizeof(%s[%s]))" % (cast, ctype, nelems)

        init = c.Initializer(decl, alloc)
        free = c.Statement('acc_free(%s)' % obj.name)

        storage.update(obj, site, allocs=init, frees=free)
Example #15
0
    def _dist_subfunc_alltoall(self):
        ssparse, rsparse = self._dist_count

        # Per-rank shape of send/recv `coordinates`
        sshape = [(i, self.grid.dim) for i in ssparse]
        rshape = [(i, self.grid.dim) for i in rsparse]

        # Per-rank count of send/recv `coordinates`
        scount = [prod(i) for i in sshape]
        rcount = [prod(i) for i in rshape]

        # Per-rank displacement of send/recv `coordinates` (it's actually all
        # contiguous, but the Alltoallv needs this information anyway)
        sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
        rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

        # Total shape of send/recv `coordinates`
        sshape = list(self.coordinates.shape)
        sshape[0] = sum(ssparse)
        rshape = list(self.coordinates.shape)
        rshape[0] = sum(rsparse)

        return sshape, scount, sdisp, rshape, rcount, rdisp
Example #16
0
    def _dist_subfunc_alltoall(self):
        ssparse, rsparse = self._dist_count

        # Per-rank shape of send/recv `coordinates`
        sshape = [(i, self.grid.dim) for i in ssparse]
        rshape = [(i, self.grid.dim) for i in rsparse]

        # Per-rank count of send/recv `coordinates`
        scount = [prod(i) for i in sshape]
        rcount = [prod(i) for i in rshape]

        # Per-rank displacement of send/recv `coordinates` (it's actually all
        # contiguous, but the Alltoallv needs this information anyway)
        sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
        rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

        # Total shape of send/recv `coordinates`
        sshape = list(self.coordinates.shape)
        sshape[0] = sum(ssparse)
        rshape = list(self.coordinates.shape)
        rshape[0] = sum(rsparse)

        return sshape, scount, sdisp, rshape, rcount, rdisp
Example #17
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage, *args):
        """
        Allocate an Array in the high bandwidth memory.

        Registers in `storage`, for `obj` at `site`, its definition plus a
        ``self.lang['host-alloc']`` call, and the matching
        ``self.lang['host-free']`` call. Extra positional arguments are
        accepted but not used here.
        """
        # Total size: element size times the number of elements
        nbytes = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)

        # The allocator receives the address of the pointer, cast to `void**`
        memptr = VOID(Byref(obj._C_symbol), '**')
        alloc = self.lang['host-alloc'](memptr, obj._data_alignment, nbytes)

        free = self.lang['host-free'](obj._C_symbol)

        storage.update(obj, site, allocs=(Definition(obj), alloc), frees=free)
Example #18
0
        def callback():
            """
            Build the injection equations.

            Reads `expr`, `field` and `self` from the enclosing scope and
            returns a single-element list with the Eq incrementing the
            (re-indexed) field by the weighted expression.
            """
            source = indexify(expr)
            target = indexify(field)

            point, _ = self.obj.gridpoints.indices

            # One index substitution and one coefficient per grid Dimension
            substitutions = []
            weights = []
            for axis, dim in enumerate(self.obj.grid.dimensions):
                rdim = DefaultDimension(name="r%s" % dim.name, default_value=self.r)
                shifted = INT(rdim + self.obj.gridpoints[point, axis])
                substitutions.append((dim, shifted))
                weights.append(self.obj.interpolation_coeffs[point, axis, rdim])

            weighted = prod(weights) * source
            target = target.subs(substitutions)
            return [Eq(target, target + weighted.subs(substitutions))]
Example #19
0
        def callback():
            """
            Build the interpolation equations.

            Reads `expr`, `self` and `self_subs` from the enclosing scope and
            returns a single-element list with the Eq accumulating the
            weighted expression into the (substituted) ``self.obj``.
            """
            source = indexify(expr)

            point, _, _ = self.obj.interpolation_coeffs.indices

            # One index substitution and one coefficient per grid Dimension
            substitutions = []
            weights = []
            for axis, dim in enumerate(self.obj.grid.dimensions):
                rdim = DefaultDimension(name="r%s" % dim.name, default_value=self.r)
                shifted = INT(rdim + self.obj.gridpoints[point, axis])
                substitutions.append((dim, shifted))
                weights.append(self.obj.interpolation_coeffs[point, axis, rdim])

            # Apply optional time symbol substitutions to lhs of assignment
            target = self.obj.subs(self_subs)
            contribution = prod(weights) * source.subs(substitutions)

            return [Eq(target, target + contribution)]
Example #20
0
    def test_guard_overflow(self):
        """
        A toy test showing how to create a Guard to prevent writes to buffers
        with not enough space to fit another snapshot.
        """
        f = Function(name='f', grid=Grid(shape=(4, 4)))

        # The guard compares the available space against the full size of `f`
        guard = GuardOverflow(Scalar(name='freespace'), prod(f.symbolic_shape))

        expected = 'freespace >= (x_size + 2)*(y_size + 2)'
        assert ccode(guard) == expected
Example #21
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate an Array in the high bandwidth memory.

        Memory-mapped objects are handled by the parent implementation. For
        the others, a ``self.lang['device-alloc']`` / ``self.lang['device-free']``
        pair is registered in `storage` for `obj` at `site`.
        """
        if obj._mem_mapped:
            super()._alloc_array_on_high_bw_mem(site, obj, storage)
            return

        # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
        # on the device as it never needs to be accessed on the host
        assert obj._mem_local

        deviceid = DefFunction(self.lang['device-get'].name)

        nbytes = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
        init = self.lang['device-alloc'](nbytes, deviceid, retobj=obj)
        free = self.lang['device-free'](obj._C_name, deviceid)

        storage.update(obj, site, allocs=init, frees=free)
Example #22
0
    def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate the following objects in the high bandwidth memory:

            * The pointer array `obj`;
            * The pointee Array `obj.array`

        If the pointer array is defined over `sregistry.threadid`, that is a thread
        Dimension, then each `obj.array` slice is allocated and freed individually
        by the owner thread.
        """
        host_alloc = self.lang['host-alloc']
        host_free = self.lang['host-free']
        alignment = obj._data_alignment

        # The pointer array: one pointer per point along `obj.dim`
        decl = Definition(obj)
        outer_ptr = VOID(Byref(obj._C_symbol), '**')
        outer_size = SizeOf(Keyword('%s*' % obj._C_typedata)) * obj.dim.symbolic_size
        alloc_outer = host_alloc(outer_ptr, alignment, outer_size)
        free_outer = host_free(obj._C_symbol)

        # The pointee Array
        pointee = IndexedPointer(obj._C_symbol, obj.dim)
        inner_ptr = VOID(Byref(pointee), '**')
        inner_size = SizeOf(obj._C_typedata) * prod(obj.array.symbolic_shape)
        alloc_inner = host_alloc(inner_ptr, alignment, inner_size)
        free_inner = host_free(pointee)

        # Dump
        if obj.dim is self.sregistry.threadid:
            # The pointee is allocated/freed separately, bound to `obj.dim`
            storage.update(obj, site,
                           allocs=(decl, alloc_outer),
                           frees=free_outer,
                           pallocs=(obj.dim, alloc_inner),
                           pfrees=(obj.dim, free_inner))
        else:
            storage.update(obj, site,
                           allocs=(decl, alloc_outer, alloc_inner),
                           frees=(free_outer, free_inner))
Example #23
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.

        Parameters
        ----------
        candidates : list
            Non-empty list of Iterations; the first one is the root.
        nthreads : optional
            Number of threads. Must be None if any candidate is non-affine.

        Returns
        -------
        tuple
            ``(root, partree, collapsed)`` where `collapsed` is
            ``[partree] + collapsable``.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = len(collapsable) + 1

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            prefix = []
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                omp_pragma = self.lang['for'](ncollapse, 1)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine

            chunk_size = Symbol(name='chunk_size')
            omp_pragma = self.lang['for'](ncollapse, chunk_size)

            # The chunk size is computed at runtime from the iteration count
            sizes = [root.symbolic_size]
            sizes.extend(j.symbolic_size for j in collapsable)
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        pragmas = root.pragmas + (omp_pragma,)
        properties = root.properties + (COLLAPSED(ncollapse),)
        body = root._rebuild(pragmas=pragmas, properties=properties)
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        return root, partree, [partree] + collapsable
Example #24
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate an Array in the high bandwidth memory.

        Memory-mapped objects are handled by the parent implementation. For
        the others, an initializer built from ``self.lang['device-alloc']``
        and a statement built from ``self.lang['device-free']`` are registered
        in `storage` for `obj` at `site`.
        """
        if obj._mem_mapped:
            super()._alloc_array_on_high_bw_mem(site, obj, storage)
            return

        # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
        # on the device as it never needs to be accessed on the host
        assert obj._mem_default

        ctype = obj._C_typedata
        decl = c.Value(ctype, "*%s" % obj._C_name)
        nbytes = "sizeof(%s[%s])" % (ctype, prod(obj.symbolic_shape))

        deviceid = self.lang['device-get']
        device_alloc = self.lang['device-alloc'](nbytes, deviceid)

        # The allocator result is cast to a pointer to the data type
        init = c.Initializer(decl, "(%s*) %s" % (ctype, device_alloc))
        free = c.Statement(self.lang['device-free'](obj._C_name, deviceid))

        storage.update(obj, site, allocs=init, frees=free)
Example #25
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate an Array in the high bandwidth memory.

        Memory-mapped objects are handled by the parent implementation. For
        the others, an initializer built from ``self.lang['device-alloc']``
        and a statement built from ``self.lang['device-free']`` are registered
        in `storage` for `obj` at `site`.
        """
        if obj._mem_mapped:
            # posix_memalign + copy-to-device
            super()._alloc_array_on_high_bw_mem(site, obj, storage)
            return

        # acc_malloc -- the Array only resides on the device, ie, it never
        # needs to be accessed on the host
        assert obj._mem_default

        ctype = obj._C_typedata

        # All Dimensions but the outermost appear in the pointer type
        inner_dims = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
        decl = c.Value(ctype, "(*%s)%s" % (obj.name, inner_dims))
        cast = "(%s (*)%s)" % (ctype, inner_dims)

        nbytes = "sizeof(%s[%s])" % (ctype, prod(obj.symbolic_shape))
        alloc = "%s %s" % (cast, self.lang['device-alloc'](nbytes))

        init = c.Initializer(decl, alloc)
        free = c.Statement(self.lang['device-free'](obj.name))

        storage.update(obj, site, allocs=init, frees=free)
Example #26
0
    def _alloc_array_on_high_bw_mem(self, site, obj, storage):
        """
        Allocate an Array in the high bandwidth memory.

        Memory-mapped objects are handled by the parent implementation. For
        the others, an ``acc_malloc``-based initializer and the matching
        ``acc_free`` statement are registered in `storage` for `obj` at `site`.
        """
        if obj._mem_mapped:
            # posix_memalign + copy-to-device
            super()._alloc_array_on_high_bw_mem(site, obj, storage)
            return

        # acc_malloc -- the Array only resides on the device, ie, it never
        # needs to be accessed on the host
        assert obj._mem_default

        ctype = obj._C_typedata

        # All Dimensions but the outermost appear in the pointer type
        inner_dims = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
        decl = c.Value(ctype, "(*%s)%s" % (obj.name, inner_dims))
        cast = "(%s (*)%s)" % (ctype, inner_dims)

        nelems = prod(obj.symbolic_shape)
        alloc = "%s acc_malloc(sizeof(%s[%s]))" % (cast, ctype, nelems)

        init = c.Initializer(decl, alloc)
        free = c.Statement('acc_free(%s)' % obj.name)

        storage.update(obj, site, allocs=init, frees=free)
Example #27
0
    def _find_collapsable(self, root, candidates):
        """
        Return the Iterations in `candidates` that can be collapsed with `root`.

        The candidates following the first one are scanned in order and the
        scan stops at the first one that cannot be collapsed. Nothing is
        collapsed if ``ncores()`` is below ``self.collapse_ncores``.
        """
        collapsable = []
        if ncores() < self.collapse_ncores:
            return collapsable

        for n in range(1, len(candidates)):
            inner = candidates[n]

            # The Iteration nest [root, ..., inner] must be perfect
            if not IsPerfectIteration(depth=inner).visit(root):
                break

            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is forbidden:
            #
            # #pragma omp ... collapse(2)
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            bounds_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in bounds_symbols for outer in candidates[:n]):
                break

            # Also, we do not want to collapse vectorized Iterations
            if inner.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            deeper = candidates[n + 1:]
            if deeper:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in deeper])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    # Sizes not convertible to int: cannot estimate the work
                    pass

            collapsable.append(inner)

        return collapsable
Example #28
0
    def _find_collapsable(self, root, candidates):
        """
        Return the Iterations in `candidates` that can be collapsed with `root`.

        The candidates following the first one are scanned in order and the
        scan stops at the first one that cannot be collapsed. Nothing is
        collapsed if ``self.ncores`` is below ``self.collapse_ncores``.
        """
        collapsable = []
        if self.ncores < self.collapse_ncores:
            return collapsable

        for n in range(1, len(candidates)):
            inner = candidates[n]

            # The Iteration nest [root, ..., inner] must be perfect
            if not IsPerfectIteration(depth=inner).visit(root):
                break

            # Loops are collapsable only if none of the iteration variables appear
            # in initializer expressions. For example, the following two loops
            # cannot be collapsed
            #
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            bounds_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in bounds_symbols for outer in candidates[:n]):
                break

            # Also, we do not want to collapse SIMD-vectorized Iterations
            if inner.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            deeper = candidates[n + 1:]
            if deeper:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in deeper])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    # Sizes not convertible to int: cannot estimate the work
                    pass

            collapsable.append(inner)

        return collapsable
Example #29
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the first candidate by attaching a suitable OpenMP pragma.

        Parameters
        ----------
        candidates : list
            Non-empty list of Iterations; the first one is the root that gets
            parallelized, the following ones may be collapsed with it.
        nthreads : optional
            Number of threads. Must be None if any candidate is non-affine.

        Returns
        -------
        tuple
            ``(root, partree, collapsed)`` where `collapsed` is
            ``[partree] + collapsable``.
        """
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = []
        if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
            for n in range(1, len(candidates)):
                inner = candidates[n]

                # The OpenMP specification forbids collapsed loops to use iteration
                # variables in initializer expressions. E.g., the following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                bounds_symbols = inner.symbolic_min.free_symbols
                if any(outer.dim in bounds_symbols for outer in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if inner.is_Vectorizable:
                    break

                # Would there be enough work per parallel iteration?
                try:
                    deeper = candidates[n + 1:]
                    work = prod([int(j.dim.symbolic_size) for j in deeper])
                    if work < Ompizer.COLLAPSE_WORK:
                        break
                except TypeError:
                    # Sizes not convertible to int: cannot estimate the work
                    pass

                collapsable.append(inner)
        ncollapse = len(collapsable) + 1

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            prefix = []
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                omp_pragma = self.lang['for'](ncollapse, 1)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine

            chunk_size = Symbol(name='chunk_size')
            omp_pragma = self.lang['for'](ncollapse, chunk_size)

            # The chunk size is computed at runtime from the iteration count
            sizes = [root.symbolic_size]
            sizes.extend(j.symbolic_size for j in collapsable)
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        pragmas = root.pragmas + (omp_pragma,)
        properties = root.properties + (COLLAPSED(ncollapse),)
        body = root._rebuild(pragmas=pragmas, properties=properties)
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        return root, partree, [partree] + collapsable
Example #30
0
def linearize_accesses(iet, key, cache, sregistry):
    """
    Replace the multi-dimensional Indexeds found in `iet` with FIndexeds and
    produce the size statements, stride statements and access macros that
    the linearized accesses rely on.

    Parameters
    ----------
    iet : node
        The tree to process. It is visited, rebuilt and returned.
    key : callable
        Predicate applied to the symbols found in `iet`; only the ones for
        which it holds, and that have more than one dimension, are linearized.
    cache : mapping
        Maps a Function to an entry exposing `stmts0` (size statements),
        `stmts1` (stride statements) and `cbk` (the callback turning an
        Indexed into its linearized form). It is updated in place; Functions
        already in it are treated as seen while processing other IETs.
    sregistry : object
        Provides `make_name` for the generated symbols; also forwarded to
        `_generate_fsz` and `_generate_macro`.

    Returns
    -------
    tuple
        `(iet, headers, args)`, that is the rebuilt tree, the headers of the
        macros generated by this call, and the symbols written by the stride
        statements of those candidates that `iet` does not define itself.
    """
    # Whatever `key` selects in `iet`, provided it has at least two
    # dimensions; the Functions with more dimensions come first
    functions = sorted(
        (f for f in FindSymbols().visit(iet) if key(f) and f.ndim > 1),
        key=lambda f: len(f.dimensions),
        reverse=True
    )

    # The subset that was not encountered while processing other IETs
    unseen = [f for f in functions if f not in cache]

    # Cluster the Functions by size; a single size symbol per cluster keeps
    # the number of necessary registers to a minimum
    by_size = DefaultOrderedDict(list)
    for f in functions:
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            group = (d, f._size_halo[d], getattr(f, 'grid', None))
            by_size[group].append(f)

    # Size statements for the unseen Functions, for example:
    # `x_fsz0 = u_vec->size[1]`
    sizes = DefaultOrderedDict(dict)
    for (d, _halo, _grid), members in by_size.items():
        pending = [f for f in members if f in unseen]
        if not pending:
            continue
        fsz = _generate_fsz(pending[0], d, sregistry)
        if not fsz:
            continue
        for f in pending:
            sizes[f][d] = fsz.write
            cache[f].stmts0.append(fsz)

    # Stride statements for the unseen Functions, for example:
    # `y_stride0 = y_fsz0*z_fsz0`
    # Identical products share a single statement, tracked through `built`
    built = {}
    strides = DefaultOrderedDict(dict)
    for f, fsizes in sizes.items():
        for d in fsizes:
            start = f.dimensions.index(d)
            expr = prod(fsizes[i] for i in f.dimensions[start:])
            if expr not in built:
                name = sregistry.make_name(prefix='%s_stride' % d.name)
                symbol = Symbol(name=name, dtype=np.int64, is_const=True)
                built[expr] = DummyExpr(symbol, expr, init=True)
            stmt = built[expr]
            strides[f][d] = stmt.write
            cache[f].stmts1.append(stmt)
    # Unseen Functions for which no stride was built get an empty entry
    for f in unseen:
        if f not in strides:
            strides[f] = {}

    # Access macros, generated only for the Functions still lacking a
    # callback. For example:
    # `#define uL(t, x, y, z) u[(t)*t_stride0 + (x)*x_stride0 + (y)*y_stride0 + (z)]`
    headers = []
    findexeds = {}
    for f in functions:
        entry = cache[f]
        if entry.cbk is None:
            header, entry.cbk = _generate_macro(f, strides[f], sregistry)
            headers.append(header)
        findexeds[f] = entry.cbk

    # Map each Indexed to its "functional" counterpart. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    indexeds = FindSymbols('indexeds').visit(iet)
    subs = {}
    for indexed in indexeds:
        try:
            subs[indexed] = findexeds[indexed.function](indexed)
        except KeyError:
            # No callback for this Indexed, so it is left untouched
            pass

    # Introduce the linearized expressions
    iet = Uxreplace(subs).visit(iet)

    # All Functions that actually require linearization in `iet`: the ones
    # being indexed, then the DiscreteFunctions appearing in the Calls
    candidates = list(filter_ordered(i.function for i in indexeds))
    calls = FindNodes(Call).visit(iet)
    cfuncs = filter_ordered(flatten(i.functions for i in calls))
    candidates += [i for i in cfuncs if i.function.is_DiscreteFunction]

    # All Functions that can be linearized in `iet`
    defines = FindSymbols('defines-aliases').visit(iet)

    # Place the linearization expressions or delegate to ancestor efunc
    stmts0 = []
    stmts1 = []
    args = []
    for f in candidates:
        entry = cache[f]
        if f in defines:
            stmts0.extend(entry.stmts0)
            stmts1.extend(entry.stmts1)
        else:
            args.extend([stmt.write for stmt in entry.stmts1])

    if not stmts0:
        # Nothing to place in `iet` itself
        return iet, headers, args

    assert len(stmts1) > 0
    stmts0 = filter_ordered(stmts0) + [BlankLine]
    stmts1 = filter_ordered(stmts1) + [BlankLine]
    preamble = tuple(stmts0) + tuple(stmts1)
    body = iet.body._rebuild(body=preamble + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers, args
Example #31
0
def linearize_accesses(iet, cache, sregistry):
    """
    Replace the multi-dimensional Indexeds found in `iet` with FIndexeds and
    prepend to `iet` the size and slice statements that the generated access
    macros rely on.

    Parameters
    ----------
    iet : node
        The tree to process. It is visited, rebuilt and returned.
    cache : mapping
        Maps a Function to an entry exposing `stmts0` (size statements),
        `stmts1` (slice statements) and `cbk` (the callback turning an
        Indexed into an FIndexed). It is updated in place; sizes and slices
        are only built for the Functions not already in it.
    sregistry : object
        Provides `make_name` for the generated symbols and macros.

    Returns
    -------
    tuple
        `(iet, headers)`, that is the rebuilt tree and a list of
        `(define, expr)` pairs of C strings, one per newly built macro.
    """
    # The objects amenable to linearization are the DiscreteFunctions and
    # Arrays with at least two dimensions that are actually indexed in `iet`
    indexed_names = {i.name for i in FindSymbols('indexeds').visit(iet)}

    def is_linearizable(f):
        return ((f.is_DiscreteFunction or f.is_Array) and
                f.ndim > 1 and
                f.name in indexed_names)

    functions = sorted(
        filter(is_linearizable, FindSymbols().visit(iet)),
        key=lambda f: len(f.dimensions),
        reverse=True
    )

    # Cluster the uncached Functions by size; a single size symbol per
    # cluster keeps the number of necessary registers to a minimum
    by_size = DefaultOrderedDict(list)
    for f in functions:
        if f in cache:
            continue
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            group = (d, f._size_halo[d], getattr(f, 'grid', None))
            by_size[group].append(f)

    # One size statement per cluster, such as `x_fsz0 = u_vec->size[1]`
    sizes = DefaultOrderedDict(list)
    for (d, _halo, _grid), members in by_size.items():
        first = members[0]
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        fsz = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            size_stmt = DummyExpr(fsz, first._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            # Fall back to the symbolic shape, which is expected to be
            # needed for Arrays only
            assert first.is_Array
            size_stmt = DummyExpr(fsz, first.symbolic_shape[d], init=True)
        for f in members:
            sizes[f].append((d, fsz))
            cache[f].stmts0.append(size_stmt)

    # Slice statements, such as `y_slc0 = y_fsz0*z_fsz0`
    # Identical products share a single statement, tracked through `built`
    built = {}
    slices = DefaultOrderedDict(list)
    for f, pairs in sizes.items():
        for n, (d, _) in enumerate(pairs):
            expr = prod(tuple(size for _dim, size in pairs[n:]))
            if expr not in built:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                slc = Symbol(name=name, dtype=np.int32, is_const=True)
                built[expr] = DummyExpr(slc, expr, init=True)
            slice_stmt = built[expr]
            slices[f].append(slice_stmt.write)
            cache[f].stmts1.append(slice_stmt)
    # Functions for which no slice was built get an empty entry
    for f in functions:
        if f not in slices:
            slices[f] = []

    # Access macros. For example:
    # `define uL(t, x, y, z) u[(t)*t_slc0 + (x)*x_slc0 + (y)*y_slc0 + (z)]`
    headers = []
    findexeds = {}
    for f, szs in slices.items():
        entry = cache[f]
        if entry.cbk is None:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            # Every dimension but the innermost one is scaled by its slice
            terms = [MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)]
            offset = sum(terms) + MacroArgument(f.dimensions[-1].name)
            access = Indexed(IndexedData(f.name, None, f), offset)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(access)))

            # `pname` is bound as a default to dodge late binding
            entry.cbk = lambda i, pname=pname: FIndexed(i, pname)
        # Otherwise an access macro for `f` was perhaps already built
        # through another efunc, and its callback is simply reused
        findexeds[f] = entry.cbk

    # Rebuild each Expression around "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    rebuilt = {}
    for node in FindNodes(Expression).visit(iet):
        subs = {}
        for indexed in retrieve_indexed(node.expr):
            try:
                subs[indexed] = findexeds[indexed.function](indexed)
            except KeyError:
                # No callback for this Indexed, so it is left untouched
                pass
        rebuilt[node] = node._rebuild(expr=uxreplace(node.expr, subs))

    def collect(attr):
        # Unique statements stored under `attr` for all `functions`,
        # followed by a blank line if there is any
        stmts = filter_ordered(flatten(getattr(cache[f], attr) for f in functions))
        if stmts:
            stmts.append(BlankLine)
        return stmts

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = collect('stmts0')
    stmts1 = collect('stmts1')

    iet = Transformer(rebuilt).visit(iet)
    preamble = tuple(stmts0) + tuple(stmts1)
    body = iet.body._rebuild(body=preamble + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers