Example no. 1
0
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    node : Node
        The Iteration/Expression tree searched for foldable Iterations.
    exclude_innermost : bool, optional
        If True, leave the innermost Iteration of each candidate tree
        out of the fold. Defaults to False.
    """
    adjacents = FindAdjacent(Iteration).visit(node)

    mapper = {}
    for sequences in adjacents.values():
        for group in sequences:
            # Pre-condition: every member must be a perfect Iteration
            assert len(group) > 1
            if not all(IsPerfectIteration().visit(it) for it in group):
                continue
            # Only retain the leading run of trees sharing the same depth
            trees = [retrieve_iteration_tree(it)[0] for it in group]
            depth = len(trees[0])
            same_depth = []
            for tree in trees:
                if len(tree) != depth:
                    break
                same_depth.append(tree)
            if not same_depth:
                continue
            # Check foldability; transposing the (reversed) trees pairs up
            # Iterations at the same nesting level
            pairwise_folds = list(zip(*reversed(same_depth)))
            if not all(is_foldable(fold) for fold in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in same_depth[:-1])
            if any(e.write.is_Input for e in exprs):
                continue
            # Perform folding: the first Iteration of each pair becomes the
            # fold root; the others are absorbed into it and dropped
            for fold in pairwise_folds:
                root, remainder = fold[0], fold[1:]
                folds = [(tuple(b - a for a, b in zip(it.offsets, root.offsets)),
                          it.nodes) for it in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                for it in remainder:
                    mapper[it] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = Transformer(mapper, nested=True).visit(node)

    return processed
Example no. 2
0
    def _optimize_halo_updates(self, iet, state):
        """
        Drop unnecessary halo exchanges, or shuffle them around to improve
        computation-communication overlap.
        """
        # First, seek adjacent redundant HaloSpots, since by dropping any of
        # these we may be able to change the IET nesting, thus affecting
        # later manipulation passes
        # Example (assume halo1 is redundant and has same HaloScheme as halo0):
        # root -> halo0 -> iteration0        root -> halo0 -> iteration0
        #      |                        ==>                |
        #      -> halo1 -> iteration1                      -> iteration1
        mapper = {}
        for sequences in FindAdjacent(HaloSpot).visit(iet).values():
            for adjacents in sequences:
                # Note: at this point `adjacents` has at least two items
                anchor = adjacents[0]
                for candidate in adjacents[1:]:
                    if candidate.is_Redundant and \
                            candidate.halo_scheme == anchor.halo_scheme:
                        # Absorb `candidate` into the (possibly already
                        # rebuilt) anchor's body, and schedule it for removal
                        current = mapper.get(anchor, anchor)
                        mapper[anchor] = anchor._rebuild(
                            body=current.body + (candidate, ),
                            **anchor.args_frozen)
                        mapper[candidate] = None
                    else:
                        anchor = candidate
        processed = Transformer(mapper).visit(iet)

        # Then, drop any leftover redundant HaloSpot
        leftovers = [hs for hs in FindNodes(HaloSpot).visit(processed)
                     if hs.is_Redundant]
        mapper = {hs: hs.body[0] for hs in leftovers}
        processed = Transformer(mapper, nested=True).visit(processed)

        return processed, {}
Example no. 3
0
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree searched for foldable Iterations.
    blockinner : bool, optional
        If False, the innermost Iteration of each candidate tree is
        excluded from the fold. Defaults to True.
    """
    mapper = {}
    for k, sequence in FindAdjacent(Iteration).visit(iet).items():
        # Group based on Dimension
        groups = []
        for subsequence in sequence:
            # NOTE: groupby only merges *adjacent* Iterations sharing a
            # Dimension, which is exactly what we want here; runs of fewer
            # than two Iterations have nothing to fold and are dropped
            for _, v in groupby(subsequence, lambda i: i.dim):
                i = list(v)
                if len(i) >= 2:
                    groups.append(i)
        for i in groups:
            # Pre-condition: they all must be perfect iterations
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability; transposing the reversed trees pairs up
            # the Iterations at each nesting depth across the trees
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 0:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(
                FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding: the first Iteration of each pair becomes the
            # fold root, the remaining ones are absorbed into it (mapped to
            # None so the Transformer drops them)
            for j in pairwise_folds:
                r, remainder = j[0], j[1:]
                folds = [(tuple(y - x for x, y in zip(i.offsets, r.offsets)),
                          i.nodes) for i in remainder]
                mapper[r] = IterationFold(folds=folds, **r.args)
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
Example no. 4
0
    def _optimize_halospots(self, iet):
        """
        Optimize the HaloSpots in ``iet``.

        * Remove all ``useless`` HaloSpots;
        * Merge all ``hoistable`` HaloSpots with their root HaloSpot, thus
          removing redundant communications and anticipating communications
          that will be required by later Iterations.

        Parameters
        ----------
        iet : Node
            The Iteration/Expression tree to be transformed.

        Returns
        -------
        tuple
            The transformed tree and an empty metadata dict.
        """
        # Drop `useless` HaloSpots
        mapper = {
            hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
            for hs in FindNodes(HaloSpot).visit(iet)
        }
        iet = Transformer(mapper, nested=True).visit(iet)

        # Handle `hoistable` HaloSpots
        # First, we merge `hoistable` HaloSpots together, to anticipate communications
        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            halo_spots = FindNodes(HaloSpot).visit(tree.root)
            if not halo_spots:
                continue
            # The first HaloSpot in the tree acts as the merge root; if it
            # was already processed via another tree, skip the whole tree
            root = halo_spots[0]
            if root in mapper:
                continue
            # Collect the root's scheme plus the hoistable part of every
            # other HaloSpot in the tree
            hss = [root.halo_scheme]
            hss.extend([
                hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]
            ])
            try:
                mapper[root] = root._rebuild(halo_scheme=HaloScheme.union(hss))
            except ValueError:
                # HaloSpots have non-matching `loc_indices` and therefore can't be merged
                warning("Found hoistable HaloSpots with disjoint loc_indices, "
                        "skipping optimization")
                continue
            # The merged-away parts are dropped from their origin HaloSpots;
            # a HaloSpot left with a void scheme is replaced by its body
            for hs in halo_spots[1:]:
                halo_scheme = hs.halo_scheme.drop(hs.hoistable)
                if halo_scheme.is_void:
                    mapper[hs] = hs.body
                else:
                    mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)
        iet = Transformer(mapper, nested=True).visit(iet)

        # Then, we make sure the halo exchanges get performed *before*
        # the first distributed Dimension. Again, we do this to anticipate
        # communications, which hopefully has a pay off in performance
        #
        # <Iteration x>                    <HaloSpot(u)>, in y
        #   <HaloSpot(u)>, in y    ---->   <Iteration x>
        #   <Iteration y>                    <Iteration y>
        mapper = {}
        for i, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
            hoistable = [hs for hs in halo_spots if hs.hoistable]
            if not hoistable:
                continue
            elif len(hoistable) > 1:
                # We should never end up here, but for now we can't prove it formally
                warning(
                    "Found multiple hoistable HaloSpots, skipping optimization"
                )
                continue
            hs = hoistable.pop()
            if hs in mapper:
                continue
            if i.dim.root in hs.dimensions:
                # Split `hs`: the non-hoistable part stays in place (or is
                # elided entirely if void) ...
                halo_scheme = hs.halo_scheme.drop(hs.hoistable)
                if halo_scheme.is_void:
                    mapper[hs] = hs.body
                else:
                    mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)

                # ... while the hoistable part is re-created as a new HaloSpot
                # wrapping the Iteration `i`, i.e. above the loop
                halo_scheme = hs.halo_scheme.project(hs.hoistable)
                mapper[i] = hs._rebuild(halo_scheme=halo_scheme,
                                        body=i._rebuild())
        iet = Transformer(mapper, nested=True).visit(iet)

        # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
        # subtrees, to overlap as much computation as possible. The HaloSpot-free
        # Iteration nests must be fully affine, otherwise we wouldn't be able to
        # honour the data dependences along the halo
        #
        # <HaloSpot(u,v)>            HaloSpot(u,v)
        #   <A>             ---->      <A>
        # <B>              affine?     <B>
        #
        # Here, <B> doesn't require any halo exchange, but it might still need the
        # output of <A>; thus, if we do computation/communication overlap over <A>
        # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
        # will have to be split as well. For this, <B> must be affine.
        mapper = {}
        for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
            for g in v:
                root = None
                for i in g:
                    if i.is_HaloSpot:
                        # A HaloSpot opens a new absorption window
                        root = i
                        mapper[root] = [root.body]
                    elif root and all(j.is_Affine
                                      for j in FindNodes(Iteration).visit(i)):
                        # Affine, HaloSpot-free nest: absorb it into `root`
                        mapper[root].append(i)
                        mapper[i] = None
                    else:
                        # Non-affine nest breaks the window
                        root = None
        mapper = {
            k: k._rebuild(body=List(body=v)) if v else v
            for k, v in mapper.items()
        }
        iet = Transformer(mapper).visit(iet)

        return iet, {}
Example no. 5
0
    def _optimize_halospots(self, iet):
        """
        Optimize the HaloSpots in ``iet``.

        * Remove all USELESS HaloSpots;
        * Merge all hoistable HaloSpots with their root HaloSpot, thus
          removing redundant communications and anticipating communications
          that will be required by later Iterations.
        """
        # Drop USELESS HaloSpots
        mapper = {}
        for hs in FindNodes(HaloSpot).visit(iet):
            if hs.is_Useless:
                mapper[hs] = hs.body
        iet = Transformer(mapper, nested=True).visit(iet)

        # Handle `hoistable` HaloSpots: fold the hoistable part of every
        # non-root HaloSpot into the root, then drop it from its origin
        mapper = {}
        for halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).values():
            root, others = halo_spots[0], halo_spots[1:]
            hoisted = [hs.halo_scheme.project(hs.hoistable) for hs in others]
            mapper[root] = root._rebuild(
                halo_scheme=root.halo_scheme.union(hoisted))
            for hs in others:
                mapper[hs] = hs._rebuild(
                    halo_scheme=hs.halo_scheme.drop(hs.hoistable))
        iet = Transformer(mapper, nested=True).visit(iet)

        # At this point, some HaloSpots may have become empty (i.e., requiring
        # no communications), hence they can be removed
        #
        # <HaloSpot(u,v)>           HaloSpot(u,v)
        #   <A>                       <A>
        # <HaloSpot()>      ---->   <B>
        #   <B>
        mapper = {}
        for hs in FindNodes(HaloSpot).visit(iet):
            if hs.is_empty:
                mapper[hs] = hs.body
        iet = Transformer(mapper, nested=True).visit(iet)

        # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
        # subtrees, to overlap as much computation as possible. The HaloSpot-free
        # Iteration nests must be fully affine, otherwise we wouldn't be able to
        # honour the data dependences along the halo
        #
        # <HaloSpot(u,v)>            HaloSpot(u,v)
        #   <A>             ---->      <A>
        # <B>              affine?     <B>
        #
        # Here, <B> doesn't require any halo exchange, but it might still need the
        # output of <A>; thus, if we do computation/communication overlap over <A>
        # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
        # will have to be split as well. For this, <B> must be affine.
        mapper = {}
        for sequences in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
            for group in sequences:
                anchor = None
                for node in group:
                    if node.is_HaloSpot:
                        # A HaloSpot opens a new absorption window
                        anchor = node
                        mapper[anchor] = [anchor.body]
                    elif anchor and all(it.is_Affine for it in
                                        FindNodes(Iteration).visit(node)):
                        # Fully-affine nest: absorb it into the open HaloSpot
                        mapper[anchor].append(node)
                        mapper[node] = None
                    else:
                        # Non-affine nest closes the window
                        anchor = None
        mapper = {node: node._rebuild(body=List(body=body)) if body else body
                  for node, body in mapper.items()}
        iet = Transformer(mapper).visit(iet)

        return iet, {}