def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    node : Node
        The root of the Iteration/Expression tree to be transformed.
    exclude_innermost : bool, optional
        If True, heuristically exclude the innermost Iteration of each
        candidate tree from folding. Defaults to False.

    Returns
    -------
    Node
        A new Iteration/Expression tree in which foldable sequences of
        nested Iterations have been replaced by IterationFolds.
    """
    found = FindAdjacent(Iteration).visit(node)
    mapper = {}
    # Only the groups of adjacent Iterations matter, not the mapping keys
    for v in found.values():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding: the first Iteration of each group becomes the
            # fold root, absorbing the others (replaced with None below)
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                folds = [(tuple(y - x for x, y in zip(i.offsets, root.offsets)), i.nodes)
                         for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                # Renamed from `k` to avoid shadowing outer loop variables
                for absorbed in remainder:
                    mapper[absorbed] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = Transformer(mapper, nested=True).visit(node)

    return processed
def _optimize_halo_updates(self, iet, state):
    """
    Drop unnecessary halo exchanges, or shuffle them around to improve
    computation-communication overlap.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be transformed.
    state : dict
        The transformation state (not used by this pass).

    Returns
    -------
    (Node, dict)
        The transformed tree and an (empty) metadata dict.
    """
    # First, seek adjacent redundant HaloSpots, since by dropping any of
    # these we may be able to change the IET nesting, thus affecting
    # later manipulation passes
    # Example (assume halo1 is redundant and has same HaloScheme as halo0):
    # root -> halo0 -> iteration0      root -> halo0 -> iteration0
    #      |                      ==>       |
    #      -> halo1 -> iteration1           -> iteration1
    mapper = {}
    # Only the groups of adjacent HaloSpots matter, not the mapping keys
    for v in FindAdjacent(HaloSpot).visit(iet).values():
        for adjacents in v:
            # Note: at this point `adjacents` has at least two items
            crt = adjacents[0]
            for i in adjacents[1:]:
                if i.is_Redundant and i.halo_scheme == crt.halo_scheme:
                    # Absorb `i` into `crt`'s body; `mapper.get(crt, crt)`
                    # makes consecutive absorptions accumulate onto the
                    # already-rebuilt node rather than the original
                    mapper[crt] = crt._rebuild(body=mapper.get(crt, crt).body + (i,),
                                               **crt.args_frozen)
                    mapper[i] = None
                else:
                    # Chain broken; restart accumulation from `i`
                    crt = i
    processed = Transformer(mapper).visit(iet)

    # Then, drop any leftover redundant HaloSpot
    hss = FindNodes(HaloSpot).visit(processed)
    mapper = {i: i.body[0] for i in hss if i.is_Redundant}
    processed = Transformer(mapper, nested=True).visit(processed)

    return processed, {}
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be transformed.
    blockinner : bool, optional
        If False, heuristically exclude the innermost Iteration of each
        candidate tree from folding. Defaults to True.

    Returns
    -------
    Node
        A new Iteration/Expression tree in which foldable sequences of
        nested Iterations have been replaced by IterationFolds.
    """
    mapper = {}
    # Only the groups of adjacent Iterations matter, not the mapping keys
    for sequence in FindAdjacent(Iteration).visit(iet).values():
        # Group based on Dimension
        groups = []
        for subsequence in sequence:
            for _, v in groupby(subsequence, lambda i: i.dim):
                i = list(v)
                if len(i) >= 2:
                    groups.append(i)
        for i in groups:
            # Pre-condition: they all must be perfect iterations
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if not pairwise_folds:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding: the first Iteration of each group becomes the
            # fold root, absorbing the others (replaced with None below)
            for j in pairwise_folds:
                r, remainder = j[0], j[1:]
                folds = [(tuple(y - x for x, y in zip(i.offsets, r.offsets)), i.nodes)
                         for i in remainder]
                mapper[r] = IterationFold(folds=folds, **r.args)
                # Renamed from `k` to avoid shadowing
                for absorbed in remainder:
                    mapper[absorbed] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
def _optimize_halospots(self, iet):
    """
    Optimize the HaloSpots in ``iet``.

    * Remove all ``useless`` HaloSpots;
    * Merge all ``hoistable`` HaloSpots with their root HaloSpot, thus
      removing redundant communications and anticipating communications
      that will be required by later Iterations.

    Returns the transformed tree together with an (empty) metadata dict.
    """
    # Drop `useless` HaloSpots
    mapper = {hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
              for hs in FindNodes(HaloSpot).visit(iet)}
    iet = Transformer(mapper, nested=True).visit(iet)

    # Handle `hoistable` HaloSpots
    # First, we merge `hoistable` HaloSpots together, to anticipate communications
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        halo_spots = FindNodes(HaloSpot).visit(tree.root)
        if not halo_spots:
            continue
        # The first HaloSpot in the tree acts as the merge root; skip trees
        # whose root was already processed via an earlier iteration tree
        root = halo_spots[0]
        if root in mapper:
            continue
        hss = [root.halo_scheme]
        hss.extend([hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]])
        try:
            mapper[root] = root._rebuild(halo_scheme=HaloScheme.union(hss))
        except ValueError:
            # HaloSpots have non-matching `loc_indices` and therefore can't be merged
            warning("Found hoistable HaloSpots with disjoint loc_indices, "
                    "skipping optimization")
            continue
        # The hoistable parts were absorbed into `root`, so drop them from
        # the non-root HaloSpots (removing a HaloSpot entirely if voided)
        for hs in halo_spots[1:]:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Then, we make sure the halo exchanges get performed *before*
    # the first distributed Dimension. Again, we do this to anticipate
    # communications, which hopefully has a pay off in performance
    #
    # <Iteration x>                    <HaloSpot(u)>, in y
    #   <HaloSpot(u)>, in y    ---->     <Iteration x>
    #   <Iteration y>                      <Iteration y>
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
        hoistable = [hs for hs in halo_spots if hs.hoistable]
        if not hoistable:
            continue
        elif len(hoistable) > 1:
            # We should never end up here, but for now we can't prove it formally
            warning("Found multiple hoistable HaloSpots, skipping optimization")
            continue
        hs = hoistable.pop()
        if hs in mapper:
            continue
        if i.dim.root in hs.dimensions:
            # Split the HaloSpot: the non-hoistable part stays in place,
            # while the hoistable part is lifted above Iteration `i`
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)

            halo_scheme = hs.halo_scheme.project(hs.hoistable)
            mapper[i] = hs._rebuild(halo_scheme=halo_scheme, body=i._rebuild())
    iet = Transformer(mapper, nested=True).visit(iet)

    # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
    # subtrees, to overlap as much computation as possible. The HaloSpot-free
    # Iteration nests must be fully affine, otherwise we wouldn't be able to
    # honour the data dependences along the halo
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # Here, <B> doesn't require any halo exchange, but it might still need the
    # output of <A>; thus, if we do computation/communication overlap over <A>
    # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
    # will have to be split as well. For this, <B> must be affine.
    mapper = {}
    for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for g in v:
            root = None
            for i in g:
                if i.is_HaloSpot:
                    # A HaloSpot opens a new candidate subtree
                    root = i
                    mapper[root] = [root.body]
                elif root and all(j.is_Affine for j in FindNodes(Iteration).visit(i)):
                    # Affine, HaloSpot-free nest: absorb it into `root`
                    mapper[root].append(i)
                    mapper[i] = None
                else:
                    # Non-affine node breaks the chain
                    root = None
    mapper = {k: k._rebuild(body=List(body=v)) if v else v
              for k, v in mapper.items()}
    iet = Transformer(mapper).visit(iet)

    return iet, {}
def _optimize_halospots(self, iet):
    """
    Optimize the HaloSpots in ``iet``.

    * Remove all USELESS HaloSpots;
    * Merge all hoistable HaloSpots with their root HaloSpot, thus
      removing redundant communications and anticipating communications
      that will be required by later Iterations.
    """
    # Get rid of the USELESS HaloSpots straight away
    drop_useless = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.is_Useless:
            drop_useless[hs] = hs.body
    iet = Transformer(drop_useless, nested=True).visit(iet)

    # Fold the hoistable part of each non-root HaloSpot into the root
    # HaloSpot of its enclosing Iteration, anticipating communications
    merges = {}
    for halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).values():
        root, others = halo_spots[0], halo_spots[1:]
        hoisted = [hs.halo_scheme.project(hs.hoistable) for hs in others]
        merges[root] = root._rebuild(halo_scheme=root.halo_scheme.union(hoisted))
        for hs in others:
            merges[hs] = hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.hoistable))
    iet = Transformer(merges, nested=True).visit(iet)

    # At this point, some HaloSpots may have become empty (i.e., requiring
    # no communications), hence they can be removed
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>                        <A>
    #   <HaloSpot()>     ---->     <B>
    #     <B>
    peels = {hs: hs.body for hs in FindNodes(HaloSpot).visit(iet) if hs.is_empty}
    iet = Transformer(peels, nested=True).visit(iet)

    # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
    # subtrees, to overlap as much computation as possible. The HaloSpot-free
    # Iteration nests must be fully affine, otherwise we wouldn't be able to
    # honour the data dependences along the halo
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # Here, <B> doesn't require any halo exchange, but it might still need the
    # output of <A>; thus, if we do computation/communication overlap over <A>
    # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
    # will have to be split as well. For this, <B> must be affine.
    embeds = {}
    for adjacents in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for group in adjacents:
            anchor = None
            for node in group:
                if node.is_HaloSpot:
                    # A HaloSpot opens a new candidate subtree
                    anchor = node
                    embeds[anchor] = [anchor.body]
                elif anchor and all(it.is_Affine
                                    for it in FindNodes(Iteration).visit(node)):
                    # Fully affine, HaloSpot-free nest: absorb it
                    embeds[anchor].append(node)
                    embeds[node] = None
                else:
                    # Chain broken by a non-affine node
                    anchor = None
    embeds = {node: node._rebuild(body=List(body=body)) if body else body
              for node, body in embeds.items()}
    iet = Transformer(embeds).visit(iet)

    return iet, {}