def _fetch_scope(self, clusters):
    key = as_tuple(clusters)
    if key not in self.state.scopes:
        self.state.scopes[key] = Scope(flatten(c.exprs for c in key))
    return self.state.scopes[key]
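
# --- Illustrative sketch (not part of the codebase) ---------------------------
# `_fetch_scope` memoizes Scope construction: dependence analysis is expensive,
# so a Scope is built at most once per tuple of Clusters and then cached on
# `self.state.scopes`. Below is a minimal, self-contained sketch of the same
# pattern; `ToyScope` and `ToyAnalyzer` are hypothetical stand-ins, not Devito
# classes.

class ToyScope:
    """Stand-in for an expensive-to-build analysis object."""

    def __init__(self, exprs):
        self.exprs = exprs  # pretend heavy dependence analysis happens here


class ToyAnalyzer:

    def __init__(self):
        self._scopes = {}  # cache: tuple-of-exprs -> ToyScope

    def fetch_scope(self, *clusters):
        # Use an immutable key so it is hashable, just like `as_tuple` above
        key = tuple(e for c in clusters for e in c)
        if key not in self._scopes:
            self._scopes[key] = ToyScope(list(key))
        return self._scopes[key]


analyzer = ToyAnalyzer()
c0, c1 = ["a = b + 1"], ["c = a * 2"]
assert analyzer.fetch_scope(c0, c1) is analyzer.fetch_scope(c0, c1)  # cached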
def groupby(clusters):
    """
    Attempt grouping :class:`PartialCluster`s together to create bigger
    :class:`PartialCluster`s (i.e., containing more expressions).

    .. note::

        This function relies on advanced data dependency analysis tools
        based upon classic Lamport theory.
    """
    clusters = clusters.unfreeze()

    processed = ClusterGroup()
    for c in clusters:
        if c.guards:
            # Guarded clusters cannot be grouped together
            processed.append(c)
            continue
        fused = False
        for candidate in reversed(list(processed)):
            # Collect all relevant data dependences
            scope = Scope(exprs=candidate.exprs + c.exprs)

            # Collect anti-dependences preventing grouping
            anti = scope.d_anti.carried() - scope.d_anti.increment
            funcs = [i.function for i in anti]

            # Collect flow-dependences breaking the search
            flow = scope.d_flow - (scope.d_flow.inplace() + scope.d_flow.increment)
            flow = {i.cause for i in flow}

            if (candidate.ispace.is_compatible(c.ispace) and
                    all(is_local(i, candidate, c, clusters) for i in funcs)):
                # /c/ will be fused into /candidate/. All fusion-induced anti
                # dependences are eliminated through so-called "index bumping
                # and array contraction", which transforms array accesses into
                # scalars

                # Optimization: we also bump-and-contract the Arrays inducing
                # non-carried dependences, to avoid useless memory accesses
                funcs += [i.function for i in scope.d_flow.independent()
                          if is_local(i.function, candidate, c, clusters)]

                bump_and_contract(funcs, candidate, c)
                candidate.squash(c)
                fused = True
                break
            elif anti:
                # Data dependences prevent fusion with earlier clusters, so
                # the search must be broken up
                c.atomics.update(set(anti.cause))
                break
            elif set(flow).intersection(candidate.atomics):
                # We cannot even attempt fusing with earlier clusters, as
                # otherwise the existing flow dependences wouldn't be honored
                break
        # Fallback
        if not fused:
            processed.append(c)

    return processed
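
# --- Illustrative sketch (not part of the codebase) ---------------------------
# "Index bumping and array contraction" in a nutshell: once two loops are
# fused, an Array temporary that was only needed to carry values *between*
# the loops can be contracted into a scalar. A toy numeric example in plain
# Python (no Devito types involved):

def unfused(a):
    n = len(a)
    tmp = [0] * n            # Array temporary, O(n) storage
    for i in range(n):       # Cluster `candidate`
        tmp[i] = 2 * a[i]
    out = [0] * n
    for i in range(n):       # Cluster `c`
        out[i] = tmp[i] + 1
    return out


def fused(a):
    out = [0] * len(a)
    for i in range(len(a)):  # fused loop, i.e. `candidate.squash(c)`
        tmp = 2 * a[i]       # contracted to a scalar -- no O(n) temporary
        out[i] = tmp + 1
    return out


assert unfused([1, 2, 3]) == fused([1, 2, 3])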
def _build_dag(self, cgroups, prefix):
    """
    A DAG captures data dependences between ClusterGroups up to the iteration
    space depth dictated by ``prefix``.

    Examples
    --------
    Consider two ClusterGroups `cg0` and `cg1`, and ``prefix=[i]``.

    1) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j] ...
       Non-carried flow-dependence, so `cg1` must go after `cg0`.

    2) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j-1] ...
       Carried flow-dependence in `j`, so `cg1` must go after `cg0`.

    3) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i, j+1] ...
       Carried anti-dependence in `j`, so `cg1` must go after `cg0`.

    4) cg0 := b[i, j] = ...
       cg1 := ... = ... b[i-1, j+1] ...
       Carried flow-dependence in `i`, so `cg1` can safely go before or after
       `cg0`. Note: the `j+1` in `cg1` has no impact -- the actual dependence
       between `b[i, j]` and `b[i-1, j+1]` is along `i`.
    """
    prefix = {i.dim for i in as_tuple(prefix)}

    dag = DAG(nodes=cgroups)
    for n, cg0 in enumerate(cgroups):
        for cg1 in cgroups[n + 1:]:
            scope = Scope(exprs=cg0.exprs + cg1.exprs)

            # Handle anti-dependences
            deps = scope.d_anti - (cg0.scope.d_anti + cg1.scope.d_anti)
            if any(i.cause & prefix for i in deps):
                # Anti-dependences break the execution flow
                # i) ClusterGroups between `cg0` and `cg1` must precede `cg1`
                for cg2 in cgroups[n:cgroups.index(cg1)]:
                    dag.add_edge(cg2, cg1)
                # ii) ClusterGroups after `cg1` cannot precede `cg1`
                for cg2 in cgroups[cgroups.index(cg1) + 1:]:
                    dag.add_edge(cg1, cg2)
                break
            elif deps:
                dag.add_edge(cg0, cg1)

            # Flow-dependences along one of the `prefix` Dimensions can
            # be ignored; all others require sequentialization
            deps = scope.d_flow - (cg0.scope.d_flow + cg1.scope.d_flow)
            if any(not (i.cause and i.cause & prefix) for i in deps):
                dag.add_edge(cg0, cg1)
                continue

            # Handle increment-after-write dependences
            deps = scope.d_output - (cg0.scope.d_output + cg1.scope.d_output)
            if any(i.is_iaw for i in deps):
                dag.add_edge(cg0, cg1)
                continue

    return dag
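
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The DAG built above only *constrains* the ClusterGroup ordering; any
# topological sort of it yields a legal schedule. A minimal sketch of the same
# idea using the standard library (`graphlib`, Python >= 3.9), with a toy
# write/read model standing in for `Scope`:

from graphlib import TopologicalSorter

# Toy model: each "group" declares what it writes and what it reads
groups = {
    "cg0": {"writes": {"b"}, "reads": set()},
    "cg1": {"writes": {"c"}, "reads": {"b"}},   # flow-dependent on cg0
    "cg2": {"writes": {"d"}, "reads": set()},   # independent
}

ts = TopologicalSorter()
names = list(groups)
for n, g0 in enumerate(names):
    ts.add(g0)
    for g1 in names[n + 1:]:
        # Flow dependence: `g1` reads something `g0` writes => `g1` after `g0`
        if groups[g0]["writes"] & groups[g1]["reads"]:
            ts.add(g1, g0)  # edge g0 -> g1

print(list(ts.static_order()))  # e.g. ['cg0', 'cg2', 'cg1']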
def scope(self):
    return Scope(exprs=self.exprs)
def _hoist_halospots(iet):
    """
    Hoist HaloSpots from inner to outer Iterations where all data dependencies
    would be honored.
    """

    # Hoisting rules -- if the retval is True, then it means the input `dep` is not
    # a stopper to halo hoisting

    def rule0(dep, candidates):
        # E.g., `dep=W<f,[x]> -> R<f,[x-1]>` and `candidates=(time, x)` => False
        # E.g., `dep=W<f,[t1, x, y]> -> R<f,[t0, x-1, y+1]>`, `dep.cause={t,time}` and
        #       `candidates=(x,)` => True
        return (all(d in dep.distance_mapper for d in candidates) and
                not dep.cause & candidates)

    def rule1(dep, candidates):
        # An increment isn't a stopper to hoisting
        return dep.write.is_increment

    hoist_rules = [rule0, rule1]

    # Precompute scopes to save time
    scopes = {i: Scope([e.expr for e in v]) for i, v in MapNodes().visit(iet).items()}

    # Analysis
    hsmapper = {}
    imapper = defaultdict(list)
    for iters, halo_spots in MapNodes(Iteration, HaloSpot, 'groupby').visit(iet).items():
        for hs in halo_spots:
            hsmapper[hs] = hs.halo_scheme

            for f in hs.fmapper:
                for n, i in enumerate(iters):
                    candidates = set().union(*[i.dim._defines for i in iters[n:]])

                    test = True
                    for dep in scopes[i].d_flow.project(f):
                        if any(rule(dep, candidates) for rule in hoist_rules):
                            continue
                        test = False
                        break
                    if test:
                        hsmapper[hs] = hsmapper[hs].drop(f)
                        imapper[i].append(hs.halo_scheme.project(f))
                        break

    # Post-process analysis
    mapper = {i: HaloSpot(HaloScheme.union(hss), i._rebuild())
              for i, hss in imapper.items()}
    mapper.update({i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
                   for i, hs in hsmapper.items()})

    # Transform the IET hoisting/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    # Clean up: de-nest HaloSpots if necessary
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.body.is_HaloSpot:
            halo_scheme = HaloScheme.union([hs.halo_scheme, hs.body.halo_scheme])
            mapper[hs] = hs._rebuild(halo_scheme=halo_scheme, body=hs.body.body)
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
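
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The hoisting analysis above is a classic "rule chain": a dependence blocks
# hoisting unless at least one rule vouches for it, and hoisting is legal only
# if *every* dependence is vouched for. The inner continue/break loop computes
# exactly an all-of-any. The same logic, distilled over hypothetical
# dict-shaped dependences:

def is_hoistable(deps, rules):
    # Legal iff every dependence passes at least one rule
    return all(any(rule(dep) for rule in rules) for dep in deps)


# Toy dependences and rules (field names are hypothetical, for illustration)
deps = [{"is_increment": True},
        {"is_increment": False, "carried": False}]
rules = [lambda d: d.get("is_increment", False),   # cf. rule1 above
         lambda d: not d.get("carried", True)]     # cf. rule0 above

assert is_hoistable(deps, rules)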
def callback(self, clusters, prefix, backlog=None, known_break=None):
    if not prefix:
        return clusters

    known_break = known_break or set()
    backlog = backlog or []

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    # Handle the nastiest case -- ambiguity due to the presence of both a
    # flow- and an anti-dependence.
    #
    # Note: in most cases, `scope.d_anti.cause == {}` -- either because
    # `scope.d_anti == {}` or because the few anti dependences are not carried
    # in any Dimension. We exploit this observation so that we only compute
    # `d_flow`, which instead may be expensive, when strictly necessary
    maybe_break = scope.d_anti.cause & candidates
    if len(clusters) > 1 and maybe_break:
        require_break = scope.d_flow.cause & maybe_break
        if require_break:
            backlog = [clusters[-1]] + backlog
            # Try with increasingly smaller ClusterGroups until the ambiguity is gone
            return self.callback(clusters[:-1], prefix, backlog, require_break)

    # Schedule Clusters over different IterationSpaces if this increases parallelism
    for i in range(1, len(clusters)):
        if self._break_for_parallelism(scope, candidates, i):
            return self.callback(clusters[:i], prefix, clusters[i:] + backlog,
                                 candidates | known_break)

    # Compute iteration direction
    idir = {d: Backward for d in candidates if d.root in scope.d_anti.cause}
    if maybe_break:
        idir.update({d: Forward for d in candidates if d.root in scope.d_flow.cause})
    idir.update({d: Forward for d in candidates if d not in idir})

    # Enforce iteration direction on each Cluster
    processed = []
    for c in clusters:
        ispace = IterationSpace(c.ispace.intervals, c.ispace.sub_iterators,
                                {**c.ispace.directions, **idir})
        processed.append(c.rebuild(ispace=ispace))

    if not backlog:
        return processed

    # Handle the backlog -- the Clusters characterized by flow- and anti-dependences
    # along one or more Dimensions
    idir = {d: Any for d in known_break}
    for i, c in enumerate(list(backlog)):
        ispace = IterationSpace(c.ispace.intervals.lift(known_break),
                                c.ispace.sub_iterators,
                                {**c.ispace.directions, **idir})
        dspace = c.dspace.lift(known_break)
        backlog[i] = c.rebuild(ispace=ispace, dspace=dspace)

    return processed + self.callback(backlog, prefix)
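
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The recursion above peels Clusters off the *tail* into a `backlog` until the
# flow/anti ambiguity disappears, then schedules the backlog separately. The
# control flow, reduced to a toy over lists of ints, where "mixed signs" plays
# the (hypothetical) role of coupled flow- and anti-dependences:

def schedule(clusters, backlog=None):
    backlog = backlog or []
    # Toy ambiguity test standing in for `scope.d_flow.cause & maybe_break`
    ambiguous = len(clusters) > 1 and len({c > 0 for c in clusters}) > 1
    if ambiguous:
        # Retry on a smaller group; the peeled "Cluster" joins the backlog
        return schedule(clusters[:-1], [clusters[-1]] + backlog)
    processed = [("scheduled", c) for c in clusters]
    if not backlog:
        return processed
    return processed + schedule(backlog)


print(schedule([1, 2, -3, 4]))
# [('scheduled', 1), ('scheduled', 2), ('scheduled', -3), ('scheduled', 4)]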
def _merge_halospots(iet):
    """
    Merge HaloSpots on the same Iteration tree level where all data dependencies
    would be honored.
    """

    # Merge rules -- if the retval is True, then it means the input `dep` is not
    # a stopper to halo merging

    def rule0(dep, hs, loc_indices):
        # E.g., `dep=W<f,[t1, x]> -> R<f,[t0, x-1]>` => True
        return not any(d in hs.dimensions or dep.distance_mapper[d] is S.Infinity
                       for d in dep.cause)

    def rule1(dep, hs, loc_indices):
        # TODO: this is apparently never hit, but we feel uncomfortable
        # removing it
        return (dep.is_regular and
                all(not any(dep.read.touched_halo(d.root)) for d in dep.cause))

    def rule2(dep, hs, loc_indices):
        # E.g., `dep=W<f,[t1, x+1]> -> R<f,[t1, xl+1]>` and `loc_indices={t: t0}` => True
        return any(dep.distance_mapper[d] == 0 and dep.source[d] is not v
                   for d, v in loc_indices.items())

    merge_rules = [rule0, rule1, rule2]

    # Analysis
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot, 'immediate').visit(iet).items():
        if i is None or len(halo_spots) <= 1:
            continue

        scope = Scope([e.expr for e in FindNodes(Expression).visit(i)])

        hs0 = halo_spots[0]
        mapper[hs0] = hs0.halo_scheme

        for hs in halo_spots[1:]:
            mapper[hs] = hs.halo_scheme

            for f, (loc_indices, _) in hs.fmapper.items():
                test = True
                for dep in scope.d_flow.project(f):
                    if any(rule(dep, hs, loc_indices) for rule in merge_rules):
                        continue
                    test = False
                    break
                if test:
                    try:
                        mapper[hs0] = HaloScheme.union([mapper[hs0],
                                                        hs.halo_scheme.project(f)])
                        mapper[hs] = mapper[hs].drop(f)
                    except ValueError:
                        # E.g., `hs.loc_indices=frozendict({t: t1})` and
                        # `hs0.loc_indices=frozendict({t: t0})`
                        pass

    # Post-process analysis
    mapper = {i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
              for i, hs in mapper.items()}

    # Transform the IET merging/dropping HaloSpots according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
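
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The try/except above handles the case where two HaloSchemes disagree on
# their loc_indices (e.g. `{t: t0}` vs `{t: t1}`) and therefore cannot be
# merged. A distilled version of such a conflict-aware union, over plain
# dicts rather than HaloSchemes:

def union(m0, m1):
    merged = dict(m0)
    for k, v in m1.items():
        if k in merged and merged[k] != v:
            raise ValueError("Conflicting values for `%s`: %s vs %s"
                             % (k, merged[k], v))
        merged[k] = v
    return merged


assert union({"t": "t0", "x": "x0"}, {"y": "y0"}) == \
    {"t": "t0", "x": "x0", "y": "y0"}
try:
    union({"t": "t0"}, {"t": "t1"})  # irreconcilable loc_indices
except ValueError:
    pass  # the merge is simply skipped, as in the code above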
def groupby(clusters):
    """
    Group PartialClusters together to create "fatter" PartialClusters
    (i.e., containing more expressions).

    Notes
    -----
    This function relies on advanced data dependency analysis tools based upon
    classic Lamport theory.
    """
    clusters = clusters.unfreeze()

    processed = ClusterGroup()
    for c in clusters:
        fused = False
        for candidate in reversed(list(processed)):
            # Guarded clusters cannot be grouped together
            if c.guards:
                break

            # Collect all relevant data dependences
            scope = Scope(exprs=candidate.exprs + c.exprs)

            # Collect anti-dependences preventing grouping
            anti = scope.d_anti.carried() - scope.d_anti.increment
            funcs = set(anti.functions)

            # Collect flow-dependences breaking the search
            flow = scope.d_flow - (scope.d_flow.inplace() + scope.d_flow.increment)

            # Can we group `c` with `candidate`?
            test0 = not candidate.guards  # No intervening guards
            test1 = candidate.ispace.is_compatible(c.ispace)  # Compatible ispaces
            test2 = all(is_local(i, candidate, c, clusters) for i in funcs)  # No antideps

            if test0 and test1 and test2:
                # Yes, `c` can be grouped with `candidate`. All anti-dependences
                # (if any) can be eliminated through "index bumping and array
                # contraction", which turns Array temporaries into Scalar
                # temporaries

                # Optimization: we also bump-and-contract the Arrays inducing
                # non-carried dependences, to minimize the working set
                funcs.update({i.function for i in scope.d_flow.independent()
                              if is_local(i.function, candidate, c, clusters)})

                bump_and_contract(funcs, candidate, c)
                candidate.squash(c)
                fused = True
                break
            elif anti:
                # Data dependences prevent fusion with earlier Clusters, so
                # the search must be broken up
                c.atomics.update(anti.cause)
                break
            elif flow.cause & candidate.atomics:
                # We cannot even attempt fusing with earlier Clusters, as
                # otherwise the carried flow dependences wouldn't be honored
                break
            elif set(candidate.guards) & set(c.dimensions):
                # Like above, we can't attempt fusion with earlier Clusters.
                # This time because there are intervening conditionals along
                # one or more of the shared iteration dimensions
                break
        # Fallback
        if not fused:
            processed.append(c)

    return processed
def classify(exprs, ispace):
    """
    Produce the mapper ``Function -> HaloSchemeEntry``, which describes the
    necessary halo exchanges in the given Scope.
    """
    scope = Scope(exprs)

    mapper = {}
    for f, r in scope.reads.items():
        if not f.is_DiscreteFunction:
            continue
        elif not isinstance(f.grid, Grid):
            # TODO: improve me
            continue

        # For each data access, determine if (and what type of) a halo exchange
        # is required
        halo_labels = defaultdict(set)
        for i in r:
            v = {}
            for d in i.findices:
                if f.grid.is_distributed(d):
                    if i.affine(d):
                        thl, thr = i.touched_halo(d)
                        # Note: if the left-HALO is touched (i.e., `thl = True`),
                        # then the *right-HALO* is to be sent over in a halo
                        # exchange
                        v[(d, LEFT)] = (thr and STENCIL) or IDENTITY
                        v[(d, RIGHT)] = (thl and STENCIL) or IDENTITY
                    else:
                        v[(d, LEFT)] = STENCIL
                        v[(d, RIGHT)] = STENCIL
                else:
                    v[(d, i[d])] = NONE

            # Does `i` actually require a halo exchange?
            if not any(hl is STENCIL for hl in v.values()):
                continue

            # Derive diagonal halo exchanges from the previous analysis
            combs = list(product([LEFT, CENTER, RIGHT], repeat=len(f._dist_dimensions)))
            combs.remove((CENTER,)*len(f._dist_dimensions))
            for c in combs:
                key = (f._dist_dimensions, c)
                if all(v.get((d, s)) is STENCIL or s is CENTER for d, s in zip(*key)):
                    v[key] = STENCIL

            # Finally update the `halo_labels`
            for j, hl in v.items():
                halo_labels[j].add(hl)

        if not halo_labels:
            continue

        # Distinguish between Dimensions requiring a halo exchange and those
        # which don't
        up_loc_indices, halos = defaultdict(list), []
        for (d, s), hl in halo_labels.items():
            try:
                hl.remove(IDENTITY)
            except KeyError:
                pass
            if not hl:
                continue
            elif len(hl) > 1:
                raise HaloSchemeException("Inconsistency found while building a halo "
                                          "scheme for `%s` along Dimension `%s`" % (f, d))
            elif hl.pop() is STENCIL:
                halos.append(Halo(d, s))
            else:
                up_loc_indices[d].append(s)

        # Process the loc_indices. Consider:
        # 1) u[t+1, x] = f(u[t, x])   => shift == 1
        # 2) u[t-1, x] = f(u[t, x])   => shift == 1
        # 3) u[t+1, x] = f(u[t+1, x]) => shift == 0
        # In the first and second cases, the x-halo should be inserted at `t`,
        # while in the last case it should be inserted at `t+1`.
        loc_indices = {}
        for d, aindices in up_loc_indices.items():
            try:
                func = Max if ispace.is_forward(d.root) else Min
            except KeyError:
                # Max or Min is the same since `d` isn't an `ispace` Dimension
                func = Max
            candidates = [i for i in aindices if not is_integer(i)]
            candidates = {(i.origin if d.is_Stepping else i) - d: i
                          for i in candidates}
            try:
                loc_indices[d] = candidates[func(*candidates.keys())]
            except KeyError:
                # E.g., `aindices = [0, 1, d+1]` -- it doesn't really matter
                # what we put here, so we place 0 as it's the old behaviour
                loc_indices[d] = 0

        mapper[f] = HaloSchemeEntry(frozendict(loc_indices), frozenset(halos))

    return mapper
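
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The diagonal-neighbour derivation above enumerates all LEFT/CENTER/RIGHT
# combinations over the distributed Dimensions and keeps those whose
# non-CENTER components all require a halo exchange. A self-contained toy
# version of that enumeration (the `needed` mapping is a hypothetical stand-in
# for the `v` dict built above):

from itertools import product

LEFT, CENTER, RIGHT = "left", "center", "right"


def diagonals(dims, needed):
    # `needed`: {(dim, side): True if a halo exchange is required}
    combs = list(product([LEFT, CENTER, RIGHT], repeat=len(dims)))
    combs.remove((CENTER,) * len(dims))  # the owned region needs no exchange
    return [c for c in combs
            if all(s is CENTER or needed.get((d, s), False)
                   for d, s in zip(dims, c))]


# A stencil touching the halo in `x` only: no diagonal exchanges survive in `y`
needed = {("x", LEFT): True, ("x", RIGHT): True}
print(diagonals(["x", "y"], needed))  # [('left', 'center'), ('right', 'center')]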
def callback(self, clusters, prefix, backlog=None, known_break=None):
    if not prefix:
        return clusters

    known_break = known_break or set()
    backlog = backlog or []

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    # The nastiest case:
    # eq0 := u[t+1, x] = ... u[t, x]
    # eq1 := v[t+1, x] = ... v[t, x] ... u[t, x] ... u[t+1, x] ... u[t+2, x]
    # Here, `eq0` marches forward along `t`, while `eq1` has both a flow and an
    # anti dependence with `eq0`, which ultimately will require `eq1` to go in
    # a separate t-loop
    require_break = (scope.d_flow.cause & scope.d_anti.cause) & candidates
    if require_break and len(clusters) > 1:
        backlog = [clusters[-1]] + backlog
        # Try with increasingly smaller Cluster groups until the ambiguity is resolved
        return self.callback(clusters[:-1], prefix, backlog, require_break)

    # If the flow- or anti-dependences are not coupled, one or more Clusters
    # might be scheduled separately, to increase parallelism (this is basically
    # what low-level compilers call "loop fission")
    for n, _ in enumerate(clusters):
        d_cross = scope.d_from_access(scope.a_query(n, 'R')).cross()
        if any(d.is_storage_volatile(candidates) for d in d_cross):
            break
        elif d_cross.cause & candidates:
            if n > 0:
                return self.callback(clusters[:n], prefix, clusters[n:] + backlog,
                                     (d_cross.cause & candidates) | known_break)
            break

    # Compute iteration direction
    direction = {d: Backward for d in candidates if d.root in scope.d_anti.cause}
    direction.update({d: Forward for d in candidates if d.root in scope.d_flow.cause})
    direction.update({d: Forward for d in candidates if d not in direction})

    # Enforce iteration direction on each Cluster
    processed = []
    for c in clusters:
        ispace = IterationSpace(c.ispace.intervals, c.ispace.sub_iterators,
                                {**c.ispace.directions, **direction})
        processed.append(Cluster(c.exprs, ispace, c.dspace))

    if not backlog:
        return processed

    # Handle the backlog -- the Clusters characterized by flow- and anti-dependences
    # along one or more Dimensions
    direction = {d: Any for d in known_break}
    for i, c in enumerate(list(backlog)):
        ispace = IterationSpace(c.ispace.intervals.lift(known_break),
                                c.ispace.sub_iterators,
                                {**c.ispace.directions, **direction})
        backlog[i] = Cluster(c.exprs, ispace, c.dspace)

    return processed + self.callback(backlog, prefix)
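
# --- Illustrative sketch (not part of the codebase) ---------------------------
# The `d_cross` loop above implements what the comment calls "loop fission":
# when a cross-Cluster dependence along `candidates` is found, the Clusters
# are split into separate groups, each of which may then be parallelized on
# its own. The classic source-level picture, in plain Python:

def before_fission(a, b):
    n = len(a)
    for i in range(1, n):
        a[i] = a[i - 1] + 1   # carried dependence: must stay sequential
        b[i] = 2 * a[i]       # dragged into the sequential loop

def after_fission(a, b):
    n = len(a)
    for i in range(1, n):     # sequential loop
        a[i] = a[i - 1] + 1
    for i in range(1, n):     # now embarrassingly parallel
        b[i] = 2 * a[i]

a0, b0 = [0, 0, 0], [0, 0, 0]
a1, b1 = [0, 0, 0], [0, 0, 0]
before_fission(a0, b0)
after_fission(a1, b1)
assert (a0, b0) == (a1, b1)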
def _fetch_scope(self, clusters):
    exprs = flatten(c.exprs for c in as_tuple(clusters))
    key = tuple(exprs)
    if key not in self.state.scopes:
        self.state.scopes[key] = Scope(exprs)
    return self.state.scopes[key]