def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search aliasing expressions and capture them into vector temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    graph = FlowGraph(cluster.exprs)
    time_invariants = {v.rhs: graph.time_invariant(v) for v in graph.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in graph.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(v.rhs))
        cost = estimate_cost(v, True)*naliases
        test0 = lambda: cost >= self.MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= self.MIN_COST_ALIAS_INV and time_invariants[v.rhs]
        if test0() or test1():
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
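# The pass above captures time-invariant sub-expressions into space-varying
# temporaries so they are computed once rather than at every time step. Below is
# a minimal, self-contained NumPy sketch of the loop-level effect of example (1)
# in the docstring -- hypothetical shapes and names, not Devito's generated code.

import numpy as np

nt, nx, ny, nz = 4, 8, 8, 8
a = np.random.rand(nx, ny, nz)
b = np.random.rand(nx, ny, nz)
c = np.random.rand(nt, nx, ny, nz)

# Before: the time-invariant `a + b` is recomputed at every time iteration
temp_before = np.empty((nt, nx, ny, nz))
for t in range(nt):
    temp_before[t] = (a + b) * c[t]

# After: `a + b` is captured once into the space-varying temporary `ti`
ti = a + b
temp_after = np.empty((nt, nx, ny, nz))
for t in range(nt):
    temp_after[t] = ti * c[t]

assert np.allclose(temp_before, temp_after)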
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them
    to the later stages of the optimisation pipeline by introducing
    new temporaries of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        intervals, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            temporary = Indexed(function.indexed, *tuple(access))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
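# Candidate selection above multiplies the estimated operation count of an
# expression by the number of expressions aliasing it, and only captures a
# temporary when that product exceeds a threshold. A rough, self-contained
# illustration of the same idea, using SymPy's `count_ops` as a stand-in for
# `estimate_cost` and a hypothetical threshold value:

from sympy import count_ops, symbols

a0, b0, a1, b1 = symbols('a0 b0 a1 b1')

# Two expressions sharing the same (shifted) product
exprs = [2.0*a0*b0, 3.0*a1*b1]

naliases = len(exprs)                 # how many expressions share the alias
cost = count_ops(a0*b0) * naliases    # stand-in for estimate_cost(...) * naliases

MIN_COST_ALIAS = 10                   # hypothetical threshold
if cost >= MIN_COST_ALIAS and naliases > 1:
    print("worth capturing into a temporary")
else:
    # A single multiplication is too cheap: the extra memory traffic of a
    # temporary array would not pay off
    print("too cheap to capture")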
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them
    to the later stages of the optimisation pipeline by introducing
    new temporaries of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Template for captured redundancies
    shape = tuple(i.symbolic_extent for i in indices)
    make = lambda i: Array(name=template(i), shape=shape, dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for c, (origin, alias) in enumerate(aliases.items()):
        if all(i not in candidates for i in alias.aliased):
            continue
        function = make(c)

        # Build new Cluster
        expression = Eq(Indexed(function, *indices), origin)
        intervals, sub_iterators, directions = cluster.ispace.args
        # Adjust intervals
        intervals = intervals.subtract(alias.anti_stencil.boxify().negate())
        if all(time_invariants[i] for i in alias.aliased):
            intervals = intervals.drop([i for i in intervals.dimensions if i.is_Time])
        ispace = IterationSpace(intervals, sub_iterators, directions)
        alias_clusters.append(Cluster([expression], ispace))

        # Update substitution rules
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items() if i in indices]
            temporary = Indexed(function, *tuple(coordinates))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
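# The rewrite performed by the substitution rules corresponds to example (2) in
# the docstrings: expressions that are translates of one another along a space
# dimension are replaced by shifted reads of a single temporary. A minimal,
# self-contained SymPy sketch of that rewrite -- not Devito's actual
# `collect`/alias machinery; `a`, `b`, `ti` are hypothetical names.

from sympy import Eq, IndexedBase, symbols

x = symbols('x', integer=True)
a, b, ti = IndexedBase('a'), IndexedBase('b'), IndexedBase('ti')

# Two expressions that are translates of one another along x
e1 = 2.0*a[x]*b[x]
e2 = 3.0*a[x + 1]*b[x + 1]

# The shared alias, evaluated at a reference point, becomes a temporary ...
capture = Eq(ti[x], a[x]*b[x])

# ... and each aliasing expression is rewritten as a shifted read of it
r1 = e1.subs(a[x]*b[x], ti[x])              # 2.0*ti[x]
r2 = e2.subs(a[x + 1]*b[x + 1], ti[x + 1])  # 3.0*ti[x + 1]

print(capture, r1, r2)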
def lower_schedule(schedule, meta, sregistry, ftemps):
    """
    Turn a Schedule into a sequence of Clusters.
    """
    if ftemps:
        make = TempFunction
    else:
        # Typical case -- the user does *not* "see" the CIRE-created temporaries
        make = Array

    clusters = []
    subs = {}
    for pivot, writeto, ispace, aliaseds, indicess, _ in schedule:
        name = sregistry.make_name()
        dtype = meta.dtype

        if writeto:
            # The Dimensions defining the shape of Array
            # Note: with SubDimensions, we may have the following situation:
            #
            # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
            #   r[zi] = ...
            #
            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
            # as much room as in `zi`'s parent to avoid going OOB
            # Aside from producing ugly generated code, the reason we do not simply
            # shift the indices is that it would prevent future passes from
            # transforming the loop bounds (e.g., MPI's comp/comm overlap does that)
            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

            # The halo must be set according to the size of writeto space
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

            # The indices used to write into the Array
            indices = []
            for i in writeto:
                try:
                    # E.g., `xs`
                    sub_iterators = writeto.sub_iterators[i.dim]
                    assert len(sub_iterators) == 1
                    indices.append(sub_iterators[0])
                except KeyError:
                    # E.g., `z` -- a non-shifted Dimension
                    indices.append(i.dim - i.lower)

            obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype)
            expression = Eq(obj[indices], uxreplace(pivot, subs))

            callback = lambda idx: obj[idx]
        else:
            # Degenerate case: scalar expression
            assert writeto.size == 0

            obj = Symbol(name=name, dtype=dtype)
            expression = Eq(obj, uxreplace(pivot, subs))

            callback = lambda idx: obj

        # Create the substitution rules for the aliasing expressions
        subs.update({aliased: callback(indices)
                     for aliased, indices in zip(aliaseds, indicess)})

        # Construct the alias DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(meta.dintervals, parts)

        # Drop or weaken parallelism if necessary
        properties = dict(meta.properties)
        for d, v in meta.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})
            elif d not in writeto.dimensions:
                properties[d] = normalize_properties(v, {PARALLEL_IF_PVT}) - {ROUNDABLE}

        # Finally, build the alias Cluster
        clusters.append(Cluster(expression, ispace, dspace, meta.guards, properties))

    return clusters, subs
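# In `lower_schedule`, each writeto Interval contributes a halo of
# `(abs(lower), abs(upper))` and a zero-based write index `i.dim - i.lower`.
# A small numeric sketch of that arithmetic with hypothetical numbers, showing
# how a loop span such as `[z - 2, z + 1]` maps onto array slots:

# Hypothetical writeto interval along `z`: offsets range over [-2, +1]
lower, upper = -2, 1

# Halo reserved on each side of the temporary
halo = (abs(lower), abs(upper))          # -> (2, 1)

# Array extent needed to cover the whole span, for a core size of `npoints`
npoints = 10
extent = npoints + halo[0] + halo[1]     # -> 13

# Write index: `z - lower` shifts the loop variable to a zero-based slot
for z in range(lower, npoints + upper):
    slot = z - lower                     # z = -2 -> slot 0, z = 10 -> slot 12
    assert 0 <= slot < extent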
def callback(self, clusters, prefix, cache=None):
    # Locate all Function accesses within the provided `clusters`
    accessmap = AccessMapper(clusters)

    # Create the buffers
    buffers = BufferBatch()
    for f, accessv in accessmap.items():
        # Has a buffer already been produced for `f`?
        if f in cache:
            continue

        # Is `f` really a buffering candidate?
        dims = self.callback0(f)
        if dims is None:
            continue
        if not all(any([i.dim in d._defines for i in prefix]) for d in dims):
            continue

        b = cache[f] = buffers.make(f, dims, accessv, self.options, self.sregistry)

    if not buffers:
        return clusters

    try:
        pd = prefix[-2].dim
    except IndexError:
        pd = None

    # Create Eqs to initialize buffers. Note: a buffer needs to be initialized
    # only if the buffered Function is read in at least one place or, in the case
    # of non-uniform SubDimensions, to avoid copying uninitialized values back
    # into the buffered Function
    noinit = self.options['buf-noinit']
    processed = []
    for b in buffers:
        if b.size == 1 and noinit:
            # Special case: avoid initialization if not strictly necessary
            # See docstring for more info about what this implies
            continue

        if b.is_read or not b.has_uniform_subdims:
            dims = b.function.dimensions
            lhs = b.indexed[[b.initmap.get(d, Map(d, d)).b for d in dims]]
            rhs = b.function[[b.initmap.get(d, Map(d, d)).f for d in dims]]

            expr = lower_exprs(Eq(lhs, rhs))
            ispace = b.writeto
            guards = {pd: GuardBound(d.root.symbolic_min, d.root.symbolic_max)
                      for d in b.contraction_mapper}
            properties = {d: {PARALLEL} for d in ispace.itdimensions}

            processed.append(
                Cluster(expr, ispace, guards=guards, properties=properties)
            )

    # Substitution rules to replace buffered Functions with buffers
    subs = {}
    for b in buffers:
        for a in b.accessv.accesses:
            subs[a] = b.indexed[[b.index_mapper_flat.get(i, i) for i in a.indices]]

    for c in clusters:
        # If a buffer is read but never written, then we need to add
        # an Eq to step through the next slot
        # E.g., `ub[0, x] = u[time+2, x]`
        for b in buffers:
            if not b.is_readonly:
                continue
            try:
                c.exprs.index(b.firstread)
            except ValueError:
                continue

            dims = b.function.dimensions
            lhs = b.indexed[[b.lastmap.get(d, Map(d, d)).b for d in dims]]
            rhs = b.function[[b.lastmap.get(d, Map(d, d)).f for d in dims]]

            expr = lower_exprs(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
            ispace = b.written

            # Buffering creates a storage-related dependence along the
            # contracted dimensions
            properties = dict(c.properties)
            for d in b.contraction_mapper:
                d = ispace[d].dim  # E.g., `time_sub -> time`
                properties[d] = normalize_properties(properties[d], {SEQUENTIAL})

            processed.append(
                c.rebuild(exprs=expr, ispace=ispace, properties=properties)
            )

        # Substitute buffered Functions with the newly created buffers
        exprs = [uxreplace(e, subs) for e in c.exprs]
        ispace = c.ispace
        for b in buffers:
            ispace = ispace.augment(b.sub_iterators)
        processed.append(c.rebuild(exprs=exprs, ispace=ispace))

        # Also append the copy-back if `e` is the last-write of some buffers
        # E.g., `u[time + 1, x] = ub[sb1, x]`
        for b in buffers:
            if b.is_readonly:
                continue
            try:
                c.exprs.index(b.lastwrite)
            except ValueError:
                continue

            dims = b.function.dimensions
            lhs = b.function[[b.lastmap.get(d, Map(d, d)).f for d in dims]]
            rhs = b.indexed[[b.lastmap.get(d, Map(d, d)).b for d in dims]]

            expr = lower_exprs(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
            ispace = b.written

            # Buffering creates a storage-related dependence along the
            # contracted dimensions
            properties = dict(c.properties)
            for d in b.contraction_mapper:
                d = ispace[d].dim  # E.g., `time_sub -> time`
                properties[d] = normalize_properties(properties[d], {SEQUENTIAL})

            processed.append(
                c.rebuild(exprs=expr, ispace=ispace, properties=properties)
            )

    return processed
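# The buffering pass above contracts a dimension (typically time) into a small
# circular buffer: an initialization Eq, modulo indices stepping through the
# slots, and a copy-back into the original Function after the last write. A
# minimal NumPy sketch of the same idea -- hypothetical shapes and names, two
# time slots, not the code actually generated by the pass.

import numpy as np

nt, nx = 10, 16
u = np.zeros((nt, nx))
u[0] = np.arange(nx)

# Contract the time dimension into a 2-slot circular buffer
ub = np.zeros((2, nx))
ub[0] = u[0]                       # initialization (cf. the `initmap` Eqs)

for t in range(nt - 1):
    sa, sb = t % 2, (t + 1) % 2    # modulo indices stepping through the slots
    ub[sb] = ub[sa] + 1.0          # updates read/write through the buffer
    u[t + 1] = ub[sb]              # copy-back into the buffered Function

assert np.allclose(u[-1], u[0] + (nt - 1))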