def _optimize_schedule_padding(cluster, schedule, platform):
    """
    Extend the innermost IterationInterval of each scheduled tensor temporary
    so that its size better matches the SIMD vector length.

    An entry is only touched when the Dimension of its innermost
    IterationInterval is flagged ROUNDABLE in ``cluster.properties``. Entries
    for which the attempt raises a TypeError or a KeyError are forwarded to
    the output Schedule unchanged.

    Returns a new Schedule carrying over the ``dmapper`` and ``rmapper`` of
    the input ``schedule``.
    """
    padded = []
    for entry in schedule:
        try:
            innermost = entry.ispace.itintervals[-1]
            if ROUNDABLE not in cluster.properties[innermost.dim]:
                ispace = entry.ispace
            else:
                vl = platform.simd_items_per_reg(cluster.dtype)
                # NOTE(review): the upper bound grows by `size % vl`, which is
                # not a round-up to the next multiple of `vl` in general --
                # confirm this is the intended amount of padding
                extra = Interval(innermost.dim, 0, innermost.interval.size % vl)
                ispace = entry.ispace.add(extra)
            padded.append(ScheduledAlias(entry.alias, entry.writeto, ispace,
                                         entry.aliaseds, entry.indicess))
        except (TypeError, KeyError):
            # Best-effort optimization: leave this entry as it was
            padded.append(entry)

    return Schedule(*padded, dmapper=schedule.dmapper, rmapper=schedule.rmapper)
def make_rotations_table(d, v):
    """
    Build the table of all rotations of ``range(v+1)`` and return it in
    compact form: a list with one ``Interval(d, min, max)`` per table row.
    """
    size = v + 1

    # Strictly-upper-triangular matrix of the positive offsets `col - row`
    upper = np.array([[max(col - row, 0) for col in range(size)]
                      for row in range(size)])

    # Antisymmetrize, then flip the row order
    table = (upper - upper.T)[::-1, :]

    # Shift the table so that the middle rotation ends up at the top
    table = np.roll(table, -int(np.floor(v / 2)), axis=0)

    # Each row is summarized by its extremes
    return [Interval(d, row.min(), row.max()) for row in table]
def _pivot_min_intervals(self):
    """
    Map each Dimension to the smallest Interval such that, by evaluating
    the pivot over it, all Candidates get evaluated as well.

    For every Candidate in ``self``, the distance of its offsets from the
    pivot's offsets is computed; the per-Dimension minimum and maximum of
    these distances become the Interval extremes.
    """
    pivot = self.pivot

    bounds = defaultdict(lambda: [np.inf, -np.inf])
    for candidate in self:
        per_offset = [o.distance(v)
                      for o, v in zip(candidate.offsets, pivot.offsets)]
        for dim, values in LabeledVector.transpose(*per_offset):
            # One value is picked out of the set of distances along `dim`
            # (presumably they all coincide -- verify against the callers)
            value = set(values).pop()
            extremes = bounds[dim]
            extremes[0] = min(extremes[0], value)
            extremes[1] = max(extremes[1], value)

    return {dim: Interval(dim, lo, hi) for dim, (lo, hi) in bounds.items()}
def writeto(self):
    """
    The `writeto` IterationSpace, namely the iteration space to be
    traversed in order to initialize the buffer.

    One interval is produced per Dimension of ``self.buffer``; Dimensions
    missing from ``self.itintervals_mapper`` get a zero-extent Interval, no
    sub-iterators and a Forward direction.
    """
    intervals, sub_iterators, directions = [], {}, {}

    for dim in self.buffer.dimensions:
        try:
            entry = self.itintervals_mapper[dim]
        except KeyError:
            # E.g., the contraction Dimension `db0`
            assert dim in self.contraction_mapper.values()
            entry = (Interval(dim, 0, 0), (), Forward)

        interval, si, direction = entry
        intervals.append(interval)
        sub_iterators[dim] = si
        directions[dim] = direction

    # The buffer Dimensions, in their own order, form the only relation
    group = IntervalGroup(intervals, relations=(self.buffer.dimensions,))

    return IterationSpace(group, sub_iterators, directions)
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Detect aliasing expressions in ``cluster`` and capture them into vector
    temporaries, each computed by a dedicated Cluster.

    Returns a list with the newly created alias Clusters followed by the
    input ``cluster`` rebuilt with the aliasing expressions replaced by
    accesses to the temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]

       becomes

       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]

       becomes

       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    graph = FlowGraph(cluster.exprs)
    time_invariants = {node.rhs: graph.time_invariant(node)
                       for node in graph.values()}

    # Split the expressions into candidates for aliasing and leftovers
    processed = []
    candidates = OrderedDict()
    for key, node in graph.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(node.rhs))
        cost = estimate_cost(node, True) * naliases
        if (cost >= self.MIN_COST_ALIAS and naliases > 1) or \
           (cost >= self.MIN_COST_ALIAS_INV and time_invariants[node.rhs]):
            candidates[node.rhs] = key
        else:
            processed.append(node)

    # Create the alias Clusters as well as the substitution rules for the
    # new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals (time Dimensions are left out)
        spatial = []
        for i in cluster.ispace.intervals:
            if i.dim.is_Time:
                continue
            extent = alias.relaxed_diameter.get(i.dim, (0, 0))
            spatial.append(Interval(i.dim, *extent))
        writeto = IntervalGroup(spatial)

        # Optimization: a SpaceDimension needn't be retained if it does not
        # induce a flow/anti dependence (`i.offsets` tells how much halo
        # would be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            first = dep_inducing[0]
            writeto = IntervalGroup(writeto[writeto.index(first):])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # The temporary storing `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # The expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin)

        # The substitution rules allowing to use the temporary in place of
        # the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            temporary = array[access]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = temporary
            subs[aliased] = temporary

        # The `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # The `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {}
        for k, v in accesses.items():
            if k:
                parts[k] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # The Cluster computing `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def process(cluster, chosen, aliases, sregistry, platform):
    """
    Create one Cluster per selected alias, each computing a temporary Array,
    together with the substitution rules that replace the aliasing
    expressions with accesses to these Arrays.

    Parameters
    ----------
    cluster
        The Cluster the aliases were extracted from; it provides the base
        IterationSpace, DataSpace, dtype and properties.
    chosen
        A mapping whose keys are aliased expressions; an alias is skipped
        if none of its aliased expressions is a key of `chosen`. The
        values get a substitution rule too.
    aliases
        The alias collection; iterated via ``aliases.iter(cluster.ispace)``
        and queried for its ``index_mapper``.
    sregistry
        Provider of unique names for the temporary Arrays.
    platform
        Queried for the number of SIMD items per register.

    Returns
    -------
    (clusters, subs)
        `clusters` lists the new Clusters, in reverse order of creation;
        `subs` maps expressions to the Array accesses replacing them.
    """
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim] for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            # `aliased` might not be in `chosen`, perhaps because it is part
            # of a composite alias; then only the rule above is needed
            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            # Deliberately best-effort: `ispace` is left as is
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`; the most recently built
        # Cluster goes first
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
def _optimize_schedule_rotations(schedule, sregistry):
    """
    Rework the schedule so that the tensor temporaries "rotate" along the
    outermost Dimension, thus trading a parallel Dimension for a smaller
    working set size.

    Consecutive schedule entries sharing the same `writeto` are processed
    as a group. Groups whose outermost Dimension has no entry in
    ``schedule.dmapper`` are forwarded unchanged.

    Returns a new Schedule with the same ``dmapper`` as the input and an
    ``rmapper`` collecting, per Dimension, the ModuloDimensions created
    for the rotated accesses.
    """
    # The rotations Dimension is the outermost
    ridx = 0

    rmapper = defaultdict(list)
    processed = []
    for k, group in groupby(schedule, key=lambda i: i.writeto):
        members = list(group)

        candidate = k[ridx]
        d = candidate.dim
        try:
            ds = schedule.dmapper[d]
        except KeyError:
            # Can't do anything if `d` isn't an IncrDimension over a block
            processed.extend(members)
            continue

        n = candidate.min_size
        assert n > 0

        iis = candidate.lower
        iib = candidate.upper

        # The Dimensions driving the rotation
        ii = ModuloDimension('%sii' % d, ds, iis, incr=iib)
        cd = CustomDimension(name='%s%s' % (d, d), symbolic_min=ii,
                             symbolic_max=iib, symbolic_size=n)
        dsi = ModuloDimension('%si' % ds, cd, cd + ds - iis, n)

        # One ModuloDimension per distinct index along the rotations
        # Dimension, shared by all members of the group
        mapper = OrderedDict()
        for item in members:
            # Update `indicess` to use `xs0`, `xs1`, ...
            mds = []
            for indices in item.indicess:
                v = indices[ridx]
                md = mapper.get(v)
                if md is None:
                    name = sregistry.make_name(prefix='%sr' % d.name)
                    md = ModuloDimension(name, ds, v, n)
                    mapper[v] = md
                mds.append(md)

            indicess = []
            for md, indices in zip(mds, item.indicess):
                indicess.append(indices[:ridx] + [md] + indices[ridx + 1:])

            # Update `writeto` by switching `d` to `dsi`
            intervals = k.intervals.switch(d, dsi).zero(dsi)
            sub_iterators = dict(k.sub_iterators)
            sub_iterators[d] = dsi
            writeto = IterationSpace(intervals, sub_iterators)

            # Transform `alias` by shifting `d` by `cd`
            alias = item.alias.xreplace({d: d + cd})

            # Extend `ispace` to iterate over rotations
            # Note: we're by construction in-bounds here
            d1 = writeto[ridx + 1].dim
            intervals = IntervalGroup(Interval(cd, 0, 0), relations={(d, cd, d1)})
            rispace = IterationSpace(intervals, {cd: dsi}, {cd: Forward})
            aispace = item.ispace.zero(d)
            aispace = aispace.augment({d: mds + [ii]})
            ispace = IterationSpace.union(rispace, aispace)

            processed.append(
                ScheduledAlias(alias, writeto, ispace, item.aliaseds, indicess)
            )

        # Update the rotations mapper
        rmapper[d].extend(mapper.values())

    return Schedule(*processed, dmapper=schedule.dmapper, rmapper=rmapper)
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two type of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Sparse Clusters (``cluster.is_sparse``) are returned untouched. Otherwise
    the return value is the group of alias Clusters followed by the input
    ``cluster`` rebuilt with the aliasing expressions replaced by accesses
    to the temporaries.

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]

       becomes

       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]

       becomes

       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True) * naliases
        if cost >= self.thresholds['min-cost-alias'] and \
           (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/; the intervals
        # of `cluster.ispace` are resized to the alias' relaxed diameter
        _, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        # (a dedicated name is used so as not to rebind `mapper`, which
        # holds the result of `collect`)
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in accesses.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            temporary = Indexed(function.indexed, *tuple(access))
            # The loop above only skips an alias when *none* of its aliased
            # expressions is a candidate, so a non-candidate may show up here
            if aliased in candidates:
                rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def process(candidates, aliases, cluster, template):
    """
    Turn the aliasing expressions into Clusters.

    For each alias with at least one aliased expression in ``candidates``, a
    temporary Array is created along with the Cluster computing it.

    Returns the list of new Clusters and the substitution rules mapping the
    aliasing expressions to accesses to the temporary Arrays.
    """
    clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals (time Dimensions are left out)
        spatial = []
        for i in cluster.ispace.intervals:
            if i.dim.is_Time:
                continue
            extent = alias.relaxed_diameter.get(i.dim, (0, 0))
            spatial.append(Interval(i.dim, *extent))
        writeto = IntervalGroup(spatial)

        # Optimization: a SpaceDimension needn't be retained if it does not
        # induce a flow/anti dependence (`i.offsets` tells how much halo
        # would be required to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            first = dep_inducing[0]
            writeto = IntervalGroup(writeto[writeto.index(first):])
        except IndexError:
            perf_adv("Could not optimize some of the detected redundancies")

        # The temporary storing `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # The expression evaluating `alias`; rules from earlier aliases are
        # applied to `origin` first
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin.xreplace(subs))

        # The substitution rules allowing to use the temporary in place of
        # the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            temporary = array[access]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = temporary
            subs[aliased] = temporary

        # The `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            innermost = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[innermost.dim]:
                from devito.parameters import configuration
                platform = configuration['platform']
                vl = platform.simd_items_per_reg(cluster.dtype)
                padding = Interval(innermost.dim, 0, innermost.interval.size % vl)
                ispace = ispace.add(padding)
        except (TypeError, KeyError):
            pass

        # The `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {}
        for k, v in accesses.items():
            if k:
                parts[k] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # The Cluster computing `alias`
        built = cluster.rebuild(exprs=[expression], ispace=ispace, dspace=dspace)
        clusters.append(built)

    return clusters, subs