def schedule(self, ispace):
    """
    Generate the aliases in scheduling order.

    The aliases can be scheduled in any order, but we privilege the one
    that minimizes storage while maximizing fusion: aliases with the
    shortest write-to region are yielded first.

    Parameters
    ----------
    ispace : IterationSpace
        The iteration space onto which the alias intervals are projected
        to derive each write-to region.
    """
    items = []
    for alias, (intervals, aliaseds, distances) in self.items():
        mapper = {i.dim: i for i in intervals}

        # Project `intervals` onto `ispace` to build the write-to region
        writeto = []
        for i in ispace.intervals:
            interval = mapper.get(i.dim)
            if interval is None:
                continue
            if not writeto and interval == interval.zero():
                # Optimize away unnecessary temporary Dimensions
                continue
            writeto.append(interval)

        if writeto:
            writeto = IntervalGroup(writeto, relations=ispace.relations)
        else:
            # E.g., an `alias` having 0-distance along all Dimensions
            writeto = IntervalGroup(intervals, relations=ispace.relations)

        items.append((alias, writeto, aliaseds, distances))

    # Shortest write-to region first. `sorted` is stable, so ties keep
    # insertion order -- equivalent to (but faster than) the previous
    # repeated min-extraction, which was O(n^2)
    yield from sorted(items, key=lambda i: len(i[1]))
def iter(self, ispace):
    """
    Generate the aliases in scheduling order.

    The aliases can legally be scheduled in many different orders, but we
    privilege the one that minimizes storage while maximizing fusion:
    aliases with the shortest write-to region are yielded first.
    """
    items = []
    for alias, (intervals, aliaseds, distances) in self.items():
        # Map each Dimension (and, for non-linear derived Dimensions such as
        # ConditionalDimensions, also their parents) to its Interval
        mapper = {i.dim: i for i in intervals}
        mapper.update({i.dim.parent: i for i in intervals
                       if i.dim.is_NonlinearDerived})

        # Becomes True as soon as a Dimension in `ispace` is found to
        # be independent of `intervals`
        flag = False

        writeto = []
        for i in ispace.intervals:
            try:
                interval = mapper[i.dim]
            except KeyError:
                if not any(i.dim in d._defines for d in mapper):
                    # E.g., `t[0,0]<0>` in the case of t-invariant aliases,
                    # whereas if `i.dim` is `x0_blk0` in `x0_blk0[0,0]<0>` then
                    # we would not enter here
                    flag = True
                continue

            # Try to optimize away unnecessary temporary Dimensions
            if not flag and interval == interval.zero():
                continue

            # Adjust the Interval's stamp
            # E.g., `i=x[0,0]<1>` and `interval=x[-4,4]<0>`. We need to
            # use `<1>` which is the actual stamp used in the Cluster
            # from which the aliasing expressions were extracted
            assert i.stamp >= interval.stamp
            interval = interval.lift(i.stamp)

            writeto.append(interval)
            flag = True

        if writeto:
            writeto = IntervalGroup(writeto, relations=ispace.relations)
        else:
            # E.g., an `alias` having 0-distance along all Dimensions
            writeto = IntervalGroup(intervals, relations=ispace.relations)

        items.append((alias, writeto, aliaseds, distances))

    # Shortest write-to region first
    queue = list(items)
    while queue:
        item = min(queue, key=lambda i: len(i[1]))
        queue.remove(item)
        yield item
def written(self):
    """
    The `written` IterationSpace, that is the iteration space that must be
    iterated over in order to read all of the written buffer values.

    Derived by mapping each Dimension of the buffered Function through
    `self.subdims_mapper` and looking up its IterationInterval, SubIterators
    and direction in `self.itintervals_mapper`.
    """
    intervals = []
    sub_iterators = {}
    directions = {}
    for dd in self.function.dimensions:
        d = dd.xreplace(self.subdims_mapper)
        try:
            interval, si, direction = self.itintervals_mapper[d]
        except KeyError:
            # E.g., d=time_sub
            assert d.is_NonlinearDerived
            d = d.root
            interval, si, direction = self.itintervals_mapper[d]
        intervals.append(interval)
        # Also carry over any extra sub-iterators registered for `d`
        sub_iterators[d] = si + as_tuple(self.sub_iterators[d])
        directions[d] = direction

    # A single relation fixing the Dimension ordering
    relations = (tuple(i.dim for i in intervals),)
    intervals = IntervalGroup(intervals, relations=relations)

    return IterationSpace(intervals, sub_iterators, directions)
def _aliases_from_clusters(self, clusters, exclude, meta): exprs = flatten([c.exprs for c in clusters]) # [Clusters]_n -> [Schedule]_m variants = [] for mapper in self._generate(exprs, exclude): # Clusters -> AliasList found = collect(mapper.extracted, meta.ispace, self.opt_minstorage) pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain) if not aliases: continue # AliasList -> Schedule schedule = lower_aliases(aliases, meta, self.opt_maxpar) variants.append(Variant(schedule, pexprs)) # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off) if not variants: return [] schedule, exprs = pick_best(variants, self.opt_schedule_strategy, self._eval_variants_delta) # Schedule -> Schedule (optimization) if self.opt_rotate: schedule = optimize_schedule_rotations(schedule, self.sregistry) schedule = optimize_schedule_padding(schedule, meta, self.platform) # Schedule -> [Clusters]_k processed, subs = lower_schedule(schedule, meta, self.sregistry, self.opt_ftemps) # [Clusters]_k -> [Clusters]_k (optimization) if self.opt_multisubdomain: processed = optimize_clusters_msds(processed) # [Clusters]_k -> [Clusters]_{k+n} for c in clusters: n = len(c.exprs) cexprs, exprs = exprs[:n], exprs[n:] cexprs = [uxreplace(e, subs) for e in cexprs] ispace = c.ispace.augment(schedule.dmapper) ispace = ispace.augment(schedule.rmapper) accesses = detect_accesses(cexprs) parts = { k: IntervalGroup(build_intervals(v)).relaxed for k, v in accesses.items() if k } dspace = DataSpace(c.dspace.intervals, parts) processed.append( c.rebuild(exprs=cexprs, ispace=ispace, dspace=dspace)) assert len(exprs) == 0 return processed
def derive_dspace(expr, ispace):
    """
    Build the DataSpace of `expr`, that is the data regions accessed by
    `expr`, over the intervals of the given `ispace`.
    """
    parts = {}
    for function, accesses in detect_accesses(expr).items():
        if function:
            parts[function] = IntervalGroup(build_intervals(accesses)).relaxed

    return DataSpace(ispace.intervals, parts)
def iter(self, ispace):
    """
    Generate the aliases in scheduling order.

    The aliases can be scheduled in any order, but we privilege the one
    that minimizes storage while maximizing fusion: aliases with the
    shortest write-to region are yielded first.
    """
    items = []
    for alias, (intervals, aliaseds, distances) in self.items():
        mapper = {i.dim: i for i in intervals}

        # Project `intervals` onto `ispace` to build the write-to region
        writeto = []
        for i in ispace.intervals:
            interval = mapper.get(i.dim)
            if interval is None:
                continue
            if not writeto and interval == interval.zero():
                # Optimize away unnecessary temporary Dimensions
                continue

            # Adjust the Interval's stamp
            # E.g., `i=x[0,0]<1>` and `interval=x[-4,4]<0>`. We need to
            # use `<1>` which is the actual stamp used in the Cluster
            # from which the aliasing expressions were extracted
            assert i.stamp >= interval.stamp
            interval = interval.lift(i.stamp)

            writeto.append(interval)

        if writeto:
            writeto = IntervalGroup(writeto, relations=ispace.relations)
        else:
            # E.g., an `alias` having 0-distance along all Dimensions
            writeto = IntervalGroup(intervals, relations=ispace.relations)

        items.append((alias, writeto, aliaseds, distances))

    # Shortest write-to region first. `sorted` is stable, so ties keep
    # insertion order -- equivalent to (but faster than) the previous
    # repeated min-extraction, which was O(n^2)
    yield from sorted(items, key=lambda i: len(i[1]))
def rebuild(cluster, others, aliases, subs):
    """
    Plug the optimized aliases into the input Cluster, producing a new
    Cluster with a suitably extended IterationSpace and a recomputed
    DataSpace.
    """
    # The non-aliasing expressions, with the aliasing sub-expressions
    # replaced by accesses to the new temporaries
    new_exprs = [uxreplace(e, subs) for e in others]

    # Any new ShiftedDimension enters the IterationSpace
    new_ispace = cluster.ispace.augment(aliases.index_mapper)

    # Recompute the DataSpace so that the new symbols are captured
    parts = {}
    for function, accesses in detect_accesses(new_exprs).items():
        if function:
            parts[function] = IntervalGroup(build_intervals(accesses)).relaxed
    new_dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=new_exprs, ispace=new_ispace, dspace=new_dspace)
def rebuild(cluster, others, schedule, subs):
    """
    Plug the optimized aliases into the input Cluster. This leads to creating
    a new Cluster with suitable IterationSpace and DataSpace.
    """
    # Replace the aliasing sub-expressions with the new temporaries
    new_exprs = [uxreplace(e, subs) for e in others]

    # Extend the IterationSpace with the Dimensions introduced by the
    # Schedule (shifting and rotation Dimensions)
    new_ispace = cluster.ispace.augment(schedule.dmapper)
    new_ispace = new_ispace.augment(schedule.rmapper)

    # Recompute the DataSpace so that the new symbols are captured
    parts = {}
    for function, accesses in detect_accesses(new_exprs).items():
        if function:
            parts[function] = IntervalGroup(build_intervals(accesses)).relaxed
    new_dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=new_exprs, ispace=new_ispace, dspace=new_dspace)
def optimize_clusters_msds(clusters):
    """
    Relax the clusters by letting the expressions defined over MultiSubDomains to
    rather be computed over the entire domain. This increases the likelihood of
    code lifting by later passes.
    """
    processed = []
    for c in clusters:
        # The MultiSubDimensions iterated over by this Cluster, if any
        msds = [d for d in c.ispace.itdimensions
                if isinstance(d, MultiSubDimension)]

        if msds:
            # Switch each MultiSubDimension to its root Dimension
            mapper = {d: d.root for d in msds}
            exprs = [uxreplace(e, mapper) for e in c.exprs]

            ispace = c.ispace.relaxed(msds)

            # Recompute the DataSpace over the relaxed expressions, dropping
            # the intervals along the now-replaced MultiSubDimensions
            accesses = detect_accesses(exprs)
            parts = {k: IntervalGroup(build_intervals(v)).relaxed
                     for k, v in accesses.items() if k}
            intervals = [i for i in c.dspace if i.dim not in msds]
            dspace = DataSpace(intervals, parts)

            # Re-key guards, properties and syncs by the root Dimensions
            guards = {mapper.get(d, d): v for d, v in c.guards.items()}
            properties = {mapper.get(d, d): v for d, v in c.properties.items()}
            syncs = {mapper.get(d, d): v for d, v in c.syncs.items()}

            processed.append(c.rebuild(exprs=exprs, ispace=ispace, dspace=dspace,
                                       guards=guards, properties=properties,
                                       syncs=syncs))
        else:
            # Nothing to do
            processed.append(c)

    return processed
def writeto(self):
    """
    The `writeto` IterationSpace, that is the iteration space that must be
    iterated over in order to initialize the buffer.
    """
    intervals = []
    sub_iterators = {}
    directions = {}
    for d in self.buffer.dimensions:
        try:
            interval, si, direction = self.itintervals_mapper[d]
        except KeyError:
            # E.g., the contraction Dimension `db0` -- not iterated over in
            # the input, so it gets a degenerate Interval and Forward direction
            assert d in self.contraction_mapper.values()
            interval, si, direction = Interval(d, 0, 0), (), Forward
        intervals.append(interval)
        sub_iterators[d] = si
        directions[d] = direction

    # A single relation fixing the buffer's Dimension ordering
    relations = (self.buffer.dimensions,)
    intervals = IntervalGroup(intervals, relations=relations)

    return IterationSpace(intervals, sub_iterators, directions)
def process(cluster, chosen, aliases, sregistry, platform):
    """
    Create Clusters from the chosen aliasing expressions.

    For each alias, a temporary Array is created to store the alias values,
    a Cluster computing the Array is built, and substitution rules mapping
    the aliasing expressions to accesses into the Array are accumulated.

    Returns
    -------
    (list of Cluster, dict)
        The alias Clusters (prepended in reverse discovery order) and the
        substitution rules for the aliasing expressions.
    """
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions,
                      halo=halo, dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim] for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias` and prepend it, since it
        # must be computed before its consumers
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
def lower_schedule(cluster, schedule, sregistry, options):
    """
    Turn a Schedule into a sequence of Clusters.

    For each ScheduledAlias, a temporary (an Array, a TempFunction, or a
    scalar Symbol in the degenerate case) is created to store the alias,
    and a Cluster computing it is built. Substitution rules mapping the
    aliasing expressions to accesses into the temporary are returned
    alongside the Clusters.
    """
    ftemps = options['cire-ftemps']

    if ftemps:
        make = TempFunction
    else:
        # Typical case -- the user does *not* "see" the CIRE-created temporaries
        make = Array

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        # Basic info to create the temporary that will hold the alias
        name = sregistry.make_name()
        dtype = cluster.dtype

        if writeto:
            # The Dimensions defining the shape of Array
            # Note: with SubDimensions, we may have the following situation:
            #
            # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
            #   r[zi] = ...
            #
            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
            # as much room as in `zi`'s parent to avoid going OOB
            # Aside from ugly generated code, the reason we do not rather shift the
            # indices is that it prevents future passes to transform the loop bounds
            # (e.g., MPI's comp/comm overlap does that)
            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

            # The halo must be set according to the size of writeto space
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

            # The indices used to write into the Array
            indices = []
            for i in writeto:
                try:
                    # E.g., `xs`
                    sub_iterators = writeto.sub_iterators[i.dim]
                    assert len(sub_iterators) == 1
                    indices.append(sub_iterators[0])
                except KeyError:
                    # E.g., `z` -- a non-shifted Dimension
                    indices.append(i.dim - i.lower)

            obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype)
            expression = Eq(obj[indices], alias)

            callback = lambda idx: obj[idx]
        else:
            # Degenerate case: scalar expression
            assert writeto.size == 0

            obj = Symbol(name=name, dtype=dtype)
            expression = Eq(obj, alias)

            # Ignore the indices -- a scalar carries no subscripts
            callback = lambda idx: obj

        # Create the substitution rules for the aliasing expressions
        subs.update({aliased: callback(indices)
                     for aliased, indices in zip(aliaseds, indicess)})

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop or weaken parallelism if necessary
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                # ModuloDimensions (rotations) force sequential execution
                properties[d] = normalize_properties(v, {SEQUENTIAL})
            elif d not in writeto.dimensions:
                properties[d] = normalize_properties(v, {PARALLEL_IF_PVT})

        # Finally, build the `alias` Cluster
        clusters.append(cluster.rebuild(exprs=expression, ispace=ispace,
                                        dspace=dspace, properties=properties))

    return clusters, subs
def _optimize_schedule_rotations(schedule, sregistry):
    """
    Transform the schedule such that the tensor temporaries "rotate" along
    the outermost Dimension. This trades a parallel Dimension for a smaller
    working set size.
    """
    # The rotations Dimension is the outermost
    ridx = 0

    rmapper = defaultdict(list)
    processed = []
    for k, group in groupby(schedule, key=lambda i: i.writeto):
        g = list(group)

        # The Dimension along which the group's temporaries will rotate
        candidate = k[ridx]
        d = candidate.dim
        try:
            ds = schedule.dmapper[d]
        except KeyError:
            # Can't do anything if `d` isn't an IncrDimension over a block
            processed.extend(g)
            continue

        # The number of rotating slots
        n = candidate.min_size
        assert n > 0

        iis = candidate.lower
        iib = candidate.upper

        # `ii`: the rotation entry point; `cd`: the rotation loop Dimension;
        # `dsi`: the slot selector within the rotating buffer
        ii = ModuloDimension('%sii' % d, ds, iis, incr=iib)
        cd = CustomDimension(name='%s%s' % (d, d), symbolic_min=ii,
                             symbolic_max=iib, symbolic_size=n)
        dsi = ModuloDimension('%si' % ds, cd, cd + ds - iis, n)

        mapper = OrderedDict()
        for i in g:
            # Update `indicess` to use `xs0`, `xs1`, ...
            mds = []
            for indices in i.indicess:
                v = indices[ridx]
                try:
                    md = mapper[v]
                except KeyError:
                    name = sregistry.make_name(prefix='%sr' % d.name)
                    md = mapper.setdefault(v, ModuloDimension(name, ds, v, n))
                mds.append(md)
            indicess = [indices[:ridx] + [md] + indices[ridx + 1:]
                        for md, indices in zip(mds, i.indicess)]

            # Update `writeto` by switching `d` to `dsi`
            intervals = k.intervals.switch(d, dsi).zero(dsi)
            sub_iterators = dict(k.sub_iterators)
            sub_iterators[d] = dsi
            writeto = IterationSpace(intervals, sub_iterators)

            # Transform `alias` by adding `i`
            alias = i.alias.xreplace({d: d + cd})

            # Extend `ispace` to iterate over rotations
            d1 = writeto[ridx + 1].dim  # Note: we're by construction in-bounds here
            intervals = IntervalGroup(Interval(cd, 0, 0), relations={(d, cd, d1)})
            rispace = IterationSpace(intervals, {cd: dsi}, {cd: Forward})
            aispace = i.ispace.zero(d)
            aispace = aispace.augment({d: mds + [ii]})
            ispace = IterationSpace.union(rispace, aispace)

            processed.append(ScheduledAlias(alias, writeto, ispace,
                                            i.aliaseds, indicess))

        # Update the rotations mapper
        rmapper[d].extend(list(mapper.values()))

    return Schedule(*processed, dmapper=schedule.dmapper, rmapper=rmapper)
def lower_aliases(cluster, aliases, in_writeto, maxpar):
    """
    Create a Schedule from an AliasMapper.

    Each alias is turned into a ScheduledAlias, carrying its write-to
    IterationSpace, its full iteration space, and the access indices for
    every aliasing expression.
    """
    dmapper = {}
    processed = []
    for alias, v in aliases.items():
        # Map each Dimension (and, for non-linear derived Dimensions, also
        # their parents) to its Interval
        imapper = {**{i.dim: i for i in v.intervals},
                   **{i.dim.parent: i for i in v.intervals
                      if i.dim.is_NonlinearDerived}}

        intervals = []
        writeto = []
        sub_iterators = {}
        indicess = [[] for _ in v.distances]
        for i in cluster.ispace.intervals:
            try:
                interval = imapper[i.dim]
            except KeyError:
                # E.g., `x0_blk0` or (`a[y_m+1]` => `y not in imapper`)
                intervals.append(i)
                continue

            assert i.stamp >= interval.stamp

            if not (writeto or interval != interval.zero() or in_writeto(i.dim, cluster)):
                # The alias doesn't require a temporary Dimension along i.dim
                intervals.append(i)
                continue

            assert not i.dim.is_NonlinearDerived

            # `i.dim` is necessarily part of the write-to region, so
            # we have to adjust the Interval's stamp. For example, consider
            # `i=x[0,0]<1>` and `interval=x[-4,4]<0>`; here we need to
            # use `<1>` as stamp, which is what appears in `cluster`
            interval = interval.lift(i.stamp)

            # We further bump the interval stamp if we were requested to trade
            # fusion for more collapse-parallelism
            interval = interval.lift(interval.stamp + int(maxpar))

            writeto.append(interval)
            intervals.append(interval)

            if i.dim.is_Incr:
                # Suitable IncrDimensions must be used to avoid OOB accesses.
                # E.g., r[xs][ys][z] => both `xs` and `ys` must be initialized such
                # that all accesses are within bounds. This requires traversing the
                # hierarchy of IncrDimensions to set `xs` (`ys`) in a way that
                # consecutive blocks access consecutive regions in `r` (e.g.,
                # `xs=x0_blk1-x0_blk0` with `blocklevels=2`; `xs=0` with
                # `blocklevels=1`, that is it degenerates in this case)
                try:
                    d = dmapper[i.dim]
                except KeyError:
                    dd = i.dim.parent
                    assert dd.is_Incr
                    if dd.parent.is_Incr:
                        # An IncrDimension in between IncrDimensions
                        m = i.dim.symbolic_min - i.dim.parent.symbolic_min
                    else:
                        m = 0
                    d = dmapper[i.dim] = IncrDimension("%ss" % i.dim.name, i.dim, m,
                                                       dd.symbolic_size, 1, dd.step)
                sub_iterators[i.dim] = d
            else:
                d = i.dim

            # Given the iteration `interval`, lower distances to indices
            for distance, indices in zip(v.distances, indicess):
                indices.append(d - interval.lower + distance[interval.dim])

        # The alias write-to space
        writeto = IterationSpace(IntervalGroup(writeto), sub_iterators)

        # The alias iteration space
        intervals = IntervalGroup(intervals, cluster.ispace.relations)
        ispace = IterationSpace(intervals, cluster.sub_iterators, cluster.directions)
        ispace = ispace.augment(sub_iterators)

        processed.append(ScheduledAlias(alias, writeto, ispace, v.aliaseds, indicess))

    # The [ScheduledAliases] must be ordered so as to reuse as many of the
    # `cluster`'s IterationIntervals as possible in order to honor the
    # write-to region. Another fundamental reason for ordering is to ensure
    # deterministic code generation
    processed = sorted(processed, key=lambda i: cit(cluster.ispace, i.ispace))

    return Schedule(*processed, dmapper=dmapper)
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the later
    stages of the optimisation pipeline by introducing new temporaries of suitable
    rank.

    Two type of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and\
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        intervals, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            temporary = Indexed(function.indexed, *tuple(access))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def process(candidates, aliases, cluster, template):
    """
    Create Clusters from aliasing expressions.
    """
    clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be required to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            # No dependence-inducing Interval found -- keep the full writeto
            perf_adv("Could not optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin.xreplace(subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Optimize the `alias` IterationSpace: if possible, the innermost
        # IterationInterval is rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                from devito.parameters import configuration
                vl = configuration['platform'].simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        clusters.append(cluster.rebuild(exprs=[expression], ispace=ispace,
                                        dspace=dspace))

    return clusters, subs
def make_schedule(cluster, aliases, in_writeto, options):
    """
    Create a Schedule from an AliasMapper.
    """
    max_par = options['cire-maxpar']

    dmapper = {}
    processed = []
    for alias, v in aliases.items():
        # Map each Dimension (and, for non-linear derived Dimensions, also
        # their parents) to its Interval
        imapper = {**{i.dim: i for i in v.intervals},
                   **{i.dim.parent: i for i in v.intervals
                      if i.dim.is_NonlinearDerived}}

        intervals = []
        writeto = []
        sub_iterators = {}
        indicess = [[] for _ in v.distances]
        for i in cluster.ispace.intervals:
            try:
                interval = imapper[i.dim]
            except KeyError:
                # E.g., `x0_blk0` or (`a[y_m+1]` => `y not in imapper`)
                intervals.append(i)
                continue

            assert i.stamp >= interval.stamp

            if not (writeto or interval != interval.zero() or in_writeto(i.dim, cluster)):
                # The alias doesn't require a temporary Dimension along i.dim
                intervals.append(i)
                continue

            assert not i.dim.is_NonlinearDerived

            # `i.dim` is necessarily part of the write-to region, so
            # we have to adjust the Interval's stamp. For example, consider
            # `i=x[0,0]<1>` and `interval=x[-4,4]<0>`; here we need to
            # use `<1>` as stamp, which is what appears in `cluster`
            interval = interval.lift(i.stamp)

            # We further bump the interval stamp if we were requested to trade
            # fusion for more collapse-parallelism
            interval = interval.lift(interval.stamp + int(max_par))

            writeto.append(interval)
            intervals.append(interval)

            if i.dim.is_Incr:
                # Suitable ShiftedDimensions must be used to avoid OOB accesses.
                # E.g., r[xs][ys][z] => both `xs` and `ys` must start at 0,
                # not at `x0_blk0`
                try:
                    d = dmapper[i.dim]
                except KeyError:
                    d = dmapper[i.dim] = ShiftedDimension(i.dim, name="%ss" % i.dim.name)
                sub_iterators[i.dim] = d
            else:
                d = i.dim

            # Given the iteration `interval`, lower distances to indices
            for distance, indices in zip(v.distances, indicess):
                indices.append(d - interval.lower + distance[interval.dim])

        # The alias write-to space
        writeto = IterationSpace(IntervalGroup(writeto), sub_iterators)

        # The alias iteration space
        intervals = IntervalGroup(intervals, cluster.ispace.relations)
        ispace = IterationSpace(intervals, cluster.sub_iterators, cluster.directions)
        ispace = ispace.augment(sub_iterators)

        processed.append(ScheduledAlias(alias, writeto, ispace, v.aliaseds, indicess))

    # Sort by write-to region for deterministic code generation
    processed = sorted(processed, key=lambda i: i.writeto)

    return Schedule(*processed, dmapper=dmapper)
def lower_schedule(cluster, schedule, chosen, sregistry, options):
    """
    Turn a Schedule into a sequence of Clusters.
    """
    onstack = options['cire-onstack']

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

        # The halo of the Array, sized after the writeto space
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array. It can safely be `shared` only if
        # all of the PARALLEL `cluster` Dimensions appear in `writeto`
        parallel = [d for d, v in cluster.properties.items() if PARALLEL in v]
        sharing = 'shared' if set(parallel) == set(writeto.itdimensions) else 'local'

        # The memory region of the Array. On the heap, unless the user has
        # explicitly requested allocation on the stack
        scope = 'stack' if onstack else 'heap'

        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, scope=scope, sharing=sharing)

        # The indices used to write into the Array
        indices = []
        for i in writeto:
            try:
                # E.g., `xs`
                sub_iterators = writeto.sub_iterators[i.dim]
                assert len(sub_iterators) == 1
                indices.append(sub_iterators[0])
            except KeyError:
                # E.g., `z` -- a non-shifted Dimension
                indices.append(i.dim - i.lower)

        expression = Eq(array[indices], alias)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, indices in zip(aliaseds, indicess):
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop parallelism if using ModuloDimensions (due to rotations)
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})

        # Finally, build the `alias` Cluster
        clusters.append(cluster.rebuild(exprs=expression, ispace=ispace,
                                        dspace=dspace, properties=properties))

    return clusters, subs
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search aliasing expressions and capture them into vector temporaries.

    Parameters
    ----------
    cluster : Cluster
        The Cluster whose expressions are searched for redundancies.
    template : callable
        Factory for unique temporary names; called as ``template()`` to name
        the Arrays created here.

    Returns
    -------
    list of Cluster
        The newly created alias Clusters, followed by a rebuilt version of
        the input `cluster` with the aliasing expressions replaced by reads
        from the temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    graph = FlowGraph(cluster.exprs)

    # Map each RHS to whether it is time-invariant, used by the cost test below
    time_invariants = {v.rhs: graph.time_invariant(v) for v in graph.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in graph.items():
        # Cost check (to keep the memory footprint under control)
        # NOTE(review): assumes `aliases.get` returns a sized collection (never
        # None) -- presumably an empty one when `v.rhs` is not an alias origin
        naliases = len(aliases.get(v.rhs))
        cost = estimate_cost(v, True)*naliases
        # A candidate either aliases more than one expression at sufficient
        # cost (test0), or is an expensive time-invariant (test1)
        test0 = lambda: cost >= self.MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= self.MIN_COST_ALIAS_INV and time_invariants[v.rhs]
        if test0() or test1():
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals (time Dimensions never appear, since the
        # temporaries are space-varying only)
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            # No dep-inducing Interval found; keep the full write-to region
            warning("Couldn't optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def iter(self, cluster, max_par):
    """
    The aliases can legally be scheduled in many different orders, but we
    privilege the one that minimizes storage while maximizing fusion.

    Parameters
    ----------
    cluster : Cluster
        The Cluster from which the aliasing expressions were extracted; its
        IterationSpace drives the construction of the write-to regions.
    max_par : bool
        If True, trade off storage for parallelism: PARALLEL Dimensions are
        retained as write-to Dimensions even when not strictly necessary.

    Yields
    ------
    (alias, writeto, ispace, aliaseds, distances)
        One tuple per alias, shortest write-to region first.
    """
    items = []
    for alias, (intervals, aliaseds, distances) in self.items():
        mapper = {i.dim: i for i in intervals}
        # Also key by the parent Dimension for NonlinearDerived Dimensions,
        # so that a lookup via either the derived or the parent Dimension hits
        mapper.update({i.dim.parent: i for i in intervals
                       if i.dim.is_NonlinearDerived})

        # Becomes True as soon as a Dimension in `ispace` is found to
        # be independent of `intervals`
        flag = False

        # `iteron`: the Intervals over which the alias will be computed
        # `writeto`: the subset of `iteron` defining the temporary's shape
        iteron = []
        writeto = []
        for i in cluster.ispace.intervals:
            try:
                interval = mapper[i.dim]
            except KeyError:
                if not any(i.dim in d._defines for d in mapper):
                    # E.g., `t[0,0]<0>` in the case of t-invariant aliases,
                    # whereas if `i.dim` is `x0_blk0` in `x0_blk0[0,0]<0>` then
                    # we would not enter here
                    flag = True
                iteron.append(i)
                continue

            # The Cluster's stamp can only be ahead of (or equal to) the one
            # recorded when the alias was collected
            assert i.stamp >= interval.stamp

            # Does `i.dim` actually need to be a write-to Dimension ?
            if flag or interval != interval.zero():
                # Yes, so we also have to adjust the Interval's stamp.
                # E.g., `i=x[0,0]<1>` and `interval=x[-4,4]<0>`. We need to
                # use `<1>` which is the actual stamp used in `cluster`
                interval = interval.lift(i.stamp)
                iteron.append(interval)
                writeto.append(interval)
                flag = True
            elif max_par and PARALLEL in cluster.properties[i.dim]:
                # Not necessarily, but with `max_par` the user is
                # expressing the wish to trade-off storage for parallelism
                interval = interval.lift(i.stamp + 1)
                iteron.append(interval)
                writeto.append(interval)
                flag = True
            else:
                # No; keep iterating over `i.dim`, but don't allocate along it
                iteron.append(i)

        if writeto:
            writeto = IntervalGroup(writeto, cluster.ispace.relations)
        else:
            # E.g., an `alias` having 0-distance along all Dimensions
            writeto = IntervalGroup(intervals, cluster.ispace.relations)

        # Construct the IterationSpace within which the alias will be computed
        ispace = IterationSpace(IntervalGroup(iteron, cluster.ispace.relations),
                                cluster.sub_iterators, cluster.directions)
        ispace = ispace.augment(self.index_mapper)

        items.append((alias, writeto, ispace, aliaseds, distances))

    # Yield in order of increasing write-to size; `min` is recomputed after
    # each removal, so this is a simple selection over the remaining items
    queue = list(items)
    while queue:
        # Shortest write-to region first
        item = min(queue, key=lambda i: len(i[1]))
        queue.remove(item)
        yield item