def _aliases_from_clusters(self, clusters, exclude, meta):
    """
    Extract aliases out of ``clusters`` and return the resulting Clusters.

    The expressions of all ``clusters`` are flattened into one list. For each
    mapper yielded by ``self._generate`` a candidate ``Variant`` is built; the
    best one is picked, optimized and lowered into new Clusters. The input
    ``clusters``, rebuilt with the alias substitutions applied, are appended
    after them.

    Returns an empty list if no variant exposes any alias.
    """
    flat_exprs = flatten([c.exprs for c in clusters])

    # [Clusters]_n -> [Schedule]_m
    variants = []
    for mapper in self._generate(flat_exprs, exclude):
        # Clusters -> AliasList
        found = collect(mapper.extracted, meta.ispace, self.opt_minstorage)
        pexprs, aliases = choose(found, flat_exprs, mapper, self.opt_mingain)
        if aliases:
            # AliasList -> Schedule
            candidate = lower_aliases(aliases, meta, self.opt_maxpar)
            variants.append(Variant(candidate, pexprs))

    # Nothing to do if no variant was produced
    if not variants:
        return []

    # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off)
    schedule, remaining = pick_best(variants, self.opt_schedule_strategy,
                                    self._eval_variants_delta)

    # Schedule -> Schedule (optimization)
    if self.opt_rotate:
        schedule = optimize_schedule_rotations(schedule, self.sregistry)
    schedule = optimize_schedule_padding(schedule, meta, self.platform)

    # Schedule -> [Clusters]_k
    processed, subs = lower_schedule(schedule, meta, self.sregistry,
                                     self.opt_ftemps)

    # [Clusters]_k -> [Clusters]_k (optimization)
    if self.opt_multisubdomain:
        processed = optimize_clusters_msds(processed)

    # [Clusters]_k -> [Clusters]_{k+n}
    for c in clusters:
        # Consume as many expressions as `c` originally had
        n = len(c.exprs)
        head, remaining = remaining[:n], remaining[n:]

        replaced = [uxreplace(e, subs) for e in head]

        ispace = c.ispace.augment(schedule.dmapper).augment(schedule.rmapper)

        parts = {}
        for f, v in detect_accesses(replaced).items():
            if f:
                parts[f] = IntervalGroup(build_intervals(v)).relaxed
        dspace = DataSpace(c.dspace.intervals, parts)

        processed.append(c.rebuild(exprs=replaced, ispace=ispace, dspace=dspace))

    # All of the picked expressions must have been handed back to a Cluster
    assert len(remaining) == 0

    return processed
def derive_dspace(expr, ispace):
    """
    Build a DataSpace for ``expr``.

    The DataSpace is created from ``ispace.intervals`` plus one relaxed
    IntervalGroup per truthy key reported by ``detect_accesses(expr)``.
    """
    parts = {}
    for key, value in detect_accesses(expr).items():
        # Falsy keys are skipped
        if key:
            parts[key] = IntervalGroup(build_intervals(value)).relaxed

    return DataSpace(ispace.intervals, parts)
def rebuild(cluster, others, aliases, subs):
    """
    Rebuild ``cluster`` around the non-aliasing expressions ``others``.

    ``subs`` is applied to each expression in ``others``; the IterationSpace
    is augmented with ``aliases.index_mapper`` and the DataSpace is recomputed
    from the accesses of the substituted expressions.
    """
    # The non-aliasing expressions, with the substitutions applied
    replaced = [uxreplace(e, subs) for e in others]

    # Any new ShiftedDimension goes into the IterationSpace
    ispace = cluster.ispace.augment(aliases.index_mapper)

    # The DataSpace must account for the new symbols
    parts = {}
    for f, v in detect_accesses(replaced).items():
        if f:
            parts[f] = IntervalGroup(build_intervals(v)).relaxed
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=replaced, ispace=ispace, dspace=dspace)
def rebuild(cluster, others, schedule, subs):
    """
    Create a new Cluster out of ``cluster`` with the optimized aliases
    plugged in.

    ``subs`` is applied to each expression in ``others``; the IterationSpace
    is augmented first with ``schedule.dmapper`` and then with
    ``schedule.rmapper``; the DataSpace is recomputed from the accesses of the
    substituted expressions.
    """
    replaced = [uxreplace(e, subs) for e in others]

    ispace = cluster.ispace.augment(schedule.dmapper).augment(schedule.rmapper)

    parts = {}
    for f, v in detect_accesses(replaced).items():
        if f:
            parts[f] = IntervalGroup(build_intervals(v)).relaxed
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=replaced, ispace=ispace, dspace=dspace)
def optimize_clusters_msds(clusters):
    """
    Relax the Clusters whose expressions are defined over MultiSubDomains, so
    that they get computed over the entire domain instead. This makes code
    lifting by later passes more likely.

    Clusters without any MultiSubDimension among their iteration Dimensions
    are returned untouched.
    """
    def remap(dictionary, mapper):
        # Replace the keys appearing in `mapper`, leave the others as they are
        return {mapper.get(d, d): v for d, v in dictionary.items()}

    processed = []
    for c in clusters:
        msds = [d for d in c.ispace.itdimensions
                if isinstance(d, MultiSubDimension)]

        if not msds:
            processed.append(c)
            continue

        # Each MultiSubDimension is replaced by its root
        mapper = {d: d.root for d in msds}
        exprs = [uxreplace(e, mapper) for e in c.exprs]

        ispace = c.ispace.relaxed(msds)

        # Rebuild the DataSpace, dropping the Intervals over the `msds`
        parts = {}
        for f, v in detect_accesses(exprs).items():
            if f:
                parts[f] = IntervalGroup(build_intervals(v)).relaxed
        intervals = [i for i in c.dspace if i.dim not in msds]
        dspace = DataSpace(intervals, parts)

        guards = remap(c.guards, mapper)
        properties = remap(c.properties, mapper)
        syncs = remap(c.syncs, mapper)

        processed.append(c.rebuild(exprs=exprs, ispace=ispace, dspace=dspace,
                                   guards=guards, properties=properties,
                                   syncs=syncs))

    return processed
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Detect aliasing expressions and store them into vector temporaries.

    Returns the list of Clusters computing the temporaries, followed by
    ``cluster`` rebuilt with the remaining (substituted) expressions.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]

       becomes

       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]

       becomes

       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    # See collect.__doc__ for more information about "aliases"
    aliases = collect(cluster.exprs)

    # The redundancies end up in space-varying temporaries
    graph = FlowGraph(cluster.exprs)
    time_invariants = {}
    for node in graph.values():
        time_invariants[node.rhs] = graph.time_invariant(node)

    # Split the expressions into candidates and already-processed ones
    processed = []
    candidates = OrderedDict()
    for key, node in graph.items():
        # Cost check (keeps the memory footprint under control)
        naliases = len(aliases.get(node.rhs))
        cost = estimate_cost(node, True)*naliases
        if (cost >= self.MIN_COST_ALIAS and naliases > 1) or \
                (cost >= self.MIN_COST_ALIAS_INV and time_invariants[node.rhs]):
            candidates[node.rhs] = key
        else:
            processed.append(node)

    # Build the alias Clusters along with the substitution rules for the
    # new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals (time Dimensions are left out)
        writeto = IntervalGroup([
            Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
            for i in cluster.ispace.intervals if not i.dim.is_Time
        ])

        # Optimization: a SpaceDimension that does not induce a flow/anti
        # dependence needs not be retained (`i.offsets` tells how much halo
        # would be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # The temporary storing `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # The expression evaluating `alias`
        expression = Eq(array[tuple(i.dim - i.lower for i in writeto)], origin)

        # The substitution rules letting the temporary be used in place of
        # the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            shifted = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[shifted]
            subs[aliased] = array[shifted]

        # The `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # The `alias` DataSpace
        parts = {}
        for f, v in detect_accesses(expression).items():
            if f:
                parts[f] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # The new Cluster for `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def process(cluster, chosen, aliases, sregistry, platform):
    """
    Create the Clusters computing the chosen aliases of ``cluster``.

    For each entry yielded by ``aliases.iter(cluster.ispace)`` having at least
    one aliased expression in ``chosen``, an Array is created to store the
    alias, and a Cluster computing it is rebuilt from ``cluster``.

    Parameters
    ----------
    cluster
        The Cluster providing IterationSpace, DataSpace, dtype and properties.
    chosen
        Mapping whose keys are the selected aliasing expressions; the value
        of a selected expression is also substituted with the temporary.
    aliases
        Provides ``iter(ispace)`` and ``index_mapper``.
    sregistry
        Provides ``make_name()``, used to name the temporaries.
    platform
        Provides ``simd_items_per_reg(dtype)``.

    Returns
    -------
    tuple
        ``(clusters, subs)``. Each new Cluster is inserted at the front of
        ``clusters``, so the list is in reverse creation order. ``subs`` maps
        the aliasing expressions to the Array accesses replacing them.
    """
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        # (computed once; it used to be evaluated twice in a row)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim] for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            # If not in `chosen`, it is perhaps part of a composite alias
            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length. This is best-effort:
        # a TypeError or KeyError simply leaves `ispace` untouched
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
def lower_schedule(cluster, schedule, sregistry, options):
    """
    Lower a Schedule into a sequence of Clusters, one per scheduled alias.

    Returns
    -------
    tuple
        ``(clusters, subs)``, where ``clusters`` are the Clusters computing
        the aliases and ``subs`` maps each aliasing expression to the object
        replacing it (an indexed temporary, or a scalar Symbol in the
        degenerate case of an empty write-to space).
    """
    # TempFunctions if `cire-ftemps` is set. Otherwise, the typical case -- the
    # user does *not* "see" the CIRE-created temporaries
    make = TempFunction if options['cire-ftemps'] else Array

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        # Basic info to create the temporary that will hold the alias
        name = sregistry.make_name()
        dtype = cluster.dtype

        if writeto:
            # The Dimensions defining the shape of Array
            # Note: with SubDimensions, we may have the following situation:
            #
            # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
            #   r[zi] = ...
            #
            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
            # need as much room as in `zi`'s parent to avoid going OOB
            # Aside from ugly generated code, the reason we do not rather shift
            # the indices is that it prevents future passes to transform the
            # loop bounds (e.g., MPI's comp/comm overlap does that)
            dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

            # The halo is dictated by the size of the write-to space
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

            # The indices used to write into the temporary
            windices = []
            for i in writeto:
                try:
                    # E.g., `xs`
                    sub_iterators = writeto.sub_iterators[i.dim]
                    assert len(sub_iterators) == 1
                    windices.append(sub_iterators[0])
                except KeyError:
                    # E.g., `z` -- a non-shifted Dimension
                    windices.append(i.dim - i.lower)

            obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype)
            expression = Eq(obj[windices], alias)

            # Each aliasing expression is replaced by an access to `obj`
            replacements = {aliased: obj[idx]
                            for aliased, idx in zip(aliaseds, indicess)}
        else:
            # Degenerate case: scalar expression
            assert writeto.size == 0

            obj = Symbol(name=name, dtype=dtype)
            expression = Eq(obj, alias)

            # Each aliasing expression is replaced by `obj` itself
            replacements = {aliased: obj for aliased, _ in zip(aliaseds, indicess)}

        # Record the substitution rules for the aliasing expressions
        subs.update(replacements)

        # The `alias` DataSpace
        parts = {}
        for f, v in detect_accesses(expression).items():
            if f:
                parts[f] = \
                    IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop or weaken parallelism if necessary
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                weaker = {SEQUENTIAL}
            elif d not in writeto.dimensions:
                weaker = {PARALLEL_IF_PVT}
            else:
                continue
            properties[d] = normalize_properties(v, weaker)

        # Finally, the `alias` Cluster
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace,
                                properties=properties)
        clusters.append(built)

    return clusters, subs
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them
    to the later stages of the optimisation pipeline by introducing
    new temporaries of suitable rank.

    Two type of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Returns ``cluster`` itself if it is sparse. Otherwise, returns the
    (grouped) Clusters computing the new temporaries followed by ``cluster``
    rebuilt with the remaining, substituted expressions.

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]

       becomes

       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]

       becomes

       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        _, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        # (named `accesses` so as not to shadow the `mapper` from `collect`)
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in accesses.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            temporary = Indexed(function.indexed, *access)
            # The check above only guarantees that *some* aliased expression is
            # a candidate, so the lookup must be guarded to avoid a KeyError
            if aliased in candidates:
                rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def lower_schedule(cluster, schedule, chosen, sregistry, options):
    """
    Lower a Schedule into a sequence of Clusters.

    Only the scheduled aliases having at least one aliased expression in
    ``chosen`` are lowered.

    Returns
    -------
    tuple
        ``(clusters, subs)``, where ``clusters`` are the Clusters computing
        the aliases and ``subs`` maps the aliasing expressions to the Array
        accesses replacing them.
    """
    # The memory region of the Arrays. On the heap, unless the user has
    # explicitly requested allocation on the stack
    if options['cire-onstack']:
        scope = 'stack'
    else:
        scope = 'heap'

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        if not any(i in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array. It can safely be `shared` only if
        # all of the PARALLEL `cluster` Dimensions appear in `writeto`
        parallel = {d for d, v in cluster.properties.items() if PARALLEL in v}
        if parallel == set(writeto.itdimensions):
            sharing = 'shared'
        else:
            sharing = 'local'

        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, scope=scope, sharing=sharing)

        # The indices used to write into the Array
        windices = []
        for i in writeto:
            try:
                # E.g., `xs`
                sub_iterators = writeto.sub_iterators[i.dim]
                assert len(sub_iterators) == 1
                windices.append(sub_iterators[0])
            except KeyError:
                # E.g., `z` -- a non-shifted Dimension
                windices.append(i.dim - i.lower)

        expression = Eq(array[windices], alias)

        # The substitution rules letting the temporary be used in place of the
        # aliasing expressions
        for aliased, idx in zip(aliaseds, indicess):
            subs[aliased] = array[idx]
            # If not in `chosen`, it is perhaps part of a composite alias
            if aliased in chosen:
                subs[chosen[aliased]] = array[idx]

        # The `alias` DataSpace
        parts = {}
        for f, v in detect_accesses(expression).items():
            if f:
                parts[f] = \
                    IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop parallelism if using ModuloDimensions (due to rotations)
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})

        # Finally, the `alias` Cluster
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace,
                                properties=properties)
        clusters.append(built)

    return clusters, subs
def process(candidates, aliases, cluster, template):
    """
    Build Clusters out of aliasing expressions.

    Returns
    -------
    tuple
        ``(clusters, subs)``, where ``clusters`` are the Clusters computing
        the aliases and ``subs`` maps the aliasing expressions to the Array
        accesses replacing them.
    """
    clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals (time Dimensions are left out)
        writeto = IntervalGroup([
            Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
            for i in cluster.ispace.intervals if not i.dim.is_Time
        ])

        # Optimization: a SpaceDimension that does not induce a flow/anti
        # dependence needs not be retained (`i.offsets` tells how much halo
        # would be required to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            perf_adv("Could not optimize some of the detected redundancies")

        # The temporary storing `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # The expression evaluating `alias`; the substitutions collected so far
        # are applied to `origin`
        write_access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[write_access], origin.xreplace(subs))

        # The substitution rules letting the temporary be used in place of the
        # aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            shifted = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[shifted]
            subs[aliased] = array[shifted]

        # The `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Optimize the `alias` IterationSpace: if possible, the innermost
        # IterationInterval is rounded up to a multiple of the vector length
        try:
            innermost = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[innermost.dim]:
                from devito.parameters import configuration
                platform = configuration['platform']
                vl = platform.simd_items_per_reg(cluster.dtype)
                padding = Interval(innermost.dim, 0, innermost.interval.size % vl)
                ispace = ispace.add(padding)
        except (TypeError, KeyError):
            pass

        # The `alias` DataSpace
        parts = {}
        for f, v in detect_accesses(expression).items():
            if f:
                parts[f] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # The new Cluster for `alias`
        built = cluster.rebuild(exprs=[expression], ispace=ispace, dspace=dspace)
        clusters.append(built)

    return clusters, subs