def choose(aliases, exprs, mapper, selector):
    """
    Analyze the detected aliases and, after applying a cost model to rule out
    the aliases with a bad flops/memory trade-off, inject them into the
    original expressions.
    """
    tot = 0
    retained = AliasMapper()

    # Pass 1: a set of aliasing expressions is retained only if its cost
    # exceeds the mode's threshold
    candidates = OrderedDict()
    aliaseds = []
    others = []
    for e, v in aliases.items():
        score = selector(e, len(v.aliaseds))
        if score > 0:
            candidates[e] = score
            aliaseds.extend(v.aliaseds)
        else:
            others.append(e)

    # Do not waste time if unnecessary
    if not candidates:
        return exprs, retained, tot

    # Project the candidate aliases into exprs to determine what the new
    # working set would be
    mapper = {k: v for k, v in mapper.items()
              if v.free_symbols & set(aliaseds)}
    templated = [uxreplace(e, mapper) for e in exprs]

    # Pass 2: a set of aliasing expressions is retained only if the tradeoff
    # between operation count reduction and working set increase is favorable
    owset = wset(others + templated)
    for e, v in aliases.items():
        try:
            score = candidates[e]
        except KeyError:
            score = 0
        if score > 1 or \
           score == 1 and max(len(wset(e)), 1) > len(wset(e) & owset):
            retained[e] = v
            tot += score

    # Do not waste time if unnecessary
    if not retained:
        return exprs, retained, tot

    # Substitute the chosen aliasing sub-expressions
    mapper = {k: v for k, v in mapper.items()
              if v.free_symbols & set(retained.aliaseds)}
    exprs = [uxreplace(e, mapper) for e in exprs]

    return exprs, retained, tot

def _buffering(expressions, callback, options):
    async_degree = options['buf-async-degree']

    # Locate all Function accesses within the provided `expressions`
    accessmap = AccessMapper(expressions)

    # Create the buffers
    buffers = []
    for n, (f, accessv) in enumerate(accessmap.items()):
        dims = callback(f)
        if dims is None:
            # Not a buffer candidate
            continue
        if accessv.lastwrite is None:
            # Read-only Functions cannot be buffering candidates
            continue
        buffers.append(Buffer(f, dims, accessv, n, async_degree))

    # Create Eqs to initialize buffers. Note: a buffer needs to be initialized
    # only if the buffered Function is read in at least one place, or in the
    # case of non-uniform SubDimensions, to avoid copying uninitialized values
    # back into the buffered Function
    processed = [Eq(b.indexify(), b.function.subs(b.contraction_mapper))
                 for b in buffers if b.is_read or not b.has_uniform_subdims]

    # Substitution rules to replace buffered Functions with buffers
    subs = {}
    for b in buffers:
        for a in b.accessv.accesses:
            subs[a] = b.indexify(a.indices)

    # Create Eqs to copy back buffers into their buffered Functions
    for e in expressions:
        processed.append(uxreplace(e, subs))

        # Also append the copy-back if `e` is the last write of some buffers
        for b in buffers:
            if e is b.accessv.lastwrite:
                items = list(zip(e.lhs.indices, b.function.dimensions))
                lhs = b.function[[i if i in b.index_mapper else d
                                  for i, d in items]]
                rhs = b.indexed[[b.index_mapper.get(i, d) for i, d in items]]
                if b.subdims_mapper:
                    processed.append(uxreplace(Eq(lhs, rhs), b.subdims_mapper))
                else:
                    processed.append(Eq(lhs, rhs))
                break

    return processed

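# A minimal, self-contained sketch (NumPy, not Devito code) of the
# transformation `_buffering` performs: accesses to `u` are redirected through
# a small circular buffer `ub`, and a copy-back is emitted at the last write.
# All names and sizes below are illustrative assumptions.
import numpy as np

nt, nx = 10, 5
u = np.zeros((nt, nx))    # the buffered Function
ub = np.zeros((2, nx))    # the buffer, with a depth-2 contraction along time

ub[0] = u[0]              # initialization Eq (the buffer is read below)
for t in range(nt - 1):
    ub[(t + 1) % 2] = ub[t % 2] + 1   # the rewritten expression
    u[t + 1] = ub[(t + 1) % 2]        # copy-back into the buffered Function

assert (u[-1] == nt - 1).all()
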
def _compact_temporaries(exprs):
    """
    Drop temporaries consisting of isolated symbols.
    """
    # First of all, convert to SSA
    exprs = makeit_ssa(exprs)

    # Candidates for dropping
    mapper = {e.lhs: e.rhs for e in exprs
              if e.lhs.is_Symbol and (q_leaf(e.rhs) or e.rhs.is_Function)}

    processed = []
    for e in exprs:
        if e.lhs not in mapper:
            # The temporary is retained, and substitutions may be applied
            expr = e
            while True:
                handle = uxreplace(expr, mapper)
                if handle == expr:
                    break
                else:
                    expr = handle
            processed.append(handle)

    return processed

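# A minimal sketch of the fixed-point substitution loop above, on plain SymPy
# objects (`xreplace` stands in for `uxreplace`; the symbols are made up).
# Chains of temporaries collapse transitively: t1 -> t0 -> s.
from sympy import Eq, symbols

s, t0, t1, a = symbols('s t0 t1 a')

exprs = [Eq(t0, s), Eq(t1, t0), Eq(a, t1 + 1)]

# Drop candidates: temporaries assigned an isolated symbol
mapper = {e.lhs: e.rhs for e in exprs if e.rhs.is_Symbol}

processed = []
for e in exprs:
    if e.lhs not in mapper:
        # Substitute to a fixed point
        while True:
            handle = e.xreplace(mapper)
            if handle == e:
                break
            e = handle
        processed.append(handle)

print(processed)  # [Eq(a, s + 1)]
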
def visit_Conditional(self, o):
    condition = uxreplace(o.condition, self.mapper)
    then_body = self._visit(o.then_body)
    else_body = self._visit(o.else_body)
    return o._rebuild(condition=condition, then_body=then_body,
                      else_body=else_body)

def _lower_exprs(cls, expressions, **kwargs):
    """
    Expression lowering:

        * Form and gather any required implicit expressions;
        * Evaluate derivatives;
        * Flatten vectorial equations;
        * Indexify Functions;
        * Apply substitution rules;
        * Specialize (e.g., index shifting)
    """
    # Add in implicit expressions, e.g., induced by SubDomains
    expressions = cls._add_implicit(expressions)

    # Unfold laziness
    expressions = flatten([i.evaluate for i in expressions])

    # Scalarize tensor expressions
    expressions = [j for i in expressions for j in i._flatten]

    # Indexification
    # E.g., f(x - 2*h_x, y) -> f[xi + 2, yi + 4] (assuming halo_size=4)
    processed = []
    for expr in expressions:
        if expr.subdomain:
            dimension_map = expr.subdomain.dimension_map
        else:
            dimension_map = {}

        # Handle Functions (typical case)
        mapper = {f: f.indexify(lshift=True, subs=dimension_map)
                  for f in retrieve_functions(expr)}

        # Handle Indexeds (from index notation)
        for i in retrieve_indexed(expr):
            f = i.function

            # Introduce shifting to align with the computational domain
            indices = [(a + o) for a, o in zip(i.indices, f._size_nodomain.left)]

            # Apply substitutions, if necessary
            if dimension_map:
                indices = [j.xreplace(dimension_map) for j in indices]

            mapper[i] = f.indexed[indices]

        subs = kwargs.get('subs')
        if subs:
            # Include the user-supplied substitutions, and use
            # `xreplace` for constant folding
            processed.append(expr.xreplace({**mapper, **subs}))
        else:
            processed.append(uxreplace(expr, mapper))

    processed = cls._specialize_exprs(processed)

    return processed

def extract(cls, n, context, min_cost, cluster, sregistry):
    make = lambda: Scalar(name=sregistry.make_name(),
                          dtype=cluster.dtype).indexify()

    exclude = {i.source.indexed for i in cluster.scope.d_flow.independent()}
    rule0 = lambda e: not e.free_symbols & exclude
    rule1 = make_is_time_invariant(context)
    rule2 = lambda e: estimate_cost(e, True) >= min_cost
    rule = lambda e: rule0(e) and rule1(e) and rule2(e)

    extracted = []
    mapper = OrderedDict()
    for e in cluster.exprs:
        for i in search(e, rule, 'all', 'dfs_first_hit'):
            if i not in mapper:
                symbol = make()
                mapper[i] = symbol
                extracted.append(e.func(symbol, i))

    processed = [uxreplace(e, mapper) for e in cluster.exprs]

    return extracted + processed, extracted

def makeit_ssa(exprs):
    """
    Convert an iterable of Eqs into Static Single Assignment (SSA) form.
    """
    # Identify recurring LHSs
    seen = {}
    for i, e in enumerate(exprs):
        seen.setdefault(e.lhs, []).append(i)

    # Optimization: don't waste time reconstructing stuff if already in SSA form
    if all(len(i) == 1 for i in seen.values()):
        return exprs

    # SSA conversion
    c = 0
    mapper = {}
    processed = []
    for i, e in enumerate(exprs):
        where = seen[e.lhs]
        rhs = uxreplace(e.rhs, mapper)
        if len(where) > 1:
            needssa = e.is_Scalar or where[-1] != i
            lhs = Symbol(name='ssa%d' % c, dtype=e.dtype) if needssa else e.lhs
            if e.is_Increment:
                # Turn AugmentedAssignment into Assignment
                processed.append(e.func(lhs, mapper[e.lhs] + rhs,
                                        is_Increment=False))
            else:
                processed.append(e.func(lhs, rhs))
            mapper[e.lhs] = lhs
            c += 1
        else:
            processed.append(e.func(e.lhs, rhs))

    return processed

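# A simplified, self-contained rendition of `makeit_ssa` on plain SymPy Eqs
# (no dtypes, no Increments) -- purely illustrative of the renaming scheme.
from sympy import Eq, Symbol, symbols

def ssa(exprs):
    seen = {}
    for i, e in enumerate(exprs):
        seen.setdefault(e.lhs, []).append(i)

    n = 0
    mapper = {}
    processed = []
    for i, e in enumerate(exprs):
        rhs = e.rhs.xreplace(mapper)
        if len(seen[e.lhs]) > 1 and seen[e.lhs][-1] != i:
            # A recurring LHS that is written again later: rename it
            lhs = Symbol('ssa%d' % n)
            n += 1
        else:
            lhs = e.lhs
        mapper[e.lhs] = lhs
        processed.append(Eq(lhs, rhs, evaluate=False))
    return processed

a, b, c = symbols('a b c')
print(ssa([Eq(a, b + 1), Eq(a, a*2), Eq(c, a + 3)]))
# [Eq(ssa0, b + 1), Eq(a, 2*ssa0), Eq(c, a + 3)]
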
def _compact_temporaries(exprs):
    """
    Drop temporaries consisting of isolated symbols.
    """
    # First of all, convert to SSA
    exprs = makeit_ssa(exprs)

    # Drop candidates are all exprs in the form `t0 = s` where `s` is a symbol
    # Note: only CSE-captured Temps, which are by construction local objects,
    # may safely be compacted; a generic Symbol could instead be accessed in a
    # subsequent Cluster, for example:
    # `for (i = ...) { a = b; for (j = a ...) ...`
    mapper = {e.lhs: e.rhs for e in exprs
              if isinstance(e.lhs, Temp) and (q_leaf(e.rhs) or e.rhs.is_Function)}

    processed = []
    for e in exprs:
        if e.lhs not in mapper:
            # The temporary is retained, and substitutions may be applied
            expr = e
            while True:
                handle = uxreplace(expr, mapper)
                if handle == expr:
                    break
                else:
                    expr = handle
            processed.append(handle)

    return processed

def _aliases_from_clusters(self, clusters, exclude, meta):
    exprs = flatten([c.exprs for c in clusters])

    # [Clusters]_n -> [Schedule]_m
    variants = []
    for mapper in self._generate(exprs, exclude):
        # Clusters -> AliasList
        found = collect(mapper.extracted, meta.ispace, self.opt_minstorage)
        pexprs, aliases = choose(found, exprs, mapper, self.opt_mingain)
        if not aliases:
            continue

        # AliasList -> Schedule
        schedule = lower_aliases(aliases, meta, self.opt_maxpar)

        variants.append(Variant(schedule, pexprs))

    # [Schedule]_m -> Schedule (s.t. best memory/flops trade-off)
    if not variants:
        return []
    schedule, exprs = pick_best(variants, self.opt_schedule_strategy,
                                self._eval_variants_delta)

    # Schedule -> Schedule (optimization)
    if self.opt_rotate:
        schedule = optimize_schedule_rotations(schedule, self.sregistry)
    schedule = optimize_schedule_padding(schedule, meta, self.platform)

    # Schedule -> [Clusters]_k
    processed, subs = lower_schedule(schedule, meta, self.sregistry,
                                     self.opt_ftemps)

    # [Clusters]_k -> [Clusters]_k (optimization)
    if self.opt_multisubdomain:
        processed = optimize_clusters_msds(processed)

    # [Clusters]_k -> [Clusters]_{k+n}
    for c in clusters:
        n = len(c.exprs)
        cexprs, exprs = exprs[:n], exprs[n:]

        cexprs = [uxreplace(e, subs) for e in cexprs]

        ispace = c.ispace.augment(schedule.dmapper)
        ispace = ispace.augment(schedule.rmapper)

        accesses = detect_accesses(cexprs)
        parts = {k: IntervalGroup(build_intervals(v)).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(c.dspace.intervals, parts)

        processed.append(c.rebuild(exprs=cexprs, ispace=ispace, dspace=dspace))

    assert len(exprs) == 0

    return processed

def choose(aliases, exprs, mapper, mingain):
    """
    Analyze the detected aliases and, after applying a cost model to rule out
    the aliases with a bad memory/flops trade-off, inject them into the
    original expressions.
    """
    aliases = AliasList(aliases)

    # `score < m` => discarded
    # `score > M` => optimized
    # `m <= score <= M` => maybe discarded, maybe optimized -- depends on heuristics
    m = mingain
    M = mingain * 3

    if not aliases:
        return exprs, aliases

    # Filter off the aliases with low score
    key = lambda a: a.score >= m
    aliases.filter(key)

    # Project the candidate aliases into `exprs` to derive the final working set
    mapper = {k: v for k, v in mapper.items()
              if v.free_symbols & set(aliases.aliaseds)}
    templated = [uxreplace(e, mapper) for e in exprs]
    owset = wset(templated)

    # Filter off the aliases with a weak flop-reduction / working-set tradeoff
    key = lambda a: \
        a.score > M or \
        m <= a.score <= M and max(len(wset(a.pivot)), 1) > len(wset(a.pivot) & owset)
    aliases.filter(key)

    if not aliases:
        return exprs, aliases

    # Substitute the chosen aliasing sub-expressions
    mapper = {k: v for k, v in mapper.items()
              if v.free_symbols & set(aliases.aliaseds)}
    exprs = [uxreplace(e, mapper) for e in exprs]

    return exprs, aliases

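# The two-band cost model above, in isolation (a toy: `score`, `ws_alias` --
# the alias' working-set size -- and `ws_overlap` -- its overlap with the
# expressions' working set -- are stand-ins for the real quantities).
def retain(score, ws_alias, ws_overlap, mingain):
    m, M = mingain, 3*mingain
    if score > M:
        return True     # flop reduction clearly dominates
    if score < m:
        return False    # too little flop reduction
    # Borderline: retain only if the alias reads some data that is not
    # already part of the working set
    return max(ws_alias, 1) > ws_overlap

assert retain(score=100, ws_alias=4, ws_overlap=4, mingain=10)
assert not retain(score=5, ws_alias=4, ws_overlap=1, mingain=10)
assert not retain(score=20, ws_alias=3, ws_overlap=3, mingain=10)
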
def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
    make = lambda: Scalar(name=sregistry.make_name(),
                          dtype=cluster.dtype).indexify()

    # The `depth` determines "how big" the extracted sum-of-products will be.
    # We observe that in typical FD codes:
    #   add(mul, mul, ...) -> stems from first order derivative
    #   add(mul(add(mul, mul, ...), ...), ...) -> stems from second order derivative
    # To search the muls in the former case, we need `depth=0`; to search the
    # outer muls in the latter case, we need `depth=2`
    depth = n

    exclude = {i.source.indexed for i in cluster.scope.d_flow.independent()}
    rule0 = lambda e: not e.free_symbols & exclude
    rule1 = lambda e: e.is_Mul and q_terminalop(e, depth)
    rule = lambda e: rule0(e) and rule1(e)

    extracted = OrderedDict()
    mapper = {}
    for e in cluster.exprs:
        for i in search(e, rule, 'all', 'bfs_first_hit'):
            if i in mapper:
                continue

            key = lambda a: a.is_Add
            terms, others = split(list(i.args), key)

            if max_alias:
                # Treat `e` as an FD expression and pull out the derivative
                # coefficient from `i`
                # Note: typically derivative coefficients are numbers, but
                # sometimes they could be provided in symbolic form through an
                # arbitrary Function. In the latter case, we rely on the
                # heuristic that such Functions basically never span the whole
                # grid, but rather a single Grid dimension (e.g., `c[z, n]` for
                # a stencil of diameter `n` along `z`)
                if e.grid is not None and terms:
                    key = partial(maybe_coeff_key, e.grid)
                    others, more_terms = split(others, key)
                    terms.extend(more_terms)

            if terms:
                k = i.func(*terms)
                try:
                    symbol, _ = extracted[k]
                except KeyError:
                    symbol, _ = extracted.setdefault(k, (make(), e))
                mapper[i] = i.func(symbol, *others)

    if mapper:
        extracted = [e.func(v, k) for k, (v, e) in extracted.items()]
        processed = [uxreplace(e, mapper) for e in cluster.exprs]
        return extracted + processed, extracted
    else:
        return cluster.exprs, []

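# The Mul splitting above, in isolation: within `2*c*(a + b)` the Add factors
# (`a + b`, the stencil core) are separated from the rest (`2*c`, the
# derivative coefficient), which stays outside the extracted temporary.
# Plain SymPy; `split` is mimicked inline.
from sympy import Mul, symbols

a, b, c = symbols('a b c')
expr = 2*c*(a + b)

terms = [f for f in expr.args if f.is_Add]
others = [f for f in expr.args if not f.is_Add]
print(Mul(*terms), '|', Mul(*others))  # a + b | 2*c
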
def callback(self, clusters, prefix, blk_size_gen=None):
    if not prefix:
        return clusters

    d = prefix[-1].dim

    if not any(TILABLE in c.properties[d] for c in clusters):
        return clusters

    # Create the block Dimensions (in total `self.levels` Dimensions)
    base = self.sregistry.make_name(prefix=d.name)

    if blk_size_gen is not None:
        # By passing a suitable key to `next` we ensure that we pull the
        # next par-tile entry iff we're now blocking an unseen TILABLE nest
        step = sympify(blk_size_gen.next(clusters))
    else:
        # This will result in a parametric step, e.g. `x0_blk0_size`
        step = None

    name = self.sregistry.make_name(prefix="%s_blk" % base)
    bd = BlockDimension(name, d, d.symbolic_min, d.symbolic_max, step)
    step = bd.step
    block_dims = [bd]

    for _ in range(1, self.levels):
        name = self.sregistry.make_name(prefix="%s_blk" % base)
        bd = BlockDimension(name, bd, bd, bd + bd.step - 1, size=step)
        block_dims.append(bd)

    bd = BlockDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=step)
    block_dims.append(bd)

    processed = []
    for c in clusters:
        if TILABLE in c.properties[d]:
            ispace = decompose(c.ispace, d, block_dims)

            # Use the innermost BlockDimension in place of `d`
            exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

            # The new Cluster properties
            # TILABLE property is dropped after the blocking
            properties = dict(c.properties)
            properties.pop(d)
            properties.update({bd: c.properties[d] - {TILABLE}
                               for bd in block_dims})

            processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                       properties=properties))
        else:
            processed.append(c)

    return processed

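# What the BlockDimensions amount to at the loop level: a hand-written sketch
# of two-level blocking over a single dimension (block sizes are illustrative;
# in the generated code the steps are parametric, e.g. `x0_blk0_size`).
def blocked_range(x_m, x_M, blk0=32, blk1=8):
    for x0 in range(x_m, x_M + 1, blk0):                      # outer BlockDimension
        for x1 in range(x0, min(x0 + blk0, x_M + 1), blk1):   # intermediate
            for x in range(x1, min(x1 + blk1, x_M + 1)):      # innermost, replaces `d`
                yield x

assert list(blocked_range(0, 99)) == list(range(100))
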
def lower_exprs(expressions, **kwargs):
    """
    Lowering an expression consists of the following passes:

        * Indexify functions;
        * Align Indexeds with the computational domain;
        * Apply user-provided substitutions.

    Examples
    --------
    f(x - 2*h_x, y) -> f[xi + 2, yi + 4] (assuming halo_size=4)
    """
    processed = []
    for expr in as_tuple(expressions):
        try:
            dimension_map = expr.subdomain.dimension_map
        except AttributeError:
            # Some Relationals may be pure SymPy objects, thus lacking the
            # subdomain
            dimension_map = {}

        # Handle Functions (typical case)
        mapper = {f: f.indexify(lshift=True, subs=dimension_map)
                  for f in retrieve_functions(expr)}

        # Handle Indexeds (from index notation)
        for i in retrieve_indexed(expr):
            f = i.function

            # Introduce shifting to align with the computational domain
            indices = [(lower_exprs(a) + o) for a, o in
                       zip(i.indices, f._size_nodomain.left)]

            # Apply substitutions, if necessary
            if dimension_map:
                indices = [j.xreplace(dimension_map) for j in indices]

            mapper[i] = f.indexed[indices]

        subs = kwargs.get('subs')

        # Add the dimension map to the mapper in case dimensions are used
        # as an expression, i.e. Eq(u, x, subdomain=xleft)
        mapper.update(dimension_map)

        if subs:
            # Include the user-supplied substitutions, and use
            # `xreplace` for constant folding
            processed.append(expr.xreplace({**mapper, **subs}))
        else:
            processed.append(uxreplace(expr, mapper))

    if isinstance(expressions, Iterable):
        return processed
    else:
        assert len(processed) == 1
        return processed.pop()

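# The index shifting above, shown on plain SymPy Indexeds: offsets relative to
# the domain origin are translated by the left halo extent (here `h = 4`,
# mirroring the `halo_size=4` example in the docstring; `h` stands in for
# `f._size_nodomain.left`).
from sympy import IndexedBase, symbols

f = IndexedBase('f')
x, y = symbols('x y')
h = 4

i = f[x - 2, y]
shifted = f[tuple(a + h for a in i.indices)]
print(shifted)  # f[x + 2, y + 4]
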
def scalarize(clusters, template):
    """
    Turn local "isolated" Arrays, that is Arrays appearing only in one Cluster,
    into Scalars.
    """
    processed = []
    for c in clusters:
        # Get any Arrays appearing only in `c`
        impacted = set(clusters) - {c}
        arrays = {i for i in c.scope.writes if i.is_Array}
        arrays -= set().union(*[i.scope.reads for i in impacted])

        # Turn them into scalars
        #
        # r[x,y,z] = g(b[x,y,z])                 t0 = g(b[x,y,z])
        # ... = r[x,y,z] + r[x,y,z+1]`   ---->   t1 = g(b[x,y,z+1])
        #                                        ... = t0 + t1
        mapper = {}
        exprs = []
        for n, e in enumerate(c.exprs):
            f = e.lhs.function
            if f in arrays:
                indexeds = [i.indexed for i in c.scope[f] if i.timestamp > n]
                for i in filter_ordered(indexeds):
                    mapper[i] = Scalar(name=template(), dtype=f.dtype)

                    assert len(f.indices) == len(e.lhs.indices) == len(i.indices)
                    shifting = {idx: idx + (o2 - o1) for idx, o1, o2 in
                                zip(f.indices, e.lhs.indices, i.indices)}

                    handle = e.func(mapper[i], uxreplace(e.rhs, mapper))
                    handle = xreplace_indices(handle, shifting)
                    exprs.append(handle)
            else:
                exprs.append(e.func(e.lhs, uxreplace(e.rhs, mapper)))

        processed.append(c.rebuild(exprs))

    return processed

def rebuild(cluster, others, aliases, subs):
    # Rebuild the non-aliasing expressions
    exprs = [uxreplace(e, subs) for e in others]

    # Add any new ShiftedDimension to the IterationSpace
    ispace = cluster.ispace.augment(aliases.index_mapper)

    # Rebuild the DataSpace to include the new symbols
    accesses = detect_accesses(exprs)
    parts = {k: IntervalGroup(build_intervals(v)).relaxed
             for k, v in accesses.items() if k}
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=exprs, ispace=ispace, dspace=dspace)

def rebuild(cluster, others, schedule, subs):
    """
    Plug the optimized aliases into the input Cluster. This leads to creating
    a new Cluster with suitable IterationSpace and DataSpace.
    """
    exprs = [uxreplace(e, subs) for e in others]

    ispace = cluster.ispace.augment(schedule.dmapper)
    ispace = ispace.augment(schedule.rmapper)

    accesses = detect_accesses(exprs)
    parts = {k: IntervalGroup(build_intervals(v)).relaxed
             for k, v in accesses.items() if k}
    dspace = DataSpace(cluster.dspace.intervals, parts)

    return cluster.rebuild(exprs=exprs, ispace=ispace, dspace=dspace)

def callback(self, clusters, prefix):
    if not prefix:
        return clusters

    d = prefix[-1].dim

    # Create the block Dimensions (in total `self.levels` Dimensions)
    name = self.template % (d.name, self.nblocked[d], '%d')

    bd = IncrDimension(name % 0, d, d.symbolic_min, d.symbolic_max)
    size = bd.step
    block_dims = [bd]

    for i in range(1, self.levels):
        bd = IncrDimension(name % i, bd, bd, bd + bd.step - 1, size=size)
        block_dims.append(bd)

    bd = IncrDimension(d.name, bd, bd, bd + bd.step - 1, 1, size=size)
    block_dims.append(bd)

    processed = []
    for c in clusters:
        if TILABLE in c.properties[d]:
            ispace = decompose(c.ispace, d, block_dims)

            # Use the innermost IncrDimension in place of `d`
            exprs = [uxreplace(e, {d: bd}) for e in c.exprs]

            # The new Cluster properties
            # TILABLE property is dropped after the blocking.
            # SKEWABLE is dropped as well, but only from the new
            # block dimensions.
            properties = dict(c.properties)
            properties.pop(d)
            properties.update({bd: c.properties[d] - {TILABLE}
                               for bd in block_dims})
            properties.update({bd: c.properties[d] - {SKEWABLE}
                               for bd in block_dims[:-1]})

            processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                       properties=properties))
        else:
            processed.append(c)

    # Make sure to use unique IncrDimensions
    self.nblocked[d] += int(any(TILABLE in c.properties[d] for c in clusters))

    return processed

def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
    make = lambda: Scalar(name=sregistry.make_name(),
                          dtype=cluster.dtype).indexify()

    rule = cls._extract_rule(context, min_cost, cluster)

    extracted = []
    mapper = OrderedDict()
    for e in cluster.exprs:
        for i in search(e, rule, 'all', 'dfs_first_hit'):
            if i not in mapper:
                symbol = make()
                mapper[i] = symbol
                extracted.append(e.func(symbol, i))

    processed = [uxreplace(e, mapper) for e in cluster.exprs]

    return extracted + processed, extracted

def eliminate_arrays(clusters):
    """
    Eliminate redundant expressions stored in Arrays.
    """
    mapper = {}
    processed = []
    for c in clusters:
        if not c.is_dense:
            processed.append(c)
            continue

        # Search for any redundant RHSs
        seen = {}
        for e in c.exprs:
            f = e.lhs.function
            if not f.is_Array:
                continue
            v = seen.get(e.rhs)
            if v is not None:
                # Found a redundant RHS
                mapper[f] = v
            else:
                seen[e.rhs] = f

        if not mapper:
            # Do not waste time
            processed.append(c)
            continue

        # Replace redundancies
        subs = {}
        for f, v in mapper.items():
            for i in filter_ordered(i.indexed for i in c.scope[f]):
                subs[i] = v[i.indices]
        exprs = []
        for e in c.exprs:
            if e.lhs.function in mapper:
                # Drop the write
                continue
            exprs.append(uxreplace(e, subs))

        processed.append(c.rebuild(exprs))

    return processed

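# The redundancy detection above, reduced to its core: a reverse map from RHS
# to the first Array that computed it (plain-dict illustration with strings).
seen = {}
mapper = {}
for lhs, rhs in [('r0', 'a + b'), ('r1', 'c*d'), ('r2', 'a + b')]:
    if rhs in seen:
        mapper[lhs] = seen[rhs]   # `r2` is redundant: reuse `r0`
    else:
        seen[rhs] = lhs
print(mapper)  # {'r2': 'r0'}
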
def extract(cls, n, context, min_cost, cluster, sregistry):
    make = lambda: Scalar(name=sregistry.make_name(),
                          dtype=cluster.dtype).indexify()

    # The `depth` determines "how big" the extracted sum-of-products will be.
    # We observe that in typical FD codes:
    #   add(mul, mul, ...) -> stems from first order derivative
    #   add(mul(add(mul, mul, ...), ...), ...) -> stems from second order derivative
    # To search the muls in the former case, we need `depth=0`; to search the
    # outer muls in the latter case, we need `depth=2`
    depth = n

    exclude = {i.source.indexed for i in cluster.scope.d_flow.independent()}
    rule0 = lambda e: not e.free_symbols & exclude
    rule1 = lambda e: e.is_Mul and q_terminalop(e, depth)
    rule = lambda e: rule0(e) and rule1(e)

    extracted = OrderedDict()
    mapper = {}
    for e in cluster.exprs:
        for i in search(e, rule, 'all', 'bfs_first_hit'):
            if i in mapper:
                continue

            # Separate numbers and Functions, as they could be a derivative coeff
            terms, others = split(i.args, lambda a: a.is_Add)
            if terms:
                k = i.func(*terms)
                try:
                    symbol, _ = extracted[k]
                except KeyError:
                    symbol, _ = extracted.setdefault(k, (make(), e))
                mapper[i] = i.func(symbol, *others)

    if mapper:
        extracted = [e.func(v, k) for k, (v, e) in extracted.items()]
        processed = [uxreplace(e, mapper) for e in cluster.exprs]
        return extracted + processed, extracted
    else:
        return cluster.exprs, []

def optimize_clusters_msds(clusters):
    """
    Relax the Clusters by letting expressions defined over MultiSubDomains be
    computed over the entire domain instead. This increases the likelihood of
    code lifting by later passes.
    """
    processed = []
    for c in clusters:
        msds = [d for d in c.ispace.itdimensions
                if isinstance(d, MultiSubDimension)]

        if msds:
            mapper = {d: d.root for d in msds}
            exprs = [uxreplace(e, mapper) for e in c.exprs]

            ispace = c.ispace.relaxed(msds)

            accesses = detect_accesses(exprs)
            parts = {k: IntervalGroup(build_intervals(v)).relaxed
                     for k, v in accesses.items() if k}
            intervals = [i for i in c.dspace if i.dim not in msds]
            dspace = DataSpace(intervals, parts)

            guards = {mapper.get(d, d): v for d, v in c.guards.items()}
            properties = {mapper.get(d, d): v for d, v in c.properties.items()}
            syncs = {mapper.get(d, d): v for d, v in c.syncs.items()}

            processed.append(c.rebuild(exprs=exprs, ispace=ispace,
                                       dspace=dspace, guards=guards,
                                       properties=properties, syncs=syncs))
        else:
            processed.append(c)

    return processed

def __new__(cls, *args, **kwargs):
    if len(args) == 1 and isinstance(args[0], LoweredEq):
        # origin: LoweredEq(devito.LoweredEq, **kwargs)
        input_expr = args[0]
        expr = sympy.Eq.__new__(cls, *input_expr.args, evaluate=False)
        for i in cls._state:
            setattr(expr, '_%s' % i, kwargs.get(i) or getattr(input_expr, i))
        return expr
    elif len(args) == 1 and isinstance(args[0], Eq):
        # origin: LoweredEq(devito.Eq)
        input_expr = expr = args[0]
    elif len(args) == 2:
        expr = sympy.Eq.__new__(cls, *args, evaluate=False)
        for i in cls._state:
            setattr(expr, '_%s' % i, kwargs.pop(i))
        return expr
    else:
        raise ValueError("Cannot construct LoweredEq from args=%s "
                         "and kwargs=%s" % (str(args), str(kwargs)))

    # Well-defined dimension ordering
    ordering = dimension_sort(expr)

    # Analyze the expression
    accesses = detect_accesses(expr)
    dimensions = Stencil.union(*accesses.values())

    # Separate out the SubIterators from the main iteration Dimensions, that
    # is those which define an actual iteration space
    iterators = {}
    for d in dimensions:
        if d.is_SubIterator:
            iterators.setdefault(d.root, set()).add(d)
        elif d.is_Conditional:
            # Use `parent`, not `root`, because a ConditionalDimension may
            # have a SubDimension as parent
            iterators.setdefault(d.parent, set())
        else:
            iterators.setdefault(d, set())

    # Construct the IterationSpace
    intervals = IntervalGroup([Interval(d, 0, 0) for d in iterators],
                              relations=ordering.relations)
    ispace = IterationSpace(intervals, iterators)

    # Construct the conditionals and replace the ConditionalDimensions in `expr`
    conditionals = {}
    for d in ordering:
        if not d.is_Conditional:
            continue
        if d.condition is None:
            conditionals[d] = GuardFactor(d)
        else:
            conditionals[d] = diff2sympy(lower_exprs(d.condition))
        if d.factor is not None:
            expr = uxreplace(expr, {d: IntDiv(d.index, d.factor)})
    conditionals = frozendict(conditionals)

    # Lower all Differentiable operations into SymPy operations
    rhs = diff2sympy(expr.rhs)

    # Finally create the LoweredEq with all metadata attached
    expr = super(LoweredEq, cls).__new__(cls, expr.lhs, rhs, evaluate=False)

    expr._ispace = ispace
    expr._conditionals = conditionals
    expr._reads, expr._writes = detect_io(expr)
    expr._is_Increment = input_expr.is_Increment
    expr._implicit_dims = input_expr.implicit_dims

    return expr

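# The effect of the `IntDiv` rewrite above for a factored ConditionalDimension,
# as a plain-Python sketch: an access indexed by the conditional Dimension
# becomes `index / factor`, and the iteration is guarded by the GuardFactor
# (assumed here to mean `index % factor == 0`; `factor=4` is made up).
factor = 4
for time in range(12):
    if time % factor == 0:              # GuardFactor
        time_sub = time // factor       # IntDiv(d.index, d.factor)
        print("usave[%d, ...] = u[...]" % time_sub)
# Writes land in usave[0], usave[1], usave[2]
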
def process(cluster, chosen, aliases, sregistry, platform):
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
        # need as much room as in `zi`'s parent to avoid going OOB.
        # Aside from ugly generated code, the reason we do not rather shift
        # the indices is that it would prevent future passes from transforming
        # the loop bounds (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions,
                      halo=halo, dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]  # x -> xs

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower)
                   for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim]
                       for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs

def collect(exprs, min_storage, ignore_collected):
    """
    Find groups of aliasing expressions.

    We shall introduce the following (loose) terminology:

        * A ``terminal`` is the leaf of a mathematical operation. Terminals
          can be numbers (n), literals (l), or Indexeds (I).
        * ``R`` is the relaxation operator := ``R(n) = n``, ``R(l) = l``,
          ``R(I) = J``, where ``J`` has the same base as ``I`` but with all
          offsets stripped away. For example, ``R(a[i+2,j-1]) = a[i,j]``.
        * A ``relaxed expression`` is an expression in which all of the
          terminals are relaxed.

    Now we define the concept of aliasing. We say that an expression A
    aliases an expression B if:

        * ``R(A) == R(B)``
        * all pairwise Indexeds in A and B access memory locations at a
          fixed constant distance along each Dimension.

    For example, consider the following expressions:

        * a[i+1] + b[i+1]
        * a[i+1] + b[j+1]
        * a[i] + c[i]
        * a[i+2] - b[i+2]
        * a[i+2] + b[i]
        * a[i-1] + b[i-1]

    Out of the expressions above, the following alias to `a[i] + b[i]`:

        * a[i+1] + b[i+1] : same operands and operations, distance along i: 1
        * a[i-1] + b[i-1] : same operands and operations, distance along i: -1

    Whereas the following do not:

        * a[i+1] + b[j+1] : because at least one index differs
        * a[i] + c[i] : because at least one of the operands differs
        * a[i+2] - b[i+2] : because at least one operation differs
        * a[i+2] + b[i] : because the distances along ``i`` differ (+2 and +0)
    """
    # Find the potential aliases
    found = []
    for expr in exprs:
        if expr.lhs.is_Indexed or expr.is_Increment:
            continue

        indexeds = retrieve_indexed(expr.rhs)

        bases = []
        offsets = []
        for i in indexeds:
            ii = IterationInstance(i)
            if ii.is_irregular:
                break

            base = []
            offset = []
            for e, ai in zip(ii, ii.aindices):
                if q_constant(e):
                    base.append(e)
                else:
                    base.append(ai)
                    offset.append((ai, e - ai))
            bases.append(tuple(base))
            offsets.append(LabeledVector(offset))

        if indexeds and len(bases) == len(indexeds):
            found.append(Candidate(expr, indexeds, bases, offsets))

    # Create groups of aliasing expressions
    mapper = OrderedDict()
    unseen = list(found)
    while unseen:
        c = unseen.pop(0)
        group = [c]
        for u in list(unseen):
            # Is the arithmetic structure of `c` and `u` equivalent ?
            if not compare_ops(c.expr, u.expr):
                continue

            # Is `c` translated w.r.t. `u` ?
            if not c.translated(u):
                continue

            group.append(u)
            unseen.remove(u)
        group = Group(group)

        # Apply callback to heuristically discard groups
        if ignore_collected(group):
            continue

        if min_storage:
            k = group.dimensions_translated
        else:
            k = group.dimensions
        mapper.setdefault(k, []).append(group)

    aliases = Aliases()
    for _groups in list(mapper.values()):
        groups = list(_groups)

        while groups:
            # For each Dimension, determine the Minimum Intervals (MI)
            # spanning all of the Groups' diameters
            # Example: x's largest_diameter=2 => [x[-2,0], x[-1,1], x[0,2]]
            # Note: Groups that cannot evaluate their diameter are dropped
            mapper = defaultdict(int)
            for g in list(groups):
                try:
                    mapper.update({d: max(mapper[d], v)
                                   for d, v in g.diameter.items()})
                except ValueError:
                    groups.remove(g)
            intervalss = {d: make_rotations_table(d, v)
                          for d, v in mapper.items()}

            # For each Group, find a rotation that is compatible with a given MI
            mapper = {}
            for d, intervals in intervalss.items():
                for interval in list(intervals):
                    found = {g: g.find_rotation_distance(d, interval)
                             for g in groups}
                    if all(distance is not None for distance in found.values()):
                        # `interval` is OK !
                        mapper[interval] = found
                        break

            if len(mapper) == len(intervalss):
                break

            # Try again with fewer groups
            smallest = len(min(groups, key=len))
            groups = [g for g in groups if len(g) > smallest]

        for g in groups:
            c = g.pivot
            distances = defaultdict(int, [(i.dim, v[g]) for i, v in mapper.items()])

            # Create the basis alias
            offsets = [LabeledVector([(l, v[l] + distances[l]) for l in v.labels])
                       for v in c.offsets]
            subs = {i: i.function[[l + v.fromlabel(l, 0) for l in b]]
                    for i, b, v in zip(c.indexeds, c.bases, offsets)}
            alias = uxreplace(c.expr, subs)

            # All aliased expressions
            aliaseds = [i.expr for i in g]

            # Distance of each aliased expression from the basis alias
            distances = []
            for i in g:
                distance = [o.distance(v) for o, v in zip(i.offsets, offsets)]
                distance = [(d, set(v))
                            for d, v in LabeledVector.transpose(*distance)]
                distances.append(LabeledVector([(d, v.pop())
                                                for d, v in distance]))

            aliases.add(alias, list(mapper), aliaseds, distances)

    return aliases

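# The aliasing relation in action, on plain SymPy Indexeds: `a[i+1] + b[i+1]`
# and `a[i-1] + b[i-1]` are translated copies of the basis `a[i] + b[i]`, at
# constant distances +1 and -1 along `i` (illustrative only; `compare_ops`,
# Groups and LabeledVectors are not reproduced here).
from sympy import IndexedBase, symbols

a, b = IndexedBase('a'), IndexedBase('b')
i = symbols('i')

basis = a[i] + b[i]
for shift in (1, -1):
    candidate = basis.xreplace({i: i + shift})
    print(candidate, '-> distance along i:', shift)
# a[i + 1] + b[i + 1] -> distance along i: 1
# a[i - 1] + b[i - 1] -> distance along i: -1
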
def __init__(self, mapper=None, replacer=None):
    super(XSubs, self).__init__()
    self.replacer = replacer or (lambda i: uxreplace(i, mapper))

def callback(self, clusters, prefix):
    if not prefix:
        return clusters

    d = prefix[-1].dim

    subiters = flatten([c.ispace.sub_iterators.get(d, []) for c in clusters])
    subiters = {i for i in subiters if i.is_Stepping}
    if not subiters:
        return clusters

    # Collect the index access functions along `d`, e.g., `t + 1` where `t` is
    # a SteppingDimension for `d = time`
    mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(set))
    for c in clusters:
        indexeds = [a.indexed for a in c.scope.accesses if a.function.is_Tensor]

        for i in indexeds:
            try:
                iaf = i.indices[d]
            except KeyError:
                continue

            # Sanity checks
            sis = iaf.free_symbols & subiters
            if len(sis) == 0:
                continue
            elif len(sis) == 1:
                si = sis.pop()
            else:
                raise InvalidOperator("Cannot use multiple SteppingDimensions "
                                      "to index into a Function")
            size = i.function.shape_allocated[d]
            assert is_integer(size)

            mapper[size][si].add(iaf)

    # Construct the ModuloDimensions
    mds = []
    for size, v in mapper.items():
        for si, iafs in list(v.items()):
            # Offsets are sorted so that the semantic order (t0, t1, t2)
            # follows SymPy's index ordering (t, t-1, t+1) after modulo
            # replacement, so that associativity errors are consistent. This
            # corresponds to sorting the offsets {-1, 0, 1} as {0, -1, 1},
            # assigning -inf to 0
            siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si))

            for iaf in siafs:
                name = '%s%d' % (si.name, len(mds))
                offset = uxreplace(iaf, {si: d.root})
                mds.append(ModuloDimension(name, si, offset, size, origin=iaf))

    # Replacement rule for ModuloDimensions
    def rule(size, e):
        try:
            return e.function.shape_allocated[d] == size
        except (AttributeError, KeyError):
            return False

    # Reconstruct the Clusters
    processed = []
    for c in clusters:
        # Apply substitutions to expressions
        # Note: In an expression, there could be `u[t+1, ...]` and `v[t+1,
        # ...]`, where `u` and `v` are TimeFunction with circular time
        # buffers (save=None) *but* different modulo extent. The `t+1`
        # indices above are therefore conceptually different, so they will
        # be replaced with the proper ModuloDimension through two different
        # calls to `xreplace_indices`
        exprs = c.exprs
        groups = as_mapper(mds, lambda d: d.modulo)
        for size, v in groups.items():
            mapper = {md.origin: md for md in v}
            func = partial(xreplace_indices, mapper=mapper,
                           key=partial(rule, size))
            exprs = [e.apply(func) for e in exprs]

        # Augment IterationSpace
        ispace = IterationSpace(c.ispace.intervals,
                                {**c.ispace.sub_iterators, **{d: tuple(mds)}},
                                c.ispace.directions)

        processed.append(c.rebuild(exprs=exprs, ispace=ispace))

    return processed

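# The offset ordering used above, in isolation: the offsets {-1, 0, 1} (from
# accesses like `t-1`, `t`, `t+1`) are sorted as {0, -1, 1} by mapping the
# zero offset to -inf.
import numpy as np

offsets = [1, -1, 0]
print(sorted(offsets, key=lambda o: -np.inf if o == 0 else o))  # [0, -1, 1]
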
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like or list of expr-like or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions
    #   (e.g., `i+1` in `A[i+1]` and `B[i+1]`);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - it is very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE()
    # but also ensuring some form of post-processing
    assert mode == 'default'  # Only supported mode ATM

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        scope = Scope(maybe_exprs)

    # Some sub-expressions aren't really "common" -- that's the case of
    # Dimension-independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent
    # data dependence involving `a`
    exclude = {i.source.access for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True))
                               for k, v in counted if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE
        # across Dimension-independent data dependences
        exclude.update({i for i in mapper.values()})

        # Prepare for the next round
        for k in picked:
            targets.pop(k)

    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Drop them
    processed = _compact_temporaries(processed)

    return processed

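# A toy greedy CSE over plain SymPy expressions, mirroring the loop above:
# repeatedly extract the most expensive repeated sub-expression (`count_ops`
# stands in for `estimate_cost`, `r0`, `r1`, ... for `make()`, and only Adds
# are counted, standing in for `q_xop`).
from collections import Counter
from sympy import Eq, Symbol, count_ops, preorder_traversal, symbols

a, b, c, d = symbols('a b c d')
exprs = [Eq(c, 2*(a + b)), Eq(d, 3*(a + b))]

mapped = []
n = 0
while True:
    counts = Counter(i for e in mapped + exprs
                     for i in preorder_traversal(e.rhs) if i.is_Add)
    targets = {k: count_ops(k) for k, v in counts.items() if v > 1}
    if not targets:
        break
    k = max(targets, key=targets.get)
    tmp = Symbol('r%d' % n)
    n += 1
    exprs = [e.xreplace({k: tmp}) for e in exprs]
    mapped = [Eq(tmp, k)] + [e.xreplace({k: tmp}) for e in mapped]

print(mapped + exprs)  # [Eq(r0, a + b), Eq(c, 2*r0), Eq(d, 3*r0)]
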
def visit_Expression(self, o):
    return o._rebuild(expr=uxreplace(o.expr, self.mapper))

def visit_Call(self, o):
    arguments = [uxreplace(i, self.mapper) for i in o.arguments]
    return o._rebuild(arguments=arguments)

def actions_from_update_memcpy(cluster, clusters, prefix, actions):
    it = prefix[-1]
    d = it.dim
    direction = it.direction

    # Prepare the data to instantiate a PrefetchUpdate SyncOp
    e = cluster.exprs[0]

    size = 1

    function = e.rhs.function
    fetch = e.rhs.indices[d]
    ifetch = fetch.subs(d, d.symbolic_min)
    if direction is Forward:
        fcond = make_cond(cluster.guards.get(d), d, direction, d.symbolic_min)
    else:
        fcond = make_cond(cluster.guards.get(d), d, direction, d.symbolic_max)

    if direction is Forward:
        pfetch = fetch + 1
        pcond = make_cond(cluster.guards.get(d), d, direction, d + 1)
    else:
        pfetch = fetch - 1
        pcond = make_cond(cluster.guards.get(d), d, direction, d - 1)

    target = e.lhs.function
    tstore0 = e.lhs.indices[d]

    # If fetching into e.g. `ub[sb1]`, we'll need to prefetch into e.g. `ub[sb0]`
    if is_integer(tstore0):
        tstore = tstore0
    else:
        assert tstore0.is_Modulo
        subiters = [md for md in cluster.sub_iterators[d]
                    if md.parent is tstore0.parent]
        osubiters = sorted(subiters, key=lambda i: Vector(i.offset))
        n = osubiters.index(tstore0)
        if direction is Forward:
            tstore = osubiters[(n + 1) % len(osubiters)]
        else:
            tstore = osubiters[(n - 1) % len(osubiters)]

    # Turn `cluster` into a prefetch Cluster
    expr = uxreplace(e, {tstore0: tstore, fetch: pfetch})
    guards = {d: And(*([pcond] + as_list(cluster.guards.get(d))))}
    syncs = {d: [PrefetchUpdate(d, size, function, fetch, ifetch, fcond,
                                pfetch, pcond, target, tstore)]}
    pcluster = cluster.rebuild(exprs=expr, guards=guards, syncs=syncs)

    # Since we're turning `e` into a prefetch, we need to:
    # 1) attach a WaitPrefetch SyncOp to the first Cluster accessing `target`
    # 2) insert the prefetch Cluster right after the last Cluster accessing
    #    `target`
    # 3) drop the original Cluster performing a memcpy-based fetch
    n = clusters.index(cluster)
    first = None
    last = None
    for c in clusters[n+1:]:
        if target in c.scope.reads:
            if first is None:
                first = c
            last = c
    assert first is not None
    assert last is not None
    actions[first].syncs[d].append(WaitPrefetch(d, size, function, fetch,
                                                ifetch, fcond, pfetch, pcond,
                                                target, tstore))
    actions[last].insert.append(pcluster)
    actions[cluster].drop = True
