def _factorize(self, cluster, *args, **kwargs):
    """
    Collect terms in each expr in exprs based on the following heuristic:

        * Collect all literals;
        * Collect all temporaries produced by CSE;
        * If the expression has an operation count higher than
          ``self.MIN_COST_FACTORIZE``, then this is applied recursively until
          no more factorization opportunities are available.
    """
    processed = []
    for expr in cluster.exprs:
        handle = collect_nested(expr)
        cost_handle = estimate_cost(handle)

        if cost_handle >= self.MIN_COST_FACTORIZE:
            handle_prev = handle
            cost_prev = estimate_cost(expr)
            while cost_handle < cost_prev:
                handle_prev, handle = handle, collect_nested(handle)
                cost_prev, cost_handle = cost_handle, estimate_cost(handle)
            cost_handle, handle = cost_prev, handle_prev

        processed.append(handle)

    return cluster.rebuild(processed)
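A minimal sketch of the cost-driven factorization fixpoint above, using plain SymPy: `sympy.collect` stands in for `collect_nested` and `sympy.count_ops` for `estimate_cost`. All names and thresholds here are illustrative, not the actual Devito routine.

import sympy

def factorize_until_stable(expr, sym, min_cost=10):
    # One collection pass, then keep collecting while the cost keeps dropping
    handle = sympy.collect(expr, sym)
    cost_handle = sympy.count_ops(handle)
    if cost_handle >= min_cost:
        handle_prev, cost_prev = handle, sympy.count_ops(expr)
        while cost_handle < cost_prev:
            handle_prev, handle = handle, sympy.collect(handle, sym)
            cost_prev, cost_handle = cost_handle, sympy.count_ops(handle)
        # Roll back to the cheapest version seen
        handle = handle_prev
    return handle

x, a, b, c = sympy.symbols('x a b c')
print(factorize_until_stable(a*x + b*x + c*x + a*b, x, min_cost=1))
# -> x*(a + b + c) + a*b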
def _selector(self, e, naliases):
    if all(i.function.is_Symbol for i in e.free_symbols):
        # E.g., `dt**(-2)`
        mincost = self._opt_mincost['scalar']
    else:
        mincost = self._opt_mincost['tensor']

    return estimate_cost(e, True)*naliases // mincost
def extract(cls, n, context, min_cost, max_alias, cluster, sregistry):
    make = lambda: Scalar(name=sregistry.make_name(), dtype=cluster.dtype).indexify()

    exclude = {i.source.indexed for i in cluster.scope.d_flow.independent()}
    rule0 = lambda e: not e.free_symbols & exclude
    rule1 = make_is_time_invariant(context)
    rule2 = lambda e: estimate_cost(e, True) >= min_cost
    rule = lambda e: rule0(e) and rule1(e) and rule2(e)

    extracted = []
    mapper = OrderedDict()
    for e in cluster.exprs:
        for i in search(e, rule, 'all', 'dfs_first_hit'):
            if i not in mapper:
                symbol = make()
                mapper[i] = symbol
                extracted.append(e.func(symbol, i))

    processed = [uxreplace(e, mapper) for e in cluster.exprs]

    return extracted + processed, extracted
def _extract_time_invariants(self, cluster, template, with_cse=True,
                             costmodel=None, **kwargs):
    """
    Extract time-invariant subexpressions, and assign them to temporaries.
    """
    make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
    rule = iq_timeinvariant(cluster.trace)
    costmodel = costmodel or (lambda e: estimate_cost(e) > 0)
    processed, found = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    if with_cse:
        leaves = [i for i in processed if i not in found]

        # Search for common sub-expressions amongst them (and only them)
        found = common_subexprs_elimination(found, make)

        # Some temporaries may be droppable at this point
        processed = compact_temporaries(found, leaves)

    return cluster.rebuild(processed)
def test_xreplace_constrained_time_varying(tu, tv, tw, ti0, ti1, t0, t1,
                                           exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    make = lambda i: Scalar(name='r%d' % i).indexify()
    processed, found = xreplace_constrained(exprs, make,
                                            iq_timevarying(TemporariesGraph(exprs)),
                                            lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def create_profile(name, iet):
    """
    Enrich the Iteration/Expression tree ``iet`` adding nodes for C-level
    performance profiling. In particular, turn all :class:`Section`s within
    ``iet`` into :class:`TimedList`s.

    A :class:`Profiler` is returned to access profiling data.
    """
    sections = FindNodes(Section).visit(iet)

    # Construct the Profiler
    profiler = Profiler(name)
    for section in sections:
        # All ExpressionBundles within `section`
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(i.ops for i in bundles)

        # Operation count at each section iteration
        sops = sum(estimate_cost(i.expr) for i in flatten(b.exprs for b in bundles))

        # Total memory traffic
        mapper = {}
        for i in bundles:
            for k, v in i.traffic.items():
                mapper.setdefault(k, []).append(v)
        traffic = [IntervalGroup.generate('merge', *i) for i in mapper.values()]
        traffic = sum(i.extent for i in traffic)

        # Each ExpressionBundle lives in its own iteration space
        itershapes = [i.shape for i in bundles]

        # Track how many grid points are written within `section`
        points = []
        for i in bundles:
            writes = {e.write for e in i.exprs
                      if e.is_tensor and e.write.is_TimeFunction}
            points.append(reduce(mul, i.shape)*len(writes))
        points = sum(points)

        profiler.add(section, SectionData(ops, sops, points, traffic, itershapes))

    # Transform the Iteration/Expression tree introducing the C-level timers
    mapper = {i: TimedList(gname=name, lname=i.name, body=i.body) for i in sections}
    iet = Transformer(mapper).visit(iet)

    return iet, profiler
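A quick sketch of how the per-iteration operation count (`sops`) computed above can feed a throughput estimate. All numbers are hypothetical; in Devito the measured figures come from the Profiler after `Operator.apply()`.

iterations = 512 * 512 * 1000   # grid points times timesteps (hypothetical)
sops = 42                       # ops per section iteration (hypothetical)
elapsed = 3.7                   # seconds measured by the section timer

gflops = sops * iterations / elapsed / 1e9
print('%.2f GFlops/s' % gflops)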
def test_yreplace_time_invariants(tu, tv, tw, ti0, ti1, t0, t1,
                                  exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make, make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def _extract_time_invariants(self, cluster, template, **kwargs):
    """
    Extract time-invariant subexpressions, and assign them to temporaries.
    """
    make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
    rule = make_is_time_invariant(cluster.exprs)
    costmodel = lambda e: estimate_cost(e, True) >= self.MIN_COST_ALIAS_INV
    processed, found = yreplace(cluster.exprs, make, rule, costmodel, eager=True)

    return cluster.rebuild(processed)
def callbacks_invariants(context, n):
    min_cost_inv = min_cost['invariants']
    if callable(min_cost_inv):
        min_cost_inv = min_cost_inv(n)

    extractor = make_is_time_invariant(context)
    model = lambda e: estimate_cost(e, True) >= min_cost_inv
    ignore_collected = lambda g: False
    selector = lambda c, n: c >= min_cost_inv and n >= 1

    return extractor, model, ignore_collected, selector
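A sketch of a callable `min_cost` threshold as consumed by `callbacks_invariants` above: the threshold may depend on `n`, the index of the current sweep, e.g. to extract only expensive invariants first and progressively cheaper ones later. The decay schedule below is purely illustrative.

min_cost = {
    'invariants': lambda n: max(50 // (n + 1), 10),  # hypothetical schedule
}

for n in range(3):
    threshold = min_cost['invariants']
    if callable(threshold):
        threshold = threshold(n)
    print('sweep %d: extracting subexpressions costing >= %d' % (n, threshold))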
def test_xreplace_constrained_time_varying(tu, tv, tw, ti0, ti1, t0, t1,
                                           exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = xreplace_constrained(exprs, make,
                                            iq_timevarying(FlowGraph(exprs)),
                                            lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def _extract_rule(cls, context, min_cost, cluster):
    exclude = {i.source.indexed for i in cluster.scope.d_flow.independent()}
    rule0 = lambda e: not e.free_symbols & exclude
    rule1 = make_is_time_invariant(context)
    rule2 = lambda e: estimate_cost(e, True) >= min_cost

    return lambda e: rule0(e) and rule1(e) and rule2(e)
def instrument(self, iet):
    """
    Enrich the Iteration/Expression tree ``iet`` adding nodes for C-level
    performance profiling. In particular, turn all Sections within ``iet``
    into TimedLists.
    """
    sections = FindNodes(Section).visit(iet)
    for section in sections:
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(i.ops for i in bundles)

        # Operation count at each section iteration
        sops = sum(estimate_cost(i.expr) for i in flatten(b.exprs for b in bundles))

        # Total memory traffic
        mapper = {}
        for i in bundles:
            for k, v in i.traffic.items():
                mapper.setdefault(k, []).append(v)
        traffic = 0
        for i in mapper.values():
            try:
                traffic += IntervalGroup.generate('union', *i).size
            except ValueError:
                # Over different iteration spaces
                traffic += sum(j.size for j in i)

        # Each ExpressionBundle lives in its own iteration space
        itermaps = [i.ispace.dimension_map for i in bundles]

        # Track how many grid points are written within `section`
        points = []
        for i in bundles:
            writes = {e.write for e in i.exprs
                      if e.is_tensor and e.write.is_TimeFunction}
            points.append(i.size*len(writes))
        points = sum(points)

        self._sections[section] = SectionData(ops, sops, points, traffic, itermaps)

    # Transform the Iteration/Expression tree introducing the C-level timers
    mapper = {i: TimedList(timer=self.timer, lname=i.name, body=i) for i in sections}
    iet = Transformer(mapper).visit(iet)

    return iet
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [DummyEq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    # Perform topological sorting so that reads-after-writes are honored
    processed = _topological_sort(processed)

    return processed
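A condensed sketch of the CSE loop above using plain SymPy: count non-trivial sub-expressions, replace the most expensive redundant one with a temporary, repeat. `sympy.preorder_traversal` and `sympy.count_ops` approximate `count`/`q_xop` and `estimate_cost`; this is illustrative, not the actual Devito routine.

from collections import Counter
import sympy

def naive_cse(exprs):
    temps, i = [], 0
    while True:
        # Count all non-atomic sub-expressions across all expressions
        counts = Counter(s for e in exprs for s in sympy.preorder_traversal(e)
                         if not s.is_Atom)
        redundant = {s: sympy.count_ops(s) for s, c in counts.items() if c > 1}
        if not redundant:
            return temps, exprs
        # Pick the most expensive redundancy (mimics `hit = max(...)`)
        picked = max(redundant, key=redundant.get)
        r = sympy.Symbol('r%d' % i)
        i += 1
        temps.append(sympy.Eq(r, picked))
        exprs = [e.xreplace({picked: r}) for e in exprs]

a, b, c, d = sympy.symbols('a b c d')
temps, exprs = naive_cse([a*b*c + d, a*b*c - d])
print(temps, exprs)  # -> [Eq(r0, a*b*c)] [r0 + d, r0 - d]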
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is not guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    return processed
def choose(exprs, aliases, selector):
    others = []
    chosen = OrderedDict()
    for e in exprs:
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True)*naliases
        if selector(cost, naliases):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)

    return chosen, others
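A sketch of a `selector` callback compatible with `choose` above: it receives the model cost of the aliasing expression (already scaled by the number of aliases) and the alias count, and decides whether the redundancy is worth a temporary. The threshold value is illustrative.

MIN_COST = 10  # hypothetical threshold

def selector(cost, naliases):
    # Only optimize genuinely redundant (naliases > 1) and sufficiently
    # expensive expressions
    return naliases > 1 and cost >= MIN_COST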
def _extract_time_varying(self, cluster, template, **kwargs):
    """
    Extract time-varying subexpressions, and assign them to temporaries.

    Time-varying subexpressions arise, for example, when approximating
    derivatives through finite differences.
    """
    make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
    rule = iq_timevarying(cluster.trace)
    costmodel = lambda i: estimate_cost(i) > 0
    processed, _ = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    return cluster.rebuild(processed)
def _lower_clusters(cls, expressions, profiler, **kwargs):
    """
    Clusters lowering:

        * Group expressions into Clusters;
        * Introduce guards for conditional Clusters;
        * Analyze Clusters to detect computational properties such
          as parallelism.
    """
    # Build a sequence of Clusters from a sequence of Eqs
    clusters = clusterize(expressions)

    # Operation count before specialization
    init_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense)

    clusters = cls._specialize_clusters(clusters, **kwargs)

    # Operation count after specialization
    final_ops = sum(estimate_cost(c.exprs) for c in clusters if c.is_dense)
    profiler.record_ops_variation(init_ops, final_ops)

    return ClusterGroup(clusters)
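An illustration of the bookkeeping above: recording the operation count before and after Cluster specialization allows the achieved flop reduction to be reported. The numbers are hypothetical.

init_ops, final_ops = 120, 48   # hypothetical before/after counts
reduction = 100*(1 - final_ops/init_ops)
print('%.0f%% reduction in operation count' % reduction)  # -> 60% reduction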
def _extract_time_varying(self, cluster, template, **kwargs):
    """
    Extract time-varying subexpressions, and assign them to temporaries.

    Time-varying subexpressions arise, for example, when approximating
    derivatives through finite differences.
    """
    make = lambda i: Scalar(name=template(i)).indexify()
    rule = iq_timevarying(cluster.trace)
    costmodel = lambda i: estimate_cost(i) > 0
    processed, _ = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    return cluster.rebuild(processed)
def test_estimate_cost(expr, expected, estimate):
    # Note: integer arithmetic isn't counted
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions  # noqa

    t0 = Scalar(name='t0')  # noqa
    t1 = Scalar(name='t1')  # noqa
    t2 = Scalar(name='t2')  # noqa
    fa = Function(name='fa', grid=grid, shape=(4,), dimensions=(x,))  # noqa
    fb = Function(name='fb', grid=grid, shape=(4,), dimensions=(x,))  # noqa
    fc = Function(name='fc', grid=grid)  # noqa

    assert estimate_cost(eval(expr), estimate) == expected
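A small interactive check of the cost model exercised by the test above, assuming Devito's public `devito.symbolics.estimate_cost`. The expected counts follow the documented rule that integer (indexing) arithmetic is free; the exact cases are illustrative, not taken from the real suite.

from devito import Grid, Function
from devito.symbolics import estimate_cost
from devito.types import Scalar

grid = Grid(shape=(4, 4))
x, y = grid.dimensions
t1 = Scalar(name='t1')
fa = Function(name='fa', grid=grid, shape=(4,), dimensions=(x,))

assert estimate_cost(fa[x] + fa[x + 1]) == 1  # one add; `x + 1` is free
assert estimate_cost(4.5*t1 + fa[x]) == 2     # one mul + one add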
def wrapper(self, state, **kwargs):
    # Invoke the DSE pass on each Cluster
    tic = time()
    state.update(flatten([func(self, c, state.template, **kwargs)
                          for c in state.clusters]))
    toc = time()

    # Profiling
    key = '%s%d' % (func.__name__, len(state.timings))
    state.timings[key] = toc - tic
    if self.profile:
        candidates = [c.exprs for c in state.clusters if c.is_dense]
        state.ops[key] = estimate_cost(flatten(candidates))
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common subexpressions elimination.

    Note: the output is not guaranteed to be topologically sorted.

    :param exprs: The target SymPy expression, or a collection of SymPy
                  expressions.
    :param make: A function to construct symbols used for replacement.
                 The function takes as input an integer ID; ID is computed
                 internally and used as a unique identifier for the
                 constructed symbols.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k)) for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make(len(mapped) + i)) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    return processed
def factorize(cluster, *args):
    """
    Factorize transcendental functions, symbolic powers, numeric coefficients.

    If the expression has an operation count greater than
    ``MIN_COST_FACTORIZE``, then the algorithm is applied recursively until
    no more factorization opportunities are detected.
    """
    processed = []
    for expr in cluster.exprs:
        handle = _collect_nested(expr)
        cost_handle = estimate_cost(handle)

        if cost_handle >= MIN_COST_FACTORIZE:
            handle_prev = handle
            cost_prev = estimate_cost(expr)
            while cost_handle < cost_prev:
                handle_prev, handle = handle, _collect_nested(handle)
                cost_prev, cost_handle = cost_handle, estimate_cost(handle)
            cost_handle, handle = cost_prev, handle_prev

        processed.append(handle)

    return cluster.rebuild(processed)
def test_yreplace_time_invariants(exprs, expected):
    grid = Grid((3, 3, 3))
    dims = grid.dimensions

    tu = TimeFunction(name="tu", grid=grid, space_order=4).indexify()
    tv = TimeFunction(name="tv", grid=grid, space_order=4).indexify()
    tw = TimeFunction(name="tw", grid=grid, space_order=4).indexify()
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()
    t0 = Scalar(name='t0').indexify()
    t1 = Scalar(name='t1').indexify()

    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make, make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def choose(exprs, aliases, selector):
    """
    Use a cost model to select the aliases that are worth optimizing.
    """
    retained = AliasMapper(aliases)

    others = []
    chosen = OrderedDict()
    for e in exprs:
        aliaseds = aliases.get(e.rhs)
        naliases = len(aliaseds)

        cost = estimate_cost(e, True)*naliases
        if selector(cost, naliases):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)
            retained.remove(e.rhs)

    return retained, chosen, others
def _extract_time_invariants(self, cluster, template, with_cse=True, **kwargs):
    """
    Extract time-invariant subexpressions, and assign them to temporaries.
    """
    make = lambda: Scalar(name=template(), dtype=cluster.dtype).indexify()
    rule = iq_timeinvariant(cluster.trace)
    costmodel = lambda e: estimate_cost(e) > 0
    processed, found = xreplace_constrained(cluster.exprs, make, rule, costmodel)

    if with_cse:
        leaves = [i for i in processed if i not in found]

        # Search for common sub-expressions amongst them (and only them)
        found = common_subexprs_elimination(found, make)

        # Some temporaries may be droppable at this point
        processed = compact_temporaries(found, leaves)

    return cluster.rebuild(processed)
def instrument(self, iet):
    """
    Enrich the Iteration/Expression tree ``iet`` adding nodes for C-level
    performance profiling. In particular, turn all :class:`Section`s within
    ``iet`` into :class:`TimedList`s.
    """
    sections = FindNodes(Section).visit(iet)
    for section in sections:
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(i.ops for i in bundles)

        # Operation count at each section iteration
        sops = sum(estimate_cost(i.expr) for i in flatten(b.exprs for b in bundles))

        # Total memory traffic
        mapper = {}
        for i in bundles:
            for k, v in i.traffic.items():
                mapper.setdefault(k, []).append(v)
        traffic = [IntervalGroup.generate('merge', *i) for i in mapper.values()]
        traffic = sum(i.size for i in traffic)

        # Each ExpressionBundle lives in its own iteration space
        itershapes = [i.shape for i in bundles]

        # Track how many grid points are written within `section`
        points = []
        for i in bundles:
            writes = {e.write for e in i.exprs
                      if e.is_tensor and e.write.is_TimeFunction}
            points.append(reduce(mul, i.shape)*len(writes))
        points = sum(points)

        self._sections[section] = SectionData(ops, sops, points, traffic, itershapes)

    # Transform the Iteration/Expression tree introducing the C-level timers
    mapper = {i: TimedList(timer=self.timer, lname=i.name, body=i) for i in sections}
    iet = Transformer(mapper).visit(iet)

    return iet
def extract(exprs, aliases):
    """
    Extract the candidate aliases.
    """
    is_time_invariant = make_is_time_invariant(exprs)
    time_invariants = {e.rhs: is_time_invariant(e) for e in exprs}

    processed = []
    candidates = OrderedDict()
    for e in exprs:
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True)*naliases
        test0 = lambda: cost >= MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= MIN_COST_ALIAS_INV and time_invariants[e.rhs]
        if test0() or test1():
            candidates[e.rhs] = e.lhs
        else:
            processed.append(e)

    return candidates, processed
def _generate(self, exprs, exclude):
    # E.g., extract `u.dx*a*b` and `u.dx*a*c` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]`
    def cbk_search(expr):
        if isinstance(expr, EvalDerivative) and not expr.base.is_Function:
            return expr.args
        else:
            return flatten(e for e in [cbk_search(a) for a in expr.args] if e)

    cbk_compose = lambda e: split_coeff(e)[1]
    basextr = self._do_generate(exprs, exclude, cbk_search, cbk_compose)
    if not basextr:
        return
    yield basextr

    # E.g., extract `u.dx*a` from `[(u.dx*a*b).dy, (u.dx*a*c).dy]`
    # That is, attempt extracting the largest common derivative-induced subexprs
    mappers = [deindexify(e) for e in basextr.extracted]
    counter = Counter(flatten(m.keys() for m in mappers))
    groups = as_mapper(counter, key=counter.get)
    grank = {k: sorted(v, key=lambda e: estimate_cost(e), reverse=True)
             for k, v in groups.items()}

    def cbk_search2(expr, rank):
        ret = []
        for e in cbk_search(expr):
            mapper = deindexify(e)
            for i in rank:
                if i in mapper:
                    ret.extend(mapper[i])
                    break
        return ret

    candidates = sorted(grank, reverse=True)[:2]
    for i in candidates:
        lower_pri_elems = flatten([grank[j] for j in candidates if j != i])
        cbk_search_i = lambda e: cbk_search2(e, grank[i] + lower_pri_elems)
        yield self._do_generate(exprs, exclude, cbk_search_i, cbk_compose)
def wrapper(self, state, **kwargs):
    # A template to construct temporaries
    tempname = self.conventions.get(func.__name__)
    if tempname:
        start = kwargs.get('start')
        tempname += '%d' if start is None else (('_%d_' % start) + '%d')
        template = lambda i: tempname % i
    else:
        template = None

    # Invoke the DSE pass
    tic = time()
    state.update(flatten([func(self, c, template, **kwargs)
                          for c in state.clusters]))
    toc = time()

    # Profiling
    key = '%s%d' % (func.__name__, len(self.timings))
    self.timings[key] = toc - tic
    if self.profile:
        candidates = [c.exprs for c in state.clusters if c.is_dense]
        self.ops[key] = estimate_cost(flatten(candidates))
def choose(exprs, aliases, selector):
    """
    Use a cost model to select the aliases that are worth optimizing.
    """
    # Pass 1: a set of aliasing expressions is optimizable only if its cost
    # exceeds the mode's threshold
    candidates = OrderedDict()
    others = []
    for e in exprs:
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True)*naliases
        score = selector(cost)
        if score > 0:
            candidates[e] = score
        else:
            others.append(e)

    # Pass 2: a set of aliasing expressions is optimizable if and only if it survived
    # Pass 1 above *and* the tradeoff between operation count and working set increase
    # is favorable
    owset = wset(others)
    retained = AliasMapper(aliases)
    chosen = OrderedDict()
    others = []
    for e in exprs:
        try:
            score = candidates[e]
        except KeyError:
            score = 0
        if score > 1 or \
           score == 1 and max(len(wset(e)), 1) > len(wset(e) & owset):
            chosen[e.rhs] = e.lhs
        else:
            others.append(e)
            retained.remove(e.rhs)

    return retained, chosen, others
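A worked example of the Pass-2 tradeoff above, under the assumption that `wset` returns the set of arrays an expression reads. Suppose `selector` returned a borderline score of 1 for an alias `e`, and the non-optimized expressions (`others`) touch the working set {a, b, c} (= `owset`).

# If `e` itself reads {a, b}:
#     wset(e) & owset == {a, b}, len(wset(e)) == 2
# so `max(len(wset(e)), 1) > len(wset(e) & owset)` is 2 > 2 -> False:
# materializing `e` into a temporary would add array traffic without
# shrinking the working set, hence the alias is dropped.
#
# If instead `e` reads {d, f}, disjoint from owset:
#     wset(e) & owset == {}, and 2 > 0 -> True:
# the temporary pays for itself, hence the alias is retained.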
def _selector(self, e, naliases):
    if naliases <= 1:
        return 0
    else:
        return estimate_cost(e, True)*naliases // self._opt_mincost
def test_estimate_cost(fa, fb, fc, t0, t1, t2, expr, expected):
    # Note: integer arithmetic isn't counted
    assert estimate_cost(EVAL(expr, fa, fb, fc, t0, t1, t2)) == expected
def ops(self):
    """Number of operations performed at each iteration."""
    return sum(estimate_cost(i) for i in self.exprs)
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like or list of expr-like or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some form of post-processing
    assert mode == 'default'  # Only supported mode ATM

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        scope = Scope(maybe_exprs)

    # Some sub-expressions aren't really "common" -- that's the case of Dimension-
    # independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent data
    # dependence involving `a`
    exclude = {i.source.access for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True)) for k, v in counted if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for i, e in enumerate(picked)])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE across
        # Dimension-independent data dependences
        exclude.update({i for i in mapper.values()})

        # Prepare for the next round
        for k in picked:
            targets.pop(k)
    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    return processed
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search aliasing expressions and capture them into vector temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
           temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
           temp1 = 2.0*ti[x,y,z]
           temp2 = 3.0*ti[x,y,z+1]
    """
    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    graph = FlowGraph(cluster.exprs)
    time_invariants = {v.rhs: graph.time_invariant(v) for v in graph.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in graph.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(v.rhs))
        cost = estimate_cost(v, True)*naliases
        test0 = lambda: cost >= self.MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= self.MIN_COST_ALIAS_INV and time_invariants[v.rhs]
        if test0() or test1():
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def ops(self):
    """The Cluster operation count."""
    return self.ispace.size*sum(estimate_cost(i) for i in self.exprs)
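A back-of-the-envelope illustration of the property above, assuming Devito's public API: the per-iteration cost of an expression, scaled by the number of points in the iteration space, gives the Cluster-level operation count. The expression and grid are illustrative.

from devito import Grid, TimeFunction
from devito.symbolics import estimate_cost

grid = Grid(shape=(100, 100))
u = TimeFunction(name='u', grid=grid, space_order=2)

expr = u.forward - 2.0*u + u.backward
per_iteration = estimate_cost(expr)                 # ops per iteration-space point
total = per_iteration*grid.shape[0]*grid.shape[1]   # ~ what `ops` reports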
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
           temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
           temp1 = 2.0*ti[x,y,z]
           temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.MIN_COST_ALIAS and (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        intervals, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(function[access], origin)

        # Construct a data space suitable for /alias/
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            rules[candidates[aliased]] = function[access]
            rules[aliased] = function[access]

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def ops(self):
    """The Cluster operation count."""
    return self.size*sum(estimate_cost(i) for i in self.exprs)