def copy_arrays(mapper, reverse=False):
    """
    Build an Iteration/Expression tree performing the copy ``k = v``, or
    ``v = k`` if reverse=True, for each (k, v) in mapper. (k, v) are expected
    to be of type :class:`IndexedData`. The loop bounds are inferred from
    the dimensions used in ``k``.
    """
    if not mapper:
        return ()

    # Build the Iteration tree for the copy
    iterations = []
    for k, v in mapper.items():
        handle = []
        indices = k.function.indices
        for i, j in zip(k.shape, indices):
            handle.append(Iteration([], dimension=j, limits=i))
        lhs, rhs = (v, k) if reverse else (k, v)
        handle.append(Expression(Eq(lhs[indices], rhs[indices]),
                                 dtype=k.function.dtype))
        iterations.append(compose_nodes(handle))

    # Maybe some Iterations are mergeable
    iterations = MergeOuterIterations().visit(iterations)

    return iterations

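# Usage sketch: the names below (`dst`, `src`) are hypothetical IndexedData
# handles over the same dimensions; they are not defined in this module.
#
#     tree = copy_arrays({dst: src})
#     # -> loop nest performing dst[i, j] = src[i, j]
#
#     tree = copy_arrays({dst: src}, reverse=True)
#     # -> loop nest performing src[i, j] = dst[i, j]
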
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is not guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions
    #   (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - it is very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE()
    # but also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True))
                               for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)

    processed = mapped + processed

    # Perform topological sorting so that reads-after-writes are honored
    processed = topological_sort(processed)

    return processed

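# Usage sketch: `make` must return a fresh symbol at each call. A simple
# counter-based factory, assuming the `Scalar` type used elsewhere in this
# module (the names below are illustrative), would be:
#
#     counter = iter(range(10**6))
#     make = lambda: Scalar(name='tmp%d' % next(counter)).indexify()
#     processed = common_subexprs_elimination(exprs, make)
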
def promote_scalar_expressions(exprs, shape, indices, onstack):
    """
    Transform a collection of scalar expressions into tensor expressions.
    """
    processed = []

    # First promote the LHS
    mapper = {}
    for k, v in FlowGraph(exprs).items():
        if v.is_scalar:
            # Create a new function symbol
            data = Array(name=k.name, shape=shape,
                         dimensions=indices, onstack=onstack)
            indexed = Indexed(data.indexed, *indices)
            mapper[k] = indexed
            processed.append(Eq(indexed, v.rhs))
        else:
            processed.append(Eq(k, v.rhs))

    # Propagate the transformed LHS through the expressions
    processed = [Eq(n.lhs, n.rhs.xreplace(mapper)) for n in processed]

    return processed

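# Illustration (names are hypothetical): with shape=(nx,) and indices=(x,),
# a scalar LHS is promoted to a tensor and subsequent reads are rewired:
#
#     exprs = [Eq(s, a[x] + 1.), Eq(b[x], 2.*s)]
#     promote_scalar_expressions(exprs, (nx,), (x,), onstack=True)
#     # -> [Eq(s[x], a[x] + 1.), Eq(b[x], 2.*s[x])]
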
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination.

    Note: the output is not guaranteed to be topologically sorted.

    :param exprs: The target SymPy expression, or a collection of SymPy
                  expressions.
    :param make: A function to construct symbols used for replacement.
                 The function takes as input an integer ID; ID is computed
                 internally and used as a unique identifier for the
                 constructed symbols.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions
    #   (e.g., i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - it is very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE()
    # but also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_op).items()
        targets = OrderedDict([(k, estimate_cost(k))
                               for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make(len(mapped) + i))
                              for i, e in enumerate(picked)])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Prepare for the next round
        for k in picked:
            targets.pop(k)

    processed = mapped + processed

    # Simply renumber the temporaries in ascending order
    mapper = {i.lhs: j.lhs for i, j in zip(mapped, reversed(mapped))}
    processed = [e.xreplace(mapper) for e in processed]

    return processed

def interpolate(self, expr, offset=0, **kwargs):
    """
    Creates a :class:`sympy.Eq` equation for the interpolation
    of an expression onto this sparse point collection.

    :param expr: The expression to interpolate.
    :param offset: Additional offset from the boundary for
                   absorbing boundary conditions.
    :param u_t: (Optional) time index to use for indexing into
                field data in `expr`.
    :param p_t: (Optional) time index to use for indexing into
                the sparse point data.
    :param cumulative: (Optional) if True, add the interpolated values to
                       the existing sparse point data, rather than
                       overwriting them (defaults to False).
    """
    u_t = kwargs.get('u_t', None)
    p_t = kwargs.get('p_t', None)
    expr = indexify(expr)

    # Apply optional time symbol substitutions to expr
    if u_t is not None:
        time = self.grid.time_dim
        t = self.grid.stepping_dim
        expr = expr.subs(t, u_t).subs(time, u_t)

    variables = list(retrieve_indexed(expr))
    # List of indirection indices for all adjacent grid points
    index_matrix = [tuple(idx + ii + offset
                          for ii, idx in zip(inc, self.coordinate_indices))
                    for inc in self.point_increments]
    # Generate index substitutions for all grid variables
    idx_subs = []
    for i, idx in enumerate(index_matrix):
        v_subs = [(v, v.base[v.indices[:-self.grid.dim] + idx])
                  for v in variables]
        idx_subs += [OrderedDict(v_subs)]
    # Substitute coordinate base symbols into the coefficients
    subs = OrderedDict(zip(self.point_symbols, self.coordinate_bases))
    rhs = sum([expr.subs(vsub) * b.subs(subs)
               for b, vsub in zip(self.coefficients, idx_subs)])

    # Apply optional time symbol substitutions to lhs of assignment
    lhs = self if p_t is None else self.subs(self.indices[0], p_t)

    cumulative = kwargs.get('cumulative', False)
    rhs = rhs + lhs if cumulative else rhs

    return [Eq(lhs, rhs)]

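# Usage sketch (hypothetical names): interpolate the field `u` onto the
# sparse points of `rec` at time index `t`, accumulating into the existing
# point data:
#
#     eqs = rec.interpolate(u, u_t=t, p_t=t, cumulative=True)
#
# Each returned Eq sums the values of `u` at the grid points adjacent to a
# sparse point, weighted by the interpolation coefficients.
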
def _extract_increments(self, cluster, template, **kwargs):
    """
    Extract the RHS of non-local tensor expressions performing an associative
    and commutative increment, and assign them to temporaries.
    """
    processed = []
    for e in cluster.exprs:
        if e.is_Increment and e.lhs.function.is_Input:
            handle = Scalar(name=template(), dtype=e.dtype).indexify()
            extracted = e.rhs.func(*[i for i in e.rhs.args if i != e.lhs])
            processed.extend([Eq(handle, extracted),
                              e.func(e.lhs, handle + e.lhs)])
        else:
            processed.append(e)

    return cluster.rebuild(processed)

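# Illustration: an increment into an input function, e.g. `f[x] += expr`,
# is split so that the (potentially expensive) RHS is evaluated once into a
# scalar before the read-modify-write (names are illustrative):
#
#     r0 = expr
#     f[x] = r0 + f[x]
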
def _extract_nonaffine_indices(self, cluster, template, **kwargs):
    """
    Extract non-affine array indices, and assign them to temporaries.
    """
    make = lambda: Scalar(name=template(), dtype=np.int32).indexify()

    mapper = OrderedDict()
    for e in cluster.exprs:
        for indexed in retrieve_indexed(e):
            for i, d in zip(indexed.indices, indexed.base.function.indices):
                if not (q_affine(i, d) or i.is_Number):
                    mapper[i] = make()

    processed = [Eq(v, k) for k, v in mapper.items()]
    processed.extend([e.xreplace(mapper) for e in cluster.exprs])

    return cluster.rebuild(processed)

def _extract_nonaffine_indices(self, cluster, template, **kwargs):
    """
    Extract non-affine array indices, and assign them to temporaries.
    """
    make = lambda: Scalar(name=template(), dtype=np.int32).indexify()

    mapper = OrderedDict()
    for e in cluster.exprs:
        # Note: using mode='all' and then checking for presence in the mapper
        # (a few lines below), rather than retrieving unique indexeds only
        # (a set), is the key to deterministic code generation
        for indexed in retrieve_indexed(e, mode='all'):
            for i, d in zip(indexed.indices, indexed.function.indices):
                if q_affine(i, d) or q_scalar(i):
                    continue
                elif i not in mapper:
                    mapper[i] = make()

    processed = [Eq(v, k) for k, v in mapper.items()]
    processed.extend([e.xreplace(mapper) for e in cluster.exprs])

    return cluster.rebuild(processed)

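# Illustration: an access through an indirection array, e.g. `a[map[p], y]`,
# carries the non-affine index `map[p]`, which this pass hoists into an
# integer temporary (names are illustrative):
#
#     r0 = map[p]
#     ... a[r0, y] ...
#
# Affine accesses such as `a[x + 1, y]` are left untouched, since
# q_affine(x + 1, x) holds.
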
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True) * naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        intervals, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance
                      if i in indices]
            temporary = Indexed(function.indexed, *tuple(access))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]

def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Template for captured redundancies
    shape = tuple(i.symbolic_extent for i in indices)
    make = lambda i: Array(name=template(i), shape=shape,
                           dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True) * naliases
        if cost >= self.thresholds['min-cost-alias'] and \
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    expressions = []
    stencils = []
    rules = OrderedDict()
    for c, (origin, alias) in enumerate(aliases.items()):
        if all(i not in candidates for i in alias.aliased):
            continue
        # Build alias expression
        function = make(c)
        expressions.append(Eq(Indexed(function, *indices), origin))
        # Build substitution rules
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items()
                           if i in indices]
            temporary = Indexed(function, *tuple(coordinates))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary
        # Build cluster stencil
        stencil = alias.anti_stencil.anti(cluster.stencil)
        if all(time_invariants[i] for i in alias.aliased):
            # Optimization: drop time dimension if time-invariant and the
            # alias involves a complex calculation
            stencil = stencil.section(g.time_indices)
        stencils.append(stencil)

    # Create the alias clusters
    alias_clusters = clusterize(expressions, stencils)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]

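# Worked example: in case 2) of the docstring, `collect` detects that
# a[x,y,z]*b[x,y,z] and a[x,y,z+1]*b[x,y,z+1] alias the same origin at
# distances 0 and 1 along z. One temporary ti[x,y,z] = a[x,y,z]*b[x,y,z] is
# computed (over a suitably extended stencil), and the `rules` map rewrites
# the two occurrences as 2.0*ti[x,y,z] and 3.0*ti[x,y,z+1].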