def callback():
    """
    Build the equations interpolating `expr` onto the sparse function.

    `expr`, `offset`, `increment` and `self_subs` are free names here --
    presumably bound by the enclosing scope.

    Returns
    -------
    list
        The temporaries produced by `_interpolation_indices`, followed by the
        accumulation of the point-wise contributions into a scalar, followed
        by the final write (an `Inc` if `increment` is set, an `Eq` otherwise)
        into the sparse function.
    """
    # Derivatives must be evaluated before the introduction of indirect accesses
    try:
        _expr = expr.evaluate
    except AttributeError:
        # E.g., a generic SymPy expression or a number
        _expr = expr

    variables = list(retrieve_function_carriers(_expr))

    # Need to get origin of the field in case it is staggered
    # TODO: handle the staggering of each variable separately
    # An expression carrying no Function has no origin to inspect; fall back
    # to 0, the default `field_offset` of `_interpolation_indices`, instead
    # of failing with an IndexError
    field_offset = variables[0].origin if variables else 0

    # List of indirection indices for all adjacent grid points
    idx_subs, temps = self._interpolation_indices(variables, offset,
                                                  field_offset=field_offset)

    # Substitute coordinate base symbols into the interpolation coefficients
    args = [_expr.xreplace(v_sub) * b.xreplace(v_sub)
            for b, v_sub in zip(self._interpolation_coeffs, idx_subs)]

    # Accumulate point-wise contributions into a temporary
    rhs = Symbol(name='sum', dtype=self.sfunction.dtype)
    summands = [Eq(rhs, 0., implicit_dims=self.sfunction.dimensions)]
    summands.extend([Inc(rhs, i, implicit_dims=self.sfunction.dimensions)
                     for i in args])

    # Write/Incr `self`
    lhs = self.sfunction.subs(self_subs)
    last = [Inc(lhs, rhs)] if increment else [Eq(lhs, rhs)]

    return temps + summands + last
def _buffering(expressions, callback, options):
    """
    Rewrite `expressions` so that selected Functions are accessed via Buffers.

    Parameters
    ----------
    expressions : list of expr-like
        The input expressions.
    callback : callable
        Called on each accessed Function; returns the Dimensions used to
        build its Buffer, or None if the Function is not a buffer candidate.
    options : dict
        Must provide the 'buf-async-degree' entry, forwarded to each Buffer.

    Returns
    -------
    list
        The buffer initializations, followed by the input expressions
        rewritten in terms of buffers; each expression that is the last-write
        of a buffer is immediately followed by the corresponding copy-back.
    """
    degree = options['buf-async-degree']

    # Locate all Function accesses within the provided `expressions`
    accessmap = AccessMapper(expressions)

    # Create the buffers. A Function that `callback` rejects, or that is
    # never written (read-only), is not a buffering candidate. The position
    # within `accessmap` is forwarded to the Buffer, so rejected entries
    # still consume a number
    buffers = []
    for position, (function, accessv) in enumerate(accessmap.items()):
        dims = callback(function)
        if dims is None or accessv.lastwrite is None:
            continue
        buffers.append(Buffer(function, dims, accessv, position, degree))

    # Create Eqs to initialize buffers. Note: a buffer needs to be initialized
    # only if the buffered Function is read in at least one place or in the
    # case of non-uniform SubDimensions, to avoid uninitialized values to be
    # copied-back into the buffered Function
    processed = []
    for b in buffers:
        if b.is_read or not b.has_uniform_subdims:
            processed.append(Eq(b.indexify(),
                                b.function.subs(b.contraction_mapper)))

    # Substitution rules to replace buffered Functions with buffers
    subs = {a: b.indexify(a.indices)
            for b in buffers for a in b.accessv.accesses}

    for e in expressions:
        processed.append(uxreplace(e, subs))

        # Append the copy-back if `e` is the last-write of some buffer; only
        # the first such buffer is considered
        owner = next((b for b in buffers if e is b.accessv.lastwrite), None)
        if owner is None:
            continue

        pairs = list(zip(e.lhs.indices, owner.function.dimensions))
        lhs = owner.function[[i if i in owner.index_mapper else d
                              for i, d in pairs]]
        rhs = owner.indexed[[owner.index_mapper.get(i, d) for i, d in pairs]]
        copyback = Eq(lhs, rhs)
        if owner.subdims_mapper:
            copyback = uxreplace(copyback, owner.subdims_mapper)
        processed.append(copyback)

    return processed
def _add_implicit(cls, expressions):
    """
    Create and add any associated implicit expressions.

    Implicit expressions are those not explicitly defined by the user
    but instead are requisites of some specified functionality.

    Parameters
    ----------
    expressions : iterable
        The input expressions; each must expose a `subdomain` attribute.

    Returns
    -------
    list
        The input expressions, in order. An expression whose subdomain
        provides implicit expressions is rebuilt with augmented
        `implicit_dims`, and is preceded by those implicit expressions the
        first time the subdomain is encountered. Any other expression is
        passed through untouched.
    """
    processed = []
    seen = set()
    for e in expressions:
        if not e.subdomain:
            processed.append(e)
            continue

        try:
            dims = [d.root for d in e.free_symbols if isinstance(d, Dimension)]
            sub_dims = [d.root for d in e.subdomain.dimensions]
            sub_dims.append(e.subdomain.implicit_dimension)
            # Build the lookup set once, rather than once per candidate
            excluded = frozenset(sub_dims)
            dims = [d for d in dims if d not in excluded]
            dims.append(e.subdomain.implicit_dimension)
            if e.subdomain not in seen:
                # The implicit expressions are emitted only once per subdomain
                processed.extend([i.func(*i.args, implicit_dims=dims)
                                  for i in e.subdomain._create_implicit_exprs()])
                seen.add(e.subdomain)
            dims.extend(e.subdomain.dimensions)
            new_e = Eq(e.lhs, e.rhs, subdomain=e.subdomain, implicit_dims=dims)
            processed.append(new_e)
        except AttributeError:
            # Presumably a subdomain without implicit dimensions/expressions;
            # the expression is left as is
            processed.append(e)
    return processed
def _interpolation_indices(self, variables, offset=0, field_offset=0):
    """
    Generate interpolation indices for the DiscreteFunctions in ``variables``.

    Parameters
    ----------
    variables : list
        Not used by this routine.
    offset : optional
        Forwarded to the sparse function's ``_index_matrix``.
    field_offset : optional
        Forwarded to the sparse function's ``_coordinate_bases``.

    Returns
    -------
    idx_subs : list of dict
        One mapper per row of the index matrix, from each grid Dimension to
        the ConditionalDimension performing the indirect access.
    temps : list of Eq
        The temporaries for the position, for the indirection points and for
        the interpolation coefficients, in this order.
    """
    sfunction = self.sfunction
    index_matrix, points = sfunction._index_matrix(offset)

    def indirection(point, dim):
        # Introduce ConditionalDimension so that we don't go OOB
        lower = sympy.And(point >= dim.symbolic_min - sfunction._radius,
                          evaluate=False)
        upper = sympy.And(point <= dim.symbolic_max + sfunction._radius,
                          evaluate=False)
        within = sympy.And(lower, upper, evaluate=False)
        return ConditionalDimension(point.name, sfunction._sparse_dim,
                                    condition=within, indirect=True)

    # Track Indexed substitutions, one mapper per row of the index matrix
    idx_subs = [
        {d: indirection(points[j], d)
         for j, d in zip(row, self.grid.dimensions)}
        for row in index_matrix
    ]

    temps = []

    # Temporaries for the position
    for k, v in sfunction._position_map.items():
        temps.append(Eq(v, k, implicit_dims=sfunction.dimensions))

    # Temporaries for the indirection dimensions
    for k, v in points.items():
        temps.append(Eq(v, k.subs(sfunction._position_map),
                        implicit_dims=sfunction.dimensions))

    # Temporaries for the coefficients
    for p, c in zip(sfunction._point_symbols,
                    sfunction._coordinate_bases(field_offset)):
        temps.append(Eq(p, c.subs(sfunction._position_map),
                        implicit_dims=sfunction.dimensions))

    return idx_subs, temps
def __new__(cls, *args, **kwargs):
    """
    Build the object from either a single Eq or an (lhs, rhs) pair.

    Parameters
    ----------
    *args
        Either one Eq, or two operands from which an unevaluated Eq is built.
    **kwargs
        Accepted but ignored.

    Raises
    ------
    TypeError
        If a single argument is supplied and it is not an Eq.
    ValueError
        If the number of positional arguments is neither one nor two.
    """
    if len(args) == 1:
        input_expr = args[0]
        # Explicit check rather than `assert`, which is stripped under `-O`
        if not isinstance(input_expr, Eq):
            raise TypeError("Cannot construct DummyEq from a non-Eq input "
                            "of type %s" % type(input_expr).__name__)
        obj = LoweredEq(input_expr)
    elif len(args) == 2:
        obj = LoweredEq(Eq(*args, evaluate=False))
    else:
        raise ValueError("Cannot construct DummyEq from args=%s" % str(args))
    return ClusterizedEq.__new__(cls, obj, ispace=obj.ispace, dspace=obj.dspace)
def callback():
    """
    Build the equation adding `expr`, weighted by the interpolation
    coefficients, to `field` at the grid points stored in `self.obj`.

    `expr` and `field` are free names here -- presumably bound by the
    enclosing scope.

    Returns
    -------
    list
        A single Eq that updates the (indirectly accessed) field in place.
    """
    obj = self.obj
    _expr = indexify(expr)
    _field = indexify(field)

    # `gridpoints` is expected to expose exactly two indices; only the first
    # one is used
    p, _ = obj.gridpoints.indices

    # One DefaultDimension per grid Dimension
    grid_dims = list(obj.grid.dimensions)
    rdims = [DefaultDimension(name="r%s" % d.name, default_value=self.r)
             for d in grid_dims]

    # Replace each grid Dimension with an indirect access via `gridpoints`
    dim_subs = [(d, INT(rd + obj.gridpoints[p, i]))
                for i, (d, rd) in enumerate(zip(grid_dims, rdims))]
    coeffs = [obj.interpolation_coeffs[p, i, rd]
              for i, rd in enumerate(rdims)]

    contribution = prod(coeffs) * _expr
    target = _field.subs(dim_subs)
    return [Eq(target, target + contribution.subs(dim_subs))]
def callback():
    """
    Build the equation accumulating into `self.obj` the value of `expr`,
    weighted by the interpolation coefficients, at the stored grid points.

    `expr` and `self_subs` are free names here -- presumably bound by the
    enclosing scope.

    Returns
    -------
    list
        A single Eq that updates `self.obj` in place.
    """
    obj = self.obj
    _expr = indexify(expr)

    # `interpolation_coeffs` is expected to expose exactly three indices;
    # only the first one is used
    p, _, _ = obj.interpolation_coeffs.indices

    # One DefaultDimension per grid Dimension
    grid_dims = list(obj.grid.dimensions)
    rdims = [DefaultDimension(name="r%s" % d.name, default_value=self.r)
             for d in grid_dims]

    # Replace each grid Dimension with an indirect access via `gridpoints`
    dim_subs = [(d, INT(rd + obj.gridpoints[p, i]))
                for i, (d, rd) in enumerate(zip(grid_dims, rdims))]
    coeffs = [obj.interpolation_coeffs[p, i, rd]
              for i, rd in enumerate(rdims)]

    # Apply optional time symbol substitutions to lhs of assignment
    lhs = obj.subs(self_subs)
    rhs = prod(coeffs) * _expr.subs(dim_subs)
    return [Eq(lhs, lhs + rhs)]
def generate_implicit_exprs(expressions):
    """
    Create and add implicit expressions.

    Implicit expressions are those not explicitly defined by the user
    but instead are requisites of some specified functionality.
    Currently, implicit expressions stem from the following:

        * MultiSubDomains attached to input equations.

    Parameters
    ----------
    expressions : iterable
        The input expressions; each must expose a `subdomain` attribute.

    Returns
    -------
    list
        The input expressions, in order. An expression defined over a
        MultiSubDomain is rebuilt with augmented `implicit_dims`, and is
        preceded by the subdomain's implicit expressions the first time the
        subdomain is encountered. Any other expression is passed through
        untouched.
    """
    found = {}
    processed = []
    for e in expressions:
        if not e.subdomain:
            processed.append(e)
            continue

        try:
            dims = [d.root for d in e.free_symbols if isinstance(d, Dimension)]
            sub_dims = [d.root for d in e.subdomain.dimensions]
            sub_dims.extend(e.subdomain.implicit_dimensions)
            # Build the lookup set once, rather than once per candidate
            excluded = frozenset(sub_dims)
            dims = [d for d in dims if d not in excluded]
            dims.extend(e.subdomain.implicit_dimensions)
            if e.subdomain not in found:
                # The implicit expressions are emitted only once per
                # subdomain; the grid is taken from the first Function
                # retrieved from `e`
                grid = list(retrieve_functions(e, mode='unique'))[0].grid
                found[e.subdomain] = [
                    i.func(*i.args, implicit_dims=dims)
                    for i in e.subdomain._create_implicit_exprs(grid)
                ]
                processed.extend(found[e.subdomain])
            dims.extend(e.subdomain.dimensions)
            new_e = Eq(e.lhs, e.rhs, subdomain=e.subdomain, implicit_dims=dims)
            processed.append(new_e)
        except AttributeError:
            # Not a MultiSubDomain
            processed.append(e)
    return processed
def process(cluster, chosen, aliases, sregistry, platform):
    """
    Create the Clusters computing the aliases, along with the substitution
    rules to use the newly created temporaries.

    Parameters
    ----------
    cluster : Cluster
        The Cluster the aliases were extracted from.
    chosen : dict
        Maps the aliasing expressions to be optimized to the objects they
        are assigned to (which get replaced by the temporaries as well).
    aliases
        The collected aliases; must provide `iter(ispace)` and `index_mapper`.
    sregistry
        Used to generate the names of the temporaries via `make_name()`.
    platform
        Queried for `simd_items_per_reg` when rounding the iteration space.

    Returns
    -------
    clusters : list
        The new Clusters; the one built last comes first.
    subs : dict
        The substitution rules.
    """
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if not any(i in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes to transform the loop bounds
        # (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            access = array[[d - i.lower + distance[i.dim]
                            for d, i in zip(adims, writeto)]]
            subs[aliased] = access

            # An `aliased` not in `chosen` is perhaps part of a composite alias
            if aliased in chosen:
                subs[chosen[aliased]] = access

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            # Best-effort: the IterationSpace is left as is
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`. It is prepended, so the
        # output lists the Clusters in reverse order of creation
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
def lower_schedule(cluster, schedule, sregistry, options):
    """
    Turn a Schedule into a sequence of Clusters.

    Parameters
    ----------
    cluster : Cluster
        The Cluster the Schedule was derived from.
    schedule : iterable
        Yields `(alias, writeto, ispace, aliaseds, indicess)` tuples.
    sregistry
        Used to generate the names of the temporaries via `make_name()`.
    options : dict
        Must provide the 'cire-ftemps' entry.

    Returns
    -------
    clusters : list
        One Cluster per scheduled alias.
    subs : dict
        Maps each aliasing expression to the temporary replacing it.
    """
    # Typical case -- the user does *not* "see" the CIRE-created temporaries,
    # so plain Arrays are used unless 'cire-ftemps' is set
    make = TempFunction if options['cire-ftemps'] else Array

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        # Basic info to create the temporary that will hold the alias
        name = sregistry.make_name()
        dtype = cluster.dtype

        if writeto:
            # The Dimensions defining the shape of Array
            # Note: with SubDimensions, we may have the following situation:
            #
            # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
            #   r[zi] = ...
            #
            # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so
            # we'll need as much room as in `zi`'s parent to avoid going OOB
            # Aside from ugly generated code, the reason we do not rather
            # shift the indices is that it prevents future passes to
            # transform the loop bounds (e.g., MPI's comp/comm overlap does
            # that)
            dimensions = [d.parent if d.is_Sub else d
                          for d in writeto.itdimensions]

            # The halo must be set according to the size of writeto space
            halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

            # The indices used to write into the Array
            write_indices = []
            for interval in writeto:
                try:
                    # E.g., `xs`
                    shifted = writeto.sub_iterators[interval.dim]
                    assert len(shifted) == 1
                    write_indices.append(shifted[0])
                except KeyError:
                    # E.g., `z` -- a non-shifted Dimension
                    write_indices.append(interval.dim - interval.lower)

            obj = make(name=name, dimensions=dimensions, halo=halo, dtype=dtype)
            expression = Eq(obj[write_indices], alias)

            # The aliasing expressions are replaced by accesses to the
            # temporary
            replacements = {aliased: obj[idx]
                            for aliased, idx in zip(aliaseds, indicess)}
        else:
            # Degenerate case: scalar expression
            assert writeto.size == 0

            obj = Symbol(name=name, dtype=dtype)
            expression = Eq(obj, alias)

            # The aliasing expressions are all replaced by the same scalar
            replacements = {aliased: obj
                            for aliased, _ in zip(aliaseds, indicess)}

        # Create the substitution rules for the aliasing expressions
        subs.update(replacements)

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {}
        for k, v in accesses.items():
            if k:
                group = IntervalGroup(build_intervals(v))
                parts[k] = group.add(ispace.intervals).relaxed
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop or weaken parallelism if necessary
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})
            elif d not in writeto.dimensions:
                properties[d] = normalize_properties(v, {PARALLEL_IF_PVT})

        # Finally, build the `alias` Cluster
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace,
                                properties=properties)
        clusters.append(built)

    return clusters, subs
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like or list of expr-like or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].

    Returns
    -------
    list
        The temporaries followed by the rewritten input expressions, after
        compaction of the useless temporaries.

    Raises
    ------
    ValueError
        If `mode` is not a supported CSE algorithm.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some form of post-processing
    # Explicit check rather than `assert`, which is stripped under `-O`
    if mode != 'default':
        raise ValueError("Unsupported CSE mode `%s`; only 'default' is "
                         "available" % mode)

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        scope = Scope(maybe_exprs)

    # Some sub-expressions aren't really "common" -- that's the case of Dimension-
    # independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent data
    # dependence involving `a`
    exclude = {i.source.access for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True))
                               for k, v in counted if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])

        if not targets:
            break

        # Create temporaries for the most expensive redundancies only; the
        # remaining ones are detected again at the next round
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [Eq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE across
        # Dimension-independent data dependences
        exclude.update(mapper.values())

    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    return processed
def lower_schedule(cluster, schedule, chosen, sregistry, options):
    """
    Turn a Schedule into a sequence of Clusters.

    Parameters
    ----------
    cluster : Cluster
        The Cluster the Schedule was derived from.
    schedule : iterable
        Yields `(alias, writeto, ispace, aliaseds, indicess)` tuples.
    chosen : dict
        Maps the aliasing expressions to be optimized to the objects they
        are assigned to. Scheduled aliases with no aliasing expression in
        `chosen` are skipped.
    sregistry
        Used to generate the names of the temporaries via `make_name()`.
    options : dict
        Must provide the 'cire-onstack' entry.

    Returns
    -------
    clusters : list
        One Cluster per retained alias.
    subs : dict
        The substitution rules to use the newly created temporaries.
    """
    # The memory region of the Arrays. On the heap, unless the user has
    # explicitly requested allocation on the stack
    scope = 'stack' if options['cire-onstack'] else 'heap'

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        if not any(i in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll
        # need as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift
        # the indices is that it prevents future passes to transform the loop
        # bounds (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array. It can safely be `shared` only
        # if all of the PARALLEL `cluster` Dimensions appear in `writeto`
        parallel = {d for d, v in cluster.properties.items() if PARALLEL in v}
        if parallel == set(writeto.itdimensions):
            sharing = 'shared'
        else:
            sharing = 'local'

        array = Array(name=sregistry.make_name(), dimensions=dimensions,
                      halo=halo, dtype=cluster.dtype, scope=scope,
                      sharing=sharing)

        # The indices used to write into the Array
        write_indices = []
        for interval in writeto:
            try:
                # E.g., `xs`
                shifted = writeto.sub_iterators[interval.dim]
                assert len(shifted) == 1
                write_indices.append(shifted[0])
            except KeyError:
                # E.g., `z` -- a non-shifted Dimension
                write_indices.append(interval.dim - interval.lower)

        expression = Eq(array[write_indices], alias)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, idx in zip(aliaseds, indicess):
            access = array[idx]
            subs[aliased] = access
            # An `aliased` not in `chosen` is perhaps part of a composite alias
            if aliased in chosen:
                subs[chosen[aliased]] = access

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {}
        for k, v in accesses.items():
            if k:
                group = IntervalGroup(build_intervals(v))
                parts[k] = group.add(ispace.intervals).relaxed
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop parallelism if using ModuloDimensions (due to rotations)
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})

        # Finally, build the `alias` Cluster
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace,
                                properties=properties)
        clusters.append(built)

    return clusters, subs
def process(candidates, aliases, cluster, template):
    """
    Create Clusters from aliasing expressions.

    Parameters
    ----------
    candidates : dict
        Maps the aliasing expressions to be optimized to the objects they
        are assigned to. Aliases with no aliasing expression in `candidates`
        are skipped.
    aliases : dict-like
        Maps each alias origin to an object exposing `aliased`,
        `relaxed_diameter` and `with_distance`.
    cluster : Cluster
        The Cluster the aliases were extracted from.
    template : callable
        Generates the names of the temporaries.

    Returns
    -------
    clusters : list
        One Cluster per retained alias.
    subs : dict
        The substitution rules to use the newly created temporaries.
    """
    clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = IntervalGroup([
            Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
            for i in cluster.ispace.intervals if not i.dim.is_Time
        ])

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be required to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            first = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[first:])
        except IndexError:
            perf_adv("Could not optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        write_access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[write_access], origin.xreplace(subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            read_access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[read_access]
            subs[aliased] = array[read_access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Optimize the `alias` IterationSpace: if possible, the innermost
        # IterationInterval is rounded up to a multiple of the vector length
        try:
            innermost = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[innermost.dim]:
                from devito.parameters import configuration
                platform = configuration['platform']
                vl = platform.simd_items_per_reg(cluster.dtype)
                padding = Interval(innermost.dim, 0, innermost.interval.size % vl)
                ispace = ispace.add(padding)
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        parts = {}
        for k, v in detect_accesses(expression).items():
            if k:
                parts[k] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        built = cluster.rebuild(exprs=[expression], ispace=ispace, dspace=dspace)
        clusters.append(built)

    return clusters, subs
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search aliasing expressions and capture them into vector temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]

    Returns
    -------
    list
        The Clusters computing the temporaries, followed by the input
        Cluster rebuilt with the expressions that were not captured.
    """
    exprs = cluster.exprs

    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(exprs)

    # Redundancies will be stored in space-varying temporaries
    is_time_invariant = make_is_time_invariant(exprs)
    time_invariants = {e.rhs: is_time_invariant(e) for e in exprs}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for e in exprs:
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(e.rhs))
        cost = estimate_cost(e, True) * naliases
        # Either expensive enough and aliased more than once, or expensive
        # enough and time-invariant
        worthwhile = (
            (cost >= self.MIN_COST_ALIAS and naliases > 1) or
            (cost >= self.MIN_COST_ALIAS_INV and time_invariants[e.rhs])
        )
        if worthwhile:
            candidates[e.rhs] = e.lhs
        else:
            processed.append(e)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if not any(i in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = IntervalGroup([
            Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
            for i in cluster.ispace.intervals if not i.dim.is_Time
        ])

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            first = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[first:])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        array = Array(name=template(),
                      dimensions=[d.root for d in writeto.dimensions],
                      halo=[(abs(i.lower), abs(i.upper)) for i in writeto],
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        write_access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[write_access], origin.xreplace(subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            read_access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[read_access]
            subs[aliased] = array[read_access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Construct the `alias` DataSpace
        parts = {}
        for k, v in detect_accesses(expression).items():
            if k:
                parts[k] = IntervalGroup(build_intervals(v)).add(ispace.intervals)
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        built = cluster.rebuild(exprs=[expression], ispace=ispace, dspace=dspace)
        alias_clusters.append(built)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def callback(self, clusters, prefix, cache=None):
    """
    Replace accesses to buffering candidates with accesses to buffers.

    Parameters
    ----------
    clusters : list of Cluster
        The input Clusters.
    prefix
        A sequence of objects exposing `dim`; a Function is buffered only
        if each of its buffering Dimensions is defined by one of them.
    cache : dict, optional
        Maps the Functions buffered so far to their buffers. It is both
        queried and updated in place. Defaults to a fresh, empty dict.

    Returns
    -------
    list of Cluster
        `clusters` itself if no buffer was created. Otherwise, the buffer
        initializations followed by, for each input Cluster: the
        step-through of the read-only buffers first read in it, the Cluster
        rewritten in terms of buffers, and the copy-back of the buffers
        last written in it.
    """
    # The default `cache=None` would otherwise fail on `f in cache`
    if cache is None:
        cache = {}

    # Locate all Function accesses within the provided `clusters`
    accessmap = AccessMapper(clusters)

    # Create the buffers
    buffers = BufferBatch()
    for f, accessv in accessmap.items():
        # Has a buffer already been produced for `f`?
        if f in cache:
            continue

        # Is `f` really a buffering candidate?
        dims = self.callback0(f)
        if dims is None:
            continue
        if not all(any(i.dim in d._defines for i in prefix) for d in dims):
            continue

        cache[f] = buffers.make(f, dims, accessv, self.options, self.sregistry)

    if not buffers:
        return clusters

    try:
        pd = prefix[-2].dim
    except IndexError:
        pd = None

    def transfer(c, b, copyback):
        # Build the Cluster moving data between `b` and its buffered
        # Function: buffer -> Function if `copyback`, Function -> buffer
        # otherwise
        dims = b.function.dimensions
        in_buffer = b.indexed[[b.lastmap.get(d, Map(d, d)).b for d in dims]]
        in_function = b.function[[b.lastmap.get(d, Map(d, d)).f for d in dims]]
        eq = Eq(in_function, in_buffer) if copyback else Eq(in_buffer, in_function)
        expr = lower_exprs(uxreplace(eq, b.subdims_mapper))
        ispace = b.written

        # Buffering creates a storage-related dependence along the
        # contracted dimensions
        properties = dict(c.properties)
        for d in b.contraction_mapper:
            d = ispace[d].dim  # E.g., `time_sub -> time`
            properties[d] = normalize_properties(properties[d], {SEQUENTIAL})

        return c.rebuild(exprs=expr, ispace=ispace, properties=properties)

    # Create Eqs to initialize buffers. Note: a buffer needs to be initialized
    # only if the buffered Function is read in at least one place or in the case
    # of non-uniform SubDimensions, to avoid uninitialized values to be copied-back
    # into the buffered Function
    noinit = self.options['buf-noinit']
    processed = []
    for b in buffers:
        if b.size == 1 and noinit:
            # Special case: avoid initialization if not strictly necessary
            continue

        if b.is_read or not b.has_uniform_subdims:
            dims = b.function.dimensions
            lhs = b.indexed[[b.initmap.get(d, Map(d, d)).b for d in dims]]
            rhs = b.function[[b.initmap.get(d, Map(d, d)).f for d in dims]]

            expr = lower_exprs(Eq(lhs, rhs))
            ispace = b.writeto
            # All contracted Dimensions share the same key `pd`, hence at
            # most one guard is retained
            guards = {pd: GuardBound(d.root.symbolic_min, d.root.symbolic_max)
                      for d in b.contraction_mapper}
            properties = {d: {PARALLEL} for d in ispace.itdimensions}

            processed.append(
                Cluster(expr, ispace, guards=guards, properties=properties)
            )

    # Substitution rules to replace buffered Functions with buffers
    subs = {}
    for b in buffers:
        for a in b.accessv.accesses:
            subs[a] = b.indexed[[b.index_mapper_flat.get(i, i) for i in a.indices]]

    for c in clusters:
        # If a buffer is read but never written, then we need to add
        # an Eq to step through the next slot
        # E.g., `ub[0, x] = u[time+2, x]`
        for b in buffers:
            if not b.is_readonly:
                continue
            try:
                c.exprs.index(b.firstread)
            except ValueError:
                continue

            processed.append(transfer(c, b, copyback=False))

        # Substitute buffered Functions with the newly created buffers
        exprs = [uxreplace(e, subs) for e in c.exprs]
        ispace = c.ispace
        for b in buffers:
            ispace = ispace.augment(b.sub_iterators)
        processed.append(c.rebuild(exprs=exprs, ispace=ispace))

        # Also append the copy-back if `c` contains the last-write of some
        # buffers
        # E.g., `u[time + 1, x] = ub[sb1, x]`
        for b in buffers:
            if b.is_readonly:
                continue
            try:
                c.exprs.index(b.lastwrite)
            except ValueError:
                continue

            processed.append(transfer(c, b, copyback=True))

    return processed