def iet_lower_dimensions(iet):
    """
    Lower the DerivedDimensions found in the expressions of ``iet`` into
    lower-level symbolic objects (other Dimensions or Symbols).

    Two rewrites are applied, one after the other:

    * SteppingDimensions. Array indices involving them are turned into
      ModuloDimensions. For example, ``u[t+1, x] = u[t, x] + 1`` becomes
      ``u[t1, x] = u[t0, x] + 1``.
    * ConditionalDimensions. Array indices involving them are turned into
      integer-division expressions. For example,
      ``u[t_sub, x] = u[time, x]`` becomes ``u[time / 4, x] = u[time, x]``.

    Parameters
    ----------
    iet
        The tree to be processed. The local name is rebound to whatever each
        ``XSubs`` visit returns.

    Returns
    -------
    The tree produced by the last ``XSubs`` visit.
    """
    # Pass 1: SteppingDimensions
    for iteration in FindNodes(Iteration).visit(iet):
        if not iteration.uindices:
            # Nothing to lower here; skip so that no node is uselessly rebuilt
            continue

        # Within one expression there may be `u[t+1, ...]` and `v[t+1, ...]`
        # with `u` and `v` TimeFunctions using circular time buffers
        # (save=None) of *different* modulo extent. The two `t+1` are then
        # conceptually distinct, hence one substitution pass per extent.
        by_extent = as_mapper(iteration.uindices, lambda dim: dim.modulo)
        for extent, mdims in by_extent.items():
            origin_to_mdim = {md.origin: md for md in mdims}

            def rule(indexed):
                # Only touch TimeFunctions whose time size matches `extent`
                func = indexed.function
                return func.is_TimeFunction and func._time_size == extent

            def replacer(expr):
                return xreplace_indices(expr, origin_to_mdim, rule)

            iet = XSubs(replacer=replacer).visit(iet)

    # Pass 2: ConditionalDimensions
    cond_mapper = {
        sym: IntDiv(sym.index, sym.factor)
        for sym in FindSymbols('free-symbols').visit(iet)
        if isinstance(sym, ConditionalDimension)
    }
    return XSubs(cond_mapper).visit(iet)
def callback(self, clusters, prefix):
    """
    Skew the Dimension at the end of ``prefix`` against the unique
    SEQUENTIAL Dimension of each Cluster.

    The transformation is all-or-nothing: ``clusters`` is returned untouched
    as soon as one Cluster does not carry SKEWABLE for the Dimension, or has
    more than one SEQUENTIAL Dimension in its iteration space.

    Parameters
    ----------
    clusters
        The Clusters to process.
    prefix
        Sequence whose last item exposes, through ``.dim``, the Dimension to
        be skewed. If empty, ``clusters`` is returned as is.

    Returns
    -------
    Either the input ``clusters`` or a list of rebuilt Clusters.

    Notes
    -----
    NOTE(review): if a Cluster has no SEQUENTIAL Dimension at all,
    ``set.pop()`` raises ``KeyError``; presumably SKEWABLE implies at least
    one SEQUENTIAL Dimension -- confirm.
    """
    if not prefix:
        return clusters

    dim = prefix[-1].dim

    skewed = []
    for cluster in clusters:
        if SKEWABLE not in cluster.properties[dim]:
            return clusters

        sequential = {itv.dim for itv in cluster.ispace
                      if SEQUENTIAL in cluster.properties[itv.dim]}
        if len(sequential) > 1:
            return clusters
        skew_dim = sequential.pop()

        # At this point `dim` is skewable and nested under a SEQUENTIAL loop.
        # Only the interval over `dim` is replaced, and only if `dim` is not
        # a block Dimension or sits at depth 1.
        new_intervals = [
            Interval(dim, skew_dim, skew_dim)
            if itv.dim is dim and (not dim.is_Block or dim._depth == 1)
            else itv
            for itv in cluster.ispace
        ]
        group = IntervalGroup(new_intervals, relations=cluster.ispace.relations)
        new_ispace = IterationSpace(group, cluster.ispace.sub_iterators,
                                    cluster.ispace.directions)

        # Compensate the shifted bounds inside the expressions
        new_exprs = xreplace_indices(cluster.exprs, {dim: dim - skew_dim})
        skewed.append(cluster.rebuild(exprs=new_exprs, ispace=new_ispace))

    return skewed
def _lower_stepping_dims(iet):
    """
    Lower SteppingDimensions: index functions involving SteppingDimensions
    are turned into ModuloDimensions.

    Parameters
    ----------
    iet
        The tree to be processed.

    Returns
    -------
    The tree returned by the last substitution pass, or ``iet`` itself if no
    Iteration carries modulo uindices.

    Examples
    --------
    u[t+1, x] = u[t, x] + 1

    becomes

    u[t1, x] = u[t0, x] + 1
    """
    for iteration in FindNodes(Iteration).visit(iet):
        if not iteration.uindices:
            # Nothing to lower here; skip so that no node is uselessly rebuilt
            continue

        # Within one expression there may be `u[t+1, ...]` and `v[t+1, ...]`
        # with `u` and `v` TimeFunctions using circular time buffers
        # (save=None) of *different* modulo extent. The two `t+1` are then
        # conceptually distinct, hence one substitution pass per extent.
        modulo_dims = [dim for dim in iteration.uindices if dim.is_Modulo]
        by_extent = as_mapper(modulo_dims, lambda dim: dim.modulo)
        for extent, mdims in by_extent.items():
            origin_to_mdim = {md.origin: md for md in mdims}

            def rule(indexed):
                # Only touch TimeFunctions whose time size matches `extent`
                func = indexed.function
                return func.is_TimeFunction and func._time_size == extent

            def replacer(expr):
                return xreplace_indices(expr, origin_to_mdim, rule)

            iet = XSubs(replacer=replacer).visit(iet)

    return iet
def scalarize(clusters, template):
    """
    Replace local "isolated" Arrays with Scalars. An Array is isolated when
    it is written by one Cluster and read by no other Cluster.

    Parameters
    ----------
    clusters
        The Clusters to process.
    template : callable
        Called with no arguments to obtain the name of each new Scalar.

    Returns
    -------
    list
        One rebuilt Cluster per input Cluster, in the same order.
    """
    result = []
    for cluster in clusters:
        # Arrays written by `cluster` and not read by any other Cluster
        others = set(clusters) - {cluster}
        local_arrays = {w for w in cluster.scope.writes if w.is_Array}
        local_arrays.difference_update(*(o.scope.reads for o in others))

        # Each distinct access gets its own scalar, e.g.
        #
        #   r[x,y,z] = g(b[x,y,z])              t0 = g(b[x,y,z])
        #   ... = r[x,y,z] + r[x,y,z+1]  --->   t1 = g(b[x,y,z+1])
        #                                       ... = t0 + t1
        subs = {}
        new_exprs = []
        for position, expr in enumerate(cluster.exprs):
            func = expr.lhs.function

            if func not in local_arrays:
                new_exprs.append(expr.func(expr.lhs, expr.rhs.xreplace(subs)))
                continue

            # Only the accesses timestamped after this expression matter
            later = [acc.indexed for acc in cluster.scope[func]
                     if acc.timestamp > position]
            for indexed in filter_ordered(later):
                subs[indexed] = Scalar(name=template(), dtype=func.dtype)

                assert len(func.indices) == len(expr.lhs.indices) == len(indexed.indices)
                # Offset between the access and the defining LHS, per index
                shifting = {}
                for idx, lhs_off, acc_off in zip(func.indices, expr.lhs.indices,
                                                 indexed.indices):
                    shifting[idx] = idx + (acc_off - lhs_off)

                scalar_expr = expr.func(subs[indexed], expr.rhs.xreplace(subs))
                new_exprs.append(xreplace_indices(scalar_expr, shifting))

        result.append(cluster.rebuild(new_exprs))

    return result
def callback(self, clusters, prefix):
    """
    Skew the Dimension at the end of ``prefix`` against the unique
    SEQUENTIAL Dimension of each Cluster.

    The transformation is all-or-nothing: ``clusters`` is returned untouched
    as soon as one Cluster

    * does not carry SKEWABLE for the Dimension, or
    * has the Dimension innermost in its iteration space while
      ``self.skewinner`` is falsy, or
    * has more than one SEQUENTIAL Dimension in its iteration space.

    Parameters
    ----------
    clusters
        The Clusters to process.
    prefix
        Sequence whose last item exposes, through ``.dim``, the Dimension to
        be skewed. If empty, ``clusters`` is returned as is.

    Returns
    -------
    Either the input ``clusters`` or a list of rebuilt Clusters.

    Notes
    -----
    NOTE(review): if a Cluster has no SEQUENTIAL Dimension at all,
    ``set.pop()`` raises ``KeyError``; presumably SKEWABLE implies at least
    one SEQUENTIAL Dimension -- confirm.
    """
    if not prefix:
        return clusters

    def level(dim):
        # The level of a Dimension in the hierarchy of block Dimensions,
        # used to skew over the outer level of loops
        return len([x for x in dim._defines if x.is_Incr])

    target = prefix[-1].dim

    skewed = []
    for cluster in clusters:
        if SKEWABLE not in cluster.properties[target]:
            return clusters

        if target is cluster.ispace[-1].dim and not self.skewinner:
            return clusters

        sequential = {itv.dim for itv in cluster.ispace
                      if SEQUENTIAL in cluster.properties[itv.dim]}
        if len(sequential) > 1:
            return clusters
        skew_dim = sequential.pop()

        # At this point `target` is skewable and nested under a SEQUENTIAL
        # loop. Skew only at level 0 or 1.
        new_intervals = [
            Interval(target, skew_dim, skew_dim)
            if itv.dim is target and level(target) <= 1
            else itv
            for itv in cluster.ispace
        ]
        group = IntervalGroup(new_intervals, relations=cluster.ispace.relations)
        new_ispace = IterationSpace(group, cluster.ispace.sub_iterators,
                                    cluster.ispace.directions)

        # Compensate the shifted bounds inside the expressions
        new_exprs = xreplace_indices(cluster.exprs, {target: target - skew_dim})
        skewed.append(cluster.rebuild(exprs=new_exprs, ispace=new_ispace,
                                      properties=cluster.properties))

    return skewed
def optimize(clusters):
    """
    Attempt scalar promotion on the merged ``clusters``.

    A tensor is a candidate if its function is an Array that does not appear
    among the ``unknown`` of any cluster.

    Parameters
    ----------
    clusters
        The clusters to process; they are first passed through ``merge``.

    Returns
    -------
    list
        One rebuilt cluster per merged cluster, in the same order.
    """
    clusters = merge(clusters)

    result = []
    for cluster in clusters:
        subs = {}
        new_exprs = []
        for lhs, expr in cluster.trace.items():
            func = expr.function
            promotable = func.is_Array and not any(func in other.unknown
                                                   for other in clusters)
            if not promotable:
                new_exprs.append(expr.func(lhs, expr.rhs.xreplace(subs)))
                continue

            for indexed in cluster.tensors[func]:
                # LHS scalarization
                scalar = Scalar(name='s%d' % len(subs)).indexify()
                subs[indexed] = scalar

                # Some tensor expressions may have to be "unrolled": with two
                # occurrences of r0, say r0[x,y,z] and r0[x+1,y,z], a
                # different scalar is needed for each unique set of indices
                assert len(func.indices) == len(lhs.indices) == len(indexed.indices)
                shifting = {}
                for idx, lhs_off, acc_off in zip(func.indices, lhs.indices,
                                                 indexed.indices):
                    shifting[idx] = idx + (acc_off - lhs_off)

                # Rebuild the expression with (i) the scalar as LHS and
                # (ii) the indices shifted as required
                scalar_expr = expr.func(scalar, expr.rhs.xreplace(subs))
                new_exprs.append(xreplace_indices(scalar_expr, shifting))

        result.append(cluster.rebuild(new_exprs))

    return result
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Parameters
    ----------
    unfolded
        The folded trees, each a sequence of Iterations as long as ``root``.
    root
        The root tree.

    Returns
    -------
    list
        The processed folded trees followed by the (possibly rewritten) root.

    Examples
    --------
    Given:

        .. code-block::

            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::

            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for n, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # A folded tree is optimized only if both conditions hold:
        #   * it computes temporary arrays only (no input data)
        #   * the outer Iterations have actually been blocked
        tree_exprs = FindNodes(Expression).visit(tree)
        writes = [e.write for e in tree_exprs if e.is_tensor]
        writes_non_array = not all(w.is_Array for w in writes)
        root_not_blocked = any(not isinstance(it.limits[0], BlockDimension)
                               for it in root)
        if writes_non_array or root_not_blocked:
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        shrunk_tree = []
        shrunk_root = []
        block_dims = {}
        mapper = {}
        for tree_iter, root_iter in zip(tree, root):
            name = "%ss%d" % (tree_iter.index, n)

            tree_udim = IncrDimension(tree_iter.dim, tree_iter.symbolic_min, 1, name)
            limits = (0, tree_iter.limits[1] - tree_iter.limits[0], tree_iter.step)
            shrunk_tree.append(tree_iter._rebuild(
                limits=limits, uindices=tree_iter.uindices + (tree_udim,)
            ))
            mapper[tree_iter.dim] = tree_udim

            root_udim = IncrDimension(tree_iter.dim, 0, 1, name)
            shrunk_root.append(root_iter._rebuild(
                uindices=root_iter.uindices + (root_udim,)
            ))

            bdim = root_iter.limits[0]
            assert isinstance(bdim, BlockDimension)
            block_dims[bdim.root] = bdim

        # Temporary arrays can now be moved onto the stack
        for array in writes:
            new_dims = tuple(block_dims.get(dim, dim) for dim in array.dimensions)
            new_shape = tuple(dim.symbolic_size for dim in new_dims)
            array.update(shape=new_shape, dimensions=new_dims, scope='stack')

        # Substitute iteration variables within the folded tree
        shrunk_tree = compose_nodes(shrunk_tree)
        replaced = xreplace_indices([e.expr for e in tree_exprs], mapper,
                                    lambda i: i.function not in writes, True)
        tree_subs = [e._rebuild(expr=r) for e, r in zip(tree_exprs, replaced)]
        processed.append(
            Transformer(dict(zip(tree_exprs, tree_subs))).visit(shrunk_tree)
        )

        # Introduce the new iteration variables within /root/
        shrunk_root = compose_nodes(shrunk_root)
        root_exprs = FindNodes(Expression).visit(shrunk_root)
        candidates = [as_symbol(e.output) for e in tree_subs]
        replaced = xreplace_indices([e.expr for e in root_exprs], mapper, candidates)
        root_subs = [e._rebuild(expr=r) for e, r in zip(root_exprs, replaced)]
        root = Transformer(dict(zip(root_exprs, root_subs))).visit(shrunk_root)

    return processed + [root]
def visit_Expression(self, o):
    """
    Rebuild the Expression node ``o`` with its indices replaced.

    The new expression is obtained by calling ``xreplace_indices`` on
    ``o.expr`` with this visitor's ``subs`` and ``rule``.
    """
    new_expr = xreplace_indices(o.expr, self.subs, self.rule)
    return o._rebuild(expr=new_expr)
def bump_and_contract(targets, source, sink):
    """
    Transform in-place the PartialClusters ``source`` and ``sink`` by turning
    the :class:`Array`s in ``targets`` into :class:`Scalar`. This is
    implemented through index bumping and array contraction.

    :param targets: The :class:`Array` objects that will be contracted.
    :param source: The source :class:`PartialCluster`.
    :param sink: The sink :class:`PartialCluster`.

    Nothing is done if ``targets`` is empty. The function returns ``None``;
    results are stored in ``source.exprs`` and ``sink.exprs``.

    Examples
    ========
    Index bumping
    -------------
    Given: ::

        r[x,y,z] = b[x,y,z]*2

    Produce: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Array contraction
    -----------------
    Given: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Produce: ::

        tmp0 = b[x,y,z]*2
        tmp1 = b[x,y,z+1]*2

    Full example (bump+contraction)
    -------------------------------
    Given: ::

        source: [r[x,y,z] = b[x,y,z]*2]
        sink: [a = ... r[x,y,z] ... r[x,y,z+1] ...]
        targets: r

    Produce: ::

        source: [tmp0 = b[x,y,z]*2, tmp1 = b[x,y,z+1]*2]
        sink: [a = ... tmp0 ... tmp1 ...]
    """
    if not targets:
        return

    subs = {}

    # Source: one scalar expression per access performed by the sink
    new_source = []
    for lhs, expr in source.trace.items():
        func = expr.function
        consumed = sink.tensors

        if not (func in targets and func in consumed):
            new_source.append(expr.func(lhs, expr.rhs.xreplace(subs)))
            continue

        for indexed in consumed[func]:
            scalar = Scalar(name='s%d' % len(subs)).indexify()
            subs[indexed] = scalar

            # Index bumping
            assert len(func.indices) == len(lhs.indices) == len(indexed.indices)
            shifting = {}
            for idx, lhs_off, acc_off in zip(func.indices, lhs.indices,
                                             indexed.indices):
                shifting[idx] = idx + (acc_off - lhs_off)

            # Array contraction
            contracted = expr.func(scalar, expr.rhs.xreplace(subs))
            new_source.append(xreplace_indices(contracted, shifting))
    source.exprs = new_source

    # Sink: read the scalars in place of the contracted Arrays
    sink.exprs = [expr.func(lhs, expr.rhs.xreplace(subs))
                  for lhs, expr in sink.trace.items()]
def bump_and_contract(targets, source, sink):
    """
    Transform in-place the PartialClusters ``source`` and ``sink`` by turning
    the Arrays in ``targets`` into Scalars. This is implemented through index
    bumping and array contraction.

    Parameters
    ----------
    targets : list of Array
        The Arrays that will be contracted.
    source : PartialCluster
        The PartialCluster in which the Arrays are initialized.
    sink : PartialCluster
        The PartialCluster that consumes (i.e., reads) the Arrays.

    Returns
    -------
    None
        Results are stored in ``source.exprs`` and ``sink.exprs``. Nothing is
        done if ``targets`` is empty.

    Examples
    --------
    1) Index bumping
    Given: ::

        r[x,y,z] = b[x,y,z]*2

    Produce: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    2) Array contraction
    Given: ::

        r[x,y,z] = b[x,y,z]*2
        r[x,y,z+1] = b[x,y,z+1]*2

    Produce: ::

        tmp0 = b[x,y,z]*2
        tmp1 = b[x,y,z+1]*2

    3) Full example (bump+contraction)
    Given: ::

        source: [r[x,y,z] = b[x,y,z]*2]
        sink: [a = ... r[x,y,z] ... r[x,y,z+1] ...]
        targets: r

    Produce: ::

        source: [tmp0 = b[x,y,z]*2, tmp1 = b[x,y,z+1]*2]
        sink: [a = ... tmp0 ... tmp1 ...]
    """
    if not targets:
        return

    subs = {}

    # Source: one scalar expression per access performed by the sink
    new_source = []
    for expr in source.exprs:
        func = expr.lhs.function
        consumed = sink.tensors

        if not (func in targets and func in consumed):
            new_source.append(expr.func(expr.lhs, expr.rhs.xreplace(subs)))
            continue

        for indexed in consumed[func]:
            name = 's%s%d' % (indexed.function.name, len(subs))
            scalar = Scalar(name=name).indexify()
            subs[indexed] = scalar

            # Index bumping
            assert len(func.indices) == len(expr.lhs.indices) == len(indexed.indices)
            shifting = {}
            for idx, lhs_off, acc_off in zip(func.indices, expr.lhs.indices,
                                             indexed.indices):
                shifting[idx] = idx + (acc_off - lhs_off)

            # Array contraction
            contracted = expr.func(scalar, expr.rhs.xreplace(subs))
            new_source.append(xreplace_indices(contracted, shifting))
    source.exprs = new_source

    # Sink: read the scalars in place of the contracted Arrays
    sink.exprs = [expr.func(expr.lhs, expr.rhs.xreplace(subs))
                  for expr in sink.exprs]
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    The function returns a list made of the processed folded trees followed
    by the (possibly rewritten) root.

    Examples
    ========
    Given:

        .. code-block::

            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::

            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for n, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        shrunk_tree = []
        shrunk_root = []
        mapper = {}

        # "Shrink" the iteration space
        for tree_iter, root_iter in zip(tree, root):
            index = Symbol('%ss%d' % (tree_iter.index, n))
            mapper[tree_iter.dim] = index

            # The same symbol is attached to both trees, with opposite offsets
            lower = tree_iter.limits[0]
            tree_uindex = (UnboundedIndex(index, lower), )
            root_uindex = (UnboundedIndex(index, -lower), )

            limits = (0, tree_iter.limits[1] - lower, tree_iter.incr_symbolic)
            shrunk_tree.append(tree_iter._rebuild(
                limits=limits, uindices=tree_iter.uindices + tree_uindex
            ))
            shrunk_root.append(root_iter._rebuild(
                uindices=root_iter.uindices + root_uindex
            ))

        # Temporary arrays can now be moved onto the stack, unless the
        # folded tree includes a remainder Iteration
        tree_exprs = FindNodes(Expression).visit(shrunk_tree[-1])
        if not any(it.is_Remainder for it in shrunk_tree):
            outer_dims = tuple(it.limits[0] for it in shrunk_root)
            depth = len(shrunk_root)
            for e in tree_exprs:
                if not e.write.is_Array:
                    continue
                new_dims = outer_dims + e.write.dimensions[depth:]
                new_shape = tuple(dim.symbolic_size for dim in new_dims)
                e.write.update(shape=new_shape, dimensions=new_dims, onstack=True)

        # Substitute iteration variables within the folded tree
        shrunk_tree = compose_nodes(shrunk_tree)
        replaced = xreplace_indices([e.expr for e in tree_exprs], mapper,
                                    only_rhs=True)
        tree_subs = [e._rebuild(expr=r) for e, r in zip(tree_exprs, replaced)]
        processed.append(
            Transformer(dict(zip(tree_exprs, tree_subs))).visit(shrunk_tree)
        )

        # Introduce the new iteration variables within /root/
        shrunk_root = compose_nodes(shrunk_root)
        root_exprs = FindNodes(Expression).visit(shrunk_root)
        candidates = [as_symbol(e.output) for e in tree_subs]
        replaced = xreplace_indices([e.expr for e in root_exprs], mapper, candidates)
        root_subs = [e._rebuild(expr=r) for e, r in zip(root_exprs, replaced)]
        root = Transformer(dict(zip(root_exprs, root_subs))).visit(shrunk_root)

    return processed + [root]
def _bump_and_scalarize(arrays, cluster, template): """ Scalarize local Arrays. Parameters ---------- arrays : list of Array The Arrays that will be scalarized. cluster : Cluster The Cluster where the local Arrays are used. Examples -------- This transformation consists of two steps, "index bumping" and the actual "scalarization". Index bumping. Given: r[x,y,z] = b[x,y,z]*2 Produce: r[x,y,z] = b[x,y,z]*2 r[x,y,z+1] = b[x,y,z+1]*2 Scalarization. Given: r[x,y,z] = b[x,y,z]*2 r[x,y,z+1] = b[x,y,z+1]*2 Produce: t0 = b[x,y,z]*2 t1 = b[x,y,z+1]*2 An Array being scalarized could be Indexed multiple times. Before proceeding with scalarization, therefore, we perform index bumping. For example, given: r0[x,y,z] = b[x,y,z]*2 r1[x,y,z] = ... r[x,y,z] ... r[x,y,z-1] ... This function will produce: t0 = b[x,y,z]*2 t1 = b[x,y,z-1]*2 r1[x,y,z] = ... t0 ... t1 ...] """ if not arrays: return cluster mapper = {} processed = [] for e in cluster.exprs: f = e.lhs.function if f in arrays: indexeds = filter_ordered(i.indexed for i in cluster.scope[f]) for i in indexeds: mapper[i] = Scalar(name=template(), dtype=f.dtype) # Index bumping assert len(f.indices) == len(e.lhs.indices) == len(i.indices) shifting = { idx: idx + (o2 - o1) for idx, o1, o2 in zip(f.indices, e.lhs.indices, i.indices) } # Scalarization handle = e.func(mapper[i], e.rhs.xreplace(mapper)) handle = xreplace_indices(handle, shifting) processed.append(handle) else: processed.append(e.func(e.lhs, e.rhs.xreplace(mapper))) return cluster.rebuild(processed)
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Parameters
    ----------
    unfolded
        The folded trees, each a sequence of Iterations as long as ``root``.
    root
        The root tree.

    Returns
    -------
    list
        The processed folded trees followed by the (possibly rewritten) root.

    Examples
    --------
    Given:

        .. code-block::

            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::

            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for n, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # A folded tree is optimized only if it computes temporary arrays;
        # it is left alone if it computes input data
        tree_exprs = FindNodes(Expression).visit(tree[-1])
        writes = [e.write for e in tree_exprs if e.is_tensor]
        if not all(w.is_Array for w in writes):
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        shrunk_tree = []
        shrunk_root = []
        mapper = {}

        # "Shrink" the iteration space
        for tree_iter, root_iter in zip(tree, root):
            name = "%ss%d" % (tree_iter.index, n)

            tree_udim = IncrDimension(tree_iter.dim, tree_iter.symbolic_min, 1, name)
            limits = (0, tree_iter.limits[1] - tree_iter.limits[0], tree_iter.step)
            shrunk_tree.append(tree_iter._rebuild(
                limits=limits, uindices=tree_iter.uindices + (tree_udim,)
            ))

            root_udim = IncrDimension(tree_iter.dim, 0, 1, name)
            shrunk_root.append(root_iter._rebuild(
                uindices=root_iter.uindices + (root_udim,)
            ))

            mapper[tree_iter.dim] = tree_udim

        # Temporary arrays can now be moved onto the stack
        outer_dims = tuple(it.limits[0] for it in shrunk_root)
        depth = len(shrunk_root)
        for array in writes:
            if not array.is_Array:
                continue
            new_dims = outer_dims + array.dimensions[depth:]
            new_shape = tuple(dim.symbolic_size for dim in new_dims)
            array.update(shape=new_shape, dimensions=new_dims, scope='stack')

        # Substitute iteration variables within the folded tree
        shrunk_tree = compose_nodes(shrunk_tree)
        replaced = xreplace_indices([e.expr for e in tree_exprs], mapper,
                                    only_rhs=True)
        tree_subs = [e._rebuild(expr=r) for e, r in zip(tree_exprs, replaced)]
        processed.append(
            Transformer(dict(zip(tree_exprs, tree_subs))).visit(shrunk_tree)
        )

        # Introduce the new iteration variables within /root/
        shrunk_root = compose_nodes(shrunk_root)
        root_exprs = FindNodes(Expression).visit(shrunk_root)
        candidates = [as_symbol(e.output) for e in tree_subs]
        replaced = xreplace_indices([e.expr for e in root_exprs], mapper, candidates)
        root_subs = [e._rebuild(expr=r) for e, r in zip(root_exprs, replaced)]
        root = Transformer(dict(zip(root_exprs, root_subs))).visit(shrunk_root)

    return processed + [root]