def __init__(self, expressions):
    mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(AccessTuple))
    for e in expressions:
        for i in retrieve_function_carriers(e.rhs):
            mapper[i.function][e].reads.append(i)
        mapper[e.lhs.function][e].write = e.lhs

    super().__init__([(f, AccessValue(f, mapper[f])) for f in mapper])
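`DefaultOrderedDict` is used throughout these snippets but is not a standard-library class. A minimal stand-in with the semantics relied upon above (an insertion-ordered dict that, like `collections.defaultdict`, builds missing values through a factory) could look as follows; this is only a sketch, not the project's actual implementation.

from collections import OrderedDict

class DefaultOrderedDict(OrderedDict):
    """An OrderedDict that builds missing values on access via `default_factory`."""

    def __init__(self, default_factory=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is absent
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

# Usage: a missing key triggers the factory, preserving insertion order
m = DefaultOrderedDict(list)
m['x'].append(1)
assert list(m) == ['x'] and m['x'] == [1]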
def __init__(self, exprs, **kwargs):
    # Check input legality
    mapper = OrderedDict([(i.lhs, i) for i in exprs])
    if len(set(mapper)) != len(exprs):
        raise DSEException("Found redundant node, cannot build TemporariesGraph.")

    # Construct Temporaries, tracking reads and readby
    tensor_map = DefaultOrderedDict(list)
    for i in mapper:
        tensor_map[as_symbol(i)].append(i)
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for k, v in mapper.items():
        handle = retrieve_terminals(v.rhs)
        for i in list(handle):
            if i.is_Indexed:
                for idx in i.indices:
                    handle |= retrieve_terminals(idx)
        reads[k].update(set(flatten([tensor_map.get(as_symbol(i), [])
                                     for i in handle])))
        for i in reads[k]:
            readby[i].add(k)

    # Make sure read-after-writes are honored for scalar temporaries
    processed = [i for i in mapper if i.is_Indexed]
    queue = [i for i in mapper if i not in processed]
    while queue:
        k = queue.pop(0)
        if not readby[k]:
            processed.insert(0, k)
        elif all(i in processed for i in readby[k]):
            index = min(processed.index(i) for i in readby[k])
            processed.insert(index, k)
        else:
            queue.append(k)

    # Build up the TemporariesGraph
    temporaries = [(i, Temporary(*mapper[i].args, inc=q_inc(mapper[i]),
                                 reads=reads[i], readby=readby[i]))
                   for i in processed]
    super(TemporariesGraph, self).__init__(temporaries, **kwargs)

    # Determine indices along the space and time dimensions
    terms = [v for k, v in self.items() if v.is_tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([i.function.indices for i in terms]))
    self.space_indices = tuple(i for i in indices if i.is_Space)
    self.time_indices = tuple(i for i in indices if i.is_Time)
def __init__(self, exprs, **kwargs):
    # Always convert to SSA
    exprs = convert_to_SSA(exprs)
    mapper = OrderedDict([(i.lhs, i) for i in exprs])
    assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

    # Construct the Nodes, tracking reads and readby
    tensor_map = DefaultOrderedDict(list)
    for i in mapper:
        tensor_map[as_symbol(i)].append(i)
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for k, v in mapper.items():
        handle = retrieve_terminals(v.rhs)
        for i in list(handle):
            if i.is_Indexed:
                for idx in i.indices:
                    handle |= retrieve_terminals(idx)
        reads[k].update(set(flatten([tensor_map.get(as_symbol(i), [])
                                     for i in handle])))
        for i in reads[k]:
            readby[i].add(k)

    # Make sure read-after-writes are honored for scalar temporaries
    processed = [i for i in mapper if i.is_Indexed]
    queue = [i for i in mapper if i not in processed]
    while queue:
        k = queue.pop(0)
        if not readby[k]:
            processed.insert(0, k)
        elif all(i in processed for i in readby[k]):
            index = min(processed.index(i) for i in readby[k])
            processed.insert(index, k)
        else:
            queue.append(k)

    # Build up the FlowGraph
    temporaries = [(i, Node(*mapper[i].args, inc=q_inc(mapper[i]),
                            reads=reads[i], readby=readby[i]))
                   for i in processed]
    super(FlowGraph, self).__init__(temporaries, **kwargs)

    # Determine indices along the space and time dimensions
    terms = [v for k, v in self.items() if v.is_tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([i.function.indices for i in terms]))
    self.space_indices = tuple(i for i in indices if i.is_Space)
    self.time_indices = tuple(i for i in indices if i.is_Time)
def __init__(self, exprs, **kwargs): # Always convert to SSA exprs = makeit_ssa(exprs) mapper = OrderedDict([(i.lhs, i) for i in exprs]) assert len(set(mapper)) == len(exprs), "not SSA Cluster?" # Construct the Nodes, tracking reads and readby tensor_map = DefaultOrderedDict(list) for i in mapper: tensor_map[as_symbol(i)].append(i) reads = DefaultOrderedDict(set) readby = DefaultOrderedDict(set) for k, v in mapper.items(): handle = retrieve_terminals(v.rhs) for i in list(handle): if i.is_Indexed: for idx in i.indices: handle |= retrieve_terminals(idx) reads[k].update(set(flatten([tensor_map.get(as_symbol(i), []) for i in handle]))) for i in reads[k]: readby[i].add(k) # Make sure read-after-writes are honored for scalar nodes processed = [i for i in mapper if i.is_Indexed] queue = [i for i in mapper if i not in processed] while queue: k = queue.pop(0) if not readby[k] or k in readby[k]: processed.insert(0, k) elif all(i in processed for i in readby[k]): index = min(processed.index(i) for i in readby[k]) processed.insert(index, k) else: queue.append(k) # Build up the FlowGraph nodes = [(i, Node(mapper[i], reads=reads[i], readby=readby[i])) for i in processed] super(FlowGraph, self).__init__(nodes, **kwargs) # Determine indices along the space and time dimensions terms = [v for k, v in self.items() if v.is_Tensor and not q_indirect(k)] indices = filter_ordered(flatten([i.function.indices for i in terms])) self.space_indices = tuple(i for i in indices if i.is_Space) self.time_indices = tuple(i for i in indices if i.is_Time)
def __init__(self, exprs, **kwargs): # Always convert to SSA exprs = makeit_ssa(exprs) mapper = OrderedDict([(i.lhs, i) for i in exprs]) assert len(set(mapper)) == len(exprs), "not SSA Cluster?" # Construct the Nodes, tracking reads and readby tensor_map = DefaultOrderedDict(list) for i in mapper: tensor_map[as_symbol(i)].append(i) reads = DefaultOrderedDict(set) readby = DefaultOrderedDict(set) for k, v in mapper.items(): handle = retrieve_terminals(v.rhs, deep=True) reads[k].update( set(flatten([tensor_map.get(as_symbol(i), []) for i in handle]))) for i in reads[k]: readby[i].add(k) # Make sure read-after-writes are honored for scalar nodes processed = [i for i in mapper if i.is_Indexed] queue = [i for i in mapper if i not in processed] while queue: k = queue.pop(0) if not readby[k] or k in readby[k]: processed.insert(0, k) elif all(i in processed for i in readby[k]): index = min(processed.index(i) for i in readby[k]) processed.insert(index, k) else: queue.append(k) # Build up the FlowGraph nodes = [(i, Node(mapper[i], reads=reads[i], readby=readby[i])) for i in processed] super(FlowGraph, self).__init__(nodes, **kwargs)
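The read-after-write ordering loop that all of the FlowGraph constructors above share is easier to see on plain data. The toy below (made-up symbol names, no Devito types) runs the same queue/insert logic: each scalar is inserted just before its first reader, so every write ends up ahead of all of its reads.

readby = {'a': {'b', 'c'}, 'b': {'c'}, 'c': set()}

processed = []          # would hold the Indexed lhs symbols in the real code
queue = list(readby)    # 'a', 'b', 'c'
while queue:
    k = queue.pop(0)
    if not readby[k]:
        processed.insert(0, k)
    elif all(i in processed for i in readby[k]):
        index = min(processed.index(i) for i in readby[k])
        processed.insert(index, k)
    else:
        queue.append(k)

# 'a' is read by 'b' and 'c', 'b' is read by 'c', so the schedule is a, b, c
assert processed == ['a', 'b', 'c']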
def actions_from_unstructured(clusters, key, prefix, actions):
    it = prefix[-1]
    d = it.dim
    direction = it.direction

    # Locate the streamable Functions
    first_seen = {}
    last_seen = {}
    for c in clusters:
        candidates = key(c)
        if not candidates:
            continue
        for i in c.scope.accesses:
            f = i.function
            if f in candidates:
                k = (f, i[d])
                first_seen.setdefault(k, c)
                last_seen[k] = c
    if not first_seen:
        return clusters

    callbacks = [(frozendict(first_seen), FetchPrefetch),
                 (frozendict(last_seen), Delete)]

    # Create and map SyncOps to Clusters
    for seen, callback in callbacks:
        mapper = defaultdict(lambda: DefaultOrderedDict(list))
        for (f, v), c in seen.items():
            mapper[c][f].append(v)

        for c, m in mapper.items():
            for f, v in m.items():
                for fetch, s in indices_to_sections(v):
                    if direction is Forward:
                        ifetch = fetch.subs(d, d.symbolic_min)
                        fcond = make_cond(c.guards.get(d), d, direction, d.symbolic_min)
                        pfetch = fetch + 1
                        pcond = make_cond(c.guards.get(d), d, direction, d + 1)
                    else:
                        ifetch = fetch.subs(d, d.symbolic_max)
                        fcond = make_cond(c.guards.get(d), d, direction, d.symbolic_max)
                        pfetch = fetch - 1
                        pcond = make_cond(c.guards.get(d), d, direction, d - 1)

                    syncop = callback(d, s, f, fetch, ifetch, fcond, pfetch, pcond)
                    actions[c].syncs[d].append(syncop)
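The `first_seen`/`last_seen` bookkeeping above is the core of the placement decision: per (function, index) key, the first Cluster touching it anchors the fetch/prefetch and the last one anchors the delete. A toy version on plain strings (hypothetical cluster and function names):

accesses = [('c0', 'u'), ('c1', 'u'), ('c1', 'v'), ('c2', 'u')]

first_seen, last_seen = {}, {}
for c, f in accesses:
    first_seen.setdefault(f, c)   # only the first occurrence sticks
    last_seen[f] = c              # every occurrence overwrites, the last one wins

assert first_seen == {'u': 'c0', 'v': 'c1'}
assert last_seen == {'u': 'c2', 'v': 'c1'}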
def stree_schedule(clusters):
    """
    Arrange an iterable of Clusters into a ScheduleTree.
    """
    stree = ScheduleTree()

    prev = None
    mapper = DefaultOrderedDict(lambda: Bunch(top=None, bottom=None))

    def attach_metadata(cluster, d, tip):
        if d in cluster.guards:
            tip = NodeConditional(cluster.guards[d], tip)
        if d in cluster.syncs:
            tip = NodeSync(cluster.syncs[d], tip)
        return tip

    for c in clusters:
        # Add in any Conditionals and Syncs outside of the outermost Iteration
        tip = attach_metadata(c, None, stree)

        if tip is stree:
            pointers = list(mapper)
        else:
            pointers = []

        index = 0
        for it0, it1 in zip(c.itintervals, pointers):
            if it0 != it1:
                break
            index += 1

            d = it0.dim

            # The reused sub-trees might acquire new sub-iterators as well as
            # new properties
            mapper[it0].top.ispace = IterationSpace.union(mapper[it0].top.ispace,
                                                          c.ispace.project([d]))
            mapper[it0].top.properties = normalize_properties(mapper[it0].top.properties,
                                                              c.properties[it0.dim])

            # Different guards or syncops cannot be further nested
            if c.guards.get(d) != prev.guards.get(d) or \
               c.syncs.get(d) != prev.syncs.get(d):
                tip = mapper[it0].top
                tip = attach_metadata(c, d, tip)
                mapper[it0].bottom = tip
                break
            else:
                tip = mapper[it0].bottom

        # Nested sub-trees, instead, will not be used anymore
        for it in pointers[index:]:
            mapper.pop(it)

        # Add in Iterations, Conditionals, and Syncs
        for it in c.itintervals[index:]:
            d = it.dim
            tip = NodeIteration(c.ispace.project([d]), tip, c.properties.get(d))
            mapper[it].top = tip
            tip = attach_metadata(c, d, tip)
            mapper[it].bottom = tip

        # Add in Expressions
        NodeExprs(c.exprs, c.ispace, c.dspace, c.ops, c.traffic, tip)

        # Prepare for next iteration
        prev = c

    return stree
def callback(self, clusters, prefix):
    if not prefix:
        return clusters

    it = prefix[-1]
    d = it.dim
    direction = it.direction
    try:
        pd = prefix[-2].dim
    except IndexError:
        pd = None

    # What are the stream-able Dimensions?
    # 0) all sequential Dimensions
    # 1) all CustomDimensions of fixed (i.e. integer) size, which
    #    implies a bound on the amount of streamed data
    if all(SEQUENTIAL in c.properties[d] for c in clusters):
        make_fetch = lambda f, i, s, cb: FetchWaitPrefetch(f, d, direction, i, s, cb)
        make_delete = lambda f, i, s, cb: Delete(f, d, direction, i, s, cb)
        syncd = d
    elif d.is_Custom and is_integer(it.size):
        make_fetch = lambda f, i, s, cb: FetchWait(f, d, direction, i, it.size, cb)
        make_delete = lambda f, i, s, cb: Delete(f, d, direction, i, it.size, cb)
        syncd = pd
    else:
        return clusters

    first_seen = {}
    last_seen = {}
    for c in clusters:
        candidates = self.key(c)
        if not candidates:
            continue
        for i in c.scope.accesses:
            f = i.function
            if f in candidates:
                k = (f, i[d])
                first_seen.setdefault(k, c)
                last_seen[k] = c

    if not first_seen:
        return clusters

    # Bind fetches and deletes to Clusters
    sync_ops = defaultdict(list)
    callbacks = [(frozendict(first_seen), make_fetch),
                 (frozendict(last_seen), make_delete)]
    for seen, callback in callbacks:
        mapper = defaultdict(lambda: DefaultOrderedDict(list))
        for (f, v), c in seen.items():
            mapper[c][f].append(v)

        for c, m in mapper.items():
            for f, v in m.items():
                for i, s in indices_to_sections(v):
                    next_cbk = make_next_cbk(c.guards.get(d), d, direction)
                    sync_ops[c].append(callback(f, i, s, next_cbk))

    # Attach SyncOps to Clusters
    processed = []
    for c in clusters:
        v = sync_ops.get(c)
        if v is not None:
            processed.append(c.rebuild(syncs=normalize_syncs(c.syncs, {syncd: v})))
        else:
            processed.append(c)

    return processed
def callback(self, clusters, prefix):
    if not prefix:
        return clusters

    d = prefix[-1].dim

    subiters = flatten([c.ispace.sub_iterators.get(d, []) for c in clusters])
    subiters = {i for i in subiters if i.is_Stepping}
    if not subiters:
        return clusters

    # Collect the index access functions along `d`, e.g., `t + 1` where `t` is
    # a SteppingDimension for `d = time`
    mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(set))
    for c in clusters:
        indexeds = [a.indexed for a in c.scope.accesses if a.function.is_Tensor]

        for i in indexeds:
            try:
                iaf = i.indices[d]
            except KeyError:
                continue

            # Sanity checks
            sis = iaf.free_symbols & subiters
            if len(sis) == 0:
                continue
            elif len(sis) == 1:
                si = sis.pop()
            else:
                raise InvalidOperator("Cannot use multiple SteppingDimensions "
                                      "to index into a Function")
            size = i.function.shape_allocated[d]
            assert is_integer(size)

            mapper[size][si].add(iaf)

    # Construct the ModuloDimensions
    mds = []
    for size, v in mapper.items():
        for si, iafs in list(v.items()):
            # Offsets are sorted so that the semantic order (t0, t1, t2) follows
            # SymPy's index ordering (t, t-1, t+1) after modulo replacement, so
            # that associativity errors are consistent. This corresponds to
            # sorting offsets {-1, 0, 1} as {0, -1, 1}, assigning -inf to 0
            siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si))

            for iaf in siafs:
                name = '%s%d' % (si.name, len(mds))
                offset = uxreplace(iaf, {si: d.root})
                mds.append(ModuloDimension(name, si, offset, size, origin=iaf))

    # Replacement rule for ModuloDimensions
    def rule(size, e):
        try:
            return e.function.shape_allocated[d] == size
        except (AttributeError, KeyError):
            return False

    # Reconstruct the Clusters
    processed = []
    for c in clusters:
        # Apply substitutions to expressions
        # Note: In an expression, there could be `u[t+1, ...]` and `v[t+1, ...]`,
        # where `u` and `v` are TimeFunctions with circular time buffers
        # (save=None) *but* different modulo extent. The `t+1` indices above are
        # therefore conceptually different, so they will be replaced with the
        # proper ModuloDimension through two different calls to `xreplace_indices`
        exprs = c.exprs
        groups = as_mapper(mds, lambda d: d.modulo)
        for size, v in groups.items():
            mapper = {md.origin: md for md in v}
            func = partial(xreplace_indices, mapper=mapper, key=partial(rule, size))
            exprs = [e.apply(func) for e in exprs]

        # Augment IterationSpace
        ispace = IterationSpace(c.ispace.intervals,
                                {**c.ispace.sub_iterators, **{d: tuple(mds)}},
                                c.ispace.directions)

        processed.append(c.rebuild(exprs=exprs, ispace=ispace))

    return processed
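The sort key described in the comment above is worth seeing in isolation. The snippet below just exercises that key on plain integer offsets: the centre offset (0) is forced first by mapping it to -inf, so {-1, 0, 1} becomes {0, -1, 1} and the generated ModuloDimensions follow the semantic order (t0, t1, t2).

import numpy as np

offsets = [1, -1, 0]
ordered = sorted(offsets, key=lambda i: -np.inf if i == 0 else i)
assert ordered == [0, -1, 1]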
def __init__(self, *args, **kwargs):
    super(PerformanceSummary, self).__init__(*args, **kwargs)
    self.subsections = DefaultOrderedDict(lambda: OrderedDict())
    self.input = OrderedDict()
    self.globals = {}
def linearize_accesses(iet, key, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # `functions` are all Functions that `iet` may need to linearize
    functions = [f for f in FindSymbols().visit(iet) if key(f) and f.ndim > 1]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # `functions_unseen` are all Functions that `iet` may need to linearize
    # and have not been seen while processing other IETs
    functions_unseen = [f for f in functions if f not in cache]

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # For all unseen Functions, build the size exprs. For example:
    # `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(dict)
    for (d, halo, _), v in mapper.items():
        v_unseen = [f for f in v if f in functions_unseen]
        if not v_unseen:
            continue
        expr = _generate_fsz(v_unseen[0], d, sregistry)
        if expr:
            for f in v_unseen:
                imapper[f][d] = expr.write
                cache[f].stmts0.append(expr)

    # For all unseen Functions, build the stride exprs. For example:
    # `y_stride0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(dict)
    for f, v in imapper.items():
        for d in v:
            n = f.dimensions.index(d)
            expr = prod(v[i] for i in f.dimensions[n:])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_stride' % d.name)
                s = Symbol(name=name, dtype=np.int64, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f][d] = stmt.write
            cache[f].stmts1.append(stmt)
    mapper.update([(f, {}) for f in functions_unseen if f not in mapper])

    # For all unseen Functions, build defines. For example:
    # `#define uL(t, x, y, z) u[(t)*t_stride0 + (x)*x_stride0 + (y)*y_stride0 + (z)]`
    headers = []
    findexeds = {}
    for f in functions:
        if cache[f].cbk is None:
            header, cbk = _generate_macro(f, mapper[f], sregistry)
            headers.append(header)
            cache[f].cbk = findexeds[f] = cbk
        else:
            findexeds[f] = cache[f].cbk

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    indexeds = FindSymbols('indexeds').visit(iet)
    for i in indexeds:
        try:
            mapper[i] = findexeds[i.function](i)
        except KeyError:
            pass

    # Introduce the linearized expressions
    iet = Uxreplace(mapper).visit(iet)

    # All Functions that actually require linearization in `iet`
    candidates = []

    candidates.extend(filter_ordered(i.function for i in indexeds))

    calls = FindNodes(Call).visit(iet)
    cfuncs = filter_ordered(flatten(i.functions for i in calls))
    candidates.extend(i for i in cfuncs if i.function.is_DiscreteFunction)

    # All Functions that can be linearized in `iet`
    defines = FindSymbols('defines-aliases').visit(iet)

    # Place the linearization expressions or delegate to ancestor efunc
    stmts0 = []
    stmts1 = []
    args = []
    for f in candidates:
        if f in defines:
            stmts0.extend(cache[f].stmts0)
            stmts1.extend(cache[f].stmts1)
        else:
            args.extend([e.write for e in cache[f].stmts1])
    if stmts0:
        assert len(stmts1) > 0
        stmts0 = filter_ordered(stmts0) + [BlankLine]
        stmts1 = filter_ordered(stmts1) + [BlankLine]
        body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
        iet = iet._rebuild(body=body)
    else:
        assert len(stmts1) == 0

    return iet, headers, args
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # Perhaps we've already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    iet = Transformer(mapper).visit(iet)

    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers
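Both `linearize_accesses` variants above boil down to the same arithmetic: the stride of a dimension is the product of the sizes of all trailing dimensions, and the outermost size never appears in any stride. A plain-Python picture of what the generated macro computes (the function name and the sizes are made up for illustration):

def linear_index(indices, sizes):
    """Row-major flat offset of `indices` for an array with per-dimension `sizes`."""
    strides = [1]                       # innermost dimension has unit stride
    for s in reversed(sizes[1:]):       # the outermost size never enters a stride
        strides.insert(0, strides[0] * s)
    return sum(i * s for i, s in zip(indices, strides))

# E.g., a (4, 6, 8) array: element [1, 2, 3] lives at 1*48 + 2*8 + 3 = 67
assert linear_index((1, 2, 3), (4, 6, 8)) == 67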
def __init__(self, function, contracted_dims, accessv, n, async_degree):
    self.function = function
    self.accessv = accessv

    contraction_mapper = {}
    index_mapper = {}
    dims = list(function.dimensions)
    for d in contracted_dims:
        assert d in function.dimensions

        # Determine the buffer size along `d`
        indices = filter_ordered(i.indices[d] for i in accessv.accesses)
        slots = [i.xreplace({d: 0, d.spacing: 1}) for i in indices]
        size = max(slots) - min(slots) + 1

        if async_degree is not None:
            if async_degree < size:
                warning("Ignoring provided asynchronous degree as it'd be "
                        "too small for the required buffer (provided %d, "
                        "but need at least %d for `%s`)"
                        % (async_degree, size, function.name))
            else:
                size = async_degree

        # Replace `d` with a suitable CustomDimension
        bd = CustomDimension('db%d' % n, 0, size-1, size, d)
        contraction_mapper[d] = dims[dims.index(d)] = bd

        if size > 1:
            # Create the necessary SteppingDimensions for indexing
            sd = SteppingDimension(name='sb%d' % n, parent=bd)
            index_mapper.update({i: i.xreplace({d: sd}) for i in indices})
        else:
            # Special case, no need to keep a SteppingDimension around
            index_mapper.update({i: 0 for i in indices})

    self.contraction_mapper = contraction_mapper
    self.index_mapper = index_mapper

    # Track the SubDimensions used to index into `function`
    subdims_mapper = DefaultOrderedDict(set)
    for e in accessv.mapper:
        try:
            # Case 1: implicitly via SubDomains
            m = {d.root: v for d, v in e.subdomain.dimension_map.items()}
        except AttributeError:
            # Case 2: explicitly via the lower-level SubDimension API
            m = {i.root: i for i in e.free_symbols
                 if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
        for d, v in m.items():
            subdims_mapper[d].add(v)
    if any(len(v) > 1 for v in subdims_mapper.values()):
        # Non-uniform SubDimensions. At this point we're going to raise
        # an exception. It's either illegal or still unsupported
        for v in subdims_mapper.values():
            for d0, d1 in combinations(v, 2):
                if d0.overlap(d1):
                    raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                          "is accessed over the overlapping "
                                          " SubDimensions `<%s, %s>`" %
                                          (function, d0, d1))
        self.subdims_mapper = None
        raise NotImplementedError("`buffering` does not support multiple "
                                  "non-overlapping SubDimensions yet.")
    else:
        self.subdims_mapper = {d: v.pop() for d, v in subdims_mapper.items()}

    self.buffer = Array(name='%sb' % function.name,
                        dimensions=dims,
                        dtype=function.dtype,
                        halo=function.halo,
                        space='mapped')
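The buffer-size rule used above (and again in the newer Buffer below) is `max(slots) - min(slots) + 1` after normalising the contracted Dimension to 0. A small SymPy-only sketch, with `d.spacing` dropped since a bare Symbol has no spacing attribute:

import sympy

time = sympy.Symbol('time')
indices = [time - 1, time, time + 2]          # hypothetical accesses along `time`
slots = [i.xreplace({time: 0}) for i in indices]
size = max(slots) - min(slots) + 1            # 2 - (-1) + 1
assert size == 4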
def __init__(self, function, contracted_dims, accessv, options, sregistry,
             bds=None, mds=None):
    # Parse compilation options
    async_degree = options['buf-async-degree']
    space = options['buf-mem-space']
    dtype = options['buf-dtype'](function)

    self.function = function
    self.accessv = accessv

    self.contraction_mapper = {}
    self.index_mapper = defaultdict(dict)
    self.sub_iterators = defaultdict(list)
    self.subdims_mapper = DefaultOrderedDict(set)

    # Create the necessary ModuloDimensions for indexing into the buffer
    # E.g., `u[time,x] + u[time+1,x]` -> `ub[sb0,x] + ub[sb1,x]`, where `sb0`
    # and `sb1` are ModuloDimensions starting at `time` and `time+1` respectively
    dims = list(function.dimensions)
    for d in contracted_dims:
        assert d in function.dimensions

        # Determine the buffer size, and therefore the span of the ModuloDimension,
        # along the contracting Dimension `d`
        indices = filter_ordered(i.indices[d] for i in accessv.accesses)
        slots = [i.subs({d: 0, d.spacing: 1}) for i in indices]
        try:
            size = max(slots) - min(slots) + 1
        except TypeError:
            # E.g., special case `slots=[-1 + time/factor, 2 + time/factor]`
            # Resort to the fast vector-based comparison machinery (rather than
            # the slower sympy.simplify)
            slots = [Vector(i) for i in slots]
            size = int((vmax(*slots) - vmin(*slots) + 1)[0])

        if async_degree is not None:
            if async_degree < size:
                warning("Ignoring provided asynchronous degree as it'd be "
                        "too small for the required buffer (provided %d, "
                        "but need at least %d for `%s`)"
                        % (async_degree, size, function.name))
            else:
                size = async_degree

        # Replace `d` with a suitable CustomDimension `bd`
        name = sregistry.make_name(prefix='db')
        bd = bds.setdefault((d, size), CustomDimension(name, 0, size-1, size, d))
        self.contraction_mapper[d] = dims[dims.index(d)] = bd

        # Finally create the ModuloDimensions as children of `bd`
        if size > 1:
            # Note: indices are sorted so that the semantic order (sb0, sb1, sb2)
            # follows SymPy's index ordering (time, time-1, time+1) after modulo
            # replacement, so that associativity errors are consistent. This very
            # same strategy is also applied in clusters/algorithms/Stepper
            p, _ = offset_from_centre(d, indices)
            indices = sorted(indices,
                             key=lambda i: -np.inf if i - p == 0 else (i - p))
            for i in indices:
                name = sregistry.make_name(prefix='sb')
                md = mds.setdefault((bd, i), ModuloDimension(name, bd, i, size))
                self.index_mapper[d][i] = md
                self.sub_iterators[d.root].append(md)
        else:
            assert len(indices) == 1
            self.index_mapper[d][indices[0]] = 0

    # Track the SubDimensions used to index into `function`
    for e in accessv.mapper:
        m = {i.root: i for i in e.free_symbols
             if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
        for d, v in m.items():
            self.subdims_mapper[d].add(v)
    if any(len(v) > 1 for v in self.subdims_mapper.values()):
        # Non-uniform SubDimensions. At this point we're going to raise
        # an exception. It's either illegal or still unsupported
        for v in self.subdims_mapper.values():
            for d0, d1 in combinations(v, 2):
                if d0.overlap(d1):
                    raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                          "is accessed over the overlapping "
                                          " SubDimensions `<%s, %s>`" %
                                          (function, d0, d1))
        raise NotImplementedError("`buffering` does not support multiple "
                                  "non-overlapping SubDimensions yet.")
    else:
        self.subdims_mapper = {d: v.pop() for d, v in self.subdims_mapper.items()}

    # Build and sanity-check the buffer IterationIntervals
    self.itintervals_mapper = {}
    for e in accessv.mapper:
        for i in e.ispace.itintervals:
            v = self.itintervals_mapper.setdefault(i.dim, i.args)
            if v != i.args:
                raise NotImplementedError("Cannot apply `buffering` as the buffered "
                                          "function `%s` is accessed over multiple, "
                                          "non-compatible iteration spaces along the "
                                          "Dimension `%s`" % (function.name, i.dim))
    # Also add IterationIntervals for initialization along `x`, should `xi` be
    # the only written Dimension in the `x` hierarchy
    for d, (interval, _, _) in list(self.itintervals_mapper.items()):
        for i in d._defines:
            self.itintervals_mapper.setdefault(i, (interval.relaxed, (), Forward))

    # Finally create the actual buffer
    self.buffer = Array(name=sregistry.make_name(prefix='%sb' % function.name),
                        dimensions=dims,
                        dtype=dtype,
                        halo=function.halo,
                        space=space)
class Buffer(object):

    """
    A buffer with metadata attached.

    Parameters
    ----------
    function : DiscreteFunction
        The object for which the buffer is created.
    contracted_dims : list of Dimension
        The Dimensions in `function` to be contracted, that is to be replaced
        by ModuloDimensions.
    accessv : AccessValue
        All accesses involving `function`.
    options : dict, optional
        The compilation options. See `buffering.__doc__`.
    sregistry : SymbolRegistry
        The symbol registry, to create unique names for buffers and Dimensions.
    bds : dict, optional
        All CustomDimensions created to define buffer dimensions, potentially
        reusable in the creation of this buffer. The object gets updated if new
        CustomDimensions are created.
    mds : dict, optional
        All ModuloDimensions created to index into other buffers, potentially
        reusable for indexing into this buffer. The object gets updated if new
        ModuloDimensions are created.
    """

    def __init__(self, function, contracted_dims, accessv, options, sregistry,
                 bds=None, mds=None):
        # Parse compilation options
        async_degree = options['buf-async-degree']
        space = options['buf-mem-space']
        dtype = options['buf-dtype'](function)

        self.function = function
        self.accessv = accessv

        self.contraction_mapper = {}
        self.index_mapper = defaultdict(dict)
        self.sub_iterators = defaultdict(list)
        self.subdims_mapper = DefaultOrderedDict(set)

        # Create the necessary ModuloDimensions for indexing into the buffer
        # E.g., `u[time,x] + u[time+1,x]` -> `ub[sb0,x] + ub[sb1,x]`, where `sb0`
        # and `sb1` are ModuloDimensions starting at `time` and `time+1` respectively
        dims = list(function.dimensions)
        for d in contracted_dims:
            assert d in function.dimensions

            # Determine the buffer size, and therefore the span of the ModuloDimension,
            # along the contracting Dimension `d`
            indices = filter_ordered(i.indices[d] for i in accessv.accesses)
            slots = [i.subs({d: 0, d.spacing: 1}) for i in indices]
            try:
                size = max(slots) - min(slots) + 1
            except TypeError:
                # E.g., special case `slots=[-1 + time/factor, 2 + time/factor]`
                # Resort to the fast vector-based comparison machinery (rather than
                # the slower sympy.simplify)
                slots = [Vector(i) for i in slots]
                size = int((vmax(*slots) - vmin(*slots) + 1)[0])

            if async_degree is not None:
                if async_degree < size:
                    warning("Ignoring provided asynchronous degree as it'd be "
                            "too small for the required buffer (provided %d, "
                            "but need at least %d for `%s`)"
                            % (async_degree, size, function.name))
                else:
                    size = async_degree

            # Replace `d` with a suitable CustomDimension `bd`
            name = sregistry.make_name(prefix='db')
            bd = bds.setdefault((d, size), CustomDimension(name, 0, size-1, size, d))
            self.contraction_mapper[d] = dims[dims.index(d)] = bd

            # Finally create the ModuloDimensions as children of `bd`
            if size > 1:
                # Note: indices are sorted so that the semantic order (sb0, sb1, sb2)
                # follows SymPy's index ordering (time, time-1, time+1) after modulo
                # replacement, so that associativity errors are consistent. This very
                # same strategy is also applied in clusters/algorithms/Stepper
                p, _ = offset_from_centre(d, indices)
                indices = sorted(indices,
                                 key=lambda i: -np.inf if i - p == 0 else (i - p))
                for i in indices:
                    name = sregistry.make_name(prefix='sb')
                    md = mds.setdefault((bd, i), ModuloDimension(name, bd, i, size))
                    self.index_mapper[d][i] = md
                    self.sub_iterators[d.root].append(md)
            else:
                assert len(indices) == 1
                self.index_mapper[d][indices[0]] = 0

        # Track the SubDimensions used to index into `function`
        for e in accessv.mapper:
            m = {i.root: i for i in e.free_symbols
                 if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
            for d, v in m.items():
                self.subdims_mapper[d].add(v)
        if any(len(v) > 1 for v in self.subdims_mapper.values()):
            # Non-uniform SubDimensions. At this point we're going to raise
            # an exception. It's either illegal or still unsupported
            for v in self.subdims_mapper.values():
                for d0, d1 in combinations(v, 2):
                    if d0.overlap(d1):
                        raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                              "is accessed over the overlapping "
                                              " SubDimensions `<%s, %s>`" %
                                              (function, d0, d1))
            raise NotImplementedError("`buffering` does not support multiple "
                                      "non-overlapping SubDimensions yet.")
        else:
            self.subdims_mapper = {d: v.pop() for d, v in self.subdims_mapper.items()}

        # Build and sanity-check the buffer IterationIntervals
        self.itintervals_mapper = {}
        for e in accessv.mapper:
            for i in e.ispace.itintervals:
                v = self.itintervals_mapper.setdefault(i.dim, i.args)
                if v != i.args:
                    raise NotImplementedError("Cannot apply `buffering` as the buffered "
                                              "function `%s` is accessed over multiple, "
                                              "non-compatible iteration spaces along the "
                                              "Dimension `%s`" % (function.name, i.dim))
        # Also add IterationIntervals for initialization along `x`, should `xi` be
        # the only written Dimension in the `x` hierarchy
        for d, (interval, _, _) in list(self.itintervals_mapper.items()):
            for i in d._defines:
                self.itintervals_mapper.setdefault(i, (interval.relaxed, (), Forward))

        # Finally create the actual buffer
        self.buffer = Array(name=sregistry.make_name(prefix='%sb' % function.name),
                            dimensions=dims,
                            dtype=dtype,
                            halo=function.halo,
                            space=space)

    def __repr__(self):
        return "Buffer[%s,<%s>]" % (self.buffer.name,
                                    ','.join(str(i) for i in self.contraction_mapper))

    @property
    def size(self):
        return np.prod([v.symbolic_size for v in self.contraction_mapper.values()])

    @property
    def is_read(self):
        return self.accessv.is_read

    @property
    def is_readonly(self):
        return self.is_read and self.lastwrite is None

    @property
    def firstread(self):
        return self.accessv.firstread

    @property
    def lastwrite(self):
        return self.accessv.lastwrite

    @property
    def has_uniform_subdims(self):
        return self.subdims_mapper is not None

    @cached_property
    def indexed(self):
        return self.buffer.indexed

    @cached_property
    def index_mapper_flat(self):
        ret = {}
        for mapper in self.index_mapper.values():
            ret.update(mapper)
        return ret

    @cached_property
    def writeto(self):
        """
        The `writeto` IterationSpace, that is the iteration space that must be
        iterated over in order to initialize the buffer.
        """
        intervals = []
        sub_iterators = {}
        directions = {}
        for d in self.buffer.dimensions:
            try:
                interval, si, direction = self.itintervals_mapper[d]
            except KeyError:
                # E.g., the contraction Dimension `db0`
                assert d in self.contraction_mapper.values()
                interval, si, direction = Interval(d, 0, 0), (), Forward
            intervals.append(interval)
            sub_iterators[d] = si
            directions[d] = direction

        relations = (self.buffer.dimensions,)
        intervals = IntervalGroup(intervals, relations=relations)

        return IterationSpace(intervals, sub_iterators, directions)

    @cached_property
    def written(self):
        """
        The `written` IterationSpace, that is the iteration space that must be
        iterated over in order to read all of the written buffer values.
        """
        intervals = []
        sub_iterators = {}
        directions = {}
        for dd in self.function.dimensions:
            d = dd.xreplace(self.subdims_mapper)
            try:
                interval, si, direction = self.itintervals_mapper[d]
            except KeyError:
                # E.g., d=time_sub
                assert d.is_NonlinearDerived
                d = d.root
                interval, si, direction = self.itintervals_mapper[d]
            intervals.append(interval)
            sub_iterators[d] = si + as_tuple(self.sub_iterators[d])
            directions[d] = direction

        relations = (tuple(i.dim for i in intervals),)
        intervals = IntervalGroup(intervals, relations=relations)

        return IterationSpace(intervals, sub_iterators, directions)

    @cached_property
    def lastmap(self):
        """
        A mapper from contracted Dimensions to a 2-tuple of indices representing,
        respectively, the "last" write to the buffer and the "last" read from the
        buffered Function. For example, `{time: (sb1, time+1)}`.
        """
        mapper = {}
        for d, m in self.index_mapper.items():
            try:
                func = max if self.written.directions[d.root] is Forward else min
                v = func(m)
            except TypeError:
                func = vmax if self.written.directions[d.root] is Forward else vmin
                v = func(*[Vector(i) for i in m])[0]
            mapper[d] = Map(m[v], v)

        return mapper

    @cached_property
    def initmap(self):
        """
        A mapper from contracted Dimensions to indices representing the min points
        for buffer initialization. For example, in the case of a forward-propagating
        `time` Dimension, we could have `{time: ((time_m + db0) % 2, time_m + db0)}`;
        likewise, for backwards, `{time: ((time_M - 2 + db0) % 4, time_M - 2 + db0)}`.
        """
        mapper = {}
        for d, bd in self.contraction_mapper.items():
            indices = list(self.index_mapper[d])

            # The buffer is initialized at `d_m(d_M) - offset`. E.g., a buffer with
            # six slots, used to replace a buffered Function accessed at `d-3`, `d`
            # and `d + 2`, will have `offset = 3`
            p, offset = offset_from_centre(d, indices)

            if self.written.directions[d.root] is Forward:
                v = p.subs(d.root, d.root.symbolic_min) - offset + bd
            else:
                v = p.subs(d.root, d.root.symbolic_max) - offset + bd

            mapper[d] = Map(v % bd.symbolic_size, v)

        return mapper
def stree_schedule(clusters):
    """
    Arrange an iterable of Clusters into a ScheduleTree.
    """
    stree = ScheduleTree()

    prev = Cluster(None)
    mapper = DefaultOrderedDict(lambda: Bunch(top=None, bottom=None))

    def reuse_metadata(c0, c1, d):
        return (c0.guards.get(d) == c1.guards.get(d) and
                c0.syncs.get(d) == c1.syncs.get(d))

    def attach_metadata(cluster, d, tip):
        if d in cluster.guards:
            tip = NodeConditional(cluster.guards[d], tip)
        if d in cluster.syncs:
            tip = NodeSync(cluster.syncs[d], tip)
        return tip

    for c in clusters:
        index = 0

        # Reuse or add in any Conditionals and Syncs outside of the outermost Iteration
        if not reuse_metadata(c, prev, None):
            tip = attach_metadata(c, None, stree)
            maybe_reusable = []
        else:
            try:
                tip = mapper[prev.itintervals[index]].top.parent
            except IndexError:
                tip = stree
            maybe_reusable = prev.itintervals

        for it0, it1 in zip(c.itintervals, maybe_reusable):
            if it0 != it1:
                break
            index += 1

            d = it0.dim

            # The reused sub-trees might acquire new sub-iterators as well as
            # new properties
            mapper[it0].top.ispace = IterationSpace.union(mapper[it0].top.ispace,
                                                          c.ispace.project([d]))
            mapper[it0].top.properties = normalize_properties(mapper[it0].top.properties,
                                                              c.properties[it0.dim])

            # Different guards or SyncOps cannot further be nested
            if not reuse_metadata(c, prev, d):
                tip = mapper[it0].top
                tip = attach_metadata(c, d, tip)
                mapper[it0].bottom = tip
                break
            else:
                tip = mapper[it0].bottom

        # Nested sub-trees, instead, will not be used anymore
        for it in prev.itintervals[index:]:
            mapper.pop(it)

        # Add in Iterations, Conditionals, and Syncs
        for it in c.itintervals[index:]:
            d = it.dim
            tip = NodeIteration(c.ispace.project([d]), tip, c.properties.get(d, ()))
            mapper[it].top = tip
            tip = attach_metadata(c, d, tip)
            mapper[it].bottom = tip

        # Add in Expressions
        exprs = []
        for conditionals, g in groupby(c.exprs, key=lambda e: e.conditionals):
            exprs = list(g)

            # Indirect ConditionalDimensions induce expression-level guards
            if conditionals:
                guard = And(*conditionals.values(), evaluate=False)
                parent = NodeConditional(guard, tip)
            else:
                parent = tip

            NodeExprs(exprs, c.ispace, c.dspace, c.ops, c.traffic, parent)

        # Prepare for next iteration
        prev = c

    return stree
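The expression grouping at the end of the newer `stree_schedule` relies on `itertools.groupby` collapsing only consecutive items with equal keys. A toy run on plain tuples (made-up guard tags and expression names) shows the behaviour: guarded runs get their own group, unguarded runs attach directly.

from itertools import groupby

exprs = [('', 'eq0'), ('', 'eq1'), ('if-k', 'eq2'), ('', 'eq3')]
runs = [(key, [e for _, e in g]) for key, g in groupby(exprs, key=lambda e: e[0])]
assert runs == [('', ['eq0', 'eq1']), ('if-k', ['eq2']), ('', ['eq3'])]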