def run(expr):
    """
    Recursively rebuild ``expr``, collecting numeric constants on the way up.

    Returns a 2-tuple ``(rebuilt expression, factorization candidates)``.
    ``aggressive``, ``collect``, ``collect_const`` and ``flatten`` are taken
    from the enclosing scope.
    """
    # Numbers and symbols are returned as-is and are themselves candidates
    if expr.is_Number or expr.is_Symbol:
        return expr, [expr]

    # Any other leaf is returned untouched, with no candidates
    if expr.is_Indexed or expr.is_Atom:
        return expr, []

    if expr.is_Add:
        terms, found = zip(*[run(arg) for arg in expr.args])

        # Partition the terms: those having a Number among their args go
        # through `collect_const`, the others are (optionally) collected
        # w.r.t. the candidates gathered from the sub-expressions
        numeric = [t for t in terms if any(a.is_Number for a in t.args)]
        symbolic = [t for t in terms if t not in numeric]

        numeric = collect_const(expr.func(*numeric))
        symbolic = expr.func(*symbolic)

        if aggressive is True and symbolic:
            for candidate in flatten(found):
                symbolic = collect(symbolic, candidate)

        # An Add never propagates candidates upwards
        return expr.func(numeric, symbolic), []

    if expr.is_Mul:
        factors, found = zip(*[run(arg) for arg in expr.args])
        return collect_const(expr.func(*factors)), flatten(found)

    if expr.is_Equality:
        sides, found = zip(*[run(expr.lhs), run(expr.rhs)])
        return expr.func(*sides, evaluate=False), flatten(found)

    # Generic operation: rebuild it from its rebuilt arguments
    operands, found = zip(*[run(arg) for arg in expr.args])
    return expr.func(*operands), flatten(found)
def wrapper(self, state, **kwargs):
    """
    Run the wrapped pass ``func`` (from the enclosing scope) on every
    Cluster in ``state``, then record timing and, if profiling is enabled,
    the estimated operation count.
    """
    # Invoke the pass on each Cluster and store the flattened output
    start = time()
    outputs = [func(self, cluster, state.template, **kwargs)
               for cluster in state.clusters]
    state.update(flatten(outputs))
    stop = time()

    # Profiling: the key is the pass name plus the number of timings so far
    key = '%s%d' % (func.__name__, len(state.timings))
    state.timings[key] = stop - start
    if self.profile:
        # Only dense Clusters contribute to the operation count
        dense_exprs = [cluster.exprs for cluster in state.clusters
                       if cluster.is_dense]
        state.ops[key] = estimate_cost(flatten(dense_exprs))
def _C_make_dataobj(self, data):
    """
    A ctypes object representing the DiscreteFunction that can be passed to
    an Operator.

    The returned value is a ``byref`` reference to a freshly created
    ``self._C_ctype._type_`` instance whose fields are filled from ``data``
    and from the size/offset attributes of ``self``.
    """
    dataobj = byref(self._C_ctype._type_())
    struct = dataobj._obj

    struct.data = data.ctypes.data_as(c_void_p)
    struct.size = (c_int*self.ndim)(*data.shape)

    # MPI-related fields
    # `npsize` is the shape of `data` minus the total padding per dimension
    unpadded = [extent - sum(padding)
                for extent, padding in zip(data.shape, self._size_padding)]
    struct.npsize = (c_int*self.ndim)(*unpadded)
    struct.dsize = (c_int*self.ndim)(*self._size_domain)
    # Two entries per dimension for halo sizes and halo/owned offsets
    struct.hsize = (c_int*(self.ndim*2))(*flatten(self._size_halo))
    struct.hofs = (c_int*(self.ndim*2))(*flatten(self._offset_halo))
    struct.oofs = (c_int*(self.ndim*2))(*flatten(self._offset_owned))

    return dataobj
def visit_Operator(self, o):
    """
    Generate the ``c.Module`` for the Operator ``o``: header lines, includes,
    type declarations, prototypes of local elemental functions, the kernel
    itself and finally the elemental function definitions.
    """
    # Kernel signature and body (the body always ends with `return 0`)
    statements = flatten(self._visit(child) for child in o.children)
    params = self._args_decl(o.parameters)
    signature = c.FunctionDeclaration(c.Value(o.retval, o.name), params)
    statements = statements + [c.Statement("return 0")]
    kernel = c.FunctionBody(signature, c.Block(statements))

    # Elemental functions: only the local ones are emitted
    prototypes = []
    definitions = [blankline]
    for entry in o._func_table.values():
        if not entry.local:
            continue
        func = entry.root
        prototypes.append(
            c.FunctionDeclaration(c.Value(func.retval, func.name),
                                  self._args_decl(func.parameters))
        )
        definitions += [func.ccode, blankline]

    # Header files, extra definitions, ...
    header = [c.Line(line) for line in o._headers]
    includes = [c.Include(inc, system=False) for inc in o._includes]
    includes.append(blankline)
    typedecls = [p._C_typedecl for p in o.parameters
                 if p._C_typedecl is not None]
    typedecls = filter_sorted(typedecls, key=lambda decl: decl.tpname)
    if o._compiler.src_ext == 'cpp':
        typedecls += [c.Extern('C', signature)]

    # Separate each declaration with a blank line
    spaced = []
    for decl in typedecls:
        spaced.extend((decl, blankline))

    return c.Module(header + includes + spaced + prototypes +
                    [blankline, kernel] + definitions)
def set_dle_mode(mode):
    """
    Normalize a user-provided DLE mode into a ``(mode, options)`` pair.

    * Any falsy ``mode`` (``None``, ``''``, ``()``, ...) or a string is
      returned unchanged, paired with an empty options dict.
    * A 2-tuple whose last item is a dict is returned as-is.
    * Any other tuple has its (non-dict) items split on commas and
      flattened into a tuple of mode names; a trailing dict, if present,
      becomes the options.

    Raises
    ------
    TypeError
        If ``mode`` is none of the above.
    """
    # Note: this guard also catches the empty tuple
    if not mode or isinstance(mode, str):
        return mode, {}

    if not isinstance(mode, tuple):
        raise TypeError("Illegal DLE mode %s." % str(mode))

    trailing = mode[-1]
    if not isinstance(trailing, dict):
        # No options given, only (possibly comma-separated) mode names
        return tuple(flatten(i.split(',') for i in mode)), {}

    if len(mode) == 2:
        # Already in (mode, options) form
        return mode

    return tuple(flatten(i.split(',') for i in mode[:-1])), trailing
def xreplace_indices(exprs, mapper, key=None, only_rhs=False):
    """
    Replace array indices in expressions.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which the replacement is applied.
    mapper : dict
        The substitution rules.
    key : list of symbols or callable
        An escape hatch to rule out some objects from the replacement.
        If a list, apply the replacement to the symbols in ``key`` only.
        If a callable, apply the replacement to a symbol S if and only if
        ``key(S)`` gives True.
    only_rhs : bool, optional
        If True, apply the replacement to Eq right-hand sides only.

    Returns
    -------
    A list of replaced expressions if ``exprs`` is an iterable, otherwise
    the single replaced expression.
    """
    def target(expr):
        # Where to look for indexed objects within `expr`
        return expr.rhs if only_rhs is True else expr

    candidates = flatten(retrieve_indexed(target(e)) for e in as_tuple(exprs))

    # Optionally narrow down the indexed objects subjected to replacement
    if isinstance(key, Iterable):
        candidates = [ind for ind in candidates if ind.base.label in key]
    elif callable(key):
        candidates = [ind for ind in candidates if key(ind)]

    # Map each retained indexed object to its replaced counterpart, then
    # apply this derived mapping to the full expressions
    substitutions = {ind: ind.xreplace(mapper) for ind in candidates}
    replaced = [e.xreplace(substitutions) for e in as_tuple(exprs)]

    if isinstance(exprs, Iterable):
        return replaced
    return replaced[0]
def unknown(self):
    """
    Return all symbols appearing in self for which a node is not available.

    That is, the functions read on the right-hand side of any node, minus
    the functions of the nodes themselves.
    """
    known = {node.function for node in self.values()}
    terminals = flatten(retrieve_terminals(node.rhs) for node in self.values())
    reads = {terminal.function for terminal in terminals}
    return reads - known
def __init__(self, exprs, **kwargs):
    """
    Build the FlowGraph from ``exprs``.

    The expressions are first converted to SSA form; then, for each
    left-hand side, the sets of nodes it reads and it is read by are
    computed. Finally the nodes are ordered and handed to the parent
    constructor, and the space/time indices are derived.

    Parameters
    ----------
    exprs : list of expr-like
        The input expressions (passed through ``makeit_ssa``).
    **kwargs
        Forwarded to the parent class constructor.
    """
    # Always convert to SSA
    exprs = makeit_ssa(exprs)
    mapper = OrderedDict([(i.lhs, i) for i in exprs])
    # Each LHS must be unique, otherwise entries would have been overwritten
    assert len(set(mapper)) == len(exprs), "not SSA Cluster?"

    # Construct the Nodes, tracking reads and readby
    # `tensor_map`: symbol -> all LHSs written through that symbol
    tensor_map = DefaultOrderedDict(list)
    for i in mapper:
        tensor_map[as_symbol(i)].append(i)
    reads = DefaultOrderedDict(set)
    readby = DefaultOrderedDict(set)
    for k, v in mapper.items():
        handle = retrieve_terminals(v.rhs)
        # Also account for the terminals appearing within array indices;
        # iterate over a copy since `handle` is grown in the loop
        for i in list(handle):
            if i.is_Indexed:
                for idx in i.indices:
                    handle |= retrieve_terminals(idx)
        reads[k].update(set(flatten([tensor_map.get(as_symbol(i), [])
                                     for i in handle])))
        for i in reads[k]:
            readby[i].add(k)

    # Make sure read-after-writes are honored for scalar nodes
    # Indexed LHSs keep their relative order; every other LHS is placed
    # before the earliest node reading it
    processed = [i for i in mapper if i.is_Indexed]
    queue = [i for i in mapper if i not in processed]
    while queue:
        k = queue.pop(0)
        if not readby[k] or k in readby[k]:
            # Unread, or self-reading: goes to the front
            processed.insert(0, k)
        elif all(i in processed for i in readby[k]):
            # All readers already placed: insert right before the first one
            index = min(processed.index(i) for i in readby[k])
            processed.insert(index, k)
        else:
            # Some readers are still pending: retry later
            # NOTE(review): this relies on the pending readers eventually
            # being placed; presumably guaranteed by the SSA form -- confirm
            queue.append(k)

    # Build up the FlowGraph
    nodes = [(i, Node(mapper[i], reads=reads[i], readby=readby[i]))
             for i in processed]
    super(FlowGraph, self).__init__(nodes, **kwargs)

    # Determine indices along the space and time dimensions
    # Only tensor nodes whose key is not flagged by `q_indirect` are used
    terms = [v for k, v in self.items() if v.is_Tensor and not q_indirect(k)]
    indices = filter_ordered(flatten([i.function.indices for i in terms]))
    self.space_indices = tuple(i for i in indices if i.is_Space)
    self.time_indices = tuple(i for i in indices if i.is_Time)
def estimate_cost(expr, estimate_functions=False):
    """
    Estimate the operation count of an expression.

    Parameters
    ----------
    expr : expr-like or list of expr-like
        One or more expressions for which the operation count is calculated.
        A dict, or a list of dicts, is also accepted, in which case the
        dict values are used.
    estimate_functions : bool, optional
        If True, known functions (``sin``, ``cos``) are weighted with an
        estimated operation count; any other function, as well as all
        functions when False (the default), count as a single operation.
        Only the truthiness of this argument is used.

    Returns
    -------
    The estimated operation count, or None (after emitting a warning) if
    the estimate could not be computed.
    """
    external_functions = {sin: 50, cos: 50}

    try:
        # Is it a plain SymPy object ?
        iter(expr)
    except TypeError:
        expr = [expr]

    try:
        # Is it a dict ?
        expr = expr.values()
    except AttributeError:
        try:
            # Must be a list of dicts then
            expr = flatten([i.values() for i in expr])
        except AttributeError:
            pass

    try:
        # At this point it must be a list of SymPy objects
        # We don't use SymPy's count_ops because we do not count integer arithmetic
        # (e.g., array index functions such as i+1 in A[i+1])
        # Also, the routine below is *much* faster than count_ops
        expr = [i.rhs if i.is_Equality else i for i in expr]
        operations = flatten(retrieve_ops(i) for i in expr)
        flops = 0
        for op in operations:
            if op.is_Function:
                if estimate_functions:
                    flops += external_functions.get(op.__class__, 1)
                else:
                    flops += 1
            else:
                # N operands -> N-1 operations; Integer operands are ignored
                flops += len(op.args) - (1 + sum(True for i in op.args if i.is_Integer))
        return flops
    except Exception:
        # Best-effort: never fail because of the cost model. `Exception`
        # (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate
        warning("Cannot estimate cost of %s" % str(expr))
        return None
def detect_io(exprs, relax=False):
    """
    ``{exprs} -> ({reads}, {writes})``

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        The searched expressions.
    relax : bool, optional
        If False, as by default, collect only the objects flagged as
        ``is_Input``. Otherwise, collect any object flagged as either
        ``is_Scalar`` or ``is_Tensor``.

    Returns
    -------
    A 2-tuple ``(reads, writes)``, each processed through ``filter_sorted``.
    """
    exprs = as_tuple(exprs)
    if relax is False:
        rule = lambda i: i.is_Input
    else:
        rule = lambda i: i.is_Scalar or i.is_Tensor

    # Don't forget this nasty case, with indirections on the LHS:
    # >>> u[t, a[x]] = f[x]  ->  (reads={a, f}, writes={u})

    roots = []
    for i in exprs:
        try:
            roots.append(i.rhs)
            roots.extend(list(i.lhs.indices))
        except AttributeError:
            # E.g., FunctionFromPointer
            roots.append(i)

    reads = []
    terminals = flatten(retrieve_terminals(i, deep=True) for i in roots)
    for i in terminals:
        # Work on a copy: the set returned by `free_symbols` must not be
        # mutated, as it may be shared with (or cached by) the expression
        candidates = set(i.free_symbols)
        try:
            candidates.add(i.function)
        except AttributeError:
            pass
        for j in candidates:
            try:
                if rule(j):
                    reads.append(j)
            except AttributeError:
                # Objects lacking the queried flags are simply not collected
                pass

    writes = []
    for i in exprs:
        try:
            f = i.lhs.function
        except AttributeError:
            continue
        if rule(f):
            writes.append(f)

    return filter_sorted(reads), filter_sorted(writes)
def _dist_scatter_mask(self):
    """
    A mask to index into ``self.data``, which creates a new data array that
    logically contains N consecutive groups of sparse data values, where N
    is the number of MPI ranks. The i-th group contains the sparse data
    values accessible by the i-th MPI rank.  Thus, sparse data values along
    the boundary of two or more MPI ranks are duplicated.
    """
    datamap = self._dist_datamap

    # Concatenate the entries of the data map, following the sorted keys
    entries = flatten(datamap[rank] for rank in sorted(datamap))
    mask = np.array(entries, dtype=int)

    # Take everything along all dimensions but the sparse one
    selectors = [slice(None)] * self.ndim
    selectors[self._sparse_position] = mask
    return tuple(selectors)
def fold_blockable_tree(node, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    node : Node
        The root of the Iteration/Expression tree to be processed.
    blockinner : bool, optional
        If False, the innermost Iteration is excluded from folding.
        Defaults to True.

    Returns
    -------
    The tree obtained by visiting ``node`` with a Transformer applying the
    computed substitutions.
    """
    mapper = {}
    for k, v in FindAdjacent(Iteration).visit(node).items():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            # (the scan stops at the first tree whose depth differs from
            # that of the first tree)
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            # Each item groups the Iterations found at the same depth
            # across the (reversed) trees
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                # Offsets of each folded Iteration relative to `root`
                folds = [(tuple(y-x for x, y in zip(i.offsets, root.offsets)), i.nodes)
                         for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                # The folded Iterations are mapped to None
                # Note: this rebinds the outer loop variable `k`, which is
                # not otherwise used
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = Transformer(mapper, nested=True).visit(node)

    return processed
def visit_Iteration(self, o): body = flatten(self._visit(i) for i in o.children) # Start if o.offsets[0] != 0: _min = str(o.limits[0] + o.offsets[0]) try: _min = eval(_min) except (NameError, TypeError): pass else: _min = o.limits[0] # Bound if o.offsets[1] != 0: _max = str(o.limits[1] + o.offsets[1]) try: _max = eval(_max) except (NameError, TypeError): pass else: _max = o.limits[1] # For backward direction flip loop bounds if o.direction == Backward: loop_init = 'int %s = %s' % (o.index, ccode(_max)) loop_cond = '%s >= %s' % (o.index, ccode(_min)) loop_inc = '%s -= %s' % (o.index, o.limits[2]) else: loop_init = 'int %s = %s' % (o.index, ccode(_min)) loop_cond = '%s <= %s' % (o.index, ccode(_max)) loop_inc = '%s += %s' % (o.index, o.limits[2]) # Append unbounded indices, if any if o.uindices: uinit = ['%s = %s' % (i.name, ccode(i.symbolic_min)) for i in o.uindices] loop_init = c.Line(', '.join([loop_init] + uinit)) ustep = ['%s = %s' % (i.name, ccode(i.symbolic_incr)) for i in o.uindices] loop_inc = c.Line(', '.join([loop_inc] + ustep)) # Create For header+body handle = c.For(loop_init, loop_cond, loop_inc, c.Block(body)) # Attach pragmas, if any if o.pragmas: handle = c.Module(o.pragmas + (handle,)) return handle
def _index_matrix(self, offset):
    """
    Return the indirection indices of all adjacent grid points, shifted by
    ``offset``, along with an ordered mapper from each unique indirection
    index to a dedicated Symbol.
    """
    # Note about the use of *memoization*
    # Since this method is called by `_interpolation_indices`, using
    # memoization avoids a proliferation of symbolically identical
    # ConditionalDimensions for a given set of indirection indices

    # List of indirection indices for all adjacent grid points
    index_matrix = []
    for increments in self._point_increments:
        row = tuple(idx + inc + offset
                    for inc, idx in zip(increments, self._coordinate_indices))
        index_matrix.append(row)

    # A unique symbol for each indirection index
    points = OrderedDict()
    for count, point in enumerate(filter_ordered(flatten(index_matrix))):
        points[point] = Symbol(name='ii_%s_%d' % (self.name, count))

    return index_matrix, points
def handle_indexed(indexed):
    """
    Return, as a tuple, the Dimensions found by scanning the indices of
    ``indexed`` in order.
    """
    found = []
    for index in indexed.indices:
        try:
            maybe_dim = split_affine(index).var
        except ValueError:
            # Maybe there are some nested Indexeds (e.g., the situation is A[B[i]])
            nested = flatten(handle_indexed(n) for n in retrieve_indexed(index))
            if not nested:
                # Fallback: Just insert all the Dimensions we find, regardless of
                # what the user is attempting to do
                nested = [d for d in filter_sorted(index.free_symbols)
                          if isinstance(d, Dimension)]
            found.extend(nested)
        else:
            if isinstance(maybe_dim, Dimension):
                found.append(maybe_dim)
    return tuple(found)
def st_section(stree):
    """
    Add NodeSections to a ScheduleTree. A NodeSection, or simply "section",
    defines a sub-tree with the following properties:

        * The root is a node of type NodeSection;
        * The immediate children of the root are nodes of type NodeIteration;
        * The Dimensions of the immediate children are either:
            * identical, OR
            * different, but all of type SubDimension;
        * The Dimension of the immediate children cannot be a TimeDimension.
    """

    class Section(object):
        """A group of sibling nodes that will be placed under a NodeSection."""

        def __init__(self, node):
            self.parent = node.parent
            self.dim = node.dim
            self.nodes = [node]

        def is_compatible(self, node):
            # Same parent and same root Dimension
            same_parent = self.parent == node.parent
            return same_parent and self.dim.root == node.dim.root

    # Search candidate sections
    sections = []
    for depth in range(stree.height):
        # Find all sections at depth `depth`
        current = None
        for node in findall(stree, filter_=lambda n: n.depth == depth):
            sectioned = flatten(s.nodes for s in sections)
            if any(ancestor in sectioned for ancestor in node.ancestors):
                # Already within a section
                continue

            if not node.is_Iteration or node.dim.is_Time:
                # Not eligible; any open section is closed
                current = None
            elif current is None or not current.is_compatible(node):
                # Open a new section
                current = Section(node)
                sections.append(current)
            else:
                current.nodes.append(node)

    # Transform the schedule tree by adding in sections
    for section in sections:
        insert(NodeSection(), section.parent, section.nodes)

    return stree
def instrument(self, iet):
    """
    Enrich the Iteration/Expression tree ``iet`` adding nodes for C-level
    performance profiling. In particular, turn all :class:`Section`s within
    ``iet`` into :class:`TimedList`s.

    As a side effect, a SectionData entry is stored in ``self._sections``
    for each Section found.
    """
    sections = FindNodes(Section).visit(iet)
    for section in sections:
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(bundle.ops for bundle in bundles)

        # Operation count at each section iteration
        all_exprs = flatten(bundle.exprs for bundle in bundles)
        sops = sum(estimate_cost(e.expr) for e in all_exprs)

        # Total memory traffic
        by_key = {}
        for bundle in bundles:
            for k, v in bundle.traffic.items():
                by_key.setdefault(k, []).append(v)
        merged = [IntervalGroup.generate('merge', *group)
                  for group in by_key.values()]
        traffic = sum(group.size for group in merged)

        # Each ExpressionBundle lives in its own iteration space
        itershapes = [bundle.shape for bundle in bundles]

        # Track how many grid points are written within `section`
        per_bundle = []
        for bundle in bundles:
            written = {e.write for e in bundle.exprs
                       if e.is_tensor and e.write.is_TimeFunction}
            per_bundle.append(reduce(mul, bundle.shape)*len(written))
        points = sum(per_bundle)

        self._sections[section] = SectionData(ops, sops, points, traffic,
                                              itershapes)

    # Transform the Iteration/Expression tree introducing the C-level timers
    timed = {}
    for section in sections:
        timed[section] = TimedList(timer=self.timer, lname=section.name,
                                   body=section)
    return Transformer(timed).visit(iet)
def compose_nodes(nodes, retrieve=False):
    """
    Build an IET by nesting ``nodes``.

    If the first node is not an Iteration, there is nothing to nest: the
    flattened nodes are returned (wrapped in a List if more than one).
    Otherwise each node is rebuilt with the following one as body, starting
    from the last.

    If ``retrieve`` is True, return ``(body, tree)`` where ``tree`` lists
    the rebuilt nodes from the outermost to the innermost; otherwise return
    only ``body``.
    """
    pending = list(nodes)
    tree = []

    if isinstance(pending[0], Iteration):
        # The last node is the innermost body; wrap it outwards
        body = pending.pop()
        for outer in reversed(pending):
            body = outer._rebuild(body, **outer.args_frozen)
            tree.append(body)
    else:
        # Nothing to compose
        flat = flatten(pending)
        body = List(body=flat) if len(flat) > 1 else flat[0]

    if retrieve is True:
        return body, tree[::-1]
    return body
def visit_Callable(self, o):
    """
    Generate a ``c.FunctionBody`` for the Callable ``o``: a declaration
    built from its return value, name and parameters, plus a block holding
    the visited children.
    """
    statements = flatten(self._visit(child) for child in o.children)
    params = self._args_decl(o.parameters)
    prototype = c.FunctionDeclaration(c.Value(o.retval, o.name), params)
    return c.FunctionBody(prototype, c.Block(statements))
def visit_Block(self, o):
    """
    Generate a ``c.Module`` made of the header of ``o``, a ``c.Block``
    holding the visited children, and the footer of ``o``.
    """
    statements = flatten(self._visit(child) for child in o.children)
    block = c.Block(statements)
    return c.Module(o.header + (block,) + o.footer)
def visit_Section(self, o):
    """
    Generate a ``c.Module`` with the visited children of ``o`` enclosed
    between a "Begin <name>" and an "End <name>" comment.
    """
    begin = c.Comment("Begin %s" % o.name)
    contents = flatten(self._visit(child) for child in o.children)
    end = c.Comment("End %s" % o.name)

    module_body = [begin]
    module_body.extend(contents)
    module_body.append(end)
    return c.Module(module_body)
def cire(cluster, template, mode, options, platform):
    """
    Cross-iteration redundancies elimination.

    Parameters
    ----------
    cluster : Cluster
        Input Cluster, subject of the optimization pass.
    template : callable
        To build the symbols (temporaries) storing the redundant expressions.
    mode : str
        The transformation mode. Accepted: ['invariants', 'sops'].
        * 'invariants' is for sub-expressions that are invariant w.r.t. one or
          more Dimensions.
        * 'sops' stands for sums-of-products, that is redundancies are searched
          across all expressions in sum-of-product form.
    options : dict
        The optimization mode. Accepted: ['min-storage', 'cire-repeats'].
        * 'min-storage': if True, the pass will try to minimize the amount of
          storage introduced for the tensor temporaries. This might also reduce
          the operation count. On the other hand, this might affect fusion and
          therefore data locality. Defaults to False (legacy).
        * 'cire-repeats': a dict mapping each mode to the (positive) number
          of iterations of the main loop.
    platform : Platform
        The underlying platform. Used to optimize the shape of the introduced
        tensor symbols.

    Returns
    -------
    A list with the Clusters created along the way, followed by the
    (possibly rebuilt) input Cluster.

    Examples
    --------
    1) 'invariants'. Below is an expensive sub-expression invariant w.r.t. `t`

    t0 = (cos(a[x,y,z])*sin(b[x,y,z]))*c[t,x,y,z]

    becomes

    t1[x,y,z] = cos(a[x,y,z])*sin(b[x,y,z])
    t0 = t1[x,y,z]*c[t,x,y,z]

    2) 'sops'. Below are redundant sub-expressions in sum-of-product form (in this
    case, the sum degenerates to a single product).

    t0 = 2.0*a[x,y,z]*b[x,y,z]
    t1 = 3.0*a[x,y,z+1]*b[x,y,z+1]

    becomes

    t2[x,y,z] = a[x,y,z]*b[x,y,z]
    t0 = 2.0*t2[x,y,z]
    t1 = 3.0*t2[x,y,z+1]
    """
    # Sanity checks
    assert mode in ['invariants', 'sops']
    assert all(i > 0 for i in options['cire-repeats'].values())

    # Relevant options
    min_storage = options['min-storage']

    # Setup callbacks
    # Each factory returns the 4-tuple (extractor, model, ignore_collected,
    # selector) consumed by the main loop below
    def callbacks_invariants(context, *args):
        extractor = make_is_time_invariant(context)
        model = lambda e: estimate_cost(e, True) >= MIN_COST_ALIAS_INV
        ignore_collected = lambda g: False
        selector = lambda c, n: c >= MIN_COST_ALIAS_INV and n >= 1
        return extractor, model, ignore_collected, selector

    def callbacks_sops(context, n):
        # The `depth` determines "how big" the extracted sum-of-products will be.
        # We observe that in typical FD codes:
        #   add(mul, mul, ...) -> stems from first order derivative
        #   add(mul(add(mul, mul, ...), ...), ...) -> stems from second order derivative
        # To catch the former, we would need `depth=1`; for the latter, `depth=3`
        depth = 2 * n + 1

        extractor = lambda e: q_sum_of_product(e, depth)
        model = lambda e: not (q_leaf(e) or q_terminalop(e, depth - 1))
        ignore_collected = lambda g: len(g) <= 1
        # Note: the `n` below is the lambda's own parameter, not the
        # enclosing function's `n`
        selector = lambda c, n: c >= MIN_COST_ALIAS and n > 1
        return extractor, model, ignore_collected, selector

    callbacks_mapper = {
        'invariants': callbacks_invariants,
        'sops': callbacks_sops
    }

    # The main CIRE loop
    processed = []
    context = cluster.exprs
    # `n` counts down from `cire-repeats[mode] - 1` to 0
    for n in reversed(range(options['cire-repeats'][mode])):
        # Get the callbacks
        extractor, model, ignore_collected, selector = callbacks_mapper[mode](
            context, n)

        # Extract potentially aliasing expressions
        exprs, extracted = extract(cluster, extractor, model, template)
        if not extracted:
            # Do not waste time
            continue

        # There can't be Dimension-dependent data dependences with any of
        # the `processed` Clusters, otherwise we would risk either OOB accesses
        # or reading from garbage uncomputed halo
        scope = Scope(exprs=flatten(c.exprs for c in processed) + extracted)
        if not all(i.is_indep() for i in scope.d_all_gen()):
            # Unlike the `continue`s, this gives up on all remaining rounds
            break

        # Search aliasing expressions
        aliases = collect(extracted, min_storage, ignore_collected)

        # Rule out aliasing expressions with a bad flops/memory trade-off
        chosen, others = choose(exprs, aliases, selector)
        if not chosen:
            # Do not waste time
            continue

        # Create Aliases and assign them to Clusters
        clusters, subs = process(cluster, chosen, aliases, template, platform)

        # Rebuild `cluster` so as to use the newly created Aliases
        rebuilt = rebuild(cluster, others, aliases, subs)

        # Prepare for the next round
        processed.extend(clusters)
        cluster = rebuilt
        context = flatten(c.exprs for c in processed) + list(cluster.exprs)

    # The (possibly rebuilt) input Cluster always comes last
    processed.append(cluster)

    return processed
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    The behaviour is driven by two entries of ``self.params``:
    ``blockinner`` (if falsy, the innermost parallel Iteration is not
    blocked) and ``blockalways`` (if truthy, the heuristic that skips trees
    not embedded in a sequential loop is disabled).

    Returns
    -------
    A 2-tuple ``(iet, metadata)`` where ``iet`` is the transformed tree and
    ``metadata`` is a dict with keys ``'dimensions'`` (the created
    BlockDimensions), ``'efuncs'`` (the created elemental functions) and
    ``'args'`` (the ``step`` of each BlockDimension).
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not (tree.root.is_Sequential or iet.is_Callable) and not blockalways:
            # Heuristic: avoid polluting the generated code with blocked
            # nests (thus increasing JIT compilation time and affecting
            # readability) if the blockable tree isn't embedded in a
            # sequential loop (e.g., a timestepping loop)
            continue

        # Apply loop blocking to `tree`
        interb = []
        intrab = []
        for i in iterations:
            # `len(mapper)` (number of trees blocked so far) makes the
            # name differ across blocked trees
            d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
            block_dims.append(d)
            # Build Iteration over blocks
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())
            interb.append(Iteration([], d, d.symbolic_max, properties=properties))
            # Build Iteration within a block
            intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))

        # Construct the blocked tree
        blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges
        # For each blocked Iteration there are two ranges: the main one,
        # stepping by the block size, and the remainder one
        ranges = []
        for i, bi in zip(iterations, interb):
            maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
            ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        # One Call for each combination of main/remainder ranges
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for bi, (m, M, b) in zip(interb, p):
                dynamic_args_mapper[bi.dim] = (m, M)
                dynamic_args_mapper[bi.dim.step] = (b,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

    iet = Transformer(mapper).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}
def visit_Block(self, o):
    """
    Generate a ``c.Module`` concatenating ``o.header``, a ``c.Block`` with
    the visited children of ``o``, and ``o.footer``.
    """
    visited = flatten(self._visit(child) for child in o.children)
    wrapped = (c.Block(visited),)
    return c.Module(o.header + wrapped + o.footer)
def _fetch_scope(self, clusters):
    """
    Return the Scope for the expressions in ``clusters``, building and
    caching it in ``self.state.scopes`` upon first request. The cache key
    is the tuple of expressions.
    """
    exprs = flatten(cluster.exprs for cluster in as_tuple(clusters))
    key = tuple(exprs)

    cache = self.state.scopes
    if key not in cache:
        cache[key] = Scope(exprs)
    return cache[key]
def _fetch_scope(self, clusters):
    """
    Return the Scope for the expressions in ``clusters``, building and
    caching it in ``self.state.scopes`` upon first request. The cache key
    is the tuple of Clusters.
    """
    key = as_tuple(clusters)

    cache = self.state.scopes
    if key not in cache:
        exprs = flatten(cluster.exprs for cluster in key)
        cache[key] = Scope(exprs)
    return cache[key]
def detect_io(exprs, relax=False):
    """
    ``{exprs} -> ({reads}, {writes})``

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        The searched expressions.
    relax : bool, optional
        If False, as by default, collect only Constants and Functions.
        Otherwise, collect any Basic object.
    """
    exprs = as_tuple(exprs)

    if relax is False:
        def rule(obj):
            return obj.is_Input
    else:
        def rule(obj):
            return obj.is_Scalar or obj.is_Tensor

    # Don't forget the nasty case with indirections on the LHS:
    # >>> u[t, a[x]] = f[x]  ->  (reads={a, f}, writes={u})

    # Collect the sub-expressions in which reads are to be searched
    roots = []
    for expr in exprs:
        try:
            roots.append(expr.rhs)
            roots.extend(list(expr.lhs.indices))
            roots.extend(list(expr.conditionals.values()))
        except AttributeError:
            # E.g., FunctionFromPointer
            roots.append(expr)

    reads = []
    for terminal in flatten(retrieve_terminals(r, deep=True) for r in roots):
        candidates = set(terminal.free_symbols)
        try:
            candidates.add(terminal.function)
        except AttributeError:
            pass
        for candidate in candidates:
            try:
                if rule(candidate):
                    reads.append(candidate)
            except AttributeError:
                # The candidate lacks the queried flags: not collected
                pass

    writes = []
    for expr in exprs:
        try:
            function = expr.lhs.function
        except AttributeError:
            continue
        try:
            if rule(function):
                writes.append(function)
        except AttributeError:
            # We only end up here after complex IET transformations which make
            # use of composite types
            assert isinstance(expr.lhs, FunctionFromPointer)
            function = expr.lhs.base.function
            if rule(function):
                writes.append(function)

    return filter_sorted(reads), filter_sorted(writes)
def visit_HaloSpot(self, o):
    """Generate a ``c.Collection`` with the visited children of ``o``."""
    visited = flatten(self._visit(child) for child in o.children)
    return c.Collection(visited)
def generate_loops(self, loop_body):
    """Assuming that the variable order defined in init (#var_order) is the
    order the corresponding dimensions are layout in memory, the last variable
    in that definition should be the fastest varying dimension in the arrays.
    Therefore reverse the list of dimensions, making the last variable in
    #var_order (z in the 3D case) vary in the inner-most loop

    :param loop_body: Statement representing the loop body
    :returns: :class:`cgen.Block` representing the loop
    """
    # Space loops
    # An empty `cgen.Block` means there is no spatial computation at all
    if not isinstance(loop_body, cgen.Block) or len(loop_body.contents) > 0:
        if self.cache_blocking is not None:
            self._decide_block_sizes()
            loop_body = self.generate_space_loops_blocking(loop_body)
        else:
            loop_body = self.generate_space_loops(loop_body)
    else:
        loop_body = []

    # OpenMP pragmas are emitted only if the compiler has OpenMP enabled
    openmp = self.compiler.openmp
    omp_master = [cgen.Pragma("omp master")] if openmp else []
    omp_single = [cgen.Pragma("omp single")] if openmp else []
    omp_parallel = [cgen.Pragma("omp parallel")] if openmp else []
    omp_for = [cgen.Pragma("omp for schedule(static)")] if openmp else []

    t_loop_limits = self.time_loop_limits
    t_var = str(self._mapper[self.time_dim])
    cond_op = "<" if self._forward else ">"

    if self.save is not True:
        # To cycle between array elements when we are not saving time history
        time_stepping = self.get_time_stepping()
    else:
        time_stepping = []

    if len(loop_body) > 0:
        loop_body = [cgen.Block(omp_for + loop_body)]

    # Statements to be inserted into the time loop before the spatial loop
    # Note: the conversion is applied to the *substituted* stencils; it used
    # to iterate the original list again, thus discarding the substitutions
    pre_stencils = [self.time_substitutions(x)
                    for x in self.time_loop_stencils_b]
    pre_stencils = [self.convert_equality_to_cgen(x)
                    for x in pre_stencils]

    # Statements to be inserted into the time loop after the spatial loop
    post_stencils = [self.time_substitutions(x)
                     for x in self.time_loop_stencils_a]
    post_stencils = [self.convert_equality_to_cgen(x)
                     for x in post_stencils]

    if self.profile:
        # Each stencil gets its own, individually named, profiling section
        pre_stencils = list(flatten([
            self.profiler.add_profiling([s], "%s%d" % (PRE_STENCILS.name, i))
            for i, s in enumerate(pre_stencils)
        ]))
        post_stencils = list(flatten([
            self.profiler.add_profiling([s], "%s%d" % (POST_STENCILS.name, i))
            for i, s in enumerate(post_stencils)
        ]))

    initial_block = time_stepping + pre_stencils
    if initial_block:
        initial_block = omp_single + [cgen.Block(initial_block)]

    end_block = post_stencils
    if end_block:
        end_block = omp_single + [cgen.Block(end_block)]

    if self.profile:
        loop_body = self.profiler.add_profiling(loop_body, LOOP_BODY.name,
                                                omp_flag=omp_master)

    # The time loop encloses the pre-stencils, the space loops and the
    # post-stencils
    loop_body = cgen.Block(initial_block + loop_body + end_block)
    loop_body = cgen.For(
        cgen.InlineInitializer(cgen.Value("int", t_var), str(t_loop_limits[0])),
        t_var + cond_op + str(t_loop_limits[1]),
        t_var + "+=" + str(self._time_step),
        loop_body
    )

    # Code to declare the time stepping variables (outside the time loop)
    def_time_step = [cgen.Value("int", t_var_def.name)
                     for t_var_def in self.time_steppers]
    body = def_time_step + omp_parallel + [loop_body]

    return cgen.Block(body)
def visit_Lambda(self, o):
    """
    Generate a ``LambdaCollection`` for ``o``: a ``[captures](parameters)``
    line followed by a ``c.Block`` with the visited children.
    """
    statements = flatten(self._visit(child) for child in o.children)

    capture_list = ', '.join([str(capture) for capture in o.captures])
    param_list = ', '.join([decl.inline()
                            for decl in self._args_decl(o.parameters)])
    top = c.Line('[%s](%s)' % (capture_list, param_list))

    return LambdaCollection([top, c.Block(statements)])
def visit_Callable(self, o):
    """
    Generate the ``c.FunctionBody`` for the Callable ``o``, i.e. its
    declaration (return value, name, parameters) and a block with the
    visited children.
    """
    visited = flatten(self._visit(child) for child in o.children)
    block = c.Block(visited)

    retval = c.Value(o.retval, o.name)
    declaration = c.FunctionDeclaration(retval, self._args_decl(o.parameters))

    return c.FunctionBody(declaration, block)
def visit_List(self, o):
    """
    Generate a ``c.Module`` made of the header of ``o``, a ``c.Collection``
    holding the visited children, and the footer of ``o``.
    """
    visited = flatten(self._visit(child) for child in o.children)
    collection = c.Collection(visited)
    return c.Module(o.header + (collection,) + o.footer)
def _call_sendrecv(self, name, *args, **kwargs):
    """
    Build a Call to ``name`` whose arguments are the flattened ``args``.
    Keyword arguments are accepted but not used.
    """
    arguments = flatten(args)
    return Call(name, arguments)
def rewrite(clusters, mode='advanced'):
    """
    Given a sequence of N Clusters, produce a sequence of M Clusters with reduced
    operation count, with M >= N.

    Parameters
    ----------
    clusters : list of Cluster
        The Clusters to be transformed.
    mode : str, optional
        The aggressiveness of the rewrite. Accepted:
        - ``noop``: Do nothing.
        - ``basic``: Apply common sub-expressions elimination.
        - ``advanced``: Apply all transformations that will reduce the
                        operation count w/ minimum increase to the memory pressure,
                        namely 'basic', factorization, CIRE for time-invariants only.
        - ``speculative``: Like 'advanced', but apply CIRE also to time-varying
                           sub-expressions, which might further increase the memory
                           pressure.
        - ``aggressive``: Like 'speculative', but apply CIRE to any non-trivial
                          sub-expression (i.e., anything that is at least in a
                          sum-of-product form).
                          Further, seek and drop cross-cluster redundancies (this
                          is the only pass that attempts to optimize *across*
                          clusters, rather than within a cluster).
                          The 'aggressive' mode may substantially increase the
                          symbolic processing time; it may or may not reduce the
                          JIT-compilation time; it may or may not improve the
                          overall runtime performance.

    Raises
    ------
    ValueError
        If ``mode`` is neither None nor a string.
    """
    if mode is not None and not isinstance(mode, str):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    # Nothing to do
    if mode is None or mode == 'noop':
        return clusters

    # Unknown modes are reported, but the input is returned untouched
    if mode not in dse_registry:
        dse_warning("Unknown rewrite mode(s) %s" % mode)
        return clusters

    # 1) Local optimization
    # ---------------------
    # We use separate rewriters for dense and sparse clusters; sparse clusters have
    # non-affine index functions, thus making it basically impossible, in general,
    # to apply the more advanced DSE passes.
    # Note: the sparse rewriter uses the same template for temporaries as
    # the dense rewriter, thus temporaries are globally unique
    dense_rewriter = modes[mode]()
    sparse_rewriter = BasicRewriter(False, dense_rewriter.template)

    def optimize(cluster):
        if cluster.is_dense:
            return dense_rewriter.run(cluster)
        return sparse_rewriter.run(cluster)

    processed = ClusterGroup(flatten(optimize(c) for c in clusters))

    # 2) Cluster grouping
    # -------------------
    # Different clusters may have created new (smaller) clusters which are
    # potentially groupable within a single cluster
    processed = groupby(processed)

    # 3) Global optimization
    # ----------------------
    # After grouping, there may be redundancies in one or more clusters. This final
    # pass searches and drops such redundancies
    if mode == 'aggressive':
        processed = cross_cluster_cse(processed)

    return processed.finalize()
def visit_Iteration(self, o):
    """
    Collect the symbols found in the children of ``o`` plus those given by
    ``self.rule(o)``, filtered and sorted by name.
    """
    found = flatten([self._visit(child) for child in o.children])
    found += self.rule(o)
    by_name = attrgetter('name')
    return filter_sorted(found, key=by_name)
def __expr_finalize__(self, expr):
    """
    Finalize the Expression initialization.

    Sets ``_expr``, ``_reads`` (the reads detected in ``expr`` in relaxed
    mode) and ``_dimensions`` (the unique indices, in order of appearance,
    of the Indexed objects in ``self.functions``).
    """
    self._expr = expr

    reads, _writes = detect_io(expr, relax=True)
    self._reads = reads

    indices = flatten(f.indices for f in self.functions if f.is_Indexed)
    self._dimensions = tuple(filter_ordered(indices))
def __init__(self, expressions, **kwargs):
    """
    Build an Operator out of one or more SymPy equations.

    Parameters
    ----------
    expressions : sympy.Eq or iterable of sympy.Eq
        The equations to be lowered; anything else raises InvalidOperator.
    **kwargs
        Optional settings read here:

        * ``name``: the Operator name (defaults to "Kernel").
        * ``subs``: substitutions handed to ``LoweredEq`` (defaults to ``{}``).
        * ``dse``: DSE mode (defaults to ``configuration['dse']``).
        * ``dle``: DLE mode (defaults to ``configuration['dle']``).

    Raises
    ------
    InvalidOperator
        If any of the input expressions is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc. (copied, so that the class-level defaults are not mutated)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering and analysis
    expressions = [LoweredEq(e, subs=subs) for e in expressions]
    self.dtype = retrieve_dtype(expressions)
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(
        flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters, self.dtype)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(
        OrderedDict([(i.name, MetaCall(i, True))
                     for i in dle_state.elemental_functions]))
    # Only the DLE arguments backed by a Dimension are tracked as dimensions
    self.dimensions.extend([
        i.argument for i in self.dle_arguments
        if isinstance(i.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __add__(self, other):
    """Return ``self + other`` as the flattening of the pair ``[self, other]``."""
    operands = [self, other]
    return flatten(operands)
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive). A more aggressive
        autotuning might eventually result in higher performance, though in
        some circumstances it might instead increase the actual runtime.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies. The value ``destructive`` is also
        recognized below when disabling halo exchanges.

    Returns
    -------
    tuple
        ``(args, summary)``. ``args`` is the input ``args``, updated in place
        with the tuned values when at least one run was performed. ``summary``
        is a dict with the keys ``runs``, ``tpr`` (timesteps per run) and
        ``tuned``; it is empty whenever autotuning is skipped.

    Raises
    ------
    ValueError
        If ``[level, mode]`` is not an accepted combination.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to numpy arrays, which is required
        # to avoid garbage collection to kick in during autotuning and prematurely
        # free the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                # Report the run being skipped. The original code formatted `i`
                # here, which is either unbound or a leftover from an unrelated
                # loop at this point
                warning("couldn't determine stack size; skipping run %s" % str(run))
                continue
            except AttributeError:
                # NOTE(review): presumably raised when the footprint is a plain
                # number rather than a symbolic expression -- confirm
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that for a given number of threads had the minimum
    # turnaround time
    try:
        runs = 0
        mapper = {}
        elapsed_per_nt = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                fastest = min(i, key=i.get)
                # Key on `k` (the thread setting these timings belong to), not
                # on the stale `nt` left over from the tuning loop above
                mapper.setdefault(k, []).append(fastest)
                elapsed_per_nt[k] = elapsed_per_nt.get(k, 0) + i[fastest]
        # Rank the thread settings by accumulated best elapsed time, rather than
        # by comparing the block shapes themselves
        best = min(mapper, key=elapsed_per_nt.get)
        best = OrderedDict(chain(best, *mapper[best]))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        # `min` over an empty `mapper`, i.e. no run was recorded
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
def cire(cluster, template, mode, options, platform): """ Cross-iteration redundancies elimination. Parameters ---------- cluster : Cluster Input Cluster, subject of the optimization pass. template : callable To build the symbols (temporaries) storing the redundant expressions. mode : str The transformation mode. Accepted: ['invariants', 'sops']. * 'invariants' is for sub-expressions that are invariant w.r.t. one or more Dimensions. * 'sops' stands for sums-of-products, that is redundancies are searched across all expressions in sum-of-product form. options : dict The optimization mode. Accepted: ['min-storage']. * 'min-storage': if True, the pass will try to minimize the amount of storage introduced for the tensor temporaries. This might also reduce the operation count. On the other hand, this might affect fusion and therefore data locality. Defaults to False (legacy). platform : Platform The underlying platform. Used to optimize the shape of the introduced tensor symbols. Examples -------- 1) 'invariants'. Below is an expensive sub-expression invariant w.r.t. `t` t0 = (cos(a[x,y,z])*sin(b[x,y,z]))*c[t,x,y,z] becomes t1[x,y,z] = cos(a[x,y,z])*sin(b[x,y,z]) t0 = t1[x,y,z]*c[t,x,y,z] 2) 'sops'. Below are redundant sub-expressions in sum-of-product form (in this case, the sum degenerates to a single product). 
t0 = 2.0*a[x,y,z]*b[x,y,z] t1 = 3.0*a[x,y,z+1]*b[x,y,z+1] becomes t2[x,y,z] = a[x,y,z]*b[x,y,z] t0 = 2.0*t2[x,y,z] t1 = 3.0*t2[x,y,z+1] """ # Sanity checks assert mode in ['invariants', 'sops'] assert all(i > 0 for i in options['cire-repeats'].values()) # Relevant options min_storage = options['min-storage'] # Setup callbacks if mode == 'invariants': # Extraction rule def extractor(context): is_time_invariant = make_is_time_invariant(context) return lambda e: is_time_invariant(e) # Extraction model model = lambda e: estimate_cost(e, True) >= MIN_COST_ALIAS_INV # Selection rule selector = lambda c, n: c >= MIN_COST_ALIAS_INV and n >= 1 elif mode == 'sops': # Extraction rule def extractor(context): return lambda e: q_sum_of_product(e) # Extraction model model = lambda e: not (q_leaf(e) or q_terminalop(e)) # Selection rule selector = lambda c, n: c >= MIN_COST_ALIAS and n > 1 # Actual CIRE processed = [] context = cluster.exprs for _ in range(options['cire-repeats'][mode]): # Extract potentially aliasing expressions exprs, extracted = extract(cluster, extractor(context), model, template) if not extracted: # Do not waste time break # Search aliasing expressions aliases = collect(extracted, min_storage) # Rule out aliasing expressions with a bad flops/memory trade-off chosen, others = choose(exprs, aliases, selector) if not chosen: # Do not waste time break # Create Aliases and assign them to Clusters clusters, subs = process(cluster, chosen, aliases, template, platform) # Rebuild `cluster` so as to use the newly created Aliases rebuilt = rebuild(cluster, others, aliases, subs) # Prepare for the next round processed.extend(clusters) cluster = rebuilt context = flatten(c.exprs for c in processed) + list(cluster.exprs) processed.append(cluster) return processed
def estimate_cost(exprs, estimate=False):
    """
    Estimate the operation count of an expression.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions for which the operation count is calculated.
        A dict, or a list of dicts, is accepted too, in which case the values
        are used.
    estimate : bool, optional
        Defaults to False; if True, the following rules are applied:
            * Transcendental functions (sin, cos, exp, log) count as 50 ops.
            * Divisions (powers with a negative exponent) count as 25 ops.
            * Powers with a positive integer exponent ``b`` count as ``b-1`` ops.
            * Any other power counts as 50 ops.

    Returns
    -------
    The operation count, or 0 if ``exprs`` is a plain symbol/array or if the
    cost cannot be estimated (in which case a warning is emitted).
    """
    trascendentals_cost = {sin: 50, cos: 50, exp: 50, log: 50}
    pow_cost = 50
    div_cost = 25

    try:
        # Is it a plain symbol/array ?
        if exprs.is_AbstractFunction or exprs.is_AbstractSymbol:
            return 0
    except AttributeError:
        pass
    try:
        # Is it a dict ?
        exprs = exprs.values()
    except AttributeError:
        try:
            # Could still be a list of dicts
            exprs = flatten([i.values() for i in exprs])
        except (AttributeError, TypeError):
            pass
    try:
        # At this point it must be a list of SymPy objects
        # We don't use SymPy's count_ops because we do not count integer arithmetic
        # (e.g., array index functions such as i+1 in A[i+1])
        # Also, the routine below is *much* faster than count_ops
        exprs = [i.rhs if i.is_Equality else i for i in as_tuple(exprs)]
        operations = flatten(retrieve_xops(i) for i in exprs)
        flops = 0
        for op in operations:
            if op.is_Function:
                if estimate:
                    flops += trascendentals_cost.get(op.__class__, 1)
                else:
                    flops += 1
            elif op.is_Pow:
                if estimate:
                    if op.exp.is_Number:
                        if op.exp < 0:
                            flops += div_cost
                        elif op.exp == 0:
                            # a**0 costs nothing
                            pass
                        elif op.exp.is_Integer:
                            # Natural pows a**b are estimated as b-1 Muls
                            flops += op.exp - 1
                        else:
                            flops += pow_cost
                    else:
                        flops += pow_cost
                else:
                    flops += 1
            else:
                # An n-ary operation costs n-1 ops, integer operands excluded
                flops += len(op.args) - (1 + sum(1 for i in op.args if i.is_Integer))
        return flops
    except Exception:
        # Best-effort: any failure while inspecting `exprs` yields a zero cost.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate
        warning("Cannot estimate cost of `%s`" % str(exprs))
        return 0
def detect_flow_directions(exprs):
    """
    Return a mapper from :class:`Dimension`s to iterables of
    :class:`IterationDirection`s representing the theoretically necessary
    directions to evaluate ``exprs`` so that the information "naturally
    flows" from an iteration to another.
    """
    exprs = as_tuple(exprs)

    # One write Access per left-hand side; one read Access per Indexed
    # retrieved from the right-hand sides
    writes = [Access(i.lhs, 'W') for i in exprs]
    reads = flatten(retrieve_indexed(i.rhs, mode='all') for i in exprs)
    reads = [Access(i, 'R') for i in reads]

    # Determine indexed-wise direction by looking at the vector distance
    mapper = defaultdict(set)
    for w in writes:
        for r in reads:
            # Only accesses to the same object are compared
            if r.name != w.name:
                continue
            dimensions = [d for d in w.aindices if d is not None]
            if not dimensions:
                continue
            for d in dimensions:
                distance = None
                # The last distance successfully computed along `d._defines` wins
                for i in d._defines:
                    try:
                        distance = w.distance(r, i, view=i)
                    except TypeError:
                        pass
                try:
                    if distance > 0:
                        mapper[d].add(Forward)
                        break
                    elif distance < 0:
                        mapper[d].add(Backward)
                        break
                    else:
                        # Zero distance: move on to the next Dimension
                        mapper[d].add(Any)
                except TypeError:
                    # Nothing can be deduced
                    mapper[d].add(Any)
                    break
            # Remainder: the Dimensions following the one at which the loop
            # above stopped
            for d in dimensions[dimensions.index(d) + 1:]:
                mapper[d].add(Any)

    # Add in any encountered Dimension
    mapper.update({
        d: {Any}
        for d in flatten(i.aindices for i in reads + writes)
        if d is not None and d not in mapper
    })

    # Add in derived-dimensions parents, in case they haven't been detected yet
    mapper.update({
        k.parent: set(v)
        for k, v in mapper.items()
        if k.is_Derived and mapper.get(k.parent, {Any}) == {Any}
    })

    return mapper
def _call_haloupdate(self, name, f, hse, *args):
    """
    Build the Call to the routine ``name``.

    The arguments of the Call are ``f``, the communicator and neighborhood
    objects of ``f``'s distributor, and the values of ``hse.loc_indices``.
    The extra positional ``args`` are accepted but not used here.
    """
    comm = f.grid.distributor._obj_comm
    nb = f.grid.distributor._obj_neighborhood

    call_args = [f, comm, nb]
    call_args.extend(hse.loc_indices.values())

    return Call(name, flatten(call_args))
def make_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Returns
    -------
    tuple
        The transformed ``iet`` and a dict with the keys ``dimensions`` (the
        BlockDimensions created), ``efuncs`` (the Callables the blocked nests
        were promoted to) and ``args`` (the steps of the BlockDimensions).
    """
    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not self.blockinner:
            # Leave the innermost parallel Iteration unblocked
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not self.blockalways:
            # Heuristically bypass loop blocking if we think `tree`
            # won't be computationally expensive. This will help with code
            # size/readability, JIT time, and auto-tuning time
            if not (tree.root.is_Sequential or iet.is_Callable):
                # E.g., not inside a time-stepping Iteration
                continue
            if any(i.dim.is_Sub and i.dim.local for i in tree):
                # At least an outer Iteration is over a local SubDimension,
                # which suggests the computational cost of this Iteration
                # nest will be negligible w.r.t. the "core" Iteration nest
                # (making use of non-local (Sub)Dimensions only)
                continue
        if not IsPerfectIteration().visit(root):
            # Don't know how to block non-perfect nests
            continue

        # Apply hierarchical loop blocking to `tree`
        level_0 = []  # Outermost level of blocking
        level_i = [[] for i in range(1, self.nlevels)
                   ]  # Inner levels of blocking
        intra = []  # Within the smallest block
        for i in iterations:
            # The trailing '%d' is kept as a placeholder for the blocking level
            template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
            properties = (PARALLEL, ) + ((AFFINE, ) if i.is_Affine else ())

            # Build Iteration across `level_0` blocks
            d = BlockDimension(i.dim, name=template % 0)
            level_0.append(
                Iteration([], d, d.symbolic_max, properties=properties))

            # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
            for n, li in enumerate(level_i, 1):
                di = BlockDimension(d, name=template % n)
                li.append(
                    Iteration([],
                              di,
                              limits=(d, d + d.step - 1, di.step),
                              properties=properties))
                d = di

            # Build Iteration within the smallest block
            intra.append(
                i._rebuild([],
                           limits=(d, d + d.step - 1, 1),
                           offsets=(0, 0)))
        level_i = flatten(level_i)

        # Track all constructed BlockDimensions
        block_dims.extend(i.dim for i in level_0 + level_i)

        # Construct the blocked tree
        blocked = compose_nodes(level_0 + level_i + intra +
                                [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
        dynamic_parameters.extend([li.step for li in level_i])
        efunc = make_efunc("bf%d" % self.nblocked, blocked,
                           dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges: for each blocked Iteration, a main
        # range stepping by the block size, and a remainder range
        ranges = []
        for i, l0 in zip(iterations, level_0):
            maxb = i.symbolic_max - (i.symbolic_size % l0.step)
            ranges.append(
                ((i.symbolic_min, maxb, l0.step),
                 (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`, one per combination of main/remainder ranges
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for l0, (m, M, b) in zip(level_0, p):
                dynamic_args_mapper[l0.dim] = (m, M)
                dynamic_args_mapper[l0.step] = (b, )
                for li in level_i:
                    if li.dim.root is l0.dim.root:
                        # In a remainder range, the inner levels use the
                        # remainder size too
                        value = li.step if b is l0.step else b
                        dynamic_args_mapper[li.step] = (value, )
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

        # Next blockable nest, use different (unique) variable/function names
        self.nblocked += 1

    iet = Transformer(mapper).visit(iet)

    # Force-unfold if some folded Iterations haven't been blocked in the end
    iet = unfold_blocked_tree(iet)

    return iet, {
        'dimensions': block_dims,
        'efuncs': efuncs,
        'args': [i.step for i in block_dims]
    }
def visit_List(self, o):
    """
    Visit the children of ``o`` and return a ``c.Module`` made of ``o.header``,
    a ``c.Collection`` of the visited children, and ``o.footer``.
    """
    visited = flatten(self._visit(child) for child in o.children)
    collection = c.Collection(visited)
    return c.Module(o.header + (collection,) + o.footer)
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies. The value ``destructive`` is also
        recognized below when disabling halo exchanges.

    Returns
    -------
    tuple
        ``(args, summary)``. ``args`` is the input ``args``, updated in place
        with the tuned values when at least one run was performed. ``summary``
        is a dict with the keys ``runs``, ``tpr`` (timesteps per run) and
        ``tuned``; it is empty whenever autotuning is skipped.

    Raises
    ------
    ValueError
        If ``[level, mode]`` is not an accepted combination.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to numpy arrays, which is required
        # to avoid garbage collection to kick in during autotuning and prematurely
        # free the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                # Report the run being skipped. The original code formatted `i`
                # here, which is either unbound or a leftover from an unrelated
                # loop at this point
                warning("couldn't determine stack size; skipping run %s" % str(run))
                continue
            except AttributeError:
                # NOTE(review): presumably raised when the footprint is a plain
                # number rather than a symbolic expression -- confirm
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that for a given number of threads had the minimum
    # turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                # Per thread setting, record the fastest block shape of each
                # tree together with its elapsed time
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        # `min` over an empty `mapper`, i.e. no run was recorded
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary
def visit_Iteration(self, o):
    """
    Return the name-sorted symbols of the Iteration ``o``: those collected by
    visiting its children, plus those ``self.rule`` yields for ``o`` itself.
    """
    per_child = []
    for child in o.children:
        per_child.append(self._visit(child))

    collected = flatten(per_child)
    collected += self.rule(o)

    return filter_sorted(collected, key=attrgetter('name'))
def visit_tuple(self, o):
    """
    Visit every item of the tuple ``o`` and return the collected symbols
    through ``filter_sorted``, keyed on their ``name``.
    """
    collected = flatten([self._visit(item) for item in o])
    by_name = attrgetter('name')
    return filter_sorted(collected, key=by_name)
def visit_HaloSpot(self, o):
    """Return a ``c.Collection`` of the visited children of the HaloSpot ``o``."""
    visited = flatten(self._visit(child) for child in o.children)
    return c.Collection(visited)
def __new__(cls, clusters, itintervals=None):
    """
    Create a ClusterGroup out of the flattened ``clusters``, recording
    ``itintervals`` on the new object as ``_itintervals``.
    """
    members = flatten(as_tuple(clusters))
    group = super(ClusterGroup, cls).__new__(cls, members)
    group._itintervals = itintervals
    return group
def visit_tuple(self, o):
    """
    Return the name-sorted symbols found by visiting each item of ``o``.
    """
    per_item = []
    for item in o:
        per_item.append(self._visit(item))

    return filter_sorted(flatten(per_item), key=attrgetter('name'))
def exprs(self):
    """Return the flattened ``exprs`` of all the items in ``self``."""
    per_cluster = (cluster.exprs for cluster in self)
    return flatten(per_cluster)
def callback(self, clusters, prefix, backlog=None, known_break=None):
    """
    Assign an iteration direction, along the innermost Dimension of
    ``prefix``, to each of the ``clusters``.

    Parameters
    ----------
    clusters : list
        The Clusters to be processed.
    prefix : list
        The Dimension of its last entry is the one being scheduled. If empty,
        ``clusters`` is returned as is.
    backlog : list, optional
        Clusters set aside by a previous (recursive) call, to be processed
        after ``clusters``.
    known_break : set, optional
        The Dimensions along which the backlog Clusters are lifted.

    Returns
    -------
    list
        The rebuilt ``clusters``, followed by the processed backlog, if any.
    """
    if not prefix:
        return clusters

    known_break = known_break or set()
    backlog = backlog or []

    # Take the innermost Dimension -- no other Clusters other than those in
    # `clusters` are supposed to share it
    candidates = prefix[-1].dim._defines

    scope = Scope(exprs=flatten(c.exprs for c in clusters))

    # Handle the nastiest case -- ambiguity due to the presence of both a
    # flow- and an anti-dependence.
    #
    # Note: in most cases, `scope.d_anti.cause == {}` -- either because
    # `scope.d_anti == {}` or because the few anti dependences are not carried
    # in any Dimension. We exploit this observation so that we only compute
    # `d_flow`, which instead may be expensive, when strictly necessary
    maybe_break = scope.d_anti.cause & candidates
    if len(clusters) > 1 and maybe_break:
        require_break = scope.d_flow.cause & maybe_break
        if require_break:
            backlog = [clusters[-1]] + backlog
            # Try with increasingly smaller ClusterGroups until the ambiguity is gone
            return self.callback(clusters[:-1], prefix, backlog, require_break)

    # Schedule Clusters over different IterationSpaces if this increases parallelism
    for i in range(1, len(clusters)):
        if self._break_for_parallelism(scope, candidates, i):
            return self.callback(clusters[:i], prefix, clusters[i:] + backlog,
                                 candidates | known_break)

    # Compute iteration direction
    idir = {
        d: Backward
        for d in candidates if d.root in scope.d_anti.cause
    }
    if maybe_break:
        idir.update({
            d: Forward
            for d in candidates if d.root in scope.d_flow.cause
        })
    # Whatever is left unconstrained goes Forward
    idir.update({d: Forward for d in candidates if d not in idir})

    # Enforce iteration direction on each Cluster
    processed = []
    for c in clusters:
        ispace = IterationSpace(c.ispace.intervals, c.ispace.sub_iterators, {
            **c.ispace.directions,
            **idir
        })
        processed.append(c.rebuild(ispace=ispace))

    if not backlog:
        return processed

    # Handle the backlog -- the Clusters characterized by flow- and anti-dependences
    # along one or more Dimensions
    idir = {d: Any for d in known_break}
    # Iterate over a copy, since `backlog` is updated in place
    for i, c in enumerate(list(backlog)):
        ispace = IterationSpace(c.ispace.intervals.lift(known_break),
                                c.ispace.sub_iterators, {
                                    **c.ispace.directions,
                                    **idir
                                })
        dspace = c.dspace.lift(known_break)
        backlog[i] = c.rebuild(ispace=ispace, dspace=dspace)

    return processed + self.callback(backlog, prefix)
def __radd__(self, other):
    """Return ``other + self`` as the flattening of the pair ``[other, self]``."""
    operands = [other, self]
    return flatten(operands)
def dimensions(self):
    """
    Return, as a tuple, the ordered and duplicate-free indices of the
    Indexed objects in ``self.functions``.
    """
    indices = flatten(f.indices for f in self.functions if f.is_Indexed)
    unique = filter_ordered(indices)
    return tuple(unique)
def make_yask_kernels(iet, **kwargs):
    """
    Replace the affine Iteration trees in ``iet`` with calls to YASK kernels.

    Parameters
    ----------
    iet : IET node
        The tree to be transformed.
    **kwargs
        Must contain ``yk_solns``, a mapper that gets populated in place with
        the YASK kernel solutions created here.

    Returns
    -------
    tuple
        The transformed ``iet`` and an empty dict.
    """
    yk_solns = kwargs.pop('yk_solns')

    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))

        # Retrieve the section dtype
        exprs = FindNodes(Expression).visit(section)
        dtypes = {e.dtype for e in exprs}
        if len(dtypes) != 1:
            log("Unable to offload in presence of mixed-precision arithmetic")
            continue
        dtype = dtypes.pop()

        context = contexts.fetch(dimensions, dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YASKSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, dtype)
            # The first tree is replaced by the call; every other root is
            # mapped to None, unless already in `mapper`
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root)
                           for i in trees})  # Drop trees

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            # Best-effort: a tree that cannot be offloaded is left in place
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
    iet = Transformer(mapper).visit(iet)

    if not yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed through YASK, due to the
    # different storage layout
    yk_var_objs = {
        i.name: YASKVarObject(i.name)
        for i in FindSymbols().visit(iet) if i.from_YASK
    }
    yk_var_objs.update({i: YASKVarObject(i) for i in get_local_vars(yk_solns)})
    iet = make_var_accesses(iet, yk_var_objs)

    # The signature needs to be updated
    # TODO: this could be done automagically through the iet pass engine, but
    # currently it only supports *appending* to the parameters list. While here
    # we actually need to change it as some parameters may disappear (x_m, x_M, ...)
    parameters = derive_parameters(iet, True)
    iet = iet._rebuild(parameters=parameters)

    return iet, {}
def get_data(self, timestep):
    """
    Return the flattened result of calling ``get_symbol_data`` at ``timestep``
    on each item of ``self.objects``.
    """
    per_object = []
    for obj in self.objects:
        per_object.append(get_symbol_data(obj, timestep))
    return flatten(per_object)
def _build(cls, expressions, **kwargs):
    """
    Lower ``expressions`` down to an Iteration/Expression tree and wrap the
    result into a new object of type ``cls``.

    Parameters
    ----------
    expressions : Evaluable or iterable of Evaluable
        The input expressions.
    **kwargs
        Forwarded to the lowering stages and to ``cls._initialize_state``.

    Raises
    ------
    InvalidOperator
        If any of the input expressions is not an ``Evaluable``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if not all(isinstance(i, Evaluable) for i in expressions):
        raise InvalidOperator("Only `devito.Evaluable` are allowed.")

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lowering pipeline: expressions -> Clusters -> ScheduleTree -> IET
    expressions = cls._lower_exprs(expressions, **kwargs)
    clusters = cls._lower_clusters(expressions, profiler, **kwargs)
    stree = cls._lower_stree(clusters, **kwargs)
    iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **iet.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    headers = list(cls._default_headers)
    headers.extend(byproduct.headers)
    op._headers = headers
    op._globals = list(cls._default_globals)
    includes = list(cls._default_includes)
    includes.extend(profiler._default_includes)
    includes.extend(byproduct.includes)
    op._includes = includes

    # Required for the jit-compilation
    op._compiler = configuration['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines: first the profiler's external
    # calls, then the functions produced while lowering to IET
    func_table = OrderedDict()
    for name in profiler._ext_calls:
        func_table[name] = MetaCall(None, False)
    for func in byproduct.funcs:
        func_table[func.root.name] = func
    op._func_table = func_table

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    dimensions = filter_sorted(flatten(e.dimensions for e in expressions))
    op._dimensions = dimensions
    op._dimensions.extend(byproduct.dimensions)
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def rewrite(clusters, mode='advanced'):
    """
    Transform a sequence of N Clusters into a sequence of M Clusters having
    a lower operation count, with M >= N.

    Parameters
    ----------
    clusters : list of Cluster
        The Clusters to be transformed.
    mode : str, optional
        How aggressive the rewrite should be. ``None`` is treated like
        ``noop``. Accepted values:

        - ``noop``: Do nothing.
        - ``basic``: Apply common sub-expressions elimination.
        - ``advanced``: Apply all transformations that will reduce the
          operation count w/ minimum increase to the memory pressure,
          namely 'basic', factorization, CIRE for time-invariants only.
        - ``speculative``: Like 'advanced', but apply CIRE also to
          time-varying sub-expressions, which might further increase the
          memory pressure.
        - ``aggressive``: Like 'speculative', but apply CIRE to any
          non-trivial sub-expression (i.e., anything that is at least in a
          sum-of-product form). Further, seek and drop cross-cluster
          redundancies (this is the only pass that attempts to optimize
          *across* clusters, rather than within a cluster).
          The 'aggressive' mode may substantially increase the symbolic
          processing time; it may or may not reduce the JIT-compilation
          time; it may or may not improve the overall runtime performance.

    Returns
    -------
    The input ``clusters``, untouched, if ``mode`` is ``None``, ``noop`` or
    not a registered mode (a warning is emitted in the latter case);
    otherwise, the finalized group of optimized Clusters.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``None`` nor a string.
    """
    if mode is not None and not isinstance(mode, str):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    # Nothing to optimize
    if mode is None or mode == 'noop':
        return clusters

    # Unregistered modes are reported, then ignored
    if mode not in dse_registry:
        dse_warning("Unknown rewrite mode(s) %s" % mode)
        return clusters

    # 1) Local optimization
    # ---------------------
    # Dense and sparse clusters are handled by distinct rewriters. Sparse
    # clusters have non-affine index functions, so in general the more
    # advanced DSE passes cannot be applied to them.
    # Note: both rewriters share the same template for temporaries, which
    # keeps the temporaries globally unique
    rewriter = modes[mode]()
    fallback = BasicRewriter(False, rewriter.template)
    optimized = flatten((rewriter if c.is_dense else fallback).run(c)
                        for c in clusters)
    processed = ClusterGroup(optimized)

    # 2) Cluster grouping
    # -------------------
    # The local optimization may have generated new (smaller) clusters that
    # can potentially be grouped together within a single cluster
    processed = groupby(processed)

    # 3) Global optimization
    # ----------------------
    # Grouping may expose redundancies in one or more clusters; this last
    # pass looks for them and drops them
    if mode == 'aggressive':
        processed = cross_cluster_cse(processed)

    return processed.finalize()
def _defined_findices(self):
    """
    The set of all items found in the ``_defines`` attribute of the objects
    in ``self.findices``.

    The per-object ``_defines`` are merged through ``flatten`` before
    duplicates are removed by the conversion to ``set``.
    """
    defined = flatten(findex._defines for findex in self.findices)
    return set(defined)