def _insert_declarations(self, nodes):
    """Populate the Operator's body with the required array and variable
    declarations, to generate a legal C file."""

    # Resolve function calls first
    scopes = []
    for k, v in FindScopes().visit(nodes).items():
        if k.is_FunCall:
            func = self.func_table[k.name]
            if func.local:
                scopes.extend(FindScopes().visit(func.root, queue=list(v)).items())
        else:
            scopes.append((k, v))

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.output_function._mem_external:
            # Nothing to do, variable passed as kernel argument
            continue
        elif k.output_function._mem_stack:
            # On the stack, as established by the DLE
            key = lambda i: not i.is_Parallel
            site = filter_iterations(v, key=key, stop='asap') or [nodes]
            allocator.push_stack(site[-1], k.output_function)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.output_function)

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    nodes = NestedTransformer(mapper).visit(nodes)
    for k, v in list(self.func_table.items()):
        if v.local:
            self.func_table[k] = FunMeta(Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        nodes = List(header=decls + allocs, body=nodes, footer=frees)

    return nodes

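# Illustration (not part of the Operator): a minimal, standalone sketch of the
# placement policy applied by _insert_declarations above. The booleans mirror
# the flags tested on each expression's output function (is_scalar,
# _mem_external, _mem_stack); the function name and return strings are
# hypothetical, introduced only to make the decision tree explicit.
def _placement_sketch(is_scalar, mem_external, mem_stack):
    """Say where a symbol's declaration would land in the generated C file."""
    if is_scalar:
        return 'inline'    # declared at the point of definition (LocalExpression)
    if mem_external:
        return 'argument'  # nothing to declare, passed in as a kernel argument
    if mem_stack:
        return 'stack'     # declared at the Iteration chosen by filter_iterations
    return 'heap'          # declared, allocated and freed around the kernel body

assert _placement_sketch(True, False, False) == 'inline'
assert _placement_sketch(False, False, True) == 'stack'
assert _placement_sketch(False, False, False) == 'heap'
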
def _insert_declarations(self, dle_state, parameters):
    """Populate the Operator's body with the required array and variable
    declarations, to generate a legal C file."""
    nodes = dle_state.nodes

    # Resolve function calls first
    scopes = []
    for k, v in FindScopes().visit(nodes).items():
        if k.is_FunCall:
            function = dle_state.func_table[k.name]
            scopes.extend(FindScopes().visit(function, queue=list(v)).items())
        else:
            scopes.append((k, v))

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.output_function._mem_external:
            # Nothing to do, variable passed as kernel argument
            continue
        elif k.output_function._mem_stack:
            # On the stack, as established by the DLE
            key = lambda i: i.dim not in k.output_function.indices
            site = filter_iterations(v, key=key, stop='consecutive')
            allocator.push_stack(site[-1], k.output_function)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.output_function)

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        allocs = as_tuple([Element(i) for i in v])
        mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen)
    nodes = Transformer(mapper).visit(nodes)
    elemental_functions = Transformer(mapper).visit(dle_state.elemental_functions)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        nodes = List(header=decls + allocs, body=nodes, footer=frees)

    return nodes, elemental_functions

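# Illustration: the effect of the heap branch, i.e.
# List(header=decls + allocs, body=nodes, footer=frees), as a toy string-based
# rendering. A single heap-allocated array named "tmp" is assumed, and the
# exact allocation call (posix_memalign here) is an assumption, not taken from
# the code above.
def _wrap_heap_sketch(body):
    decls = ['float *tmp;']
    allocs = ['posix_memalign((void**)&tmp, 64, size*sizeof(float));']
    frees = ['free(tmp);']
    return '\n'.join(decls + allocs + [body] + frees)

print(_wrap_heap_sketch('/* ...kernel body... */'))
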
def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
    """
    processed = []
    for node in state.nodes:

        # Reset denormals flag each time a parallel region is entered
        denormals = FindNodes(Denormals).visit(state.nodes)
        mapper = {i: List(c.Comment('DLE: moved denormals flag')) for i in denormals}

        # Handle parallelizable loops
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            key = lambda i: i.is_Parallel and not i.is_Vectorizable
            candidates = filter_iterations(tree, key=key, stop='consecutive')
            if not candidates:
                continue

            # Heuristic: if at least two parallel loops are available and the
            # physical core count is greater than self.thresholds['collapse'],
            # then omp-collapse the loops
            nparallel = len(candidates)
            if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                    nparallel < 2:
                parallelism = omplang['for']
            else:
                parallelism = omplang['collapse'](nparallel)

            root = candidates[0]
            mapper[root] = Block(header=omplang['par-region'],
                                 body=denormals + [Element(parallelism), root])

        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed}

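# Illustration: the collapse heuristic in isolation. `ncores` stands in for
# psutil.cpu_count(logical=False) and `threshold` for self.thresholds['collapse'];
# the pragma strings only approximate what omplang['for'] and
# omplang['collapse'](n) expand to, and are an assumption rather than taken
# from the code above.
def _omp_pragma_sketch(nparallel, ncores, threshold):
    if ncores < threshold or nparallel < 2:
        return '#pragma omp for schedule(static)'
    return '#pragma omp for collapse(%d) schedule(static)' % nparallel

assert 'collapse' not in _omp_pragma_sketch(nparallel=1, ncores=64, threshold=32)
assert 'collapse(3)' in _omp_pragma_sketch(nparallel=3, ncores=64, threshold=32)
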
def __init__(self, expressions, **kwargs):
    super(OperatorDebug, self).__init__(expressions, **kwargs)
    self._includes.append('stdio.h')

    # Minimize the trip count of the sequential loops
    iterations = set(flatten(retrieve_iteration_tree(self.body)))
    mapper = {i: i._rebuild(limits=(max(i.offsets) + 2))
              for i in iterations if i.is_Sequential}
    self.body = Transformer(mapper).visit(self.body)

    # Mark entry/exit points of each non-sequential Iteration tree in the body
    iterations = [filter_iterations(i, lambda i: not i.is_Sequential, 'any')
                  for i in retrieve_iteration_tree(self.body)]
    iterations = [i[0] for i in iterations if i]
    mapper = {t: List(header=printmark('In nest %d' % i), body=t)
              for i, t in enumerate(iterations)}
    self.body = Transformer(mapper).visit(self.body)

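# Illustration: the trip-count minimization performed above, using a toy
# Iteration stand-in (a namedtuple). In the real code the shrinking happens via
# i._rebuild(limits=max(i.offsets) + 2) on every sequential Iteration; the
# namedtuple and _shrink_sketch below are hypothetical stand-ins.
from collections import namedtuple

ToyIteration = namedtuple('ToyIteration', 'limits offsets is_Sequential')

def _shrink_sketch(i):
    return i._replace(limits=max(i.offsets) + 2) if i.is_Sequential else i

loop = ToyIteration(limits=1000, offsets=(-2, 2), is_Sequential=True)
assert _shrink_sketch(loop).limits == 4  # just enough trips to cover the stencil offsets
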
def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
    """
    processed = []
    for node in state.nodes:

        # Reset denormals flag each time a parallel region is entered
        denormals = FindNodes(Denormals).visit(state.nodes)
        mapper = OrderedDict([(i, None) for i in denormals])

        # Group by outer loop so that consecutive nests can be embedded
        # within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            key = lambda i: i.is_Parallel and\
                not (i.is_Elementizable or i.is_Vectorizable)
            candidates = filter_iterations(tree, key=key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iterations go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        # Handle parallelizable loops
        for group in groups.values():
            private = []
            for root, tree in group.items():
                # Heuristic: if at least two parallel loops are available and the
                # physical core count is greater than self.thresholds['collapse'],
                # then omp-collapse the loops
                nparallel = len(tree)
                if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                        nparallel < 2:
                    parallel = omplang['for']
                else:
                    parallel = omplang['collapse'](nparallel)

                mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in FindSymbols('symbolics').visit(root)
                                if i.is_TensorFunction and i._mem_stack])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=omplang['par-region'](private),
                               body=denormals + rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        handle = Transformer(mapper).visit(node)
        if handle is not None:
            processed.append(handle)

    return {'nodes': processed}

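# Illustration: how consecutive tagged loop nests end up in the same parallel
# region. Each element of `tags` says whether a loop nest (with at least one
# parallel candidate) is tagged; the returned list gives the group index
# assigned to each nest, mirroring the `len(groups) - (is_tagged & was_tagged)`
# arithmetic above. The reset on nests without candidates is omitted for brevity.
def _grouping_sketch(tags):
    groups, was_tagged, assigned = {}, False, []
    for is_tagged in tags:
        key = len(groups) - (is_tagged & was_tagged)
        groups.setdefault(key, []).append(is_tagged)
        assigned.append(key)
        was_tagged = is_tagged
    return assigned

# Two consecutive tagged nests share a region; the untagged one gets its own.
assert _grouping_sketch([True, True, False]) == [0, 0, 1]
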
def _specialize(self, nodes, parameters):
    """
    Create a YASK representation of this Iteration/Expression tree.

    ``parameters`` is modified in-place by adding YASK-related arguments.
    """
    log("Specializing a Devito Operator for YASK...")

    # Find offloadable Iteration/Expression trees
    offloadable = []
    for tree in retrieve_iteration_tree(nodes):
        parallel = filter_iterations(tree, lambda i: i.is_Parallel)
        if not parallel:
            # Cannot offload non-parallel loops
            continue
        if not (IsPerfectIteration().visit(tree) and
                all(i.is_Expression for i in tree[-1].nodes)):
            # Don't know how to offload this Iteration/Expression tree to YASK
            continue
        functions = flatten(i.functions for i in tree[-1].nodes)
        keys = set((i.indices, i.shape, i.dtype) for i in functions if i.is_TimeData)
        if len(keys) == 0:
            continue
        elif len(keys) > 1:
            exit("Cannot handle Operators w/ heterogeneous grids")
        dimensions, shape, dtype = keys.pop()
        if len(dimensions) == len(tree) and\
                all(i.dim == j for i, j in zip(tree, dimensions)):
            # Detected a "full" Iteration/Expression tree (over both
            # time and space dimensions)
            offloadable.append((tree, dimensions, shape, dtype))

    # Construct YASK ASTs given Devito expressions. New grids may be allocated.
    if len(offloadable) == 0:
        # No offloadable trees found
        self.context = YaskNullContext()
        self.yk_soln = YaskNullSolution()
        processed = nodes
        log("No offloadable trees found")
    elif len(offloadable) == 1:
        # Found *the* offloadable tree for this Operator
        tree, dimensions, shape, dtype = offloadable[0]
        self.context = contexts.fetch(dimensions, shape, dtype)

        # Create a YASK compiler solution for this Operator
        # Note: this can be dropped as soon as the kernel has been built
        yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

        transform = sympy2yask(self.context, yc_soln)
        try:
            for i in tree[-1].nodes:
                transform(i.expr)

            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], namespace['code-soln-name'])
            funcall = Element(c.Statement(ccode(funcall)))
            processed = Transformer({tree[1]: funcall}).visit(nodes)

            # Track that this is an external function call
            self.func_table[namespace['code-soln-run']] = FunMeta(None, False)

            # JIT-compile the newly-created YASK kernel
            self.yk_soln = self.context.make_yk_solution(namespace['jit-yk-soln'],
                                                         yc_soln)

            # Now we must drop a pointer to the YASK solution down to C-land
            parameters.append(Object(namespace['code-soln-name'],
                                     namespace['type-solution'],
                                     self.yk_soln.rawpointer))

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_grids(),
                 yc_soln.get_num_equations()))
        except:
            self.yk_soln = YaskNullSolution()
            processed = nodes
            log("Unable to offload a candidate tree.")
    else:
        exit("Found more than one offloadable tree in a single Operator")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the differences
    # in storage layout employed by Devito and YASK
    processed = make_grid_accesses(processed)

    # Update the parameters list adding all necessary YASK grids
    for i in list(parameters):
        try:
            if i.from_YASK:
                parameters.append(Object(namespace['code-grid-name'](i.name),
                                         namespace['type-grid'],
                                         i.data.rawpointer))
        except AttributeError:
            # Ignore e.g. Dimensions
            pass

    log("Specialization successfully performed!")

    return processed

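# Illustration: the offloadability test boiled down to plain data. A loop nest
# is offloadable to YASK only if it is parallel and perfectly nested, all
# time-varying data share one (dimensions, shape, dtype) signature, and the
# nest spans exactly those dimensions. Toy tuples stand in for Devito objects,
# and the case of heterogeneous grids (which the real code aborts on) is simply
# reported as "not offloadable" here.
def _is_offloadable_sketch(nest_dims, is_parallel, is_perfect, grid_signatures):
    if not (is_parallel and is_perfect):
        return False
    if len(grid_signatures) != 1:
        return False  # zero time-varying grids, or heterogeneous grids
    (dimensions, _shape, _dtype), = grid_signatures
    return tuple(nest_dims) == tuple(dimensions)

sig = {(('t', 'x', 'y', 'z'), (4, 64, 64, 64), 'float32')}
assert _is_offloadable_sketch(('t', 'x', 'y', 'z'), True, True, sig)
assert not _is_offloadable_sketch(('x', 'y', 'z'), True, True, sig)
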
def _create_elemental_functions(self, state, **kwargs):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    processed = []
    for node in state.nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = ScalarFunction(name='%s_start' % name, dtype=np.int32)
                finish = ScalarFunction(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [ScalarFunction(name='%s_ub%d' % (name, j), dtype=np.int32)
                         for j in range(len(i.uindices))]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                            for j, k in zip(i.uindices, ufunc)]
                free.append(i._rebuild(limits=limits, offsets=None,
                                       uindices=uindices))
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict(zip(target, free))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Retrieve symbolic arguments
            for i in fsymbols:
                if i.is_TensorFunction:
                    args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorData:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_ConstantData:
                    args.append((i.name, i))

            # Retrieve scalar arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, ScalarFunction(name=i.name, dtype=np.int32))
                         for i in required])

            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new FunCall
            mapper[root] = List(header=noinline, body=FunCall(name, call))

            # Produce the new Function
            functions.setdefault(name,
                                 Function(name, free, 'void', handle, ('static',)))

        # Transform the main tree
        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed, 'elemental_functions': functions.values()}

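# Illustration: the rough shape of an elemental function's parameter list. For
# each Iteration dimension the loop bounds are exposed as <dim>_start and
# <dim>_finish integer scalars, and tensor data is passed by pointer. Plain
# strings stand in for the ScalarFunction/TensorFunction arguments, and the
# exact C qualifiers (const, restrict) are an assumption, since the real
# parameter list is derived from each argument's rtargs.
def _elemental_signature_sketch(tag, dims, tensors):
    params = []
    for d in dims:
        params += ['const int %s_start' % d, 'const int %s_finish' % d]
    params += ['float *restrict %s_vec' % t for t in tensors]
    return 'void f_%d(%s)' % (tag, ', '.join(params))

print(_elemental_signature_sketch(0, ['x', 'y'], ['u']))
# void f_0(const int x_start, const int x_finish, const int y_start, ...)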