def simple_function_with_paddable_arrays(a_dense, b_dense, exprs, iters):
    """
    Build the ``foo`` function used to exercise padding: a single expression
    wrapped in a perfect nest of three iterations.

    Layout of the generated function::

        void foo(a_dense, b_dense)
          iters[0]
            iters[1]
              iters[2]
                exprs[6]

    :param a_dense: First array; its ``base.function`` becomes a parameter.
    :param b_dense: Second array; its ``base.function`` becomes a parameter.
    :param exprs: Sequence of expressions; only ``exprs[6]`` is used.
    :param iters: Sequence of callables, each wrapping a body into a loop;
                  ``iters[0]`` .. ``iters[2]`` are used.
    :returns: The function after iteration-variable resolution and
              substitution.
    """
    symbols = [a_dense.base.function, b_dense.base.function]

    # Assemble the loop nest starting from the innermost iteration
    innermost = iters[2](exprs[6])
    middle = iters[1](innermost)
    body = iters[0](middle)

    func = Function('foo', body, 'void', symbols, ())

    # The same dict is handed to both passes: first to the resolver, then to
    # the substituter (presumably the resolver fills it in -- confirm)
    index_subs = {}
    func = ResolveIterationVariable().visit(func, subs=index_subs)
    func = SubstituteExpression(subs=index_subs).visit(func)
    return func
def simple_function_fissionable(a, b, exprs, iters):
    """
    Build the ``foo`` function used to exercise loop fission: two expressions
    sharing the innermost iteration of a perfect three-level nest.

    Layout of the generated function::

        void foo(a, b)
          iters[0]
            iters[1]
              iters[2]
                exprs[0]
                exprs[2]

    :param a: First array; its ``base.function`` becomes a parameter.
    :param b: Second array; its ``base.function`` becomes a parameter.
    :param exprs: Sequence of expressions; ``exprs[0]`` and ``exprs[2]``
                  are used.
    :param iters: Sequence of callables, each wrapping a body into a loop;
                  ``iters[0]`` .. ``iters[2]`` are used.
    :returns: The function after iteration-variable resolution and
              substitution.
    """
    symbols = [a.base.function, b.base.function]

    # Both expressions live side by side in the innermost iteration
    statements = [exprs[0], exprs[2]]
    innermost = iters[2](statements)
    middle = iters[1](innermost)
    body = iters[0](middle)

    func = Function('foo', body, 'void', symbols, ())

    # The same dict is handed to both passes: first to the resolver, then to
    # the substituter (presumably the resolver fills it in -- confirm)
    index_subs = {}
    func = ResolveIterationVariable().visit(func, subs=index_subs)
    func = SubstituteExpression(subs=index_subs).visit(func)
    return func
def complex_function(a, b, c, d, exprs, iters):
    """
    Build the ``foo`` function with an imperfect loop nest: one outer
    iteration enclosing three sibling sub-trees.

    Layout of the generated function::

        void foo(a, b, c, d)
          iters[0]
            iters[3]
              exprs[2]
            iters[1]
              iters[2]
                exprs[3]
                exprs[4]
            iters[4]
              exprs[5]

    :param a, b, c, d: Arrays; their ``base.function`` become the parameters.
    :param exprs: Sequence of expressions; ``exprs[2]`` .. ``exprs[5]``
                  are used.
    :param iters: Sequence of callables, each wrapping a body into a loop;
                  ``iters[0]`` .. ``iters[4]`` are used.
    :returns: The function after iteration-variable resolution and
              substitution.
    """
    symbols = [arr.base.function for arr in (a, b, c, d)]

    # The three sibling sub-trees, in the order they appear in the outer body
    first_tree = iters[3](exprs[2])
    second_tree = iters[1](iters[2]([exprs[3], exprs[4]]))
    third_tree = iters[4](exprs[5])
    body = iters[0]([first_tree, second_tree, third_tree])

    func = Function('foo', body, 'void', symbols, ())

    # The same dict is handed to both passes: first to the resolver, then to
    # the substituter (presumably the resolver fills it in -- confirm)
    index_subs = {}
    func = ResolveIterationVariable().visit(func, subs=index_subs)
    func = SubstituteExpression(subs=index_subs).visit(func)
    return func
def __init__(self, expressions, **kwargs):
    """
    Lower a collection of SymPy equations into a tree of nodes and
    initialise the underlying function object with it.

    :param expressions: One or more ``sympy.Eq`` objects.
    :param kwargs: Optional settings read here:

        * ``name``: name of the generated kernel (default ``"Kernel"``).
        * ``subs``: mapping applied to every expression via ``xreplace``
          (default: empty).
        * ``time_axis``: ``Forward`` (default) or ``Backward``.
        * ``dse``: DSE mode (default ``configuration['dse']``).
        * ``dle``: DLE mode (default ``configuration['dle']``).

    :raises InvalidOperator: If any input item is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a SymPy equation
    if not all(isinstance(eq, sympy.Eq) for eq in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    # User-provided options
    self.name = kwargs.get("name", "Kernel")
    user_subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse_mode = kwargs.get("dse", configuration['dse'])
    dle_mode = kwargs.get("dle", configuration['dle'])

    # Compilation-related state starts from copies of the class defaults
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    time.reverse = (time_axis == Backward)

    # Lowering: indexify everything first, then apply the user substitutions
    indexified = [indexify(eq) for eq in expressions]
    expressions = [eq.xreplace(user_subs) for eq in indexified]

    # Information that must outlive the construction
    self.dtype = self._retrieve_dtype(expressions)
    self.output = self._retrieve_output_fields(expressions)

    # Information only needed while constructing
    ordering = self._retrieve_loop_ordering(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Cluster the expressions by Stencil, then let the Devito Symbolic
    # Engine optimise the clusters
    clusters = rewrite(clusterize(expressions, stencils), mode=dse_mode)

    # Wrap the expressions with Iterations according to their dimensions
    nodes = self._schedule_expressions(clusters, ordering)

    # C-level profiling infrastructure; `self.sections` must exist before
    # the profiling pass runs
    self.sections = OrderedDict()
    nodes = self._profile_sections(nodes)

    # Operator parameters: the kernel data plus every dimension with no
    # known size (dimensions are necessary for data casts)
    parameters = FindSymbols('kernel-data').visit(nodes)
    dimensions = FindSymbols('dimensions').visit(nodes)
    dimensions += [dim.parent for dim in dimensions if dim.is_Buffered]
    unsized = [dim for dim in dimensions if dim.size is None]
    parameters += filter_ordered(unsized, key=operator.attrgetter('name'))

    # Resolve and substitute dimensions for loop index variables; the same
    # dict is passed to the resolver and then to the substituter
    index_subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=index_subs)
    nodes = SubstituteExpression(subs=index_subs).visit(nodes)

    # Devito Loop Engine: loop optimisation, which may contribute extra
    # arguments and include files
    dle_state = transform(nodes, *set_dle_mode(dle_mode))
    parameters += [arg.argument for arg in dle_state.arguments]
    self._includes.extend(list(dle_state.includes))

    # Introduce all required C declarations
    nodes, self.elemental_functions = self._insert_declarations(dle_state,
                                                                parameters)

    # Track the DLE output, as it might be useful at execution time
    self._dle_state = dle_state

    # Finish instantiation
    super(OperatorBasic, self).__init__(self.name, nodes, 'int', parameters, ())
def _padding(self, state, **kwargs):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the values
    in the padded temporaries will be copied back into the input arrays.

    :param state: Object exposing ``nodes``, the sequence of trees to process.
    :param kwargs: Accepted but not used by this pass.
    :returns: A dict with a single ``'nodes'`` entry holding the
              initialization trees, followed by the transformed nodes,
              followed by the copy-back trees.
    """
    mapper = OrderedDict()
    for node in state.nodes:
        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(node)
        if not handle:
            continue

        # The written object with the highest rank drives the padding
        shape = max([i.shape for i in handle], key=len)
        if not shape:
            continue

        # Only objects sharing the innermost extent are padded. Objects with
        # an empty shape are skipped explicitly: indexing ``shape[-1]`` on
        # them would raise IndexError
        candidates = [i for i in handle
                      if i.shape and i.shape[-1] == shape[-1]]
        if not candidates:
            continue

        # Retrieve the maximum number of items in a SIMD register when
        # processing the expressions in /node/
        exprs = FindNodes(Expression).visit(node)
        exprs = [e for e in exprs if e.output_function in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expectable padding, for AVX512 registers).
            # Floor division keeps the item count an integer under Python 3
            # as well; with ``/`` a float would leak into the padded shape
            simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

        # Round the innermost extent up to a multiple of /simd_items/ and
        # map each original array to its padded replacement
        for k in candidates:
            padded_shape = k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
            padded = TensorFunction(name='p%s' % k.name, shape=padded_shape,
                                    dimensions=k.indices, onstack=k._mem_stack)
            mapper[k.indexed] = padded.indexed

    # Substitute original arrays with padded buffers
    processed = [SubstituteExpression(mapper).visit(n) for n in state.nodes]

    # Build Iteration trees for initialization and copy-back of padded arrays;
    # only entries whose original function is SymbolicData take part
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicData])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = init + as_tuple(processed) + copyback

    return {'nodes': processed}
def __init__(self, expressions, **kwargs):
    """
    Lower a collection of SymPy equations into a tree of nodes and
    initialise the underlying function object with it.

    :param expressions: One or more ``sympy.Eq`` objects.
    :param kwargs: Optional settings read here:

        * ``name``: name of the generated kernel (default ``"Kernel"``).
        * ``subs``: mapping applied to every expression via ``xreplace``
          (default: empty).
        * ``time_axis``: ``Forward`` (default) or ``Backward``.
        * ``dse``: DSE mode (default ``configuration['dse']``).
        * ``dle``: DLE mode (default ``configuration['dle']``).

    :raises InvalidOperator: If any input item is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a SymPy equation
    if not all(isinstance(eq, sympy.Eq) for eq in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    # User-provided options
    self.name = kwargs.get("name", "Kernel")
    user_subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse_mode = kwargs.get("dse", configuration['dse'])
    dle_mode = kwargs.get("dle", configuration['dle'])

    # Header files, etc. -- copies of the class-level defaults
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    time.reverse = (time_axis == Backward)

    # Lowering: indexify everything first, then apply the user substitutions
    indexified = [indexify(eq) for eq in expressions]
    expressions = [eq.xreplace(user_subs) for eq in indexified]

    # Analysis
    self.dtype = self._retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = self._retrieve_symbols(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Operator parameters: the inputs plus every dimension with no known
    # size (dimensions are necessary for data casts)
    unsized = [dim for dim in self.dimensions if dim.size is None]
    parameters = self.input + unsized

    # Cluster the expressions by Stencil, then let the Devito Symbolic
    # Engine (DSE) optimise the clusters
    clusters = clusterize(expressions, stencils)
    clusters = rewrite(clusters, mode=set_dse_mode(dse_mode))

    # Wrap the expressions with Iterations according to their dimensions
    nodes = self._schedule_expressions(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Resolve and substitute dimensions for loop index variables; the same
    # dict is passed to the resolver and then to the substituter
    index_subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=index_subs)
    nodes = SubstituteExpression(subs=index_subs).visit(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle_mode))

    # Record on the Operator what the DLE produced
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table = OrderedDict((func.name, FunMeta(func, True))
                                  for func in dle_state.elemental_functions)

    # Every DLE argument becomes a parameter; those that are Dimensions are
    # also tracked among the Operator dimensions
    dle_args = [entry.argument for entry in self.dle_arguments]
    parameters.extend(dle_args)
    self.dimensions.extend([entry.argument for entry in self.dle_arguments
                            if isinstance(entry.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(dle_state.nodes, parameters)

    # Introduce all required C declarations
    nodes = self._insert_declarations(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())