def _getkey(self, grid, dtype, dimensions=None):
    """
    Build a cache key from the platform ISA, the dtype, and a sorted tuple
    of Dimensions, taken from `grid` when available, otherwise from the
    space Dimensions in `dimensions`.
    """
    prefix = (configuration['platform'].isa, dtype)
    if grid is not None:
        # Grid available: key on time/stepping dims plus the grid dims
        sorted_dims = filter_sorted((grid.time_dim, grid.stepping_dim) +
                                    grid.dimensions)
    elif dimensions:
        # No grid: fall back to the space Dimensions explicitly provided
        sorted_dims = filter_sorted([d for d in dimensions if d.is_Space])
    else:
        sorted_dims = []
    return prefix + (tuple(sorted_dims),)
def detect_io(exprs, relax=False):
    """
    ``{exprs} -> ({reads}, {writes})``

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        The searched expressions.
    relax : bool, optional
        If False, as by default, collect only Constants and Functions.
        Otherwise, collect any Basic object.
    """
    exprs = as_tuple(exprs)
    if relax is False:
        rule = lambda i: i.is_Input
    else:
        rule = lambda i: i.is_Scalar or i.is_Tensor

    # Don't forget this nasty case, with indirections on the LHS:
    # >>> u[t, a[x]] = f[x] -> (reads={a, f}, writes={u})

    roots = []
    for i in exprs:
        try:
            roots.append(i.rhs)
            roots.extend(list(i.lhs.indices))
        except AttributeError:
            # E.g., FunctionFromPointer
            roots.append(i)

    reads = []
    terminals = flatten(retrieve_terminals(i, deep=True) for i in roots)
    for i in terminals:
        # Copy the set: `free_symbols` may hand back a cached/shared set,
        # which must not be mutated in place
        candidates = set(i.free_symbols)
        try:
            candidates.update({i.function})
        except AttributeError:
            pass
        for j in candidates:
            try:
                if rule(j):
                    reads.append(j)
            except AttributeError:
                pass

    writes = []
    for i in exprs:
        try:
            f = i.lhs.function
        except AttributeError:
            continue
        if rule(f):
            writes.append(f)

    return filter_sorted(reads), filter_sorted(writes)
def __repr__(self):
    """
    Pretty-print the tracked Functions with their read and write accesses,
    one Function per row; write accesses are highlighted via ANSI escapes.
    """
    # All Functions ever read or written, in deterministic (name) order
    tracked = filter_sorted(set(self.reads) | set(self.writes), key=lambda i: i.name)
    maxlen = max(1, max([len(i.name) for i in tracked]))
    # Row template: names right-justified to the longest tracked name
    out = "{:>%d} => W : {}\n{:>%d} R : {}" % (maxlen, maxlen)
    # Padding used to align continuation lines under the first access
    pad = " "*(maxlen + 9)
    reads = [self.getreads(i) for i in tracked]
    for i, r in enumerate(list(reads)):
        if not r:
            reads[i] = ''
            continue
        # First access stays on the row; subsequent ones are aligned below
        first = "%s" % tuple.__repr__(r[0])
        shifted = "\n".join("%s%s" % (pad, tuple.__repr__(j)) for j in r[1:])
        shifted = "%s%s" % ("\n" if shifted else "", shifted)
        reads[i] = first + shifted
    writes = [self.getwrites(i) for i in tracked]
    for i, w in enumerate(list(writes)):
        if not w:
            writes[i] = ''
            continue
        first = "%s" % tuple.__repr__(w[0])
        shifted = "\n".join("%s%s" % (pad, tuple.__repr__(j)) for j in w[1:])
        shifted = "%s%s" % ("\n" if shifted else "", shifted)
        # Highlight writes in red (ANSI escape sequence)
        writes[i] = '\033[1;37;31m%s\033[0m' % (first + shifted)
    return "\n".join([out.format(i.name, w, '', r)
                      for i, r, w in zip(tracked, reads, writes)])
def visit_Operator(self, o):
    """
    Generate a complete c.Module for an Operator: headers, includes, type
    definitions, elemental-function signatures, the kernel itself, and the
    elemental-function bodies.
    """
    # Kernel signature and body
    body = flatten(self._visit(i) for i in o.children)
    decls = self._args_decl(o.parameters)
    signature = c.FunctionDeclaration(c.Value(o.retval, o.name), decls)
    retval = [c.Statement("return 0")]
    kernel = c.FunctionBody(signature, c.Block(body + retval))

    # Elemental functions
    esigns = []
    efuncs = [blankline]
    for i in o._func_table.values():
        if i.local:
            # Only locally-defined functions get a forward declaration and body
            esigns.append(c.FunctionDeclaration(c.Value(i.root.retval, i.root.name),
                                                self._args_decl(i.root.parameters)))
            efuncs.extend([i.root.ccode, blankline])

    # Header files, extra definitions, ...
    header = [c.Line(i) for i in o._headers]
    includes = [c.Include(i, system=False) for i in o._includes]
    includes += [blankline]
    cdefs = [i._C_typedecl for i in o.parameters if i._C_typedecl is not None]
    # Sort by type name to ensure deterministic code generation
    cdefs = filter_sorted(cdefs, key=lambda i: i.tpname)
    if o._compiler.src_ext == 'cpp':
        # C++ translation unit: the kernel must retain C linkage
        cdefs += [c.Extern('C', signature)]
    cdefs = [i for j in cdefs for i in (j, blankline)]

    return c.Module(header + includes + cdefs + esigns + [blankline, kernel] + efuncs)
def dimension_sort(expr):
    """
    Topologically sort the :class:`Dimension`s in ``expr``, based on the order
    in which they appear within :class:`Indexed`s.
    """
    def handle_indexed(indexed):
        # Extract, in order, the Dimensions appearing in `indexed`'s indices;
        # the resulting tuple acts as an ordering relation
        relation = []
        for i in indexed.indices:
            try:
                maybe_dim = split_affine(i).var
                if isinstance(maybe_dim, Dimension):
                    relation.append(maybe_dim)
            except ValueError:
                # Maybe there are some nested Indexeds (e.g., the situation is A[B[i]])
                nested = flatten(handle_indexed(n) for n in retrieve_indexed(i))
                if nested:
                    relation.extend(nested)
                else:
                    # Fallback: Just insert all the Dimensions we find, regardless of
                    # what the user is attempting to do
                    relation.extend([d for d in filter_sorted(i.free_symbols)
                                     if isinstance(d, Dimension)])
        return tuple(relation)

    relations = {handle_indexed(i) for i in retrieve_indexed(expr, mode='all')}

    # Add in any implicit dimension (typical of scalar temporaries, or Step)
    relations.add(expr.implicit_dims)

    # Add in leftover free dimensions (not an Indexed' index)
    extra = set([i for i in expr.free_symbols if isinstance(i, Dimension)])

    # Add in pure data dimensions (e.g., those accessed only via explicit values,
    # such as A[3])
    indexeds = retrieve_indexed(expr, deep=True)
    extra.update(set().union(*[set(i.function.indices) for i in indexeds]))

    # Enforce determinism
    extra = filter_sorted(extra, key=attrgetter('name'))

    # Add in implicit relations for parent dimensions
    # -----------------------------------------------
    # 1) Note that (d.parent, d) is what we want, while (d, d.parent) would be
    # wrong; for example, in `((t, time), (t, x, y), (x, y))`, `x` could now
    # preceed `time`, while `t`, and therefore `time`, *must* appear before `x`,
    # as indicated by the second relation
    implicit_relations = {(d.parent, d) for d in extra if d.is_Derived}
    # 2) To handle cases such as `((time, xi), (x,))`, where `xi` a SubDimension
    # of `x`, besides `(x, xi)`, we also have to add `(time, x)` so that we
    # obtain the desired ordering `(time, x, xi)`. W/o `(time, x)`, the ordering
    # `(x, time, xi)` might be returned instead, which would be non-sense
    implicit_relations.update({tuple(d.root for d in i) for i in relations})

    ordering = PartialOrderTuple(extra, relations=(relations | implicit_relations))

    return ordering
def detect_io(exprs, relax=False):
    """``{exprs} -> ({reads}, {writes})``

    :param exprs: The expressions inspected.
    :param relax: (Optional) if False, as by default, collect only
                  :class:`Constant`s and :class:`Function`s. Otherwise,
                  collect any :class:`Basic`s.
    """
    exprs = as_tuple(exprs)
    if relax is False:
        rule = lambda i: i.is_Input
    else:
        rule = lambda i: i.is_Scalar or i.is_Tensor

    reads = []
    for i in flatten(retrieve_terminals(i, deep=True) for i in exprs):
        # Copy the set: `free_symbols` may hand back a cached/shared set,
        # which must not be mutated in place
        candidates = set(i.free_symbols)
        try:
            candidates.update({i.base.function})
        except AttributeError:
            pass
        for j in candidates:
            try:
                if rule(j):
                    reads.append(j)
            except AttributeError:
                pass

    writes = []
    for i in exprs:
        try:
            f = i.lhs.base.function
        except AttributeError:
            continue
        if rule(f):
            writes.append(f)

    return filter_sorted(reads), filter_sorted(writes)
def place_ondevice(self, iet, **kwargs):
    """
    Map onto device memory the DiscreteFunctions accessed within offloaded
    Iteration trees: written Functions are mapped read-write, Functions that
    are only ever read are mapped read-only.
    """
    efuncs = kwargs['efuncs']

    storage = Storage()

    if iet.is_ElementalFunction:
        # Mapping is only performed at the root Callable level
        return iet, {}

    # Collect written and read-only symbols
    writes = set()
    reads = set()
    for efunc in efuncs:
        for i, v in MapExprStmts().visit(efunc).items():
            if not i.is_Expression:
                # No-op
                continue
            if not any(isinstance(j, self._Parallelizer._Iteration) for j in v):
                # Not an offloaded Iteration tree
                continue
            if i.write.is_DiscreteFunction:
                writes.add(i.write)
            # Anything written is removed from the read-only set
            reads = (reads | {r for r in i.reads if r.is_DiscreteFunction}) - writes

    # Update `storage`
    for i in filter_sorted(writes):
        self._map_function_on_high_bw_mem(i, storage)
    for i in filter_sorted(reads):
        self._map_function_on_high_bw_mem(i, storage, read_only=True)

    iet = self._dump_storage(iet, storage)

    return iet, {}
def handle_indexed(indexed):
    """
    Return, as a tuple, the Dimensions appearing in `indexed`'s indices,
    in access order; the tuple acts as an ordering relation.
    """
    relation = []
    for i in indexed.indices:
        try:
            maybe_dim = split_affine(i).var
            if isinstance(maybe_dim, Dimension):
                relation.append(maybe_dim)
        except ValueError:
            # Maybe there are some nested Indexeds (e.g., the situation is A[B[i]])
            nested = flatten(handle_indexed(n) for n in retrieve_indexed(i))
            if nested:
                relation.extend(nested)
            else:
                # Fallback: Just insert all the Dimensions we find, regardless of
                # what the user is attempting to do
                relation.extend([d for d in filter_sorted(i.free_symbols)
                                 if isinstance(d, Dimension)])
    return tuple(relation)
def visit_Operator(self, o): blankline = c.Line("") # Kernel signature and body body = flatten(self._visit(i) for i in o.children) decls = self._args_decl(o.parameters) signature = c.FunctionDeclaration(c.Value(o.retval, o.name), decls) retval = [c.Line(), c.Statement("return 0")] kernel = c.FunctionBody(signature, c.Block(body + retval)) # Elemental functions esigns = [] efuncs = [blankline] for i in o._func_table.values(): if i.local: esigns.append( c.FunctionDeclaration(c.Value(i.root.retval, i.root.name), self._args_decl(i.root.parameters))) efuncs.extend([i.root.ccode, blankline]) # Header files, extra definitions, ... header = [c.Define(*i) for i in o._headers] + [blankline] includes = [ c.Include(i, system=(False if i.endswith('.h') else True)) for i in o._includes ] includes += [blankline] cdefs = [ i._C_typedecl for i in o.parameters if i._C_typedecl is not None ] for i in o._func_table.values(): if i.local: cdefs.extend([ j._C_typedecl for j in i.root.parameters if j._C_typedecl is not None ]) cdefs = filter_sorted(cdefs, key=lambda i: i.tpname) if o._compiler.src_ext == 'cpp': cdefs += [c.Extern('C', signature)] cdefs = [i for j in cdefs for i in (j, blankline)] return c.Module(header + includes + cdefs + esigns + [blankline, kernel] + efuncs)
def __init__(self, nodes, dimension, limits, index=None, offsets=None,
             properties=None, pragmas=None, uindices=None):
    """
    Initialize the Iteration over `dimension` with body `nodes` and loop
    `limits`, plus optional `index` name, `offsets`, `properties`,
    `pragmas` and unbounded indices `uindices`.
    """
    # Ensure we deal with a list of Expression objects internally
    nodes = as_tuple(nodes)
    self.nodes = as_tuple([n if isinstance(n, Node) else Expression(n)
                           for n in nodes])
    assert all(isinstance(i, Node) for i in self.nodes)

    self.dim = dimension
    self.index = index or self.dim.name
    # Store direction, as it might change on the dimension
    # before we use it during code generation.
    self.reverse = self.dim.reverse

    # Generate loop limits
    if isinstance(limits, Iterable):
        assert len(limits) == 3
        self.limits = list(limits)
    else:
        self.limits = list((0, limits, 1))

    # Record offsets to later adjust loop limits accordingly
    self.offsets = [0, 0]
    for off in (offsets or {}):
        self.offsets[0] = min(self.offsets[0], int(off))
        self.offsets[1] = max(self.offsets[1], int(off))

    # Track this Iteration's properties, pragmas and unbounded indices
    properties = as_tuple(properties)
    # Bug fix: the original asserted a bare generator expression, which is
    # always truthy; wrap in `all(...)` so unknown properties are rejected
    assert all(i in IterationProperty._KNOWN for i in properties)
    self.properties = as_tuple(filter_sorted(properties, key=lambda i: i.name))
    self.pragmas = as_tuple(pragmas)
    self.uindices = as_tuple(uindices)
    assert all(isinstance(i, UnboundedIndex) for i in self.uindices)
def mpiize(iet, **kwargs):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.
    """
    mode = kwargs.pop('mode')
    language = kwargs.pop('language')
    sregistry = kwargs.pop('sregistry')

    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}

    # Two builders: a 'basic' one for plain HaloSpots, and one honouring the
    # user-requested `mode` for overlappable HaloSpots
    sync_heb = HaloExchangeBuilder('basic', language, sregistry, **generators)
    user_heb = HaloExchangeBuilder(mode, language, sregistry, **generators)
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        heb = user_heb if isinstance(hs, OverlappableHaloSpot) else sync_heb
        mapper[hs] = heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = filter_sorted(sync_heb.objs + user_heb.objs)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for i in reversed(tree):
            if i in mapper:
                # Already seen this subtree, skip
                break
            if FindNodes(Call).visit(i):
                mapper.update({n: n._rebuild(properties=set(n.properties) - {PARALLEL})
                               for n in tree[:tree.index(i) + 1]})
                break
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
def __init__(self, nodes, dimension, limits, direction=None, properties=None,
             pragmas=None, uindices=None):
    """
    Initialize the Iteration over `dimension` with body `nodes` and loop
    `limits`, plus optional `direction`, `properties`, `pragmas` and
    unbounded indices `uindices`.
    """
    self.nodes = as_tuple(nodes)
    self.dim = dimension
    self.index = self.dim.name
    self.direction = direction or Forward

    # Generate loop limits
    if isinstance(limits, Iterable):
        assert len(limits) == 3
        self.limits = tuple(limits)
    elif self.dim.is_Incr:
        # Incremental (e.g., blocked) Dimensions iterate from their symbolic
        # minimum in steps of `dim.step`
        self.limits = (self.dim.symbolic_min, limits, self.dim.step)
    else:
        self.limits = (0, limits, 1)

    # Track this Iteration's properties, pragmas and unbounded indices
    properties = as_tuple(properties)
    # Bug fix: the original asserted a bare generator expression, which is
    # always truthy; wrap in `all(...)` so unknown properties are rejected
    assert all(i in Property._KNOWN for i in properties)
    self.properties = as_tuple(filter_sorted(properties))
    self.pragmas = as_tuple(pragmas)
    self.uindices = as_tuple(uindices)
    assert all(i.is_Derived and self.dim in i._defines for i in self.uindices)
def __init__(self, nodes, dimension, limits, index=None, offsets=None,
             direction=None, properties=None, pragmas=None, uindices=None):
    """
    Initialize the Iteration over `dimension` with body `nodes` and loop
    `limits`, plus optional `index` name, `offsets`, `direction`,
    `properties`, `pragmas` and unbounded indices `uindices`.
    """
    # Ensure we deal with a list of Expression objects internally
    self.nodes = as_tuple(nodes)

    self.dim = dimension
    self.index = index or self.dim.name
    self.direction = direction or Forward

    # Generate loop limits
    if isinstance(limits, Iterable):
        assert len(limits) == 3
        self.limits = tuple(limits)
    else:
        self.limits = (0, limits, 1)

    # Record offsets to later adjust loop limits accordingly
    self.offsets = (0, 0) if offsets is None else as_tuple(offsets)
    assert len(self.offsets) == 2

    # Track this Iteration's properties, pragmas and unbounded indices
    properties = as_tuple(properties)
    # Bug fix: the original asserted a bare generator expression, which is
    # always truthy; wrap in `all(...)` so unknown properties are rejected
    assert all(i in IterationProperty._KNOWN for i in properties)
    self.properties = as_tuple(filter_sorted(properties, key=lambda i: i.name))
    self.pragmas = as_tuple(pragmas)
    self.uindices = as_tuple(uindices)
    assert all(isinstance(i, UnboundedIndex) for i in self.uindices)
def _operator_typedecls(self, o, mode='all'):
    """
    Collect the C type declarations required by the Operator `o`'s
    parameters and by the parameters of its local elemental functions.

    `mode` selects the declarations returned: 'all' (no filtering),
    'public' (AbstractFunction/CompositeObject only), or anything else
    for the complement of 'public'.
    """
    public_types = (AbstractFunction, CompositeObject)
    if mode == 'all':
        def xfilter(p):
            return True
    elif mode == 'public':
        def xfilter(p):
            return isinstance(p, public_types)
    else:
        def xfilter(p):
            return not isinstance(p, public_types)

    # Parameters of the Operator itself ...
    candidates = list(o.parameters)
    # ... plus those of any locally-defined elemental function
    for meta in o._func_table.values():
        if meta.local:
            candidates.extend(meta.root.parameters)

    typedecls = [p._C_typedecl for p in candidates
                 if xfilter(p) and p._C_typedecl is not None]

    # Sort by type name to ensure deterministic output
    return filter_sorted(typedecls, key=lambda t: t.tpname)
def cause(self):
    """Return, as a tuple sorted by name, the non-None `cause`s in self."""
    collected = []
    for item in self:
        c = item.cause
        if c is not None:
            collected.append(c)
    return tuple(filter_sorted(collected, key=lambda c: c.name))
def _build(cls, expressions, **kwargs):
    """
    Lower the input expressions into an Operator: Eq -> Clusters ->
    Schedule tree -> IET -> Callable, attaching the profiling machinery
    and all state required for JIT compilation and execution.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    name = kwargs.get("name", "Kernel")
    dse = kwargs.get("dse", configuration['dse'])

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions to internal expressions (e.g., attaching metadata)
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on their iteration space and data dependences
    # Several optimizations are applied (fusion, lifting, flop reduction via DSE, ...)
    clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument the IET for C-level profiling
    iet = profiler.instrument(iet)

    # Wrap the IET with a Callable
    parameters = derive_parameters(iet, True)
    op = Callable(name, iet, 'int', parameters, ())

    # Lower IET to a Target-specific IET
    op, target_state = cls._specialize_iet(op, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **op.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(target_state.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(target_state.includes)

    # Required for the jit-compilation
    op._compiler = configuration['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(OrderedDict([(i, MetaCall(None, False))
                                       for i in profiler._ext_calls]))
    op._func_table.update(OrderedDict([(i.root.name, i) for i in target_state.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = filter_sorted(flatten(e.dimensions for e in expressions))
    op._dimensions.extend(target_state.dimensions)
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def place_definitions(self, iet, **kwargs):
    """
    Create a new IET with symbols allocated/deallocated in some memory space.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    """
    storage = Storage()

    # Collect and declare symbols
    for k, v in MapExprStmts().visit(iet).items():
        if k.is_Expression:
            if k.is_definition:
                # Scalar definitions go in the low latency memory at the
                # innermost enclosing site
                site = v[-1] if v else iet
                self._alloc_scalar_on_low_lat_mem(site, k, storage)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.arguments
        for i in objs:
            try:
                if i.is_LocalObject:
                    site = v[-1] if v else iet
                    self._alloc_object_on_low_lat_mem(site, i, storage)
                elif i.is_Array:
                    if i in iet.parameters:
                        # The Array is passed as a Callable argument
                        continue
                    elif i._mem_stack:
                        self._alloc_array_on_low_lat_mem(iet, i, storage)
                    else:
                        self._alloc_array_on_high_bw_mem(i, storage)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Place symbols in a memory space
    if not iet.is_ElementalFunction:
        writes = set()
        reads = set()
        for efunc in kwargs.get('efuncs', []):
            for i in FindNodes(Expression).visit(efunc):
                if i.write.is_Function:
                    writes.add(i.write)
                # Anything written is removed from the read-only set
                reads = (reads | {r for r in i.reads if r.is_Function}) - writes
        for i in filter_sorted(writes):
            self._map_function_on_high_bw_mem(i, storage)
        for i in filter_sorted(reads):
            self._map_function_on_high_bw_mem(i, storage, read_only=True)

    # Introduce symbol definitions going in the low latency memory
    mapper = dict(storage._on_low_lat_mem)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce symbol definitions going in the high bandwidth memory
    header = []
    footer = []
    for decl, alloc, free in storage._on_high_bw_mem:
        if decl is None:
            header.append(alloc)
        else:
            header.extend([decl, alloc])
        footer.append(free)
    if header or footer:
        body = List(header=header, body=iet.body, footer=footer)
        iet = iet._rebuild(body=body)

    return iet, {}
def dimension_sort(expr):
    """
    Topologically sort the Dimensions in ``expr``, based on the order in which
    they appear within Indexeds.
    """
    def handle_indexed(indexed):
        # Extract, in order, the Dimensions appearing in `indexed`'s indices;
        # the resulting tuple acts as an ordering relation
        relation = []
        for i in indexed.indices:
            try:
                maybe_dim = split_affine(i).var
                if isinstance(maybe_dim, Dimension):
                    relation.append(maybe_dim)
            except ValueError:
                # Maybe there are some nested Indexeds (e.g., the situation is A[B[i]])
                nested = flatten(handle_indexed(n) for n in retrieve_indexed(i))
                if nested:
                    relation.extend(nested)
                else:
                    # Fallback: Just insert all the Dimensions we find, regardless of
                    # what the user is attempting to do
                    relation.extend([d for d in filter_sorted(i.free_symbols)
                                     if isinstance(d, Dimension)])
        return tuple(relation)

    if isinstance(expr.implicit_dims, IgnoreDimSort):
        # The caller explicitly opted out of index-based ordering relations
        relations = set()
    else:
        relations = {handle_indexed(i) for i in retrieve_indexed(expr)}

    # Add in any implicit dimension (typical of scalar temporaries, or Step)
    relations.add(expr.implicit_dims)

    # Add in leftover free dimensions (not an Indexed' index)
    extra = set([i for i in expr.free_symbols if isinstance(i, Dimension)])

    # Add in pure data dimensions (e.g., those accessed only via explicit values,
    # such as A[3])
    indexeds = retrieve_indexed(expr, deep=True)
    extra.update(set().union(*[set(i.function.dimensions) for i in indexeds]))

    # Enforce determinism
    extra = filter_sorted(extra, key=attrgetter('name'))

    # Add in implicit relations for parent dimensions
    # -----------------------------------------------
    # 1) Note that (d.parent, d) is what we want, while (d, d.parent) would be
    # wrong; for example, in `((t, time), (t, x, y), (x, y))`, `x` could now
    # preceed `time`, while `t`, and therefore `time`, *must* appear before `x`,
    # as indicated by the second relation
    implicit_relations = {(d.parent, d) for d in extra if d.is_Derived}
    # 2) To handle cases such as `((time, xi), (x,))`, where `xi` a SubDimension
    # of `x`, besides `(x, xi)`, we also have to add `(time, x)` so that we
    # obtain the desired ordering `(time, x, xi)`. W/o `(time, x)`, the ordering
    # `(x, time, xi)` might be returned instead, which would be non-sense
    implicit_relations.update({tuple(d.root for d in i) for i in relations})

    ordering = PartialOrderTuple(extra, relations=(relations | implicit_relations))

    return ordering
def _specialize_iet(self, iet, **kwargs):
    """
    Transform the IET for the OPS backend: affine iteration trees are
    replaced with `ops_par_loop` calls, and the necessary OPS setup and
    teardown (init, block/dat declarations, partition, fetch, exit) is
    glued around the remaining tree.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block, namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory.
        after_time_loop.extend(create_ops_fetch(f, name_to_ops_dat,
                                                self.time_dimension.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(tree, n, name_to_ops_dat,
                                                        ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
            MetaCall(ops_kernel, False)
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    # Bug fix: the original asserted a bare generator expression, which is
    # always truthy; wrap in `all(...)` so the check actually runs
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop,
            ops_partition, iet, *after_time_loop, ops_exit]

    return List(body=body)
def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
    """
    Build the device-streaming machinery around `iet`: an initial fetch
    performed by the main thread, per-round `present` clauses, and a
    prefetch task offloaded to a dedicated host thread.
    """
    fid = SharedData._field_id

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        f = s.function
        dimensions = s.dimensions
        fc = s.fetch
        ifc = s.ifetch
        pfc = s.pfetch
        fcond = s.fcond
        pcond = s.pcond

        # Construct init IET
        imask = [(ifc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        fetch = PragmaTransfer(self.lang._map_to, f, imask=imask)
        fetches.append(Conditional(fcond, fetch))

        # Construct present clauses
        imask = [(fc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        presents.append(PragmaTransfer(self.lang._map_present, f, imask=imask))

        # Construct prefetch IET
        imask = [(pfc, s.size) if d.root is s.dim.root else FULL for d in dimensions]
        prefetch = PragmaTransfer(self.lang._map_to_wait, f, imask=imask, queueid=fid)
        prefetches.append(Conditional(pcond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    pieces.init.append(List(header=c.Comment("Initialize data stream"),
                            body=[Call(name, parameters), BlankLine]))

    # Turn prefetch IET into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    iet = List(body=[
        BlankLine,
        # Busy-wait until the thread's shared flag returns to 1 before reuse
        BusyWait(CondNe(FieldFromComposite(sdata._field_flag,
                                           sdata[threads.index]), 1))
    ] + presents + [iet, tctx.activate])

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, sdata, threads)

    return iet
def visit_Expression(self, o):
    """Return the symbols matched by `self.rule` in `o`, sorted by name."""
    matched = [f for f in self.rule(o)]
    return filter_sorted(matched, key=attrgetter('name'))
def visit_tuple(self, o):
    """Visit every element of `o` and merge the results, sorted by name."""
    collected = []
    for element in o:
        collected.extend(self._visit(element))
    return filter_sorted(collected, key=attrgetter('name'))
def visit_Iteration(self, o):
    """Symbols gathered from `o`'s children plus those matched in `o` itself."""
    gathered = flatten([self._visit(child) for child in o.children])
    gathered = gathered + self.rule(o)
    return filter_sorted(gathered, key=attrgetter('name'))
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Build the device-streaming machinery around `iet`: an initial fetch
    performed by the main thread, per-round `present` clauses, and a
    prefetch task run by a dedicated host thread.
    """
    threads = self.__make_threads()

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        if s.direction is Forward:
            # Forward iteration: first fetch at the dimension minimum,
            # prefetch one step ahead; both guarded against overrun
            fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
            fsize = s.function._C_get_field(FULL, s.dim).size
            fc_cond = fc + (s.size - 1) < fsize
            pfc = s.fetch + 1
            pfc_cond = pfc + (s.size - 1) < fsize
        else:
            # Backward iteration: mirror image, guarded against underrun
            fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
            fc_cond = fc >= 0
            pfc = s.fetch - 1
            pfc_cond = pfc >= 0

        # Construct fetch IET
        imask = [(fc, s.size) if d.root is s.dim.root else FULL for d in s.dimensions]
        fetch = List(header=self._P._map_to(s.function, imask))
        fetches.append(Conditional(fc_cond, fetch))

        # Construct present clauses
        imask = [(s.fetch, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        presents.extend(as_list(self._P._map_present(s.function, imask)))

        # Construct prefetch IET
        imask = [(pfc, s.size) if d.root is s.dim.root else FULL for d in s.dimensions]
        prefetch = List(header=self._P._map_to_wait(s.function, imask,
                                                    SharedData._field_id))
        prefetches.append(Conditional(pfc_cond, prefetch))

    functions = filter_ordered(s.function for s in sync_ops)
    casts = [PointerCast(f) for f in functions]

    # Turn init IET into a Callable
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=casts + fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    pieces.init.append(List(header=c.Comment("Initialize data stream for `%s`"
                                             % threads.name),
                            body=[Call(name, func.parameters), BlankLine]))

    # Turn prefetch IET into a threaded Callable
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=casts + prefetches)
    tfunc, sdata = self.__make_tfunc(name, body, root, threads)
    pieces.funcs.append(tfunc)

    # Glue together all the IET pieces, including the activation bits
    iet = List(body=[
        BlankLine,
        # Busy-wait until the thread's shared flag returns to 1 before reuse
        BusyWait(CondNe(FieldFromComposite(sdata._field_flag,
                                           sdata[threads.index]), 1)),
        List(header=presents),
        iet,
        self.__make_activate_thread(threads, sdata, sync_ops)
    ])

    # Fire up the threads
    pieces.init.append(self.__make_init_threads(threads, sdata, tfunc, pieces))
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return iet
def make_ops_kernels(iet):
    """
    Offload the affine iteration trees of `iet` to OPS kernels, returning
    the transformed IET plus the generated kernel functions, extra headers
    and includes.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet, {}

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block, namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # Copy data from device to host
        after_time_loop.extend(create_ops_fetch(f, name_to_ops_dat,
                                                f.grid.time_dim.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    ffuncs = []
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(tree, n, name_to_ops_dat,
                                                        ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        ffuncs.append(ops_kernel)
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    # Bug fix: the original asserted a bare generator expression, which is
    # always truthy; wrap in `all(...)` so the check actually runs
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    iet = iet._rebuild(body=flatten([ops_init, ops_block_init, pre_time_loop,
                                     ops_partition, iet.body, after_time_loop,
                                     ops_exit]))

    return iet, {'includes': ['stdio.h', 'ops_seq.h'],
                 'ffuncs': ffuncs,
                 'headers': [namespace['ops_define_dimension'](dims[0])]}
def _build(cls, expressions, **kwargs):
    """
    Lower the input expressions into an Operator: Evaluable -> Clusters ->
    ScheduleTree -> IET -> Callable, attaching the profiling machinery and
    all state required for JIT compilation and execution.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Evaluable) for i in expressions):
        raise InvalidOperator("Only `devito.Evaluable` are allowed.")

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on iteration spaces and data dependences
    clusters = cls._lower_clusters(expressions, profiler, **kwargs)

    # Lower Clusters to a ScheduleTree
    stree = cls._lower_stree(clusters, **kwargs)

    # Lower ScheduleTree to an Iteration/Expression Tree
    iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **iet.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(byproduct.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(byproduct.includes)

    # Required for the jit-compilation
    op._compiler = kwargs['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(OrderedDict([(i, MetaCall(None, False))
                                       for i in profiler._ext_calls]))
    op._func_table.update(OrderedDict([(i.root.name, i) for i in byproduct.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = flatten(c.dimensions for c in clusters) + byproduct.dimensions
    op._dimensions = sorted(set(op._dimensions), key=attrgetter('name'))
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def detect_io(exprs, relax=False):
    """
    ``{exprs} -> ({reads}, {writes})``

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        The searched expressions.
    relax : bool, optional
        If False, as by default, collect only Constants and Functions.
        Otherwise, collect any Basic object.
    """
    exprs = as_tuple(exprs)

    # The predicate deciding whether an object counts as a read/write
    if relax is False:
        def rule(obj):
            return obj.is_Input
    else:
        def rule(obj):
            return obj.is_Scalar or obj.is_Tensor

    # Gather the sub-expressions to search for reads. Don't forget the nasty
    # case with indirections on the LHS:
    # >>> u[t, a[x]] = f[x] -> (reads={a, f}, writes={u})
    roots = []
    for e in exprs:
        try:
            roots.append(e.rhs)
            roots.extend(list(e.lhs.indices))
            roots.extend(list(e.conditionals.values()))
        except AttributeError:
            # E.g., FunctionFromPointer
            roots.append(e)

    # Collect the reads
    reads = []
    for term in flatten(retrieve_terminals(r, deep=True) for r in roots):
        candidates = set(term.free_symbols)
        try:
            candidates.add(term.function)
        except AttributeError:
            pass
        for c in candidates:
            try:
                if rule(c):
                    reads.append(c)
            except AttributeError:
                pass

    # Collect the writes, i.e. the functions appearing on some LHS
    writes = []
    for e in exprs:
        try:
            f = e.lhs.function
        except AttributeError:
            continue
        try:
            if rule(f):
                writes.append(f)
        except AttributeError:
            # We only end up here after complex IET transformations which make
            # use of composite types
            assert isinstance(e.lhs, FunctionFromPointer)
            f = e.lhs.base.function
            if rule(f):
                writes.append(f)

    return filter_sorted(reads), filter_sorted(writes)
def __init__(self, expressions, **kwargs):
    """
    Initialize an Operator from a list of `devito.Eq` expressions, lowering
    them through Clusters and a Schedule tree down to an
    Iteration/Expression tree (IET), then delegating to the superclass
    (Callable-like) constructor with the finalized IET and its parameters.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self._func_table = OrderedDict()

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    self._state = {}

    # Form and gather any required implicit expressions
    expressions = self._add_implicit(expressions)

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    self._output = filter_sorted(flatten(e.writes for e in expressions))
    self._dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)

    # Derive all Operator parameters based on the IET
    parameters = derive_parameters(iet, True)

    # Finalization: introduce declarations, type casts, etc
    iet = self._finalize(iet, parameters)

    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap `iet` with the machinery for streaming data between host and
    device: an initial fetch (run once by the main thread), per-iteration
    prefetches (run by a separate thread function), and a busy-wait plus
    "present" clauses guarding `iet` itself.

    NOTE(review): `sync_ops` items appear to describe one streamed function
    each (fields: direction, dim, fetch, size, dimensions, next_cbk,
    function) — confirm against the SyncOp definition elsewhere.
    """
    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        if s.direction is Forward:
            # Forward sweep: first slice at the dimension's minimum,
            # prefetch the next one (fetch + 1)
            fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
            pfc = s.fetch + 1
            fc_cond = s.next_cbk(s.dim.symbolic_min)
            pfc_cond = s.next_cbk(s.dim + 1)
        else:
            # Backward sweep: mirror image (start at max, prefetch fetch - 1)
            fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
            pfc = s.fetch - 1
            fc_cond = s.next_cbk(s.dim.symbolic_max)
            pfc_cond = s.next_cbk(s.dim - 1)

        # Construct init IET; `imask` restricts the mapping to the streamed
        # slice along s.dim, FULL elsewhere
        imask = [(fc, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        fetch = PragmaList(self.lang._map_to(s.function, imask),
                           {s.function} | fc.free_symbols)
        fetches.append(Conditional(fc_cond, fetch))

        # Construct present clauses
        imask = [(s.fetch, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        presents.extend(as_list(self.lang._map_present(s.function, imask)))

        # Construct prefetch IET
        imask = [(pfc, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        prefetch = PragmaList(self.lang._map_to_wait(s.function, imask,
                                                     SharedData._field_id),
                              {s.function} | pfc.free_symbols)
        prefetches.append(Conditional(pfc_cond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    pieces.init.append(List(
        header=c.Comment("Initialize data stream"),
        body=[Call(name, parameters), BlankLine]
    ))

    # Turn prefetch IET into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic:
    # busy-wait until the prefetch thread flags completion (flag != 1),
    # declare data present, run `iet`, then reactivate the thread
    sdata = tctx.sdata
    threads = tctx.threads
    iet = List(body=[
        BlankLine,
        BusyWait(CondNe(FieldFromComposite(sdata._field_flag,
                                           sdata[threads.index]), 1)),
        List(header=presents),
        iet,
        tctx.activate
    ])

    # Fire up the threads
    pieces.init.append(tctx.init)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    return iet
def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.

    Returns the transformed tree (with each extracted sub-tree replaced by a
    Call) and a dict exposing the new elemental functions.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        # The sub-tree rooted at `root` gets outlined
        target = tree[tree.index(root):]

        # Elemental function arguments
        args = []  # Found so far (scalars, tensors)
        maybe_required = set()  # Scalars that *may* have to be passed in
        not_required = set()  # Elemental function locally declared scalars

        # Build a new Iteration/Expression tree with free bounds: each loop's
        # limits become (start, finish) scalars passed in at the call site
        free = []
        for i in target:
            name, bounds = i.dim.name, i.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % name, dtype=np.int32)
            args.extend(zip([ccode(j) for j in bounds], (start, finish)))

            # Iteration unbounded indices: likewise parametrized via scalars
            ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                     for j in range(len(i.uindices))]
            args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)]
            free.append(i._rebuild(limits=limits, offsets=None,
                                   uindices=uindices))
            not_required.update({i.dim}, set(j.index for j in i.uindices))

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict((zip(target, free)))).visit(root)
        expressions = FindNodes(Expression).visit(free)
        fsymbols = FindSymbols('symbolics').visit(free)

        # Add all definitely-required arguments
        not_required.update({i.output for i in expressions if i.is_scalar})
        for i in fsymbols:
            if i in not_required:
                continue
            elif i.is_Array:
                # Arrays are passed as raw, cast pointers
                args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
            elif i.is_TensorFunction:
                args.append(("%s_vec" % i.name, i))
            elif i.is_Scalar:
                args.append((i.name, i))

        # Add all maybe-required arguments that turn out to be required
        maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
        for i in fsymbols:
            not_required.update({as_symbol(i), i.indexify()})
            for j in i.symbolic_shape:
                maybe_required.update(j.free_symbols)
        # Sort by name for deterministic signatures
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                     for i in required])

        call, params = zip(*args)
        handle = flatten([p.rtargs for p in params])
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable (setdefault: reuse if the same tag was
        # already elementized by an earlier tree)
        functions.setdefault(name,
                             Callable(name, free, 'void', handle, ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def __init__(self, expressions, **kwargs):
    """
    Legacy Operator constructor: lower plain SymPy equations down to an
    Iteration/Expression tree (IET) and delegate to the superclass.

    NOTE(review): unlike the sibling constructors in this file, this variant
    exposes public attributes (``func_table``, ``input``, ``output``,
    ``dimensions``, ``profiler``) rather than underscore-prefixed ones.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = schedule(clusters)
    stree = section(stree)

    # Lower Sections to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Insert code for C-level performance profiling
    iet, self.profiler = self._profile_sections(iet)

    # Translate into backend-specific representation
    iet = self._specialize_iet(iet, **kwargs)

    # Insert the required symbol declarations
    iet = iet_insert_C_decls(iet, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    iet = self._build_casts(iet)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(iet)

    # Finish instantiation
    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def visit_Call(self, o):
    """
    Collect the symbols of a Call node: those gathered from its children
    plus those the visitor's `rule` extracts from the Call itself, returned
    sorted (and filtered) by name.
    """
    symbols = self._visit(o.children)
    # FIX: dropped the pointless identity comprehension `[f for f in ...]`;
    # extend() takes any iterable directly
    symbols.extend(self.rule(o))
    return filter_sorted(symbols, key=attrgetter('name'))
def _specialize_iet(self, iet, **kwargs):
    """
    Transform the IET into an OPS-specific representation: each offloadable
    affine iteration tree becomes an OPS kernel plus an `ops_par_loop` call,
    and the OPS runtime init/partition/exit calls are wired around the tree.

    NOTE(review): `dims[0]` is read unconditionally, so this assumes at
    least one affine tree was found — confirm callers guarantee this.
    """
    warning("The OPS backend is still work-in-progress")

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in find_affine_trees(iet).items():
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"'))
    )))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            trees, n, name_to_ops_dat, ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)
        mapper[trees[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    # FIX: the original `assert (<genexpr>)` asserted a generator object,
    # which is always truthy, so the check never actually ran; `all(...)`
    # performs the intended verification
    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop,
            ops_partition, iet, ops_exit]

    return List(body=body)
def cause(self):
    """
    Gather the non-None `cause` of each item in this collection; for every
    derived cause, its parent is included too. The result is returned as a
    tuple, filtered and ordered by name via `filter_sorted`.
    """
    causes = []
    for item in self:
        if item.cause is not None:
            causes.append(item.cause)
    # Derived causes additionally pull in their parent (scan a snapshot so
    # the appended parents are not themselves re-examined)
    causes.extend(obj.parent for obj in list(causes) if obj.is_Derived)
    return tuple(filter_sorted(causes, key=lambda obj: obj.name))
def __init__(self, expressions, **kwargs):
    """
    Initialize an Operator from `devito.Eq` expressions. Compared with the
    sibling constructors in this file, this variant additionally evaluates
    derivatives (`i.evaluate`) before indexification and initializes the
    internal state via `_initialize_state`.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self._func_table = OrderedDict()

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    self._state = self._initialize_state(**kwargs)

    # Form and gather any required implicit expressions
    expressions = self._add_implicit(expressions)

    # Expression lowering: evaluation of derivatives, indexification,
    # substitution rules, specialization
    expressions = [i.evaluate for i in expressions]
    expressions = [indexify(i) for i in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    self._output = filter_sorted(flatten(e.writes for e in expressions))
    self._dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)

    # Derive all Operator parameters based on the IET
    parameters = derive_parameters(iet, True)

    # Finalization: introduce declarations, type casts, etc
    iet = self._finalize(iet, parameters)

    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Legacy Operator constructor (DLE era): lower plain SymPy equations to an
    Iteration/Expression tree, run the Devito Loop Engine (DLE) over it, and
    delegate to the superclass with the finalized tree and its parameters.

    NOTE(review): like the other legacy variant in this file, this exposes
    public attributes (``func_table``, ``input``, ``output``, ``dimensions``,
    ``profiler``, ``dle_args``, ``dle_flags``).
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize_iet(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE: register the elemental
    # functions it produced and any new Dimension arguments it introduced
    self.dle_args = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                        for i in dle_state.elemental_functions]))
    self.dimensions.extend([i.argument for i in self.dle_args
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())