def _specialize_iet(self, iet, **kwargs):
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there are no affine trees, then there is no loop to be optimized using OPS
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block, namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue

        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory
        after_time_loop.extend(create_ops_fetch(f, name_to_ops_dat,
                                                self.time_dimension.extreme_max))

    # Generate OPS kernels for each offloadable Iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])

        pre_time_loop.extend(pre_loop)
        self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
            MetaCall(ops_kernel, False)
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop,
            ops_partition, iet, *after_time_loop, ops_exit]

    return List(body=body)

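# Usage sketch (an assumption, not taken from the code above): in the legacy
# Devito releases that shipped this backend, the OPS specialization was reached
# by selecting the backend before importing devito; the exact selector name may
# differ between releases. The user-facing API (Grid, TimeFunction, Eq, Operator)
# is the standard one.
import os
os.environ['DEVITO_BACKEND'] = 'ops'  # assumed selector; must be set pre-import

from devito import Grid, TimeFunction, Eq, Operator

grid = Grid(shape=(64, 64))
u = TimeFunction(name='u', grid=grid)

# Building the Operator triggers _specialize_iet, which wraps the affine loop
# nests with ops_init/ops_partition/ops_exit and per-tree ops_par_loop calls
op = Operator(Eq(u.forward, u + 1))
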
def __init__(self, expressions, **kwargs):
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering and analysis
    expressions = [LoweredEq(e, subs=subs) for e in expressions]
    self.dtype = retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = retrieve_symbols(expressions)

    # Set the direction of time according to the given TimeAxis
    for time in [d for d in self.dimensions if d.is_Time]:
        if not time.is_Stepping:
            time.reverse = time_axis == Backward

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = self.input + self.dimensions

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters, self.dtype)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Translate into backend-specific representation (e.g., GPU, YASK)
    nodes = self._specialize(nodes, parameters)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                        for i in dle_state.elemental_functions]))
    parameters.extend([i.argument for i in self.dle_arguments])
    self.dimensions.extend([i.argument for i in self.dle_arguments
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Initialise the ArgumentEngine
    self.argument_engine = ArgumentEngine(clusters.ispace, parameters,
                                          self.dle_arguments)

    parameters = self.argument_engine.arguments

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())

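# Usage sketch (assumed legacy API, names may differ between releases): this
# constructor accepted `subs`, `dse`, `dle` and `time_axis` keyword arguments;
# `time_axis=Backward` reversed the non-stepping time Dimensions, e.g. for
# adjoint-style runs. The kwargs below mirror the kwargs.get() calls above.
from devito import Grid, TimeFunction, Eq, Operator, Backward

grid = Grid(shape=(32, 32))
v = TimeFunction(name='v', grid=grid)

op = Operator(Eq(v.backward, v + 1),
              name='AdjointKernel',   # becomes self.name
              time_axis=Backward,     # iterate the time loop in reverse
              dse='advanced',         # Devito Symbolic Engine mode
              dle='advanced')         # Devito Loop Engine mode
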
def _build(cls, expressions, **kwargs):
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Evaluable) for i in expressions):
        raise InvalidOperator("Only `devito.Evaluable` are allowed.")

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on iteration spaces and data dependences
    clusters = cls._lower_clusters(expressions, profiler, **kwargs)

    # Lower Clusters to a ScheduleTree
    stree = cls._lower_stree(clusters, **kwargs)

    # Lower ScheduleTree to an Iteration/Expression Tree
    iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **iet.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(byproduct.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(byproduct.includes)

    # Required for the jit-compilation
    op._compiler = kwargs['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(OrderedDict([(i, MetaCall(None, False))
                                       for i in profiler._ext_calls]))
    op._func_table.update(OrderedDict([(i.root.name, i) for i in byproduct.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc.
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = flatten(c.dimensions for c in clusters) + byproduct.dimensions
    op._dimensions = sorted(set(op._dimensions), key=attrgetter('name'))
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op

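# Usage sketch: this classmethod is what runs under the hood when an Operator is
# constructed from user-level equations with the current Devito API. A minimal,
# self-contained example; the lowering stages (_lower_exprs -> _lower_clusters ->
# _lower_stree -> _lower_iet) are invoked transparently.
from devito import Grid, TimeFunction, Eq, Operator

grid = Grid(shape=(64, 64))
u = TimeFunction(name='u', grid=grid, space_order=2)

op = Operator(Eq(u.forward, u + 0.1 * u.laplace))  # triggers _build
op.apply(time_M=10)                                # JIT-compiles and runs the C kernel
print(op)                                          # inspect the generated C code
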
def _specialize_iet(self, iet, **kwargs):
    """
    Transform the Iteration/Expression tree to offload the computation of one
    or more loop nests onto YASK. This involves calling the YASK compiler to
    generate YASK code. Such YASK code is then called from within the
    transformed Iteration/Expression tree.
    """
    mapper = {}
    self.yk_solns = OrderedDict()
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))
        context = contexts.fetch(dimensions, self._dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YaskSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, self._dtype)
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

            # Mark `funcall` as an external function call
            self._func_table[namespace['code-soln-run']] = MetaCall(None, False)

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            self.yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))

    iet = Transformer(mapper).visit(iet)

    if not self.yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the differences
    # in storage layout employed by Devito and YASK
    yk_var_objs = {i.name: YaskVarObject(i.name)
                   for i in self._input if i.from_YASK}
    yk_var_objs.update({i: YaskVarObject(i) for i in self._local_vars})
    iet = make_var_accesses(iet, yk_var_objs)

    # Finally optimize all non-yaskized loops
    iet = super(OperatorYASK, self)._specialize_iet(iet, **kwargs)

    return iet

def _build(cls, expressions, **kwargs):
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    name = kwargs.get("name", "Kernel")
    dse = kwargs.get("dse", configuration['dse'])

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions to internal expressions (e.g., attaching metadata)
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on their iteration space and data dependences.
    # Several optimizations are applied (fusion, lifting, flop reduction via DSE, ...)
    clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument the IET for C-level profiling
    iet = profiler.instrument(iet)

    # Wrap the IET with a Callable
    parameters = derive_parameters(iet, True)
    op = Callable(name, iet, 'int', parameters, ())

    # Lower IET to a Target-specific IET
    op, target_state = cls._specialize_iet(op, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **op.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(target_state.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(target_state.includes)

    # Required for the jit-compilation
    op._compiler = configuration['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(OrderedDict([(i, MetaCall(None, False))
                                       for i in profiler._ext_calls]))
    op._func_table.update(OrderedDict([(i.root.name, i) for i in target_state.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc.
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = filter_sorted(flatten(e.dimensions for e in expressions))
    op._dimensions.extend(target_state.dimensions)
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op

def funcs(self):
    retval = [MetaCall(v, True) for k, v in self.efuncs.items() if k != 'root']
    retval.extend([MetaCall(i, False) for i in self.ffuncs])
    return tuple(retval)

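# Assumed shape of MetaCall, inferred from its use above and in the other
# snippets: a (root, local) pair, where `root` is the IET Callable (or None for
# a purely external routine) and `local` says whether its definition must be
# generated alongside the Operator.
from collections import namedtuple

MetaCall = namedtuple('MetaCall', 'root local')

external = MetaCall(None, False)   # e.g. a profiler callback resolved at link time
print(external.local)              # -> False
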
def __init__(self, expressions, **kwargs):
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Translate into backend-specific representation (e.g., GPU, YASK)
    nodes = self._specialize_iet(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_args = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                        for i in dle_state.elemental_functions]))
    self.dimensions.extend([i.argument for i in self.dle_args
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())

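# Usage sketch (assumed legacy API): `subs` feeds the xreplace() call above, so
# symbolic grid spacings can be frozen to numbers at code-generation time, while
# `dse`/`dle` select the symbolic- and loop-level optimization modes. The exact
# substitution mechanism (e.g. grid.spacing_map) varied across releases.
from devito import Grid, TimeFunction, Eq, Operator

grid = Grid(shape=(32, 32), extent=(1., 1.))
p = TimeFunction(name='p', grid=grid, space_order=2)

op = Operator(Eq(p.forward, p + p.dx),
              subs=grid.spacing_map,  # e.g. {h_x: 0.032..., h_y: 0.032...}
              dse='aggressive',
              dle='advanced')
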
def _specialize_iet(self, iet, **kwargs):
    """
    Transform the Iteration/Expression tree to offload the computation of one
    or more loop nests onto YASK. This involves calling the YASK compiler to
    generate YASK code. Such YASK code is then called from within the
    transformed Iteration/Expression tree.
    """
    offloadable = find_offloadable_trees(iet)

    if len(offloadable.trees) == 0:
        self.yk_soln = YaskNullKernel()
        log("No offloadable trees found")
    else:
        context = contexts.fetch(offloadable.grid, offloadable.dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(iet, configuration))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            trees = offloadable.trees

            # Generate YASK grids and populate `yc_soln` with equations
            mapper = yaskizer(trees, yc_soln)
            local_grids = [i for i in mapper if i.is_Array]

            # Transform the IET
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], namespace['code-soln-name'])
            funcall = Element(c.Statement(ccode(funcall)))
            mapper = {trees[0].root: funcall}
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees
            iet = Transformer(mapper).visit(iet)

            # Mark `funcall` as an external function call
            self.func_table[namespace['code-soln-run']] = MetaCall(None, False)

            # JIT-compile the newly-created YASK kernel
            self.yk_soln = context.make_yk_solution(name, yc_soln, local_grids)

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_grids(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            self.yk_soln = YaskNullKernel()
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the differences
    # in storage layout employed by Devito and YASK
    iet = make_grid_accesses(iet)

    # Finally optimize all non-yaskized loops
    iet = super(Operator, self)._specialize_iet(iet, **kwargs)

    return iet

def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    # Detect all IET nodes accessing symbols that need to be declared
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        scopes.append((k, v))

    # Classify, and then schedule declarations to stack/heap
    for k, v in scopes:
        if k.is_Expression:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.params
        else:
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(k))

        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v[-1] if v else iet
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i._mem_external:
                        # Nothing to do; e.g., a user-provided Function
                        continue
                    elif i._mem_stack:
                        # On the stack
                        key = lambda i: not i.is_Parallel
                        site = filter_iterations(v, key=key, stop='asap') or [iet]
                        allocator.push_stack(site[-1], i)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = Transformer(mapper, nested=True).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet

def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    # First, schedule declarations for Expressions
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        else:
            scopes.append((k, v))
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.write is None or k.write._mem_external:
            # Nothing to do, e.g., a variable passed as kernel argument
            continue
        elif k.write._mem_stack:
            # On the stack
            key = lambda i: not i.is_Parallel
            site = filter_iterations(v, key=key, stop='asap') or [iet]
            allocator.push_stack(site[-1], k.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.write)

    # Then, schedule declarations for Callable arguments passed by
    # reference/pointer (as these may be modified internally by the Callable)
    scopes = [(k, v) for k, v in me.visit(iet).items() if k.is_Call]
    for k, v in scopes:
        site = v[-1] if v else iet
        for i in k.params:
            try:
                if i.is_LocalObject:
                    # On the stack
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i._mem_stack:
                        # On the stack
                        allocator.push_stack(site, i)
                    elif i._mem_heap:
                        # On the heap
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = NestedTransformer(mapper).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet

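# Sketch of the visitor/transformer idiom these passes rely on, using the
# devito.ir.iet machinery (module paths are assumptions and may vary across
# versions): find the nodes of interest, build a {node: replacement} mapper,
# then rebuild the tree with Transformer.
from devito import Grid, TimeFunction, Eq, Operator
from devito.ir.iet import FindNodes, Iteration, Transformer

grid = Grid(shape=(16, 16))
u = TimeFunction(name='u', grid=grid)
op = Operator(Eq(u.forward, u + 1))

# Inspect the Iteration nodes of the lowered IET
for i in FindNodes(Iteration).visit(op):
    print(i.dim, i.is_Parallel)

# A mapper entry of {node: None} drops that subtree, just as the
# `mapper.update(...)  # Drop trees` lines do in the snippets above
inner = FindNodes(Iteration).visit(op)[-1]
newtree = Transformer({inner: None}).visit(op.body)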