def optimize(clusters, dse_mode):
    """
    Run the Cluster optimization pipeline over a topologically-ordered
    sequence of Clusters and return the outcome wrapped in a ClusterGroup.

    The transformations are applied in this order:

        1. [cross-cluster] Fusion
        2. [intra-cluster] Flop-reduction passes via the DSE (``dse_mode``
           is forwarded to ``rewrite`` as its ``mode``)
        3. [cross-cluster] Lifting
        4. [cross-cluster] Fusion, again
        5. [cross-cluster] Arrays Elimination
        6. [cross-cluster] Scalarization
    """
    # Name factory for the temporaries; every call formats the next value
    # handed out by `counter` as "r%d"
    counter = generator()

    def template():
        return "r%d" % counter()

    # First round of fusion
    fused = fuse(clusters)

    # Flop reduction via the DSE (imported here, at the point of use)
    from devito.dse import rewrite
    rewritten = rewrite(fused, template, mode=dse_mode)

    # Lifting, followed by the fusion opportunities it may have created
    lifted = Lift().process(rewritten)
    refused = fuse(lifted)

    # After fusion, Arrays storing identical expressions may be dropped,
    # which shrinks the working set
    pruned = eliminate_arrays(refused, template)

    # Fusion may also have exposed scalarization opportunities
    scalarized = scalarize(pruned, template)

    return ClusterGroup(scalarized)
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more `devito.Eq` expressions.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict) and ``dse`` (default: ``configuration['dse']``).
    The full set of keyword arguments is also forwarded to
    ``_specialize_iet``.

    Raises InvalidOperator if any of the inputs is not a `devito.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `devito.Eq`
    if not all(isinstance(expr, Eq) for expr in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Table of references to local or external routines
    self._func_table = OrderedDict()

    # Scratch state (e.g., information about previous runs, autotuning
    # reports)
    self._state = {}

    # Lower the expressions: indexify, apply the substitution rules, then
    # specialize
    expressions = [indexify(expr) for expr in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Collect what is read, what is written, and the dimensions in use
    self.input = filter_sorted(flatten(expr.reads for expr in expressions))
    self.output = filter_sorted(flatten(expr.writes for expr in expressions))
    self.dimensions = filter_sorted(
        flatten(expr.dimensions for expr in expressions)
    )

    # Cluster the expressions by iteration space and data dependences,
    # then let the Devito Symbolic Engine (DSE) reduce the flop count
    clusters = clusterize(expressions)
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)
    self._dtype, self._dspace = clusters.meta

    # Clusters -> Schedule tree
    stree = st_build(clusters)

    # Schedule tree -> Iteration/Expression tree (IET), followed by the
    # IET-level passes
    iet = iet_build(stree)
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)
    iet = iet_insert_C_decls(iet)
    iet = self._build_casts(iet)

    # The parameters are the symbols not defined in the kernel itself
    parameters = self._build_parameters(iet)

    # Hand everything over to the parent constructor
    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def test_tti_clusters_to_graph():
    """
    Check that rewriting the main TTI cluster in 'basic' mode keeps a single
    cluster, preserves the number of tensors in its trace, and leaves no
    trace node without reads or readers.
    """
    solver = tti_operator()

    # Lower the forward operator's expressions, applying its substitutions
    raw_exprs = solver.op_fwd('centered').args['expressions']
    subs = solver.op_fwd('centered').args['subs']
    lowered = [LoweredEq(expr, subs=subs) for expr in raw_exprs]

    clusters = clusterize(lowered)
    assert len(clusters) == 3

    main_cluster = clusters[0]
    n_output_tensors = len(main_cluster.trace)

    rewritten = rewrite([main_cluster], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    n_tensors = sum(1 for node in graph.values() if node.is_tensor)
    assert n_tensors == n_output_tensors  # u and v
    assert all(node.reads or node.readby for node in graph.values())
def test_tti_clusters_to_graph():
    """
    Check that rewriting the main TTI cluster in 'basic' mode keeps a single
    cluster, preserves the number of tensors in its trace, and leaves no
    trace node without reads or readers.
    """
    solver = tti_operator()

    # Pull the expressions out of the forward operator's elemental functions
    found = FindNodes(Expression).visit(solver.op_fwd.elemental_functions)
    exprs = [node.expr for node in found]
    stencils = solver.op_fwd._retrieve_stencils(exprs)

    clusters = clusterize(exprs, stencils)
    assert len(clusters) == 3

    main_cluster = clusters[0]
    n_output_tensors = len(main_cluster.trace)

    rewritten = rewrite([main_cluster], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    n_tensors = sum(1 for node in graph.values() if node.is_tensor)
    assert n_tensors == n_output_tensors  # u and v
    assert all(node.reads or node.readby for node in graph.values())
def optimize(clusters, dse_mode):
    """
    Run the Cluster optimization pipeline over a topologically-ordered
    sequence of Clusters and return the outcome wrapped in a ClusterGroup.

    The transformations are applied in this order:

        1. [cross-cluster] Toposort, then Fusion
        2. [intra-cluster] Flop-reduction passes via the DSE (``dse_mode``
           is forwarded to ``rewrite`` as its ``mode``)
        3. [cross-cluster] Lifting
        4. [cross-cluster] Fusion, again
        5. [cross-cluster] Arrays Elimination
        6. [cross-cluster] Scalarization
        7. Analysis of the computational properties
    """
    # Name factory for the temporaries; every call formats the next value
    # handed out by `counter` as "r%d"
    counter = generator()

    def template():
        return "r%d" % counter()

    # Toposort first, so that more fusion opportunities are exposed
    ordered = Toposort().process(clusters)
    fused = fuse(ordered)

    # Flop reduction via the DSE (imported here, at the point of use)
    from devito.dse import rewrite
    rewritten = rewrite(fused, template, mode=dse_mode)

    # Lifting, followed by the fusion opportunities it may have created
    lifted = Lift().process(rewritten)
    refused = fuse(lifted)

    # After fusion, Arrays storing identical expressions may be dropped,
    # which shrinks the working set
    pruned = eliminate_arrays(refused, template)

    # Fusion may also have exposed scalarization opportunities
    scalarized = scalarize(pruned, template)

    # Work out the computational properties (e.g., parallelism) that the
    # later passes rely upon
    analyzed = analyze(scalarized)

    return ClusterGroup(analyzed)
def test_tti_clusters_to_graph():
    """
    Check that rewriting the main TTI cluster in 'basic' mode keeps a single
    cluster, preserves the number of tensors in its trace, and leaves no
    trace node without reads or readers.
    """
    solver = tti_operator()

    # Lower the forward operator's expressions by hand: indexify first,
    # then apply the operator's substitutions
    raw_exprs = solver.op_fwd('centered').args['expressions']
    subs = solver.op_fwd('centered').args['subs']
    indexed = [expr for expr in map(indexify, raw_exprs)]
    exprs = [expr.xreplace(subs) for expr in indexed]
    stencils = solver.op_fwd('centered')._retrieve_stencils(exprs)

    clusters = clusterize(exprs, stencils)
    assert len(clusters) == 3

    main_cluster = clusters[0]
    n_output_tensors = len(main_cluster.trace)

    rewritten = rewrite([main_cluster], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    n_tensors = sum(1 for node in graph.values() if node.is_tensor)
    assert n_tensors == n_output_tensors  # u and v
    assert all(node.reads or node.readby for node in graph.values())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more SymPy equations.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict), ``time_axis`` (default: ``Forward``), ``dse``
    (default: ``configuration['dse']``) and ``dle`` (default:
    ``configuration['dle']``).

    Raises InvalidOperator if any of the inputs is not a `sympy.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `sympy.Eq`
    if not all(isinstance(expr, sympy.Eq) for expr in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Attributes needed for compilation, starting from the class defaults
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._lib = None
    self._cfunction = None

    # The time direction follows the requested TimeAxis
    time.reverse = time_axis == Backward

    # Lower the expressions: indexify, then apply the substitutions
    expressions = [expr for expr in map(indexify, expressions)]
    expressions = [expr.xreplace(subs) for expr in expressions]

    # Analysis whose results are needed *also after* the construction
    self.dtype = self._retrieve_dtype(expressions)
    self.output = self._retrieve_output_fields(expressions)

    # Analysis whose results are needed only *for* the construction
    ordering = self._retrieve_loop_ordering(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Cluster the expressions by Stencil, then run the Devito Symbolic
    # Engine for symbolic optimization
    clusters = clusterize(expressions, stencils)
    clusters = rewrite(clusters, mode=dse)

    # Enclose the expressions in Iterations, following the dimensions
    nodes = self._schedule_expressions(clusters, ordering)

    # Add the C-level profiling infrastructure
    self.sections = OrderedDict()
    nodes = self._profile_sections(nodes)

    # Operator parameters: the kernel data, plus the dimensions without a
    # size (necessary for data casts)
    parameters = FindSymbols('kernel-data').visit(nodes)
    dimensions = FindSymbols('dimensions').visit(nodes)
    dimensions += [dim.parent for dim in dimensions if dim.is_Buffered]
    unsized = [dim for dim in dimensions if dim.size is None]
    parameters += filter_ordered(unsized, key=operator.attrgetter('name'))

    # Resolve the loop index variables and substitute them for the
    # dimensions; the mapping is filled in by the first visitor
    index_subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=index_subs)
    nodes = SubstituteExpression(subs=index_subs).visit(nodes)

    # Run the Devito Loop Engine for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))
    parameters += [arg.argument for arg in dle_state.arguments]
    self._includes.extend(list(dle_state.includes))

    # Add all of the required C declarations
    nodes, self.elemental_functions = self._insert_declarations(
        dle_state, parameters
    )

    # Keep the DLE output around, as it might be useful at execution time
    self._dle_state = dle_state

    # Hand everything over to the parent constructor
    super(OperatorBasic, self).__init__(self.name, nodes, 'int',
                                        parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more SymPy equations.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict), ``time_axis`` (default: ``Forward``), ``dse``
    (default: ``configuration['dse']``) and ``dle`` (default:
    ``configuration['dle']``).

    Raises InvalidOperator if any of the inputs is not a `sympy.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `sympy.Eq`
    if not all(isinstance(expr, sympy.Eq) for expr in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Table of references to local or external routines
    self.func_table = OrderedDict()

    # Lower the expressions, then analyze them
    expressions = [LoweredEq(expr, subs=subs) for expr in expressions]
    self.dtype = retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = retrieve_symbols(expressions)

    # The direction of the non-stepping time dimensions follows the
    # requested TimeAxis
    for dim in self.dimensions:
        if dim.is_Time and not dim.is_Stepping:
            dim.reverse = time_axis == Backward

    # Operator parameters (the dimensions are necessary for data casts)
    parameters = self.input + self.dimensions

    # Cluster the expressions by iteration space and data dependences,
    # then let the Devito Symbolic Engine (DSE) reduce the flop count
    clusters = clusterize(expressions)
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)

    # Clusters -> Iteration/Expression tree (IET)
    nodes = iet_build(clusters, self.dtype)

    # Add the C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Switch to the backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(nodes, parameters)

    # Run the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Reflect the DLE outcome into the Operator state
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    elemental_calls = OrderedDict(
        [(func.name, MetaCall(func, True))
         for func in dle_state.elemental_functions]
    )
    self.func_table.update(elemental_calls)
    parameters.extend([arg.argument for arg in self.dle_arguments])
    self.dimensions.extend([
        arg.argument for arg in self.dle_arguments
        if isinstance(arg.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Add the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Set up the ArgumentEngine; the final parameters are taken from it
    self.argument_engine = ArgumentEngine(
        clusters.ispace, parameters, self.dle_arguments
    )
    parameters = self.argument_engine.arguments

    # Hand everything over to the parent constructor
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more SymPy equations.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict), ``time_axis`` (default: ``Forward``), ``dse``
    (default: ``configuration['dse']``) and ``dle`` (default:
    ``configuration['dle']``).

    Raises InvalidOperator if any of the inputs is not a `sympy.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `sympy.Eq`
    if not all(isinstance(expr, sympy.Eq) for expr in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # The time direction follows the requested TimeAxis
    time.reverse = time_axis == Backward

    # Lower the expressions: indexify, then apply the substitutions
    expressions = [expr for expr in map(indexify, expressions)]
    expressions = [expr.xreplace(subs) for expr in expressions]

    # Analysis
    self.dtype = self._retrieve_dtype(expressions)
    symbols = self._retrieve_symbols(expressions)
    self.input, self.output, self.dimensions = symbols
    stencils = self._retrieve_stencils(expressions)

    # Operator parameters: the inputs, plus the dimensions without a size
    # (necessary for data casts)
    unsized = [dim for dim in self.dimensions if dim.size is None]
    parameters = self.input + unsized

    # Cluster the expressions by Stencil
    clusters = clusterize(expressions, stencils)

    # Run the Devito Symbolic Engine (DSE) for symbolic optimization
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)

    # Enclose the expressions in Iterations, following the dimensions
    nodes = self._schedule_expressions(clusters)

    # Add the C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Resolve the loop index variables and substitute them for the
    # dimensions; the mapping is filled in by the first visitor
    index_subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=index_subs)
    nodes = SubstituteExpression(subs=index_subs).visit(nodes)

    # Run the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Reflect the DLE outcome into the Operator state
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table = OrderedDict(
        [(func.name, FunMeta(func, True))
         for func in dle_state.elemental_functions]
    )
    parameters.extend([arg.argument for arg in self.dle_arguments])
    self.dimensions.extend([
        arg.argument for arg in self.dle_arguments
        if isinstance(arg.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Switch to the backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(dle_state.nodes, parameters)

    # Add all of the required C declarations
    nodes = self._insert_declarations(nodes)

    # Hand everything over to the parent constructor
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more SymPy equations.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict) and ``dse`` (default: ``configuration['dse']``).
    The full set of keyword arguments is also forwarded to
    ``_specialize_iet``.

    Raises InvalidOperator if any of the inputs is not a `sympy.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `sympy.Eq`
    if not all(isinstance(expr, sympy.Eq) for expr in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Table of references to local or external routines
    self.func_table = OrderedDict()

    # Lower the expressions: indexify, apply the substitution rules, then
    # specialize
    expressions = [indexify(expr) for expr in expressions]
    expressions = [expr.xreplace(subs) for expr in expressions]
    expressions = self._specialize_exprs(expressions)

    # Collect what is read, what is written, and the dimensions in use
    self.input = filter_sorted(flatten(expr.reads for expr in expressions))
    self.output = filter_sorted(flatten(expr.writes for expr in expressions))
    self.dimensions = filter_sorted(
        flatten(expr.dimensions for expr in expressions)
    )

    # Cluster the expressions by iteration space and data dependences,
    # then let the Devito Symbolic Engine (DSE) reduce the flop count
    clusters = clusterize(expressions)
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)
    self._dtype, self._dspace = clusters.meta

    # Clusters -> Schedule tree, then sectioning
    stree = schedule(clusters)
    stree = section(stree)

    # Sections -> Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Add the code for C-level performance profiling
    iet, self.profiler = self._profile_sections(iet)

    # Switch to the backend-specific representation
    iet = self._specialize_iet(iet, **kwargs)

    # Add the required symbol declarations
    iet = iet_insert_C_decls(iet, self.func_table)

    # Add data and pointer casts for array parameters and profiling structs
    iet = self._build_casts(iet)

    # The parameters are the symbols not defined in the kernel itself
    parameters = self._build_parameters(iet)

    # Hand everything over to the parent constructor
    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more SymPy equations.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict), ``dse`` (default: ``configuration['dse']``) and
    ``dle`` (default: ``configuration['dle']``).

    Raises InvalidOperator if any of the inputs is not a `sympy.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `sympy.Eq`
    if not all(isinstance(expr, sympy.Eq) for expr in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Table of references to local or external routines
    self.func_table = OrderedDict()

    # Lower the expressions: indexify, apply the substitution rules, then
    # specialize
    expressions = [indexify(expr) for expr in expressions]
    expressions = [expr.xreplace(subs) for expr in expressions]
    expressions = self._specialize_exprs(expressions)

    # Collect what is read, what is written, and the dimensions in use
    self.input = filter_sorted(flatten(expr.reads for expr in expressions))
    self.output = filter_sorted(flatten(expr.writes for expr in expressions))
    self.dimensions = filter_sorted(
        flatten(expr.dimensions for expr in expressions)
    )

    # Cluster the expressions by iteration space and data dependences,
    # then let the Devito Symbolic Engine (DSE) reduce the flop count
    clusters = clusterize(expressions)
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)
    self._dtype, self._dspace = clusters.meta

    # Clusters -> Iteration/Expression tree (IET)
    nodes = iet_build(clusters)

    # Add the C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Switch to the backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize_iet(nodes)

    # Run the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Reflect the DLE outcome into the Operator state
    self.dle_args = dle_state.arguments
    self.dle_flags = dle_state.flags
    elemental_calls = OrderedDict(
        [(func.name, MetaCall(func, True))
         for func in dle_state.elemental_functions]
    )
    self.func_table.update(elemental_calls)
    self.dimensions.extend([
        arg.argument for arg in self.dle_args
        if isinstance(arg.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Add the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Add data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # The parameters are the symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Hand everything over to the parent constructor
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the Operator from one or more `devito.Eq` expressions.

    Keyword arguments read here: ``name`` (default "Kernel"), ``subs``
    (default: empty dict) and ``dse`` (default: ``configuration['dse']``).
    The full set of keyword arguments is also forwarded to
    ``_specialize_iet``.

    Raises InvalidOperator if any of the inputs is not a `devito.Eq`.
    """
    expressions = as_tuple(expressions)

    # Reject anything that is not a `devito.Eq`
    if not all(isinstance(expr, Eq) for expr in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Private copies of the class-level defaults (headers, includes, globals)
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Compilation-related attributes
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Table of references to local or external routines
    self._func_table = OrderedDict()

    # Scratch state (e.g., information about previous runs, autotuning
    # reports)
    self._state = {}

    # Form and gather any implicit expressions that are required
    expressions = self._add_implicit(expressions)

    # Lower the expressions: indexify, apply the substitution rules, then
    # specialize
    expressions = [indexify(expr) for expr in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Collect the inputs (both reads and writes), the outputs (writes
    # only), and the dimensions in use
    self._input = filter_sorted(
        flatten(expr.reads + expr.writes for expr in expressions)
    )
    self._output = filter_sorted(
        flatten(expr.writes for expr in expressions)
    )
    self._dimensions = filter_sorted(
        flatten(expr.dimensions for expr in expressions)
    )

    # Cluster the expressions by iteration space and data dependences,
    # then let the Devito Symbolic Engine (DSE) reduce the flop count
    clusters = clusterize(expressions)
    dse_mode = set_dse_mode(dse)
    clusters = rewrite(clusters, mode=dse_mode)
    self._dtype, self._dspace = clusters.meta

    # Clusters -> Schedule tree
    stree = st_build(clusters)

    # Schedule tree -> Iteration/Expression tree (IET), followed by the
    # IET-level passes
    iet = iet_build(stree)
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)

    # All of the Operator parameters are derived from the IET
    parameters = derive_parameters(iet, True)

    # Finalization: add declarations, type casts, etc
    iet = self._finalize(iet, parameters)

    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())