def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
    """ Generate a header in every output file that includes custom types
        and constants.
        :param sdfg: The input SDFG.
        :param global_stream: Stream to write to (global).
    """
    #########################################################
    # Custom types
    datatypes = set()
    # Types of this SDFG
    for _, arrname, arr in sdfg.arrays_recursive():
        if arr is not None:
            datatypes.add(arr.dtype)

    # Emit unique definitions
    wrote_something = False
    for typ in datatypes:
        if hasattr(typ, 'emit_definition'):
            if not wrote_something:
                global_stream.write("", sdfg)
                wrote_something = True
            global_stream.write(typ.emit_definition(), sdfg)
    if wrote_something:
        global_stream.write("", sdfg)

    #########################################################
    # Write constants
    self.generate_constants(sdfg, global_stream)

    for sd in sdfg.all_sdfgs_recursive():
        global_stream.write(sd.global_code, sd)
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType) -> None:
    '''
    Helper function to change several storage and scheduling properties:
    - Makes non-view array lifetimes persistent, with some restrictions
      depending on the device
    - Resets nonatomic WCR edges on GPU
    :param sdfg: SDFG
    :param device: Device type
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        for aname, arr in nsdfg.arrays.items():
            if (arr.transient and not isinstance(arr, dt.View)
                    and not symbolic.issymbolic(arr.total_size)):
                if arr.storage != dtypes.StorageType.Register:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        for aname, arr in sdfg.arrays.items():
            # ... and size only depends on SDFG params
            if arr.transient and not isinstance(arr, dt.View):
                if arr.storage == dtypes.StorageType.GPU_Global:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
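# --- Usage sketch (illustrative addition, not from the original source) ---
# A minimal example of calling make_transients_persistent on an SDFG built
# with the DaCe Python frontend; the program below is hypothetical, and the
# call assumes a standard DaCe installation.
import dace

@dace.program
def offset_scale(a: dace.float64[20], b: dace.float64[20]):
    tmp = a + 1.0  # lowered to a transient array in the SDFG
    b[:] = tmp * 2.0

example_sdfg = offset_scale.to_sdfg()
make_transients_persistent(example_sdfg, dace.dtypes.DeviceType.CPU)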
def generate_fileheader(self,
                        sdfg: SDFG,
                        global_stream: CodeIOStream,
                        backend: str = 'frame'):
    """ Generate a header in every output file that includes custom types
        and constants.
        :param sdfg: The input SDFG.
        :param global_stream: Stream to write to (global).
        :param backend: The backend this header belongs to.
    """
    #########################################################
    # Environment-based includes
    for env in self.environments:
        if len(env.headers) > 0:
            global_stream.write(
                "\n".join("#include \"" + h + "\"" for h in env.headers),
                sdfg)

    #########################################################
    # Custom types
    datatypes = set()
    # Types of this SDFG
    for _, arrname, arr in sdfg.arrays_recursive():
        if arr is not None:
            datatypes.add(arr.dtype)

    # Emit unique definitions
    wrote_something = False
    for typ in datatypes:
        if hasattr(typ, 'emit_definition'):
            if not wrote_something:
                global_stream.write("", sdfg)
                wrote_something = True
            global_stream.write(typ.emit_definition(), sdfg)
    if wrote_something:
        global_stream.write("", sdfg)

    #########################################################
    # Write constants
    self.generate_constants(sdfg, global_stream)

    #########################################################
    # Write state struct
    structstr = '\n'.join(self.statestruct)
    global_stream.write(
        f'''
struct {sdfg.name}_t {{
    {structstr}
}};

''', sdfg)

    for sd in sdfg.all_sdfgs_recursive():
        if None in sd.global_code:
            global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd)
        if backend in sd.global_code:
            global_stream.write(codeblock_to_cpp(sd.global_code[backend]), sd)
def get_post_state(sdfg: SDFG, state: SDFGState):
    """ Returns the post state (the state that copies the data back from the
        FPGA device) if there is one. """
    for s in sdfg.all_sdfgs_recursive():
        for post_state in s.states():
            if 'post_' + str(state) == str(post_state):
                return post_state
    return None
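# --- Usage sketch (illustrative addition, not from the original source) ---
# After sdfg.apply_fpga_transformations(), DaCe wraps the device state with
# copy-in/copy-out states; per the naming check above, the copy-back state is
# named 'post_<state>'. The program below is hypothetical.
import dace

@dace.program
def scale(x: dace.float64[32]):
    x[:] = x * 3.0

fpga_sdfg = scale.to_sdfg()
fpga_sdfg.apply_fpga_transformations()
for st in fpga_sdfg.states():
    post = get_post_state(fpga_sdfg, st)
    if post is not None:
        print(f'{st} copies back via {post}')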
def set_fast_implementations(sdfg: SDFG,
                             device: dtypes.DeviceType,
                             blocklist: List[str] = None):
    """
    Set fast library node implementations for the given device.
    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param blocklist: List of disallowed implementations.
    :note: Operates in-place on the given SDFG.
    """
    if blocklist is None:
        implementation_prio = find_fast_library(device)
    else:
        implementation_prio = [
            i for i in find_fast_library(device) if i not in blocklist
        ]

    # Specialized nodes: pre-expand
    for current_sdfg in sdfg.all_sdfgs_recursive():
        for state in current_sdfg.nodes():
            for node in state.nodes():
                if isinstance(node, nodes.LibraryNode):
                    if (node.default_implementation == 'specialize'
                            and len(set(node.implementations)
                                    & set(implementation_prio)) == 0):
                        node.expand(current_sdfg, state)

    # General nodes
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.LibraryNode):
            for impl in implementation_prio:
                if impl in node.implementations:
                    if (isinstance(
                            node,
                            dace.libraries.standard.nodes.reduce.Reduce)
                            and node.implementation == 'CUDA (block)'):
                        continue
                    node.implementation = impl
                    break

    # Reduce nodes
    if device == dtypes.DeviceType.GPU:
        for node, state in sdfg.all_nodes_recursive():
            if isinstance(node, dace.nodes.LibraryNode):
                # Use CUB for device-level reductions
                if ('CUDA (device)' in node.implementations
                        and not is_devicelevel_gpu(state.parent, state, node)
                        and state.scope_dict()[node] is None):
                    node.implementation = 'CUDA (device)'
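# --- Usage sketch (illustrative addition, not from the original source) ---
# Matrix multiplication via `@` produces a MatMul library node; blocking the
# 'pure' fallback implementation steers selection toward vendor libraries.
# The program and the blocklist choice are hypothetical; this relies on the
# module-level helper `find_fast_library` used above.
import dace
from dace import dtypes

@dace.program
def matmul(a: dace.float64[32, 32], b: dace.float64[32, 32],
           c: dace.float64[32, 32]):
    c[:] = a @ b

mm_sdfg = matmul.to_sdfg()
set_fast_implementations(mm_sdfg, dtypes.DeviceType.CPU, blocklist=['pure'])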
def enumerate_matches(sdfg: SDFG,
                      pattern: gr.Graph,
                      node_match=type_or_class_match,
                      edge_match=None) -> Iterator[gr.SubgraphView]:
    """
    Returns a generator of subgraphs that match the given subgraph pattern.
    :param sdfg: The SDFG to search in.
    :param pattern: A subgraph to look for.
    :param node_match: An optional function to use for matching nodes.
    :param edge_match: An optional function to use for matching edges.
    :return: Yields SDFG subgraph view objects.
    """
    if len(pattern.nodes()) == 0:
        raise ValueError('Subgraph pattern cannot be empty')

    # Find if the subgraph is within states or SDFGs
    is_interstate = (isinstance(pattern.node(0), SDFGState)
                     or (isinstance(pattern.node(0), type)
                         and pattern.node(0) is SDFGState))

    # Collapse multigraphs into directed graphs
    pattern_digraph = collapse_multigraph_to_nx(pattern)

    # Find matches in all SDFGs and nested SDFGs
    for graph in sdfg.all_sdfgs_recursive():
        if is_interstate:
            graph_matcher = iso.DiGraphMatcher(
                collapse_multigraph_to_nx(graph),
                pattern_digraph,
                node_match=node_match,
                edge_match=edge_match)
            for subgraph in graph_matcher.subgraph_isomorphisms_iter():
                yield gr.SubgraphView(
                    graph, [graph.node(i) for i in subgraph.keys()])
        else:
            for state in graph.nodes():
                graph_matcher = iso.DiGraphMatcher(
                    collapse_multigraph_to_nx(state),
                    pattern_digraph,
                    node_match=node_match,
                    edge_match=edge_match)
                for subgraph in graph_matcher.subgraph_isomorphisms_iter():
                    yield gr.SubgraphView(
                        state, [state.node(i) for i in subgraph.keys()])
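# --- Usage sketch (illustrative addition, not from the original source) ---
# Builds a two-node path pattern and enumerates its matches. Assumes
# `node_path_graph` from dace.sdfg.utils, which chains the given node types
# into a simple path graph; the program below is hypothetical.
import dace
from dace.sdfg import utils as sdutil

@dace.program
def mapped(a: dace.float64[10], b: dace.float64[10]):
    for i in dace.map[0:10]:
        b[i] = a[i] + 1.0

map_sdfg = mapped.to_sdfg()
# Match an access node feeding a map entry
pattern = sdutil.node_path_graph(dace.nodes.AccessNode, dace.nodes.MapEntry)
for subgraph in enumerate_matches(map_sdfg, pattern):
    print('Match found in', subgraph.graph)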
def apply_pass(
    self, sdfg: SDFG, pipeline_results: Dict[str, Any]
) -> Optional[Dict[SDFGState, Optional[Any]]]:
    """
    Applies the pass to states of the given SDFG by calling ``apply`` on
    each state.

    :param sdfg: The SDFG to apply the pass to.
    :param pipeline_results: If in the context of a ``Pipeline``, a
                             dictionary that is populated with prior Pass
                             results as ``{Pass subclass name: returned
                             object from pass}``. If not run in a pipeline,
                             an empty dictionary is expected.
    :return: A dictionary of ``{state: return value}`` for visited states
             with a non-None return value, or None if nothing was returned.
    """
    result = {}
    for sd in sdfg.all_sdfgs_recursive():
        for state in sd.nodes():
            retval = self.apply(state, pipeline_results)
            if retval is not None:
                result[state] = retval

    if not result:
        return None
    return result
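# --- Usage sketch (illustrative addition, not from the original source) ---
# A minimal pass built on the method above, which dispatches to `apply` per
# state. The base class `StatePass` (and its `modifies`/`should_reapply`
# hooks) follows DaCe's pass-pipeline API; the subclass itself is
# hypothetical.
from typing import Any, Dict, Optional
from dace.transformation import pass_pipeline as ppl

class CountAccessNodes(ppl.StatePass):
    """Hypothetical pass: counts access nodes in each state."""
    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.Nothing  # Analysis-only, changes nothing

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def apply(self, state: SDFGState,
              pipeline_results: Dict[str, Any]) -> Optional[int]:
        count = len(list(state.data_nodes()))
        return count if count > 0 else None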
def determine_allocation_lifetime(self, top_sdfg: SDFG):
    """ Determines where (at which scope/state/SDFG) each data descriptor
        will be allocated/deallocated.
        :param top_sdfg: The top-level SDFG to determine for.
    """
    # Gather shared transients
    shared_transients = {}
    for sdfg in top_sdfg.all_sdfgs_recursive():
        shared_transients[sdfg.sdfg_id] = sdfg.shared_transients(
            check_toplevel=False)

    for sdfg, name, desc in top_sdfg.arrays_recursive():
        if not desc.transient:
            continue

        # NOTE: In the code below we infer where a transient should be
        # declared, allocated, and deallocated. The information is stored
        # in the `to_allocate` dictionary. The key of each entry is the
        # scope where one of the above actions must occur, while the value
        # is a tuple containing the following information:
        # 1. The SDFG object that contains the transient.
        # 2. The State id where the action should (approx.) take place.
        # 3. The Access Node id of the transient in the above State.
        # 4. True if declaration should take place, otherwise False.
        # 5. True if allocation should take place, otherwise False.
        # 6. True if deallocation should take place, otherwise False.

        # Possibly confusing control flow below finds the first state
        # and node of the data descriptor, or continues the
        # arrays_recursive() loop
        first_state_instance: int = None
        first_node_instance: nodes.AccessNode = None
        last_state_instance: int = None
        last_node_instance: nodes.AccessNode = None
        first = True
        for state in sdfg.topological_sort():
            id = sdfg.nodes().index(state)
            for node in state.data_nodes():
                if node.data == name:
                    if first:
                        first_state_instance = id
                        first_node_instance = node
                        first = False
                    last_state_instance = id
                    last_node_instance = node
                    # break
            else:
                continue
            break

        # Cases
        if desc.lifetime is dtypes.AllocationLifetime.Persistent:
            # Persistent memory is allocated in initialization code and
            # exists in the library state structure

            # If unused, skip
            if first_node_instance is None:
                continue

            definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
            self.statestruct.append(definition)

            self.to_allocate[sdfg].append(
                (sdfg, first_state_instance, first_node_instance, True, True,
                 True))
            continue
        elif desc.lifetime is dtypes.AllocationLifetime.Global:
            # Global memory is allocated in the beginning of the program and
            # exists in the library state structure (to be passed along
            # to the right SDFG)

            # If unused, skip
            if first_node_instance is None:
                continue

            definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
            self.statestruct.append(definition)

            # self.to_allocate[top_sdfg].append(
            #     (sdfg.sdfg_id, sdfg.node_id(state), node))
            self.to_allocate[top_sdfg].append(
                (sdfg, first_state_instance, first_node_instance, True, True,
                 True))
            continue

        # The rest of the cases change the starting scope we attempt to
        # allocate from, since the descriptors may only be allocated higher
        # in the hierarchy (e.g., in the case of GPU global memory inside
        # a kernel).
        alloc_scope: Union[nodes.EntryNode, SDFGState, SDFG] = None
        alloc_state: SDFGState = None
        access_node: nodes.AccessNode = None
        if (name in shared_transients[sdfg.sdfg_id]
                or desc.lifetime is dtypes.AllocationLifetime.SDFG):
            # SDFG memory and shared transients are allocated in the
            # beginning of their SDFG
            alloc_scope = sdfg
            if first_state_instance is not None:
                alloc_state = sdfg.nodes()[first_state_instance]
            # If unused, skip
            if first_node_instance is None:
                continue
        elif desc.lifetime is dtypes.AllocationLifetime.State:
            # State memory is either allocated in the beginning of the
            # containing state or the SDFG (if used in more than one state)
            curstate: SDFGState = None
            multistate = False
            for state in sdfg.nodes():
                if any(n.data == name for n in state.data_nodes()):
                    if curstate is not None:
                        multistate = True
                        break
                    curstate = state
            if multistate:
                alloc_scope = sdfg
            else:
                alloc_scope = curstate
                alloc_state = curstate
        elif desc.lifetime is dtypes.AllocationLifetime.Scope:
            # Scope memory (default) is either allocated in the innermost
            # scope (e.g., Map, Consume) it is used in (i.e., greatest
            # common denominator), or in the SDFG if used in multiple states
            curscope: Union[nodes.EntryNode, SDFGState] = None
            curstate: SDFGState = None
            multistate = False

            # Does the array appear in inter-state edges?
            for isedge in sdfg.edges():
                if name in isedge.data.free_symbols:
                    multistate = True

            for state in sdfg.nodes():
                if multistate:
                    break
                sdict = state.scope_dict()
                for node in state.nodes():
                    if not isinstance(node, nodes.AccessNode):
                        continue
                    if node.data != name:
                        continue

                    # If already found in another state, set scope to SDFG
                    if curstate is not None and curstate != state:
                        multistate = True
                        break
                    curstate = state

                    # Current scope (or state object if top-level)
                    scope = sdict[node] or state
                    if curscope is None:
                        curscope = scope
                        continue

                    # States always win
                    if isinstance(scope, SDFGState):
                        curscope = scope
                        continue

                    # Lower/Higher/Disjoint scopes: find common denominator
                    if isinstance(curscope, SDFGState):
                        if scope in curscope.nodes():
                            continue
                    curscope = sdscope.common_parent_scope(
                        sdict, scope, curscope)

                if multistate:
                    break

            if multistate:
                alloc_scope = sdfg
            else:
                alloc_scope = curscope
                alloc_state = curstate
        else:
            raise TypeError('Unrecognized allocation lifetime "%s"' %
                            desc.lifetime)

        if alloc_scope is None:  # No allocation necessary
            continue

        # If descriptor cannot be allocated in this scope, traverse up the
        # scope tree until it is possible
        cursdfg = sdfg
        curstate = alloc_state
        curscope = alloc_scope
        while not self._can_allocate(cursdfg, curstate, desc, curscope):
            if curscope is None:
                break
            if isinstance(curscope, nodes.EntryNode):
                # Go one scope up
                curscope = curstate.entry_node(curscope)
                if curscope is None:
                    curscope = curstate
            elif isinstance(curscope, (SDFGState, SDFG)):
                cursdfg: SDFG = (curscope if isinstance(curscope, SDFG) else
                                 curscope.parent)
                # Go one SDFG up
                if cursdfg.parent_nsdfg_node is None:
                    curscope = None
                    curstate = None
                else:
                    curstate = cursdfg.parent
                    curscope = curstate.entry_node(cursdfg.parent_nsdfg_node)
            else:
                raise TypeError

        if curscope is None:
            curscope = top_sdfg

        # Check if Array/View is dependent on non-free SDFG symbols
        # NOTE: Tuple is (SDFG, State, Node, declare, allocate, deallocate)
        fsymbols = sdfg.free_symbols.union(sdfg.constants.keys())
        if (not isinstance(curscope, nodes.EntryNode)
                and utils.is_nonfree_sym_dependent(
                    first_node_instance, desc, alloc_state, fsymbols)):
            # Declare in current (SDFG) scope
            self.to_allocate[curscope].append(
                (sdfg, first_state_instance, first_node_instance, True,
                 False, False))
            # Allocate in first state, deallocate in last state
            if first_state_instance != last_state_instance:
                curscope = sdfg.nodes()[first_state_instance]
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, False,
                     True, False))
                curscope = sdfg.nodes()[last_state_instance]
                self.to_allocate[curscope].append(
                    (sdfg, last_state_instance, last_node_instance, False,
                     False, True))
            else:
                curscope = sdfg.nodes()[first_state_instance]
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, False,
                     True, True))
        else:
            self.to_allocate[curscope].append(
                (sdfg, first_state_instance, first_node_instance, True, True,
                 True))
def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream,
                    callsite_stream: CodeIOStream):
    """ Generate the footer of the frame-code. Code exists in a separate
        function for overriding purposes.
        :param sdfg: The input SDFG.
        :param global_stream: Stream to write to (global).
        :param callsite_stream: Stream to write to (at call site).
    """
    import dace.library
    fname = sdfg.name
    params = sdfg.signature()
    paramnames = sdfg.signature(False, for_call=True)
    initparams = sdfg.signature(with_arrays=False)
    initparamnames = sdfg.signature(False, for_call=True, with_arrays=False)

    # Invoke all instrumentation providers
    for instr in self._dispatcher.instrumentation.values():
        if instr is not None:
            instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

    # Instrumentation saving
    if (config.Config.get_bool('instrumentation', 'report_each_invocation')
            and len(self._dispatcher.instrumentation) > 1):
        callsite_stream.write(
            '''__state->report.save("{path}/perf", __HASH_{name});'''.format(
                path=sdfg.build_folder.replace('\\', '/'), name=sdfg.name),
            sdfg)

    # Write closing brace of program
    callsite_stream.write('}', sdfg)

    # Write awkward footer to avoid 'extern "C"' issues
    params_comma = (', ' + params) if params else ''
    initparams_comma = (', ' + initparams) if initparams else ''
    paramnames_comma = (', ' + paramnames) if paramnames else ''
    initparamnames_comma = (', ' + initparamnames) if initparamnames else ''
    callsite_stream.write(
        f'''
DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma})
{{
    __program_{fname}_internal(__state{paramnames_comma});
}}''', sdfg)

    for target in self._dispatcher.used_targets:
        if target.has_initializer:
            callsite_stream.write(
                'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' %
                (target.target_name, sdfg.name, initparams_comma), sdfg)
        if target.has_finalizer:
            callsite_stream.write(
                'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' %
                (target.target_name, sdfg.name), sdfg)

    callsite_stream.write(
        f"""
DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams})
{{
    int __result = 0;
    {sdfg.name}_t *__state = new {sdfg.name}_t;

""", sdfg)

    for target in self._dispatcher.used_targets:
        if target.has_initializer:
            callsite_stream.write(
                '__result |= __dace_init_%s(__state%s);' %
                (target.target_name, initparamnames_comma), sdfg)
    for env in self.environments:
        init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg)
        if init_code:
            callsite_stream.write("{ // Environment: " + env.__name__, sdfg)
            callsite_stream.write(init_code)
            callsite_stream.write("}")

    for sd in sdfg.all_sdfgs_recursive():
        if None in sd.init_code:
            callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
        callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

    callsite_stream.write(self._initcode.getvalue(), sdfg)

    callsite_stream.write(
        f"""
    if (__result) {{
        delete __state;
        return nullptr;
    }}
    return __state;
}}

DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state)
{{
""", sdfg)

    # Instrumentation saving
    if (not config.Config.get_bool('instrumentation',
                                   'report_each_invocation')
            and len(self._dispatcher.instrumentation) > 1):
        callsite_stream.write(
            '__state->report.save("%s/perf", __HASH_%s);' %
            (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg)

    callsite_stream.write(self._exitcode.getvalue(), sdfg)

    for sd in sdfg.all_sdfgs_recursive():
        if None in sd.exit_code:
            callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
        callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

    for target in self._dispatcher.used_targets:
        if target.has_finalizer:
            callsite_stream.write(
                '__dace_exit_%s(__state);' % target.target_name, sdfg)
    for env in reversed(self.environments):
        finalize_code = _get_or_eval_sdfg_first_arg(env.finalize_code, sdfg)
        if finalize_code:
            callsite_stream.write("{ // Environment: " + env.__name__, sdfg)
            callsite_stream.write(finalize_code)
            callsite_stream.write("}")

    callsite_stream.write('delete __state;\n}\n', sdfg)
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to
    decent performance. In particular, performs the following:
        * Strict transformations
        * Strict auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    # Strict transformations and loop parallelization
    transformed = True
    while transformed:
        sdfg.apply_strict_transformations(validate=False,
                                          validate_all=validate_all)
        xfh.split_interstate_edges(sdfg)
        # Try to parallelize loops
        l2ms = sdfg.apply_transformations_repeated(LoopToMap,
                                                   strict=True,
                                                   validate=False,
                                                   validate_all=validate_all)
        transformed = l2ms > 0

    # Map fusion
    greedy_fuse(sdfg, validate_all)

    if device == dtypes.DeviceType.FPGA:
        # Apply FPGA transformations
        sdfg.apply_fpga_transformations()
        fpga_aopt.fpga_global_to_local(sdfg)
        fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        strict=True,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry):
            node.map.collapse = len(node.map.range)

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections
    # TODO(later): Set on a per-SDFG basis
    config.Config.set('compiler', 'cpu', 'openmp_sections', value=False)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
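# --- Usage sketch (illustrative addition, not from the original source) ---
# Optimizing a small SAXPY-style program for CPU; the program below is
# hypothetical and assumes a standard DaCe installation.
import dace
from dace import dtypes

@dace.program
def saxpy(a: dace.float64, x: dace.float64[1024], y: dace.float64[1024]):
    y[:] = a * x + y

saxpy_sdfg = saxpy.to_sdfg()
saxpy_sdfg = auto_optimize(saxpy_sdfg, dtypes.DeviceType.CPU)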
def match_patterns(sdfg: SDFG,
                   patterns: Union[Type[xf.Transformation],
                                   List[Type[xf.Transformation]]],
                   node_match: Callable[[Any, Any], bool] = type_match,
                   edge_match: Optional[Callable[[Any, Any], bool]] = None,
                   strict: bool = False,
                   metadata: Optional[PatternMetadataType] = None,
                   states: Optional[List[SDFGState]] = None):
    """ Returns a generator of Transformations that match the input SDFG.
        Ordered by SDFG ID.
        :param sdfg: The SDFG to match in.
        :param patterns: Transformation type (or list thereof) to match.
        :param node_match: Function for checking whether two nodes match.
        :param edge_match: Function for checking whether two edges match.
        :param strict: Only match transformation if strict (i.e., can only
                       improve the performance/reduce complexity of the SDFG).
        :param metadata: Transformation metadata that can be reused.
        :param states: If given, only tries to match single-state
                       transformations on this list.
        :return: A generator of Transformation objects that match.
    """
    if isinstance(patterns, type):
        patterns = [patterns]

    # Collect transformation metadata
    if metadata is not None:
        # Transformation metadata can be evaluated once per apply loop
        interstate_transformations, singlestate_transformations = metadata
    else:
        # Otherwise, precompute all transformation data once
        (interstate_transformations,
         singlestate_transformations) = get_transformation_metadata(patterns)

    # Collect SDFG and nested SDFGs
    sdfgs = sdfg.all_sdfgs_recursive()

    # Try to find transformations on each SDFG
    for tsdfg in sdfgs:
        ###################################
        # Match inter-state transformations
        if len(interstate_transformations) > 0:
            # Collapse multigraph into directed graph in order to use VF2
            digraph = collapse_multigraph_to_nx(tsdfg)

            for xform, expr_idx, nxpattern, matcher in \
                    interstate_transformations:
                for subgraph in matcher(digraph, nxpattern, node_match,
                                        edge_match):
                    match = _try_to_match_transformation(
                        tsdfg, digraph, subgraph, tsdfg, xform, expr_idx,
                        nxpattern, -1, strict)
                    if match is not None:
                        yield match

        ####################################
        # Match single-state transformations
        if len(singlestate_transformations) == 0:
            continue
        for state_id, state in enumerate(tsdfg.nodes()):
            if states is not None and state not in states:
                continue

            # Collapse multigraph into directed graph in order to use VF2
            digraph = collapse_multigraph_to_nx(state)

            for xform, expr_idx, nxpattern, matcher in \
                    singlestate_transformations:
                for subgraph in matcher(digraph, nxpattern, node_match,
                                        edge_match):
                    match = _try_to_match_transformation(
                        state, digraph, subgraph, tsdfg, xform, expr_idx,
                        nxpattern, state_id, strict)
                    if match is not None:
                        yield match
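# --- Usage sketch (illustrative addition, not from the original source) ---
# Enumerating MapCollapse matches on nested maps without applying them; the
# program below is hypothetical.
import dace
from dace.transformation.dataflow import MapCollapse

@dace.program
def nested(a: dace.float64[10, 10]):
    for i in dace.map[0:10]:
        for j in dace.map[0:10]:
            a[i, j] = a[i, j] + 1.0

nested_sdfg = nested.to_sdfg()
for match in match_patterns(nested_sdfg, MapCollapse):
    print('Found match:', type(match).__name__)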
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to
    decent performance. In particular, performs the following:
        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to
                    int/float values.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination,
                                        validate=validate,
                                        validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated(
            (LoopToMap, RefineNestedAccess),
            validate=False,
            validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)

    # Apply GPU transformations and set library node implementations
    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # Fuse subgraphs greedily
    sdfg.simplify()
    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # Fuse stencils greedily
    greedy_fuse(sdfg,
                device=device,
                validate_all=validate_all,
                recursive=False,
                stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # Apply FPGA transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        # Set OMP collapse property to map length
        if isinstance(node, nodes.MapEntry):
            # FORNOW: Leave out
            # node.map.collapse = len(node.map.range)
            pass

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)
    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols
        known_symbols = {}
        for (s, v) in symbols.items():
            if s in sdfg.free_symbols:
                if isinstance(v, (int, float)):
                    known_symbols[s] = v
                if isinstance(v, sympy.core.numbers.Integer):
                    try:
                        known_symbols[s] = int(v)
                    except TypeError:
                        pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    '''
    # Fix storage and allocation properties, e.g., for benchmarking purposes
    # FORNOW: Leave out
    make_transients_persistent(sdfg, device)
    '''

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
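# --- Usage sketch (illustrative addition, not from the original source) ---
# Optimizing a symbolically-sized program and specializing the symbol `N`;
# the program below is hypothetical.
import dace
from dace import dtypes

N = dace.symbol('N')

@dace.program
def scale_sym(x: dace.float64[N]):
    for i in dace.map[0:N]:
        x[i] = x[i] * 2.0

sym_sdfg = scale_sym.to_sdfg()
sym_sdfg = auto_optimize(sym_sdfg, dtypes.DeviceType.CPU,
                         symbols={'N': 1024})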
def generate_footer(self, sdfg: SDFG, used_environments: Set[str],
                    global_stream: CodeIOStream,
                    callsite_stream: CodeIOStream):
    """ Generate the footer of the frame-code. Code exists in a separate
        function for overriding purposes.
        :param sdfg: The input SDFG.
        :param used_environments: Names of the environments used in this SDFG.
        :param global_stream: Stream to write to (global).
        :param callsite_stream: Stream to write to (at call site).
    """
    fname = sdfg.name
    params = sdfg.signature()
    paramnames = sdfg.signature(False, for_call=True)
    environments = [
        dace.library.get_environment(env_name)
        for env_name in used_environments
    ]

    # Invoke all instrumentation providers
    for instr in self._dispatcher.instrumentation.values():
        if instr is not None:
            instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

    # Instrumentation saving
    if len(self._dispatcher.instrumentation) > 1:
        callsite_stream.write(
            'dace::perf::report.save("%s/perf");' %
            sdfg.build_folder.replace('\\', '/'), sdfg)

    # Write closing brace of program
    callsite_stream.write('}', sdfg)

    # Write awkward footer to avoid 'extern "C"' issues
    callsite_stream.write(
        """
DACE_EXPORTED void __program_%s(%s)
{
    __program_%s_internal(%s);
}
""" % (fname, params, fname, paramnames), sdfg)

    for target in self._dispatcher.used_targets:
        if target.has_initializer:
            callsite_stream.write(
                'DACE_EXPORTED int __dace_init_%s(%s);\n' %
                (target.target_name, params), sdfg)
        if target.has_finalizer:
            callsite_stream.write(
                'DACE_EXPORTED int __dace_exit_%s(%s);\n' %
                (target.target_name, params), sdfg)

    callsite_stream.write(
        """
DACE_EXPORTED int __dace_init_%s(%s)
{
    int __result = 0;
""" % (sdfg.name, params), sdfg)

    for target in self._dispatcher.used_targets:
        if target.has_initializer:
            callsite_stream.write(
                '__result |= __dace_init_%s(%s);' %
                (target.target_name, paramnames), sdfg)
    for env in environments:
        if env.init_code:
            callsite_stream.write("{ // Environment: " + env.__name__, sdfg)
            callsite_stream.write(env.init_code)
            callsite_stream.write("}")

    for sd in sdfg.all_sdfgs_recursive():
        if None in sd.init_code:
            callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
        callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

    callsite_stream.write(self._initcode.getvalue(), sdfg)

    callsite_stream.write(
        """
    return __result;
}

DACE_EXPORTED void __dace_exit_%s(%s)
{
""" % (sdfg.name, params), sdfg)

    callsite_stream.write(self._exitcode.getvalue(), sdfg)

    for sd in sdfg.all_sdfgs_recursive():
        if None in sd.exit_code:
            callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
        callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

    for target in self._dispatcher.used_targets:
        if target.has_finalizer:
            callsite_stream.write(
                '__dace_exit_%s(%s);' % (target.target_name, paramnames),
                sdfg)
    for env in environments:
        if env.finalize_code:
            callsite_stream.write("{ // Environment: " + env.__name__, sdfg)
            callsite_stream.write(env.finalize_code)
            callsite_stream.write("}")

    callsite_stream.write('}\n', sdfg)
def make_transients_persistent(sdfg: SDFG,
                               device: dtypes.DeviceType,
                               toplevel_only: bool = True) -> None:
    '''
    Helper function to change several storage and scheduling properties:
    - Makes non-view array lifetimes persistent, with some restrictions
      depending on the device
    - Resets nonatomic WCR edges on GPU

    The only arrays that are made persistent by default are ones that do not
    exist inside a scope (and thus may be allocated multiple times), and
    whose symbols are always given as parameters to the SDFG (so that they
    can be allocated in a persistent manner).

    :param sdfg: SDFG
    :param device: Device type
    :param toplevel_only: If True, only converts access nodes that do not
                          appear in any scope.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        fsyms: Set[str] = nsdfg.free_symbols
        persistent: Set[str] = set()
        not_persistent: Set[str] = set()

        for state in nsdfg.nodes():
            for dnode in state.data_nodes():
                if dnode.data in not_persistent:
                    continue
                desc = dnode.desc(nsdfg)

                # Only convert arrays and scalars that are not registers
                if not desc.transient or type(desc) not in {
                        dt.Array, dt.Scalar
                }:
                    not_persistent.add(dnode.data)
                    continue
                if desc.storage == dtypes.StorageType.Register:
                    not_persistent.add(dnode.data)
                    continue

                # Only convert arrays where the size depends on SDFG
                # parameters
                try:
                    if set(map(str, desc.total_size.free_symbols)) - fsyms:
                        not_persistent.add(dnode.data)
                        continue
                except AttributeError:
                    # total_size is an integer / has no free symbols
                    pass

                # Only convert arrays with top-level access nodes
                if xfh.get_parent_map(state, dnode) is not None:
                    if toplevel_only:
                        not_persistent.add(dnode.data)
                        continue
                    elif desc.lifetime == dtypes.AllocationLifetime.Scope:
                        not_persistent.add(dnode.data)
                        continue

                persistent.add(dnode.data)

        for aname in (persistent - not_persistent):
            nsdfg.arrays[aname].lifetime = \
                dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
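# --- Usage sketch (illustrative addition, not from the original source) ---
# Making top-level GPU transients persistent after GPU transformation. The
# program is hypothetical; applying the transformation does not require a
# CUDA toolchain, only compiling the resulting SDFG does.
import dace
from dace import dtypes

@dace.program
def offset(a: dace.float64[64], b: dace.float64[64]):
    tmp = a + 1.0  # transient candidate for persistent allocation
    b[:] = tmp * 2.0

gpu_sdfg = offset.to_sdfg()
gpu_sdfg.apply_gpu_transformations()
make_transients_persistent(gpu_sdfg, dtypes.DeviceType.GPU,
                           toplevel_only=True)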