def set_fast_implementations(sdfg: SDFG, device: dtypes.DeviceType, blocklist: List[str] = None):
    """
    Pick the preferred ("fast") implementation of every library node for the
    given device.

    The work is done in three passes: library nodes that default to
    ``'specialize'`` and offer none of the preferred implementations are
    expanded immediately; all other library nodes get the first preferred
    implementation they support; and, on GPU, top-level library nodes that
    offer ``'CUDA (device)'`` are switched to it.

    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param blocklist: Implementations that must not be selected.
    :note: Operates in-place on the given SDFG.
    """
    implementation_prio = find_fast_library(device)
    if blocklist is not None:
        implementation_prio = [impl for impl in implementation_prio if impl not in blocklist]

    # Pass 1 - specialized nodes: pre-expand
    for nested in sdfg.all_sdfgs_recursive():
        for state in nested.nodes():
            for node in state.nodes():
                if not isinstance(node, nodes.LibraryNode):
                    continue
                if node.default_implementation != 'specialize':
                    continue
                # Expand only if no preferred implementation is on offer
                if set(node.implementations).isdisjoint(implementation_prio):
                    node.expand(nested, state)

    # Pass 2 - general nodes: first preferred implementation that is available
    for node, _ in sdfg.all_nodes_recursive():
        if not isinstance(node, nodes.LibraryNode):
            continue
        chosen = next((impl for impl in implementation_prio if impl in node.implementations), None)
        if chosen is None:
            continue
        # Reductions already set to the block-level CUDA implementation are
        # left untouched
        if (isinstance(node, dace.libraries.standard.nodes.reduce.Reduce)
                and node.implementation == 'CUDA (block)'):
            continue
        node.implementation = chosen

    # Pass 3 - reduce nodes (GPU only)
    if device != dtypes.DeviceType.GPU:
        return
    for node, state in sdfg.all_nodes_recursive():
        if not isinstance(node, dace.nodes.LibraryNode):
            continue
        # Use CUB for device-level reductions
        if ('CUDA (device)' in node.implementations
                and not is_devicelevel_gpu(state.parent, state, node)
                and state.scope_dict()[node] is None):
            node.implementation = 'CUDA (device)'
def apply_pass(
    self,
    sdfg: SDFG,
    pipeline_results: Dict[str, Any],
) -> Optional[Dict[nodes.EntryNode, Optional[Any]]]:
    """
    Runs ``self.apply`` on every scope entry node found by recursively
    traversing the given SDFG.

    :param sdfg: The SDFG to apply the pass to.
    :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                             results as ``{Pass subclass name: returned object from pass}``. If not run in a
                             pipeline, an empty dictionary is expected.
    :return: A dictionary of ``{entry node: return value}`` containing only the scopes for which ``apply``
             returned something other than None, or None if there is no such scope.
    """
    collected = {}
    for candidate, state in sdfg.all_nodes_recursive():
        if isinstance(candidate, nodes.EntryNode):
            outcome = self.apply(candidate, state, pipeline_results)
            if outcome is not None:
                collected[candidate] = outcome

    # An empty result is reported as None
    return collected or None
def set_fast_implementations(sdfg: SDFG, device: dtypes.DeviceType, blocklist: List[str] = None):
    """
    Pick the preferred ("fast") implementation of every library node for the
    given device, warning about nodes that support none of them.

    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param blocklist: Implementations that must not be selected.
    :note: Operates in-place on the given SDFG.
    """
    implementation_prio = find_fast_library(device)
    if blocklist is not None:
        implementation_prio = [impl for impl in implementation_prio if impl not in blocklist]

    for node, _ in sdfg.all_nodes_recursive():
        if not isinstance(node, nodes.LibraryNode):
            continue

        # First preferred implementation supported by this node, if any
        chosen = next((impl for impl in implementation_prio if impl in node.implementations), None)
        if chosen is None:
            # The node keeps whatever implementation it already had
            warnings.warn(('No fast library implementation found for "%s", '
                           'falling back to default.') % node.name)
        else:
            node.implementation = chosen
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType) -> None:
    '''
    Helper function that adjusts allocation lifetimes and WCR flags in place.

    - In every (nested) SDFG, transient non-view arrays whose total size is
      not symbolic and whose storage is not Register become persistent.
    - On GPU, transient non-view arrays of the top-level SDFG stored in
      GPU_Global become persistent as well, and ``wcr_nonatomic`` is reset
      to False on every edge of every state.

    :param sdfg: SDFG
    :param device: Device type
    '''
    persistent = dtypes.AllocationLifetime.Persistent

    for nested in sdfg.all_sdfgs_recursive():
        for desc in nested.arrays.values():
            if not desc.transient or isinstance(desc, dt.View):
                continue
            if symbolic.issymbolic(desc.total_size):
                continue
            if desc.storage != dtypes.StorageType.Register:
                desc.lifetime = persistent

    if device != dtypes.DeviceType.GPU:
        return

    # Top-level SDFG only; symbolic sizes are not excluded here
    for desc in sdfg.arrays.values():
        if (desc.transient and not isinstance(desc, dt.View)
                and desc.storage == dtypes.StorageType.GPU_Global):
            desc.lifetime = persistent

    # Reset nonatomic WCR edges
    for element, _ in sdfg.all_nodes_recursive():
        if isinstance(element, SDFGState):
            for edge in element.edges():
                edge.data.wcr_nonatomic = False
def fpga_global_to_local(sdfg: SDFG, max_size: int = 1048576) -> None:
    """
    Takes an entire SDFG and changes the storage type of a global FPGA data
    container to Local in the following situation:

    - the data is transient,
    - the data is not a transient shared with other states, and
    - the data has a compile-time known size.

    :param sdfg: The SDFG to operate on. It must be a top-level SDFG.
    :param max_size: Maximum size (in bytes) that a container can have to be
                     considered for storage type change.
    :note: Operates in-place on the SDFG.
    """
    converted = []

    # The shared transients do not depend on the storage types changed below,
    # so query them once instead of once per container
    shared = set(sdfg.shared_transients())

    for name, desc in sdfg.arrays.items():
        if not desc.transient or name in shared:
            continue
        if desc.storage != dtypes.StorageType.FPGA_Global:
            continue

        # Get the total size, trying to resolve it to constant if it is a symbol
        total_size = symbolic.resolve_symbol_to_constant(desc.total_size, sdfg)
        if total_size is None or total_size * desc.dtype.bytes > max_size:
            continue

        desc.storage = dtypes.StorageType.FPGA_Local
        converted.append(name)

        # Update all access nodes that refer to this container, including
        # those reaching it through nested SDFGs
        for node, graph in sdfg.all_nodes_recursive():
            if not isinstance(node, nodes.AccessNode):
                continue
            trace = trace_nested_access(node, graph, graph.parent)
            for (_, candidate), _memlet_trace, _state_trace, _sdfg_trace in trace:
                if candidate is not None and candidate.data == name:
                    nodedesc = node.desc(graph)
                    nodedesc.storage = dtypes.StorageType.FPGA_Local

    if config.Config.get_bool('debugprint'):
        print(f'Applied {len(converted)} Global-To-Local{": " if len(converted)>0 else "."} {", ".join(converted)}')
def apply_pass(self, sdfg: SDFG, pipeline_results: Dict[str, Any]) -> Optional[Dict[Any, Any]]:
    """
    Recursively walks the given SDFG and dispatches every node, every edge and
    every edge's data object to the matching ``visit_<ClassName>`` method,
    falling back to ``generic_visit`` when no such method exists.

    :param sdfg: The SDFG to recursively visit.
    :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                             results as ``{Pass subclass name: returned object from pass}``. If not run in a
                             pipeline, an empty dictionary is expected.
    :return: A dictionary of ``{element: return value}`` for visited elements with a non-None return value, or
             None if nothing was returned.
    """
    collected = {}

    def _dispatch(element, owner):
        # Resolve the visitor by the element's class name
        visitor = getattr(self, f'visit_{type(element).__name__}', self.generic_visit)
        outcome = visitor(element, owner, pipeline_results)
        if outcome is not None:
            collected[element] = outcome

    # Nodes (SDFGState, AccessNode, ...)
    for node, parent in sdfg.all_nodes_recursive():
        _dispatch(node, parent)

    # Edges (Edge, MultiConnectorEdge) followed by their data (Memlet, InterstateEdge)
    for edge, parent in sdfg.all_edges_recursive():
        _dispatch(edge, parent)
        _dispatch(edge.data, parent)

    return collected or None
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False) -> SDFG:
    """
    Applies a fixed sequence of transformations that brings the given SDFG to
    decent performance. The sequence is:

        * Strict transformations
        * Strict auto-parallelization (loop-to-map), repeated together with
          the previous step until no more loops are converted
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device

    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    # Strict transformations and loop parallelization, until a fixed point
    while True:
        sdfg.apply_strict_transformations(validate=False, validate_all=validate_all)
        xfh.split_interstate_edges(sdfg)

        # Try to parallelize loops
        converted_loops = sdfg.apply_transformations_repeated(LoopToMap,
                                                              strict=True,
                                                              validate=False,
                                                              validate_all=validate_all)
        if not converted_loops > 0:
            break

    # Map fusion
    greedy_fuse(sdfg, validate_all)

    if device == dtypes.DeviceType.FPGA:
        # FPGA flow: apply the FPGA transformations, select fast library
        # implementations and stop. None of the steps below (including the
        # final validation) run for this device.
        sdfg.apply_fpga_transformations()
        fpga_aopt.fpga_global_to_local(sdfg)
        fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg)
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams (snapshot the SDFG list before transforming)
    for nested in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nested, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse, strict=True, validate=False, validate_all=validate_all)
    map_entries = (n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, nodes.MapEntry))
    for entry in map_entries:
        entry.map.collapse = len(entry.map.range)

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections
    # TODO(later): Set on a per-SDFG basis
    config.Config.set('compiler', 'cpu', 'openmp_sections', value=False)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:

        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device

    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to int/float.
                    Only entries that name a free symbol of the SDFG and whose
                    value is an int, a float or a sympy Integer are used for
                    specialization.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination, validate=validate, validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated((LoopToMap, RefineNestedAccess),
                                                   validate=False,
                                                   validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse, validate=False, validate_all=validate_all)

    # Apply GPU transformations and set library node implementations
    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # fuse subgraphs greedily
    sdfg.simplify()

    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # fuse stencils greedily
    greedy_fuse(sdfg, device=device, validate_all=validate_all, recursive=False, stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls.
        # The FPGA flow ends here: none of the steps below (including symbol
        # specialization and the final validation) run for this device.
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse, validate=False, validate_all=validate_all)

    # FORNOW: Leave out
    # Setting the OMP collapse property of every map entry to the map length
    # (node.map.collapse = len(node.map.range)) is currently disabled, so no
    # traversal of the map entries is performed here.

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols. The free symbols are queried once
        # rather than once per candidate symbol.
        free_symbols = sdfg.free_symbols
        known_symbols = {}
        for name, value in symbols.items():
            if name not in free_symbols:
                continue
            if isinstance(value, (int, float)):
                known_symbols[name] = value
            if isinstance(value, sympy.core.numbers.Integer):
                try:
                    known_symbols[name] = int(value)
                except TypeError:
                    # Best effort: a value that cannot be converted is skipped
                    pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # FORNOW: Leave out
    # Fix storage and allocation properties, e.g., for benchmarking purposes:
    #     make_transients_persistent(sdfg, device)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
def contains_any_sve(sdfg: SDFG):
    """
    Tells whether the recursive node traversal of the given SDFG yields at
    least one ``nodes.Map`` instance whose schedule is ``SVE_Map``.

    :param sdfg: The SDFG to search (nested SDFGs included).
    :return: True if such an element exists, False otherwise.
    """
    # NOTE(review): the type test is against nodes.Map, not a map entry node -
    # confirm that all_nodes_recursive() actually yields Map instances.
    return any(
        isinstance(node, nodes.Map) and node.schedule == dace.ScheduleType.SVE_Map
        for node, _ in sdfg.all_nodes_recursive())
def apply(self, sdfg: sd.SDFG):
    """
    Transforms the given SDFG, in place, so that it runs on the GPU.

    The transformation proceeds in the following steps:

        0. Collect the non-transient containers that are read (inputs) and
           written (outputs) in the SDFG.
        1. Clone those containers as transient ``GPU_Global`` arrays and
           redirect access nodes and memlets to the clones.
        2. Prepend a state that copies inputs to their clones.
        3. Append a state that copies output clones back.
        4. Move top-level transients to ``GPU_Global`` (optionally with
           SDFG lifetime) and, if requested, inner transients to registers.
        5. Give top-level maps, library nodes and nested SDFGs a GPU
           schedule (optionally making inner scopes sequential).
        6. Wrap free tasklets in a single-iteration GPU map.
        7. Insert interim copy-out states for cloned data that is used on
           outgoing interstate edges.
        8. Optionally apply strict transformations.

    Reads the following attributes of ``self``: ``exclude_copyin``,
    ``exclude_copyout``, ``exclude_tasklets`` (comma-separated names),
    ``toplevel_trans``, ``register_trans``, ``sequential_innermaps`` and
    ``strict_transform``.

    :param sdfg: The SDFG to transform.
    """

    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors. The lists hold
    # (name, descriptor) pairs in order of first appearance; the sets hold the
    # names already recorded so that each container is listed exactly once.
    input_nodes = []
    output_nodes = []
    input_names = set()
    output_names = set()
    # Maps each state to the list of free tasklets found in it
    global_code_nodes: Dict[sd.SDFGState, list] = defaultdict(list)

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, nodes.AccessNode) and not node.desc(sdfg).transient:
                if state.out_degree(node) > 0 and node.data not in input_names:
                    # Special case: nodes that lead to top-level dynamic
                    # map ranges must stay on host
                    for e in state.out_edges(node):
                        last_edge = state.memlet_path(e)[-1]
                        if (isinstance(last_edge.dst, nodes.EntryNode) and last_edge.dst_conn
                                and not last_edge.dst_conn.startswith('IN_') and sdict[last_edge.dst] is None):
                            break
                    else:
                        input_nodes.append((node.data, node.desc(sdfg)))
                        input_names.add(node.data)
                if state.in_degree(node) > 0 and node.data not in output_names:
                    output_nodes.append((node.data, node.desc(sdfg)))
                    output_names.add(node.data)

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None:
                if e.data.data not in input_names and not sdfg.arrays[e.data.data].transient:
                    input_nodes.append((e.data.data, sdfg.arrays[e.data.data]))
                    input_names.add(e.data.data)

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in input_nodes:
        if isinstance(inode, data.Scalar):  # Scalars can remain on host
            continue
        if inode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = inode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + inodename, newdesc, find_new_name=True)
        cloned_arrays[inodename] = name

    for onodename, onode in output_nodes:
        if onodename in cloned_arrays:
            continue
        if onode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = onode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + onodename, newdesc, find_new_name=True)
        cloned_arrays[onodename] = name

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if isinstance(node, nodes.AccessNode) and node.data in cloned_arrays:
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state
    excluded_copyin = self.exclude_copyin.split(',')

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, sd.InterstateEdge())

    for nname, desc in input_nodes:
        if nname in excluded_copyin or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(src_array, dst_array,
                               memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state
    excluded_copyout = self.exclude_copyout.split(',')

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, sd.InterstateEdge())

    for nname, desc in output_nodes:
        if nname in excluded_copyout or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(src_array, dst_array,
                                memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    const_syms = xfh.constant_symbols(sdfg)
    # Storage types that are left as they are
    gpu_storage = [
        dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned
    ]

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, nodes.AccessNode) and node.desc(sdfg).transient:
                nodedesc = node.desc(sdfg)

                # Special case: nodes that lead to dynamic map ranges must
                # stay on host
                if any(isinstance(state.memlet_path(e)[-1].dst, nodes.EntryNode)
                       for e in state.out_edges(node)):
                    continue

                if sdict[node] is None and nodedesc.storage not in gpu_storage:
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = dtypes.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    dsyms = set(map(str, nodedesc.free_symbols))
                    if (self.toplevel_trans and not isinstance(nodedesc, (data.Stream, data.View))
                            and not (dsyms - const_syms)):
                        nodedesc.lifetime = dtypes.AllocationLifetime.SDFG
                elif nodedesc.storage not in gpu_storage:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = dtypes.StorageType.Register

    #######################################################
    # Step 5: Change all top-level maps and library nodes to GPU schedule

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if sdict[node] is None:
                if isinstance(node, (nodes.LibraryNode, nodes.NestedSDFG)):
                    node.schedule = dtypes.ScheduleType.GPU_Default
                elif isinstance(node, nodes.EntryNode):
                    node.schedule = dtypes.ScheduleType.GPU_Device
            elif self.sequential_innermaps:
                if isinstance(node, (nodes.EntryNode, nodes.LibraryNode)):
                    node.schedule = dtypes.ScheduleType.Sequential
                elif isinstance(node, nodes.NestedSDFG):
                    for nnode, _ in node.sdfg.all_nodes_recursive():
                        if isinstance(nnode, (nodes.EntryNode, nodes.LibraryNode)):
                            nnode.schedule = dtypes.ScheduleType.Sequential

    #######################################################
    # Step 6: Wrap free tasklets and nested SDFGs with a GPU map

    # Collect free tasklets
    for node, state in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.Tasklet):
            if (state.entry_node(node) is None
                    and not scope.is_devicelevel_gpu(state.parent, state, node, with_gpu_default=True)):
                global_code_nodes[state].append(node)

    # Parsed once instead of once per tasklet
    excluded_tasklets = self.exclude_tasklets.split(',')

    for state, gcodes in global_code_nodes.items():
        for gcode in gcodes:
            if gcode.label in excluded_tasklets:
                continue
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap', {gcode.label + '__gmapi': '0:1'},
                                   schedule=dtypes.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            # Both connector sets of the map entry are named after the
            # tasklet's input connectors; those of the exit after its outputs
            me.in_connectors = {('IN_' + e.dst_conn): None for e in in_edges}
            me.out_connectors = {('OUT_' + e.dst_conn): None for e in in_edges}
            mx.in_connectors = {('IN_' + e.src_conn): None for e in out_edges}
            mx.out_connectors = {('OUT_' + e.src_conn): None for e in out_edges}

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn, e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn, e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn, e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn, e.data)

            # Map without inputs
            if not in_edges:
                state.add_nedge(me, gcode, memlet.Memlet())

    #######################################################
    # Step 7: Introduce copy-out if data used in outgoing interstate edges

    for state in list(sdfg.nodes()):
        arrays_used = set()
        for e in sdfg.out_edges(state):
            # Used arrays = intersection between symbols and cloned arrays
            arrays_used.update(set(e.data.free_symbols) & set(cloned_arrays.keys()))

        # Create a state and copy out used arrays
        if arrays_used:
            co_state = sdfg.add_state(state.label + '_icopyout')

            # Reconnect outgoing edges to after interim copyout state
            # NOTE(review): the call does not depend on the loop variable;
            # presumably a single call already moves every edge - confirm
            # against sdutil.change_edge_src before simplifying.
            for e in sdfg.out_edges(state):
                sdutil.change_edge_src(sdfg, state, co_state)
            # Add unconditional edge to interim state
            sdfg.add_edge(state, co_state, sd.InterstateEdge())

            # Add copy-out nodes
            for nname in arrays_used:
                desc = sdfg.arrays[nname]
                src_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo)
                dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
                co_state.add_node(src_array)
                co_state.add_node(dst_array)
                co_state.add_nedge(src_array, dst_array,
                                   memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 8: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    sdfg.apply_strict_transformations()
def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator):
    """
    Queries all code generation targets in this SDFG and all nested SDFGs,
    as well as instrumentation providers, and stores them in the frame code generator.

    Every element yielded by ``sdfg.all_nodes_recursive()`` is inspected. The
    dispatchers returned by the frame's dispatcher object are added to
    ``frame.targets``, and the instrumentation providers in use are recorded
    in the dispatcher's ``instrumentation`` dictionary, keyed by
    instrumentation type.

    :param sdfg: The SDFG to query (nested SDFGs are traversed as well).
    :param frame: The frame code generator whose ``targets`` set and
                  dispatcher are populated in place.
    """
    disp = frame._dispatcher
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    # "No instrumentation" is always registered, with no provider
    disp.instrumentation[dtypes.InstrumentationType.No_Instrumentation] = None
    for node, parent in sdfg.all_nodes_recursive():
        # Query nodes and scopes
        if isinstance(node, SDFGState):
            # For states, ``parent`` is passed as the first argument of the
            # state dispatcher query
            frame.targets.add(disp.get_state_dispatcher(parent, node))
        elif isinstance(node, dace.nodes.EntryNode):
            # Scope entries are dispatched by their schedule
            frame.targets.add(disp.get_scope_dispatcher(node.schedule))
        elif isinstance(node, dace.nodes.Node):
            # For all other nodes, ``parent`` is treated as the containing state
            state: SDFGState = parent
            nsdfg = state.parent
            frame.targets.add(disp.get_node_dispatcher(nsdfg, state, node))

        # Array allocation
        if isinstance(node, dace.nodes.AccessNode):
            state: SDFGState = parent
            nsdfg = state.parent
            desc = node.desc(nsdfg)
            # Allocation is dispatched by the storage type of the descriptor
            frame.targets.add(disp.get_array_dispatcher(desc.storage))

        # Copies and memlets - via access nodes and tasklets
        # To avoid duplicate checks, only look at outgoing edges of access nodes and tasklets
        if isinstance(node, (dace.nodes.AccessNode, dace.nodes.Tasklet)):
            state: SDFGState = parent
            for e in state.out_edges(node):
                # Empty memlets are skipped
                if e.data.is_empty():
                    continue
                mtree = state.memlet_tree(e)
                if mtree.downwards:
                    # Rooted at src_node
                    # Query one copy dispatcher per non-empty leaf edge
                    for leaf_e in mtree.leaves():
                        dst_node = leaf_e.dst
                        if leaf_e.data.is_empty():
                            continue
                        tgt = disp.get_copy_dispatcher(node, dst_node, leaf_e, state.parent, state)
                        if tgt is not None:
                            frame.targets.add(tgt)
                else:
                    # Rooted at dst_node
                    # A single query, using the destination of the tree's root edge
                    dst_node = mtree.root().edge.dst
                    tgt = disp.get_copy_dispatcher(node, dst_node, e, state.parent, state)
                    if tgt is not None:
                        frame.targets.add(tgt)

        # Instrumentation-related query
        # The instrumentation type is read from the element itself, or
        # otherwise from its ``consume`` or ``map`` attribute (first match wins)
        if hasattr(node, 'instrument'):
            disp.instrumentation[node.instrument] = provider_mapping[node.instrument]
        elif hasattr(node, 'consume'):
            disp.instrumentation[node.consume.instrument] = provider_mapping[node.consume.instrument]
        elif hasattr(node, 'map'):
            disp.instrumentation[node.map.instrument] = provider_mapping[node.map.instrument]

    # Query instrumentation provider of SDFG
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        disp.instrumentation[sdfg.instrument] = provider_mapping[sdfg.instrument]
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType, toplevel_only: bool = True) -> None:
    '''
    Helper function that adjusts allocation lifetimes and WCR flags in place.

    - Makes the lifetime of eligible transients persistent, in every
      (nested) SDFG.
    - Resets nonatomic WCR edges on GPU.

    A container is eligible if every one of its access nodes passes all of
    the following checks:

    - its descriptor is transient and is exactly an Array or a Scalar,
    - its storage is not Register,
    - the symbols of its total size are all free symbols of its SDFG (so
      that it can be allocated in a persistent manner), and
    - the access node is not inside a map; if ``toplevel_only`` is False,
      access nodes inside a map are tolerated unless the container has
      Scope lifetime.

    :param sdfg: SDFG
    :param device: Device type
    :param toplevel_only: If True, only converts access nodes that do not
                          appear in any scope.
    '''
    convertible_types = {dt.Array, dt.Scalar}

    for nsdfg in sdfg.all_sdfgs_recursive():
        param_syms = nsdfg.free_symbols
        candidates = set()
        rejected = set()

        for state in nsdfg.nodes():
            for dnode in state.data_nodes():
                name = dnode.data
                if name in rejected:
                    continue
                desc = dnode.desc(nsdfg)

                # Only convert arrays and scalars that are not registers
                if (not desc.transient or type(desc) not in convertible_types
                        or desc.storage == dtypes.StorageType.Register):
                    rejected.add(name)
                    continue

                # Only convert arrays where the size depends on SDFG parameters
                try:
                    size_syms = set(map(str, desc.total_size.free_symbols))
                except AttributeError:
                    # total_size is an integer / has no free symbols
                    size_syms = set()
                if size_syms - param_syms:
                    rejected.add(name)
                    continue

                # Only convert arrays with top-level access nodes
                if xfh.get_parent_map(state, dnode) is not None and (
                        toplevel_only or desc.lifetime == dtypes.AllocationLifetime.Scope):
                    rejected.add(name)
                    continue

                candidates.add(name)

        # A single rejected access node disqualifies the whole container
        for name in candidates - rejected:
            nsdfg.arrays[name].lifetime = dtypes.AllocationLifetime.Persistent

    if device != dtypes.DeviceType.GPU:
        return

    # Reset nonatomic WCR edges
    for element, _ in sdfg.all_nodes_recursive():
        if isinstance(element, SDFGState):
            for edge in element.edges():
                edge.data.wcr_nonatomic = False