def inline_sdfgs(sdfg: SDFG, strict: bool = True, progress: bool = False) -> int:
    """
    Inlines all possible nested SDFGs (or sub-SDFGs) using an optimized
    routine that uses the structure of the SDFG hierarchy.

    The SDFGs yielded by ``sdfg.all_sdfgs_recursive()`` are visited in
    reverse order. In every state of each of them, each ``NestedSDFG`` node
    is offered to the ``InlineSDFG`` transformation, which is applied
    whenever ``can_be_applied`` accepts it.

    :param sdfg: The SDFG to transform.
    :param strict: If True (default), operates in strict mode.
    :param progress: If True, prints out a progress bar of inlining (may be
                     inaccurate, requires ``tqdm``)
    :return: The total number of SDFGs inlined.
    """
    from dace.transformation.interstate import InlineSDFG  # Avoid import loop

    counter = 0
    sdfgs = list(sdfg.all_sdfgs_recursive())

    pbar = None
    if progress:
        from tqdm import tqdm
        pbar = tqdm(total=len(sdfgs))

    try:
        for sd in reversed(sdfgs):
            # Named ``sdfg_id`` (not ``id``) to avoid shadowing the builtin
            sdfg_id = sd.sdfg_id
            for state_id, state in enumerate(sd.nodes()):
                for node in state.nodes():
                    if not isinstance(node, NestedSDFG):
                        continue
                    # We have to reevaluate every time due to changing IDs
                    node_id = state.node_id(node)

                    candidate = {
                        InlineSDFG._nested_sdfg: node_id,
                    }
                    inliner = InlineSDFG(sdfg_id,
                                         state_id,
                                         candidate,
                                         0,
                                         override=True)
                    if inliner.can_be_applied(state,
                                              candidate,
                                              0,
                                              sd,
                                              strict=strict):
                        inliner.apply(sd)
                        counter += 1
                        if pbar is not None:
                            pbar.update(1)
    finally:
        # Close the bar even if a transformation raised, so the terminal
        # is not left with a dangling progress line
        if pbar is not None:
            pbar.close()

    if config.Config.get_bool('debugprint'):
        print(f'Inlined {counter} SDFGs')
    return counter
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        The reduce node is first expanded into a nested SDFG through
        ``self._expand_reduce``, optional transients are inserted with
        ``LocalStorage``, the nested SDFG is inlined again and the inner
        map is replaced by a fresh reduce node. If the output array has a
        prior occurrence in the state (or is not transient) and no identity
        was given, an update tasklet combining the old array contents with
        the reduction result is added as well.

        Side effects on ``self``: ``_new_reduce`` and ``_outer_entry`` are
        set to the new reduce node and the outer map entry, and
        ``create_out_transient`` is forced to True whenever the update
        tasklet is required.

        :param sdfg: SDFG that contains ``graph``.
        :param graph: State of ``sdfg`` that contains ``reduce_node``.
        :param reduce_node: The reduce node to expand.
        :raises NotImplementedError: For implementations containing
                                     ``'warp'`` and for reduction types
                                     without an entry in
                                     ``ReduceExpansion.reduction_type_update``
                                     when an update tasklet is needed.
        :raises ValueError: If no identity was given, an out transient is
                            created and no identity can be inferred from
                            the reduction type.
    """
    from collections import deque

    out_storage_node = graph.out_edges(reduce_node)[0].dst
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation
    if implementation and 'warp' in implementation:
        raise NotImplementedError(
            "WIP: Warp Reductions are not Implemented yet.")

    # remove the reduce identity
    # we will reassign it later after expanding
    reduce_node.identity = None
    # expand the reduce node
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    def _add_register_transient(node_a, node_b, array_name):
        # Applies LocalStorage between two nodes of the nested state and
        # sets the storage of the resulting data node to Register.
        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nstate.nodes().index(node_a),
            LocalStorage.node_b: nstate.nodes().index(node_b)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_name
        local_storage.apply(nsdfg.sdfg)
        data_node = local_storage._data_node
        nsdfg.sdfg.data(data_node.data).storage = dtypes.StorageType.Register
        return data_node

    # find earliest parent read-write occurrence of array onto which
    # we perform the reduction:
    # do BFS, best complexity O(V+E)
    # (every node is enqueued at most once; the first match is the same
    # node a BFS without duplicate elimination would return)
    array_closest_ancestor = None
    queue = deque([nsdfg])
    enqueued = {nsdfg}
    while queue:
        current = queue.popleft()
        if (isinstance(current, nodes.AccessNode)
                and current.data == out_storage_node.data):
            # it suffices to find the first node
            # no matter what access (ReadWrite or Read)
            array_closest_ancestor = current
            break
        for pred_edge in graph.in_edges(current):
            if pred_edge.src not in enqueued:
                enqueued.add(pred_edge.src)
                queue.append(pred_edge.src)

    # if ancestor doesn't exist:
    #           if non-transient: create data node accessing it
    #           if transient: ancestor_node = none, set_zero on outer node

    shortcut = False
    if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
                                    or identity is not None:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map")
        # we are lucky
        shortcut = True
        nstate.out_edges(outer_exit)[0].data.wcr = None

    else:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map "
                  "and introducing update Tasklet, "
                  "connecting with ancestor.")
        if not array_closest_ancestor:
            array_closest_ancestor = nodes.AccessNode(
                out_storage_node.data, access=dtypes.AccessType.ReadOnly)
            graph.add_node(array_closest_ancestor)
            # array_closest_ancestor now points to the node we want to connect
            # to the map entry

        # always have to create out transient in this case
        self.create_out_transient = True

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        # and push it to register
        out_transient_node_inner = _add_register_transient(
            inner_exit, outer_exit,
            nstate.out_edges(outer_exit)[0].data.data)
        if shortcut:
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        # and push it to register
        _add_register_transient(outer_entry, inner_entry,
                                nstate.in_edges(outer_entry)[0].data.data)

    # first, inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    if not shortcut:
        reduction_type = detect_reduction_type(wcr)
        try:
            code = ReduceExpansion.reduction_type_update[reduction_type]
        except KeyError:
            raise NotImplementedError(
                "Not yet implemented for custom reduction")

        new_tasklet = graph.add_tasklet(
            name="reduction_transient_update",
            inputs={"reduction_in", "array_in"},
            outputs={"out"},
            code=code)

        edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
                         if self.create_out_transient \
                         else graph.out_edges(inner_exit)[0]

        new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                        volume=1,
                                        subset=edge_to_remove.data.subset)
        new_memlet_array_outer = Memlet(
            data=array_closest_ancestor.data,
            volume=graph.in_edges(outer_entry)[0].data.volume,
            subset=subsets.Range.from_array(
                sdfg.data(out_storage_node.data)))

        new_memlet_reduction = Memlet(
            data=graph.out_edges(inner_exit)[0].data.data,
            volume=1,
            subset=graph.out_edges(inner_exit)[0].data.subset)
        new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                      volume=1,
                                      subset=edge_to_remove.data.subset)
        new_memlet_out_outer = dcpy(new_memlet_array_outer)

        # remove old edges
        outer_edge_to_remove = None
        for edge in graph.out_edges(outer_exit):
            if edge.src == edge_to_remove.dst:
                outer_edge_to_remove = edge

        graph.remove_edge_and_connectors(edge_to_remove)
        graph.remove_edge_and_connectors(outer_edge_to_remove)

        graph.add_edge(out_transient_node_inner if self.create_out_transient \
                                                else inner_exit,
                       None, new_tasklet, "reduction_in",
                       new_memlet_reduction)

        graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                       new_memlet_array_inner)
        graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                       new_memlet_array_outer)
        graph.add_edge(new_tasklet, "out", outer_exit, None,
                       new_memlet_out_inner)
        graph.add_edge(outer_exit, None, out_storage_node, None,
                       new_memlet_out_outer)

        # fill map scope connectors
        graph.fill_scope_connectors()
        graph._clear_scopedict_cache()
        # wcr is already removed

    # FORNOW: choose default schedule and implementation
    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
                         if self.reduce_implementation is not None \
                         else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace the inner map (and the tasklet inside it) with the new
    # reduce node, reusing the inner map's outside connections
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)

    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._new_reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # the f-prefix has to be on the literal that holds the
                # placeholder, otherwise the node is never interpolated
                raise ValueError("Cannot infer reduction identity. "
                                 "Please specify the identity of node "
                                 f"{reduce_node_new}")
def expand(self, sdfg, graph, reduce_node):
    """ Splits the data dimension into an inner and outer dimension,
        where the inner dimension are the reduction axes and the
        outer axes the complement. Pushes the reduce inside a new
        map consisting of the complement axes.

        The reduce node is first expanded into a nested SDFG through
        ``self._expand_reduce``, optional transients are inserted with
        ``LocalStorage``, the nested SDFG is inlined again and the inner
        map is replaced by a fresh reduce node.

        Side effects on ``self``: ``_reduce`` and ``_outer_entry`` are set
        to the new reduce node and the outer map entry.

        :param sdfg: SDFG that contains ``graph``.
        :param graph: State of ``sdfg`` that contains ``reduce_node``.
        :param reduce_node: The reduce node to expand.
        :raises ValueError: If no identity was given, an out transient is
                            created and no identity can be inferred from
                            the reduction type.
    """
    from collections import deque

    # get out storage node, might be hidden behind view node
    out_storage_node = reduce_node
    while not isinstance(out_storage_node, nodes.AccessNode):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst

    if isinstance(sdfg.data(out_storage_node.data), View):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

    # get other useful quantities from the original reduce node
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation

    # remove the reduce identity, will get reassigned after expansion
    reduce_node.identity = None
    # expand the reduce node
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    def _add_register_transient(node_a, node_b, array_name):
        # Applies LocalStorage between two nodes of the nested state and
        # sets the storage of the resulting data node to Register.
        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nstate.nodes().index(node_a),
            LocalStorage.node_b: nstate.nodes().index(node_b)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id,
                                     local_storage_subgraph, 0)
        local_storage.array = array_name
        local_storage.apply(nsdfg.sdfg)
        data_node = local_storage._data_node
        nsdfg.sdfg.data(data_node.data).storage = dtypes.StorageType.Register
        return data_node

    # find earliest parent read-write occurrence of array onto which the
    # reduction is performed: BFS
    # The result is only consulted when an out transient is created.
    array_closest_ancestor = None
    if self.create_out_transient:
        # FIFO order so that the node reported really is the closest one
        queue = deque([nsdfg])
        enqueued = {nsdfg}
        while queue:
            current = queue.popleft()
            if (isinstance(current, nodes.AccessNode)
                    and current.data == out_storage_node.data):
                # it suffices to find the first node
                # no matter what access (ReadWrite or Read)
                array_closest_ancestor = current
                break
            for pred_edge in graph.in_edges(current):
                if pred_edge.src not in enqueued:
                    enqueued.add(pred_edge.src)
                    queue.append(pred_edge.src)

        if self.debug and array_closest_ancestor:
            print(
                f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
            )
        elif self.debug:
            print("ReduceExpansion::No closest ancestor found")

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        # and push it to register
        out_transient_node_inner = _add_register_transient(
            inner_exit, outer_exit,
            nstate.out_edges(outer_exit)[0].data.data)

        # remove WCRs from all edges where possible if there is no
        # prior occurrence
        if array_closest_ancestor is None:
            nstate.out_edges(outer_exit)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
    else:
        # remove WCR from outer exit
        nstate.out_edges(outer_exit)[0].data.wcr = None

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        # and push it to register
        _add_register_transient(outer_entry, inner_entry,
                                nstate.in_edges(outer_entry)[0].data.data)

    # inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
                         if self.reduce_implementation is not None \
                         else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace inner map with new reduction node
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)

    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)
    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)
    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        if self.debug:
            print(
                "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
            )
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deducted from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[
                reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                # the f-prefix has to be on the literal that holds the
                # placeholder, otherwise the node is never interpolated
                raise ValueError("Cannot infer reduction identity. "
                                 "Please specify the identity of node "
                                 f"{reduce_node_new}")
def inline_sdfgs(sdfg: SDFG, permissive: bool = False, progress: bool = None, multistate: bool = True) -> int:
    """
    Inlines all possible nested SDFGs (or sub-SDFGs) using an optimized
    routine that uses the structure of the SDFG hierarchy.

    The SDFGs yielded by ``sdfg.all_sdfgs_recursive()`` are visited in
    reverse order, and every ``NestedSDFG`` node found in their states is
    offered to the inlining transformations.

    :param sdfg: The SDFG to transform.
    :param permissive: If True, operates in permissive mode, which ignores
                       some checks.
    :param progress: If True, prints out a progress bar of inlining (may be
                     inaccurate, requires ``tqdm``). If None, prints out
                     progress if over 5 seconds have passed. If False, never
                     shows progress bar. If ``tqdm`` cannot be imported, no
                     bar is shown; a warning is issued when a bar was
                     explicitly requested with True.
    :param multistate: If True (default), ``InlineMultistateSDFG`` is tried
                       first on every nested SDFG and ``InlineSDFG`` is only
                       attempted when it cannot be applied. If False, only
                       ``InlineSDFG`` is used.
    :return: The total number of SDFGs inlined.
    """
    # Avoid import loops
    from dace.transformation.interstate import InlineSDFG, InlineMultistateSDFG

    tqdm = None
    if progress is True or progress is None:
        try:
            from tqdm import tqdm
        except ImportError:
            tqdm = None

    if progress is True and tqdm is None:
        # Previously this path called ``None(...)`` and died with a
        # TypeError; inlining itself does not depend on the progress bar.
        import warnings
        warnings.warn('A progress bar was requested but tqdm is not '
                      'installed, inlining without progress output')
        progress = False

    # Transformations to attempt on each nested SDFG, in order, paired with
    # the pattern node used as key of their candidate dictionary
    attempts = []
    if multistate:
        attempts.append(
            (InlineMultistateSDFG, InlineMultistateSDFG.nested_sdfg))
    attempts.append((InlineSDFG, InlineSDFG._nested_sdfg))

    counter = 0
    sdfgs = list(sdfg.all_sdfgs_recursive())

    pbar = None
    if progress is True:
        pbar = tqdm(total=len(sdfgs), desc='Inlining SDFGs')

    start = time.time()

    try:
        for sd in reversed(sdfgs):
            # Named ``sdfg_id`` (not ``id``) to avoid shadowing the builtin
            sdfg_id = sd.sdfg_id
            for state in sd.nodes():
                for node in state.nodes():
                    # Lazily create the bar once inlining turns out slow
                    if (progress is None and tqdm is not None
                            and (time.time() - start) > 5):
                        progress = True
                        pbar = tqdm(total=len(sdfgs),
                                    desc='Inlining SDFG',
                                    initial=counter)

                    if not isinstance(node, NestedSDFG):
                        continue
                    # We have to reevaluate every time due to changing IDs
                    node_id = state.node_id(node)
                    state_id = sd.node_id(state)

                    for xform, pattern_node in attempts:
                        candidate = {
                            pattern_node: node_id,
                        }
                        inliner = xform(sdfg_id,
                                        state_id,
                                        candidate,
                                        0,
                                        override=True)
                        if inliner.can_be_applied(state,
                                                  candidate,
                                                  0,
                                                  sd,
                                                  permissive=permissive):
                            inliner.apply(sd)
                            counter += 1
                            if pbar is not None:
                                pbar.update(1)
                            # The node is gone, do not try the fallback
                            break
    finally:
        # Close the bar even if a transformation raised
        if pbar is not None:
            pbar.close()

    if config.Config.get_bool('debugprint') and counter > 0:
        print(f'Inlined {counter} SDFGs')
    return counter