def consolidate_edges(sdfg: SDFG, starting_scope=None) -> int:
    """
    Union scope-entering memlets relating to the same data node in all states.
    This effectively reduces the number of connectors and allows more
    transformations to be performed, at the cost of losing the individual
    per-tasklet memlets.

    :param sdfg: The SDFG to consolidate.
    :return: Number of edges removed.
    """
    from dace.sdfg.propagation import propagate_memlets_sdfg, propagate_memlets_scope

    consolidated = 0
    for state in sdfg.nodes():
        # Start bottom-up
        if starting_scope and starting_scope.entry not in state.nodes():
            continue

        queue = [starting_scope] if starting_scope else state.scope_leaves()
        next_queue = []
        while len(queue) > 0:
            for scope in queue:
                consolidated += consolidate_edges_scope(state, scope.entry)
                consolidated += consolidate_edges_scope(state, scope.exit)
                if scope.parent is not None:
                    next_queue.append(scope.parent)
            queue = next_queue
            next_queue = []

        if starting_scope is not None:
            # Repropagate memlets from this scope outwards
            propagate_memlets_scope(sdfg, state, starting_scope)

            # No need to traverse other states
            break

    # Repropagate memlets
    if starting_scope is None:
        propagate_memlets_sdfg(sdfg)

    return consolidated
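
# Hedged usage sketch (not part of the original source): consolidating the
# scope-entering edges of a small SDFG built with the DaCe Python frontend.
# The program `scale_and_shift` and its shapes are illustrative assumptions;
# in DaCe, consolidate_edges is importable from dace.sdfg.utils. How many
# edges are removed depends on how the frontend builds the graph (may be 0).
import dace
from dace.sdfg.utils import consolidate_edges

@dace.program
def scale_and_shift(A: dace.float64[64], B: dace.float64[64], C: dace.float64[64]):
    for i in dace.map[0:64]:
        B[i] = A[i] * 2.0   # first tasklet reading A inside the map
        C[i] = A[i] + 1.0   # second tasklet reading A inside the map

if __name__ == '__main__':
    sdfg = scale_and_shift.to_sdfg()
    removed = consolidate_edges(sdfg)  # union memlets of A entering the map scope
    print(f'consolidate_edges removed {removed} edge(s)')
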
def apply(self, _, sdfg: sd.SDFG):
    # Obtain loop information
    guard: sd.SDFGState = self.loop_guard
    body: sd.SDFGState = self.loop_begin

    # Obtain iteration variable, range, and stride
    itervar, (start, end, step), _ = find_for_loop(sdfg, guard, body)

    forward_loop = step > 0

    for node in body.nodes():
        if isinstance(node, nodes.MapEntry):
            map_entry = node
        if isinstance(node, nodes.MapExit):
            map_exit = node

    # nest map's content in sdfg
    map_subgraph = body.scope_subgraph(map_entry, include_entry=False, include_exit=False)
    nsdfg = helpers.nest_state_subgraph(sdfg, body, map_subgraph, full_data=True)

    # replicate loop in nested sdfg
    new_before, new_guard, new_after = nsdfg.sdfg.add_loop(
        before_state=None,
        loop_state=nsdfg.sdfg.nodes()[0],
        loop_end_state=None,
        after_state=None,
        loop_var=itervar,
        initialize_expr=f'{start}',
        condition_expr=f'{itervar} <= {end}' if forward_loop else f'{itervar} >= {end}',
        increment_expr=f'{itervar} + {step}' if forward_loop else f'{itervar} - {abs(step)}')

    # remove outer loop
    before_guard_edge = nsdfg.sdfg.edges_between(new_before, new_guard)[0]
    for e in nsdfg.sdfg.out_edges(new_guard):
        if e.dst is new_after:
            guard_after_edge = e
        else:
            guard_body_edge = e

    for body_inedge in sdfg.in_edges(body):
        if body_inedge.src is guard:
            guard_body_edge.data.assignments.update(body_inedge.data.assignments)
        sdfg.remove_edge(body_inedge)
    for body_outedge in sdfg.out_edges(body):
        sdfg.remove_edge(body_outedge)
    for guard_inedge in sdfg.in_edges(guard):
        before_guard_edge.data.assignments.update(guard_inedge.data.assignments)
        guard_inedge.data.assignments = {}
        sdfg.add_edge(guard_inedge.src, body, guard_inedge.data)
        sdfg.remove_edge(guard_inedge)
    for guard_outedge in sdfg.out_edges(guard):
        if guard_outedge.dst is body:
            guard_body_edge.data.assignments.update(guard_outedge.data.assignments)
        else:
            guard_after_edge.data.assignments.update(guard_outedge.data.assignments)
        guard_outedge.data.condition = CodeBlock("1")
        sdfg.add_edge(body, guard_outedge.dst, guard_outedge.data)
        sdfg.remove_edge(guard_outedge)
    sdfg.remove_node(guard)
    if itervar in nsdfg.symbol_mapping:
        del nsdfg.symbol_mapping[itervar]
    if itervar in sdfg.symbols:
        del sdfg.symbols[itervar]

    # Add missing data/symbols
    for s in nsdfg.sdfg.free_symbols:
        if s in nsdfg.symbol_mapping:
            continue
        if s in sdfg.symbols:
            nsdfg.symbol_mapping[s] = s
        elif s in sdfg.arrays:
            desc = sdfg.arrays[s]
            access = body.add_access(s)
            conn = nsdfg.sdfg.add_datadesc(s, copy.deepcopy(desc))
            nsdfg.sdfg.arrays[s].transient = False
            nsdfg.add_in_connector(conn)
            body.add_memlet_path(access, map_entry, nsdfg,
                                 memlet=Memlet.from_array(s, desc), dst_conn=conn)
        else:
            raise NotImplementedError(f"Free symbol {s} is neither a symbol nor data.")

    to_delete = set()
    for s in nsdfg.symbol_mapping:
        if s not in nsdfg.sdfg.free_symbols:
            to_delete.add(s)
    for s in to_delete:
        del nsdfg.symbol_mapping[s]

    # propagate scope for correct volumes
    scope_tree = ScopeTree(map_entry, map_exit)
    scope_tree.parent = ScopeTree(None, None)
    # The first execution helps remove appearances of symbols
    # that are now defined only in the nested SDFG in memlets.
    propagation.propagate_memlets_scope(sdfg, body, scope_tree)

    for s in to_delete:
        if helpers.is_symbol_unused(sdfg, s):
            sdfg.remove_symbol(s)

    from dace.transformation.interstate import RefineNestedAccess
    transformation = RefineNestedAccess()
    transformation.setup_match(sdfg, 0, sdfg.node_id(body),
                               {RefineNestedAccess.nsdfg: body.node_id(nsdfg)}, 0)
    transformation.apply(body, sdfg)

    # Second propagation for refined accesses.
    propagation.propagate_memlets_scope(sdfg, body, scope_tree)
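
# Hedged usage sketch (illustrative, not from the original source): applying the
# loop-into-map transformation through the standard DaCe transformation API.
# The program `smooth` is an assumed example; whether the transformation matches
# depends on its can_be_applied analysis, so `applied` may be 0.
import dace
from dace.transformation.interstate import MoveLoopIntoMap

@dace.program
def smooth(A: dace.float64[10, 64]):
    for t in range(1, 10):            # sequential for-loop (interstate loop)
        for i in dace.map[0:64]:      # parallel map inside the loop body
            A[t, i] = A[t - 1, i] * 0.5

if __name__ == '__main__':
    sdfg = smooth.to_sdfg()
    applied = sdfg.apply_transformations(MoveLoopIntoMap)
    print(f'MoveLoopIntoMap applied {applied} time(s)')
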
def expand(self, sdfg, graph, reduce_node):
    """
    Splits the data dimension into an inner and outer dimension,
    where the inner dimension are the reduction axes and the
    outer axes the complement. Pushes the reduce inside a new
    map consisting of the complement axes.
    """
    out_storage_node = graph.out_edges(reduce_node)[0].dst
    in_storage_node = graph.in_edges(reduce_node)[0].src
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    schedule = reduce_node.schedule
    implementation = reduce_node.implementation
    if implementation and 'warp' in implementation:
        raise NotImplementedError(
            "WIP: Warp Reductions are not Implemented yet.")

    # remove the reduce identity
    # we will reassign it later after expanding
    reduce_node.identity = None

    # expand the reduce node
    in_edge = graph.in_edges(reduce_node)[0]
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)

    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node
        if isinstance(node, nodes.Tasklet):
            tasklet_node = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which
    # we perform the reduction:
    # do BFS, best complexity O(V+E)
    queue = [nsdfg]
    array_closest_ancestor = None
    while len(queue) > 0:
        current = queue.pop(0)
        if isinstance(current, nodes.AccessNode):
            if current.data == out_storage_node.data:
                # it suffices to find the first node
                # no matter what access (ReadWrite or Read)
                array_closest_ancestor = current
                break
        queue.extend([in_edge.src for in_edge in graph.in_edges(current)])

    # if ancestor doesn't exist:
    #   if non-transient: create data node accessing it
    #   if transient: ancestor_node = none, set_zero on outer node
    shortcut = False
    if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
            or identity is not None:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map")
        # we are lucky
        shortcut = True
        nstate.out_edges(outer_exit)[0].data.wcr = None
    else:
        if self.debug:
            print("ReduceExpansion::Expanding Reduction into Map "
                  "and introducing update Tasklet, "
                  "connecting with ancestor.")
        if not array_closest_ancestor:
            array_closest_ancestor = nodes.AccessNode(
                out_storage_node.data, access=dtypes.AccessType.ReadOnly)
            graph.add_node(array_closest_ancestor)
            # array_closest_ancestor now points to the node we want to connect
            # to the map entry

        # always have to create out transient in this case
        self.create_out_transient = True

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data).storage = dtypes.StorageType.Register

        if shortcut:
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to shared memory / default
        nsdfg.sdfg.data(in_transient_node_inner.data).storage = dtypes.StorageType.Register

    # first, inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    if not shortcut:
        reduction_type = detect_reduction_type(wcr)
        try:
            code = ReduceExpansion.reduction_type_update[reduction_type]
        except KeyError:
            raise NotImplementedError(
                "Not yet implemented for custom reduction")

        new_tasklet = graph.add_tasklet(name="reduction_transient_update",
                                        inputs={"reduction_in", "array_in"},
                                        outputs={"out"},
                                        code=code)

        edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
            if self.create_out_transient \
            else graph.out_edges(inner_exit)[0]

        new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                        volume=1,
                                        subset=edge_to_remove.data.subset)
        new_memlet_array_outer = Memlet(
            data=array_closest_ancestor.data,
            volume=graph.in_edges(outer_entry)[0].data.volume,
            subset=subsets.Range.from_array(sdfg.data(out_storage_node.data)))

        new_memlet_reduction = Memlet(
            data=graph.out_edges(inner_exit)[0].data.data,
            volume=1,
            subset=graph.out_edges(inner_exit)[0].data.subset)
        new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                      volume=1,
                                      subset=edge_to_remove.data.subset)
        new_memlet_out_outer = dcpy(new_memlet_array_outer)

        # remove old edges
        outer_edge_to_remove = None
        for edge in graph.out_edges(outer_exit):
            if edge.src == edge_to_remove.dst:
                outer_edge_to_remove = edge

        graph.remove_edge_and_connectors(edge_to_remove)
        graph.remove_edge_and_connectors(outer_edge_to_remove)

        graph.add_edge(out_transient_node_inner if self.create_out_transient
                       else inner_exit,
                       None, new_tasklet, "reduction_in", new_memlet_reduction)
        graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                       new_memlet_array_inner)
        graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                       new_memlet_array_outer)
        graph.add_edge(new_tasklet, "out", outer_exit, None,
                       new_memlet_out_inner)
        graph.add_edge(outer_exit, None, out_storage_node, None,
                       new_memlet_out_outer)

        # fill map scope connectors
        graph.fill_scope_connectors()
        graph._clear_scopedict_cache()

        # wcr is already removed

    # FORNOW: choose default schedule and implementation
    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)
    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)

    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)

    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._new_reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deduced from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                raise ValueError("Cannot infer reduction identity. "
                                 f"Please specify the identity of node {reduce_node_new}")

    return
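
# Hedged sketch (illustrative, not from the original source): what the identity
# fallback above computes when the WCR is not listed in `reduction_type_identity`.
# detect_reduction_type and dtypes.max_value/min_value are existing DaCe helpers;
# the wcr string below is an assumed example.
import dace
from dace import dtypes
from dace.frontend.operations import detect_reduction_type

wcr = 'lambda a, b: min(a, b)'
rtype = detect_reduction_type(wcr)            # -> dtypes.ReductionType.Min
identity = dtypes.max_value(dace.float64)     # neutral element for a running minimum
print(rtype, identity)
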
def expand(self, sdfg, graph, reduce_node):
    """
    Splits the data dimension into an inner and outer dimension,
    where the inner dimension are the reduction axes and the
    outer axes the complement. Pushes the reduce inside a new
    map consisting of the complement axes.
    """
    # get out storage node, might be hidden behind view node
    out_data = graph.out_edges(reduce_node)[0].data
    out_storage_node = reduce_node
    while not isinstance(out_storage_node, nodes.AccessNode):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst

    if isinstance(sdfg.data(out_storage_node.data), View):
        out_storage_node = graph.out_edges(out_storage_node)[0].dst
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

    # get other useful quantities from the original reduce node
    wcr = reduce_node.wcr
    identity = reduce_node.identity
    implementation = reduce_node.implementation

    # remove the reduce identity, will get reassigned after expansion
    reduce_node.identity = None

    # expand the reduce node
    in_edge = graph.in_edges(reduce_node)[0]
    nsdfg = self._expand_reduce(sdfg, graph, reduce_node)

    # find the new nodes in the nested sdfg created
    nstate = nsdfg.sdfg.nodes()[0]
    for node, scope in nstate.scope_dict().items():
        if isinstance(node, nodes.MapEntry):
            if scope is None:
                outer_entry = node
            else:
                inner_entry = node
        if isinstance(node, nodes.Tasklet):
            tasklet_node = node

    inner_exit = nstate.exit_node(inner_entry)
    outer_exit = nstate.exit_node(outer_entry)

    # find earliest parent read-write occurrence of array onto which the
    # reduction is performed: BFS
    if self.create_out_transient:
        queue = [nsdfg]
        enqueued = set()
        array_closest_ancestor = None

        while len(queue) > 0:
            current = queue.pop()
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            for in_edge in graph.in_edges(current):
                if in_edge.src not in enqueued:
                    queue.append(in_edge.src)
                    enqueued.add(in_edge.src)

        if self.debug and array_closest_ancestor:
            print(f"ReduceExpansion::Closest ancestor={array_closest_ancestor}")
        elif self.debug:
            print("ReduceExpansion::No closest ancestor found")

    if self.create_out_transient:
        # create an out transient between inner and outer map exit
        array_out = nstate.out_edges(outer_exit)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
            LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0)
        local_storage.array = array_out
        local_storage.apply(nsdfg.sdfg)
        out_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(out_transient_node_inner.data).storage = dtypes.StorageType.Register

        # remove WCRs from all edges where possible if there is no
        # prior occurrence
        if array_closest_ancestor is None:
            nstate.out_edges(outer_exit)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
            nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
    else:
        # remove WCR from outer exit
        nstate.out_edges(outer_exit)[0].data.wcr = None

    if self.create_in_transient:
        # create an in-transient between inner and outer map entry
        array_in = nstate.in_edges(outer_entry)[0].data.data

        from dace.transformation.dataflow.local_storage import LocalStorage
        local_storage_subgraph = {
            LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
            LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
        }
        nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
        nstate_id = 0
        local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0)
        local_storage.array = array_in
        local_storage.apply(nsdfg.sdfg)
        in_transient_node_inner = local_storage._data_node

        # push to register
        nsdfg.sdfg.data(in_transient_node_inner.data).storage = dtypes.StorageType.Register

    # inline fuse back our nested SDFG
    from dace.transformation.interstate import InlineSDFG
    inline_sdfg = InlineSDFG(
        sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph),
        {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
    inline_sdfg.apply(sdfg)

    new_schedule = dtypes.ScheduleType.Default
    new_implementation = self.reduce_implementation \
        if self.reduce_implementation is not None \
        else implementation
    new_axes = dcpy(reduce_node.axes)

    reduce_node_new = graph.add_reduce(wcr=wcr,
                                       axes=new_axes,
                                       schedule=new_schedule,
                                       identity=identity)
    reduce_node_new.implementation = new_implementation

    # replace inner map with new reduction node
    edge_tmp = graph.in_edges(inner_entry)[0]
    memlet_src_reduce = dcpy(edge_tmp.data)
    graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                   memlet_src_reduce)

    edge_tmp = graph.out_edges(inner_exit)[0]
    memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                               volume=1,
                               subset=edge_tmp.data.subset)
    graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                   memlet_reduce_dst)

    identity_tasklet = graph.out_edges(inner_entry)[0].dst
    graph.remove_node(inner_entry)
    graph.remove_node(inner_exit)
    graph.remove_node(identity_tasklet)

    # propagate scope for correct volumes
    scope_tree = ScopeTree(outer_entry, outer_exit)
    scope_tree.parent = ScopeTree(None, None)

    propagate_memlets_scope(sdfg, graph, scope_tree)
    sdfg.validate()

    # create variables for outside access
    self._reduce = reduce_node_new
    self._outer_entry = outer_entry

    if identity is None and self.create_out_transient:
        if self.debug:
            print("ReduceExpansion::Trying to infer reduction WCR type "
                  "due to out transient created")

        # set the reduction identity accordingly so that the correct
        # blank result is written to the out_transient node
        # we use default values deduced from the reduction type
        reduction_type = detect_reduction_type(wcr)
        try:
            reduce_node_new.identity = self.reduction_type_identity[reduction_type]
        except KeyError:
            if reduction_type == dtypes.ReductionType.Min:
                reduce_node_new.identity = dtypes.max_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            elif reduction_type == dtypes.ReductionType.Max:
                reduce_node_new.identity = dtypes.min_value(
                    sdfg.arrays[out_storage_node.data].dtype)
            else:
                raise ValueError("Cannot infer reduction identity. "
                                 f"Please specify the identity of node {reduce_node_new}")

    return
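
# Hedged usage sketch (illustrative, not from the original source): expanding a
# Reduce library node through the standard transformation API. The program
# `rowsum` is an assumed example; after the pass, the reduction over axis 1 is
# nested inside a map over the remaining (row) axis.
import dace
from dace.transformation.dataflow import ReduceExpansion

@dace.program
def rowsum(A: dace.float64[32, 32], out: dace.float64[32]):
    dace.reduce(lambda a, b: a + b, A, out, axis=1, identity=0)

if __name__ == '__main__':
    sdfg = rowsum.to_sdfg()
    applied = sdfg.apply_transformations(ReduceExpansion)
    print(f'ReduceExpansion applied {applied} time(s)')
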