def test_gpu_dma():
    sdfg: dace.SDFG = gpu_dma.to_sdfg(strict=True)
    sdfg.name = 'gpu_dma'
    sdfg.apply_transformations(GPUTransformSDFG,
                               options={'strict_transform': False})

    map_ = next(n for n, _ in sdfg.all_nodes_recursive()
                if isinstance(n, nodes.MapEntry))
    add_gpu_location(sdfg, map_, 0)
    sdfg.arrays['gpu_X'].location = {'gpu': 1}

    # Clone GPU scalar
    inodename = 'alpha'
    inode = sdfg.arrays['alpha']
    newdesc = inode.clone()
    newdesc.location = {'gpu': 0}
    newdesc.storage = StorageType.GPU_Global
    newdesc.transient = True
    name = sdfg.add_datadesc('gpu_' + inodename, newdesc, find_new_name=True)

    # Replace original scalar
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data == inodename):
                node.data = name

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data == inodename:
                edge.data.data = name

    # Add GPU scalar to the copy-in state
    copyin_state = sdfg.start_state
    src_array = nodes.AccessNode(inodename, debuginfo=inode.debuginfo)
    dst_array = nodes.AccessNode(name, debuginfo=inode.debuginfo)
    copyin_state.add_node(src_array)
    copyin_state.add_node(dst_array)
    copyin_state.add_nedge(
        src_array, dst_array,
        Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    sdfg.apply_strict_transformations()

    np.random.seed(0)
    n = 16
    X = np.ndarray(shape=n, dtype=np_dtype)
    alpha = np.ndarray(shape=1, dtype=np_dtype)
    alpha.fill(np.random.rand())

    a_times_X = sdfg(X=X, alpha=alpha[0], N=n)
    res = X * alpha
    idx = zip(*np.where(~np.isclose(res, a_times_X, atol=0, rtol=1e-7)))
    for i in idx:
        print(i, res[i], a_times_X, X[i] * alpha, X[i], alpha)
    assert np.allclose(res, a_times_X)
    print('PASS')

class RedundantArrayCopying2(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes
        multiples of array B in pattern A -> B.
    """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(RedundantArrayCopying2._in_array,
                                   RedundantArrayCopying2._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[RedundantArrayCopying2._in_array]]
        out_array = graph.nodes()[candidate[
            RedundantArrayCopying2._out_array]]

        # Ensure that at least one other out-edge of in_array leads to a
        # duplicate of out_array (an access node with the same data)
        found = 0
        for _, _, dst, _, _ in graph.out_edges(in_array):
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                found += 1

        return found > 0

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[
            RedundantArrayCopying2._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantArrayCopying2._in_array)
        out_array = gnode(RedundantArrayCopying2._out_array)

        for e1 in graph.out_edges(in_array):
            dst = e1.dst
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                for e2 in graph.out_edges(dst):
                    graph.add_edge(out_array, None, e2.dst, e2.dst_conn,
                                   e2.data)
                    graph.remove_edge(e2)
                graph.remove_edge(e1)
                graph.remove_node(dst)
                if Config.get_bool("debugprint"):
                    RedundantArrayCopying2._arrays_removed += 1

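# ---- Usage sketch (illustrative, not part of the original file) ----
# Pattern transformations like the one above are typically driven through
# the SDFG API. `remove_duplicate_copies` is a hypothetical helper name;
# it assumes `dace` is imported and RedundantArrayCopying2 is in scope.
def remove_duplicate_copies(sdfg: dace.SDFG) -> int:
    # Apply until no more A -> B duplicates match; the call returns the
    # number of applications performed.
    return sdfg.apply_transformations_repeated(RedundantArrayCopying2)
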
class MapWCRFusion(pm.Transformation):
    """ Implements the map expanded-reduce fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else, and the
        reduction is divided into two maps with a WCR, denoting partial
        reduction.
    """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_in_tasklet = nodes.Tasklet('_')
    _rmap_in_cr = nodes.MapExit(nodes.Map("", [], []))
    _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_out_exit = nodes.MapExit(nodes.Map("", [], []))
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            # Map, then partial reduction of axes
            sdutil.node_path_graph(
                MapWCRFusion._tasklet, MapWCRFusion._tmap_exit,
                MapWCRFusion._in_array, MapWCRFusion._rmap_out_entry,
                MapWCRFusion._rmap_in_entry, MapWCRFusion._rmap_in_tasklet,
                MapWCRFusion._rmap_in_cr, MapWCRFusion._rmap_out_exit,
                MapWCRFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapWCRFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapWCRFusion._in_array]]
        rmap_entry = graph.nodes()[candidate[MapWCRFusion._rmap_out_entry]]

        # Make sure that the array is only accessed by the map and the reduce
        if any([
                src != tmap_exit
                for src, _, _, _, memlet in graph.in_edges(in_array)
        ]):
            return False
        if any([
                dest != rmap_entry
                for _, _, dest, _, memlet in graph.out_edges(in_array)
        ]):
            return False

        # Make sure that there is a reduction in the second map
        rmap_cr = graph.nodes()[candidate[MapWCRFusion._rmap_in_cr]]
        reduce_edge = graph.in_edges(rmap_cr)[0]
        if reduce_edge.data.wcr is None:
            return False

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict and (len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
        ]) > 1 or in_array.data in sdfg.shared_transients()):
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        tasklet = candidate[MapWCRFusion._tasklet]
        map_exit = candidate[MapWCRFusion._tmap_exit]
        reduce = candidate[MapWCRFusion._rmap_in_cr]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)

        # To apply, collapse the second map and then fuse the two resulting
        # maps
        map_collapse = MapCollapse(
            self.sdfg_id, self.state_id, {
                MapCollapse._outer_map_entry:
                self.subgraph[MapWCRFusion._rmap_out_entry],
                MapCollapse._inner_map_entry:
                self.subgraph[MapWCRFusion._rmap_in_entry]
            }, 0)
        map_entry, _ = map_collapse.apply(sdfg)

        map_fusion = MapFusion(
            self.sdfg_id, self.state_id, {
                MapFusion._first_map_exit:
                self.subgraph[MapWCRFusion._tmap_exit],
                MapFusion._second_map_entry:
                graph.node_id(map_entry)
            }, 0)
        map_fusion.apply(sdfg)

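# ---- Usage sketch (illustrative, assumed names) ----
# A program whose dataflow can yield the matched pattern: an elementwise
# map followed by a reduction. After expanding the Reduce library node into
# partial-reduction maps with WCR, MapWCRFusion may match. Whether it does
# depends on the chosen reduce expansion, so treat this as a sketch only;
# `square_then_sum` and `try_fuse_map_and_reduction` are made-up names.
@dace.program
def square_then_sum(A: dace.float64[128, 128], out: dace.float64[128]):
    out[:] = np.sum(A * A, axis=0)


def try_fuse_map_and_reduction() -> int:
    sdfg = square_then_sum.to_sdfg()
    sdfg.expand_library_nodes()  # turn the Reduce node into maps with WCR
    return sdfg.apply_transformations(MapWCRFusion)
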
class RedundantArrayCopying(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes the
        last access node in pattern A -> B -> A, and the second (if
        possible).
    """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _med_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                RedundantArrayCopying._in_array,
                RedundantArrayCopying._med_array,
                RedundantArrayCopying._out_array,
            )
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[RedundantArrayCopying._in_array]]
        med_array = graph.nodes()[candidate[
            RedundantArrayCopying._med_array]]
        out_array = graph.nodes()[candidate[
            RedundantArrayCopying._out_array]]

        # Ensure out degree is one (only one target, which is out_array)
        if graph.out_degree(in_array) != 1:
            return False

        # Make sure that the removal candidate is a transient variable
        if strict and not out_array.desc(sdfg).transient:
            return False

        # Make sure that the middle access node is also transient. We do this
        # to ensure that everything copied from B -> A is either copied in
        # from A, or uninitialized memory.
        if strict and not med_array.desc(sdfg).transient:
            return False

        # Make sure that both arrays are using the same storage location
        if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage:
            return False

        # Find occurrences in this and other states
        # (This could be relaxed)
        # occurrences = []
        # for state in sdfg.nodes():
        #     occurrences.extend([
        #         n for n in state.nodes()
        #         if isinstance(n, nodes.AccessNode) and n.desc == med_array.desc
        #     ])
        # if len(occurrences) > 1:
        #     return False

        # Only apply if arrays are of same shape (no need to modify memlet
        # subset)
        if len(in_array.desc(sdfg).shape) != len(
                out_array.desc(sdfg).shape) or any(
                    i != o for i, o in zip(in_array.desc(sdfg).shape,
                                           out_array.desc(sdfg).shape)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        med_array = graph.nodes()[candidate[
            RedundantArrayCopying._med_array]]
        out_array = graph.nodes()[candidate[
            RedundantArrayCopying._out_array]]

        return "Remove " + str(out_array) + " and (maybe) " + str(med_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantArrayCopying._in_array)
        med_array = gnode(RedundantArrayCopying._med_array)
        out_array = gnode(RedundantArrayCopying._out_array)

        med_edges = len(graph.out_edges(med_array))
        med_out_edges = 0

        for med_e in graph.out_edges(med_array):
            if med_e.dst == out_array:
                # Modify all outgoing edges to point to in_array
                for out_e in graph.out_edges(med_e.dst):
                    path = graph.memlet_path(out_e)
                    for pe in path:
                        if (pe.data.data == out_array.data
                                or pe.data.data == med_array.data):
                            pe.data.data = in_array.data
                    # Redirect edge to in_array
                    graph.remove_edge(out_e)
                    graph.add_edge(in_array, out_e.src_conn, out_e.dst,
                                   out_e.dst_conn, out_e.data)
                # Remove out_array
                for e in graph.edges_between(med_array, med_e.dst):
                    graph.remove_edge(e)
                graph.remove_node(med_e.dst)
                med_out_edges += 1

        # Finally, remove the med_array node if it is no longer needed
        if med_array.desc(sdfg).transient and med_edges == med_out_edges:
            for e in graph.edges_between(in_array, med_array):
                graph.remove_edge(e)
            graph.remove_node(med_array)
            if Config.get_bool("debugprint"):
                RedundantArrayCopying._arrays_removed += 1

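# ---- Usage sketch (illustrative, assumed names) ----
# Building the matched A -> B -> A pattern by hand and cleaning it up.
# Every identifier below is local to this example.
def build_and_clean_redundant_copy() -> dace.SDFG:
    sdfg = dace.SDFG('redundant_copy')
    sdfg.add_array('A', [16], dace.float64)
    sdfg.add_transient('B', [16], dace.float64)
    state = sdfg.add_state()
    a_read = state.add_read('A')
    b = state.add_access('B')
    a_write = state.add_write('A')
    state.add_nedge(a_read, b, dace.Memlet.from_array('A', sdfg.arrays['A']))
    state.add_nedge(b, a_write,
                    dace.Memlet.from_array('B', sdfg.arrays['B']))
    # Removes the copy back into A (and B itself, when possible)
    sdfg.apply_transformations_repeated(RedundantArrayCopying)
    return sdfg
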
class DoubleBuffering(transformation.Transformation):
    """ Implements the double buffering pattern, which pipelines reading
        and processing data by creating a second copy of the memory.
        In particular, the transformation takes a 1D map and all internal
        (directly connected) transients, adds an additional dimension of
        size 2, and turns the map into a for loop that processes and reads
        the data in a double-buffered manner. Other memlets will not be
        transformed.
    """

    _map_entry = nodes.MapEntry(nodes.Map('_', [], []))
    _transient = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(DoubleBuffering._map_entry,
                                   DoubleBuffering._transient)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_entry = graph.nodes()[candidate[DoubleBuffering._map_entry]]
        transient = graph.nodes()[candidate[DoubleBuffering._transient]]

        # Only one-dimensional maps are allowed
        if len(map_entry.map.params) != 1:
            return False

        # Verify the map can be transformed to a for-loop
        if not MapToForLoop.can_be_applied(
                graph,
                {MapToForLoop._map_entry: candidate[DoubleBuffering._map_entry]},
                expr_index, sdfg, strict):
            return False

        # Verify that all directly-connected internal access nodes point to
        # transient arrays
        first = True
        for edge in graph.out_edges(map_entry):
            if isinstance(edge.dst, nodes.AccessNode):
                desc = sdfg.arrays[edge.dst.data]
                if not isinstance(desc, data.Array) or not desc.transient:
                    return False
                else:
                    # To avoid duplicate matches, only match the first
                    # transient
                    if first and edge.dst != transient:
                        return False
                    first = False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        return str(graph.node(candidate[DoubleBuffering._map_entry]))

    def apply(self, sdfg: sd.SDFG):
        graph: sd.SDFGState = sdfg.nodes()[self.state_id]
        map_entry = graph.node(self.subgraph[DoubleBuffering._map_entry])

        map_param = map_entry.map.params[0]  # Assuming one dimensional

        ##############################
        # Change condition of loop to one fewer iteration (so that the
        # final one reads from the last buffer)
        map_rstart, map_rend, map_rstride = map_entry.map.range[0]
        map_rend = symbolic.pystr_to_symbolic('(%s) - (%s)' %
                                              (map_rend, map_rstride))
        map_entry.map.range = subsets.Range([(map_rstart, map_rend,
                                              map_rstride)])

        ##############################
        # Gather transients to modify
        transients_to_modify = set(edge.dst.data
                                   for edge in graph.out_edges(map_entry)
                                   if isinstance(edge.dst, nodes.AccessNode))

        # Add dimension to transients and modify memlets
        for transient in transients_to_modify:
            desc: data.Array = sdfg.arrays[transient]
            # Using non-python syntax to ensure properties change
            desc.strides = [desc.total_size] + list(desc.strides)
            desc.shape = [2] + list(desc.shape)
            desc.offset = [0] + list(desc.offset)
            desc.total_size = desc.total_size * 2

        ##############################
        # Modify memlets to use map parameter as buffer index
        modified_subsets = []  # Store modified memlets for final state
        for edge in graph.scope_subgraph(map_entry).edges():
            if edge.data.data in transients_to_modify:
                edge.data.subset = self._modify_memlet(sdfg, edge.data.subset,
                                                       edge.data.data)
                modified_subsets.append(edge.data.subset)
            else:  # Could be other_subset
                path = graph.memlet_path(edge)
                src_node = path[0].src
                dst_node = path[-1].dst

                # other_subset could be None. In that case, recreate from
                # array
                dataname = None
                if (isinstance(src_node, nodes.AccessNode)
                        and src_node.data in transients_to_modify):
                    dataname = src_node.data
                elif (isinstance(dst_node, nodes.AccessNode)
                      and dst_node.data in transients_to_modify):
                    dataname = dst_node.data
                if dataname is not None:
                    subset = (edge.data.other_subset or
                              subsets.Range.from_array(sdfg.arrays[dataname]))
                    edge.data.other_subset = self._modify_memlet(
                        sdfg, subset, dataname)
                    modified_subsets.append(edge.data.other_subset)

        ##############################
        # Turn map into for loop
        map_to_for = MapToForLoop(self.sdfg_id, self.state_id, {
            MapToForLoop._map_entry:
            self.subgraph[DoubleBuffering._map_entry]
        }, self.expr_index)
        nsdfg_node, nstate = map_to_for.apply(sdfg)

        ##############################
        # Gather node copies and remove memlets
        edges_to_replace = []
        for node in nstate.source_nodes():
            for edge in nstate.out_edges(node):
                if (isinstance(edge.dst, nodes.AccessNode)
                        and edge.dst.data in transients_to_modify):
                    edges_to_replace.append(edge)
                    nstate.remove_edge(edge)
            if nstate.out_degree(node) == 0:
                nstate.remove_node(node)

        ##############################
        # Add initial reads to initial nested state
        initial_state: sd.SDFGState = nsdfg_node.sdfg.start_state
        initial_state.set_label('%s_init' % map_entry.map.label)
        for edge in edges_to_replace:
            initial_state.add_node(edge.src)
            rnode = edge.src
            wnode = initial_state.add_write(edge.dst.data)
            initial_state.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn,
                                   copy.deepcopy(edge.data))

        # All instances of the map parameter in this state become the loop
        # start
        sd.replace(initial_state, map_param, map_rstart)
        # Initial writes go to the first buffer
        sd.replace(initial_state, '__dace_db_param', 0)

        ##############################
        # Modify main state's memlets

        # Divide by loop stride
        new_expr = symbolic.pystr_to_symbolic('(%s / %s) %% 2' %
                                              (map_param, map_rstride))
        sd.replace(nstate, '__dace_db_param', new_expr)

        ##############################
        # Add the main state's contents to the last state, modifying
        # memlets appropriately.
        final_state: sd.SDFGState = nsdfg_node.sdfg.sink_nodes()[0]
        final_state.set_label('%s_final_computation' % map_entry.map.label)
        dup_nstate = copy.deepcopy(nstate)
        final_state.add_nodes_from(dup_nstate.nodes())
        for e in dup_nstate.edges():
            final_state.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, e.data)

        # If there is a WCR output with transient, only output in last state
        nstate: sd.SDFGState
        for node in nstate.sink_nodes():
            for e in list(nstate.in_edges(node)):
                if e.data.wcr is not None:
                    path = nstate.memlet_path(e)
                    if isinstance(path[0].src, nodes.AccessNode):
                        nstate.remove_memlet_path(e)

        ##############################
        # Add reads into next buffers to main state
        for edge in edges_to_replace:
            rnode = copy.deepcopy(edge.src)
            nstate.add_node(rnode)
            wnode = nstate.add_write(edge.dst.data)
            new_memlet = copy.deepcopy(edge.data)
            if new_memlet.data in transients_to_modify:
                new_memlet.other_subset = self._replace_in_subset(
                    new_memlet.other_subset, map_param,
                    '(%s + %s)' % (map_param, map_rstride))
            else:
                new_memlet.subset = self._replace_in_subset(
                    new_memlet.subset, map_param,
                    '(%s + %s)' % (map_param, map_rstride))

            nstate.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn,
                            new_memlet)

        nstate.set_label('%s_double_buffered' % map_entry.map.label)
        # Divide by loop stride
        new_expr = symbolic.pystr_to_symbolic('((%s / %s) + 1) %% 2' %
                                              (map_param, map_rstride))
        sd.replace(nstate, '__dace_db_param', new_expr)

        # Remove symbol once done
        del nsdfg_node.sdfg.symbols['__dace_db_param']
        del nsdfg_node.symbol_mapping['__dace_db_param']

        return nsdfg_node

    @staticmethod
    def _modify_memlet(sdfg, subset, data_name):
        desc = sdfg.arrays[data_name]
        if len(subset) == len(desc.shape):
            # Already in the right shape, modify new dimension
            subset = list(subset)[1:]

        new_subset = subsets.Range([('__dace_db_param', '__dace_db_param',
                                     1)] + list(subset))
        return new_subset

    @staticmethod
    def _replace_in_subset(subset, string_or_symbol, new_string_or_symbol):
        new_subset = copy.deepcopy(subset)

        repldict = {
            symbolic.pystr_to_symbolic(string_or_symbol):
            symbolic.pystr_to_symbolic(new_string_or_symbol)
        }

        for i, dim in enumerate(new_subset):
            try:
                new_subset[i] = tuple(d.subs(repldict) for d in dim)
            except TypeError:
                new_subset[i] = (dim.subs(repldict)
                                 if symbolic.issymbolic(dim) else dim)

        return new_subset

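# ---- Illustrative sketch of the resulting schedule ----
# For a loop over i with stride S, each iteration computes on buffer
# (i/S) % 2 while the next chunk is prefetched into ((i/S) + 1) % 2 --
# exactly the two '__dace_db_param' replacement expressions installed
# above. A plain-Python rendition of that ping-pong schedule (made-up
# helper, chunk payloads are strings for clarity):
def double_buffered_schedule(num_chunks: int):
    buffers = [None, None]
    buffers[0] = 'chunk 0'                    # initial read ("_init" state)
    for c in range(num_chunks - 1):           # one fewer iteration, as above
        current = buffers[c % 2]              # compute on current buffer
        buffers[(c + 1) % 2] = 'chunk %d' % (c + 1)  # prefetch next chunk
        yield current
    yield buffers[(num_chunks - 1) % 2]       # final compute ("_final" state)
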
class MapFusion(pattern_matching.Transformation):
    """ Implements the MapFusion transformation.
        It will check for all patterns MapExit -> AccessNode -> MapEntry, and
        based on the following rules, fuse them and remove the transient in
        between. There are several possibilities of what it does to this
        transient in between.

        Essentially, if there is some other place in the sdfg where it is
        required, or if it is not a transient, then it will not be removed.
        In such a case, it will be linked to the MapExit node of the new
        fused map.

        Rules for fusing maps:
          0. The map range of the second map should be a permutation of the
             first map range.
          1. Each of the access nodes that are adjacent to the first map exit
             should have an edge to the second map entry. If it doesn't, then
             the second map entry should not be reachable from this access
             node.
          2. Any node that has a wcr from the first map exit should not be
             adjacent to the second map entry.
          3. Access pattern for the access nodes in the second map should be
             the same permutation of the map parameters as the map ranges of
             the two maps. Alternatively, this access node should not be
             adjacent to the first map entry.
    """
    _first_map_exit = nodes.ExitNode()
    _some_array = nodes.AccessNode("_")
    _second_map_entry = nodes.EntryNode()

    @staticmethod
    def annotates_memlets():
        return False

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                MapFusion._first_map_exit,
                MapFusion._some_array,
                MapFusion._second_map_entry,
            )
        ]

    @staticmethod
    def find_permutation(first_map: nodes.Map,
                         second_map: nodes.Map) -> Union[List[int], None]:
        """ Find permutation between two map ranges.
            :param first_map: First map.
            :param second_map: Second map.
            :return: None if no such permutation exists, otherwise a list of
                     indices L such that L[x]'th parameter of second map has
                     the same range as x'th parameter of the first map.
        """
        result = []

        if len(first_map.range) != len(second_map.range):
            return None

        # Match map ranges with reduce ranges
        for i, tmap_rng in enumerate(first_map.range):
            found = False
            for j, rng in enumerate(second_map.range):
                if tmap_rng == rng and j not in result:
                    result.append(j)
                    found = True
                    break
            if not found:
                break

        # Ensure all map ranges matched
        if len(result) != len(first_map.range):
            return None

        return result

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        first_map_exit = graph.nodes()[candidate[MapFusion._first_map_exit]]
        first_map_entry = graph.entry_node(first_map_exit)
        second_map_entry = graph.nodes()[candidate[
            MapFusion._second_map_entry]]

        for _in_e in graph.in_edges(first_map_exit):
            if _in_e.data.wcr is not None:
                for _out_e in graph.out_edges(second_map_entry):
                    if _out_e.data.data == _in_e.data.data:
                        # wcr is on a node that is used in the second map,
                        # quit
                        return False

        # Check whether there is a pattern map -> access -> map.
        intermediate_nodes = set()
        intermediate_data = set()
        for _, _, dst, _, _ in graph.out_edges(first_map_exit):
            if isinstance(dst, nodes.AccessNode):
                intermediate_nodes.add(dst)
                intermediate_data.add(dst.data)

                # If array is used anywhere else in this state.
                num_occurrences = len([
                    n for n in graph.nodes()
                    if isinstance(n, nodes.AccessNode) and n.data == dst.data
                ])
                if num_occurrences > 1:
                    return False
            else:
                return False

        # Check map ranges
        perm = MapFusion.find_permutation(first_map_entry.map,
                                          second_map_entry.map)
        if perm is None:
            return False

        # Check if any intermediate transient is also going to another
        # location
        second_inodes = set(e.src for e in graph.in_edges(second_map_entry)
                            if isinstance(e.src, nodes.AccessNode))
        transients_to_remove = intermediate_nodes & second_inodes
        # if any(e.dst != second_map_entry for n in transients_to_remove
        #        for e in graph.out_edges(n)):
        if any(graph.out_degree(n) > 1 for n in transients_to_remove):
            return False

        # Create a dict that maps parameters of the first map to those of
        # the second map.
        params_dict = {}
        for _index, _param in enumerate(first_map_entry.map.params):
            params_dict[_param] = second_map_entry.map.params[perm[_index]]

        out_memlets = [e.data for e in graph.in_edges(first_map_exit)]

        # Check that input set of second map is provided by the output set
        # of the first map, or other unrelated maps
        for second_edge in graph.out_edges(second_map_entry):
            # Memlets that do not come from one of the intermediate arrays
            if second_edge.data.data not in intermediate_data:
                # however, if intermediate_data eventually leads to
                # second_memlet.data, need to fail.
                for _n in intermediate_nodes:
                    source_node = _n
                    destination_node = graph.memlet_path(second_edge)[0].src
                    # NOTE: Assumes graph has networkx version
                    if destination_node in nx.descendants(
                            graph._nx, source_node):
                        return False
                continue

            provided = False

            # Compute second subset with respect to first subset's symbols
            sbs_permuted = dcpy(second_edge.data.subset)
            sbs_permuted.replace({
                symbolic.pystr_to_symbolic(k): symbolic.pystr_to_symbolic(v)
                for k, v in params_dict.items()
            })

            for first_memlet in out_memlets:
                if first_memlet.data != second_edge.data.data:
                    continue

                # If there is a covered subset, it is provided
                if first_memlet.subset.covers(sbs_permuted):
                    provided = True
                    break

            # If none of the output memlets of the first map provide the
            # info, fail.
            if provided is False:
                return False

        # Success
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        first_exit = graph.nodes()[candidate[MapFusion._first_map_exit]]
        second_entry = graph.nodes()[candidate[MapFusion._second_map_entry]]

        return " -> ".join(entry.map.label + ": " + str(entry.map.params)
                           for entry in [first_exit, second_entry])

    def apply(self, sdfg):
        """
            This method applies the mapfusion transformation.
            Other than the removal of the second map entry node (SME), and
            the first map exit (FME) node, it has the following side effects:

            1.  Any transient adjacent to both FME and SME with degree = 2
                will be removed. The tasklets that use/produce it shall be
                connected directly with a scalar/new transient (if the
                dataflow is more than a single scalar)

            2.  If this transient is adjacent to FME and SME and has other
                uses, it will be adjacent to the new map exit post fusion.
                Tasklet -> Tasklet edges will ALSO be added as mentioned
                above.

            3.  If an access node is adjacent to FME but not SME, it will be
                adjacent to new map exit post fusion.

            4.  If an access node is adjacent to SME but not FME, it will be
                adjacent to the new map entry node post fusion.
        """
        graph = sdfg.nodes()[self.state_id]
        first_exit = graph.nodes()[self.subgraph[MapFusion._first_map_exit]]
        first_entry = graph.entry_node(first_exit)
        second_entry = graph.nodes()[self.subgraph[
            MapFusion._second_map_entry]]
        second_exit = graph.exit_node(second_entry)

        intermediate_nodes = set()
        for _, _, dst, _, _ in graph.out_edges(first_exit):
            intermediate_nodes.add(dst)
            assert isinstance(dst, nodes.AccessNode)

        # Check if an access node refers to non transient memory, or
        # transient is used at another location (cannot erase)
        do_not_erase = set()
        for node in intermediate_nodes:
            if sdfg.arrays[node.data].transient is False:
                do_not_erase.add(node)
            else:
                for edge in graph.in_edges(node):
                    if edge.src != first_exit:
                        do_not_erase.add(node)
                        break
                else:
                    for edge in graph.out_edges(node):
                        if edge.dst != second_entry:
                            do_not_erase.add(node)
                            break

        # Find permutation between first and second scopes
        perm = MapFusion.find_permutation(first_entry.map, second_entry.map)
        params_dict = {}
        for index, param in enumerate(first_entry.map.params):
            params_dict[param] = second_entry.map.params[perm[index]]

        # Replaces (in memlets and tasklet) the second scope map
        # indices with the permuted first map indices.
        # This works in two passes to avoid problems when e.g., exchanging
        # two parameters (instead of replacing (j,i) and (i,j) to (j,j) and
        # then i,i).
        second_scope = graph.scope_subgraph(second_entry)
        for firstp, secondp in params_dict.items():
            if firstp != secondp:
                replace(second_scope, secondp, '__' + secondp + '_fused')
        for firstp, secondp in params_dict.items():
            if firstp != secondp:
                replace(second_scope, '__' + secondp + '_fused', firstp)

        # Isolate First exit node
        ############################
        edges_to_remove = set()
        nodes_to_remove = set()
        for edge in graph.in_edges(first_exit):
            tree = graph.memlet_tree(edge)
            access_node = tree.root().edge.dst
            if access_node not in do_not_erase:
                out_edges = [
                    e for e in graph.out_edges(access_node)
                    if e.dst == second_entry
                ]
                # In this transformation, there can only be one edge to the
                # second map
                assert len(out_edges) == 1

                # Get source connector to the second map
                connector = out_edges[0].dst_conn[3:]

                new_dsts = []
                # Look at the second map entry out-edges to get the new
                # destinations
                for e in graph.out_edges(second_entry):
                    if e.src_conn[4:] == connector:
                        new_dsts.append(e)
                if not new_dsts:  # Access node is not used in the second map
                    nodes_to_remove.add(access_node)
                    continue

                # If the source is an access node, modify the memlet to
                # point to it
                if (isinstance(edge.src, nodes.AccessNode)
                        and edge.data.data != edge.src.data):
                    edge.data.data = edge.src.data
                    edge.data.subset = ("0" if edge.data.other_subset is None
                                        else edge.data.other_subset)
                    edge.data.other_subset = None
                else:
                    # Add a transient scalar/array
                    self.fuse_nodes(sdfg, graph, edge, new_dsts[0].dst,
                                    new_dsts[0].dst_conn, new_dsts[1:])

                edges_to_remove.add(edge)

                # Remove transient node between the two maps
                nodes_to_remove.add(access_node)
            else:  # The case where intermediate array node cannot be removed
                # Node will become an output of the second map exit
                out_e = tree.parent.edge
                conn = second_exit.next_connector()
                graph.add_edge(
                    second_exit,
                    'OUT_' + conn,
                    out_e.dst,
                    out_e.dst_conn,
                    dcpy(out_e.data),
                )
                second_exit.add_out_connector('OUT_' + conn)

                graph.add_edge(edge.src, edge.src_conn, second_exit,
                               'IN_' + conn, dcpy(edge.data))
                second_exit.add_in_connector('IN_' + conn)

                edges_to_remove.add(out_e)
                edges_to_remove.add(edge)

                # If the second map needs this node, link the connector
                # that generated this to the place where it is needed, with
                # a temp transient/scalar for memlet to be generated
                for out_e in graph.out_edges(second_entry):
                    second_memlet_path = graph.memlet_path(out_e)
                    source_node = second_memlet_path[0].src
                    if source_node == access_node:
                        self.fuse_nodes(sdfg, graph, edge, out_e.dst,
                                        out_e.dst_conn)

        ###
        # First scope exit is isolated and can now be safely removed
        for e in edges_to_remove:
            graph.remove_edge(e)
        graph.remove_nodes_from(nodes_to_remove)
        graph.remove_node(first_exit)

        # Isolate second_entry node
        ###########################
        for edge in graph.in_edges(second_entry):
            tree = graph.memlet_tree(edge)
            access_node = tree.root().edge.src
            if access_node in intermediate_nodes:
                # Already handled above, can be safely removed
                graph.remove_edge(edge)
                continue

            # This is an external input to the second map which will now go
            # through the first map.
            conn = first_entry.next_connector()
            graph.add_edge(edge.src, edge.src_conn, first_entry, 'IN_' + conn,
                           dcpy(edge.data))
            first_entry.add_in_connector('IN_' + conn)
            graph.remove_edge(edge)
            for out_enode in tree.children:
                out_e = out_enode.edge
                graph.add_edge(
                    first_entry,
                    'OUT_' + conn,
                    out_e.dst,
                    out_e.dst_conn,
                    dcpy(out_e.data),
                )
                graph.remove_edge(out_e)
            first_entry.add_out_connector('OUT_' + conn)

        ###
        # Second node is isolated and can now be safely removed
        graph.remove_node(second_entry)

        # Fix scope exit to point to the right map
        second_exit.map = first_entry.map

    def fuse_nodes(self, sdfg, graph, edge, new_dst, new_dst_conn,
                   other_edges=None):
        """ Fuses two nodes via memlets and possibly transient arrays. """
        other_edges = other_edges or []
        memlet_path = graph.memlet_path(edge)
        access_node = memlet_path[-1].dst

        local_name = "__s%d_n%d%s_n%d%s" % (
            self.state_id,
            graph.node_id(edge.src),
            edge.src_conn,
            graph.node_id(edge.dst),
            edge.dst_conn,
        )
        # Add intermediate memory between subgraphs. If a scalar,
        # uses direct connection. If an array, adds a transient node
        if edge.data.subset.num_elements() == 1:
            sdfg.add_scalar(
                local_name,
                dtype=access_node.desc(graph).dtype,
                transient=True,
                storage=dtypes.StorageType.Register,
            )
            edge.data.data = local_name
            edge.data.subset = "0"
            local_node = edge.src
            src_connector = edge.src_conn

            # Add edge that leads to the second node
            graph.add_edge(local_node, src_connector, new_dst, new_dst_conn,
                           dcpy(edge.data))

            for e in other_edges:
                graph.add_edge(local_node, src_connector, e.dst, e.dst_conn,
                               dcpy(edge.data))
        else:
            sdfg.add_transient(local_name,
                               edge.data.subset.size(),
                               dtype=access_node.desc(graph).dtype)
            old_edge = dcpy(edge)
            local_node = graph.add_access(local_name)
            src_connector = None
            edge.data.data = local_name
            edge.data.subset = ",".join(
                ["0:" + str(s) for s in edge.data.subset.size()])

            # Add edge that leads to transient node
            graph.add_edge(
                edge.src,
                edge.src_conn,
                local_node,
                None,
                dcpy(edge.data),
            )

            # Add edge that leads to the second node
            graph.add_edge(local_node, src_connector, new_dst, new_dst_conn,
                           dcpy(edge.data))

            for e in other_edges:
                graph.add_edge(local_node, src_connector, e.dst, e.dst_conn,
                               dcpy(edge.data))

            # Modify data and memlets on all surrounding edges to match
            # array
            for neighbor in graph.all_edges(local_node):
                for e in graph.memlet_tree(neighbor):
                    e.data.data = local_name
                    e.data.subset.offset(old_edge.data.subset, negative=True)

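# ---- Usage sketch (illustrative, assumed names) ----
# Two consecutive elementwise maps communicating through a transient: the
# exact MapExit -> AccessNode -> MapEntry pattern matched above.
# `two_maps` and `fuse_consecutive_maps` are made-up names.
@dace.program
def two_maps(A: dace.float64[64], B: dace.float64[64]):
    tmp = A + 1.0     # first map writes transient tmp
    B[:] = tmp * 2.0  # second map reads tmp


def fuse_consecutive_maps() -> int:
    sdfg = two_maps.to_sdfg()
    # Returns the number of fused map pairs
    return sdfg.apply_transformations(MapFusion)
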
class BufferTiling(transformation.Transformation):
    """ Implements the buffer tiling transformation.

        BufferTiling tiles a buffer that is in between two maps, where the
        preceding map writes to the buffer and the succeeding map reads from
        it. It introduces additional computations in exchange for reduced
        memory footprint. Commonly used to make use of shared memory on
        GPUs.
    """

    _map1_exit = nodes.MapExit(nodes.Map('', [], []))
    _array = nodes.AccessNode('')
    _map2_entry = nodes.MapEntry(nodes.Map('', [], []))

    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")

    # Returns a list of graphs that represent the pattern
    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                BufferTiling._map1_exit,
                BufferTiling._array,
                BufferTiling._map2_entry,
            )
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]

        for buf in graph.all_nodes_between(map1_exit, map2_entry):
            # Check that buffers are AccessNodes.
            if not isinstance(buf, nodes.AccessNode):
                return False

            # Check that buffers are transient.
            if not sdfg.arrays[buf.data].transient:
                return False

            # Check that buffers have exactly 1 input and 1 output edge.
            if graph.in_degree(buf) != 1:
                return False
            if graph.out_degree(buf) != 1:
                return False

            # Check that buffers are next to the maps.
            if graph.in_edges(buf)[0].src != map1_exit:
                return False
            if graph.out_edges(buf)[0].dst != map2_entry:
                return False

            # Check that the data consumed is provided.
            provided = graph.in_edges(buf)[0].data.subset
            consumed = graph.out_edges(buf)[0].data.subset
            if not provided.covers(consumed):
                return False

            # Check that buffers occur only once in this state.
            num_occurrences = len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == buf.data
            ])
            if num_occurrences > 1:
                return False
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]
        return " -> ".join(entry.map.label + ": " + str(entry.map.params)
                           for entry in [map1_exit, map2_entry])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        map1_exit = graph.nodes()[self.subgraph[self._map1_exit]]
        map1_entry = graph.entry_node(map1_exit)
        map2_entry = graph.nodes()[self.subgraph[self._map2_entry]]
        buffers = graph.all_nodes_between(map1_exit, map2_entry)
        # Situation:
        # -> map1_entry -> ... -> map1_exit -> buffers -> map2_entry -> ...

        lower_extents = tuple(b - a for a, b in zip(
            map1_entry.range.min_element(), map2_entry.range.min_element()))
        upper_extents = tuple(a - b for a, b in zip(
            map1_entry.range.max_element(), map2_entry.range.max_element()))

        # Tile the first map with overlap
        MapTilingWithOverlap.apply_to(sdfg,
                                      map_entry=map1_entry,
                                      options={
                                          'tile_sizes': self.tile_sizes,
                                          'lower_overlap': lower_extents,
                                          'upper_overlap': upper_extents
                                      })
        tile_map1_exit = graph.out_edges(map1_exit)[0].dst
        tile_map1_entry = graph.entry_node(tile_map1_exit)
        tile_map1_entry.label = 'BufferTiling'

        # Tile the second map
        MapTiling.apply_to(sdfg,
                           map_entry=map2_entry,
                           options={
                               'tile_sizes': self.tile_sizes,
                               'tile_trivial': True
                           })
        tile_map2_entry = graph.in_edges(map2_entry)[0].src

        # Fuse maps
        some_buffer = next(
            iter(buffers))  # some dummy to pass to MapFusion.apply_to()
        MapFusion.apply_to(sdfg,
                           first_map_exit=tile_map1_exit,
                           array=some_buffer,
                           second_map_entry=tile_map2_entry)

        # Optimize the simple cases
        map1_entry.range.ranges = [
            (r[0], r[0], r[2]) if l_ext == 0 and u_ext == 0 and ts == 1 else r
            for r, l_ext, u_ext, ts in zip(map1_entry.range.ranges,
                                           lower_extents, upper_extents,
                                           self.tile_sizes)
        ]

        map2_entry.range.ranges = [
            (r[0], r[0], r[2]) if ts == 1 else r
            for r, ts in zip(map2_entry.range.ranges, self.tile_sizes)
        ]

        if any(ts == 1 for ts in self.tile_sizes):
            if any(r[0] == r[1] for r in map1_entry.map.range):
                TrivialMapElimination.apply_to(sdfg, _map_entry=map1_entry)
            if any(r[0] == r[1] for r in map2_entry.map.range):
                TrivialMapElimination.apply_to(sdfg, _map_entry=map2_entry)

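# ---- Usage sketch (illustrative, assumed names) ----
# Tiling the transient buffer between a producer and a consumer map. The
# tile_sizes property defaults to (128, 128, 128); smaller tiles shrink
# the buffer at the cost of recomputing the overlap regions.
def tile_intermediate_buffer(sdfg: dace.SDFG) -> int:
    return sdfg.apply_transformations(BufferTiling,
                                      options={'tile_sizes': (32, 32)})
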
class TensorflowRedundantArray(pm.Transformation):
    """ Implements the redundant array removal transformation, applied
        to remove ReadVariableOps and control dependencies.
    """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(TensorflowRedundantArray._in_array,
                                   TensorflowRedundantArray._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[
            TensorflowRedundantArray._in_array]]
        out_array = graph.nodes()[candidate[
            TensorflowRedundantArray._out_array]]

        # Just to be sure, check for the OP name in the out array
        if not ("ReadVariable" in out_array.data
                or "control_dependency" in out_array.data):
            return False

        # Make sure that the candidate is a transient variable
        if not in_array.desc(sdfg).transient:
            return False

        # Make sure that both arrays are using the same storage location
        if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage:
            return False

        # Only apply if arrays are of same shape (no need to modify subset)
        if len(in_array.desc(sdfg).shape) != len(
                out_array.desc(sdfg).shape) or any(
                    i != o for i, o in zip(in_array.desc(sdfg).shape,
                                           out_array.desc(sdfg).shape)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[
            TensorflowRedundantArray._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(TensorflowRedundantArray._in_array)
        out_array = gnode(TensorflowRedundantArray._out_array)

        for e in graph.out_edges(out_array):
            # Modify all outgoing edges to point to in_array
            path = graph.memlet_tree(e)
            for pe in path:
                if pe.data.data == out_array.data:
                    pe.data.data = in_array.data

            # Preemptively add edge from in_array to out_array's adjacent
            # nodes.
            new_memlet = e.data
            new_memlet.data = in_array.data
            graph.add_edge(in_array, e.src_conn, e.dst, e.dst_conn,
                           new_memlet)
            graph.remove_edge(e)

        try:
            assert len(graph.in_edges(out_array)) == 1
        except AssertionError:
            print("Multiple in-edges for ", str(out_array))
        e = graph.in_edges(out_array)[0]
        graph.remove_edge(e)

        # Finally, remove out_array node
        graph.remove_node(out_array)
        if Config.get_bool("debugprint"):
            TensorflowRedundantArray._arrays_removed += 1

class InMergeArrays(pattern_matching.Transformation):
    """ Merge duplicate arrays connected to the same scope entry. """

    _array1 = nodes.AccessNode("_")
    _array2 = nodes.AccessNode("_")
    _map_entry = nodes.EntryNode()

    @staticmethod
    def expressions():
        # Matching
        #   o   o
        #   |   |
        # /=======\
        g = SDFGState()
        g.add_node(InMergeArrays._array1)
        g.add_node(InMergeArrays._array2)
        g.add_node(InMergeArrays._map_entry)
        g.add_edge(InMergeArrays._array1, None, InMergeArrays._map_entry,
                   None, memlet.Memlet())
        g.add_edge(InMergeArrays._array2, None, InMergeArrays._map_entry,
                   None, memlet.Memlet())
        return [g]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        arr1_id = candidate[InMergeArrays._array1]
        arr2_id = candidate[InMergeArrays._array2]

        # Ensure both arrays contain the same data
        arr1 = graph.node(arr1_id)
        arr2 = graph.node(arr2_id)
        if arr1.data != arr2.data:
            return False

        # Ensure only arr1's node ID contains incoming edges
        if graph.in_degree(arr2) > 0:
            return False

        # Ensure arr1 and arr2's node IDs are ordered (avoid duplicates)
        if (graph.in_degree(arr1) == 0 and graph.in_degree(arr2) == 0
                and arr1_id >= arr2_id):
            return False

        map = graph.node(candidate[InMergeArrays._map_entry])

        # If arr1's connector leads directly to map, skip it
        if all(e.dst_conn and not e.dst_conn.startswith('IN_')
               for e in graph.edges_between(arr1, map)):
            return False

        if (any(e.dst != map for e in graph.out_edges(arr1))
                or any(e.dst != map for e in graph.out_edges(arr2))):
            return False

        # Ensure arr1 and arr2 are the first two incoming nodes (avoid
        # further duplicates)
        all_source_nodes = set(
            graph.node_id(e.src) for e in graph.in_edges(map)
            if e.src != arr1 and e.src != arr2 and e.src.data == arr1.data
            and e.dst_conn and e.dst_conn.startswith('IN_')
            and graph.in_degree(e.src) == 0)
        if any(nid < arr1_id or nid < arr2_id for nid in all_source_nodes):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        arr = graph.node(candidate[InMergeArrays._array1])
        map = graph.node(candidate[InMergeArrays._map_entry])
        return '%s (%d, %d) -> %s' % (arr.data,
                                      candidate[InMergeArrays._array1],
                                      candidate[InMergeArrays._array2],
                                      map.label)

    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)
        array = graph.node(self.subgraph[InMergeArrays._array1])
        map = graph.node(self.subgraph[InMergeArrays._map_entry])
        map_edge = next(e for e in graph.out_edges(array) if e.dst == map)
        result_connector = map_edge.dst_conn[3:]

        # Find all other incoming access nodes without incoming edges
        source_edges = [
            e for e in graph.in_edges(map)
            if isinstance(e.src, nodes.AccessNode)
            and e.src.data == array.data and e.src != array and e.dst_conn
            and e.dst_conn.startswith('IN_') and graph.in_degree(e.src) == 0
        ]

        # Modify connectors to point to first array
        connectors_to_remove = set()
        for e in source_edges:
            connector = e.dst_conn[3:]
            connectors_to_remove.add(connector)
            for inner_edge in graph.out_edges(map):
                if inner_edge.src_conn[4:] == connector:
                    inner_edge._src_conn = 'OUT_' + result_connector

        # Remove other nodes from state
        graph.remove_nodes_from(set(e.src for e in source_edges))

        # Remove connectors from scope entry
        for c in connectors_to_remove:
            map.remove_in_connector('IN_' + c)
            map.remove_out_connector('OUT_' + c)

        # Re-propagate memlets
        edge_to_propagate = next(e for e in graph.out_edges(map)
                                 if e.src_conn[4:] == result_connector)
        map_edge._data = propagate_memlet(dfg_state=graph,
                                          memlet=edge_to_propagate.data,
                                          scope_node=map,
                                          union_inner_edges=True)

class MergeSourceSinkArrays(transformation.Transformation):
    """ Merge duplicate arrays that are source/sink nodes. """

    _array1 = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        # Matching
        #   o   o
        g = SDFGState()
        g.add_node(MergeSourceSinkArrays._array1)
        return [g]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        arr1_id = candidate[MergeSourceSinkArrays._array1]
        arr1 = graph.node(arr1_id)

        # Ensure array is either a source or sink node
        src_nodes = graph.source_nodes()
        sink_nodes = graph.sink_nodes()
        if arr1 in src_nodes:
            nodes_to_consider = src_nodes
        elif arr1 in sink_nodes:
            nodes_to_consider = sink_nodes
        else:
            return False

        # Ensure there are more nodes with the same data
        other_nodes = [
            graph.node_id(n) for n in nodes_to_consider
            if isinstance(n, nodes.AccessNode) and n.data == arr1.data
            and n != arr1
        ]
        if len(other_nodes) == 0:
            return False

        # Ensure arr1 is the first node to avoid further duplicates
        nid = min(other_nodes)
        if nid < arr1_id:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        arr = graph.node(candidate[MergeSourceSinkArrays._array1])
        if arr in graph.source_nodes():
            place = 'source'
        else:
            place = 'sink'
        return '%s array %s' % (place, arr.data)

    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)
        array = graph.node(self.subgraph[MergeSourceSinkArrays._array1])
        if array in graph.source_nodes():
            src_node = True
            nodes_to_consider = graph.source_nodes()
            edges_to_consider = lambda n: graph.out_edges(n)
        else:
            src_node = False
            nodes_to_consider = graph.sink_nodes()
            edges_to_consider = lambda n: graph.in_edges(n)

        for node in nodes_to_consider:
            if node == array:
                continue
            if not isinstance(node, nodes.AccessNode):
                continue
            if node.data != array.data:
                continue
            for edge in list(edges_to_consider(node)):
                if src_node:
                    graph.add_edge(array, edge.src_conn, edge.dst,
                                   edge.dst_conn, edge.data)
                else:
                    graph.add_edge(edge.src, edge.src_conn, array,
                                   edge.dst_conn, edge.data)
                graph.remove_edge(edge)
            graph.remove_node(node)

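# ---- Usage sketch (illustrative, assumed names) ----
# Both merge transformations above are cleanup passes: other transformations
# can leave several access nodes for the same data behind, and these
# collapse them into one. apply_transformations_repeated accepts a list of
# transformation classes and applies them until fixpoint.
def merge_duplicate_access_nodes(sdfg: dace.SDFG) -> int:
    return sdfg.apply_transformations_repeated(
        [InMergeArrays, MergeSourceSinkArrays])
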
class RedundantSecondArray(pm.Transformation):
    """ Implements the redundant array removal transformation, applied
        when a transient array is copied from and to (from another array),
        but never used anywhere else. This transformation removes the second
        array.
    """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(RedundantSecondArray._in_array,
                                   RedundantSecondArray._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[RedundantSecondArray._in_array]]
        out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]]

        in_desc = in_array.desc(sdfg)
        out_desc = out_array.desc(sdfg)

        # Ensure in degree is one (only one source, which is in_array)
        if graph.in_degree(out_array) != 1:
            return False

        # Make sure that the candidate is a transient variable
        if not out_desc.transient:
            return False

        # Dimensionality must be the same in strict mode
        if strict and len(in_desc.shape) != len(out_desc.shape):
            return False

        # Make sure that both arrays are using the same storage location
        # and are of the same type (e.g., Stream->Stream)
        if in_desc.storage != out_desc.storage:
            return False
        if type(in_desc) != type(out_desc):
            return False

        # Find occurrences in this and other states
        occurrences = []
        for state in sdfg.nodes():
            occurrences.extend([
                n for n in state.nodes()
                if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == out_desc
            ])
        for isedge in sdfg.edges():
            if out_array.data in isedge.data.free_symbols:
                occurrences.append(isedge)

        if len(occurrences) > 1:
            return False

        # Check whether the data copied from the first datanode cover
        # the subsets of all the output edges of the second datanode.
        # We assume the following pattern: A -- e1 --> B -- e2 --> others

        # 1. Get edge e1 and extract/validate subsets for arrays A and B
        e1 = graph.edges_between(in_array, out_array)[0]
        try:
            _, b1_subset = _validate_subsets(e1, sdfg.arrays)
        except NotImplementedError:
            return False
        # 2. Iterate over the e2 edges
        for e2 in graph.out_edges(out_array):
            # 2-a. Extract/validate subsets for array B and others
            try:
                b2_subset, _ = _validate_subsets(e2, sdfg.arrays)
            except NotImplementedError:
                return False
            # 2-b. Check where b1_subset covers b2_subset
            if not b1_subset.covers(b2_subset):
                return False
            # 2-c. Validate subsets in memlet tree
            # (should not be needed for valid SDFGs)
            path = graph.memlet_tree(e2)
            for e3 in path:
                if e3 is not e2:
                    try:
                        _validate_subsets(e3,
                                          sdfg.arrays,
                                          src_name=out_array.data)
                    except NotImplementedError:
                        return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantSecondArray._in_array)
        out_array = gnode(RedundantSecondArray._out_array)

        # We assume the following pattern: A -- e1 --> B -- e2 --> others

        # 1. Get edge e1 and extract subsets for arrays A and B
        e1 = graph.edges_between(in_array, out_array)[0]
        a_subset, b1_subset = _validate_subsets(e1, sdfg.arrays)
        # 2. Iterate over the e2 edges and traverse the memlet tree
        for e2 in graph.out_edges(out_array):
            path = graph.memlet_tree(e2)
            for e3 in path:
                # 2-a. Extract subsets for array B and others
                b3_subset, other_subset = _validate_subsets(
                    e3, sdfg.arrays, src_name=out_array.data)
                # 2-b. Modify memlet to match array A. Example:
                # A -- (0, a:b)/(c:c+b) --> B -- (c+d)/None --> others
                # A -- (0, a+d)/None --> others
                e3.data.data = in_array.data
                # (c+d) - (c:c+b) = (d)
                b3_subset.offset(b1_subset, negative=True)
                # (0, a:b)(d) = (0, a+d) (or offset for indices)
                if isinstance(a_subset, subsets.Indices):
                    tmp = copy.deepcopy(a_subset)
                    tmp.offset(b3_subset, negative=False)
                    e3.data.subset = tmp
                else:
                    e3.data.subset = a_subset.compose(b3_subset)
                e3.data.other_subset = other_subset
            # 2-c. Remove edge and add new one
            graph.remove_edge(e2)
            graph.add_edge(in_array, e2.src_conn, e2.dst, e2.dst_conn,
                           e2.data)

        # Finally, remove out_array node
        graph.remove_node(out_array)
        # TODO: Should the array be removed from the SDFG?
        # del sdfg.arrays[out_array]
        if Config.get_bool("debugprint"):
            RedundantSecondArray._arrays_removed += 1

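# ---- Worked example of the subset arithmetic above (illustrative) ----
# With A[2:10] -- e1 --> B[3:11] and a downstream read B[6]: offsetting 6
# by the start of B's write subset gives 3, and composing A's subset with
# it yields A[5]. The offset/compose calls are the same ones apply() uses;
# the from_string constructors and concrete values are assumptions made up
# for this example.
def subset_redirection_example():
    a_subset = subsets.Range.from_string('2:10')   # data copied out of A
    b1_subset = subsets.Range.from_string('3:11')  # where it lands in B
    b3_subset = subsets.Indices.from_string('6')   # a downstream read of B
    b3_subset.offset(b1_subset, negative=True)     # (c+d) - (c:c+b) = (d): 3
    return a_subset.compose(b3_subset)             # index 5 of A
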
def __init__(self,
             name: str,
             model: onnx.ModelProto,
             infer_shapes: bool = True,
             cuda: bool = False,
             apply_strict: bool = False,
             auto_optimize: bool = True,
             parent_pytorch_module: Optional[torch.nn.Module] = None):
    """
    :param name: the name for the SDFG.
    :param model: the model to import.
    :param infer_shapes: whether to infer shapes for the model. If this is
                         ``False``, the model must have value infos (with
                         shapes) for all arrays, including intermediate
                         values.
    :param cuda: if ``True``, the model will be executed on the GPU.
    :param apply_strict: if ``True``, apply strict transformations after all
                         nodes have been expanded (warning: this can be very
                         slow!).
    :param auto_optimize: if ``True``, apply automatic optimizations before
                          calling.
    :param parent_pytorch_module: when not None, the weight tensors are
                                  loaded from the parameters of this model
                                  rather than the ONNX graph.
    """
    self.do_auto_optimize = auto_optimize
    if infer_shapes:
        model = shape_inference.infer_shapes(model)

    graph: onnx.GraphProto = model.graph

    self.sdfg: SDFG = SDFG(name)  #: the generated SDFG.
    self.sdfg._parent_onnx_model = self
    self.cuda = cuda
    self.apply_strict = apply_strict
    self.state: SDFGState = self.sdfg.add_state(
    )  #: the state containing the model computation.

    # Add all values to the SDFG, check for unsupported ops
    ##########################################

    self.value_infos = {}

    self.inputs: List[str] = []  #: the inputs to the model
    self.outputs: List[str] = []  #: the outputs of the model

    #: hooks that are executed after the sdfg is compiled
    self.post_compile_hooks: Dict[str, Callable[
        [compiled_sdfg.CompiledSDFG], None]] = {}

    for value, is_input in chain(zip(graph.input, repeat(True)),
                                 zip(graph.output, repeat(False))):
        if not value.HasField("name"):
            raise ValueError("Got input or output without name")
        if is_input:
            self.inputs.append(value.name)
        else:
            self.outputs.append(value.name)

        self.value_infos[value.name] = value
        self._add_value_info(value)

    for value in graph.value_info:
        if not value.HasField("name"):
            raise ValueError("Got input or output without name")

        if value.name not in self.value_infos:
            self.value_infos[value.name] = value

    # add weights
    self.weights: Dict[str, torch.Tensor] = {}  #: mapping from weight name
                                                #: to array
    for init in graph.initializer:
        self._add_constant_tensor(init, parent_pytorch_module)

    access_nodes = {}
    self._idx_to_node = []
    for i, node in enumerate(graph.node):
        if not has_onnx_node(node.op_type):
            raise ValueError("Unsupported ONNX operator: '{}'".format(
                node.op_type))

        # extract the op attributes
        op_attributes = {
            attribute_proto.name: convert_attribute_proto(attribute_proto)
            for attribute_proto in node.attribute
        }

        if node.HasField("name"):
            node_name = clean_onnx_name(node.name)
        else:
            node_name = node.op_type + "_" + str(i)

        # construct the dace node
        op_node = get_onnx_node(node.op_type)(node_name, **op_attributes)
        self.state.add_node(op_node)
        self._idx_to_node.append(op_node)

        for param_idx, (name, is_input) in chain(
                enumerate(zip(node.input, repeat(True))),
                enumerate(zip(node.output, repeat(False)))):
            if clean_onnx_name(name) not in self.sdfg.arrays:
                if name not in self.value_infos:
                    raise ValueError(
                        "Could not find array with name '{}'".format(name))
                self._add_value_info(self.value_infos[name])

            # get the access node
            if name in access_nodes:
                access = access_nodes[name]
                self._update_access_type(access, is_input)
            else:
                access = nd.AccessNode(
                    clean_onnx_name(name), dtypes.AccessType.ReadOnly
                    if is_input else dtypes.AccessType.WriteOnly)
                self.state.add_node(access)
                access_nodes[name] = access

            # get the connector name
            params = (op_node.schema.inputs
                      if is_input else op_node.schema.outputs)
            params_len = len(params)
            if param_idx >= params_len:
                # this is a variadic parameter; the last parameter of the
                # schema must then be variadic.
                if params[-1].param_type != ONNXParameterType.Variadic:
                    raise ValueError(
                        "Expected the last {i_or_o} parameter to be variadic,"
                        " since the {i_or_o} with idx {param_idx} has more"
                        " parameters than the schema ({params_len})".format(
                            i_or_o="input" if is_input else "output",
                            param_idx=param_idx,
                            params_len=params_len))
                conn_name = params[-1].name + "__" + str(param_idx -
                                                         params_len + 1)
            elif params[param_idx].param_type == ONNXParameterType.Variadic:
                # this is a variadic parameter, and it is within the range
                # of params, so it must be the first instance of a variadic
                # parameter
                conn_name = params[param_idx].name + "__0"
            else:
                conn_name = params[param_idx].name

            data_desc = self.sdfg.arrays[clean_onnx_name(name)]

            # add the connector if required, and add an edge
            if is_input:
                if conn_name not in op_node.in_connectors:
                    assert op_node.add_in_connector(conn_name)
                self.state.add_edge(
                    access, None, op_node, conn_name,
                    dace.Memlet.from_array(clean_onnx_name(name), data_desc))
            else:
                if conn_name not in op_node.out_connectors:
                    assert op_node.add_out_connector(conn_name)
                self.state.add_edge(
                    op_node, conn_name, access, None,
                    dace.Memlet.from_array(clean_onnx_name(name), data_desc))

    if self.cuda:
        self.sdfg.apply_gpu_transformations()

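# ---- Usage sketch (illustrative, assumed names) ----
# Importing an ONNX model with the constructor above. `ONNXModel` is the
# presumed name of the class this __init__ belongs to, and the file path
# is made up for the example.
import onnx


def import_onnx_model(path: str = 'model.onnx'):
    model = onnx.load(path)
    dace_model = ONNXModel('imported_model', model, cuda=False)
    return dace_model.sdfg  # the generated SDFG documented above
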
class RedundantSecondArray(pm.Transformation): """ Implements the redundant array removal transformation, applied when a transient array is copied from and to (from another array), but never used anywhere else. This transformation removes the second array. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ sdutil.node_path_graph(RedundantSecondArray._in_array, RedundantSecondArray._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantSecondArray._in_array]] out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]] in_desc = in_array.desc(sdfg) out_desc = out_array.desc(sdfg) # Ensure in degree is one (only one source, which is in_array) if graph.in_degree(out_array) != 1: return False # Make sure that the candidate is a transient variable if not out_desc.transient: return False # 1. Get edge e1 and extract/validate subsets for arrays A and B e1 = graph.edges_between(in_array, out_array)[0] a_subset, b1_subset = _validate_subsets(e1, sdfg.arrays) if strict: # In strict mode, make sure the memlet covers the removed array if not b1_subset: return False subset = copy.deepcopy(b1_subset) subset.squeeze() shape = [sz for sz in out_desc.shape if sz != 1] if any(m != a for m, a in zip(subset.size(), shape)): return False # NOTE: Library node check # The transformation must not apply in strict mode if out_array is # not a view, is input to a library node, and an access or a view # of in_desc is also output to the same library node. # The reason is that the application of the transformation will lead # to in_desc being both input and output of the library node. # We do not know if this is safe. # First find the true in_desc (in case in_array is a view). true_in_desc = in_desc if isinstance(in_desc, data.View): e = sdutil.get_view_edge(graph, in_array) if not e: return False true_in_desc = sdfg.arrays[e.dst.data] if not isinstance(out_desc, data.View): edges_to_check = [] for a in graph.out_edges(out_array): if isinstance(a.dst, nodes.LibraryNode): edges_to_check.append(a) elif (isinstance(a.dst, nodes.AccessNode) and isinstance(sdfg.arrays[a.dst.data], data.View)): for b in graph.out_edges(a.dst): edges_to_check.append(graph.memlet_path(b)[-1]) for a in edges_to_check: if isinstance(a.dst, nodes.LibraryNode): for b in graph.out_edges(a.dst): if isinstance(b.dst, nodes.AccessNode): desc = sdfg.arrays[b.dst.data] if isinstance(desc, data.View): e = sdutil.get_view_edge(graph, b.dst) if not e: return False desc = sdfg.arrays[e.dst.data] if desc is true_in_desc: return False # In strict mode, check if the state has two or more access nodes # for in_array and at least one of them is a write access. There # might be a RW, WR, or WW dependency. accesses = [ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == in_desc and n is not in_array ] if len(accesses) > 0: if (graph.in_degree(in_array) > 0 or any(graph.in_degree(a) > 0 for a in accesses)): # We need to ensure that a data race will not happen if we # remove in_array. 
                # First, we simplify the graph
                G = helpers.simplify_state(graph)
                # Loop over the accesses
                for a in accesses:
                    subsets_intersect = False
                    for e in graph.in_edges(a):
                        _, subset = _validate_subsets(e,
                                                      sdfg.arrays,
                                                      dst_name=a.data)
                        res = subsets.intersects(a_subset, subset)
                        if res == True or res is None:
                            subsets_intersect = True
                            break
                    if not subsets_intersect:
                        continue
                    try:
                        has_bward_path = nx.has_path(G, a, in_array)
                    except NodeNotFound:
                        has_bward_path = nx.has_path(graph.nx, a, in_array)
                    try:
                        has_fward_path = nx.has_path(G, in_array, a)
                    except NodeNotFound:
                        has_fward_path = nx.has_path(graph.nx, in_array, a)
                    # If there is no path between the access nodes
                    # (disconnected components), then it is definitely
                    # possible to have data races. Abort.
                    if not (has_bward_path or has_fward_path):
                        return False
                    # If there is a forward path then a must not be a direct
                    # successor of in_array.
                    if has_fward_path and a in G.successors(in_array):
                        for src, _ in G.in_edges(a):
                            if src is in_array:
                                continue
                            if (nx.has_path(G, in_array, src)
                                    and src != out_array):
                                continue
                            return False

        # Make sure that both arrays are using the same storage location
        # and are of the same type (e.g., Stream->Stream)
        if in_desc.storage != out_desc.storage:
            return False
        if in_desc.location != out_desc.location:
            return False
        if type(in_desc) != type(out_desc):
            if isinstance(in_desc, data.View):
                # Case View -> Access
                # If the View points to the Access (and has a different shape?)
                # then we should (probably) not remove the Access.
                e = sdutil.get_view_edge(graph, in_array)
                if e and e.dst is out_array and in_desc.shape != out_desc.shape:
                    return False
                # Check that the View's immediate ancestors are Accesses.
                # Otherwise, the application of the transformation will result
                # in an ambiguous View.
                view_ancestors_desc = [
                    e.src.desc(sdfg)
                    if isinstance(e.src, nodes.AccessNode) else None
                    for e in graph.in_edges(in_array)
                ]
                if any([
                        not desc or isinstance(desc, data.View)
                        for desc in view_ancestors_desc
                ]):
                    return False
            elif isinstance(out_desc, data.View):
                # Case Access -> View
                # If the View points to the Access and has the same shape,
                # it can be removed
                e = sdutil.get_view_edge(graph, out_array)
                if e and e.src is in_array and in_desc.shape == out_desc.shape:
                    return True
                return False
            else:
                # Something else, for example, Stream
                return False
        else:
            # Two views connected to each other
            if isinstance(in_desc, data.View):
                return False

        # Find occurrences in this and other states
        occurrences = []
        for state in sdfg.nodes():
            occurrences.extend([
                n for n in state.nodes()
                if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == out_desc
            ])
        for isedge in sdfg.edges():
            if out_array.data in isedge.data.free_symbols:
                occurrences.append(isedge)

        if len(occurrences) > 1:
            return False

        # Check whether the data copied from the first datanode covers
        # the subsets of all the output edges of the second datanode.
        # We assume the following pattern: A -- e1 --> B -- e2 --> others

        # 2. Iterate over the e2 edges
        for e2 in graph.out_edges(out_array):
            # 2-a. Extract/validate subsets for array B and others
            try:
                b2_subset, _ = _validate_subsets(e2, sdfg.arrays)
            except NotImplementedError:
                return False
            # 2-b. Check whether b1_subset covers b2_subset
            if not b1_subset.covers(b2_subset):
                return False
            # 2-c. Validate subsets in memlet tree
            # (should not be needed for valid SDFGs)
            path = graph.memlet_tree(e2)
            for e3 in path:
                if e3 is not e2:
                    try:
                        _validate_subsets(e3,
                                          sdfg.arrays,
                                          src_name=out_array.data)
                    except NotImplementedError:
                        return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantSecondArray._in_array)
        out_array = gnode(RedundantSecondArray._out_array)
        in_desc = sdfg.arrays[in_array.data]
        out_desc = sdfg.arrays[out_array.data]

        # We assume the following pattern: A -- e1 --> B -- e2 --> others

        # 1. Get edge e1 and extract subsets for arrays A and B
        e1 = graph.edges_between(in_array, out_array)[0]
        a_subset, b1_subset = _validate_subsets(e1, sdfg.arrays)

        # Find extraneous A or B subset dimensions
        a_dims_to_pop = []
        b_dims_to_pop = []
        aset = a_subset
        popped = []
        if a_subset and b1_subset and a_subset.dims() != b1_subset.dims():
            a_size = a_subset.size_exact()
            b_size = b1_subset.size_exact()
            if a_subset.dims() > b1_subset.dims():
                a_dims_to_pop = find_dims_to_pop(a_size, b_size)
                aset, popped = pop_dims(a_subset, a_dims_to_pop)
            else:
                b_dims_to_pop = find_dims_to_pop(b_size, a_size)

        # If the src subset does not cover the removed array, create a view.
        if a_subset and any(m != a
                            for m, a in zip(a_subset.size(), out_desc.shape)):
            # NOTE: We do not want to create another view, if the immediate
            # successors of out_array are views as well. We just remove it.
            out_successors_desc = [
                e.dst.desc(sdfg)
                if isinstance(e.dst, nodes.AccessNode) else None
                for e in graph.out_edges(out_array)
            ]
            if all([
                    desc and isinstance(desc, data.View)
                    for desc in out_successors_desc
            ]):
                for e in graph.out_edges(out_array):
                    _, b_subset = _validate_subsets(e, sdfg.arrays)
                    graph.add_edge(
                        in_array, None, e.dst, e.dst_conn,
                        mm.Memlet(in_array.data,
                                  subset=a_subset,
                                  other_subset=b_subset,
                                  wcr=e1.data.wcr,
                                  wcr_nonatomic=e1.data.wcr_nonatomic))
                    graph.remove_edge(e)
                graph.remove_edge(e1)
                graph.remove_node(out_array)
                if out_array.data in sdfg.arrays:
                    del sdfg.arrays[out_array.data]
                return
            view_strides = out_desc.strides
            if (a_dims_to_pop and len(a_dims_to_pop)
                    == len(in_desc.shape) - len(out_desc.shape)):
                view_strides = [
                    s for i, s in enumerate(in_desc.strides)
                    if i not in a_dims_to_pop
                ]
            sdfg.arrays[out_array.data] = data.View(
                out_desc.dtype, out_desc.shape, True, out_desc.allow_conflicts,
                in_desc.storage, in_desc.location, view_strides,
                out_desc.offset, in_desc.may_alias,
                dtypes.AllocationLifetime.Scope, out_desc.alignment,
                out_desc.debuginfo, out_desc.total_size)
            return

        # 2. Iterate over the e2 edges and traverse the memlet tree
        for e2 in graph.out_edges(out_array):
            path = graph.memlet_tree(e2)
            wcr = e1.data.wcr
            wcr_nonatomic = e1.data.wcr_nonatomic
            for e3 in path:
                # 2-a. Extract subsets for array B and others
                b3_subset, other_subset = _validate_subsets(
                    e3, sdfg.arrays, src_name=out_array.data)
                # 2-b. Modify memlet to match array A. Example:
                # A -- (0, a:b)/(c:c+b) --> B -- (c+d)/None --> others
                # A -- (0, a+d)/None --> others
                e3.data.data = in_array.data
                # (c+d) - (c:c+b) = (d)
                b3_subset.offset(b1_subset, negative=True)
                # (0, a:b)(d) = (0, a+d) (or offset for indices)
                if b3_subset and b_dims_to_pop:
                    bset, _ = pop_dims(b3_subset, b_dims_to_pop)
                else:
                    bset = b3_subset
                e3.data.subset = compose_and_push_back(aset, bset,
                                                       a_dims_to_pop, popped)
                # NOTE: This fixes the following case:
                # A ----> A[subset] ----> ... -----> Tasklet
                # Tasklet is not data, so it doesn't have an other subset.
                if isinstance(e3.dst, nodes.AccessNode):
                    e3.data.other_subset = other_subset
                else:
                    e3.data.other_subset = None
                wcr = wcr or e3.data.wcr
                wcr_nonatomic = wcr_nonatomic or e3.data.wcr_nonatomic
                e3.data.wcr = wcr
                e3.data.wcr_nonatomic = wcr_nonatomic
            # 2-c. Remove edge and add new one
            graph.remove_edge(e2)
            e2.data.wcr = wcr
            e2.data.wcr_nonatomic = wcr_nonatomic
            graph.add_edge(in_array, e2.src_conn, e2.dst, e2.dst_conn, e2.data)

        # Finally, remove out_array node
        graph.remove_node(out_array)
        if out_array.data in sdfg.arrays:
            try:
                sdfg.remove_data(out_array.data)
            except ValueError:  # Already in use (e.g., with Views)
                pass
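# A minimal usage sketch (illustrative, not part of the transformation above):
# like the other pattern-based transformations in this file, the class is
# typically applied greedily on an existing SDFG. The helper name and the
# `sdfg` argument are assumptions.
def remove_redundant_second_arrays(sdfg):
    """ Applies RedundantSecondArray until no more matches are found.
        Returns the number of applications performed. """
    return sdfg.apply_transformations_repeated(RedundantSecondArray)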
class StencilFusion(Transformation):
    """ Transformation that fuses two stencil library nodes connected through
        a transient array, absorbing the second stencil's computation into the
        first and removing the transient. """
    _stencil_a = Stencil('')
    _stencil_b = Stencil('')
    _tmp_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            utils.node_path_graph(StencilFusion._stencil_a,
                                  StencilFusion._tmp_array,
                                  StencilFusion._stencil_b)
        ]

    @staticmethod
    def match_to_str(graph, candidate):
        stencil_a: Stencil = graph.node(candidate[StencilFusion._stencil_a])
        stencil_b: Stencil = graph.node(candidate[StencilFusion._stencil_b])
        return '%s -> %s' % (stencil_a.label, stencil_b.label)

    @staticmethod
    def can_be_applied(graph: dace.SDFGState,
                       candidate: Dict[Any, int],
                       expr_index: int,
                       sdfg: dace.SDFG,
                       strict=False):
        stencil_a: Stencil = graph.node(candidate[StencilFusion._stencil_a])
        stencil_b: Stencil = graph.node(candidate[StencilFusion._stencil_b])
        array: nodes.AccessNode = graph.node(
            candidate[StencilFusion._tmp_array])

        # Ensure the stencil shapes match
        if len(stencil_a.shape) != len(stencil_b.shape):
            return False
        if any(sa != sb for sa, sb in zip(stencil_a.shape, stencil_b.shape)):
            return False

        # Ensure that the transient is not used anywhere else and can be
        # removed
        if len(graph.all_edges(array)) != 2:
            return False
        if not sdfg.arrays[array.data].transient:
            return False
        if (len([
                n for state in sdfg.nodes() for n in state.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == array.data
        ]) > 1):
            return False

        # Ensure that second stencil only has one input access of the
        # candidate transient to remove
        edge = graph.out_edges(array)[0]
        if len(stencil_b.accesses[edge.dst_conn][1]) > 1:
            return False

        # TODO: Remove check once stencils can be offset
        if any(a != 0 for a in stencil_b.accesses[edge.dst_conn][1][0]):
            return False

        # Code languages must match
        if stencil_a.code.language != stencil_b.code.language:
            return False

        # TODO: Boundary condition matching checks
        return True

    def apply(self, sdfg: dace.SDFG):
        graph: dace.SDFGState = sdfg.node(self.state_id)
        stencil_a: Stencil = graph.node(
            self.subgraph[StencilFusion._stencil_a])
        stencil_b: Stencil = graph.node(
            self.subgraph[StencilFusion._stencil_b])
        array: nodes.AccessNode = graph.node(
            self.subgraph[StencilFusion._tmp_array])

        intermediate_name = graph.in_edges(array)[0].src_conn
        intermediate_name_b = graph.out_edges(array)[0].dst_conn

        # Replace outputs of first stencil with outputs of second stencil
        # In node and in connectors, reconnect
        stencil_a.output_fields = stencil_b.output_fields
        stencil_a.boundary_conditions = stencil_b.boundary_conditions
        for edge in list(graph.out_edges(stencil_a)):
            if edge.src_conn == intermediate_name:
                graph.remove_edge(edge)
                del stencil_a._out_connectors[intermediate_name]
        for edge in graph.out_edges(stencil_b):
            stencil_a.add_out_connector(edge.src_conn)
            graph.add_edge(stencil_a, edge.src_conn, edge.dst, edge.dst_conn,
                           edge.data)

        # Add other stencil inputs of the second stencil to the first
        # In node and in connectors, reconnect
        for edge in graph.in_edges(stencil_b):
            # Skip edge to remove
            if edge.dst_conn == intermediate_name_b:
                continue
            if edge.dst_conn not in stencil_a.accesses:
                stencil_a.accesses[edge.dst_conn] = stencil_b.accesses[
                    edge.dst_conn]
                stencil_a.add_in_connector(edge.dst_conn)
                graph.add_edge(edge.src, edge.src_conn, stencil_a,
                               edge.dst_conn, edge.data)
            else:
                # If same input is accessed in both stencils, only append the
                # inputs that are new to stencil_a
                for access in stencil_b.accesses[edge.dst_conn][1]:
                    if access not in stencil_a.accesses[edge.dst_conn][1]:
stencil_a.accesses[edge.dst_conn][1].append(access) # Add second stencil's statements to first stencil, replacing the input # to the second stencil with the name of the output access if stencil_a.code.language == dace.Language.Python: # Replace first stencil's output with connector name for i, stmt in enumerate(stencil_a.code.code): stencil_a.code.code[i] = ReplaceSubscript({ intermediate_name: intermediate_name_b }).visit(stmt) # Append second stencil's contents, using connector name instead of # accessing the intermediate transient # TODO: Use offsetted stencil for i, stmt in enumerate(stencil_b.code.code): stencil_a.code.code.append( ReplaceSubscript({ intermediate_name_b: intermediate_name_b }).visit(stmt)) elif stencil_a.code.language == dace.Language.CPP: raise NotImplementedError else: raise ValueError('Unrecognized language: %s' % stencil_a.code.language) # Remove array from graph graph.remove_node(array) del sdfg.arrays[array.data] # Remove 2nd stencil graph.remove_node(stencil_b)
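# A minimal usage sketch for StencilFusion (an illustrative assumption):
# repeated application fuses whole chains of stencils pairwise, since each
# application removes one candidate transient between two stencil nodes.
def fuse_stencil_chain(sdfg: dace.SDFG):
    """ Fuses consecutive stencil library nodes until no candidate remains. """
    return sdfg.apply_transformations_repeated(StencilFusion)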
class BankSplit(transformation.Transformation):
    """ A transformation that allows splitting an array and distributing it
        onto another array with one dimension more, or vice versa. Works with
        arbitrary arrays, but its intended use case is to distribute data
        across many HBM banks. Matches any two AccessNodes connected by an
        edge, if the dimensionality of the two accessed arrays differs by
        exactly one. The sizes of the arrays have to be large enough with
        respect to the executed split, but this is not verified. While it is
        allowed to use symbolics for the shapes of the array, it is expected
        that each dimension is divisible by the number of splits specified.

        When applying, an unrolled map is generated around the access nodes,
        which copies the parts of the array to the target array.

        Examples:
        Distribute: Suppose for example we copy from A to B, where A has shape
        [100, 100] and B shape [10, 100, 10]. We can distribute A in that case
        to B using the transformation by setting split_array_info=[1, 10].
        A will then be divided along its second dimension into 10 parts of
        size [100, 10] and distributed on B.
        Gather: Suppose A has shape [4, 50, 50] and B has shape [100, 100].
        If one sets split_array_info to [2, 2] and applies the transformation,
        it will split equally in all dimensions. Therefore A[0] will be copied
        to B[0:50, 0:50], A[1] to B[0:50, 50:100], A[2] to B[50:100, 0:50] and
        A[3] to B[50:100, 50:100].

        Note that simply reversing the AccessNodes for the arrays in the above
        examples would have led to the inverse operation, i.e. the gather
        would become a distribute and the other way around.
    """

    _src_node = nd.AccessNode("")
    _dst_node = nd.AccessNode("")

    # dtype=List[int]
    split_array_info = properties.Property(
        dtype=List,
        default=None,
        allow_none=True,
        desc="Describes how many times this array is split in each dimension, "
        "where the k-th number describes how many times dimension k is split. "
        "If the k-th number is 1 this means that the array is not split in "
        "the k-th dimension at all. "
        "If None, then the transform will split the first dimension exactly "
        "shape[0] times.")

    default_to_storage = properties.Property(
        dtype=dtypes.StorageType,
        default=dtypes.StorageType.CPU_Heap,
        allow_none=False,
        desc="The storage type of involved arrays will be set to the value of "
        "this property if they have Default storage type.")

    def _get_split_size(self, virtual_shape: Iterable,
                        split_count: List[int]) -> List[int]:
        """
        :return: the shape of a part-array on one HBM bank
        """
        new_shape_list = []
        for d in range(len(virtual_shape)):
            if split_count[d] != 1:
                new_shape_list.append(virtual_shape[d] // split_count[d])
            else:
                new_shape_list.append(virtual_shape[d])
        return new_shape_list

    @staticmethod
    def can_be_applied(graph: Union[SDFG, SDFGState],
                       candidate: Dict['PatternNode', int], expr_index: int,
                       sdfg: SDFG, strict: bool) -> bool:
        src = graph.nodes()[candidate[BankSplit._src_node]]
        dst = graph.nodes()[candidate[BankSplit._dst_node]]
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]

        plain_array = lambda array: isinstance(
            array, data.Array) and not isinstance(array, data.View)
        if not plain_array(src_array):
            return False
        if not plain_array(dst_array):
            return False

        # same dimensions means HBM-array needs 1 dimension more
        collect_src = len(src_array.shape) - 1 == len(dst_array.shape)
        distribute_dst = len(src_array.shape) + 1 == len(dst_array.shape)
        if collect_src and symbolic.issymbolic(src_array.shape[0],
                                               sdfg.constants):
            return False
        elif distribute_dst and symbolic.issymbolic(dst_array.shape[0],
                                                    sdfg.constants):
            return False
        return collect_src or distribute_dst

    @staticmethod
    def expressions():
        return [
            utils.node_path_graph(BankSplit._src_node, BankSplit._dst_node)
        ]

    def apply(self, sdfg: SDFG) -> Union[Any, None]:
        # Load/parse infos from the SDFG
        graph = sdfg.nodes()[self.state_id]
        src = graph.nodes()[self.subgraph[BankSplit._src_node]]
        dst = graph.nodes()[self.subgraph[BankSplit._dst_node]]
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]

        collect_src = len(src_array.shape) - 1 == len(
            dst_array.shape
        )  # If this is not true we have to distribute to dst (checked in can_be_applied)
        if collect_src:
            bank_count = int(src_array.shape[0])
            true_size = dst_array.shape
        else:
            bank_count = int(dst_array.shape[0])
            true_size = src_array.shape
        ndim = len(true_size)

        # Move Default storage
        if sdfg.arrays[src.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[src.data].storage = self.default_to_storage
        if sdfg.arrays[dst.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[dst.data].storage = self.default_to_storage

        # Figure out how to split
        if self.split_array_info is None:
            split_info = [1] * ndim
            split_info[0] = bank_count
        else:
            split_info = self.split_array_info
            if len(split_info) != ndim:
                raise RuntimeError(
                    "Length of split_array_info must match number of "
                    "dimensions")
        if functools.reduce(lambda a, b: a * b, split_info) != bank_count:
            raise RuntimeError(
                "Splitting is not possible with the selected splits "
                "and this number of HBM-banks (required number of banks "
                "!= actual number of banks)")

        # create the copy-subgraph
        ndrange = dict()
        usable_params = []
        for i in range(ndim):
            usable_params.append(f"i{i}")
        for i in range(ndim):
            ndrange[usable_params[i]] = f"0:{split_info[i]}"
        graph.remove_edge_and_connectors(graph.edges_between(src, dst)[0])
        copy_map_enter, copy_map_exit = graph.add_map(
            "hbm_bank_split", ndrange, dtypes.ScheduleType.Unrolled)
        graph.add_edge(copy_map_enter, None, src, None, memlet.Memlet())
        graph.add_edge(dst, None, copy_map_exit, None, memlet.Memlet())

        target_size = [
            str(x) for x in self._get_split_size(true_size, split_info)
        ]
        target_hbm_bank = []
        for i in range(ndim):
            target_hbm_bank.append(usable_params[i])
            for j in range(i):
                target_hbm_bank[j] = f"{split_info[i]}*{target_hbm_bank[j]}"
        target_offset = []
        for i in range(ndim):
target_offset.append(f"{usable_params[i]}*{target_size[i]}") target_size_str = ", ".join( [f"{x}:{y}" for x, y in zip([0] * ndim, target_size)]) target_hbm_bank_str = "+ ".join(target_hbm_bank) target_offset_str = ", ".join( [f"({x}):({x}+{y})" for x, y in zip(target_offset, target_size)]) if collect_src: copy_memlet = memlet.Memlet( f"{src.data}[{target_hbm_bank_str}, {target_size_str}]->" f"{target_offset_str}") else: copy_memlet = memlet.Memlet( f"{src.data}[{target_offset_str}]->{target_hbm_bank_str}, " f"{target_size_str}") graph.add_edge(src, None, dst, None, copy_memlet)
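# A hedged sketch of the 'Distribute' example from the docstring above: an
# SDFG that copies A[100, 100] into B[10, 100, 10] is split along the second
# dimension. The helper name is an assumption; `sdfg` must already contain
# exactly that copy edge for the pattern to match.
def distribute_on_banks(sdfg: SDFG):
    return sdfg.apply_transformations(BankSplit,
                                      options={'split_array_info': [1, 10]})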
class MapReduceFusion(pm.Transformation): """ Implements the map-reduce-fusion transformation. Fuses a map with an immediately following reduction, where the array between the map and the reduction is not used anywhere else. """ no_init = Property(dtype=bool, default=False, desc='If enabled, does not create initialization states ' 'for reduce nodes with identity') _tasklet = nodes.Tasklet('_') _tmap_exit = nodes.MapExit(nodes.Map("", [], [])) _in_array = nodes.AccessNode('_') import dace.libraries.standard as stdlib # Avoid import loop _reduce = stdlib.Reduce() _out_array = nodes.AccessNode('_') @staticmethod def expressions(): return [ sdutil.node_path_graph(MapReduceFusion._tasklet, MapReduceFusion._tmap_exit, MapReduceFusion._in_array, MapReduceFusion._reduce, MapReduceFusion._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[candidate[MapReduceFusion._in_array]] reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]] tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]] # Make sure that the array is only accessed by the map and the reduce if any([ src != tmap_exit for src, _, _, _, memlet in graph.in_edges(in_array) ]): return False if any([ dest != reduce_node for _, _, dest, _, memlet in graph.out_edges(in_array) ]): return False tmem = next(e for e in graph.edges_between(tasklet, tmap_exit) if e.data.data == in_array.data).data # (strict) Make sure that the transient is not accessed anywhere else # in this state or other states if strict and (len([ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.data == in_array.data ]) > 1 or in_array.data in sdfg.shared_transients()): return False # If memlet already has WCR and it is different from reduce node, # do not match if tmem.wcr is not None and tmem.wcr != reduce_node.wcr: return False # Verify that reduction ranges match tasklet map tout_memlet = graph.in_edges(in_array)[0].data rin_memlet = graph.out_edges(in_array)[0].data if tout_memlet.subset != rin_memlet.subset: return False return True @staticmethod def match_to_str(graph, candidate): tasklet = candidate[MapReduceFusion._tasklet] map_exit = candidate[MapReduceFusion._tmap_exit] reduce = candidate[MapReduceFusion._reduce] return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce]) def apply(self, sdfg: SDFG): graph = sdfg.nodes()[self.state_id] tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]] reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]] out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]] # Set nodes to remove according to the expression index nodes_to_remove = [in_array] nodes_to_remove.append(reduce_node) memlet_edge = None for edge in graph.in_edges(tmap_exit): if edge.data.data == in_array.data: memlet_edge = edge break if memlet_edge is None: raise RuntimeError('Reduction memlet cannot be None') # Find which indices should be removed from new memlet input_edge = graph.in_edges(reduce_node)[0] axes = reduce_node.axes or list(range(len(input_edge.data.subset))) array_edge = graph.out_edges(reduce_node)[0] # Delete relevant edges and nodes graph.remove_nodes_from(nodes_to_remove) # Filter out reduced dimensions from subset filtered_subset = [ dim for i, dim in enumerate(memlet_edge.data.subset) if i not in axes ] if len(filtered_subset) == 0: # Output is a scalar filtered_subset 
= [(0, 0, 1)] # Modify edge from tasklet to map exit memlet_edge.data.data = out_array.data memlet_edge.data.wcr = reduce_node.wcr memlet_edge.data.subset = type(memlet_edge.data.subset)(filtered_subset) # Add edge from map exit to output array graph.add_edge( memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst, array_edge.dst_conn, Memlet.simple(array_edge.data.data, array_edge.data.subset, num_accesses=array_edge.data.num_accesses, wcr_str=reduce_node.wcr)) # Add initialization state as necessary if reduce_node.identity is not None: init_state = sdfg.add_state_before(graph) init_state.add_mapped_tasklet( 'freduce_init', [('o%d' % i, '%s:%s:%s' % (r[0], r[1] + 1, r[2])) for i, r in enumerate(array_edge.data.subset)], {}, 'out = %s' % reduce_node.identity, { 'out': Memlet.simple( array_edge.data.data, ','.join([ 'o%d' % i for i in range(len(array_edge.data.subset)) ])) }, external_edges=True)
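# A minimal usage sketch (helper name assumed): MapReduceFusion matches
# map -> transient -> reduce chains; the `no_init` property defined above can
# be set through the options dictionary to skip the initialization state.
def fuse_map_and_reduce(sdfg: SDFG, skip_init: bool = False):
    return sdfg.apply_transformations(MapReduceFusion,
                                      options={'no_init': skip_init})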
def expand(self, sdfg, graph, reduce_node): """ Splits the data dimension into an inner and outer dimension, where the inner dimension are the reduction axes and the outer axes the complement. Pushes the reduce inside a new map consisting of the complement axes. """ out_storage_node = graph.out_edges(reduce_node)[0].dst in_storage_node = graph.in_edges(reduce_node)[0].src wcr = reduce_node.wcr identity = reduce_node.identity schedule = reduce_node.schedule implementation = reduce_node.implementation if implementation and 'warp' in implementation: raise NotImplementedError( "WIP: Warp Reductions are not Implemented yet.") # remove the reduce identity # we will reassign it later after expanding reduce_node.identity = None # expand the reduce node in_edge = graph.in_edges(reduce_node)[0] nsdfg = self._expand_reduce(sdfg, graph, reduce_node) # find the new nodes in the nested sdfg created nstate = nsdfg.sdfg.nodes()[0] for node, scope in nstate.scope_dict().items(): if isinstance(node, nodes.MapEntry): if scope is None: outer_entry = node else: inner_entry = node if isinstance(node, nodes.Tasklet): tasklet_node = node inner_exit = nstate.exit_node(inner_entry) outer_exit = nstate.exit_node(outer_entry) # find earliest parent read-write occurrence of array onto which # we perform the reduction: # do BFS, best complexity O(V+E) queue = [nsdfg] array_closest_ancestor = None while len(queue) > 0: current = queue.pop(0) if isinstance(current, nodes.AccessNode): if current.data == out_storage_node.data: # it suffices to find the first node # no matter what access (ReadWrite or Read) array_closest_ancestor = current break queue.extend([in_edge.src for in_edge in graph.in_edges(current)]) # if ancestor doesn't exist: # if non-transient: create data node accessing it # if transient: ancestor_node = none, set_zero on outer node shortcut = False if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \ or identity is not None: if self.debug: print("ReduceExpansion::Expanding Reduction into Map") # we are lucky shortcut = True nstate.out_edges(outer_exit)[0].data.wcr = None else: if self.debug: print("ReduceExpansion::Expanding Reduction into Map " "and introducing update Tasklet, " "connecting with ancestor.") if not array_closest_ancestor: array_closest_ancestor = nodes.AccessNode( out_storage_node.data, access=dtypes.AccessType.ReadOnly) graph.add_node(array_closest_ancestor) # array_closest_ancestor now points to the node we want to connect # to the map entry # always have to create out transient in this case self.create_out_transient = True if self.create_out_transient: # create an out transient between inner and outer map exit array_out = nstate.out_edges(outer_exit)[0].data.data from dace.transformation.dataflow.local_storage import LocalStorage local_storage_subgraph = { LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(inner_exit), LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(outer_exit) } nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg) nstate_id = 0 local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0) local_storage.array = array_out local_storage.apply(nsdfg.sdfg) out_transient_node_inner = local_storage._data_node # push to register nsdfg.sdfg.data(out_transient_node_inner.data ).storage = dtypes.StorageType.Register if shortcut: nstate.out_edges(out_transient_node_inner)[0].data.wcr = None nstate.out_edges(out_transient_node_inner)[0].data.volume = 1 if shortcut: nstate.out_edges(out_transient_node_inner)[0].data.wcr = None 
nstate.out_edges(out_transient_node_inner)[0].data.volume = 1 if self.create_in_transient: # create an in-transient between inner and outer map entry array_in = nstate.in_edges(outer_entry)[0].data.data from dace.transformation.dataflow.local_storage import LocalStorage local_storage_subgraph = { LocalStorage.node_a: nsdfg.sdfg.nodes()[0].nodes().index(outer_entry), LocalStorage.node_b: nsdfg.sdfg.nodes()[0].nodes().index(inner_entry) } nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg) nstate_id = 0 local_storage = LocalStorage(nsdfg_id, nstate_id, local_storage_subgraph, 0) local_storage.array = array_in local_storage.apply(nsdfg.sdfg) in_transient_node_inner = local_storage._data_node # push to shared memory / default nsdfg.sdfg.data(in_transient_node_inner.data ).storage = dtypes.StorageType.Register # first, inline fuse back our nested SDFG from dace.transformation.interstate import InlineSDFG inline_sdfg = InlineSDFG( sdfg.sdfg_list.index(sdfg), sdfg.nodes().index(graph), {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0) inline_sdfg.apply(sdfg) if not shortcut: reduction_type = detect_reduction_type(wcr) try: code = ReduceExpansion.reduction_type_update[reduction_type] except KeyError: raise NotImplementedError( "Not yet implemented for custom reduction") new_tasklet = graph.add_tasklet( name="reduction_transient_update", inputs={"reduction_in", "array_in"}, outputs={"out"}, code=code) edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \ if self.create_out_transient \ else graph.out_edges(inner_exit)[0] new_memlet_array_inner = Memlet(data=out_storage_node.data, volume=1, subset=edge_to_remove.data.subset) new_memlet_array_outer = Memlet( data=array_closest_ancestor.data, volume=graph.in_edges(outer_entry)[0].data.volume, subset=subsets.Range.from_array( sdfg.data(out_storage_node.data))) new_memlet_reduction = Memlet( data=graph.out_edges(inner_exit)[0].data.data, volume=1, subset=graph.out_edges(inner_exit)[0].data.subset) new_memlet_out_inner = Memlet(data=edge_to_remove.data.data, volume=1, subset=edge_to_remove.data.subset) new_memlet_out_outer = dcpy(new_memlet_array_outer) # remove old edges outer_edge_to_remove = None for edge in graph.out_edges(outer_exit): if edge.src == edge_to_remove.dst: outer_edge_to_remove = edge graph.remove_edge_and_connectors(edge_to_remove) graph.remove_edge_and_connectors(outer_edge_to_remove) graph.add_edge(out_transient_node_inner if self.create_out_transient \ else inner_exit, None, new_tasklet, "reduction_in", new_memlet_reduction) graph.add_edge(outer_entry, None, new_tasklet, "array_in", new_memlet_array_inner) graph.add_edge(array_closest_ancestor, None, outer_entry, None, new_memlet_array_outer) graph.add_edge(new_tasklet, "out", outer_exit, None, new_memlet_out_inner) graph.add_edge(outer_exit, None, out_storage_node, None, new_memlet_out_outer) # fill map scope connectors graph.fill_scope_connectors() graph._clear_scopedict_cache() # wcr is already removed # FORNOW: choose default schedule and implementation new_schedule = dtypes.ScheduleType.Default new_implementation = self.reduce_implementation \ if self.reduce_implementation is not None \ else implementation new_axes = dcpy(reduce_node.axes) reduce_node_new = graph.add_reduce(wcr=wcr, axes=new_axes, schedule=new_schedule, identity=identity) reduce_node_new.implementation = new_implementation edge_tmp = graph.in_edges(inner_entry)[0] memlet_src_reduce = dcpy(edge_tmp.data) graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None, memlet_src_reduce) 
        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)
        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)
        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._new_reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deduced from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:
                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(
                        "Cannot infer reduction identity. Please specify the "
                        f"identity of node {reduce_node_new}")

        return
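# A short illustration (added for clarity, not part of the method above) of
# the identity fallback it implements: when no identity is known for a Min or
# Max reduction, the neutral element is the extreme representable value of the
# data type, obtained through the same helpers used above.
import dace
from dace import dtypes

neutral_for_min = dtypes.max_value(dace.float32)  # identity of a Min reduction
neutral_for_max = dtypes.min_value(dace.float32)  # identity of a Max reduction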
class RedundantSecondArray(pm.Transformation): """ Implements the redundant array removal transformation, applied when a transient array is copied from and to (from another array), but never used anywhere else. This transformation removes the second array. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ sdutil.node_path_graph(RedundantSecondArray._in_array, RedundantSecondArray._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantSecondArray._in_array]] out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]] # Ensure in degree is one (only one source, which is in_array) if graph.in_degree(out_array) != 1: return False # Make sure that the candidate is a transient variable if not out_array.desc(sdfg).transient: return False # Make sure that both arrays are using the same storage location if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage: return False # Find occurrences in this and other states occurrences = [] for state in sdfg.nodes(): occurrences.extend([ n for n in state.nodes() if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == out_array.desc(sdfg) ]) if len(occurrences) > 1: return False # Only apply if arrays are of same shape (no need to modify memlet subset) # if len(in_array.desc(sdfg).shape) != len( # out_array.desc(sdfg).shape) or any(i != o for i, o in zip( # in_array.desc(sdfg).shape, # out_array.desc(sdfg).shape)): # return False return True @staticmethod def match_to_str(graph, candidate): out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]] return "Remove " + str(out_array) def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] graph = sdfg.nodes()[self.state_id] in_array = gnode(RedundantSecondArray._in_array) out_array = gnode(RedundantSecondArray._out_array) memlet = graph.edges_between(in_array, out_array)[0].data if memlet.data == in_array.data: subset = memlet.subset else: subset = memlet.other_subset for e in graph.out_edges(out_array): # Modify all outgoing edges to point to in_array path = graph.memlet_tree(e) for pe in path: if pe.data.data == out_array.data: pe.data.data = in_array.data if isinstance(subset, subsets.Indices): pe.data.subset.offset(subset, False) else: pe.data.subset = subset.compose(pe.data.subset) elif pe.data.other_subset: if isinstance(subset, subsets.Indices): pe.data.other_subset.offset(subset, False) else: pe.data.other_subset = subset.compose( pe.data.other_subset) # Redirect edge to out_array graph.remove_edge(e) graph.add_edge(in_array, e.src_conn, e.dst, e.dst_conn, e.data) # Finally, remove out_array node graph.remove_node(out_array) # TODO: Should the array be removed from the SDFG? # del sdfg.arrays[out_array] if Config.get_bool("debugprint"): RedundantSecondArray._arrays_removed += 1
class MatrixProductTranspose(transformation.Transformation): """ Implements the matrix-matrix product transpose transformation. T(A) @ T(B) = T(B @ A) """ _transpose_a = blas.Transpose("") _at = nodes.AccessNode("") _transpose_b = blas.Transpose("") _bt = nodes.AccessNode("") _a_times_b = blas.MatMul("") @staticmethod def expressions(): graph = dace.sdfg.graph.OrderedDiGraph() graph.add_node(MatrixProductTranspose._transpose_a) graph.add_node(MatrixProductTranspose._at) graph.add_node(MatrixProductTranspose._transpose_b) graph.add_node(MatrixProductTranspose._bt) graph.add_node(MatrixProductTranspose._a_times_b) graph.add_edge(MatrixProductTranspose._transpose_a, MatrixProductTranspose._at, None) graph.add_edge(MatrixProductTranspose._at, MatrixProductTranspose._a_times_b, None) graph.add_edge(MatrixProductTranspose._transpose_b, MatrixProductTranspose._bt, None) graph.add_edge(MatrixProductTranspose._bt, MatrixProductTranspose._a_times_b, None) return [graph] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False): _at = graph.nodes()[candidate[MatrixProductTranspose._at]] _a_times_b = graph.nodes()[candidate[ MatrixProductTranspose._a_times_b]] edges = graph.edges_between(_at, _a_times_b) # Enforce unique match if len(edges) != 1: return False _, _, _, dst_conn, _ = edges[0] if dst_conn != '_a': return False return True @staticmethod def match_to_str(graph, candidate): transpose_a = graph.nodes()[candidate[ MatrixProductTranspose._transpose_a]] transpose_b = graph.nodes()[candidate[ MatrixProductTranspose._transpose_b]] a_times_b = graph.nodes()[candidate[MatrixProductTranspose._a_times_b]] return f"{transpose_a.name} -> {a_times_b.name} <- {transpose_b.name}" def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] transpose_a = graph.nodes()[self.subgraph[ MatrixProductTranspose._transpose_a]] _at = graph.nodes()[self.subgraph[MatrixProductTranspose._at]] transpose_b = graph.nodes()[self.subgraph[ MatrixProductTranspose._transpose_b]] _bt = graph.nodes()[self.subgraph[MatrixProductTranspose._bt]] a_times_b = graph.nodes()[self.subgraph[ MatrixProductTranspose._a_times_b]] for src, src_conn, _, _, memlet in graph.in_edges(transpose_a): graph.add_edge(src, src_conn, a_times_b, '_b', memlet) graph.remove_node(transpose_a) for src, src_conn, _, _, memlet in graph.in_edges(transpose_b): graph.add_edge(src, src_conn, a_times_b, '_a', memlet) graph.remove_node(transpose_b) graph.remove_node(_at) graph.remove_node(_bt) for _, _, dst, dst_conn, memlet in graph.out_edges(a_times_b): subset = dcpy(memlet.subset) subset.squeeze() size = subset.size() shape = [size[1], size[0]] break tmp_name, tmp_arr = sdfg.add_temp_transient(shape, a_times_b.dtype) tmp_acc = graph.add_access(tmp_name) transpose_c = blas.Transpose('_Transpose_', a_times_b.dtype) for edge in graph.out_edges(a_times_b): _, _, dst, dst_conn, memlet = edge graph.remove_edge(edge) graph.add_edge(transpose_c, '_out', dst, dst_conn, memlet) graph.add_edge(a_times_b, '_c', tmp_acc, None, dace.Memlet.from_array(tmp_name, tmp_arr)) graph.add_edge(tmp_acc, None, transpose_c, '_inp', dace.Memlet.from_array(tmp_name, tmp_arr))
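# A quick NumPy check (illustrative only) of the identity this transformation
# exploits, T(A) @ T(B) = T(B @ A); the shapes are arbitrary assumptions.
import numpy as np

_A = np.random.rand(4, 3)
_B = np.random.rand(5, 4)
assert np.allclose(_A.T @ _B.T, (_B @ _A).T)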
class RedundantArray(pm.Transformation): """ Implements the redundant array removal transformation, applied when a transient array is copied to and from (to another array), but never used anywhere else. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ sdutil.node_path_graph(RedundantArray._in_array, RedundantArray._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantArray._in_array]] out_array = graph.nodes()[candidate[RedundantArray._out_array]] # Ensure out degree is one (only one target, which is out_array) if graph.out_degree(in_array) != 1: return False # Make sure that the candidate is a transient variable if not in_array.desc(sdfg).transient: return False # Make sure that both arrays are using the same storage location if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage: return False # Find occurrences in this and other states occurrences = [] for state in sdfg.nodes(): occurrences.extend([ n for n in state.nodes() if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == in_array.desc(sdfg) ]) if len(occurrences) > 1: return False # Only apply if arrays are of same shape (no need to modify subset) if len(in_array.desc(sdfg).shape) != len( out_array.desc(sdfg).shape) or any(i != o for i, o in zip( in_array.desc(sdfg).shape, out_array.desc(sdfg).shape)): return False if strict: # In strict mode, make sure the memlet covers the removed array edge = graph.edges_between(in_array, out_array)[0] if any(m != a for m, a in zip(edge.data.subset.size(), in_array.desc(sdfg).shape)): return False return True @staticmethod def match_to_str(graph, candidate): in_array = graph.nodes()[candidate[RedundantArray._in_array]] return "Remove " + str(in_array) def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] graph = sdfg.nodes()[self.state_id] in_array = gnode(RedundantArray._in_array) out_array = gnode(RedundantArray._out_array) for e in graph.in_edges(in_array): # Modify all incoming edges to point to out_array path = graph.memlet_path(e) for pe in path: if pe.data.data == in_array.data: pe.data.data = out_array.data # Redirect edge to out_array graph.remove_edge(e) graph.add_edge(e.src, e.src_conn, out_array, e.dst_conn, e.data) # Finally, remove in_array node graph.remove_node(in_array) # TODO: Should the array be removed from the SDFG? # del sdfg.arrays[in_array] if Config.get_bool("debugprint"): RedundantArray._arrays_removed += 1
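# A minimal sketch (helper name assumed): RedundantArray and
# RedundantSecondArray are complementary, removing the transient source and
# the transient destination of a copy respectively, so they are commonly
# applied together until neither matches anymore.
def remove_redundant_copies(sdfg):
    return sdfg.apply_transformations_repeated(
        [RedundantArray, RedundantSecondArray])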
def apply(self, sdfg: sd.SDFG): ####################################################### # Step 0: SDFG metadata # Find all input and output data descriptors input_nodes = [] output_nodes = [] global_code_nodes = [[] for _ in sdfg.nodes()] for i, state in enumerate(sdfg.nodes()): sdict = state.scope_dict() for node in state.nodes(): if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).transient == False): if (state.out_degree(node) > 0 and node.data not in input_nodes): # Special case: nodes that lead to top-level dynamic # map ranges must stay on host for e in state.out_edges(node): last_edge = state.memlet_path(e)[-1] if (isinstance(last_edge.dst, nodes.EntryNode) and last_edge.dst_conn and not last_edge.dst_conn.startswith('IN_') and sdict[last_edge.dst] is None): break else: input_nodes.append((node.data, node.desc(sdfg))) if (state.in_degree(node) > 0 and node.data not in output_nodes): output_nodes.append((node.data, node.desc(sdfg))) elif isinstance(node, nodes.CodeNode) and sdict[node] is None: if not isinstance(node, (nodes.LibraryNode, nodes.NestedSDFG)): global_code_nodes[i].append(node) # Input nodes may also be nodes with WCR memlets and no identity for e in state.edges(): if e.data.wcr is not None: if (e.data.data not in input_nodes and sdfg.arrays[e.data.data].transient == False): input_nodes.append( (e.data.data, sdfg.arrays[e.data.data])) start_state = sdfg.start_state end_states = sdfg.sink_nodes() ####################################################### # Step 1: Create cloned GPU arrays and replace originals cloned_arrays = {} for inodename, inode in set(input_nodes): if isinstance(inode, data.Scalar): # Scalars can remain on host continue if inode.storage == dtypes.StorageType.GPU_Global: continue newdesc = inode.clone() newdesc.storage = dtypes.StorageType.GPU_Global newdesc.transient = True name = sdfg.add_datadesc('gpu_' + inodename, newdesc, find_new_name=True) cloned_arrays[inodename] = name for onodename, onode in set(output_nodes): if onodename in cloned_arrays: continue if onode.storage == dtypes.StorageType.GPU_Global: continue newdesc = onode.clone() newdesc.storage = dtypes.StorageType.GPU_Global newdesc.transient = True name = sdfg.add_datadesc('gpu_' + onodename, newdesc, find_new_name=True) cloned_arrays[onodename] = name # Replace nodes for state in sdfg.nodes(): for node in state.nodes(): if (isinstance(node, nodes.AccessNode) and node.data in cloned_arrays): node.data = cloned_arrays[node.data] # Replace memlets for state in sdfg.nodes(): for edge in state.edges(): if edge.data.data in cloned_arrays: edge.data.data = cloned_arrays[edge.data.data] ####################################################### # Step 2: Create copy-in state excluded_copyin = self.exclude_copyin.split(',') copyin_state = sdfg.add_state(sdfg.label + '_copyin') sdfg.add_edge(copyin_state, start_state, sd.InterstateEdge()) for nname, desc in dtypes.deduplicate(input_nodes): if nname in excluded_copyin or nname not in cloned_arrays: continue src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo) dst_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo) copyin_state.add_node(src_array) copyin_state.add_node(dst_array) copyin_state.add_nedge( src_array, dst_array, memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg))) ####################################################### # Step 3: Create copy-out state excluded_copyout = self.exclude_copyout.split(',') copyout_state = sdfg.add_state(sdfg.label + '_copyout') for state in end_states: 
sdfg.add_edge(state, copyout_state, sd.InterstateEdge()) for nname, desc in dtypes.deduplicate(output_nodes): if nname in excluded_copyout or nname not in cloned_arrays: continue src_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo) dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo) copyout_state.add_node(src_array) copyout_state.add_node(dst_array) copyout_state.add_nedge( src_array, dst_array, memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg))) ####################################################### # Step 4: Modify transient data storage for state in sdfg.nodes(): sdict = state.scope_dict() for node in state.nodes(): if isinstance(node, nodes.AccessNode) and node.desc(sdfg).transient: nodedesc = node.desc(sdfg) # Special case: nodes that lead to dynamic map ranges must # stay on host if any( isinstance( state.memlet_path(e)[-1].dst, nodes.EntryNode) for e in state.out_edges(node)): continue gpu_storage = [ dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, dtypes.StorageType.CPU_Pinned ] if sdict[ node] is None and nodedesc.storage not in gpu_storage: # NOTE: the cloned arrays match too but it's the same # storage so we don't care nodedesc.storage = dtypes.StorageType.GPU_Global # Try to move allocation/deallocation out of loops if (self.toplevel_trans and not isinstance(nodedesc, data.Stream)): nodedesc.lifetime = dtypes.AllocationLifetime.SDFG elif nodedesc.storage not in gpu_storage: # Make internal transients registers if self.register_trans: nodedesc.storage = dtypes.StorageType.Register ####################################################### # Step 5: Wrap free tasklets and nested SDFGs with a GPU map for state, gcodes in zip(sdfg.nodes(), global_code_nodes): for gcode in gcodes: if gcode.label in self.exclude_tasklets.split(','): continue # Create map and connectors me, mx = state.add_map(gcode.label + '_gmap', {gcode.label + '__gmapi': '0:1'}, schedule=dtypes.ScheduleType.GPU_Device) # Store in/out edges in lists so that they don't get corrupted # when they are removed from the graph in_edges = list(state.in_edges(gcode)) out_edges = list(state.out_edges(gcode)) me.in_connectors = {('IN_' + e.dst_conn): None for e in in_edges} me.out_connectors = {('OUT_' + e.dst_conn): None for e in in_edges} mx.in_connectors = {('IN_' + e.src_conn): None for e in out_edges} mx.out_connectors = {('OUT_' + e.src_conn): None for e in out_edges} # Create memlets through map for e in in_edges: state.remove_edge(e) state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn, e.data) state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn, e.data) for e in out_edges: state.remove_edge(e) state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn, e.data) state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn, e.data) # Map without inputs if len(in_edges) == 0: state.add_nedge(me, gcode, memlet.Memlet()) ####################################################### # Step 6: Change all top-level maps and library nodes to GPU schedule for i, state in enumerate(sdfg.nodes()): sdict = state.scope_dict() for node in state.nodes(): if isinstance(node, (nodes.EntryNode, nodes.LibraryNode)): if sdict[node] is None: node.schedule = dtypes.ScheduleType.GPU_Device elif (isinstance(node, (nodes.EntryNode, nodes.LibraryNode)) and self.sequential_innermaps): node.schedule = dtypes.ScheduleType.Sequential ####################################################### # Step 7: Introduce copy-out if data used in outgoing interstate edges for state in list(sdfg.nodes()): 
arrays_used = set() for e in sdfg.out_edges(state): # Used arrays = intersection between symbols and cloned arrays arrays_used.update( set(e.data.free_symbols) & set(cloned_arrays.keys())) # Create a state and copy out used arrays if len(arrays_used) > 0: co_state = sdfg.add_state(state.label + '_icopyout') # Reconnect outgoing edges to after interim copyout state for e in sdfg.out_edges(state): sdutil.change_edge_src(sdfg, state, co_state) # Add unconditional edge to interim state sdfg.add_edge(state, co_state, sd.InterstateEdge()) # Add copy-out nodes for nname in arrays_used: desc = sdfg.arrays[nname] src_array = nodes.AccessNode(cloned_arrays[nname], debuginfo=desc.debuginfo) dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo) co_state.add_node(src_array) co_state.add_node(dst_array) co_state.add_nedge( src_array, dst_array, memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg))) ####################################################### # Step 8: Strict transformations if not self.strict_transform: return # Apply strict state fusions greedily. sdfg.apply_strict_transformations()
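# A hedged usage sketch, assuming the `apply` above belongs to the
# GPUTransformSDFG transformation: the options exercise the properties
# referenced in its steps (the array name 'meta' is a placeholder assumption).
def offload_to_gpu(sdfg):
    return sdfg.apply_transformations(
        GPUTransformSDFG,
        options={
            'exclude_copyin': 'meta',      # keep this array on the host
            'sequential_innermaps': True,  # schedule nested maps sequentially
            'strict_transform': True,      # run Step 8 (strict fusion)
        })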
def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] node_a = self.node_a(sdfg) node_b = self.node_b(sdfg) # Determine direction of new memlet scope_dict = graph.scope_dict() propagate_forward = sd.scope_contains_scope(scope_dict, node_a, node_b) array = self.array if array is None or len(array) == 0: array = next(e.data.data for e in graph.edges_between(node_a, node_b) if e.data.data is not None and e.data.wcr is None) original_edge = None invariant_memlet = None for edge in graph.edges_between(node_a, node_b): if array == edge.data.data: original_edge = edge invariant_memlet = edge.data break if invariant_memlet is None: for edge in graph.edges_between(node_a, node_b): original_edge = edge invariant_memlet = edge.data warnings.warn('Array %s not found! Using array %s instead.' % (array, invariant_memlet.data)) array = invariant_memlet.data break if invariant_memlet is None: raise NameError('Array %s not found!' % array) # Add transient array new_data, _ = sdfg.add_array('trans_' + invariant_memlet.data, [ symbolic.overapproximate(r) for r in invariant_memlet.bounding_box_size() ], sdfg.arrays[invariant_memlet.data].dtype, transient=True, find_new_name=True) data_node = nodes.AccessNode(new_data) # Store as fields so that other transformations can use them self._local_name = new_data self._data_node = data_node to_data_mm = copy.deepcopy(invariant_memlet) from_data_mm = copy.deepcopy(invariant_memlet) offset = subsets.Indices([r[0] for r in invariant_memlet.subset]) # Reconnect, assuming one edge to the access node graph.remove_edge(original_edge) if propagate_forward: graph.add_edge(node_a, original_edge.src_conn, data_node, None, to_data_mm) new_edge = graph.add_edge(data_node, None, node_b, original_edge.dst_conn, from_data_mm) else: new_edge = graph.add_edge(node_a, original_edge.src_conn, data_node, None, to_data_mm) graph.add_edge(data_node, None, node_b, original_edge.dst_conn, from_data_mm) # Offset all edges in the memlet tree (including the new edge) for edge in graph.memlet_tree(new_edge): edge.data.subset.offset(offset, True) edge.data.data = new_data return data_node
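# A hedged sketch of driving this transformation directly, assuming it is the
# LocalStorage transformation instantiated by ReduceExpansion above (node
# arguments and the array name are illustrative):
def insert_local_storage(sdfg, state, node_a, node_b, array_name):
    from dace.transformation.dataflow.local_storage import LocalStorage
    subgraph = {
        LocalStorage.node_a: state.node_id(node_a),
        LocalStorage.node_b: state.node_id(node_b),
    }
    xform = LocalStorage(sdfg.sdfg_list.index(sdfg),
                         sdfg.nodes().index(state), subgraph, 0)
    xform.array = array_name
    # Returns the newly created transient access node (also kept in
    # xform._data_node, as used by ReduceExpansion above)
    return xform.apply(sdfg)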
    def __init__(self, name, model: onnx.ModelProto, cuda=False):
        """ Constructs a new ONNXImporter.
            :param name: the name for the SDFG.
            :param model: the model to import.
            :param cuda: if `True`, weights will be passed as cuda arrays.
        """
        graph: onnx.GraphProto = model.graph

        self.sdfg = SDFG(name)
        self.cuda = cuda
        self.state = self.sdfg.add_state()

        # Add all values to the SDFG, check for unsupported ops
        ##########################################

        self.value_infos = {}

        self.inputs = []
        self.outputs = []

        for value, is_input in chain(zip(graph.input, repeat(True)),
                                     zip(graph.output, repeat(False))):
            if not value.HasField("name"):
                raise ValueError("Got input or output without name")
            if is_input:
                self.inputs.append(value.name)
            else:
                self.outputs.append(value.name)

            self.value_infos[value.name] = value
            self._add_value_info(value)

        for value in graph.value_info:
            if not value.HasField("name"):
                raise ValueError("Got value_info entry without name")
            if value.name not in self.value_infos:
                self.value_infos[value.name] = value

        # add weights
        self.weights = {}
        for init in graph.initializer:
            self._add_constant_tensor(init)

        access_nodes = {}
        self._idx_to_node = []
        for i, node in enumerate(graph.node):
            if not has_onnx_node(node.op_type):
                raise ValueError("Unsupported ONNX operator: '{}'".format(
                    node.op_type))

            # extract the op attributes
            op_attributes = {
                attribute_proto.name: convert_attribute_proto(attribute_proto)
                for attribute_proto in node.attribute
            }

            if node.HasField("name"):
                node_name = clean_onnx_name(node.name)
            else:
                node_name = node.op_type + "_" + str(i)

            # construct the dace node
            op_node = get_onnx_node(node.op_type)(node_name, **op_attributes)
            self.state.add_node(op_node)
            self._idx_to_node.append(op_node)

            for param_idx, (name, is_input) in chain(
                    enumerate(zip(node.input, repeat(True))),
                    enumerate(zip(node.output, repeat(False)))):
                if clean_onnx_name(name) not in self.sdfg.arrays:
                    if name not in self.value_infos:
                        raise ValueError(
                            "Could not find array with name '{}'".format(name))
                    self._add_value_info(self.value_infos[name])

                # get the access node
                if name in access_nodes:
                    access = access_nodes[name]
                    self._update_access_type(access, is_input)
                else:
                    access = nd.AccessNode(
                        clean_onnx_name(name), AccessType.ReadOnly
                        if is_input else AccessType.WriteOnly)
                    self.state.add_node(access)
                    access_nodes[name] = access

                # get the connector name
                params = op_node.schema.inputs if is_input else op_node.schema.outputs
                params_len = len(params)
                if param_idx >= params_len:
                    # this is a variadic parameter, so the last parameter of
                    # the schema must be variadic.
if params[-1].param_type != ONNXParameterType.Variadic: raise ValueError( "Expected the last {i_or_o} parameter to be variadic," " since the {i_or_o} with idx {param_idx} has more parameters than the schema ({params_len})" .format(i_or_o="input" if is_input else "output", param_idx=param_idx, params_len=params_len)) conn_name = params[-1].name + "__" + str(param_idx - params_len + 1) elif params[ param_idx].param_type == ONNXParameterType.Variadic: # this is a variadic parameter, and it is within the range of params, so it must be the first # instance of a variadic parameter conn_name = params[param_idx].name + "__0" else: conn_name = params[param_idx].name data_desc = self.sdfg.arrays[clean_onnx_name(name)] # add the connector if required, and add an edge if is_input: if conn_name not in op_node.in_connectors: op_node.add_in_connector(conn_name) self.state.add_edge( access, None, op_node, conn_name, dace.Memlet.from_array(clean_onnx_name(name), data_desc)) else: if conn_name not in op_node.out_connectors: op_node.add_out_connector(conn_name) self.state.add_edge( op_node, conn_name, access, None, dace.Memlet.from_array(clean_onnx_name(name), data_desc)) if self.cuda: self.sdfg.apply_strict_transformations() self.sdfg.apply_gpu_transformations() self.sdfg.apply_strict_transformations() # set all gpu transients to be persistent for _, _, arr in self.sdfg.arrays_recursive(): if arr.transient and arr.storage == StorageType.GPU_Global: arr.lifetime = AllocationLifetime.Persistent
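# A hedged usage sketch (assumes the constructor above belongs to a class
# named ONNXImporter, as its docstring states; the file path is illustrative):
import onnx

def import_onnx_model(path: str):
    model = onnx.load(path)  # e.g., 'model.onnx'
    importer = ONNXImporter('onnx_model', model, cuda=False)
    return importer.sdfg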
class RedundantArrayCopyingIn(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes the
        first and second access nodes in pattern A -> B -> A.
    """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _med_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                RedundantArrayCopyingIn._in_array,
                RedundantArrayCopyingIn._med_array,
                RedundantArrayCopyingIn._out_array,
            )
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[RedundantArrayCopyingIn._in_array]]
        med_array = graph.nodes()[candidate[
            RedundantArrayCopyingIn._med_array]]
        out_array = graph.nodes()[candidate[
            RedundantArrayCopyingIn._out_array]]

        # Safety first (could be relaxed)
        if not (graph.out_degree(in_array) == 1
                and graph.in_degree(med_array) == 1
                and graph.out_degree(med_array)):
            return False

        # Make sure that the removal candidates are transient
        if not (in_array.desc(sdfg).transient
                and med_array.desc(sdfg).transient):
            return False

        # Make sure that both arrays are using the same storage location
        if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage:
            return False

        # Only apply if arrays are of same shape (no need to modify memlet subset)
        if len(in_array.desc(sdfg).shape) != len(
                out_array.desc(sdfg).shape) or any(
                    i != o for i, o in zip(in_array.desc(sdfg).shape,
                                           out_array.desc(sdfg).shape)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        in_array = graph.nodes()[candidate[RedundantArrayCopyingIn._in_array]]
        med_array = graph.nodes()[candidate[
            RedundantArrayCopyingIn._med_array]]

        return "Remove " + str(in_array) + " and " + str(med_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantArrayCopyingIn._in_array)
        med_array = gnode(RedundantArrayCopyingIn._med_array)
        out_array = gnode(RedundantArrayCopyingIn._out_array)

        # Modify all edges that point to in_array to point to out_array
        for in_edge in graph.in_edges(in_array):

            # Make all memlets that write to in_array write to out_array instead
            tree = graph.memlet_tree(in_edge)
            for te in tree:
                if te.data.data == in_array.data:
                    te.data.data = out_array.data

            # Redirect edge to in_array
            graph.remove_edge(in_edge)
            graph.add_edge(in_edge.src, in_edge.src_conn, out_array, None,
                           in_edge.data)

        graph.remove_node(med_array)
        graph.remove_node(in_array)
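# A minimal usage sketch (helper name assumed): the A -> B -> A round trip
# above is removed by repeated greedy application on an existing SDFG.
def remove_copy_roundtrips(sdfg):
    return sdfg.apply_transformations_repeated(RedundantArrayCopyingIn)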
def apply(self, sdfg): state = sdfg.nodes()[self.state_id] nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]] storage = self.storage created_arrays = set() for _, edge in enumerate(state.in_edges(nested_sdfg)): src, src_conn, dst, dst_conn, memlet = edge dataname = memlet.data if dataname is None: continue memdata = sdfg.arrays[dataname] name = 'device_' + dataname + '_in' if name not in created_arrays: if isinstance(memdata, data.Array): name, _ = sdfg.add_array( 'device_' + dataname + '_in', shape=[ symbolic.overapproximate(r) for r in memlet.bounding_box_size() ], dtype=memdata.dtype, transient=True, storage=storage, find_new_name=True) elif isinstance(memdata, data.Scalar): name, _ = sdfg.add_scalar('device_' + dataname + '_in', dtype=memdata.dtype, transient=True, storage=storage, find_new_name=True) else: raise NotImplementedError created_arrays.add(name) data_node = nodes.AccessNode(name) to_data_mm = dcpy(memlet) from_data_mm = dcpy(memlet) from_data_mm.data = name offset = [] for ind, r in enumerate(memlet.subset): offset.append(r[0]) if isinstance(memlet.subset[ind], tuple): begin = memlet.subset[ind][0] - r[0] end = memlet.subset[ind][1] - r[0] step = memlet.subset[ind][2] from_data_mm.subset[ind] = (begin, end, step) else: from_data_mm.subset[ind] -= r[0] state.remove_edge(edge) state.add_edge(src, src_conn, data_node, None, to_data_mm) state.add_edge(data_node, None, dst, dst_conn, from_data_mm) for _, edge in enumerate(state.out_edges(nested_sdfg)): src, src_conn, dst, dst_conn, memlet = edge dataname = memlet.data if dataname is None: continue memdata = sdfg.arrays[dataname] name = 'device_' + dataname + '_out' if name not in created_arrays: if isinstance(memdata, data.Array): name, _ = sdfg.add_array( name, shape=[ symbolic.overapproximate(r) for r in memlet.bounding_box_size() ], dtype=memdata.dtype, transient=True, storage=storage, find_new_name=True) elif isinstance(memdata, data.Scalar): name, _ = sdfg.add_scalar(name, dtype=memdata.dtype, transient=True, storage=storage) else: raise NotImplementedError created_arrays.add(name) data_node = nodes.AccessNode(name) to_data_mm = dcpy(memlet) from_data_mm = dcpy(memlet) to_data_mm.data = name offset = [] for ind, r in enumerate(memlet.subset): offset.append(r[0]) if isinstance(memlet.subset[ind], tuple): begin = memlet.subset[ind][0] - r[0] end = memlet.subset[ind][1] - r[0] step = memlet.subset[ind][2] to_data_mm.subset[ind] = (begin, end, step) else: to_data_mm.subset[ind] -= r[0] state.remove_edge(edge) state.add_edge(src, src_conn, data_node, None, to_data_mm) state.add_edge(data_node, None, dst, dst_conn, from_data_mm) # Change storage for all data inside nested SDFG to device. change_storage(nested_sdfg.sdfg, storage)
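# A hedged usage sketch: CopyToDevice is parameterized by its `storage`
# property (read at the top of `apply` above); GPU global memory is an
# illustrative choice, and the helper name is an assumption.
def stage_nested_sdfgs_on_gpu(sdfg):
    from dace import dtypes
    return sdfg.apply_transformations(
        CopyToDevice, options={'storage': dtypes.StorageType.GPU_Global})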
class OnTheFlyMapFusion(Transformation): _first_map_entry = nodes.MapEntry(nodes.Map('', [], [])) _first_tasklet = nodes.Tasklet('') _first_map_exit = nodes.MapExit(nodes.Map('', [], [])) _array_access = nodes.AccessNode('') _second_map_entry = nodes.MapEntry(nodes.Map('', [], [])) _second_tasklet = nodes.Tasklet('') @staticmethod def expressions(): return [ sdutils.node_path_graph(OnTheFlyMapFusion._first_map_entry, OnTheFlyMapFusion._first_tasklet, OnTheFlyMapFusion._first_map_exit, OnTheFlyMapFusion._array_access, OnTheFlyMapFusion._second_map_entry, OnTheFlyMapFusion._second_tasklet) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): first_map_entry = graph.node( candidate[OnTheFlyMapFusion._first_map_entry]) first_tasklet = graph.node(candidate[OnTheFlyMapFusion._first_tasklet]) first_map_exit = graph.node( candidate[OnTheFlyMapFusion._first_map_exit]) array_access = graph.node(candidate[OnTheFlyMapFusion._array_access]) if len(first_map_exit.in_connectors) != 1: return False if (graph.in_degree(array_access) != 1 or graph.out_degree(array_access) != 1): return False return True @staticmethod def _memlet_offsets(base_memlet, offset_memlet): """ Compute subset offset of `offset_memlet` relative to `base_memlet`. """ def offset(base_range, offset_range): b0, e0, s0 = base_range b1, e1, s1 = offset_range assert e1 - e0 == b1 - b0 and s0 == s1 return int(e1 - e0) return tuple( offset(b, o) for b, o in zip(base_memlet.subset.ranges, offset_memlet.subset.ranges)) @staticmethod def _update_map_connectors(state, array_access, first_map_entry, second_map_entry): """ Remove unused connector (of the to-be-replaced array) from second map entry, add new connectors to second map entry for the inputs used in the first map’s tasklets. """ # Remove edges and connectors from arrays access to second map entry for edge in state.edges_between(array_access, second_map_entry): state.remove_edge_and_connectors(edge) state.remove_node(array_access) # Add new connectors to second map # TODO: implement for the general case with random naming for edge in state.in_edges(first_map_entry): if second_map_entry.add_in_connector(edge.dst_conn): state.add_edge(edge.src, edge.src_conn, second_map_entry, edge.dst_conn, edge.data) @staticmethod def _read_offsets(state, array_name, first_map_exit, second_map_entry): """ Compute offsets of read accesses in second map. 
""" # Get output memlet of first tasklet output_edges = state.in_edges(first_map_exit) assert len(output_edges) == 1 write_memlet = output_edges[0].data # Find read offsets by looping over second map entry connectors offsets = defaultdict(list) for edge in state.out_edges(second_map_entry): if edge.data.data == array_name: second_map_entry.remove_out_connector(edge.src_conn) state.remove_edge(edge) offset = OnTheFlyMapFusion._memlet_offsets( write_memlet, edge.data) offsets[offset].append(edge) return offsets @staticmethod def _copy_first_map_contents(state, first_map_entry, first_map_exit): nodes = list( state.all_nodes_between(first_map_entry, first_map_exit) - {first_map_entry}) new_nodes = [copy.deepcopy(node) for node in nodes] for node in new_nodes: state.add_node(node) id_map = { state.node_id(old): state.node_id(new) for old, new in zip(nodes, new_nodes) } def map(node): return state.node(id_map[state.node_id(node)]) for edge in state.edges(): if edge.src in nodes or edge.dst in nodes: src = map(edge.src) if edge.src in nodes else edge.src dst = map(edge.dst) if edge.dst in nodes else edge.dst state.add_edge(src, edge.src_conn, dst, edge.dst_conn, copy.deepcopy(edge.data)) return new_nodes def _replicate_first_map(self, sdfg, array_access, first_map_entry, first_map_exit, second_map_entry): """ Replicate tasklet of first map for reach read access in second map. """ state = sdfg.node(self.state_id) array_name = array_access.data array = sdfg.arrays[array_name] read_offsets = self._read_offsets(state, array_name, first_map_exit, second_map_entry) # Replicate first map tasklets once for each read offset access and # connect them to other tasklets accordingly for offset, edges in read_offsets.items(): nodes = self._copy_first_map_contents(state, first_map_entry, first_map_exit) tmp_name = sdfg.temp_data_name() sdfg.add_scalar(tmp_name, array.dtype, transient=True) tmp_access = state.add_access(tmp_name) for node in nodes: for edge in state.edges_between(node, first_map_exit): state.add_edge(edge.src, edge.src_conn, tmp_access, None, dace.Memlet(tmp_name)) state.remove_edge(edge) for edge in state.edges_between(first_map_entry, node): memlet = copy.deepcopy(edge.data) memlet.subset.offset(list(offset), negative=False) second_map_entry.add_out_connector(edge.src_conn) state.add_edge(second_map_entry, edge.src_conn, node, edge.dst_conn, memlet) state.remove_edge(edge) for edge in edges: state.add_edge(tmp_access, None, edge.dst, edge.dst_conn, dace.Memlet(tmp_name)) def apply(self, sdfg: dace.SDFG): state = sdfg.node(self.state_id) first_map_entry = state.node(self.subgraph[self._first_map_entry]) first_tasklet = state.node(self.subgraph[self._first_tasklet]) first_map_exit = state.node(self.subgraph[self._first_map_exit]) array_access = state.node(self.subgraph[self._array_access]) second_map_entry = state.node(self.subgraph[self._second_map_entry]) self._update_map_connectors(state, array_access, first_map_entry, second_map_entry) self._replicate_first_map(sdfg, array_access, first_map_entry, first_map_exit, second_map_entry) state.remove_nodes_from( state.all_nodes_between(first_map_entry, first_map_exit) | {first_map_exit})