class RedundantArrayCopying2(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes
        multiple instances of array B in the pattern A -> B. """

    _in_array = nodes.AccessNode('_')
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(RedundantArrayCopying2._in_array,
                                   RedundantArrayCopying2._out_array),
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[RedundantArrayCopying2._in_array]]
        out_array = graph.nodes()[candidate[RedundantArrayCopying2._out_array]]

        # Ensure that at least one other successor of in_array is an access
        # node referring to the same data as out_array (a redundant copy)
        found = 0
        for _, _, dst, _, _ in graph.out_edges(in_array):
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                found += 1

        return found > 0

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[RedundantArrayCopying2._out_array]]

        return 'Remove ' + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(RedundantArrayCopying2._in_array)
        out_array = gnode(RedundantArrayCopying2._out_array)

        for e1 in graph.out_edges(in_array):
            dst = e1.dst
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                for e2 in graph.out_edges(dst):
                    graph.add_edge(out_array, None, e2.dst, e2.dst_conn,
                                   e2.data)
                    graph.remove_edge(e2)
                graph.remove_edge(e1)
                graph.remove_node(dst)

    def modifies_graph(self):
        return True
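

# Usage sketch (illustrative only, not part of the transformation): driving
# this class through the pattern-matching optimizer, mirroring the strict-
# transformation loop used elsewhere in this module. The helper name and the
# local import path are assumptions of the example.
def _remove_redundant_copies(sdfg):
    from dace.transformation import optimizer  # assumed import path
    opt = optimizer.SDFGOptimizer(sdfg, inplace=True)
    for match in opt.get_pattern_matches(strict=False):
        if isinstance(match, RedundantArrayCopying2):
            match.apply(sdfg.sdfg_list[match.sdfg_id])
            break  # graph changed; re-run pattern matching before continuing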
def output_node_for_array(state, data: str):
    for n in state.sink_nodes():
        if isinstance(n, nd.AccessNode):
            if n.data == data:
                return n
    return nd.AccessNode(data)
class RedundantArrayCopying3(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes
        multiple instances of array B in the pattern MapEntry -> B. """

    _arrays_removed = 0
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(RedundantArrayCopying3._map_entry,
                                   RedundantArrayCopying3._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_entry = graph.nodes()[candidate[RedundantArrayCopying3._map_entry]]
        out_array = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]

        # Ensure that at least one other successor of the map entry is an
        # access node referring to the same data as out_array
        found = 0
        for _, _, dst, _, _ in graph.out_edges(map_entry):
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                found += 1

        return found > 0

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        map_entry = gnode(RedundantArrayCopying3._map_entry)
        out_array = gnode(RedundantArrayCopying3._out_array)

        for e1 in graph.out_edges(map_entry):
            dst = e1.dst
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                for e2 in graph.out_edges(dst):
                    graph.add_edge(out_array, None, e2.dst, e2.dst_conn,
                                   e2.data)
                    graph.remove_edge(e2)
                graph.remove_edge(e1)
                graph.remove_node(dst)

        if Config.get_bool("debugprint"):
            RedundantArrayCopying3._arrays_removed += 1
def input_node_for_array(state, data: str):
    # If the node appears as one of the source nodes, return it first
    for n in state.source_nodes():
        if isinstance(n, nd.AccessNode):
            if n.data == data:
                return n
    # Otherwise, if the node is located elsewhere, return it
    for n in state.nodes():
        if isinstance(n, nd.AccessNode):
            if n.data == data:
                return n
    return nd.AccessNode(data)
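

# Usage note (illustrative): both helpers above are "fetch or create" lookups.
# Given a state and a data name they return an existing AccessNode for that
# data if one is present (preferring source/sink nodes respectively) and
# otherwise construct a fresh, not-yet-inserted AccessNode. The state and the
# array names below are hypothetical.
#
#   read_a = input_node_for_array(state, 'A')    # existing or new AccessNode
#   write_b = output_node_for_array(state, 'B')  # existing sink or new node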
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    tasklet = graph.nodes()[self.subgraph[StreamTransient._tasklet]]
    map_exit = graph.nodes()[self.subgraph[StreamTransient._map_exit]]
    outer_map_exit = graph.nodes()[self.subgraph[
        StreamTransient._outer_map_exit]]
    memlet = None
    edge = None
    for e in graph.out_edges(map_exit):
        memlet = e.data
        # TODO: What if there's more than one?
        if e.dst == outer_map_exit and isinstance(sdfg.arrays[memlet.data],
                                                  data.Stream):
            edge = e
            break
    tasklet_memlet = None
    for e in graph.out_edges(tasklet):
        tasklet_memlet = e.data
        if tasklet_memlet.data == memlet.data:
            break
    bbox = map_exit.map.range.bounding_box_size()
    bbox_approx = [symbolic.overapproximate(dim) for dim in bbox]
    dataname = memlet.data

    # Create the new node: Temporary stream and an access node
    newstream = sdfg.add_stream(
        'tile_' + dataname,
        sdfg.arrays[memlet.data].dtype,
        1,
        bbox_approx[0],
        [1],
        transient=True,
    )
    snode = nodes.AccessNode('tile_' + dataname)

    to_stream_mm = copy.deepcopy(memlet)
    to_stream_mm.data = snode.data
    tasklet_memlet.data = snode.data

    # Reconnect, assuming one edge to the stream
    graph.remove_edge(edge)
    graph.add_edge(map_exit, None, snode, None, to_stream_mm)
    graph.add_edge(snode, None, outer_map_exit, None, memlet)

    return
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    tasklet = graph.nodes()[self.subgraph[StreamTransient._tasklet]]
    map_exit = graph.nodes()[self.subgraph[StreamTransient._map_exit]]
    outer_map_exit = graph.nodes()[self.subgraph[
        StreamTransient._outer_map_exit]]
    memlet = None
    edge = None
    for e in graph.out_edges(tasklet):
        memlet = e.data
        # TODO: What if there's more than one?
        if e.dst == map_exit and e.data.wcr is not None:
            break
    out_memlet = None
    for e in graph.out_edges(map_exit):
        out_memlet = e.data
        if out_memlet.data == memlet.data:
            edge = e
            break
    dataname = memlet.data

    # Create a new node with the same size as the output
    newdata = sdfg.add_array('trans_' + dataname,
                             sdfg.arrays[memlet.data].shape,
                             sdfg.arrays[memlet.data].dtype,
                             transient=True)
    dnode = nodes.AccessNode('trans_' + dataname)
    to_data_mm = copy.deepcopy(memlet)
    to_data_mm.data = dnode.data
    to_data_mm.num_accesses = memlet.num_elements()
    to_exit_mm = copy.deepcopy(out_memlet)
    to_exit_mm.num_accesses = out_memlet.num_elements()
    memlet.data = dnode.data

    # Reconnect, assuming one edge to the new transient
    graph.remove_edge(edge)
    graph.add_edge(map_exit, edge.src_conn, dnode, None, to_data_mm)
    graph.add_edge(dnode, None, outer_map_exit, edge.dst_conn, to_exit_mm)

    return
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else. """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _reduce = nodes.Reduce('lambda: None', None)
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(MapReduceFusion._tasklet,
                                   MapReduceFusion._tmap_exit,
                                   MapReduceFusion._in_array,
                                   MapReduceFusion._reduce,
                                   MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]]
        tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]]

        # Make sure that the array is only accessed by the map and the reduce
        if any([
                src != tmap_exit
                for src, _, _, _, memlet in graph.in_edges(in_array)
        ]):
            return False
        if any([
                dest != reduce_node
                for _, _, dest, _, memlet in graph.out_edges(in_array)
        ]):
            return False

        tmem = next(e for e in graph.edges_between(tasklet, tmap_exit)
                    if e.data.data == in_array.data).data

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict and (len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
        ]) > 1 or in_array.data in sdfg.shared_transients()):
            return False

        # If memlet already has WCR and it is different from reduce node,
        # do not match
        if tmem.wcr is not None and tmem.wcr != reduce_node.wcr:
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        reduce = candidate[MapReduceFusion._reduce]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
        out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]]

        # Set nodes to remove according to the expression index
        nodes_to_remove = [in_array]
        nodes_to_remove.append(reduce_node)

        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        # Find which indices should be removed from new memlet
        input_edge = graph.in_edges(reduce_node)[0]
        axes = reduce_node.axes or list(range(len(input_edge.data.subset)))
        array_edge = graph.out_edges(reduce_node)[0]

        # Delete relevant edges and nodes
        graph.remove_nodes_from(nodes_to_remove)

        # Filter out reduced dimensions from subset
        filtered_subset = [
            dim for i, dim in enumerate(memlet_edge.data.subset)
            if i not in axes
        ]
        if len(filtered_subset) == 0:  # Output is a scalar
            filtered_subset = [0]

        # Modify edge from tasklet to map exit
        memlet_edge.data.data = out_array.data
        memlet_edge.data.wcr = reduce_node.wcr
        memlet_edge.data.wcr_identity = reduce_node.identity
        memlet_edge.data.subset = type(
            memlet_edge.data.subset)(filtered_subset)

        # Add edge from map exit to output array
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst,
            array_edge.dst_conn,
            Memlet(array_edge.data.data, array_edge.data.num_accesses,
                   array_edge.data.subset, array_edge.data.veclen,
                   reduce_node.wcr, reduce_node.identity))
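

# Illustrative sketch (standalone, not part of the transformation): how the
# reduced-axis filtering in apply() behaves on plain Python lists. The subset
# values below are made up; in the transformation they are dimensions of a
# memlet subset.
def _filter_reduced_dims(subset, axes):
    filtered = [dim for i, dim in enumerate(subset) if i not in axes]
    return filtered or [0]  # a fully reduced output collapses to a scalar

# _filter_reduced_dims(['i', 'j', 'k'], axes=[1]) -> ['i', 'k']
# _filter_reduced_dims(['i'], axes=[0])           -> [0]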
def add_indirection_subgraph(sdfg, graph, src, dst, memlet):
    """ Replaces the specified edge in the specified graph with a subgraph
        that implements indirection without nested AST memlet objects. """
    if not isinstance(memlet, astnodes._Memlet):
        raise TypeError("Expected memlet to be astnodes._Memlet")

    indirect_inputs = set()
    indirect_outputs = set()

    # Scheme for multi-array indirection:
    # 1. look for all arrays and accesses, create set of arrays+indices
    #    from which the index memlets will be constructed from
    # 2. each separate array creates a memlet, of which num_accesses = len(set)
    # 3. one indirection tasklet receives them all + original array and
    #    produces the right output index/range memlet

    #########################
    # Step 1
    accesses = OrderedDict()
    newsubset = dcpy(memlet.subset)
    for dimidx, dim in enumerate(memlet.subset):
        # Range/Index disambiguation
        direct_assignment = False
        if not isinstance(dim, tuple):
            dim = [dim]
            direct_assignment = True

        for i, r in enumerate(dim):
            for expr in sympy.preorder_traversal(r):
                if symbolic.is_sympy_userfunction(expr):
                    fname = expr.func.__name__
                    if fname not in accesses:
                        accesses[fname] = []

                    # Replace function with symbol (memlet local name to-be)
                    if expr.args in accesses[fname]:
                        aindex = accesses[fname].index(expr.args)
                        toreplace = 'index_' + fname + '_' + str(aindex)
                    else:
                        accesses[fname].append(expr.args)
                        toreplace = 'index_' + fname + '_' + str(
                            len(accesses[fname]) - 1)

                    if direct_assignment:
                        newsubset[dimidx] = r.subs(expr, toreplace)
                    else:
                        newsubset[dimidx][i] = r.subs(expr, toreplace)

    #########################
    # Step 2
    ind_inputs = {'__ind_' + memlet.local_name}
    ind_outputs = {'lookup'}
    # Add accesses to inputs
    for arrname, arr_accesses in accesses.items():
        for i in range(len(arr_accesses)):
            ind_inputs.add('index_%s_%d' % (arrname, i))

    tasklet = nd.Tasklet("Indirection", ind_inputs, ind_outputs)

    input_index_memlets = []
    for arrname, arr_accesses in accesses.items():
        arr = memlet.otherdeps[arrname]
        for i, access in enumerate(arr_accesses):
            # Memlet to load the indirection index
            indexMemlet = Memlet(arrname, 1, sbs.Indices(list(access)), 1)
            input_index_memlets.append(indexMemlet)
            graph.add_edge(src, None, tasklet, "index_%s_%d" % (arrname, i),
                           indexMemlet)

    #########################
    # Step 3
    # Create new tasklet that will perform the indirection
    indirection_ast = ast.parse("lookup = {arr}[{index}]".format(
        arr='__ind_' + memlet.local_name,
        index=', '.join([symbolic.symstr(s) for s in newsubset])))
    # Conserve line number of original indirection code
    tasklet.code = ast.copy_location(indirection_ast.body[0], memlet.ast)

    # Create transient variable to trigger the indirected load
    if memlet.num_accesses == 1:
        storage = sdfg.add_scalar('__' + memlet.local_name + '_value',
                                  memlet.data.dtype,
                                  transient=True)
    else:
        storage = sdfg.add_array('__' + memlet.local_name + '_value',
                                 memlet.data.dtype,
                                 storage=types.StorageType.Default,
                                 transient=True,
                                 shape=memlet.bounding_box_size())
    indirectRange = sbs.Range([(0, s - 1, 1) for s in storage.shape])
    dataNode = nd.AccessNode('__' + memlet.local_name + '_value')

    # Create memlet that depends on the full array that we look up in
    fullRange = sbs.Range([(0, s - 1, 1) for s in memlet.data.shape])
    fullMemlet = Memlet(memlet.dataname, memlet.num_accesses, fullRange,
                        memlet.veclen)
    graph.add_edge(src, None, tasklet, '__ind_' + memlet.local_name,
                   fullMemlet)

    # Memlet to store the final value into the transient, and to load it into
    # the tasklet that needs it
    indirectMemlet = Memlet('__' + memlet.local_name + '_value',
                            memlet.num_accesses, indirectRange, memlet.veclen)
    graph.add_edge(tasklet, 'lookup', dataNode, None, indirectMemlet)

    valueMemlet = Memlet('__' + memlet.local_name + '_value',
                         memlet.num_accesses, indirectRange, memlet.veclen)
    graph.add_edge(dataNode, None, dst, memlet.local_name, valueMemlet)
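

# Illustrative sketch (standalone, sympy only): how Step 1 above discovers
# indirect accesses such as A[B[i]]. Here B(i) stands for the user-function
# form in which an indirected access appears inside a subset expression; the
# names are made up for the example.
import sympy as _sympy_example


def _find_indirections(expr):
    """ Return {array name: set of argument tuples} for applied undefined
        functions found in the expression. """
    accesses = {}
    for sub in _sympy_example.preorder_traversal(expr):
        # AppliedUndef is how sympy represents calls to undefined functions
        if isinstance(sub, _sympy_example.core.function.AppliedUndef):
            accesses.setdefault(sub.func.__name__, set()).add(sub.args)
    return accesses

# Example: the subset expression B(i) + 1 yields {'B': {(i,)}}
# i = _sympy_example.Symbol('i'); B = _sympy_example.Function('B')
# _find_indirections(B(i) + 1) == {'B': {(i,)}}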
def apply(self, sdfg: sd.SDFG):
    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors
    input_nodes = []
    output_nodes = []
    global_code_nodes = [[] for _ in sdfg.nodes()]

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient == False):
                if (state.out_degree(node) > 0
                        and node.data not in input_nodes):
                    # Special case: nodes that lead to dynamic map ranges
                    # must stay on host
                    for e in state.out_edges(node):
                        last_edge = state.memlet_path(e)[-1]
                        if (isinstance(last_edge.dst, nodes.EntryNode)
                                and last_edge.dst_conn
                                and not last_edge.dst_conn.startswith('IN_')):
                            break
                    else:
                        input_nodes.append((node.data, node.desc(sdfg)))
                if (state.in_degree(node) > 0
                        and node.data not in output_nodes):
                    output_nodes.append((node.data, node.desc(sdfg)))
            elif isinstance(node, nodes.CodeNode) and sdict[node] is None:
                if not isinstance(node, nodes.EmptyTasklet):
                    global_code_nodes[i].append(node)

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None and e.data.wcr_identity is None:
                if (e.data.data not in input_nodes
                        and sdfg.arrays[e.data.data].transient == False):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in set(input_nodes):
        if isinstance(inode, data.Scalar):  # Scalars can remain on host
            continue
        newdesc = inode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + inodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[inodename] = name

    for onodename, onode in set(output_nodes):
        if onodename in cloned_arrays:
            continue
        newdesc = onode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + onodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[onodename] = name

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state

    excluded_copyin = self.exclude_copyin.split(',')

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, ed.InterstateEdge())

    for nname, desc in dtypes.deduplicate(input_nodes):
        if nname in excluded_copyin or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state

    excluded_copyout = self.exclude_copyout.split(',')

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, ed.InterstateEdge())

    for nname, desc in dtypes.deduplicate(output_nodes):
        if nname in excluded_copyout or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node,
                          nodes.AccessNode) and node.desc(sdfg).transient:
                nodedesc = node.desc(sdfg)

                # Special case: nodes that lead to dynamic map ranges must
                # stay on host
                if any(
                        isinstance(
                            state.memlet_path(e)[-1].dst, nodes.EntryNode)
                        for e in state.out_edges(node)):
                    continue

                if sdict[node] is None:
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = dtypes.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    if (self.toplevel_trans
                            and not isinstance(nodedesc, data.Stream)):
                        nodedesc.toplevel = True
                else:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = dtypes.StorageType.Register

    #######################################################
    # Step 5: Wrap free tasklets and nested SDFGs with a GPU map

    for state, gcodes in zip(sdfg.nodes(), global_code_nodes):
        for gcode in gcodes:
            if gcode.label in self.exclude_tasklets.split(','):
                continue
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=dtypes.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = set('IN_' + e.dst_conn for e in in_edges)
            me.out_connectors = set('OUT_' + e.dst_conn for e in in_edges)
            mx.in_connectors = set('IN_' + e.src_conn for e in out_edges)
            mx.out_connectors = set('OUT_' + e.src_conn for e in out_edges)

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.EmptyMemlet())

    #######################################################
    # Step 6: Change all top-level maps and Reduce nodes to GPU schedule

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, (nodes.EntryNode, nodes.Reduce)):
                if sdict[node] is None:
                    node.schedule = dtypes.ScheduleType.GPU_Device
                elif (isinstance(node, nodes.EntryNode)
                      and self.sequential_innermaps):
                    node.schedule = dtypes.ScheduleType.Sequential

    #######################################################
    # Step 7: Introduce copy-out if data used in outgoing interstate edges

    for state in list(sdfg.nodes()):
        arrays_used = set()
        for e in sdfg.out_edges(state):
            # Used arrays = intersection between symbols and cloned arrays
            arrays_used.update(
                set(e.data.condition_symbols())
                & set(cloned_arrays.keys()))

        # Create a state and copy out used arrays
        if len(arrays_used) > 0:
            co_state = sdfg.add_state(state.label + '_icopyout')

            # Reconnect outgoing edges to after interim copyout state
            for e in sdfg.out_edges(state):
                nxutil.change_edge_src(sdfg, state, co_state)
            # Add unconditional edge to interim state
            sdfg.add_edge(state, co_state, ed.InterstateEdge())

            # Add copy-out nodes
            for nname in arrays_used:
                desc = sdfg.arrays[nname]
                src_array = nodes.AccessNode(cloned_arrays[nname],
                                             debuginfo=desc.debuginfo)
                dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
                co_state.add_node(src_array)
                co_state.add_node(dst_array)
                co_state.add_nedge(
                    src_array, dst_array,
                    memlet.Memlet.from_array(dst_array.data,
                                             dst_array.desc(sdfg)))

    #######################################################
    # Step 8: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    sdfg.apply_strict_transformations()
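

# Illustrative sketch (assumed helper, not part of this file): the "dynamic
# map range" test used in Steps 0 and 4 above, factored out for clarity. An
# access node feeds a dynamic map range when the last edge of one of its
# memlet paths ends at a map entry on a connector that is not an 'IN_*' data
# connector.
def _feeds_dynamic_map_range(state, node):
    for e in state.out_edges(node):
        last_edge = state.memlet_path(e)[-1]
        if (isinstance(last_edge.dst, nodes.EntryNode) and last_edge.dst_conn
                and not last_edge.dst_conn.startswith('IN_')):
            return True
    return False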
class TensorflowRedundantArray(pm.Transformation):
    """ Implements the redundant array removal transformation, applied
        to remove ReadVariableOps and control dependencies. """

    _arrays_removed = 0
    _in_array = nodes.AccessNode("_")
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(TensorflowRedundantArray._in_array,
                                   TensorflowRedundantArray._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        in_array = graph.nodes()[candidate[TensorflowRedundantArray._in_array]]
        out_array = graph.nodes()[candidate[
            TensorflowRedundantArray._out_array]]

        # Just to be sure, check for the OP name in the out array
        if not ("ReadVariable" in out_array.data
                or "control_dependency" in out_array.data):
            return False

        # Make sure that the candidate is a transient variable
        if not in_array.desc(sdfg).transient:
            return False

        # Make sure that both arrays are using the same storage location
        if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage:
            return False

        # Only apply if arrays are of same shape (no need to modify subset)
        if len(in_array.desc(sdfg).shape) != len(
                out_array.desc(sdfg).shape) or any(
                    i != o for i, o in zip(
                        in_array.desc(sdfg).shape,
                        out_array.desc(sdfg).shape)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[
            TensorflowRedundantArray._out_array]]

        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        in_array = gnode(TensorflowRedundantArray._in_array)
        out_array = gnode(TensorflowRedundantArray._out_array)

        for e in graph.out_edges(out_array):
            # Modify all outgoing edges to point to in_array
            path = graph.memlet_tree(e)
            for pe in path:
                if pe.data.data == out_array.data:
                    pe.data.data = in_array.data

            # Preemptively add edge from in_array to out_array's adjacent
            # nodes.
            new_memlet = e.data
            new_memlet.data = in_array.data
            graph.add_edge(in_array, e.src_conn, e.dst, e.dst_conn, new_memlet)
            graph.remove_edge(e)

        try:
            assert len(graph.in_edges(out_array)) == 1
        except AssertionError:
            print("Multiple in-edges for ", str(out_array))
        e = graph.in_edges(out_array)[0]
        graph.remove_edge(e)

        # Finally, remove out_array node
        graph.remove_node(out_array)
        if Config.get_bool("debugprint"):
            TensorflowRedundantArray._arrays_removed += 1
class MapFusion(pattern_matching.Transformation):
    """ Implements the MapFusion transformation.
        It will check for all patterns MapExit -> AccessNode -> MapEntry, and
        based on the following rules, fuse them and remove the transient in
        between. There are several possibilities of what it does to this
        transient in between.

        Essentially, if there is some other place in the SDFG where it is
        required, or if it is not a transient, then it will not be removed.
        In such a case, it will be linked to the MapExit node of the new
        fused map.

        Rules for fusing maps:
          0. The map range of the second map should be a permutation of the
             first map range.
          1. Each of the access nodes that are adjacent to the first map exit
             should have an edge to the second map entry. If it doesn't, then
             the second map entry should not be reachable from this access
             node.
          2. Any node that has a wcr from the first map exit should not be
             adjacent to the second map entry.
          3. Access pattern for the access nodes in the second map should be
             the same permutation of the map parameters as the map ranges of
             the two maps. Alternatively, this access node should not be
             adjacent to the first map entry.
    """

    _first_map_exit = nodes.ExitNode()
    _some_array = nodes.AccessNode("_")
    _second_map_entry = nodes.EntryNode()

    @staticmethod
    def annotates_memlets():
        return False

    @staticmethod
    def expressions():
        return [
            nxutil.node_path_graph(
                MapFusion._first_map_exit,
                MapFusion._some_array,
                MapFusion._second_map_entry,
            )
        ]

    @staticmethod
    def find_permutation(first_map: nodes.Map,
                         second_map: nodes.Map) -> Union[List[int], None]:
        """ Find permutation between two map ranges.
            @param first_map: First map.
            @param second_map: Second map.
            @return: None if no such permutation exists, otherwise a list of
                     indices L such that L[x]'th parameter of second map has
                     the same range as x'th parameter of the first map.
        """
        result = []

        if len(first_map.range) != len(second_map.range):
            return None

        # Match map ranges with reduce ranges
        for i, tmap_rng in enumerate(first_map.range):
            found = False
            for j, rng in enumerate(second_map.range):
                if tmap_rng == rng and j not in result:
                    result.append(j)
                    found = True
                    break
            if not found:
                break

        # Ensure all map ranges matched
        if len(result) != len(first_map.range):
            return None

        return result

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        first_map_exit = graph.nodes()[candidate[MapFusion._first_map_exit]]
        first_map_entry = graph.entry_node(first_map_exit)
        second_map_entry = graph.nodes()[candidate[
            MapFusion._second_map_entry]]

        for _in_e in graph.in_edges(first_map_exit):
            if _in_e.data.wcr is not None:
                for _out_e in graph.out_edges(second_map_entry):
                    if _out_e.data.data == _in_e.data.data:
                        # wcr is on a node that is used in the second map,
                        # quit
                        return False

        # Check whether there is a pattern map -> access -> map.
        intermediate_nodes = set()
        intermediate_data = set()
        for _, _, dst, _, _ in graph.out_edges(first_map_exit):
            if isinstance(dst, nodes.AccessNode):
                intermediate_nodes.add(dst)
                intermediate_data.add(dst.data)
            else:
                return False

        # Check map ranges
        perm = MapFusion.find_permutation(first_map_entry.map,
                                          second_map_entry.map)
        if perm is None:
            return False

        # Create a dict that maps parameters of the first map to those of the
        # second map.
        params_dict = {}
        for _index, _param in enumerate(first_map_entry.map.params):
            params_dict[_param] = second_map_entry.map.params[perm[_index]]

        out_memlets = [e.data for e in graph.in_edges(first_map_exit)]

        # Check that input set of second map is provided by the output set
        # of the first map, or other unrelated maps
        for _, _, _, _, second_memlet in graph.out_edges(second_map_entry):
            # Memlets that do not come from one of the intermediate arrays
            if second_memlet.data not in intermediate_data:
                # however, if intermediate_data eventually leads to
                # second_memlet.data, need to fail.
                for _n in intermediate_nodes:
                    source_node = _n  # graph.find_node(_n.data)
                    destination_node = graph.find_node(second_memlet.data)
                    # NOTE: Assumes graph has networkx version
                    if destination_node in nx.descendants(
                            graph._nx, source_node):
                        return False
                continue

            provided = False
            for first_memlet in out_memlets:
                if first_memlet.data != second_memlet.data:
                    continue

                # If there is an equivalent subset, it is provided
                expected_second_subset = []
                for _tup in first_memlet.subset:
                    new_tuple = []
                    if isinstance(_tup, symbolic.symbol):
                        new_tuple = symbolic.symbol(params_dict[str(_tup)])
                    elif isinstance(_tup, (list, tuple)):
                        for _sym in _tup:
                            if isinstance(_sym, symbolic.symbol):
                                new_tuple.append(
                                    symbolic.symbol(params_dict[str(_sym)]))
                            else:
                                new_tuple.append(_sym)
                        new_tuple = tuple(new_tuple)
                    else:
                        new_tuple = _tup
                    expected_second_subset.append(new_tuple)
                if expected_second_subset == list(second_memlet.subset):
                    provided = True
                    break

            # If none of the output memlets of the first map provide the info,
            # fail.
            if provided is False:
                return False

        # Success
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        first_exit = graph.nodes()[candidate[MapFusion._first_map_exit]]
        second_entry = graph.nodes()[candidate[MapFusion._second_map_entry]]

        return " -> ".join(entry.map.label + ": " + str(entry.map.params)
                           for entry in [first_exit, second_entry])

    def apply(self, sdfg):
        """
            This method applies the mapfusion transformation.
            Other than the removal of the second map entry node (SME), and the
            first map exit (FME) node, it has the following side effects:

            1.  Any transient adjacent to both FME and SME with degree = 2
                will be removed. The tasklets that use/produce it shall be
                connected directly with a scalar/new transient (if the
                dataflow is more than a single scalar).

            2.  If this transient is adjacent to FME and SME and has other
                uses, it will be adjacent to the new map exit post fusion.
                Tasklet -> Tasklet edges will ALSO be added as mentioned
                above.

            3.  If an access node is adjacent to FME but not SME, it will be
                adjacent to the new map exit post fusion.

            4.  If an access node is adjacent to SME but not FME, it will be
                adjacent to the new map entry node post fusion.
        """
        graph = sdfg.nodes()[self.state_id]
        first_exit = graph.nodes()[self.subgraph[MapFusion._first_map_exit]]
        first_entry = graph.entry_node(first_exit)
        second_entry = graph.nodes()[self.subgraph[
            MapFusion._second_map_entry]]
        second_exit = graph.exit_nodes(second_entry)[0]

        intermediate_nodes = set()
        for _, _, dst, _, _ in graph.out_edges(first_exit):
            intermediate_nodes.add(dst)
            assert isinstance(dst, nodes.AccessNode)

        # Check if an access node refers to non transient memory, or transient
        # is used at another location (cannot erase)
        do_not_erase = set()
        for node in intermediate_nodes:
            if sdfg.arrays[node.data].transient is False:
                do_not_erase.add(node)
            else:
                # If array is used anywhere else in this state.
                num_occurrences = len([
                    n for n in graph.nodes()
                    if isinstance(n, nodes.AccessNode) and n.data == node.data
                ])
                if num_occurrences > 1:
                    return False

                for edge in graph.in_edges(node):
                    if edge.src != first_exit:
                        do_not_erase.add(node)
                        break
                else:
                    for edge in graph.out_edges(node):
                        if edge.dst != second_entry:
                            do_not_erase.add(node)
                            break

        # Find permutation between first and second scopes
        if first_entry.map.params != second_entry.map.params:
            perm = MapFusion.find_permutation(first_entry.map,
                                              second_entry.map)
            params_dict = {}
            for _index, _param in enumerate(first_entry.map.params):
                params_dict[_param] = second_entry.map.params[perm[_index]]

            # Hopefully replaces (in memlets and tasklet) the second scope map
            # indices with the permuted first map indices
            second_scope = graph.scope_subgraph(second_entry)
            for _firstp, _secondp in params_dict.items():
                replace(second_scope, _secondp, _firstp)

        ######## Isolate First MapExit node ###########
        for _edge in graph.in_edges(first_exit):
            __some_str = _edge.data.data
            _access_node = graph.find_node(__some_str)
            # all outputs of first_exit are in intermediate_nodes set, so all
            # inputs to first_exit should also be!
            if _access_node not in do_not_erase:
                _new_dst = None
                _new_dst_conn = None
                # look at the second map entry out-edges to get the new
                # destination
                for _e in graph.out_edges(second_entry):
                    if _e.data.data == _access_node.data:
                        _new_dst = _e.dst
                        _new_dst_conn = _e.dst_conn
                        break
                if _new_dst is None:
                    # Access node is not even used in the second map
                    graph.remove_node(_access_node)
                    continue
                if _edge.data.data == _access_node.data and isinstance(
                        _edge._src, nodes.AccessNode):
                    _edge.data.data = _edge._src.data
                    _edge.data.subset = "0"
                    graph.add_edge(
                        _edge._src,
                        _edge.src_conn,
                        _new_dst,
                        _new_dst_conn,
                        dcpy(_edge.data),
                    )
                else:
                    if _edge.data.subset.num_elements() == 1:
                        # We will add a scalar
                        local_name = "__s%d_n%d%s_n%d%s" % (
                            self.state_id,
                            graph.node_id(_edge._src),
                            _edge.src_conn,
                            graph.node_id(_edge._dst),
                            _edge.dst_conn,
                        )
                        local_node = sdfg.add_scalar(
                            local_name,
                            dtype=_access_node.desc(graph).dtype,
                            toplevel=False,
                            transient=True,
                            storage=dtypes.StorageType.Register,
                        )
                        _edge.data.data = (
                            local_name)  # graph.add_access(local_name).data
                        _edge.data.subset = "0"
                        graph.add_edge(
                            _edge._src,
                            _edge.src_conn,
                            _new_dst,
                            _new_dst_conn,
                            dcpy(_edge.data),
                        )
                    else:
                        # We will add a transient of size = memlet subset
                        # size
                        local_name = "__s%d_n%d%s_n%d%s" % (
                            self.state_id,
                            graph.node_id(_edge._src),
                            _edge.src_conn,
                            graph.node_id(_edge._dst),
                            _edge.dst_conn,
                        )
                        local_node = graph.add_transient(
                            local_name,
                            _edge.data.subset.size(),
                            dtype=_access_node.desc(graph).dtype,
                            toplevel=False,
                        )
                        _edge.data.data = (
                            local_name)  # graph.add_access(local_name).data
                        _edge.data.subset = ",".join([
                            "0:" + str(_s) for _s in _edge.data.subset.size()
                        ])
                        graph.add_edge(
                            _edge._src,
                            _edge.src_conn,
                            local_node,
                            None,
                            dcpy(_edge.data),
                        )
                        graph.add_edge(local_node, None, _new_dst,
                                       _new_dst_conn, dcpy(_edge.data))
                graph.remove_edge(_edge)
                #### Isolate this node #####
                for _in_e in graph.in_edges(_access_node):
                    graph.remove_edge(_in_e)
                for _out_e in graph.out_edges(_access_node):
                    graph.remove_edge(_out_e)
                graph.remove_node(_access_node)
            else:
                # _access_node will become an output of the second map exit
                for _out_e in graph.out_edges(first_exit):
                    if _out_e.data.data == _access_node.data:
                        graph.add_edge(
                            second_exit,
                            None,
                            _out_e._dst,
                            _out_e.dst_conn,
                            dcpy(_out_e.data),
                        )
                        graph.remove_edge(_out_e)
                        break
                else:
                    raise AssertionError(
                        "No out-edge was found that leads to {}".format(
                            _access_node))
                graph.add_edge(_edge._src, _edge.src_conn, second_exit, None,
                               dcpy(_edge.data))
                ### If the second map needs this node then link the connector
                # that generated this to the place where it is needed, with a
                # temp transient/scalar for memlet to be generated
                for _out_e in graph.out_edges(second_entry):
                    if _out_e.data.data == _access_node.data:
                        if _edge.data.subset.num_elements() == 1:
                            # We will add a scalar
                            local_name = "__s%d_n%d%s_n%d%s" % (
                                self.state_id,
                                graph.node_id(_edge._src),
                                _edge.src_conn,
                                graph.node_id(_edge._dst),
                                _edge.dst_conn,
                            )
                            local_node = sdfg.add_scalar(
                                local_name,
                                dtype=_access_node.desc(graph).dtype,
                                storage=dtypes.StorageType.Register,
                                toplevel=False,
                                transient=True,
                            )
                            _edge.data.data = (
                                local_name
                            )  # graph.add_access(local_name).data
                            _edge.data.subset = "0"
                            graph.add_edge(
                                _edge._src,
                                _edge.src_conn,
                                _out_e._dst,
                                _out_e.dst_conn,
                                dcpy(_edge.data),
                            )
                        else:
                            # We will add a transient of size = memlet subset
                            # size
                            local_name = "__s%d_n%d%s_n%d%s" % (
                                self.state_id,
                                graph.node_id(_edge._src),
                                _edge.src_conn,
                                graph.node_id(_edge._dst),
                                _edge.dst_conn,
                            )
                            local_node = sdfg.add_transient(
                                local_name,
                                _edge.data.subset.size(),
                                dtype=_access_node.desc(graph).dtype,
                                toplevel=False,
                            )
                            _edge.data.data = (
                                local_name
                            )  # graph.add_access(local_name).data
                            _edge.data.subset = ",".join([
                                "0:" + str(_s)
                                for _s in _edge.data.subset.size()
                            ])
                            graph.add_edge(
                                _edge._src,
                                _edge.src_conn,
                                local_node,
                                None,
                                dcpy(_edge.data),
                            )
                            graph.add_edge(
                                local_node,
                                None,
                                _out_e._dst,
                                _out_e.dst_conn,
                                dcpy(_edge.data),
                            )
                        break
                graph.remove_edge(_edge)

        graph.remove_node(first_exit)  # Take a leap of faith

        ############# Isolate second_entry node ################
        for _edge in graph.in_edges(second_entry):
            _access_node = graph.find_node(_edge.data.data)
            if _access_node in intermediate_nodes:
                # Already handled above, just remove this
                graph.remove_edge(_edge)
                continue
            else:
                # This is an external input to the second map which will now
                # go through the first map.
                graph.add_edge(_edge._src, _edge.src_conn, first_entry, None,
                               dcpy(_edge.data))
                graph.remove_edge(_edge)
                for _out_e in graph.out_edges(second_entry):
                    if _out_e.data.data == _access_node.data:
                        graph.add_edge(
                            first_entry,
                            None,
                            _out_e._dst,
                            _out_e.dst_conn,
                            dcpy(_out_e.data),
                        )
                        graph.remove_edge(_out_e)
                        break
                else:
                    raise AssertionError(
                        "No out-edge was found that leads to {}".format(
                            _access_node))

        graph.remove_node(second_entry)

        # Fix scope exit
        second_exit.map = first_entry.map
        graph.fill_scope_connectors()
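

# Illustrative sketch (standalone, no SDFG needed): the matching logic used by
# MapFusion.find_permutation above, shown on plain (begin, end, step) tuples.
# The ranges and the helper name are made up for the example.
def _find_permutation_of_ranges(first_ranges, second_ranges):
    if len(first_ranges) != len(second_ranges):
        return None
    result = []
    for rng in first_ranges:
        for j, other in enumerate(second_ranges):
            if rng == other and j not in result:
                result.append(j)
                break
        else:
            return None
    return result

# _find_permutation_of_ranges([(0, 63, 1), (0, 127, 1)],
#                             [(0, 127, 1), (0, 63, 1)]) == [1, 0]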
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    outer_map_entry = graph.nodes()[self.subgraph[
        InLocalStorage._outer_map_entry]]
    inner_map_entry = graph.nodes()[self.subgraph[
        InLocalStorage._inner_map_entry]]

    array = self.array
    if array is None:
        array = graph.edges_between(outer_map_entry,
                                    inner_map_entry)[0].data.data

    original_edge = None
    invariant_memlet = None
    for edge in graph.in_edges(inner_map_entry):
        src = edge.src
        if src != outer_map_entry:
            continue
        memlet = edge.data
        if array == memlet.data:
            original_edge = edge
            invariant_memlet = memlet
            break
    if invariant_memlet is None:
        for edge in graph.in_edges(inner_map_entry):
            src = edge.src
            if src != outer_map_entry:
                continue
            original_edge = edge
            invariant_memlet = edge.data
            print('WARNING: Array %s not found! Using array %s instead.' %
                  (array, invariant_memlet.data))
            array = invariant_memlet.data
            break
    if invariant_memlet is None:
        raise KeyError('Array %s not found!' % array)

    new_data = sdfg.add_array('trans_' + invariant_memlet.data, [
        symbolic.overapproximate(r)
        for r in invariant_memlet.bounding_box_size()
    ],
                              sdfg.arrays[invariant_memlet.data].dtype,
                              transient=True)
    data_node = nodes.AccessNode('trans_' + invariant_memlet.data)

    to_data_mm = copy.deepcopy(invariant_memlet)
    from_data_mm = copy.deepcopy(invariant_memlet)
    from_data_mm.data = data_node.data
    offset = []
    for ind, r in enumerate(invariant_memlet.subset):
        offset.append(r[0])
        if isinstance(invariant_memlet.subset[ind], tuple):
            begin = invariant_memlet.subset[ind][0] - r[0]
            end = invariant_memlet.subset[ind][1] - r[0]
            step = invariant_memlet.subset[ind][2]
            from_data_mm.subset[ind] = (begin, end, step)
        else:
            from_data_mm.subset[ind] -= r[0]
    to_data_mm.other_subset = copy.deepcopy(from_data_mm.subset)

    # Reconnect, assuming one edge into the inner map for this array
    graph.remove_edge(original_edge)
    graph.add_edge(outer_map_entry, original_edge.src_conn, data_node, None,
                   to_data_mm)
    graph.add_edge(data_node, None, inner_map_entry, original_edge.dst_conn,
                   from_data_mm)

    for _parent, _, _child, _, memlet in graph.bfs_edges(inner_map_entry,
                                                         reverse=False):
        if memlet.data != array:
            continue
        for ind, r in enumerate(memlet.subset):
            if isinstance(memlet.subset[ind], tuple):
                begin = r[0] - offset[ind]
                end = r[1] - offset[ind]
                step = r[2]
                memlet.subset[ind] = (begin, end, step)
            else:
                memlet.subset[ind] -= offset[ind]
        memlet.data = 'trans_' + invariant_memlet.data

    return
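

# Illustrative sketch (plain tuples, not part of the transformation): the
# offset normalization performed above. A subset such as (i, i + 7, 1) on the
# original array becomes (0, 7, 1) on the local 'trans_' array, and the
# per-dimension offset i is later subtracted from all interior memlets. The
# values are made up for the example.
def _normalize_range(rng):
    begin, end, step = rng
    return (begin - begin, end - begin, step), begin  # (local range, offset)

# _normalize_range((8, 15, 1)) == ((0, 7, 1), 8)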
def apply(self, sdfg):
    # Retrieve map entry and exit nodes.
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[MapToForLoop._map_entry]]
    map_exits = graph.exit_nodes(map_entry)

    loop_idx = map_entry.map.params[0]
    loop_from, loop_to, loop_step = map_entry.map.range[0]

    nested_sdfg = dace.SDFG(graph.label + '_' + map_entry.map.label)

    # Construct nested SDFG
    begin = nested_sdfg.add_state('begin')
    guard = nested_sdfg.add_state('guard')
    body = nested_sdfg.add_state('body')
    end = nested_sdfg.add_state('end')

    nested_sdfg.add_edge(
        begin, guard,
        edges.InterstateEdge(assignments={str(loop_idx): str(loop_from)}))
    nested_sdfg.add_edge(
        guard, body,
        edges.InterstateEdge(condition=str(loop_idx) + ' <= ' + str(loop_to)))
    nested_sdfg.add_edge(
        guard, end,
        edges.InterstateEdge(condition=str(loop_idx) + ' > ' + str(loop_to)))
    nested_sdfg.add_edge(
        body, guard,
        edges.InterstateEdge(assignments={
            str(loop_idx): str(loop_idx) + ' + ' + str(loop_step)
        }))

    # Add map contents
    map_subgraph = graph.scope_subgraph(map_entry)
    for node in map_subgraph.nodes():
        if node is not map_entry and node not in map_exits:
            body.add_node(node)
    for src, src_conn, dst, dst_conn, memlet in map_subgraph.edges():
        if src is not map_entry and dst not in map_exits:
            body.add_edge(src, src_conn, dst, dst_conn, memlet)

    # Reconnect inputs
    nested_in_data_nodes = {}
    nested_in_connectors = {}
    nested_in_memlets = {}
    for i, edge in enumerate(graph.in_edges(map_entry)):
        src, src_conn, dst, dst_conn, memlet = edge
        data_label = '_in_' + memlet.data
        memdata = sdfg.arrays[memlet.data]
        if isinstance(memdata, data.Array):
            data_array = sdfg.add_array(data_label, memdata.dtype, [
                symbolic.overapproximate(r)
                for r in memlet.bounding_box_size()
            ])
        elif isinstance(memdata, data.Scalar):
            data_array = sdfg.add_scalar(data_label, memdata.dtype)
        else:
            raise NotImplementedError()
        data_node = nodes.AccessNode(data_label)
        body.add_node(data_node)
        nested_in_data_nodes.update({i: data_node})
        nested_in_connectors.update({i: data_label})
        nested_in_memlets.update({i: memlet})
        for _, _, _, _, old_memlet in body.edges():
            if old_memlet.data == memlet.data:
                old_memlet.data = data_label
        # body.add_edge(data_node, None, dst, dst_conn, memlet)

    # Reconnect outputs
    nested_out_data_nodes = {}
    nested_out_connectors = {}
    nested_out_memlets = {}
    for map_exit in map_exits:
        for i, edge in enumerate(graph.out_edges(map_exit)):
            src, src_conn, dst, dst_conn, memlet = edge
            data_label = '_out_' + memlet.data
            memdata = sdfg.arrays[memlet.data]
            if isinstance(memdata, data.Array):
                data_array = sdfg.add_array(data_label, memdata.dtype, [
                    symbolic.overapproximate(r)
                    for r in memlet.bounding_box_size()
                ])
            elif isinstance(memdata, data.Scalar):
                data_array = sdfg.add_scalar(data_label, memdata.dtype)
            else:
                raise NotImplementedError()
            data_node = nodes.AccessNode(data_label)
            body.add_node(data_node)
            nested_out_data_nodes.update({i: data_node})
            nested_out_connectors.update({i: data_label})
            nested_out_memlets.update({i: memlet})
            for _, _, _, _, old_memlet in body.edges():
                if old_memlet.data == memlet.data:
                    old_memlet.data = data_label
            # body.add_edge(src, src_conn, data_node, None, memlet)

    # Add nested SDFG and reconnect it
    nested_node = graph.add_nested_sdfg(nested_sdfg, sdfg,
                                        set(nested_in_connectors.values()),
                                        set(nested_out_connectors.values()))
    for i, edge in enumerate(graph.in_edges(map_entry)):
        src, src_conn, dst, dst_conn, memlet = edge
        graph.add_edge(src, src_conn, nested_node, nested_in_connectors[i],
                       nested_in_memlets[i])
    for map_exit in map_exits:
        for i, edge in enumerate(graph.out_edges(map_exit)):
            src, src_conn, dst, dst_conn, memlet = edge
            graph.add_edge(nested_node, nested_out_connectors[i], dst,
                           dst_conn, nested_out_memlets[i])

    for src, src_conn, dst, dst_conn, memlet in graph.out_edges(map_entry):
        i = int(src_conn[4:]) - 1
        new_memlet = dcpy(memlet)
        new_memlet.data = nested_in_data_nodes[i].data
        body.add_edge(nested_in_data_nodes[i], None, dst, dst_conn, new_memlet)
    for map_exit in map_exits:
        for src, src_conn, dst, dst_conn, memlet in graph.in_edges(map_exit):
            i = int(dst_conn[3:]) - 1
            new_memlet = dcpy(memlet)
            new_memlet.data = nested_out_data_nodes[i].data
            body.add_edge(src, src_conn, nested_out_data_nodes[i], None,
                          new_memlet)

    for node in map_subgraph:
        graph.remove_node(node)
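

# Illustrative sketch: the state machine constructed above (begin -> guard ->
# body -> guard, guard -> end) corresponds to the following plain loop over
# the single map parameter. The helper and its arguments exist only for this
# example.
def _equivalent_loop(loop_from, loop_to, loop_step, body):
    i = loop_from              # begin:     loop_idx = loop_from
    while i <= loop_to:        # guard:     loop_idx <= loop_to
        body(i)                # body:      one map iteration
        i += loop_step         # back edge: loop_idx += loop_step
    # guard -> end is taken once loop_idx > loop_to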
def apply(self, sdfg):
    state = sdfg.nodes()[self.state_id]
    nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]]
    storage = self.storage

    for _, edge in enumerate(state.in_edges(nested_sdfg)):
        src, src_conn, dst, dst_conn, memlet = edge
        dataname = memlet.data
        memdata = sdfg.arrays[dataname]

        if isinstance(memdata, data.Array):
            new_data = sdfg.add_array(
                'device_' + dataname + '_in',
                memdata.dtype, [
                    symbolic.overapproximate(r)
                    for r in memlet.bounding_box_size()
                ],
                transient=True,
                storage=storage)
        elif isinstance(memdata, data.Scalar):
            new_data = sdfg.add_scalar('device_' + dataname + '_in',
                                       memdata.dtype,
                                       transient=True,
                                       storage=storage)
        else:
            raise NotImplementedError

        data_node = nodes.AccessNode('device_' + dataname + '_in')

        to_data_mm = dcpy(memlet)
        from_data_mm = dcpy(memlet)
        from_data_mm.data = 'device_' + dataname + '_in'
        offset = []
        for ind, r in enumerate(memlet.subset):
            offset.append(r[0])
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                from_data_mm.subset[ind] = (begin, end, step)
            else:
                from_data_mm.subset[ind] -= r[0]

        state.remove_edge(edge)
        state.add_edge(src, src_conn, data_node, None, to_data_mm)
        state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

    for _, edge in enumerate(state.out_edges(nested_sdfg)):
        src, src_conn, dst, dst_conn, memlet = edge
        dataname = memlet.data
        memdata = sdfg.arrays[dataname]

        if isinstance(memdata, data.Array):
            new_data = sdfg.add_array(
                'device_' + dataname + '_out',
                memdata.dtype, [
                    symbolic.overapproximate(r)
                    for r in memlet.bounding_box_size()
                ],
                transient=True,
                storage=storage)
        elif isinstance(memdata, data.Scalar):
            new_data = sdfg.add_scalar('device_' + dataname + '_out',
                                       memdata.dtype,
                                       transient=True,
                                       storage=storage)
        else:
            raise NotImplementedError

        data_node = nodes.AccessNode('device_' + dataname + '_out')

        to_data_mm = dcpy(memlet)
        from_data_mm = dcpy(memlet)
        to_data_mm.data = 'device_' + dataname + '_out'
        offset = []
        for ind, r in enumerate(memlet.subset):
            offset.append(r[0])
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                to_data_mm.subset[ind] = (begin, end, step)
            else:
                to_data_mm.subset[ind] -= r[0]

        state.remove_edge(edge)
        state.add_edge(src, src_conn, data_node, None, to_data_mm)
        state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

    # Change storage for all data inside nested SDFG to device.
    change_storage(nested_sdfg.sdfg, storage)
def apply(self, sdfg):
    graph = sdfg.nodes()[self.state_id]
    map_entry = graph.nodes()[self.subgraph[FPGATransformMap._map_entry]]
    map_entry.map._schedule = dtypes.ScheduleType.FPGA_Device

    # Find map exit nodes
    exit_nodes = graph.exit_nodes(map_entry)

    fpga_storage_types = [
        dtypes.StorageType.FPGA_Global,
        dtypes.StorageType.FPGA_Local,
        dtypes.StorageType.CPU_Pinned,
    ]

    #######################################################
    # Add FPGA copies of CPU arrays (i.e., not already on FPGA)

    # First, understand which arrays to clone
    all_out_edges = []
    for enode in exit_nodes:
        all_out_edges.extend(list(graph.out_edges(enode)))
    in_arrays_to_clone = set()
    out_arrays_to_clone = set()
    for e in graph.in_edges(map_entry):
        data_node = sd.find_input_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in fpga_storage_types:
            in_arrays_to_clone.add(data_node)
    for e in all_out_edges:
        data_node = sd.find_output_arraynode(graph, e)
        if data_node.desc(sdfg).storage not in fpga_storage_types:
            out_arrays_to_clone.add(data_node)

    # Second, create a FPGA clone of each array
    cloned_arrays = {}
    in_cloned_arraynodes = {}
    out_cloned_arraynodes = {}
    for array_node in in_arrays_to_clone:
        array = array_node.desc(sdfg)
        if array_node.data in cloned_arrays:
            pass
        elif 'fpga_' + array_node.data in sdfg.arrays:
            pass
        else:
            sdfg.add_array('fpga_' + array_node.data,
                           dtype=array.dtype,
                           shape=array.shape,
                           materialize_func=array.materialize_func,
                           transient=True,
                           storage=dtypes.StorageType.FPGA_Global,
                           allow_conflicts=array.allow_conflicts,
                           access_order=array.access_order,
                           strides=array.strides,
                           offset=array.offset)
            cloned_arrays[array_node.data] = 'fpga_' + array_node.data
        cloned_node = nodes.AccessNode('fpga_' + array_node.data)
        in_cloned_arraynodes[array_node.data] = cloned_node
    for array_node in out_arrays_to_clone:
        array = array_node.desc(sdfg)
        if array_node.data in cloned_arrays:
            pass
        elif 'fpga_' + array_node.data in sdfg.arrays:
            pass
        else:
            sdfg.add_array('fpga_' + array_node.data,
                           dtype=array.dtype,
                           shape=array.shape,
                           materialize_func=array.materialize_func,
                           transient=True,
                           storage=dtypes.StorageType.FPGA_Global,
                           allow_conflicts=array.allow_conflicts,
                           access_order=array.access_order,
                           strides=array.strides,
                           offset=array.offset)
            cloned_arrays[array_node.data] = 'fpga_' + array_node.data
        cloned_node = nodes.AccessNode('fpga_' + array_node.data)
        out_cloned_arraynodes[array_node.data] = cloned_node

    # Third, connect the cloned arrays to the originals
    # TODO(later): Shift indices and create only the necessary sub-arrays
    for array_name, node in in_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in graph.in_edges(map_entry):
            if edge.data.data == array_name:
                graph.remove_edge(edge)
                graph.add_edge(edge.src, None, node, None, edge.data)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(node, edge.src_conn, edge.dst, edge.dst_conn,
                               newmemlet)
    for array_name, node in out_cloned_arraynodes.items():
        graph.add_node(node)
        for edge in all_out_edges:
            if edge.data.data == array_name:
                graph.remove_edge(edge)
                graph.add_edge(node, None, edge.dst, None, edge.data)
                newmemlet = copy.copy(edge.data)
                newmemlet.data = node.data
                graph.add_edge(edge.src, edge.src_conn, node, edge.dst_conn,
                               newmemlet)

    # Fourth, replace memlet arrays as necessary
    scope_subgraph = graph.scope_subgraph(map_entry)
    for edge in scope_subgraph.edges():
        if (edge.data.data is not None and edge.data.data in cloned_arrays):
            edge.data.data = cloned_arrays[edge.data.data]
def apply(self, sdfg: sd.SDFG):
    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors
    input_nodes = []
    output_nodes = []
    global_code_nodes = [[] for _ in sdfg.nodes()]

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient == False):
                if (state.out_degree(node) > 0
                        and node.data not in input_nodes):
                    input_nodes.append((node.data, node.desc(sdfg)))
                if (state.in_degree(node) > 0
                        and node.data not in output_nodes):
                    output_nodes.append((node.data, node.desc(sdfg)))
            elif isinstance(node, nodes.CodeNode) and sdict[node] is None:
                if not isinstance(node, nodes.EmptyTasklet):
                    global_code_nodes[i].append(node)

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None and e.data.wcr_identity is None:
                if (e.data.data not in input_nodes
                        and sdfg.arrays[e.data.data].transient == False):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in input_nodes:
        newdesc = inode.clone()
        newdesc.storage = types.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + inodename, newdesc)
        cloned_arrays[inodename] = 'gpu_' + inodename

    for onodename, onode in output_nodes:
        if onodename in cloned_arrays:
            continue
        newdesc = onode.clone()
        newdesc.storage = types.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + onodename, newdesc)
        cloned_arrays[onodename] = 'gpu_' + onodename

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, ed.InterstateEdge())

    for nname, desc in input_nodes:
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, ed.InterstateEdge())

    for nname, desc in output_nodes:
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node,
                          nodes.AccessNode) and node.desc(sdfg).transient:
                nodedesc = node.desc(sdfg)
                if sdict[node] is None:
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = types.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    if self.toplevel_trans:
                        nodedesc.toplevel = True
                else:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = types.StorageType.Register

    #######################################################
    # Step 5: Wrap free tasklets and nested SDFGs with a GPU map

    for state, gcodes in zip(sdfg.nodes(), global_code_nodes):
        for gcode in gcodes:
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=types.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = set('IN_' + e.dst_conn for e in in_edges)
            me.out_connectors = set('OUT_' + e.dst_conn for e in in_edges)
            mx.in_connectors = set('IN_' + e.src_conn for e in out_edges)
            mx.out_connectors = set('OUT_' + e.src_conn for e in out_edges)

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.EmptyMemlet())

    #######################################################
    # Step 6: Change all top-level maps to GPU maps

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, nodes.EntryNode):
                if sdict[node] is None:
                    node.schedule = types.ScheduleType.GPU_Device
                elif self.sequential_innermaps:
                    node.schedule = types.ScheduleType.Sequential

    #######################################################
    # Step 7: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    opt = optimizer.SDFGOptimizer(sdfg, inplace=True)
    fusions = 0
    arrays = 0
    options = [
        match for match in opt.get_pattern_matches(strict=True)
        if isinstance(match, (StateFusion, RedundantArray))
    ]
    while options:
        ssdfg = sdfg.sdfg_list[options[0].sdfg_id]
        options[0].apply(ssdfg)
        ssdfg.validate()
        if isinstance(options[0], StateFusion):
            fusions += 1
        if isinstance(options[0], RedundantArray):
            arrays += 1

        options = [
            match for match in opt.get_pattern_matches(strict=True)
            if isinstance(match, (StateFusion, RedundantArray))
        ]

    if Config.get_bool('debugprint') and (fusions > 0 or arrays > 0):
        print('Automatically applied {} strict state fusions and removed'
              ' {} redundant arrays.'.format(fusions, arrays))
class DoubleBuffering(pattern_matching.Transformation): """ Implements the double buffering pattern, which pipelines reading and processing data by creating a second copy of the memory. In particular, the transformation takes a 1D map and all internal (directly connected) transients, adds an additional dimension of size 2, and turns the map into a for loop that processes and reads the data in a double-buffered manner. Other memlets will not be transformed. """ _map_entry = nodes.MapEntry(nodes.Map('_', [], [])) _transient = nodes.AccessNode('_') @staticmethod def expressions(): return [ nxutil.node_path_graph(DoubleBuffering._map_entry, DoubleBuffering._transient) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[DoubleBuffering._map_entry]] transient = graph.nodes()[candidate[DoubleBuffering._transient]] # Only one dimensional maps are allowed if len(map_entry.map.params) != 1: return False # Verify the map can be transformed to a for-loop if not MapToForLoop.can_be_applied( graph, {MapToForLoop._map_entry: candidate[DoubleBuffering._map_entry]}, expr_index, sdfg, strict): return False # Verify that all directly-connected internal access nodes point to # transient arrays first = True for edge in graph.out_edges(map_entry): if isinstance(edge.dst, nodes.AccessNode): desc = sdfg.arrays[edge.dst.data] if not isinstance(desc, data.Array) or not desc.transient: return False else: # To avoid duplicate matches, only match the first transient if first and edge.dst != transient: return False first = False return True @staticmethod def match_to_str(graph, candidate): return str(graph.node(candidate[DoubleBuffering._map_entry])) def apply(self, sdfg: sd.SDFG): graph: sd.SDFGState = sdfg.nodes()[self.state_id] map_entry = graph.node(self.subgraph[DoubleBuffering._map_entry]) map_param = map_entry.map.params[0] # Assuming one dimensional ############################## # Change condition of loop to one fewer iteration (so that the # final one reads from the last buffer) map_rstart, map_rend, map_rstride = map_entry.map.range[0] map_rend = symbolic.pystr_to_symbolic('(%s) - (%s)' % (map_rend, map_rstride)) map_entry.map.range = subsets.Range([(map_rstart, map_rend, map_rstride)]) ############################## # Gather transients to modify transients_to_modify = set(edge.dst.data for edge in graph.out_edges(map_entry) if isinstance(edge.dst, nodes.AccessNode)) # Add dimension to transients and modify memlets for transient in transients_to_modify: desc: data.Array = sdfg.arrays[transient] # Using non-python syntax to ensure properties change desc.strides = [desc.total_size] + list(desc.strides) desc.shape = [2] + list(desc.shape) desc.offset = [0] + list(desc.offset) desc.total_size = desc.total_size * 2 ############################## # Modify memlets to use map parameter as buffer index modified_subsets = [] # Store modified memlets for final state for edge in graph.scope_subgraph(map_entry).edges(): if edge.data.data in transients_to_modify: edge.data.subset = self._modify_memlet(sdfg, edge.data.subset, edge.data.data) modified_subsets.append(edge.data.subset) else: # Could be other_subset path = graph.memlet_path(edge) src_node = path[0].src dst_node = path[-1].dst # other_subset could be None. 
In that case, recreate from array dataname = None if (isinstance(src_node, nodes.AccessNode) and src_node.data in transients_to_modify): dataname = src_node.data elif (isinstance(dst_node, nodes.AccessNode) and dst_node.data in transients_to_modify): dataname = dst_node.data if dataname is not None: subset = (edge.data.other_subset or subsets.Range.from_array(sdfg.arrays[dataname])) edge.data.other_subset = self._modify_memlet( sdfg, subset, dataname) modified_subsets.append(edge.data.other_subset) ############################## # Turn map into for loop map_to_for = MapToForLoop(self.sdfg_id, self.state_id, { MapToForLoop._map_entry: self.subgraph[DoubleBuffering._map_entry] }, self.expr_index) nsdfg_node, nstate = map_to_for.apply(sdfg) ############################## # Gather node copies and remove memlets edges_to_replace = [] for node in nstate.source_nodes(): for edge in nstate.out_edges(node): if (isinstance(edge.dst, nodes.AccessNode) and edge.dst.data in transients_to_modify): edges_to_replace.append(edge) nstate.remove_edge(edge) if nstate.out_degree(node) == 0: nstate.remove_node(node) ############################## # Add initial reads to initial nested state initial_state: sd.SDFGState = nsdfg_node.sdfg.start_state initial_state.set_label('%s_init' % map_entry.map.label) for edge in edges_to_replace: initial_state.add_node(edge.src) rnode = edge.src wnode = initial_state.add_write(edge.dst.data) initial_state.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn, copy.deepcopy(edge.data)) # All instances of the map parameter in this state become the loop start sd.replace(initial_state, map_param, map_rstart) # Initial writes go to the first buffer sd.replace(initial_state, '__dace_db_param', '0') ############################## # Modify main state's memlets # Divide by loop stride new_expr = symbolic.pystr_to_symbolic('(%s / %s) %% 2' % (map_param, map_rstride)) sd.replace(nstate, '__dace_db_param', new_expr) ############################## # Add the main state's contents to the last state, modifying # memlets appropriately. 
final_state: sd.SDFGState = nsdfg_node.sdfg.sink_nodes()[0] final_state.set_label('%s_final_computation' % map_entry.map.label) dup_nstate = copy.deepcopy(nstate) final_state.add_nodes_from(dup_nstate.nodes()) for e in dup_nstate.edges(): final_state.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, e.data) ############################## # Add reads into next buffers to main state for edge in edges_to_replace: rnode = copy.deepcopy(edge.src) nstate.add_node(rnode) wnode = nstate.add_write(edge.dst.data) new_memlet = copy.deepcopy(edge.data) if new_memlet.data in transients_to_modify: new_memlet.other_subset = self._replace_in_subset( new_memlet.other_subset, map_param, '(%s + %s)' % (map_param, map_rstride)) else: new_memlet.subset = self._replace_in_subset( new_memlet.subset, map_param, '(%s + %s)' % (map_param, map_rstride)) nstate.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn, new_memlet) nstate.set_label('%s_double_buffered' % map_entry.map.label) # Divide by loop stride new_expr = symbolic.pystr_to_symbolic('((%s / %s) + 1) %% 2' % (map_param, map_rstride)) sd.replace(nstate, '__dace_db_param', new_expr) @staticmethod def _modify_memlet(sdfg, subset, data_name): desc = sdfg.arrays[data_name] if len(subset) == len(desc.shape): # Already in the right shape, modify new dimension subset = list(subset)[1:] new_subset = subsets.Range([('__dace_db_param', '__dace_db_param', 1)] + list(subset)) return new_subset @staticmethod def _replace_in_subset(subset, string_or_symbol, new_string_or_symbol): new_subset = copy.deepcopy(subset) repldict = { symbolic.pystr_to_symbolic(string_or_symbol): symbolic.pystr_to_symbolic(new_string_or_symbol) } for i, dim in enumerate(new_subset): try: new_subset[i] = tuple(d.subs(repldict) for d in dim) except TypeError: new_subset[i] = (dim.subs(repldict) if symbolic.issymbolic(dim) else dim) return new_subset
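##############################
# Illustrative sketch of the schedule DoubleBuffering produces, in plain
# Python: iteration i computes on buffer (i / stride) % 2 while the next tile
# is prefetched into buffer ((i / stride) + 1) % 2, with a prologue read
# before the loop and an epilogue compute on the last buffer (the indices are
# normalized to start at 0 here). `load_tile` and `compute_tile` are
# hypothetical callbacks standing in for the copy-in memlet and the map body.
def _double_buffered_schedule(rstart, rend, rstride, load_tile, compute_tile):
    buffers = [None, None]
    buffers[0] = load_tile(rstart)                       # prologue read
    i = rstart
    while i <= rend - rstride:                           # one fewer iteration
        db = (i - rstart) // rstride % 2
        buffers[(db + 1) % 2] = load_tile(i + rstride)   # prefetch next tile
        compute_tile(buffers[db], i)                     # compute current tile
        i += rstride
    compute_tile(buffers[(i - rstart) // rstride % 2], i)  # epilogue compute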
class RedundantArray(pm.Transformation): """ Implements the redundant array removal transformation, applied when a transient array is copied to and from (to another array), but never used anywhere else. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ nxutil.node_path_graph(RedundantArray._in_array, RedundantArray._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantArray._in_array]] out_array = graph.nodes()[candidate[RedundantArray._out_array]] # Ensure out degree is one (only one target, which is out_array) if graph.out_degree(in_array) != 1: return False # Make sure that the candidate is a transient variable if not in_array.desc(sdfg).transient: return False # Make sure that both arrays are using the same storage location if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage: return False # Find occurrences in this and other states occurrences = [] for state in sdfg.nodes(): occurrences.extend([ n for n in state.nodes() if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == in_array.desc(sdfg) ]) if len(occurrences) > 1: return False # Only apply if arrays are of same shape (no need to modify subset) if len(in_array.desc(sdfg).shape) != len( out_array.desc(sdfg).shape) or any(i != o for i, o in zip( in_array.desc(sdfg).shape, out_array.desc(sdfg).shape)): return False if strict: # In strict mode, make sure the memlet covers the removed array edge = graph.edges_between(in_array, out_array)[0] if any(m != a for m, a in zip(edge.data.subset.size(), in_array.desc(sdfg).shape)): return False return True @staticmethod def match_to_str(graph, candidate): in_array = graph.nodes()[candidate[RedundantArray._in_array]] return "Remove " + str(in_array) def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] graph = sdfg.nodes()[self.state_id] in_array = gnode(RedundantArray._in_array) out_array = gnode(RedundantArray._out_array) for e in graph.in_edges(in_array): # Modify all incoming edges to point to out_array path = graph.memlet_path(e) for pe in path: if pe.data.data == in_array.data: pe.data.data = out_array.data # Redirect edge to out_array graph.remove_edge(e) graph.add_edge(e.src, e.src_conn, out_array, e.dst_conn, e.data) # Finally, remove in_array node graph.remove_node(in_array) # TODO: Should the array be removed from the SDFG? # del sdfg.arrays[in_array] if Config.get_bool("debugprint"): RedundantArray._arrays_removed += 1
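##############################
# Illustrative usage sketch: like the strict-transformation loop in Step 7 of
# the GPU pass above, transformations in this module can be applied greedily
# through the pattern-matching optimizer. The import path and the helper name
# are assumptions; the match/apply/validate loop mirrors the code above.
def _apply_greedily(sdfg, xform_type):
    """ Repeatedly applies `xform_type` (e.g. RedundantArray) until no match
        remains, returning the number of applications (sketch). """
    from dace.transformation import optimizer  # assumed import path
    opt = optimizer.SDFGOptimizer(sdfg, inplace=True)
    applied = 0
    matches = [m for m in opt.get_pattern_matches(strict=True)
               if isinstance(m, xform_type)]
    while matches:
        target = sdfg.sdfg_list[matches[0].sdfg_id]
        matches[0].apply(target)
        target.validate()
        applied += 1
        matches = [m for m in opt.get_pattern_matches(strict=True)
                   if isinstance(m, xform_type)]
    return applied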
class MapWCRFusion(pm.Transformation): """ Implements the map expanded-reduce fusion transformation. Fuses a map with an immediately following reduction, where the array between the map and the reduction is not used anywhere else, and the reduction is divided to two maps with a WCR, denoting partial reduction. """ _tasklet = nodes.Tasklet('_') _tmap_exit = nodes.MapExit(nodes.Map("", [], [])) _in_array = nodes.AccessNode('_') _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_in_tasklet = nodes.Tasklet('_') _rmap_in_cr = nodes.MapExit(nodes.Map("", [], [])) _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_out_exit = nodes.MapExit(nodes.Map("", [], [])) _out_array = nodes.AccessNode('_') @staticmethod def expressions(): return [ # Map, then partial reduction of axes nxutil.node_path_graph( MapWCRFusion._tasklet, MapWCRFusion._tmap_exit, MapWCRFusion._in_array, MapWCRFusion._rmap_out_entry, MapWCRFusion._rmap_in_entry, MapWCRFusion._rmap_in_tasklet, MapWCRFusion._rmap_in_cr, MapWCRFusion._rmap_out_exit, MapWCRFusion._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): tmap_exit = graph.nodes()[candidate[MapWCRFusion._tmap_exit]] in_array = graph.nodes()[candidate[MapWCRFusion._in_array]] rmap_entry = graph.nodes()[candidate[MapWCRFusion._rmap_out_entry]] # Make sure that the array is only accessed by the map and the reduce if any([ src != tmap_exit for src, _, _, _, memlet in graph.in_edges(in_array) ]): return False if any([ dest != rmap_entry for _, _, dest, _, memlet in graph.out_edges(in_array) ]): return False # Make sure that there is a reduction in the second map rmap_cr = graph.nodes()[candidate[MapWCRFusion._rmap_in_cr]] reduce_edge = graph.in_edges(rmap_cr)[0] if reduce_edge.data.wcr is None: return False # (strict) Make sure that the transient is not accessed anywhere else # in this state or other states if strict and (len([ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.data == in_array.data ]) > 1 or in_array.data in sdfg.shared_transients()): return False # Verify that reduction ranges match tasklet map tout_memlet = graph.in_edges(in_array)[0].data rin_memlet = graph.out_edges(in_array)[0].data if tout_memlet.subset != rin_memlet.subset: return False return True @staticmethod def match_to_str(graph, candidate): tasklet = candidate[MapWCRFusion._tasklet] map_exit = candidate[MapWCRFusion._tmap_exit] reduce = candidate[MapWCRFusion._rmap_in_cr] return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce]) def apply(self, sdfg): graph = sdfg.node(self.state_id) # To apply, collapse the second map and then fuse the two resulting maps map_collapse = MapCollapse( self.sdfg_id, self.state_id, { MapCollapse._outer_map_entry: self.subgraph[MapWCRFusion._rmap_out_entry], MapCollapse._inner_map_entry: self.subgraph[MapWCRFusion._rmap_in_entry] }, 0) map_entry, _ = map_collapse.apply(sdfg) map_fusion = MapFusion( self.sdfg_id, self.state_id, { MapFusion._first_map_exit: self.subgraph[MapWCRFusion._tmap_exit], MapFusion._second_map_entry: graph.node_id(map_entry) }, 0) map_fusion.apply(sdfg)
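##############################
# Illustrative sketch: a transformation can also be constructed directly from
# node IDs, the same way MapWCRFusion.apply builds MapCollapse and MapFusion
# above (constructor arguments: sdfg_id, state_id, subgraph dict, expr_index).
# The helper name and the `node_ids` argument are hypothetical; the caller is
# assumed to have located the pattern nodes itself.
def _fuse_map_with_wcr_reduction(sdfg, sdfg_id, state_id, node_ids):
    """ `node_ids` maps pattern-node attribute names ('_tasklet',
        '_tmap_exit', ...) to node IDs in the state (sketch of manual
        application, bypassing the matcher). """
    subgraph = {getattr(MapWCRFusion, name): nid
                for name, nid in node_ids.items()}
    MapWCRFusion(sdfg_id, state_id, subgraph, 0).apply(sdfg)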
class MergeArrays(pattern_matching.Transformation): """ Merge duplicate arrays connected to the same scope entry. """ _array1 = nodes.AccessNode("_") _array2 = nodes.AccessNode("_") _map_entry = nodes.EntryNode() @staticmethod def expressions(): # Matching # o o # | | # /======\ g = SDFGState() g.add_node(MergeArrays._array1) g.add_node(MergeArrays._array2) g.add_node(MergeArrays._map_entry) g.add_edge(MergeArrays._array1, None, MergeArrays._map_entry, None, memlet.EmptyMemlet()) g.add_edge(MergeArrays._array2, None, MergeArrays._map_entry, None, memlet.EmptyMemlet()) return [g] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): arr1_id = candidate[MergeArrays._array1] arr2_id = candidate[MergeArrays._array2] # Ensure both arrays contain the same data arr1 = graph.node(arr1_id) arr2 = graph.node(arr2_id) if arr1.data != arr2.data: return False # Ensure only arr1's node ID contains incoming edges if graph.in_degree(arr1) == 0 and graph.in_degree(arr2) > 0: return False # Ensure arr1 and arr2's node IDs are ordered (avoid duplicates) if (graph.in_degree(arr1) == 0 and graph.in_degree(arr2) == 0 and arr1_id >= arr2_id): return False map = graph.node(candidate[MergeArrays._map_entry]) # If arr1's connector leads directly to map, skip it if all( e.dst_conn and not e.dst_conn.startswith('IN_') for e in graph.edges_between(arr1, map)): return False if (any(e.dst != map for e in graph.out_edges(arr1)) or any(e.dst != map for e in graph.out_edges(arr2))): return False # Ensure arr1 and arr2 are the first two incoming nodes (avoid further # duplicates) all_source_nodes = set( graph.node_id(e.src) for e in graph.in_edges(map) if e.src != arr1 and e.src != arr2 and e.dst_conn and e.dst_conn.startswith('IN_') and graph.in_degree(e.src) == 0) if any(nid < arr1_id or nid < arr2_id for nid in all_source_nodes): return False return True @staticmethod def match_to_str(graph, candidate): arr = graph.node(candidate[MergeArrays._array1]) map = graph.node(candidate[MergeArrays._map_entry]) return '%s (%d, %d) -> %s' % (arr.data, candidate[MergeArrays._array1], candidate[MergeArrays._array2], map.label) def apply(self, sdfg): graph = sdfg.node(self.state_id) array = graph.node(self.subgraph[MergeArrays._array1]) map = graph.node(self.subgraph[MergeArrays._map_entry]) map_edge = next(e for e in graph.out_edges(array) if e.dst == map) result_connector = map_edge.dst_conn[3:] # Find all other incoming access nodes without incoming edges source_edges = [ e for e in graph.in_edges(map) if isinstance(e.src, nodes.AccessNode) and e.src.data == array.data and e.src != array and e.dst_conn and e.dst_conn.startswith('IN_') and graph.in_degree(e.src) == 0 ] # Modify connectors to point to first array connectors_to_remove = set() for e in source_edges: connector = e.dst_conn[3:] connectors_to_remove.add(connector) for inner_edge in graph.out_edges(map): if inner_edge.src_conn[4:] == connector: inner_edge._src_conn = 'OUT_' + result_connector # Remove other nodes from state graph.remove_nodes_from(set(e.src for e in source_edges)) # Remove connectors from scope entry map.in_connectors -= set('IN_' + c for c in connectors_to_remove) map.out_connectors -= set('OUT_' + c for c in connectors_to_remove)
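##############################
# Illustrative sketch of the duplicate-match guard in can_be_applied above:
# when both access nodes are sources, the unordered pair {arr1, arr2} would be
# reported twice by the matcher (once per ordering), so only the ordering with
# arr1_id < arr2_id is accepted. Plain-Python illustration with integer IDs:
def _deduplicate_symmetric_matches(candidate_id_pairs):
    """ Keeps one ordering per unordered pair of source-node IDs (sketch). """
    return [(a, b) for (a, b) in candidate_id_pairs if a < b]

# e.g. _deduplicate_symmetric_matches([(3, 5), (5, 3), (2, 7)]) == [(3, 5), (2, 7)]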
class MapFusion(pattern_matching.Transformation): """ Implements the MapFusion transformation. It wil check for all patterns MapExit -> AccessNode -> MapEntry, and based on the following rules, fuse them and remove the transient in between. There are several possibilities of what it does to this transient in between. Essentially, if there is some other place in the sdfg where it is required, or if it is not a transient, then it will not be removed. In such a case, it will be linked to the MapExit node of the new fused map. Rules for fusing maps: 0. The map range of the second map should be a permutation of the first map range. 1. Each of the access nodes that are adjacent to the first map exit should have an edge to the second map entry. If it doesn't, then the second map entry should not be reachable from this access node. 2. Any node that has a wcr from the first map exit should not be adjacent to the second map entry. 3. Access pattern for the access nodes in the second map should be the same permutation of the map parameters as the map ranges of the two maps. Alternatively, this access node should not be adjacent to the first map entry. """ _first_map_exit = nodes.ExitNode() _some_array = nodes.AccessNode("_") _second_map_entry = nodes.EntryNode() @staticmethod def annotates_memlets(): return False @staticmethod def expressions(): return [ nxutil.node_path_graph( MapFusion._first_map_exit, MapFusion._some_array, MapFusion._second_map_entry, ) ] @staticmethod def find_permutation(first_map: nodes.Map, second_map: nodes.Map) -> Union[List[int], None]: """ Find permutation between two map ranges. :param first_map: First map. :param second_map: Second map. :return: None if no such permutation exists, otherwise a list of indices L such that L[x]'th parameter of second map has the same range as x'th parameter of the first map. """ result = [] if len(first_map.range) != len(second_map.range): return None # Match map ranges with reduce ranges for i, tmap_rng in enumerate(first_map.range): found = False for j, rng in enumerate(second_map.range): if tmap_rng == rng and j not in result: result.append(j) found = True break if not found: break # Ensure all map ranges matched if len(result) != len(first_map.range): return None return result @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): first_map_exit = graph.nodes()[candidate[MapFusion._first_map_exit]] first_map_entry = graph.entry_node(first_map_exit) second_map_entry = graph.nodes()[candidate[ MapFusion._second_map_entry]] for _in_e in graph.in_edges(first_map_exit): if _in_e.data.wcr is not None: for _out_e in graph.out_edges(second_map_entry): if _out_e.data.data == _in_e.data.data: # wcr is on a node that is used in the second map, quit return False # Check whether there is a pattern map -> access -> map. intermediate_nodes = set() intermediate_data = set() for _, _, dst, _, _ in graph.out_edges(first_map_exit): if isinstance(dst, nodes.AccessNode): intermediate_nodes.add(dst) intermediate_data.add(dst.data) # If array is used anywhere else in this state. num_occurrences = len([ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.data == dst.data ]) if num_occurrences > 1: return False else: return False # Check map ranges perm = MapFusion.find_permutation(first_map_entry.map, second_map_entry.map) if perm is None: return False # Create a dict that maps parameters of the first map to those of the # second map. 
params_dict = {} for _index, _param in enumerate(first_map_entry.map.params): params_dict[_param] = second_map_entry.map.params[perm[_index]] out_memlets = [e.data for e in graph.in_edges(first_map_exit)] # Check that input set of second map is provided by the output set # of the first map, or other unrelated maps for _, _, _, _, second_memlet in graph.out_edges(second_map_entry): # Memlets that do not come from one of the intermediate arrays if second_memlet.data not in intermediate_data: # however, if intermediate_data eventually leads to # second_memlet.data, need to fail. for _n in intermediate_nodes: source_node = _n # graph.find_node(_n.data) destination_node = graph.find_node(second_memlet.data) # NOTE: Assumes graph has networkx version if destination_node in nx.descendants( graph._nx, source_node): return False continue provided = False for first_memlet in out_memlets: if first_memlet.data != second_memlet.data: continue # If there is an equivalent subset, it is provided expected_second_subset = [] for _tup in first_memlet.subset: new_tuple = [] if isinstance(_tup, symbolic.symbol): new_tuple = symbolic.symbol(params_dict[str(_tup)]) elif isinstance(_tup, (list, tuple)): for _sym in _tup: if (isinstance(_sym, symbolic.symbol) and str(_sym) in params_dict): new_tuple.append( symbolic.symbol(params_dict[str(_sym)])) else: new_tuple.append(_sym) new_tuple = tuple(new_tuple) else: new_tuple = _tup expected_second_subset.append(new_tuple) if expected_second_subset == list(second_memlet.subset): provided = True break # If none of the output memlets of the first map provide the info, # fail. if provided is False: return False # Success return True @staticmethod def match_to_str(graph, candidate): first_exit = graph.nodes()[candidate[MapFusion._first_map_exit]] second_entry = graph.nodes()[candidate[MapFusion._second_map_entry]] return " -> ".join(entry.map.label + ": " + str(entry.map.params) for entry in [first_exit, second_entry]) def apply(self, sdfg): """ This method applies the mapfusion transformation. Other than the removal of the second map entry node (SME), and the first map exit (FME) node, it has the following side effects: 1. Any transient adjacent to both FME and SME with degree = 2 will be removed. The tasklets that use/produce it shall be connected directly with a scalar/new transient (if the dataflow is more than a single scalar) 2. If this transient is adjacent to FME and SME and has other uses, it will be adjacent to the new map exit post fusion. Tasklet-> Tasklet edges will ALSO be added as mentioned above. 3. If an access node is adjacent to FME but not SME, it will be adjacent to new map exit post fusion. 4. If an access node is adjacent to SME but not FME, it will be adjacent to the new map entry node post fusion. 
""" graph = sdfg.nodes()[self.state_id] first_exit = graph.nodes()[self.subgraph[MapFusion._first_map_exit]] first_entry = graph.entry_node(first_exit) second_entry = graph.nodes()[self.subgraph[ MapFusion._second_map_entry]] second_exit = graph.exit_nodes(second_entry)[0] intermediate_nodes = set() for _, _, dst, _, _ in graph.out_edges(first_exit): intermediate_nodes.add(dst) assert isinstance(dst, nodes.AccessNode) # Check if an access node refers to non transient memory, or transient # is used at another location (cannot erase) do_not_erase = set() for node in intermediate_nodes: if sdfg.arrays[node.data].transient is False: do_not_erase.add(node) else: for edge in graph.in_edges(node): if edge.src != first_exit: do_not_erase.add(node) break else: for edge in graph.out_edges(node): if edge.dst != second_entry: do_not_erase.add(node) break # Find permutation between first and second scopes perm = MapFusion.find_permutation(first_entry.map, second_entry.map) params_dict = {} for index, param in enumerate(first_entry.map.params): params_dict[param] = second_entry.map.params[perm[index]] # Replaces (in memlets and tasklet) the second scope map # indices with the permuted first map indices. # This works in two passes to avoid problems when e.g., exchanging two # parameters (instead of replacing (j,i) and (i,j) to (j,j) and then # i,i). second_scope = graph.scope_subgraph(second_entry) for firstp, secondp in params_dict.items(): if firstp != secondp: replace(second_scope, secondp, '__' + secondp + '_fused') for firstp, secondp in params_dict.items(): if firstp != secondp: replace(second_scope, '__' + secondp + '_fused', firstp) # Isolate First exit node ############################ edges_to_remove = set() nodes_to_remove = set() for edge in graph.in_edges(first_exit): memlet_path = graph.memlet_path(edge) edge_index = next(i for i, e in enumerate(memlet_path) if e == edge) access_node = memlet_path[-1].dst if access_node not in do_not_erase: out_edges = [ e for e in graph.out_edges(access_node) if e.dst == second_entry ] # In this transformation, there can only be one edge to the # second map assert len(out_edges) == 1 # Get source connector to the second map connector = out_edges[0].dst_conn[3:] new_dst = None new_dst_conn = None # Look at the second map entry out-edges to get the new # destination for _e in graph.out_edges(second_entry): if _e.src_conn[4:] == connector: new_dst = _e.dst new_dst_conn = _e.dst_conn break if new_dst is None: # Access node is not used in the second map nodes_to_remove.add(access_node) continue # If the source is an access node, modify the memlet to point # to it if (isinstance(edge.src, nodes.AccessNode) and edge.data.data != edge.src.data): edge.data.data = edge.src.data edge.data.subset = ("0" if edge.data.other_subset is None else edge.data.other_subset) edge.data.other_subset = None else: # Add a transient scalar/array self.fuse_nodes(sdfg, graph, edge, new_dst, new_dst_conn) edges_to_remove.add(edge) # Remove transient node between the two maps nodes_to_remove.add(access_node) else: # The case where intermediate array node cannot be removed # Node will become an output of the second map exit out_e = memlet_path[edge_index + 1] conn = second_exit.next_connector() graph.add_edge( second_exit, 'OUT_' + conn, out_e.dst, out_e.dst_conn, dcpy(out_e.data), ) second_exit.add_out_connector('OUT_' + conn) graph.add_edge(edge.src, edge.src_conn, second_exit, 'IN_' + conn, dcpy(edge.data)) second_exit.add_in_connector('IN_' + conn) edges_to_remove.add(out_e) # If the 
second map needs this node, link the connector # that generated this to the place where it is needed, with a # temp transient/scalar for memlet to be generated for out_e in graph.out_edges(second_entry): second_memlet_path = graph.memlet_path(out_e) source_node = second_memlet_path[0].src if source_node == access_node: self.fuse_nodes(sdfg, graph, edge, out_e.dst, out_e.dst_conn) edges_to_remove.add(edge) ### # First scope exit is isolated and can now be safely removed for e in edges_to_remove: graph.remove_edge(e) graph.remove_nodes_from(nodes_to_remove) graph.remove_node(first_exit) # Isolate second_entry node ########################### for edge in graph.in_edges(second_entry): memlet_path = graph.memlet_path(edge) edge_index = next(i for i, e in enumerate(memlet_path) if e == edge) access_node = memlet_path[0].src if access_node in intermediate_nodes: # Already handled above, can be safely removed graph.remove_edge(edge) continue # This is an external input to the second map which will now go # through the first map. conn = first_entry.next_connector() graph.add_edge(edge.src, edge.src_conn, first_entry, 'IN_' + conn, dcpy(edge.data)) first_entry.add_in_connector('IN_' + conn) graph.remove_edge(edge) out_e = memlet_path[edge_index + 1] graph.add_edge( first_entry, 'OUT_' + conn, out_e.dst, out_e.dst_conn, dcpy(out_e.data), ) first_entry.add_out_connector('OUT_' + conn) graph.remove_edge(out_e) ### # Second node is isolated and can now be safely removed graph.remove_node(second_entry) # Fix scope exit to point to the right map second_exit.map = first_entry.map def fuse_nodes(self, sdfg, graph, edge, new_dst, new_dst_conn): """ Fuses two nodes via memlets and possibly transient arrays. """ memlet_path = graph.memlet_path(edge) access_node = memlet_path[-1].dst local_name = "__s%d_n%d%s_n%d%s" % ( self.state_id, graph.node_id(edge.src), edge.src_conn, graph.node_id(edge.dst), edge.dst_conn, ) # Add intermediate memory between subgraphs. If a scalar, # uses direct connection. If an array, adds a transient node if edge.data.subset.num_elements() == 1: sdfg.add_scalar( local_name, dtype=access_node.desc(graph).dtype, transient=True, storage=dtypes.StorageType.Register, ) edge.data.data = local_name edge.data.subset = "0" local_node = edge.src src_connector = edge.src_conn else: sdfg.add_transient(local_name, edge.data.subset.size(), dtype=access_node.desc(graph).dtype) local_node = graph.add_access(local_name) src_connector = None edge.data.data = local_name edge.data.subset = ",".join( ["0:" + str(s) for s in edge.data.subset.size()]) # Add edge that leads to transient node graph.add_edge( edge.src, edge.src_conn, local_node, None, dcpy(edge.data), ) ######## # Add edge that leads to the second node graph.add_edge(local_node, src_connector, new_dst, new_dst_conn, dcpy(edge.data))
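##############################
# Illustrative sketch of MapFusion.find_permutation with plain tuples instead
# of Map objects: each dimension of the first range is matched to an unused,
# equal dimension of the second range; if any dimension cannot be matched, the
# ranges are not permutations of one another and fusion is rejected.
def _find_range_permutation(first_range, second_range):
    """ first_range/second_range: lists of (begin, end, stride) tuples. """
    if len(first_range) != len(second_range):
        return None
    result = []
    for rng in first_range:
        for j, other in enumerate(second_range):
            if rng == other and j not in result:
                result.append(j)
                break
        else:
            return None
    return result

# e.g. _find_range_permutation([(0, 9, 1), (0, 4, 1)],
#                              [(0, 4, 1), (0, 9, 1)]) == [1, 0]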
class RedundantArrayCopying(pm.Transformation): """ Implements the redundant array removal transformation. Removes array B in pattern A -> B -> A. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _med_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ nxutil.node_path_graph( RedundantArrayCopying._in_array, RedundantArrayCopying._med_array, RedundantArrayCopying._out_array, ) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantArrayCopying._in_array]] med_array = graph.nodes()[candidate[RedundantArrayCopying._med_array]] out_array = graph.nodes()[candidate[RedundantArrayCopying._out_array]] # Ensure out degree is one (only one target, which is out_array) if graph.out_degree(in_array) != 1: return False # Make sure that the candidate is a transient variable # if not in_array.desc.transient: # return False # Make sure that both arrays are using the same storage location if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage: return False # Find occurrences in this and other states # (This could be relaxed) # occurrences = [] # for state in sdfg.nodes(): # occurrences.extend([ # n for n in state.nodes() # if isinstance(n, nodes.AccessNode) and n.desc == med_array.desc # ]) # if len(occurrences) > 1: # return False # Only apply if arrays are of same shape (no need to modify memlet subset) if len(in_array.desc(sdfg).shape) != len( out_array.desc(sdfg).shape) or any(i != o for i, o in zip( in_array.desc(sdfg).shape, out_array.desc(sdfg).shape)): return False return True @staticmethod def match_to_str(graph, candidate): med_array = graph.nodes()[candidate[RedundantArrayCopying._med_array]] return "Remove " + str(med_array) def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] graph = sdfg.nodes()[self.state_id] in_array = gnode(RedundantArrayCopying._in_array) med_array = gnode(RedundantArrayCopying._med_array) out_array = gnode(RedundantArrayCopying._out_array) med_edges = len(graph.out_edges(med_array)) med_out_edges = 0 for med_e in graph.out_edges(med_array): if (isinstance(med_e.dst, nodes.AccessNode) and med_e.dst.data == out_array.data): # Modify all outcoming edges to point to in_array for out_e in graph.out_edges(med_e.dst): path = graph.memlet_path(out_e) for pe in path: if pe.data.data == out_array.data: pe.data.data = in_array.data # Redirect edge to in_array graph.remove_edge(out_e) graph.add_edge(in_array, out_e.src_conn, out_e.dst, out_e.dst_conn, out_e.data) # Remove out_array for e in graph.edges_between(med_e, med_e.dst): graph.remove_edge(e) graph.remove_node(med_e.dst) med_out_edges += 1 # Finally, med_array node if med_array.desc(sdfg).transient and med_edges == med_out_edges: for e in graph.edges_between(in_array, med_array): graph.remove_edge(e) graph.remove_node(med_array) if Config.get_bool("debugprint"): RedundantArrayCopying._arrays_removed += 1
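##############################
# Illustrative helper for the edge-redirection idiom used in apply() above:
# every outgoing edge of `old_src` is re-created with `new_src` as its source
# and then removed, using only the graph calls seen in this file. The helper
# name is hypothetical, and the renaming of data along memlet paths that the
# transformation also performs is omitted here.
def _redirect_out_edges(graph, old_src, new_src):
    for e in list(graph.out_edges(old_src)):
        graph.remove_edge(e)
        graph.add_edge(new_src, e.src_conn, e.dst, e.dst_conn, e.data)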
def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] inner_map_exit = graph.nodes()[self.subgraph[ OutLocalStorage._inner_map_exit]] outer_map_exit = graph.nodes()[self.subgraph[ OutLocalStorage._outer_map_exit]] original_edge = None invariant_memlet = None array = None for edge in graph.in_edges(outer_map_exit): src = edge.src if src != inner_map_exit: continue memlet = edge.data original_edge = edge invariant_memlet = memlet array = memlet.data break new_data = sdfg.add_array( graph.label + '_trans_' + invariant_memlet.data, [ symbolic.overapproximate(r) for r in invariant_memlet.bounding_box_size() ], sdfg.arrays[invariant_memlet.data].dtype, transient=True) data_node = nodes.AccessNode(graph.label + '_trans_' + invariant_memlet.data) data_node.setzero = True from_data_mm = copy.deepcopy(invariant_memlet) to_data_mm = copy.deepcopy(invariant_memlet) to_data_mm.data = data_node.data offset = [] for ind, r in enumerate(invariant_memlet.subset): offset.append(r[0]) if isinstance(invariant_memlet.subset[ind], tuple): begin = invariant_memlet.subset[ind][0] - r[0] end = invariant_memlet.subset[ind][1] - r[0] step = invariant_memlet.subset[ind][2] to_data_mm.subset[ind] = (begin, end, step) else: to_data_mm.subset[ind] -= r[0] # Reconnect, assuming one edge to the stream graph.remove_edge(original_edge) graph.add_edge(inner_map_exit, original_edge.src_conn, data_node, None, to_data_mm) graph.add_edge(data_node, None, outer_map_exit, original_edge.dst_conn, from_data_mm) for _parent, _, _child, _, memlet in graph.bfs_edges(inner_map_exit, reverse=True): if isinstance(_child, nodes.CodeNode): break if memlet.data != array: continue for ind, r in enumerate(memlet.subset): if isinstance(memlet.subset[ind], tuple): begin = r[0] - offset[ind] end = r[1] - offset[ind] step = r[2] memlet.subset[ind] = (begin, end, step) else: memlet.subset[ind] -= offset[ind] memlet.data = graph.label + '_trans_' + invariant_memlet.data return
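##############################
# Illustrative sketch of the rebasing performed above: subsets that referred
# to the original array are shifted so that the bounding-box start of the
# invariant memlet becomes the origin of the new local transient. Plain-tuple
# version (the helper name is hypothetical):
def _rebase_subset(subset, offset):
    """ subset: list of (begin, end, stride) tuples or scalar indices;
        offset: per-dimension start indices to subtract. """
    rebased = []
    for dim, off in zip(subset, offset):
        if isinstance(dim, tuple):
            begin, end, step = dim
            rebased.append((begin - off, end - off, step))
        else:
            rebased.append(dim - off)
    return rebased

# e.g. _rebase_subset([(2, 5, 1), 7], [2, 7]) == [(0, 3, 1), 0]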
class MapReduceFusion(pm.Transformation): """ Implements the map-reduce-fusion transformation. Fuses a map with an immediately following reduction, where the array between the map and the reduction is not used anywhere else. """ _tasklet = nodes.Tasklet('_') _tmap_exit = nodes.MapExit(nodes.Map("", [], [])) _in_array = nodes.AccessNode('_') _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_in_tasklet = nodes.Tasklet('_') _rmap_in_cr = nodes.MapExit(nodes.Map("", [], [])) _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_out_exit = nodes.MapExit(nodes.Map("", [], [])) _out_array = nodes.AccessNode('_') _reduce = nodes.Reduce('lambda: None', None) @staticmethod def expressions(): return [ # Map, then reduce of all axes nxutil.node_path_graph( MapReduceFusion._tasklet, MapReduceFusion._tmap_exit, MapReduceFusion._in_array, MapReduceFusion._rmap_in_entry, MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr, MapReduceFusion._out_array), # Map, then partial reduction of axes nxutil.node_path_graph( MapReduceFusion._tasklet, MapReduceFusion._tmap_exit, MapReduceFusion._in_array, MapReduceFusion._rmap_out_entry, MapReduceFusion._rmap_in_entry, MapReduceFusion._rmap_in_tasklet, MapReduceFusion._rmap_in_cr, MapReduceFusion._rmap_out_exit, MapReduceFusion._out_array), # Map, then reduce node nxutil.node_path_graph( MapReduceFusion._tasklet, MapReduceFusion._tmap_exit, MapReduceFusion._in_array, MapReduceFusion._reduce, MapReduceFusion._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[candidate[MapReduceFusion._in_array]] if expr_index == 0: # Reduce without outer map rmap_entry = graph.nodes()[candidate[ MapReduceFusion._rmap_in_entry]] # rmap_in_entry = rmap_entry elif expr_index == 1: # Reduce with outer map rmap_entry = graph.nodes()[candidate[ MapReduceFusion._rmap_out_entry]] # rmap_in_entry = graph.nodes()[candidate[ # MapReduceFusion._rmap_in_entry]] else: # Reduce node rmap_entry = graph.nodes()[candidate[MapReduceFusion._reduce]] # Make sure that the array is only accessed by the map and the reduce if any([ src != tmap_exit for src, _, _, _, memlet in graph.in_edges(in_array) ]): return False if any([ dest != rmap_entry for _, _, dest, _, memlet in graph.out_edges(in_array) ]): return False # Make sure that there is a reduction in the second map if expr_index < 2: rmap_cr = graph.nodes()[candidate[MapReduceFusion._rmap_in_cr]] reduce_edge = graph.in_edges(rmap_cr)[0] if reduce_edge.data.wcr is None: return False # Make sure that the transient is not accessed by other states # if garr.get_unique_name() in cgen_state.sdfg.shared_transients(): # return False # reduce_inarr = reduce.in_array # reduce_outarr = reduce.out_array # reduce_inslice = reduce.inslice # reduce_outslice = reduce.outslice # insize = cgen_state.var_sizes[reduce_inarr] # outsize = cgen_state.var_sizes[reduce_outarr] # Currently only supports full-range arrays # TODO(later): Support fusion of partial reductions and refactor slice/subarray handling #if not nxutil.fullrange(reduce_inslice, insize) or \ # not nxutil.fullrange(reduce_outslice, outsize): # return False # Verify acceses from tasklet through MapExit #already_found = False #for _src, _, _dest, _, memlet in graph.in_edges(map_exit): # if isinstance(memlet.subset, subsets.Indices): # # Make sure that only one value is reduced at a time # if memlet.data == in_array.desc: # if already_found: # return 
False # already_found = True ## Find axes after reduction #indims = len(reduce.inslice) #axis_after_reduce = [None] * indims #ctr = 0 #for i in range(indims): # if reduce.axes is not None and i in reduce.axes: # axis_after_reduce[i] = None # else: # axis_after_reduce[i] = ctr # ctr += 1 ## Match map ranges with reduce ranges #curaxis = 0 #for dim, var in enumerate(memlet.subset): # # Make sure that indices are direct symbols # #if not isinstance(symbolic.pystr_to_symbolic(var), sympy.Symbol): # # return False # perm = None # for i, mapvar in enumerate(map_exit.map.params): # if symbolic.pystr_to_symbolic(mapvar) == var: # perm = i # break # if perm is None: # If symbol is not found in map range # return False # # Make sure that map ranges match output slice after reduction # map_range = map_exit.map.range[perm] # if map_range[0] != 0: # return False # Disallow start from middle # if map_range[2] is not None and map_range[2] != 1: # return False # Disallow skip # if reduce.axes is not None and dim not in reduce.axes: # if map_range[1] != symbolic.pystr_to_symbolic( # reduce.outslice[axis_after_reduce[dim]][1]): # return False # Range check (output axis) # else: # if map_range[1] != symbolic.pystr_to_symbolic(reduce.inslice[dim][1]): # return False # Range check (reduction axis) # Verify that reduction ranges match tasklet map tout_memlet = graph.in_edges(in_array)[0].data rin_memlet = graph.out_edges(in_array)[0].data if tout_memlet.subset != rin_memlet.subset: return False return True @staticmethod def match_to_str(graph, candidate): tasklet = candidate[MapReduceFusion._tasklet] map_exit = candidate[MapReduceFusion._tmap_exit] if len(candidate) == 5: # Expression 2 reduce = candidate[MapReduceFusion._reduce] else: reduce = candidate[MapReduceFusion._rmap_in_cr] return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce]) @staticmethod def find_memlet_map_permutation(memlet: Memlet, map: nodes.Map): perm = [None] * len(memlet.subset) indices = set() for i, dim in enumerate(memlet.subset): for j, mapdim in enumerate(map.params): if symbolic.pystr_to_symbolic( mapdim) == dim and j not in indices: perm[i] = j indices.add(j) break return perm @staticmethod def find_permutation(tasklet_map: nodes.Map, red_outer_map: nodes.Map, red_inner_map: nodes.Map, tmem: Memlet): """ Find permutation between tasklet-exit memlet and tasklet map. 
""" result = [], [] assert len(tasklet_map.range) == len(red_inner_map.range) + len( red_outer_map.range) # Match map ranges with reduce ranges unavailable_ranges_out = set() unavailable_ranges_in = set() for i, tmap_rng in enumerate(tasklet_map.range): found = False for j, rng in enumerate(red_outer_map.range): if tmap_rng == rng and j not in unavailable_ranges_out: result[0].append(i) unavailable_ranges_out.add(j) found = True break if found: continue for j, rng in enumerate(red_inner_map.range): if tmap_rng == rng and j not in unavailable_ranges_in: result[1].append(i) unavailable_ranges_in.add(j) found = True break if not found: break # Ensure all map variables matched with reduce variables assert len(result[0]) + len(result[1]) == len(tasklet_map.range) # Returns ([outer map indices], [inner (CR) map indices]) return result @staticmethod def find_permutation_reduce(tasklet_map: nodes.Map, reduce_node: nodes.Reduce, graph: SDFGState, tmem: Memlet): in_memlet = graph.in_edges(reduce_node)[0].data out_memlet = graph.out_edges(reduce_node)[0].data assert len(tasklet_map.range) == in_memlet.subset.dims() # Find permutation between tasklet-exit memlet and tasklet map tmem_perm = MapReduceFusion.find_memlet_map_permutation( tmem, tasklet_map) mapred_perm = [] # Match map ranges with reduce ranges unavailable_ranges = set() for i, tmap_rng in enumerate(tasklet_map.range): found = False for j, in_rng in enumerate(in_memlet.subset): if tmap_rng == in_rng and j not in unavailable_ranges: mapred_perm.append(i) unavailable_ranges.add(j) found = True break if not found: break # Ensure all map variables matched with reduce variables assert len(tmem_perm) == len(tmem.subset) assert len(mapred_perm) == len(in_memlet.subset) # Prepare result from the two permutations and the reduction axes result = [] for i in range(len(mapred_perm)): if reduce_node.axes is None or i in reduce_node.axes: continue result.append(mapred_perm[tmem_perm[i]]) return result def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] expr_index = self.expr_index graph = sdfg.nodes()[self.state_id] tasklet = gnode(MapReduceFusion._tasklet) tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]] if expr_index == 0: # Reduce without outer map rmap_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_entry]] elif expr_index == 1: # Reduce with outer map rmap_out_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_out_entry]] rmap_out_exit = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_out_exit]] rmap_in_entry = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_entry]] rmap_tasklet = graph.nodes()[self.subgraph[ MapReduceFusion._rmap_in_tasklet]] if expr_index == 2: rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._reduce]] else: rmap_cr = graph.nodes()[self.subgraph[MapReduceFusion._rmap_in_cr]] out_array = gnode(MapReduceFusion._out_array) # Set nodes to remove according to the expression index nodes_to_remove = [in_array] if expr_index == 0: nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry)) elif expr_index == 1: nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_entry)) nodes_to_remove.append(gnode(MapReduceFusion._rmap_in_entry)) nodes_to_remove.append(gnode(MapReduceFusion._rmap_out_exit)) else: nodes_to_remove.append(gnode(MapReduceFusion._reduce)) # If no other edges lead to mapexit, remove it. 
Otherwise, keep # it and remove reduction incoming/outgoing edges if expr_index != 2 and len(graph.in_edges(tmap_exit)) == 1: nodes_to_remove.append(tmap_exit) memlet_edge = None for edge in graph.in_edges(tmap_exit): if edge.data.data == in_array.data: memlet_edge = edge break if memlet_edge is None: raise RuntimeError('Reduction memlet cannot be None') if expr_index == 0: # Reduce without outer map # Index order does not matter, merge as-is pass elif expr_index == 1: # Reduce with outer map tmap = tmap_exit.map perm_outer, perm_inner = MapReduceFusion.find_permutation( tmap, rmap_out_entry.map, rmap_in_entry.map, memlet_edge.data) # Split tasklet map into tmap_out -> tmap_in (according to # reduction) omap = nodes.Map( tmap.label + '_nonreduce', [p for i, p in enumerate(tmap.params) if i in perm_outer], [r for i, r in enumerate(tmap.range) if i in perm_outer], tmap.schedule, tmap.unroll, tmap.is_async) tmap.params = [ p for i, p in enumerate(tmap.params) if i in perm_inner ] tmap.range = [ r for i, r in enumerate(tmap.range) if i in perm_inner ] omap_entry = nodes.MapEntry(omap) omap_exit = rmap_out_exit rmap_out_exit.map = omap # Reconnect graph to new map tmap_entry = graph.entry_node(tmap_exit) tmap_in_edges = list(graph.in_edges(tmap_entry)) for e in tmap_in_edges: nxutil.change_edge_dest(graph, tmap_entry, omap_entry) for e in tmap_in_edges: graph.add_edge(omap_entry, e.src_conn, tmap_entry, e.dst_conn, copy.copy(e.data)) elif expr_index == 2: # Reduce node # Find correspondence between map indices and array outputs tmap = tmap_exit.map perm = MapReduceFusion.find_permutation_reduce( tmap, rmap_cr, graph, memlet_edge.data) output_subset = [tmap.params[d] for d in perm] if len(output_subset) == 0: # Output is a scalar output_subset = [0] array_edge = graph.out_edges(rmap_cr)[0] # Delete relevant edges and nodes graph.remove_edge(memlet_edge) graph.remove_nodes_from(nodes_to_remove) # Add new edges and nodes # From tasklet to map exit graph.add_edge( memlet_edge.src, memlet_edge.src_conn, memlet_edge.dst, memlet_edge.dst_conn, Memlet(out_array.data, memlet_edge.data.num_accesses, subsets.Indices(output_subset), memlet_edge.data.veclen, rmap_cr.wcr, rmap_cr.identity)) # From map exit to output array graph.add_edge( memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst, array_edge.dst_conn, Memlet(array_edge.data.data, array_edge.data.num_accesses, array_edge.data.subset, array_edge.data.veclen, rmap_cr.wcr, rmap_cr.identity)) return # Remove tmp array node prior to the others, so that a new one # can be created in its stead (see below) graph.remove_node(nodes_to_remove[0]) nodes_to_remove = nodes_to_remove[1:] # Create tasklet -> tmp -> tasklet connection tmp = graph.add_array( 'tmp', memlet_edge.data.subset.bounding_box_size(), sdfg.arrays[memlet_edge.data.data].dtype, transient=True) tasklet_tmp_memlet = copy.deepcopy(memlet_edge.data) tasklet_tmp_memlet.data = tmp.data tasklet_tmp_memlet.subset = ShapeProperty.to_string(tmp.shape) # Modify memlet to point to output array memlet_edge.data.data = out_array.data # Recover reduction axes from CR reduce subset reduce_cr_subset = graph.in_edges(rmap_tasklet)[0].data.subset reduce_axes = [] for ind, crvar in enumerate(reduce_cr_subset.indices): if '__i' in str(crvar): reduce_axes.append(ind) # Modify memlet access index by filtering out reduction axes if True: # expr_index == 0: newindices = [] for ind, ovar in enumerate(memlet_edge.data.subset.indices): if ind not in reduce_axes: newindices.append(ovar) if len(newindices) == 
0: newindices = [0] memlet_edge.data.subset = subsets.Indices(newindices) graph.remove_edge(memlet_edge) graph.add_edge(memlet_edge.src, memlet_edge.src_conn, tmp, memlet_edge.dst_conn, tasklet_tmp_memlet) red_edges = list(graph.in_edges(rmap_tasklet)) if len(red_edges) != 1: raise RuntimeError('CR edge must be unique') tmp_tasklet_memlet = copy.deepcopy(tasklet_tmp_memlet) graph.add_edge(tmp, None, rmap_tasklet, red_edges[0].dst_conn, tmp_tasklet_memlet) for e in graph.edges_between(rmap_tasklet, rmap_cr): e.data.subset = memlet_edge.data.subset # Move output edges to point directly to CR node if expr_index == 1: # Set output memlet between CR node and outer reduction map to # contain the same subset as the one pointing to the CR node for e in graph.out_edges(rmap_cr): e.data.subset = memlet_edge.data.subset rmap_out = gnode(MapReduceFusion._rmap_out_exit) nxutil.change_edge_src(graph, rmap_out, omap_exit) # Remove nodes graph.remove_nodes_from(nodes_to_remove) # For unrelated outputs, connect original output to rmap_out if expr_index == 1 and tmap_exit not in nodes_to_remove: other_out_edges = list(graph.out_edges(tmap_exit)) for e in other_out_edges: graph.remove_edge(e) graph.add_edge(e.src, e.src_conn, omap_exit, None, e.data) graph.add_edge(omap_exit, None, e.dst, e.dst_conn, copy.copy(e.data)) def modifies_graph(self): return True
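##############################
# Illustrative sketch of the output-index rewriting at the end of apply():
# dimensions of the tasklet's output index that correspond to reduced axes are
# dropped, and a fully reduced (scalar) output collapses to index [0].
# Plain-Python version (the helper name is hypothetical):
def _filter_reduced_axes(output_indices, reduce_axes):
    kept = [idx for i, idx in enumerate(output_indices) if i not in reduce_axes]
    return kept if kept else [0]

# e.g. _filter_reduced_axes(['i0', 'i1', 'i2'], {1, 2}) == ['i0']
#      _filter_reduced_axes(['i0'], {0}) == [0]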
class RedundantSecondArray(pm.Transformation): """ Implements the redundant array removal transformation, applied when a transient array is copied from and to (from another array), but never used anywhere else. This transformation removes the second array. """ _arrays_removed = 0 _in_array = nodes.AccessNode("_") _out_array = nodes.AccessNode("_") @staticmethod def expressions(): return [ nxutil.node_path_graph(RedundantSecondArray._in_array, RedundantSecondArray._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): in_array = graph.nodes()[candidate[RedundantSecondArray._in_array]] out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]] # Ensure in degree is one (only one source, which is in_array) if graph.in_degree(out_array) != 1: return False # Make sure that the candidate is a transient variable if not out_array.desc(sdfg).transient: return False # Make sure that both arrays are using the same storage location if in_array.desc(sdfg).storage != out_array.desc(sdfg).storage: return False # Find occurrences in this and other states occurrences = [] for state in sdfg.nodes(): occurrences.extend([ n for n in state.nodes() if isinstance(n, nodes.AccessNode) and n.desc(sdfg) == out_array.desc(sdfg) ]) if len(occurrences) > 1: return False # Only apply if arrays are of same shape (no need to modify memlet subset) # if len(in_array.desc(sdfg).shape) != len( # out_array.desc(sdfg).shape) or any(i != o for i, o in zip( # in_array.desc(sdfg).shape, # out_array.desc(sdfg).shape)): # return False return True @staticmethod def match_to_str(graph, candidate): out_array = graph.nodes()[candidate[RedundantSecondArray._out_array]] return "Remove " + str(out_array) def apply(self, sdfg): def gnode(nname): return graph.nodes()[self.subgraph[nname]] graph = sdfg.nodes()[self.state_id] in_array = gnode(RedundantSecondArray._in_array) out_array = gnode(RedundantSecondArray._out_array) memlet = graph.edges_between(in_array, out_array)[0].data if memlet.data == in_array.data: subset = memlet.subset else: subset = memlet.other_subset for e in graph.out_edges(out_array): # Modify all outgoing edges to point to in_array path = graph.memlet_tree(e) for pe in path: if pe.data.data == out_array.data: pe.data.data = in_array.data if isinstance(subset, subsets.Indices): pe.data.subset.offset(subset, False) else: pe.data.subset = subset.compose(pe.data.subset) elif pe.data.other_subset: if isinstance(subset, subsets.Indices): pe.data.other_subset.offset(subset, False) else: pe.data.other_subset = subset.compose( pe.data.other_subset) # Redirect edge to out_array graph.remove_edge(e) graph.add_edge(in_array, e.src_conn, e.dst, e.dst_conn, e.data) # Finally, remove out_array node graph.remove_node(out_array) # TODO: Should the array be removed from the SDFG? # del sdfg.arrays[out_array] if Config.get_bool("debugprint"): RedundantSecondArray._arrays_removed += 1
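##############################
# Illustrative sketch of why outgoing memlets are composed/offset above: if
# the removed array was written from region `outer` of the first array, then a
# read of region `inner` from the removed array corresponds to reading
# `outer.begin + inner` (scaled by the outer stride) from the first array.
# One-dimensional, plain-tuple version of that composition:
def _compose_1d(outer, inner):
    """ outer/inner: (begin, end, stride) ranges; returns `inner` mapped into
        the coordinate system of the array that `outer` was taken from. """
    ob, _, os = outer
    ib, ie, istep = inner
    return (ob + ib * os, ob + ie * os, istep * os)

# e.g. reading elements 0..3 of a copy taken from 10..13 of the original
#      corresponds to reading 10..13 of the original:
#      _compose_1d((10, 13, 1), (0, 3, 1)) == (10, 13, 1)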
def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] node_a = graph.nodes()[self.subgraph[LocalStorage._node_a]] node_b = graph.nodes()[self.subgraph[LocalStorage._node_b]] # Determine direction of new memlet scope_dict = graph.scope_dict() propagate_forward = sd.scope_contains_scope(scope_dict, node_a, node_b) array = self.array if array is None or len(array) == 0: array = graph.edges_between(node_a, node_b)[0].data.data original_edge = None invariant_memlet = None for edge in graph.edges_between(node_a, node_b): if array == edge.data.data: original_edge = edge invariant_memlet = edge.data break if invariant_memlet is None: for edge in graph.edges_between(node_a, node_b): original_edge = edge invariant_memlet = edge.data warnings.warn('Array %s not found! Using array %s instead.' % (array, invariant_memlet.data)) array = invariant_memlet.data break if invariant_memlet is None: raise NameError('Array %s not found!' % array) # Add transient array new_data, _ = sdfg.add_array( 'trans_' + invariant_memlet.data, [ symbolic.overapproximate(r) for r in invariant_memlet.bounding_box_size() ], sdfg.arrays[invariant_memlet.data].dtype, transient=True, find_new_name=True) data_node = nodes.AccessNode(new_data) # Store as fields so that other transformations can use them self._local_name = new_data self._data_node = data_node to_data_mm = copy.deepcopy(invariant_memlet) from_data_mm = copy.deepcopy(invariant_memlet) offset = subsets.Indices([r[0] for r in invariant_memlet.subset]) # Reconnect, assuming one edge to the access node graph.remove_edge(original_edge) if propagate_forward: graph.add_edge(node_a, original_edge.src_conn, data_node, None, to_data_mm) new_edge = graph.add_edge(data_node, None, node_b, original_edge.dst_conn, from_data_mm) else: new_edge = graph.add_edge(node_a, original_edge.src_conn, data_node, None, to_data_mm) graph.add_edge(data_node, None, node_b, original_edge.dst_conn, from_data_mm) # Offset all edges in the memlet tree (including the new edge) for edge in graph.memlet_tree(new_edge): edge.data.subset.offset(offset, True) edge.data.data = new_data
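##############################
# Illustrative usage sketch: LocalStorage stores the created array name and
# access node on the transformation instance so that follow-up code can reuse
# them. Assuming `xform` is an already-matched LocalStorage instance, the
# attribute names below mirror the fields set in apply() above; the helper
# name is hypothetical.
def _apply_and_get_local(sdfg, xform):
    xform.apply(sdfg)
    return xform._local_name, xform._data_node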