def state_fission(sdfg: SDFG, subgraph: graph.SubgraphView) -> SDFGState: ''' Given a subgraph, adds a new SDFG state before the state that contains it, removes the subgraph from the original state, and connects the two states. :param subgraph: the subgraph to remove. :return: the newly created SDFG state. ''' state: SDFGState = subgraph.graph newstate = sdfg.add_state_before(state) # Save edges before removing nodes orig_edges = subgraph.edges() # Mark boundary access nodes to keep after fission nodes_to_remove = set(subgraph.nodes()) nodes_to_remove -= set(n for n in subgraph.source_nodes() if state.out_degree(n) > 1) nodes_to_remove -= set(n for n in subgraph.sink_nodes() if state.in_degree(n) > 1) state.remove_nodes_from(nodes_to_remove) for n in subgraph.nodes(): if isinstance(n, nodes.NestedSDFG): # Set the new parent state n.sdfg.parent = newstate newstate.add_nodes_from(subgraph.nodes()) for e in orig_edges: newstate.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, e.data) return newstate
def state_fission(sdfg: SDFG, subgraph: graph.SubgraphView) -> SDFGState: ''' Given a subgraph, adds a new SDFG state before the state that contains it, removes the subgraph from the original state, and connects the two states. :param subgraph: the subgraph to remove. :return: the newly created SDFG state. ''' state: SDFGState = subgraph.graph newstate = sdfg.add_state_before(state) # Save edges before removing nodes orig_edges = subgraph.edges() # Mark boundary access nodes to keep after fission nodes_to_remove = set(subgraph.nodes()) boundary_nodes = [ n for n in subgraph.nodes() if len(state.out_edges(n)) > len(subgraph.out_edges(n)) ] + [ n for n in subgraph.nodes() if len(state.in_edges(n)) > len(subgraph.in_edges(n)) ] # Make dictionary of nodes to add to new state new_nodes = {n: n for n in subgraph.nodes()} new_nodes.update({b: copy.deepcopy(b) for b in boundary_nodes}) nodes_to_remove -= set(boundary_nodes) state.remove_nodes_from(nodes_to_remove) for n in new_nodes.values(): if isinstance(n, nodes.NestedSDFG): # Set the new parent state n.sdfg.parent = newstate newstate.add_nodes_from(new_nodes.values()) for e in orig_edges: newstate.add_edge(new_nodes[e.src], e.src_conn, new_nodes[e.dst], e.dst_conn, e.data) return newstate
def apply(self, graph: SDFGState, sdfg: SDFG): tmap_exit = self.tmap_exit in_array = self.in_array reduce_node = self.reduce out_array = self.out_array # Set nodes to remove according to the expression index nodes_to_remove = [in_array] nodes_to_remove.append(reduce_node) memlet_edge = None for edge in graph.in_edges(tmap_exit): if edge.data.data == in_array.data: memlet_edge = edge break if memlet_edge is None: raise RuntimeError('Reduction memlet cannot be None') # Find which indices should be removed from new memlet input_edge = graph.in_edges(reduce_node)[0] axes = reduce_node.axes or list(range(len(input_edge.data.subset))) array_edge = graph.out_edges(reduce_node)[0] # Delete relevant edges and nodes graph.remove_nodes_from(nodes_to_remove) # Delete relevant data descriptors for node in set(nodes_to_remove): if isinstance(node, nodes.AccessNode): # try to delete it try: sdfg.remove_data(node.data) # will raise ValueError if the datadesc is used somewhere else except ValueError: pass # Filter out reduced dimensions from subset filtered_subset = [ dim for i, dim in enumerate(memlet_edge.data.subset) if i not in axes ] if len(filtered_subset) == 0: # Output is a scalar filtered_subset = [(0, 0, 1)] # Modify edge from tasklet to map exit memlet_edge.data.data = out_array.data memlet_edge.data.wcr = reduce_node.wcr memlet_edge.data.subset = type( memlet_edge.data.subset)(filtered_subset) # Add edge from map exit to output array graph.add_edge( memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst, array_edge.dst_conn, Memlet.simple(array_edge.data.data, array_edge.data.subset, num_accesses=array_edge.data.num_accesses, wcr_str=reduce_node.wcr)) # Add initialization state as necessary if not self.no_init and reduce_node.identity is not None: init_state = sdfg.add_state_before(graph) init_state.add_mapped_tasklet( 'freduce_init', [('o%d' % i, '%s:%s:%s' % (r[0], r[1] + 1, r[2])) for i, r in enumerate(array_edge.data.subset)], {}, '__out = %s' % reduce_node.identity, { '__out': Memlet.simple( array_edge.data.data, ','.join([ 'o%d' % i for i in range(len(array_edge.data.subset)) ])) }, external_edges=True)
def apply(self, sdfg: SDFG): graph = sdfg.nodes()[self.state_id] tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]] in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]] reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]] out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]] # Set nodes to remove according to the expression index nodes_to_remove = [in_array] nodes_to_remove.append(reduce_node) memlet_edge = None for edge in graph.in_edges(tmap_exit): if edge.data.data == in_array.data: memlet_edge = edge break if memlet_edge is None: raise RuntimeError('Reduction memlet cannot be None') # Find which indices should be removed from new memlet input_edge = graph.in_edges(reduce_node)[0] axes = reduce_node.axes or list(range(len(input_edge.data.subset))) array_edge = graph.out_edges(reduce_node)[0] # Delete relevant edges and nodes graph.remove_nodes_from(nodes_to_remove) # Filter out reduced dimensions from subset filtered_subset = [ dim for i, dim in enumerate(memlet_edge.data.subset) if i not in axes ] if len(filtered_subset) == 0: # Output is a scalar filtered_subset = [(0, 0, 1)] # Modify edge from tasklet to map exit memlet_edge.data.data = out_array.data memlet_edge.data.wcr = reduce_node.wcr memlet_edge.data.subset = type(memlet_edge.data.subset)(filtered_subset) # Add edge from map exit to output array graph.add_edge( memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst, array_edge.dst_conn, Memlet.simple(array_edge.data.data, array_edge.data.subset, num_accesses=array_edge.data.num_accesses, wcr_str=reduce_node.wcr)) # Add initialization state as necessary if reduce_node.identity is not None: init_state = sdfg.add_state_before(graph) init_state.add_mapped_tasklet( 'freduce_init', [('o%d' % i, '%s:%s:%s' % (r[0], r[1] + 1, r[2])) for i, r in enumerate(array_edge.data.subset)], {}, 'out = %s' % reduce_node.identity, { 'out': Memlet.simple( array_edge.data.data, ','.join([ 'o%d' % i for i in range(len(array_edge.data.subset)) ])) }, external_edges=True)
def apply(self, sdfg: SDFG): subgraph = self.subgraph_view(sdfg) entry_states_in, entry_states_out = self.get_entry_states( sdfg, subgraph) _, exit_states_out = self.get_exit_states(sdfg, subgraph) entry_state_in = entry_states_in.pop() entry_state_out = entry_states_out.pop() \ if len(entry_states_out) > 0 else None exit_state_out = exit_states_out.pop() \ if len(exit_states_out) > 0 else None launch_state = None entry_guard_state = None exit_guard_state = None # generate entry guard state if needed if self.include_in_assignment and entry_state_out is not None: entry_edge = sdfg.edges_between(entry_state_out, entry_state_in)[0] if len(entry_edge.data.assignments) > 0: entry_guard_state = sdfg.add_state( label='{}kernel_entry_guard'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) sdfg.add_edge(entry_state_out, entry_guard_state, InterstateEdge(entry_edge.data.condition)) sdfg.add_edge( entry_guard_state, entry_state_in, InterstateEdge(None, entry_edge.data.assignments)) sdfg.remove_edge(entry_edge) # Update SubgraphView new_node_list = subgraph.nodes() new_node_list.append(entry_guard_state) subgraph = SubgraphView(sdfg, new_node_list) launch_state = sdfg.add_state_before( entry_guard_state, label='{}kernel_launch'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) # generate exit guard state if exit_state_out is not None: exit_guard_state = sdfg.add_state_before( exit_state_out, label='{}kernel_exit_guard'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) # Update SubgraphView new_node_list = subgraph.nodes() new_node_list.append(exit_guard_state) subgraph = SubgraphView(sdfg, new_node_list) if launch_state is None: launch_state = sdfg.add_state_before( exit_state_out, label='{}kernel_launch'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) # If the launch state doesn't exist at this point then there is no other # states outside of the kernel, so create a stand alone launch state if launch_state is None: assert (entry_state_in is None and exit_state_out is None) launch_state = sdfg.add_state(label='{}kernel_launch'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) # create sdfg for kernel and fill it with states and edges from # ssubgraph dfg will be nested at the end kernel_sdfg = SDFG( '{}kernel'.format(self.kernel_prefix + '_' if self.kernel_prefix != '' else '')) edges = subgraph.edges() for edge in edges: kernel_sdfg.add_edge(edge.src, edge.dst, edge.data) # Setting entry node in nested SDFG if no entry guard was created if entry_guard_state is None: kernel_sdfg.start_state = kernel_sdfg.node_id(entry_state_in) for state in subgraph: state.parent = kernel_sdfg # remove the now nested nodes from the outer sdfg and make sure the # launch state is properly connected to remaining states sdfg.remove_nodes_from(subgraph.nodes()) if entry_state_out is not None \ and len(sdfg.edges_between(entry_state_out, launch_state)) == 0: sdfg.add_edge(entry_state_out, launch_state, InterstateEdge()) if exit_state_out is not None \ and len(sdfg.edges_between(launch_state, exit_state_out)) == 0: sdfg.add_edge(launch_state, exit_state_out, InterstateEdge()) # Handle data for kernel kernel_data = set(node.data for state in kernel_sdfg for node in state.nodes() if isinstance(node, nodes.AccessNode)) # move Streams and Register data into the nested SDFG # normal data will be added as kernel argument kernel_args = [] for data in kernel_data: if (isinstance(sdfg.arrays[data], dace.data.Stream) or (isinstance(sdfg.arrays[data], dace.data.Array) and sdfg.arrays[data].storage == StorageType.Register)): kernel_sdfg.add_datadesc(data, sdfg.arrays[data]) del sdfg.arrays[data] else: copy_desc = copy.deepcopy(sdfg.arrays[data]) copy_desc.transient = False copy_desc.storage = StorageType.Default kernel_sdfg.add_datadesc(data, copy_desc) kernel_args.append(data) # read only data will be passed as input, writeable data will be passed # as 'output' otherwise kernel cannot write to data kernel_args_read = set() kernel_args_write = set() for data in kernel_args: data_accesses_read_only = [ node.access == dtypes.AccessType.ReadOnly for state in kernel_sdfg for node in state if isinstance(node, nodes.AccessNode) and node.data == data ] if all(data_accesses_read_only): kernel_args_read.add(data) else: kernel_args_write.add(data) # Kernel SDFG is complete at this point if self.validate: kernel_sdfg.validate() # Filling launch state with nested SDFG, map and access nodes map_entry, map_exit = launch_state.add_map( '{}kernel_launch_map'.format( self.kernel_prefix + '_' if self.kernel_prefix != '' else ''), dict(ignore='0'), schedule=ScheduleType.GPU_Persistent, ) nested_sdfg = launch_state.add_nested_sdfg( kernel_sdfg, sdfg, kernel_args_read, kernel_args_write, ) # Create and connect read only data access nodes for arg in kernel_args_read: read_node = launch_state.add_read(arg) launch_state.add_memlet_path(read_node, map_entry, nested_sdfg, dst_conn=arg, memlet=Memlet.from_array( arg, sdfg.arrays[arg])) # Create and connect writable data access nodes for arg in kernel_args_write: write_node = launch_state.add_write(arg) launch_state.add_memlet_path(nested_sdfg, map_exit, write_node, src_conn=arg, memlet=Memlet.from_array( arg, sdfg.arrays[arg])) # Transformation is done if self.validate: sdfg.validate()
def _create_einsum_internal(sdfg: SDFG, state: SDFGState, einsum_string: str, *arrays: str, dtype: Optional[dtypes.typeclass] = None, optimize: bool = False, output: Optional[str] = None, nodes: Optional[Dict[str, AccessNode]] = None, init_output: bool = None): # Infer shapes and strides of input/output arrays einsum = EinsumParser(einsum_string) if len(einsum.inputs) != len(arrays): raise ValueError('Invalid number of arrays for einsum expression') # Get shapes from arrays and verify dimensionality chardict = {} for inp, inpname in zip(einsum.inputs, arrays): inparr = sdfg.arrays[inpname] if len(inp) != len(inparr.shape): raise ValueError('Dimensionality mismatch in input "%s"' % inpname) for char, shp in zip(inp, inparr.shape): if char in chardict and shp != chardict[char]: raise ValueError('Dimension mismatch in einsum expression') chardict[char] = shp if optimize: # Try to import opt_einsum try: import opt_einsum as oe except (ModuleNotFoundError, NameError, ImportError): raise ImportError('To optimize einsum expressions, please install ' 'the "opt_einsum" package.') for char, shp in chardict.items(): if symbolic.issymbolic(shp): raise ValueError('Einsum optimization cannot be performed ' 'on symbolically-sized array dimension "%s" ' 'for subscript character "%s"' % (shp, char)) # Create optimal contraction path # noinspection PyTypeChecker _, path_info = oe.contract_path( einsum_string, *oe.helpers.build_views(einsum_string, chardict)) input_nodes = nodes or {arr: state.add_read(arr) for arr in arrays} result_node = None # Follow path and create a chain of operation SDFG states for pair, nonfree, expr, after, blas in path_info.contraction_list: result, result_node = _create_einsum_internal(sdfg, state, expr, arrays[pair[0]], arrays[pair[1]], dtype=dtype, optimize=False, output=None, nodes=input_nodes) arrays = ([a for i, a in enumerate(arrays) if i not in pair] + [result]) input_nodes[result] = result_node return arrays[0], result_node # END of einsum optimization input_nodes = nodes or {arr: state.add_read(arr) for arr in arrays} # Get output shape from chardict, or [1] for a scalar output output_shape = list(map(lambda k: chardict[k], einsum.output)) or [1] output_index = ','.join(o for o in einsum.output) or '0' if output is None: dtype = dtype or sdfg.arrays[arrays[0]].dtype output, odesc = sdfg.add_temp_transient(output_shape, dtype) to_init = True else: odesc = sdfg.arrays[output] dtype = dtype or odesc.dtype to_init = init_output or True is_conflicted = not all( all(indim in einsum.output for indim in inp) for inp in einsum.inputs) if not is_conflicted and init_output is None: to_init = False if not einsum.is_bmm(): # Fall back to "pure" SDFG einsum with conflict resolution c = state.add_write(output) # Add state before this one to initialize the output value if to_init: init_state = sdfg.add_state_before(state) if len(einsum.output) > 0: init_state.add_mapped_tasklet( 'einsum_reset', {k: '0:%s' % chardict[k] for k in einsum.output}, {}, 'out_%s = 0' % output, {'out_%s' % output: Memlet.simple(output, output_index)}, external_edges=True) else: # Scalar output t = init_state.add_tasklet('einsum_reset', set(), {'out_%s' % output}, 'out_%s = 0' % output) onode = init_state.add_write(output) init_state.add_edge(t, 'out_%s' % output, onode, None, Memlet.simple(output, '0')) wcr = 'lambda a,b: a+b' if is_conflicted else None # Pure einsum map state.add_mapped_tasklet( 'einsum', {k: '0:%s' % v for k, v in chardict.items()}, { 'inp_%s' % arr: Memlet.simple(arr, ','.join(inp)) for inp, arr in zip(einsum.inputs, arrays) }, 'out_%s = %s' % (output, ' * '.join('inp_%s' % arr for arr in arrays)), { 'out_%s' % output: Memlet.simple( output, output_index, wcr_str=wcr) }, input_nodes=input_nodes, output_nodes={output: c}, external_edges=True) else: # Represent einsum as a GEMM or batched GEMM (using library nodes) a_shape = sdfg.arrays[arrays[0]].shape b_shape = sdfg.arrays[arrays[1]].shape c_shape = output_shape a = input_nodes[arrays[0]] b = input_nodes[arrays[1]] c = state.add_write(output) # Compute GEMM dimensions and strides strides = dict( BATCH=prod([c_shape[dim] for dim in einsum.c_batch]), M=prod([a_shape[dim] for dim in einsum.a_only]), K=prod([a_shape[dim] for dim in einsum.a_sum]), N=prod([b_shape[dim] for dim in einsum.b_only]), sAM=prod(a_shape[einsum.a_only[-1] + 1:]) if einsum.a_only else 1, sAK=prod(a_shape[einsum.a_sum[-1] + 1:]) if einsum.a_sum else 1, sAB=prod(a_shape[einsum.a_batch[-1] + 1:]) if einsum.a_batch else 1, sBK=prod(b_shape[einsum.b_sum[-1] + 1:]) if einsum.b_sum else 1, sBN=prod(b_shape[einsum.b_only[-1] + 1:]) if einsum.b_only else 1, sBB=prod(b_shape[einsum.b_batch[-1] + 1:]) if einsum.b_batch else 1, sCM=prod(c_shape[einsum.c_a_only[-1] + 1:]) if einsum.c_a_only else 1, sCN=prod(c_shape[einsum.c_b_only[-1] + 1:]) if einsum.c_b_only else 1, sCB=prod(c_shape[einsum.c_batch[-1] + 1:]) if einsum.c_batch else 1) # Complement strides to make matrices as necessary if len(a_shape) == 1 and len(einsum.a_sum) == 1: strides['sAK'] = 1 strides['sAB'] = strides['sAM'] = strides['K'] if len(b_shape) == 1 and len(einsum.b_sum) == 1: strides['sBN'] = 1 strides['sBK'] = 1 strides['sBB'] = strides['K'] if len(c_shape) == 1 and len(einsum.a_sum) == len(einsum.b_sum): strides['sCN'] = 1 strides['sCB'] = strides['sCM'] = strides['N'] # Create nested SDFG for GEMM nsdfg = create_batch_gemm_sdfg(dtype, strides) nsdfg_node = state.add_nested_sdfg(nsdfg, None, {'X', 'Y'}, {'Z'}, strides) state.add_edge(a, None, nsdfg_node, 'X', Memlet.from_array(a.data, a.desc(sdfg))) state.add_edge(b, None, nsdfg_node, 'Y', Memlet.from_array(b.data, b.desc(sdfg))) state.add_edge(nsdfg_node, 'Z', c, None, Memlet.from_array(c.data, c.desc(sdfg))) return output, c