def apply(self, sdfg: sd.SDFG):
    """Peel ``self.count`` iterations off the front of the detected loop.

    Each peeled iteration is instantiated as a copy of the loop body with
    the iteration variable substituted by its concrete value, and the copies
    are chained between the state preceding the loop and the loop guard.
    The loop's initialization assignment is advanced by ``count`` strides so
    the remaining loop starts where the peeled iterations left off.

    :param sdfg: The SDFG containing the matched loop pattern.
    """
    ####################################################################
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    # The guard has two in-edges: the initialization edge (from before the
    # loop) and the back-edge (from the last loop-body state).
    guard_inedges = sdfg.in_edges(guard)
    condition_edge = sdfg.edges_between(guard, begin)[0]
    itervar = list(guard_inedges[0].data.assignments.keys())[0]
    condition = condition_edge.data.condition_sympy()
    # rng is (start, end, stride) — presumably as returned by the
    # DetectLoop-style range helper; TODO confirm against _loop_range.
    rng = self._loop_range(itervar, guard_inedges, condition)

    # Find the state prior to the loop: the in-edge whose assignment equals
    # the loop's start value is the initialization edge.
    if rng[0] == symbolic.pystr_to_symbolic(
            guard_inedges[0].data.assignments[itervar]):
        init_edge: sd.InterstateEdge = guard_inedges[0]
        before_state: sd.SDFGState = guard_inedges[0].src
        last_state: sd.SDFGState = guard_inedges[1].src
    else:
        init_edge: sd.InterstateEdge = guard_inedges[1]
        before_state: sd.SDFGState = guard_inedges[1].src
        last_state: sd.SDFGState = guard_inedges[0].src

    # Get loop states: everything reachable from `begin` without passing
    # through the guard.
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    ####################################################################
    # Transform

    # If begin, change initialization assignment and prepend states before
    # guard. NOTE(review): a symbolic expression is assigned where the
    # assignments dict normally holds strings — confirm downstream consumers
    # accept symbolic values here.
    init_edge.data.assignments[itervar] = rng[0] + self.count * rng[2]
    append_state = before_state

    # Add `count` states, each with instantiated iteration variable
    unrolled_states = []  # NOTE(review): never used — candidate for removal
    for i in range(self.count):
        # Instantiate loop states with iterate value
        new_states = self.instantiate_loop(sdfg, loop_states, loop_subgraph,
                                           itervar, rng[0] + i * rng[2])

        # Connect states to before the loop with unconditional edges
        sdfg.add_edge(append_state, new_states[first_id],
                      sd.InterstateEdge())
        append_state = new_states[last_id]

    # Reconnect edge to guard state from last peeled iteration
    if append_state != before_state:
        sdfg.remove_edge(init_edge)
        sdfg.add_edge(append_state, guard, init_edge.data)
def instantiate_loop(
    self,
    sdfg: sd.SDFG,
    loop_states: List[sd.SDFGState],
    loop_subgraph: gr.SubgraphView,
    itervar: str,
    value: symbolic.SymbolicType,
    state_suffix=None,
):
    """Clone the loop-body states with ``itervar`` fixed to ``value``.

    :param sdfg: The SDFG to which the cloned states are added.
    :param loop_states: The states making up the loop body.
    :param loop_subgraph: Subgraph view over ``loop_states`` (provides the
                          body's inter-state edges to replicate).
    :param itervar: Name of the loop iteration variable to substitute.
    :param value: Concrete (symbolic) value substituted for ``itervar``.
    :param state_suffix: Optional label suffix; defaults to ``value``.
    :return: The list of cloned states, index-aligned with ``loop_states``.
    """
    # JSON round-trip copies are faster than deepcopy (which would also
    # drag the parent SDFG along).
    copies = [
        sd.SDFGState.from_json(st.to_json(), context={'sdfg': sdfg})
        for st in loop_states
    ]

    # Label suffix identifying this instantiation
    suffix = state_suffix if state_suffix is not None else '%d' % value

    # Specialize the iteration variable inside every cloned state
    for cloned in copies:
        cloned.set_label('_'.join((cloned.label, itervar, suffix)))
        cloned.replace(itervar, value)

    # Recreate the body's inter-state edges between the clones, substituting
    # the iteration variable in any edge conditions.
    for edge in loop_subgraph.edges():
        edge_data: sd.InterstateEdge = copy.deepcopy(edge.data)
        if edge_data.condition:
            ASTFindReplace({itervar: str(value)}).visit(edge_data.condition)
        sdfg.add_edge(copies[loop_states.index(edge.src)],
                      copies[loop_states.index(edge.dst)], edge_data)

    return copies
def split_interstate_edges(sdfg: SDFG) -> None:
    """
    Splits all inter-state edges into edges with conditions and edges with
    assignments. This procedure helps in nested loop detection.

    :param sdfg: The SDFG to split
    :note: Operates in-place on the SDFG.
    """
    for edge in sdfg.edges():
        # Only edges carrying BOTH assignments and a real condition need
        # splitting; skip everything else.
        if not edge.data.assignments or edge.data.is_unconditional():
            continue
        # Insert a fresh state: condition on the incoming edge, assignments
        # on the outgoing edge, then drop the combined original edge.
        interim = sdfg.add_state()
        sdfg.add_edge(edge.src, interim,
                      InterstateEdge(condition=edge.data.condition))
        sdfg.add_edge(interim, edge.dst,
                      InterstateEdge(assignments=edge.data.assignments))
        sdfg.remove_edge(edge)
def make_write_sdfg():
    """Build the SDFG that drains the SpMV result stream into memory.

    Implements a state-machine loop over ``h`` in ``[0, H)``; each iteration
    pops one value from the ``b_pipe`` FPGA-local stream and stores it at
    ``b_mem[h]`` in FPGA-global memory.

    :return: The constructed "spmv_write" SDFG.
    """
    write_sdfg = SDFG("spmv_write")

    # States: init -> loop head -> body (-> back to head) / -> end
    init = write_sdfg.add_state("begin")
    loop_head = write_sdfg.add_state("entry")
    loop_body = write_sdfg.add_state("body")
    done = write_sdfg.add_state("end")

    # h = 0 on entry
    write_sdfg.add_edge(init, loop_head,
                        InterstateEdge(assignments={"h": "0"}))
    # Continue while h < H ...
    write_sdfg.add_edge(
        loop_head, loop_body,
        InterstateEdge(condition=CodeProperty.from_string(
            "h < H", language=Language.Python)))
    # ... exit once h >= H
    write_sdfg.add_edge(
        loop_head, done,
        InterstateEdge(condition=CodeProperty.from_string(
            "h >= H", language=Language.Python)))
    # Increment after every body execution
    write_sdfg.add_edge(loop_body, loop_head,
                        InterstateEdge(assignments={"h": "h + 1"}))

    # Pop one element from the result stream and write it to b_mem[h]
    stream_in = loop_body.add_stream("b_pipe",
                                     dtype,
                                     storage=StorageType.FPGA_Local)
    mem_out = loop_body.add_array("b_mem", (H, ),
                                  dtype,
                                  storage=StorageType.FPGA_Global)
    loop_body.add_memlet_path(stream_in,
                              mem_out,
                              memlet=Memlet.simple(mem_out, "h"))

    return write_sdfg
def apply(self, _, sdfg: sd.SDFG):
    """Eliminate the matched loop, inlining its body exactly once.

    The iteration variable is replaced by the loop's start value in every
    body state — valid only if the loop executes exactly one iteration
    (presumably guaranteed by ``can_be_applied``; TODO confirm). The guard
    is then removed and the body wired directly between the loop's
    predecessor and successor states.

    :param _: Unused (transformation-API placeholder argument).
    :param sdfg: The SDFG containing the matched loop.
    """
    # Obtain loop information
    guard: sd.SDFGState = self.loop_guard
    body: sd.SDFGState = self.loop_begin

    # Obtain iteration variable, range and stride.
    # NOTE(review): the `_` inside the tuple unpacking shadows the unused
    # parameter `_` above — harmless, but worth renaming.
    itervar, (start, end, step), (_, body_end) = find_for_loop(sdfg, guard, body)

    # Find all loop-body states via BFS from the body entry, never crossing
    # the guard.
    states = set()
    to_visit = [body]
    while to_visit:
        state = to_visit.pop(0)
        for _, dst, _ in sdfg.out_edges(state):
            if dst not in states and dst is not guard:
                to_visit.append(dst)
        states.add(state)

    # Specialize the iteration variable to its (single) value.
    for state in states:
        state.replace(itervar, start)

    # remove loop
    # Disconnect the body entry and exit from the rest of the graph first,
    # then reroute the guard's neighbors around the removed guard.
    for body_inedge in sdfg.in_edges(body):
        sdfg.remove_edge(body_inedge)
    for body_outedge in sdfg.out_edges(body_end):
        sdfg.remove_edge(body_outedge)
    # Predecessors of the guard now lead straight into the body; their
    # init assignment (itervar = start) is dropped since itervar is gone.
    for guard_inedge in sdfg.in_edges(guard):
        guard_inedge.data.assignments = {}
        sdfg.add_edge(guard_inedge.src, body, guard_inedge.data)
        sdfg.remove_edge(guard_inedge)
    # Successors of the guard are reached unconditionally from the body end.
    for guard_outedge in sdfg.out_edges(guard):
        guard_outedge.data.condition = CodeBlock("1")
        sdfg.add_edge(body_end, guard_outedge.dst, guard_outedge.data)
        sdfg.remove_edge(guard_outedge)
    sdfg.remove_node(guard)
    # Drop the iteration variable if nothing references it anymore.
    if itervar in sdfg.symbols and helpers.is_symbol_unused(sdfg, itervar):
        sdfg.remove_symbol(itervar)
def apply(self, _, sdfg: sd.SDFG):
    """Move the matched for-loop inside the map contained in its body.

    The map's content is nested into a new nested SDFG, the loop is
    replicated inside that nested SDFG, and the outer loop (guard and its
    edges) is removed. Afterwards, memlets are re-propagated and nested
    accesses refined.

    :param _: Unused (transformation-API placeholder argument).
    :param sdfg: The SDFG containing the matched loop-over-map pattern.
    """
    # Obtain loop information
    guard: sd.SDFGState = self.loop_guard
    body: sd.SDFGState = self.loop_begin

    # Obtain iteration variable, range, and stride
    itervar, (start, end, step), _ = find_for_loop(sdfg, guard, body)

    forward_loop = step > 0

    # Locate the (single) map in the body state.
    # NOTE(review): assumes exactly one MapEntry/MapExit pair exists in
    # `body` — presumably checked in can_be_applied; confirm.
    for node in body.nodes():
        if isinstance(node, nodes.MapEntry):
            map_entry = node
        if isinstance(node, nodes.MapExit):
            map_exit = node

    # nest map's content in sdfg
    map_subgraph = body.scope_subgraph(map_entry,
                                       include_entry=False,
                                       include_exit=False)
    nsdfg = helpers.nest_state_subgraph(sdfg, body, map_subgraph,
                                        full_data=True)

    # replicate loop in nested sdfg
    new_before, new_guard, new_after = nsdfg.sdfg.add_loop(
        before_state=None,
        loop_state=nsdfg.sdfg.nodes()[0],
        loop_end_state=None,
        after_state=None,
        loop_var=itervar,
        initialize_expr=f'{start}',
        condition_expr=f'{itervar} <= {end}'
        if forward_loop else f'{itervar} >= {end}',
        increment_expr=f'{itervar} + {step}'
        if forward_loop else f'{itervar} - {abs(step)}')

    # remove outer loop
    # Identify the new inner loop's edges so the outer loop's edge
    # assignments can be transplanted onto them.
    before_guard_edge = nsdfg.sdfg.edges_between(new_before, new_guard)[0]
    for e in nsdfg.sdfg.out_edges(new_guard):
        if e.dst is new_after:
            guard_after_edge = e
        else:
            guard_body_edge = e

    # Transplant assignments from the outer edges and disconnect the outer
    # guard/body; order of these loops matters (edges are consumed as we go).
    for body_inedge in sdfg.in_edges(body):
        if body_inedge.src is guard:
            guard_body_edge.data.assignments.update(
                body_inedge.data.assignments)
        sdfg.remove_edge(body_inedge)
    for body_outedge in sdfg.out_edges(body):
        sdfg.remove_edge(body_outedge)
    for guard_inedge in sdfg.in_edges(guard):
        before_guard_edge.data.assignments.update(
            guard_inedge.data.assignments)
        guard_inedge.data.assignments = {}
        sdfg.add_edge(guard_inedge.src, body, guard_inedge.data)
        sdfg.remove_edge(guard_inedge)
    for guard_outedge in sdfg.out_edges(guard):
        if guard_outedge.dst is body:
            guard_body_edge.data.assignments.update(
                guard_outedge.data.assignments)
        else:
            guard_after_edge.data.assignments.update(
                guard_outedge.data.assignments)
        guard_outedge.data.condition = CodeBlock("1")
        sdfg.add_edge(body, guard_outedge.dst, guard_outedge.data)
        sdfg.remove_edge(guard_outedge)
    sdfg.remove_node(guard)
    # The iteration variable now lives only inside the nested SDFG.
    if itervar in nsdfg.symbol_mapping:
        del nsdfg.symbol_mapping[itervar]
    if itervar in sdfg.symbols:
        del sdfg.symbols[itervar]

    # Add missing data/symbols
    # Free symbols of the nested SDFG must be mapped from the parent —
    # either as symbols or by streaming the data container in.
    for s in nsdfg.sdfg.free_symbols:
        if s in nsdfg.symbol_mapping:
            continue
        if s in sdfg.symbols:
            nsdfg.symbol_mapping[s] = s
        elif s in sdfg.arrays:
            desc = sdfg.arrays[s]
            access = body.add_access(s)
            conn = nsdfg.sdfg.add_datadesc(s, copy.deepcopy(desc))
            nsdfg.sdfg.arrays[s].transient = False
            nsdfg.add_in_connector(conn)
            body.add_memlet_path(access,
                                 map_entry,
                                 nsdfg,
                                 memlet=Memlet.from_array(s, desc),
                                 dst_conn=conn)
        else:
            raise NotImplementedError(
                f"Free symbol {s} is neither a symbol nor data.")
    # Prune symbol mappings that are no longer referenced inside.
    to_delete = set()
    for s in nsdfg.symbol_mapping:
        if s not in nsdfg.sdfg.free_symbols:
            to_delete.add(s)
    for s in to_delete:
        del nsdfg.symbol_mapping[s]

    # propagate scope for correct volumes
    scope_tree = ScopeTree(map_entry, map_exit)
    scope_tree.parent = ScopeTree(None, None)
    # The first propagation helps remove appearances (in memlets) of symbols
    # that are now defined only in the nested SDFG.
    propagation.propagate_memlets_scope(sdfg, body, scope_tree)

    for s in to_delete:
        if helpers.is_symbol_unused(sdfg, s):
            sdfg.remove_symbol(s)

    # Local import avoids a circular dependency at module load time.
    from dace.transformation.interstate import RefineNestedAccess
    transformation = RefineNestedAccess()
    transformation.setup_match(sdfg, 0, sdfg.node_id(body),
                               {RefineNestedAccess.nsdfg: body.node_id(nsdfg)},
                               0)
    transformation.apply(body, sdfg)

    # Second propagation for refined accesses.
    propagation.propagate_memlets_scope(sdfg, body, scope_tree)
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expand a Reduce library node into a pure-SDFG implementation.

    Builds a nested SDFG containing an (optional) initialization state that
    fills the output with the reduction identity, followed by a compute
    state: an optional outer map over the non-reduced (output) axes, an
    inner map over the reduced axes, and an identity tasklet whose output
    memlet carries the reduction as a write-conflict resolution (WCR).

    :param node: The Reduce node being expanded.
    :param state: The state containing ``node``.
    :param sdfg: The parent SDFG.
    :return: The nested SDFG implementing the reduction.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Standardize axes: no axes given means reduce over all dimensions
    axes = node.axes if node.axes else list(range(input_dims))

    # Create nested SDFG
    nsdfg = SDFG('reduce')

    nsdfg.add_array('_in',
                    inedge.data.subset.size(),
                    input_data.dtype,
                    strides=input_data.strides,
                    storage=input_data.storage)
    nsdfg.add_array('_out',
                    outedge.data.subset.size(),
                    output_data.dtype,
                    strides=output_data.strides,
                    storage=output_data.storage)

    # If identity is defined, add an initialization state
    if node.identity is not None:
        init_state = nsdfg.add_state()
        nstate = nsdfg.add_state()
        nsdfg.add_edge(init_state, nstate, dace.InterstateEdge())

        # Add initialization as a map over the output range
        init_state.add_mapped_tasklet(
            'reduce_init', {
                '_o%d' % i: '0:%s' % symstr(d)
                for i, d in enumerate(outedge.data.subset.size())
            }, {},
            'out = %s' % node.identity, {
                'out':
                dace.Memlet.simple(
                    '_out',
                    ','.join(['_o%d' % i for i in range(output_dims)]))
            },
            external_edges=True)
    else:
        nstate = nsdfg.add_state()
    # END OF INIT

    # (If axes != all) Add outer map, which corresponds to the output range
    if len(axes) != input_dims:
        # Interleave input and output axes to match input memlet:
        # reduced dimensions get inner-map indices (_i*), kept dimensions
        # get outer-map indices (_o*).
        ictr, octr = 0, 0
        input_subset = []
        for i in range(input_dims):
            if i in axes:
                input_subset.append('_i%d' % ictr)
                ictr += 1
            else:
                input_subset.append('_o%d' % octr)
                octr += 1

        ome, omx = nstate.add_map(
            'reduce_output', {
                '_o%d' % i: '0:%s' % symstr(sz)
                for i, sz in enumerate(outedge.data.subset.size())
            })
        outm = dace.Memlet.simple(
            '_out',
            ','.join(['_o%d' % i for i in range(output_dims)]),
            wcr_str=node.wcr)
        inmm = dace.Memlet.simple('_in', ','.join(input_subset))
    else:
        # Full reduction: scalar output, no outer map
        ome, omx = None, None
        outm = dace.Memlet.simple('_out', '0', wcr_str=node.wcr)
        inmm = dace.Memlet.simple(
            '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

    # Add inner map, which corresponds to the range to reduce, containing
    # an identity tasklet
    ime, imx = nstate.add_map(
        'reduce_values', {
            '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
            for i, axis in enumerate(sorted(axes))
        })

    # Add identity tasklet for reduction; the actual combining is performed
    # by the WCR on the output memlet.
    t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

    # Connect everything
    r = nstate.add_read('_in')
    # Fixed: the output access node is a write target (WCR memlet path ends
    # here), so it must be created with add_write, not add_read.
    w = nstate.add_write('_out')
    if ome:
        nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
    else:
        nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
        nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return nsdfg
def apply(self, sdfg: sd.SDFG):
    """Convert the matched for-loop into a map.

    Multi-state loop bodies are first collapsed into a single state holding
    a nested SDFG; edge assignments entering the body are moved into an
    initialization state of a (possibly second) nested SDFG. Finally, the
    body is wrapped in a map over the loop range, the guard is removed, and
    the body is connected directly to the loop's exit state.

    :param sdfg: The SDFG containing the matched loop pattern.
    """
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    body: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    itervar, (start, end, step), (_, body_end) = find_for_loop(
        sdfg, guard, body, itervar=self.itervar)

    # Find all loop-body states (BFS from the body entry, stopping at the
    # body's last state).
    states = set([body_end])
    to_visit = [body]
    while to_visit:
        state = to_visit.pop(0)
        if state is body_end:
            continue
        for _, dst, _ in sdfg.out_edges(state):
            if dst not in states:
                to_visit.append(dst)
        states.add(state)

    # Nest loop-body states: a multi-state body must become one state
    # before it can be wrapped in a map.
    if len(states) > 1:

        # Find read/write sets
        read_set, write_set = set(), set()
        for state in states:
            rset, wset = state.read_and_write_sets()
            read_set |= rset
            write_set |= wset
        # Add data from edges: inter-state edge expressions may also read
        # data containers.
        for src in states:
            for dst in states:
                for edge in sdfg.edges_between(src, dst):
                    for s in edge.data.free_symbols:
                        if s in sdfg.arrays:
                            read_set.add(s)

        # Find NestedSDFG's unique data: transients accessed only inside the
        # loop body can be moved wholly into the nested SDFG.
        rw_set = read_set | write_set
        unique_set = set()
        for name in rw_set:
            if not sdfg.arrays[name].transient:
                continue
            found = False
            for state in sdfg.states():
                if state in states:
                    continue
                for node in state.nodes():
                    if (isinstance(node, nodes.AccessNode)
                            and node.data == name):
                        found = True
                        break
            if not found:
                unique_set.add(name)

        # Find NestedSDFG's connectors: everything shared with the outside
        read_set = {
            n
            for n in read_set
            if n not in unique_set or not sdfg.arrays[n].transient
        }
        write_set = {
            n
            for n in write_set
            if n not in unique_set or not sdfg.arrays[n].transient
        }

        # Create NestedSDFG and add all loop-body states and edges
        # Also, find defined symbols in NestedSDFG
        fsymbols = set(sdfg.free_symbols)
        new_body = sdfg.add_state('single_state_body')
        nsdfg = SDFG("loop_body", constants=sdfg.constants, parent=new_body)
        nsdfg.add_node(body, is_start_state=True)
        body.parent = nsdfg
        exit_state = nsdfg.add_state('exit')
        nsymbols = dict()
        for state in states:
            if state is body:
                continue
            nsdfg.add_node(state)
            state.parent = nsdfg
        for state in states:
            if state is body:
                continue
            for src, dst, data in sdfg.in_edges(state):
                # Symbols assigned on intra-body edges become nested-SDFG
                # symbols.
                nsymbols.update({
                    s: sdfg.symbols[s]
                    for s in data.assignments.keys() if s in sdfg.symbols
                })
                nsdfg.add_edge(src, dst, data)
        nsdfg.add_edge(body_end, exit_state, InterstateEdge())

        # Move guard -> body edge to guard -> new_body
        for src, dst, data, in sdfg.edges_between(guard, body):
            sdfg.add_edge(src, new_body, data)
        # Move body_end -> guard edge to new_body -> guard
        for src, dst, data in sdfg.edges_between(body_end, guard):
            sdfg.add_edge(new_body, dst, data)

        # Delete loop-body states and edges from parent SDFG
        for state in states:
            for e in sdfg.all_edges(state):
                sdfg.remove_edge(e)
            sdfg.remove_node(state)

        # Add NestedSDFG arrays: shared data is copied (non-transient view),
        # unique transients are moved in.
        for name in read_set | write_set:
            nsdfg.arrays[name] = copy.deepcopy(sdfg.arrays[name])
            nsdfg.arrays[name].transient = False
        for name in unique_set:
            nsdfg.arrays[name] = sdfg.arrays[name]
            del sdfg.arrays[name]

        # Add NestedSDFG node
        cnode = new_body.add_nested_sdfg(nsdfg, None, read_set, write_set)
        if sdfg.parent:
            # Propagate symbol mappings from the grandparent SDFG.
            for s, m in sdfg.parent_nsdfg_node.symbol_mapping.items():
                if s not in cnode.symbol_mapping:
                    cnode.symbol_mapping[s] = m
                    nsdfg.add_symbol(s, sdfg.symbols[s])
        for name in read_set:
            r = new_body.add_read(name)
            new_body.add_edge(
                r, None, cnode, name,
                memlet.Memlet.from_array(name, sdfg.arrays[name]))
        for name in write_set:
            w = new_body.add_write(name)
            new_body.add_edge(
                cnode, name, w, None,
                memlet.Memlet.from_array(name, sdfg.arrays[name]))

        # Fix SDFG symbols: drop symbols that became free only due to the
        # nesting, and register the body's own symbols inside.
        for sym in sdfg.free_symbols - fsymbols:
            del sdfg.symbols[sym]
        for sym, dtype in nsymbols.items():
            nsdfg.symbols[sym] = dtype

        # Change body state reference
        body = new_body

    if (step < 0) == True:
        # If step is negative, we have to flip start and end to produce a
        # correct map with a positive increment
        start, end, step = end, start, -step

    # If necessary, make a nested SDFG with assignments: assignments on the
    # guard->body edge must execute per iteration, so they move into an
    # init state inside a nested SDFG.
    isedge = sdfg.edges_between(guard, body)[0]
    symbols_to_remove = set()
    if len(isedge.data.assignments) > 0:
        nsdfg = helpers.nest_state_subgraph(
            sdfg, body, gr.SubgraphView(body, body.nodes()))
        for sym in isedge.data.free_symbols:
            if sym in nsdfg.symbol_mapping or sym in nsdfg.in_connectors:
                continue
            if sym in sdfg.symbols:
                nsdfg.symbol_mapping[sym] = symbolic.pystr_to_symbolic(sym)
                nsdfg.sdfg.add_symbol(sym, sdfg.symbols[sym])
            elif sym in sdfg.arrays:
                if sym in nsdfg.sdfg.arrays:
                    raise NotImplementedError
                rnode = body.add_read(sym)
                nsdfg.add_in_connector(sym)
                desc = copy.deepcopy(sdfg.arrays[sym])
                desc.transient = False
                nsdfg.sdfg.add_datadesc(sym, desc)
                body.add_edge(rnode, None, nsdfg, sym, memlet.Memlet(sym))

        nstate = nsdfg.sdfg.node(0)
        init_state = nsdfg.sdfg.add_state_before(nstate)
        nisedge = nsdfg.sdfg.edges_between(init_state, nstate)[0]
        nisedge.data.assignments = isedge.data.assignments
        symbols_to_remove = set(nisedge.data.assignments.keys())
        for k in nisedge.data.assignments.keys():
            if k in nsdfg.symbol_mapping:
                del nsdfg.symbol_mapping[k]
        isedge.data.assignments = {}

    source_nodes = body.source_nodes()
    sink_nodes = body.sink_nodes()

    # NOTE(review): `map` and `exit` shadow Python builtins here.
    map = nodes.Map(body.label + "_map", [itervar], [(start, end, step)])
    entry = nodes.MapEntry(map)
    exit = nodes.MapExit(map)
    body.add_node(entry)
    body.add_node(exit)

    # If the map uses symbols from data containers, instantiate reads
    containers_to_read = entry.free_symbols & sdfg.arrays.keys()
    for rd in containers_to_read:
        # We are guaranteed that this is always a scalar, because
        # can_be_applied makes sure there are no sympy functions in each of
        # the loop expressions
        access_node = body.add_read(rd)
        body.add_memlet_path(access_node,
                             entry,
                             dst_conn=rd,
                             memlet=memlet.Memlet(rd))

    # Reroute all memlets through the entry and exit nodes
    for n in source_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.out_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(entry,
                                   e.dst,
                                   n,
                                   e.data,
                                   internal_connector=e.dst_conn)
        else:
            body.add_nedge(entry, n, memlet.Memlet())
    for n in sink_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.in_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(exit,
                                   e.src,
                                   n,
                                   e.data,
                                   internal_connector=e.src_conn)
        else:
            body.add_nedge(n, exit, memlet.Memlet())

    # Get rid of the loop exit condition edge
    after_edge = sdfg.edges_between(guard, after)[0]
    sdfg.remove_edge(after_edge)

    # Remove the assignment on the edge to the guard
    for e in sdfg.in_edges(guard):
        if itervar in e.data.assignments:
            del e.data.assignments[itervar]

    # Remove the condition on the entry edge
    condition_edge = sdfg.edges_between(guard, body)[0]
    condition_edge.data.condition = CodeBlock("1")

    # Get rid of backedge to guard
    sdfg.remove_edge(sdfg.edges_between(body, guard)[0])

    # Route body directly to after state, maintaining any other assignments
    # it might have had
    sdfg.add_edge(
        body, after,
        sd.InterstateEdge(assignments=after_edge.data.assignments))

    # If this had made the iteration variable a free symbol, we can remove
    # it from the SDFG symbols
    if itervar in sdfg.free_symbols:
        sdfg.remove_symbol(itervar)
    for sym in symbols_to_remove:
        if helpers.is_symbol_unused(sdfg, sym):
            sdfg.remove_symbol(sym)
def apply(self, sdfg: SDFG):
    """Nest the matched state subgraph into a persistent GPU kernel.

    Moves the subgraph's states into a new kernel SDFG, creates a launch
    state containing a ``GPU_Persistent``-scheduled map around the nested
    kernel, and wires entry/exit guard states so that inter-state
    conditions and assignments crossing the kernel boundary are preserved.

    :param sdfg: The SDFG containing the matched subgraph.
    """
    subgraph = self.subgraph_view(sdfg)

    entry_states_in, entry_states_out = self.get_entry_states(
        sdfg, subgraph)
    _, exit_states_out = self.get_exit_states(sdfg, subgraph)

    # NOTE(review): assumes at most one entry/exit state each — presumably
    # enforced by can_be_applied; confirm.
    entry_state_in = entry_states_in.pop()
    entry_state_out = entry_states_out.pop() \
        if len(entry_states_out) > 0 else None
    exit_state_out = exit_states_out.pop() \
        if len(exit_states_out) > 0 else None

    launch_state = None
    entry_guard_state = None
    exit_guard_state = None

    # generate entry guard state if needed: splits the incoming edge so the
    # condition stays outside and the assignments enter the kernel.
    if self.include_in_assignment and entry_state_out is not None:
        entry_edge = sdfg.edges_between(entry_state_out, entry_state_in)[0]
        if len(entry_edge.data.assignments) > 0:
            entry_guard_state = sdfg.add_state(
                label='{}kernel_entry_guard'.format(
                    self.kernel_prefix +
                    '_' if self.kernel_prefix != '' else ''))
            sdfg.add_edge(entry_state_out, entry_guard_state,
                          InterstateEdge(entry_edge.data.condition))
            sdfg.add_edge(
                entry_guard_state, entry_state_in,
                InterstateEdge(None, entry_edge.data.assignments))
            sdfg.remove_edge(entry_edge)

            # Update SubgraphView
            new_node_list = subgraph.nodes()
            new_node_list.append(entry_guard_state)
            subgraph = SubgraphView(sdfg, new_node_list)

            launch_state = sdfg.add_state_before(
                entry_guard_state,
                label='{}kernel_launch'.format(
                    self.kernel_prefix +
                    '_' if self.kernel_prefix != '' else ''))

    # generate exit guard state
    if exit_state_out is not None:
        exit_guard_state = sdfg.add_state_before(
            exit_state_out,
            label='{}kernel_exit_guard'.format(
                self.kernel_prefix +
                '_' if self.kernel_prefix != '' else ''))

        # Update SubgraphView
        new_node_list = subgraph.nodes()
        new_node_list.append(exit_guard_state)
        subgraph = SubgraphView(sdfg, new_node_list)

        if launch_state is None:
            launch_state = sdfg.add_state_before(
                exit_state_out,
                label='{}kernel_launch'.format(
                    self.kernel_prefix +
                    '_' if self.kernel_prefix != '' else ''))

    # If the launch state doesn't exist at this point then there are no
    # other states outside of the kernel, so create a stand-alone launch
    # state.
    if launch_state is None:
        assert (entry_state_in is None and exit_state_out is None)
        launch_state = sdfg.add_state(label='{}kernel_launch'.format(
            self.kernel_prefix + '_' if self.kernel_prefix != '' else ''))

    # Create the SDFG for the kernel and fill it with the states and edges
    # from the subgraph; it will be nested at the end.
    kernel_sdfg = SDFG('{}kernel'.format(
        self.kernel_prefix + '_' if self.kernel_prefix != '' else ''))

    edges = subgraph.edges()
    for edge in edges:
        kernel_sdfg.add_edge(edge.src, edge.dst, edge.data)

    # Setting entry node in nested SDFG if no entry guard was created
    if entry_guard_state is None:
        kernel_sdfg.start_state = kernel_sdfg.node_id(entry_state_in)

    for state in subgraph:
        state.parent = kernel_sdfg

    # remove the now nested nodes from the outer sdfg and make sure the
    # launch state is properly connected to remaining states
    sdfg.remove_nodes_from(subgraph.nodes())

    if entry_state_out is not None \
            and len(sdfg.edges_between(entry_state_out, launch_state)) == 0:
        sdfg.add_edge(entry_state_out, launch_state, InterstateEdge())

    if exit_state_out is not None \
            and len(sdfg.edges_between(launch_state, exit_state_out)) == 0:
        sdfg.add_edge(launch_state, exit_state_out, InterstateEdge())

    # Handle data for kernel
    kernel_data = set(node.data for state in kernel_sdfg
                      for node in state.nodes()
                      if isinstance(node, nodes.AccessNode))

    # Move Streams and Register data into the nested SDFG;
    # normal data will be added as kernel arguments.
    kernel_args = []
    for data in kernel_data:
        if (isinstance(sdfg.arrays[data], dace.data.Stream) or
            (isinstance(sdfg.arrays[data], dace.data.Array)
             and sdfg.arrays[data].storage == StorageType.Register)):
            kernel_sdfg.add_datadesc(data, sdfg.arrays[data])
            del sdfg.arrays[data]
        else:
            copy_desc = copy.deepcopy(sdfg.arrays[data])
            copy_desc.transient = False
            copy_desc.storage = StorageType.Default
            kernel_sdfg.add_datadesc(data, copy_desc)
            kernel_args.append(data)

    # Read-only data will be passed as input, writeable data will be passed
    # as 'output', otherwise the kernel cannot write to the data.
    kernel_args_read = set()
    kernel_args_write = set()
    for data in kernel_args:
        data_accesses_read_only = [
            node.access == dtypes.AccessType.ReadOnly
            for state in kernel_sdfg for node in state
            if isinstance(node, nodes.AccessNode) and node.data == data
        ]
        if all(data_accesses_read_only):
            kernel_args_read.add(data)
        else:
            kernel_args_write.add(data)

    # Kernel SDFG is complete at this point
    if self.validate:
        kernel_sdfg.validate()

    # Filling launch state with nested SDFG, map and access nodes.
    # The single-iteration map carries the persistent-kernel schedule.
    map_entry, map_exit = launch_state.add_map(
        '{}kernel_launch_map'.format(
            self.kernel_prefix + '_' if self.kernel_prefix != '' else ''),
        dict(ignore='0'),
        schedule=ScheduleType.GPU_Persistent,
    )

    nested_sdfg = launch_state.add_nested_sdfg(
        kernel_sdfg,
        sdfg,
        kernel_args_read,
        kernel_args_write,
    )

    # Create and connect read only data access nodes
    for arg in kernel_args_read:
        read_node = launch_state.add_read(arg)
        launch_state.add_memlet_path(read_node,
                                     map_entry,
                                     nested_sdfg,
                                     dst_conn=arg,
                                     memlet=Memlet.from_array(
                                         arg, sdfg.arrays[arg]))

    # Create and connect writable data access nodes
    for arg in kernel_args_write:
        write_node = launch_state.add_write(arg)
        launch_state.add_memlet_path(nested_sdfg,
                                     map_exit,
                                     write_node,
                                     src_conn=arg,
                                     memlet=Memlet.from_array(
                                         arg, sdfg.arrays[arg]))

    # Transformation is done
    if self.validate:
        sdfg.validate()
def apply(self, sdfg: sd.SDFG):
    """Transform the SDFG to execute on the GPU.

    Clones all non-transient input/output arrays into GPU-global copies,
    adds copy-in/copy-out states, retargets transient storage, wraps free
    code nodes in single-iteration GPU maps, schedules top-level maps on
    the GPU and (optionally) applies strict simplification transformations.

    :param sdfg: The SDFG to transform in place.
    """
    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors.
    # input_nodes/output_nodes hold (name, descriptor) tuples; the name
    # sets exist for membership tests (fixed: the original compared bare
    # names against the tuple lists, which never matched and allowed
    # duplicate entries).
    input_nodes = []
    output_nodes = []
    input_names = set()
    output_names = set()
    global_code_nodes = [[] for _ in sdfg.nodes()]

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient == False):
                # Read from (has outgoing edges) -> input
                if (state.out_degree(node) > 0
                        and node.data not in input_names):
                    input_nodes.append((node.data, node.desc(sdfg)))
                    input_names.add(node.data)
                # Written to (has incoming edges) -> output
                if (state.in_degree(node) > 0
                        and node.data not in output_names):
                    output_nodes.append((node.data, node.desc(sdfg)))
                    output_names.add(node.data)
            elif isinstance(node, nodes.CodeNode) and sdict[node] is None:
                # Free (top-level) code nodes get wrapped in a GPU map in
                # Step 5.
                if not isinstance(node, nodes.EmptyTasklet):
                    global_code_nodes[i].append(node)

        # Input nodes may also be nodes with WCR memlets and no identity.
        # Fixed: append a (name, descriptor) tuple like the other entries,
        # so the unpacking loops in Steps 1 and 2 work.
        for e in state.edges():
            if e.data.wcr is not None and e.data.wcr_identity is None:
                if (e.data.data not in input_names
                        and sdfg.arrays[e.data.data].transient == False):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))
                    input_names.add(e.data.data)

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in input_nodes:
        newdesc = inode.clone()
        newdesc.storage = types.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + inodename, newdesc)
        cloned_arrays[inodename] = 'gpu_' + inodename
    for onodename, onode in output_nodes:
        # Arrays that are both inputs and outputs were already cloned
        if onodename in cloned_arrays:
            continue
        newdesc = onode.clone()
        newdesc.storage = types.StorageType.GPU_Global
        newdesc.transient = True
        sdfg.add_datadesc('gpu_' + onodename, newdesc)
        cloned_arrays[onodename] = 'gpu_' + onodename

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, ed.InterstateEdge())

    for nname, desc in input_nodes:
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, ed.InterstateEdge())

    for nname, desc in output_nodes:
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node,
                          nodes.AccessNode) and node.desc(sdfg).transient:
                nodedesc = node.desc(sdfg)
                if sdict[node] is None:
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = types.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    if self.toplevel_trans:
                        nodedesc.toplevel = True
                else:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = types.StorageType.Register

    #######################################################
    # Step 5: Wrap free tasklets and nested SDFGs with a GPU map

    for state, gcodes in zip(sdfg.nodes(), global_code_nodes):
        for gcode in gcodes:
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=types.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = set('IN_' + e.dst_conn for e in in_edges)
            me.out_connectors = set('OUT_' + e.dst_conn for e in in_edges)
            mx.in_connectors = set('IN_' + e.src_conn for e in out_edges)
            mx.out_connectors = set('OUT_' + e.src_conn for e in out_edges)

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.EmptyMemlet())

    #######################################################
    # Step 6: Change all top-level maps to GPU maps

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, nodes.EntryNode):
                if sdict[node] is None:
                    node.schedule = types.ScheduleType.GPU_Device
                elif self.sequential_innermaps:
                    node.schedule = types.ScheduleType.Sequential

    #######################################################
    # Step 7: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    opt = optimizer.SDFGOptimizer(sdfg, inplace=True)
    fusions = 0
    arrays = 0
    options = [
        match for match in opt.get_pattern_matches(strict=True)
        if isinstance(match, (StateFusion, RedundantArray))
    ]
    while options:
        ssdfg = sdfg.sdfg_list[options[0].sdfg_id]
        options[0].apply(ssdfg)
        ssdfg.validate()
        if isinstance(options[0], StateFusion):
            fusions += 1
        if isinstance(options[0], RedundantArray):
            arrays += 1

        options = [
            match for match in opt.get_pattern_matches(strict=True)
            if isinstance(match, (StateFusion, RedundantArray))
        ]

    if Config.get_bool('debugprint') and (fusions > 0 or arrays > 0):
        print('Automatically applied {} strict state fusions and removed'
              ' {} redundant arrays.'.format(fusions, arrays))
def generate_sdfg(name, chain, synthetic_reads=False, specialize_scalars=False):
    """Build the FPGA SDFG implementing the given stencil chain.

    The resulting SDFG has three states: ``initialize`` (host-to-FPGA
    copies), ``compute`` (readers, stencil kernels, writers connected by
    FIFO streams), and ``finalize`` (FPGA-to-host copies).

    :param name: Name of the generated SDFG.
    :param chain: Stencil chain description (graph of Input/Output/Kernel
                  nodes, constants, and inputs).
    :param synthetic_reads: If truthy, input readers emit the constant
                            ``float(synthetic_reads)`` instead of reading
                            from memory (no host/device arrays are created).
    :param specialize_scalars: If True, scalar inputs whose data files can
                               be loaded are baked in as SDFG constants.
    :return: The constructed SDFG.
    """
    sdfg = SDFG(name)

    for k, v in chain.constants.items():
        sdfg.add_constant(k, v["value"], dace.data.Scalar(v["data_type"]))

    if specialize_scalars:
        # Scalar inputs (no dimensions) whose backing file exists become
        # compile-time constants; missing files are silently skipped.
        for k, v in chain.inputs.items():
            if len(v["input_dims"]) == 0:
                try:
                    val = stencilflow.load_array(v)
                except FileNotFoundError:
                    continue
                print(f"Specialized constant {k} to {val}.")
                sdfg.add_constant(k, val)

    pre_state = sdfg.add_state("initialize")
    state = sdfg.add_state("compute")
    post_state = sdfg.add_state("finalize")

    sdfg.add_edge(pre_state, state, InterstateEdge())
    sdfg.add_edge(state, post_state, InterstateEdge())

    (dimensions_to_skip, shape, vector_length, parameters, iterators,
     memcopy_indices, memcopy_accesses) = _generate_init(chain)
    vshape = list(shape)  # Copy
    if vector_length > 1:
        # Vectorized views collapse the innermost dimension.
        vshape[-1] //= vector_length

    # ------------------------------------------------------------------
    # Nested helpers; closures over sdfg/pre_state/state/post_state above.
    # ------------------------------------------------------------------

    def add_input(node, bank):
        """Add reader logic for one Input node, assigned to memory `bank`."""

        # Collapse iterators and shape if input is lower dimensional
        for output in node.outputs.values():
            try:
                input_pars = output["input_dims"][:]
            except (KeyError, TypeError):
                input_pars = list(parameters)  # Copy
            break  # Just needed any output to retrieve the dimensions
        else:
            raise ValueError("Input {} is not connected to anything.".format(
                node.name))

        # If scalar, just add a symbol
        if len(input_pars) == 0:
            sdfg.add_symbol(node.name, node.data_type)
            return  # We're done

        input_shape = [shape[list(parameters).index(i)] for i in input_pars]
        input_accesses = str(functools.reduce(operator.mul, input_shape, 1))

        # Only vectorize the read if the innermost dimensions is read
        input_vector_length = (vector_length
                               if input_pars[-1] == parameters[-1] else 1)
        input_vtype = (dace.dtypes.vector(node.data_type, input_vector_length)
                       if input_vector_length > 1 else node.data_type)
        input_vshape = list(input_shape)
        if input_vector_length > 1:
            input_vshape[-1] //= input_vector_length

        # Sort to get deterministic output
        outputs = sorted([e[1].name for e in chain.graph.out_edges(node)])

        out_memlets = ["_" + o for o in outputs]

        # NOTE: `exit` shadows the builtin; kept as-is.
        entry, exit = state.add_map("read_" + node.name,
                                    iterators,
                                    schedule=ScheduleType.FPGA_Device)

        if not synthetic_reads:
            # Real reads: allocate host/device arrays and copy data over.

            # Host-side array, which will be an input argument
            sdfg.add_array(node.name + "_host", input_shape, node.data_type)

            # Device-side copy
            _, array = sdfg.add_array(node.name,
                                      input_vshape,
                                      input_vtype,
                                      storage=StorageType.FPGA_Global,
                                      transient=True)
            array.location["bank"] = bank
            access_node = state.add_read(node.name)

            # Copy data to the FPGA
            copy_host = pre_state.add_read(node.name + "_host")
            copy_fpga = pre_state.add_write(node.name)
            pre_state.add_memlet_path(copy_host,
                                      copy_fpga,
                                      memlet=Memlet.simple(
                                          copy_fpga,
                                          ", ".join("0:{}".format(s)
                                                    for s in input_vshape),
                                          num_accesses=input_accesses))

            tasklet_code = "\n".join(
                ["{} = memory".format(o) for o in out_memlets])

            tasklet = state.add_tasklet("read_" + node.name, {"memory"},
                                        out_memlets, tasklet_code)

            vectorized_pars = input_pars
            # if input_vector_length > 1:
            #     vectorized_pars[-1] = "{}*{}".format(input_vector_length,
            #                                          vectorized_pars[-1])

            # Lower-dimensional arrays should buffer values and send them
            # multiple times
            is_lower_dim = len(input_shape) != len(shape)
            if is_lower_dim:
                buffer_name = node.name + "_buffer"
                sdfg.add_array(buffer_name,
                               input_shape,
                               input_vtype,
                               storage=StorageType.FPGA_Local,
                               transient=True)
                buffer_node = state.add_access(buffer_name)
                buffer_entry, buffer_exit = state.add_map(
                    "buffer_" + node.name, {
                        k: "0:{}".format(v)
                        for k, v in zip(input_pars, input_shape)
                    },
                    schedule=dace.ScheduleType.FPGA_Device)
                buffer_tasklet = state.add_tasklet("buffer_" + node.name,
                                                   {"memory"}, {"buffer"},
                                                   "buffer = memory")
                state.add_memlet_path(access_node,
                                      buffer_entry,
                                      buffer_tasklet,
                                      dst_conn="memory",
                                      memlet=dace.Memlet.simple(
                                          access_node.data,
                                          ", ".join(vectorized_pars),
                                          num_accesses=1))
                state.add_memlet_path(buffer_tasklet,
                                      buffer_exit,
                                      buffer_node,
                                      src_conn="buffer",
                                      memlet=dace.Memlet.simple(
                                          buffer_node.data,
                                          ", ".join(input_pars),
                                          num_accesses=1))
                state.add_memlet_path(buffer_node,
                                      entry,
                                      tasklet,
                                      dst_conn="memory",
                                      memlet=dace.Memlet.simple(
                                          buffer_node.data,
                                          ", ".join(input_pars),
                                          num_accesses=1))
            else:
                state.add_memlet_path(access_node,
                                      entry,
                                      tasklet,
                                      dst_conn="memory",
                                      memlet=Memlet.simple(
                                          node.name,
                                          ", ".join(vectorized_pars),
                                          num_accesses=1))
        else:
            # Generate synthetic inputs without memory: the tasklet emits a
            # constant value, so no arrays or host copies are needed.
            tasklet_code = "\n".join([
                "{} = {}".format(o, float(synthetic_reads)) for o in out_memlets
            ])
            tasklet = state.add_tasklet("read_" + node.name, {}, out_memlets,
                                        tasklet_code)
            state.add_memlet_path(entry, tasklet, memlet=dace.Memlet())

        # Add memlets to all FIFOs connecting to compute units
        for out_name, out_memlet in zip(outputs, out_memlets):
            stream_name = "read_{}_to_{}".format(node.name, out_name)
            write_node = state.add_write(stream_name)
            state.add_memlet_path(tasklet,
                                  exit,
                                  write_node,
                                  src_conn=out_memlet,
                                  memlet=Memlet.simple(stream_name,
                                                       "0",
                                                       num_accesses=1))

    def add_output(node, bank):
        """Add writer logic for one Output node, assigned to memory `bank`."""
        # Host-side array, which will be an output argument
        try:
            sdfg.add_array(node.name + "_host", shape, node.data_type)
            _, array = sdfg.add_array(node.name,
                                      vshape,
                                      dace.dtypes.vector(
                                          node.data_type, vector_length),
                                      storage=StorageType.FPGA_Global,
                                      transient=True)
            array.location["bank"] = bank
        except NameError:
            # This array is also read
            sdfg.data(node.name + "_host").access = dace.AccessType.ReadWrite
            sdfg.data(node.name).access = dace.AccessType.ReadWrite

        # Device-side copy
        write_node = state.add_write(node.name)

        # Copy data to the host
        copy_fpga = post_state.add_read(node.name)
        copy_host = post_state.add_write(node.name + "_host")
        post_state.add_memlet_path(copy_fpga,
                                   copy_host,
                                   memlet=Memlet.simple(
                                       copy_fpga,
                                       ", ".join(memcopy_indices),
                                       num_accesses=memcopy_accesses))

        entry, exit = state.add_map("write_" + node.name,
                                    iterators,
                                    schedule=ScheduleType.FPGA_Device)

        src = chain.graph.in_edges(node)
        if len(src) > 1:
            raise RuntimeError("Only one writer per output supported")
        src = next(iter(src))[0]

        in_memlet = "_" + src.name

        tasklet_code = "memory = " + in_memlet

        tasklet = state.add_tasklet("write_" + node.name, {in_memlet},
                                    {"memory"}, tasklet_code)

        vectorized_pars = copy.copy(parameters)
        # if vector_length > 1:
        #     vectorized_pars[-1] = "{}*{}".format(vector_length,
        #                                          vectorized_pars[-1])

        stream_name = "{}_to_write_{}".format(src.name, node.name)
        read_node = state.add_read(stream_name)

        state.add_memlet_path(read_node,
                              entry,
                              tasklet,
                              dst_conn=in_memlet,
                              memlet=Memlet.simple(stream_name,
                                                   "0",
                                                   num_accesses=1))
        state.add_memlet_path(tasklet,
                              exit,
                              write_node,
                              src_conn="memory",
                              memlet=Memlet.simple(node.name,
                                                   ", ".join(vectorized_pars),
                                                   num_accesses=1))

    def add_kernel(node):
        """Instantiate one stencil kernel and wire its stream connectors."""

        (stencil_node, input_to_connector,
         output_to_connector) = _generate_stencil(node, chain, shape,
                                                  dimensions_to_skip)

        if len(stencil_node.output_fields) == 0:
            if len(input_to_connector) == 0:
                warnings.warn("Ignoring orphan stencil: {}".format(node.name))
            else:
                raise ValueError("Orphan stencil with inputs: {}".format(
                    node.name))
            return

        vendor_str = dace.config.Config.get("compiler", "fpga_vendor")
        if vendor_str == "intel_fpga":
            stencil_node.implementation = "Intel FPGA"
        elif vendor_str == "xilinx":
            stencil_node.implementation = "Xilinx"
        else:
            raise ValueError(f"Unsupported FPGA backend: {vendor_str}")

        state.add_node(stencil_node)

        # A neighbor that is not itself a Kernel is a memory endpoint, so
        # its stream name carries the read_/write_ prefix/suffix.
        is_from_memory = {
            e[0].name: not isinstance(e[0], stencilflow.kernel.Kernel)
            for e in chain.graph.in_edges(node)
        }
        is_to_memory = {
            e[1].name: not isinstance(e[1], stencilflow.kernel.Kernel)
            for e in chain.graph.out_edges(node)
        }

        # Add read nodes and memlets
        for field_name, connector in input_to_connector.items():

            input_vector_length = vector_length
            try:
                # Scalars are symbols rather than data nodes
                if len(node.inputs[field_name]["input_dims"]) == 0:
                    continue
                else:
                    # If the innermost dimension of this field is not the
                    # vectorized one, read it as scalars
                    if (node.inputs[field_name]["input_dims"][-1] !=
                            parameters[-1]):
                        input_vector_length = 1
            except (KeyError, TypeError):
                pass  # input_dim is not defined or is None

            if is_from_memory[field_name]:
                stream_name = "read_{}_to_{}".format(field_name, node.name)
            else:
                stream_name = "{}_to_{}".format(field_name, node.name)

            # Outer memory read
            read_node = state.add_read(stream_name)
            state.add_memlet_path(read_node,
                                  stencil_node,
                                  dst_conn=connector,
                                  memlet=Memlet.simple(
                                      stream_name,
                                      "0",
                                      num_accesses=memcopy_accesses))

        # Add write nodes and memlets
        for output_name, connector in output_to_connector.items():

            # Add write node and memlet
            if is_to_memory[output_name]:
                stream_name = "{}_to_write_{}".format(node.name, output_name)
            else:
                stream_name = "{}_to_{}".format(node.name, output_name)

            # Outer write
            write_node = state.add_write(stream_name)
            state.add_memlet_path(stencil_node,
                                  write_node,
                                  src_conn=connector,
                                  memlet=Memlet.simple(
                                      stream_name,
                                      "0",
                                      num_accesses=memcopy_accesses))

    # First generate all connections between kernels and memories
    for link in chain.graph.edges(data=True):
        _add_pipe(sdfg, link, parameters, vector_length)

    bank = 0

    # Now generate all memory access functions so arrays are registered
    for node in chain.graph.nodes():
        if isinstance(node, Input):
            add_input(node, bank)
            bank = (bank + 1) % NUM_BANKS
        elif isinstance(node, Output):
            add_output(node, bank)
            bank = (bank + 1) % NUM_BANKS
        elif isinstance(node, Kernel):
            # Generate these separately after
            pass
        else:
            raise RuntimeError("Unexpected node type: {}".format(
                node.node_type))

    # Finally generate the compute kernels
    for node in chain.graph.nodes():
        if isinstance(node, Kernel):
            add_kernel(node)

    return sdfg
def apply(self, _, sdfg: sd.SDFG):
    """Peel ``self.count`` iterations off the matched for-loop.

    When ``self.begin`` is set, the first ``count`` iterations are
    instantiated as explicit states placed before the loop guard, and the
    loop's initialization assignment is advanced by ``count * stride``.
    Otherwise, the last ``count`` iterations are instantiated after the
    loop's exit state and both guard conditions are rewritten via
    ``self._modify_cond`` so the loop stops ``count`` iterations early.
    """
    ####################################################################
    # Obtain loop information
    guard: sd.SDFGState = self.loop_guard
    begin: sd.SDFGState = self.loop_begin
    after_state: sd.SDFGState = self.exit_state

    # Obtain iteration variable, range, and stride
    condition_edge = sdfg.edges_between(guard, begin)[0]
    not_condition_edge = sdfg.edges_between(guard, after_state)[0]
    # rng is (start, end, stride); loop_struct is (states before the loop,
    # last state of the loop body) as returned by find_for_loop.
    itervar, rng, loop_struct = find_for_loop(sdfg, guard, begin)

    # Get loop states: everything reachable from `begin` without passing
    # through the guard again.
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_state = loop_struct[1]
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    ####################################################################
    # Transform

    if self.begin:
        # Peel from the start: change the initialization assignment and
        # prepend instantiated iterations before the guard.
        init_edges = []
        before_states = loop_struct[0]
        for before_state in before_states:
            init_edge = sdfg.edges_between(before_state, guard)[0]
            init_edge.data.assignments[itervar] = str(rng[0] +
                                                      self.count * rng[2])
            init_edges.append(init_edge)
        append_states = before_states

        # Add `count` states, each with instantiated iteration variable
        for i in range(self.count):
            # Instantiate loop states with iterate value
            # (state name is sanitized: -, +, *, / are not valid in labels)
            state_name: str = 'start_' + itervar + str(i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                rng[0] + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            for append_state in append_states:
                sdfg.add_edge(append_state, new_states[first_id],
                              sd.InterstateEdge())
            append_states = [new_states[last_id]]

        # Reconnect edge to guard state from last peeled iteration
        for append_state in append_states:
            if append_state not in before_states:
                for init_edge in init_edges:
                    sdfg.remove_edge(init_edge)
                sdfg.add_edge(append_state, guard, init_edges[0].data)
    else:
        # Peel from the end: stop the loop `count` iterations early and
        # append instantiated iterations after the exit state.
        itervar_sym = pystr_to_symbolic(itervar)
        condition_edge.data.condition = CodeBlock(
            self._modify_cond(condition_edge.data.condition, itervar, rng[2]))
        not_condition_edge.data.condition = CodeBlock(
            self._modify_cond(not_condition_edge.data.condition, itervar,
                              rng[2]))
        prepend_state = after_state

        # Add `count` states, each with instantiated iteration variable;
        # iterate in reverse so states end up in increasing iterate order.
        for i in reversed(range(self.count)):
            # Instantiate loop states with iterate value
            state_name: str = 'end_' + itervar + str(-i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                itervar_sym + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            sdfg.add_edge(new_states[last_id], prepend_state,
                          sd.InterstateEdge())
            prepend_state = new_states[first_id]

        # Reconnect edge to guard state from last peeled iteration
        if prepend_state != after_state:
            sdfg.remove_edge(not_condition_edge)
            sdfg.add_edge(guard, prepend_state, not_condition_edge.data)
def apply(self, sdfg: sd.SDFG):
    """Transform an SDFG for GPU execution.

    Steps performed in order:
      0. Collect non-transient input/output data and free code nodes.
      1. Clone input/output arrays into GPU-global transients and redirect
         all access nodes and memlets to the clones.
      2./3. Add copy-in and copy-out states around the SDFG.
      4. Move eligible top-level transients to GPU-global storage (and
         optionally registers for inner transients).
      5. Wrap free tasklets in trivial single-iteration GPU maps.
      6. Set top-level maps/library nodes to the GPU_Device schedule.
      7. Copy data back to host wherever interstate edges read it.
      8. Optionally apply strict transformations.
    """
    #######################################################
    # Step 0: SDFG metadata

    # Find all input and output data descriptors
    input_nodes = []
    output_nodes = []
    global_code_nodes = [[] for _ in sdfg.nodes()]

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.desc(sdfg).transient == False):
                # NOTE(review): input_nodes/output_nodes hold (name, desc)
                # tuples, so `node.data not in input_nodes` compares a str
                # against tuples and is always True; duplicates are later
                # removed by set() / dtypes.deduplicate — confirm intended.
                if (state.out_degree(node) > 0
                        and node.data not in input_nodes):
                    # Special case: nodes that lead to top-level dynamic
                    # map ranges must stay on host
                    for e in state.out_edges(node):
                        last_edge = state.memlet_path(e)[-1]
                        if (isinstance(last_edge.dst, nodes.EntryNode)
                                and last_edge.dst_conn
                                and not last_edge.dst_conn.startswith('IN_')
                                and sdict[last_edge.dst] is None):
                            break
                    else:
                        input_nodes.append((node.data, node.desc(sdfg)))
                if (state.in_degree(node) > 0
                        and node.data not in output_nodes):
                    output_nodes.append((node.data, node.desc(sdfg)))
            elif isinstance(node, nodes.CodeNode) and sdict[node] is None:
                if not isinstance(node,
                                  (nodes.LibraryNode, nodes.NestedSDFG)):
                    global_code_nodes[i].append(node)

        # Input nodes may also be nodes with WCR memlets and no identity
        for e in state.edges():
            if e.data.wcr is not None:
                if (e.data.data not in input_nodes
                        and sdfg.arrays[e.data.data].transient == False):
                    input_nodes.append(
                        (e.data.data, sdfg.arrays[e.data.data]))

    start_state = sdfg.start_state
    end_states = sdfg.sink_nodes()

    #######################################################
    # Step 1: Create cloned GPU arrays and replace originals

    cloned_arrays = {}
    for inodename, inode in set(input_nodes):
        if isinstance(inode, data.Scalar):  # Scalars can remain on host
            continue
        if inode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = inode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + inodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[inodename] = name

    for onodename, onode in set(output_nodes):
        if onodename in cloned_arrays:
            continue
        if onode.storage == dtypes.StorageType.GPU_Global:
            continue
        newdesc = onode.clone()
        newdesc.storage = dtypes.StorageType.GPU_Global
        newdesc.transient = True
        name = sdfg.add_datadesc('gpu_' + onodename,
                                 newdesc,
                                 find_new_name=True)
        cloned_arrays[onodename] = name

    # Replace nodes
    for state in sdfg.nodes():
        for node in state.nodes():
            if (isinstance(node, nodes.AccessNode)
                    and node.data in cloned_arrays):
                node.data = cloned_arrays[node.data]

    # Replace memlets
    for state in sdfg.nodes():
        for edge in state.edges():
            if edge.data.data in cloned_arrays:
                edge.data.data = cloned_arrays[edge.data.data]

    #######################################################
    # Step 2: Create copy-in state
    excluded_copyin = self.exclude_copyin.split(',')

    copyin_state = sdfg.add_state(sdfg.label + '_copyin')
    sdfg.add_edge(copyin_state, start_state, sd.InterstateEdge())

    for nname, desc in dtypes.deduplicate(input_nodes):
        if nname in excluded_copyin or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        copyin_state.add_node(src_array)
        copyin_state.add_node(dst_array)
        copyin_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

    #######################################################
    # Step 3: Create copy-out state
    excluded_copyout = self.exclude_copyout.split(',')

    copyout_state = sdfg.add_state(sdfg.label + '_copyout')
    for state in end_states:
        sdfg.add_edge(state, copyout_state, sd.InterstateEdge())

    for nname, desc in dtypes.deduplicate(output_nodes):
        if nname in excluded_copyout or nname not in cloned_arrays:
            continue
        src_array = nodes.AccessNode(cloned_arrays[nname],
                                     debuginfo=desc.debuginfo)
        dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
        copyout_state.add_node(src_array)
        copyout_state.add_node(dst_array)
        copyout_state.add_nedge(
            src_array, dst_array,
            memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

    #######################################################
    # Step 4: Modify transient data storage

    for state in sdfg.nodes():
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node,
                          nodes.AccessNode) and node.desc(sdfg).transient:
                nodedesc = node.desc(sdfg)

                # Special case: nodes that lead to dynamic map ranges must
                # stay on host
                if any(
                        isinstance(
                            state.memlet_path(e)[-1].dst, nodes.EntryNode)
                        for e in state.out_edges(node)):
                    continue

                gpu_storage = [
                    dtypes.StorageType.GPU_Global,
                    dtypes.StorageType.GPU_Shared,
                    dtypes.StorageType.CPU_Pinned
                ]
                if sdict[
                        node] is None and nodedesc.storage not in gpu_storage:
                    # NOTE: the cloned arrays match too but it's the same
                    # storage so we don't care
                    nodedesc.storage = dtypes.StorageType.GPU_Global

                    # Try to move allocation/deallocation out of loops
                    if (self.toplevel_trans
                            and not isinstance(nodedesc, data.Stream)):
                        nodedesc.lifetime = dtypes.AllocationLifetime.SDFG
                elif nodedesc.storage not in gpu_storage:
                    # Make internal transients registers
                    if self.register_trans:
                        nodedesc.storage = dtypes.StorageType.Register

    #######################################################
    # Step 5: Wrap free tasklets and nested SDFGs with a GPU map

    for state, gcodes in zip(sdfg.nodes(), global_code_nodes):
        for gcode in gcodes:
            if gcode.label in self.exclude_tasklets.split(','):
                continue
            # Create map and connectors
            me, mx = state.add_map(gcode.label + '_gmap',
                                   {gcode.label + '__gmapi': '0:1'},
                                   schedule=dtypes.ScheduleType.GPU_Device)
            # Store in/out edges in lists so that they don't get corrupted
            # when they are removed from the graph
            in_edges = list(state.in_edges(gcode))
            out_edges = list(state.out_edges(gcode))
            me.in_connectors = {('IN_' + e.dst_conn): None
                                for e in in_edges}
            me.out_connectors = {('OUT_' + e.dst_conn): None
                                 for e in in_edges}
            mx.in_connectors = {('IN_' + e.src_conn): None
                                for e in out_edges}
            mx.out_connectors = {('OUT_' + e.src_conn): None
                                 for e in out_edges}

            # Create memlets through map
            for e in in_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                               e.data)
                state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                               e.data)
            for e in out_edges:
                state.remove_edge(e)
                state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                               e.data)
                state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                               e.data)

            # Map without inputs
            if len(in_edges) == 0:
                state.add_nedge(me, gcode, memlet.Memlet())

    #######################################################
    # Step 6: Change all top-level maps and library nodes to GPU schedule

    for i, state in enumerate(sdfg.nodes()):
        sdict = state.scope_dict()
        for node in state.nodes():
            if isinstance(node, (nodes.EntryNode, nodes.LibraryNode)):
                if sdict[node] is None:
                    node.schedule = dtypes.ScheduleType.GPU_Device
                # (the isinstance re-check below is redundant inside this
                # branch; kept as-is)
                elif (isinstance(node,
                                 (nodes.EntryNode, nodes.LibraryNode))
                      and self.sequential_innermaps):
                    node.schedule = dtypes.ScheduleType.Sequential

    #######################################################
    # Step 7: Introduce copy-out if data used in outgoing interstate edges

    for state in list(sdfg.nodes()):
        arrays_used = set()
        for e in sdfg.out_edges(state):
            # Used arrays = intersection between symbols and cloned arrays
            arrays_used.update(
                set(e.data.free_symbols) & set(cloned_arrays.keys()))

        # Create a state and copy out used arrays
        if len(arrays_used) > 0:
            co_state = sdfg.add_state(state.label + '_icopyout')

            # Reconnect outgoing edges to after interim copyout state
            # NOTE(review): the loop variable `e` is unused and
            # change_edge_src redirects all edges from `state` in one call;
            # repeated invocation looks redundant — confirm intended.
            for e in sdfg.out_edges(state):
                sdutil.change_edge_src(sdfg, state, co_state)
            # Add unconditional edge to interim state
            sdfg.add_edge(state, co_state, sd.InterstateEdge())

            # Add copy-out nodes
            for nname in arrays_used:
                desc = sdfg.arrays[nname]
                src_array = nodes.AccessNode(cloned_arrays[nname],
                                             debuginfo=desc.debuginfo)
                dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
                co_state.add_node(src_array)
                co_state.add_node(dst_array)
                co_state.add_nedge(
                    src_array, dst_array,
                    memlet.Memlet.from_array(dst_array.data,
                                             dst_array.desc(sdfg)))

    #######################################################
    # Step 8: Strict transformations
    if not self.strict_transform:
        return

    # Apply strict state fusions greedily.
    sdfg.apply_strict_transformations()
def make_read_row():
    """Construct the SDFG that streams the A_row array to four consumers.

    An explicit state machine implements ``for h in range(H + 1)``; each
    iteration reads one row-pointer element and forwards the same value to
    the value, column, compute, and x pipes.
    """
    sdfg = SDFG("spmv_read_row")

    # Loop skeleton: begin -> entry; entry -(h < H+1)-> body -> entry;
    # entry -(h >= H+1)-> end.
    state_begin = sdfg.add_state("begin")
    state_entry = sdfg.add_state("entry")
    state_end = sdfg.add_state("end")
    state_body = sdfg.add_state("body")

    sdfg.add_edge(state_begin, state_entry,
                  InterstateEdge(assignments={"h": "0"}))
    sdfg.add_edge(
        state_entry, state_body,
        InterstateEdge(condition=CodeProperty.from_string(
            "h < H + 1", language=Language.Python)))
    sdfg.add_edge(
        state_entry, state_end,
        InterstateEdge(condition=CodeProperty.from_string(
            "h >= H + 1", language=Language.Python)))
    sdfg.add_edge(state_body, state_entry,
                  InterstateEdge(assignments={"h": "h + 1"}))

    # Source array in global memory and the four destination FIFOs.
    row_array = state_body.add_array("A_row_mem", (H + 1, ),
                                     itype,
                                     storage=StorageType.FPGA_Global)
    pipe_names = ["to_val_pipe", "to_col_pipe", "to_compute_pipe",
                  "to_x_pipe"]
    pipes = [
        state_body.add_stream(p, itype, storage=StorageType.FPGA_Local)
        for p in pipe_names
    ]
    out_conns = [p.replace("_pipe", "_out") for p in pipe_names]

    # Broadcast tasklet: one input element fans out to every pipe.
    broadcast = state_body.add_tasklet(
        "read_row", {"row_in"}, set(out_conns),
        "\n".join("{} = row_in".format(c) for c in out_conns))

    state_body.add_memlet_path(row_array,
                               broadcast,
                               dst_conn="row_in",
                               memlet=Memlet.simple(row_array, "h"))
    for pipe, conn in zip(pipes, out_conns):
        state_body.add_memlet_path(broadcast,
                                   pipe,
                                   src_conn=conn,
                                   memlet=Memlet.simple(pipe, "0"))

    return sdfg
def make_compute_nested_sdfg():
    """Build the nested SDFG for one multiply-accumulate step.

    The "if" state always computes ``b_tmp = a_in * x_in``; control then
    branches on ``c``: for ``c == 0`` the product overwrites ``b_out``,
    otherwise it is added to the previous partial result ``b_in``.
    """
    sdfg = SDFG("spmv_compute_nested")

    branch_state = sdfg.add_state("if")
    first_state = sdfg.add_state("then")
    accum_state = sdfg.add_state("else")
    join_state = sdfg.add_state("end")

    sdfg.add_edge(
        branch_state, first_state,
        InterstateEdge(condition=CodeProperty.from_string(
            "c == 0", language=Language.Python)))
    sdfg.add_edge(
        branch_state, accum_state,
        InterstateEdge(condition=CodeProperty.from_string(
            "c != 0", language=Language.Python)))
    sdfg.add_edge(first_state, join_state, InterstateEdge())
    sdfg.add_edge(accum_state, join_state, InterstateEdge())

    # Unconditional multiply into the transient b_tmp.
    mul_a = branch_state.add_scalar("a_in",
                                    dtype,
                                    storage=StorageType.FPGA_Registers)
    mul_x = branch_state.add_scalar("x_in",
                                    dtype,
                                    storage=StorageType.FPGA_Registers)
    mul_out = branch_state.add_scalar("b_tmp",
                                      dtype,
                                      transient=True,
                                      storage=StorageType.FPGA_Registers)
    mul_tasklet = branch_state.add_tasklet("compute", {"_a_in", "_x_in"},
                                           {"_b_out"},
                                           "_b_out = _a_in * _x_in")
    branch_state.add_memlet_path(mul_a,
                                 mul_tasklet,
                                 dst_conn="_a_in",
                                 memlet=Memlet.simple(mul_a, "0"))
    branch_state.add_memlet_path(mul_x,
                                 mul_tasklet,
                                 dst_conn="_x_in",
                                 memlet=Memlet.simple(mul_x, "0"))
    branch_state.add_memlet_path(mul_tasklet,
                                 mul_out,
                                 src_conn="_b_out",
                                 memlet=Memlet.simple(mul_out, "0"))

    # "then" branch: first element, forward the product directly.
    then_src = first_state.add_scalar("b_tmp",
                                      dtype,
                                      transient=True,
                                      storage=StorageType.FPGA_Registers)
    then_dst = first_state.add_scalar("b_out",
                                      dtype,
                                      storage=StorageType.FPGA_Registers)
    first_state.add_memlet_path(then_src,
                                then_dst,
                                memlet=Memlet.simple(then_dst, "0"))

    # "else" branch: accumulate onto the previous partial result.
    else_tmp = accum_state.add_scalar("b_tmp",
                                      dtype,
                                      transient=True,
                                      storage=StorageType.FPGA_Registers)
    else_prev = accum_state.add_scalar("b_in",
                                       dtype,
                                       storage=StorageType.FPGA_Registers)
    else_dst = accum_state.add_scalar("b_out",
                                      dtype,
                                      storage=StorageType.FPGA_Registers)
    add_tasklet = accum_state.add_tasklet("b_wcr", {"_b_in", "b_prev"},
                                          {"_b_out"},
                                          "_b_out = b_prev + _b_in")
    accum_state.add_memlet_path(else_tmp,
                                add_tasklet,
                                dst_conn="_b_in",
                                memlet=Memlet.simple(else_tmp, "0"))
    accum_state.add_memlet_path(else_prev,
                                add_tasklet,
                                dst_conn="b_prev",
                                memlet=Memlet.simple(else_prev, "0"))
    accum_state.add_memlet_path(add_tasklet,
                                else_dst,
                                src_conn="_b_out",
                                memlet=Memlet.simple(else_dst, "0"))

    return sdfg
def apply(self, sdfg: sd.SDFG):
    """Peel ``self.count`` iterations off the matched for-loop.

    When ``self.begin`` is set, the first ``count`` iterations become
    explicit states prepended before the guard and the initialization
    assignment is advanced by ``count * stride``; otherwise the last
    ``count`` iterations are appended after the exit state and the guard
    conditions are rewritten via ``self._modify_cond``.
    """
    ####################################################################
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride.
    # NOTE(review): the code below indexes guard_inedges[0] and [1], i.e.
    # it assumes the guard has exactly two in-edges (initialization and
    # loop back-edge) — confirm DetectLoop guarantees this.
    guard_inedges = sdfg.in_edges(guard)
    condition_edge = sdfg.edges_between(guard, begin)[0]
    not_condition_edge = sdfg.edges_between(guard, after_state)[0]
    itervar = list(guard_inedges[0].data.assignments.keys())[0]
    condition = condition_edge.data.condition_sympy()
    # rng is (start, end, stride)
    rng = self._loop_range(itervar, guard_inedges, condition)

    # Find the state prior to the loop: the in-edge whose assignment of
    # the iteration variable equals the range start is the init edge.
    if rng[0] == symbolic.pystr_to_symbolic(
            guard_inedges[0].data.assignments[itervar]):
        init_edge: sd.InterstateEdge = guard_inedges[0]
        before_state: sd.SDFGState = guard_inedges[0].src
        last_state: sd.SDFGState = guard_inedges[1].src
    else:
        init_edge: sd.InterstateEdge = guard_inedges[1]
        before_state: sd.SDFGState = guard_inedges[1].src
        last_state: sd.SDFGState = guard_inedges[0].src

    # Get loop states: everything reachable from `begin` without passing
    # through the guard again.
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    ####################################################################
    # Transform

    if self.begin:
        # Peel from the start: change the initialization assignment and
        # prepend instantiated iterations before the guard.
        init_edge.data.assignments[itervar] = str(rng[0] +
                                                  self.count * rng[2])
        append_state = before_state

        # Add `count` states, each with instantiated iteration variable
        for i in range(self.count):
            # Instantiate loop states with iterate value
            # (state name is sanitized: -, +, *, / are not valid in labels)
            state_name: str = 'start_' + itervar + str(i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                rng[0] + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            sdfg.add_edge(append_state, new_states[first_id],
                          sd.InterstateEdge())
            append_state = new_states[last_id]

        # Reconnect edge to guard state from last peeled iteration
        if append_state != before_state:
            sdfg.remove_edge(init_edge)
            sdfg.add_edge(append_state, guard, init_edge.data)
    else:
        # Peel from the end: stop the loop `count` iterations early and
        # append instantiated iterations after the exit state.
        itervar_sym = pystr_to_symbolic(itervar)
        condition_edge.data.condition = CodeBlock(
            self._modify_cond(condition_edge.data.condition, itervar, rng[2]))
        not_condition_edge.data.condition = CodeBlock(
            self._modify_cond(not_condition_edge.data.condition, itervar,
                              rng[2]))
        prepend_state = after_state

        # Add `count` states, each with instantiated iteration variable;
        # iterate in reverse so states end up in increasing iterate order.
        for i in reversed(range(self.count)):
            # Instantiate loop states with iterate value
            state_name: str = 'end_' + itervar + str(-i * rng[2])
            state_name = state_name.replace('-', 'm').replace(
                '+', 'p').replace('*', 'M').replace('/', 'D')
            new_states = self.instantiate_loop(
                sdfg,
                loop_states,
                loop_subgraph,
                itervar,
                itervar_sym + i * rng[2],
                state_name,
            )

            # Connect states to before the loop with unconditional edges
            sdfg.add_edge(new_states[last_id], prepend_state,
                          sd.InterstateEdge())
            prepend_state = new_states[first_id]

        # Reconnect edge to guard state from last peeled iteration
        if prepend_state != after_state:
            sdfg.remove_edge(not_condition_edge)
            sdfg.add_edge(guard, prepend_state, not_condition_edge.data)
def generate_reference(name, chain):
    """Generate a simple, unoptimized CPU SDFG for verification purposes.

    Vectorization is disabled, every kernel gets its own state, and
    dependencies are enforced by chaining states in topological order of
    the stencil chain graph.

    :param name: Name of the generated SDFG (note: the local `name` is
                 later shadowed by the edge loop below).
    :param chain: Stencil chain description to translate.
    :return: The constructed SDFG.
    """

    sdfg = SDFG(name)

    for k, v in chain.constants.items():
        sdfg.add_constant(k, v["value"], dace.data.Scalar(v["data_type"]))

    (dimensions_to_skip, shape, vector_length, parameters, iterators,
     memcopy_indices, memcopy_accesses) = _generate_init(chain)

    prev_state = sdfg.add_state("init")

    # Throw vectorization in the bin for the reference code
    vector_length = 1

    shape = tuple(map(int, shape))

    input_shapes = {}  # Maps inputs to their shape tuple

    # Register arrays (or symbols, for scalars) for all Input/Output nodes.
    for node in chain.graph.nodes():
        if isinstance(node, Input) or isinstance(node, Output):
            if isinstance(node, Input):
                # Inputs may be lower-dimensional: keep only the dimensions
                # named in the first output's "input_dims" (if present).
                for output in node.outputs.values():
                    pars = tuple(
                        output["input_dims"]
                    ) if "input_dims" in output and output[
                        "input_dims"] is not None else tuple(parameters)
                    arr_shape = tuple(s for s, p in zip(shape, parameters)
                                      if p in pars)
                    input_shapes[node.name] = arr_shape
                    break  # Only the first output is needed
                else:
                    raise ValueError("No outputs found for input node.")
            else:
                arr_shape = shape
            if len(arr_shape) > 0:
                try:
                    sdfg.add_array(node.name, arr_shape, node.data_type)
                except NameError:
                    # Already added (read and written): mark as read-write.
                    sdfg.data(
                        node.name).access = dace.dtypes.AccessType.ReadWrite
            else:
                # Scalars become symbols rather than data nodes.
                sdfg.add_symbol(node.name, node.data_type)

    # Transient full-size arrays for kernel-to-kernel links.
    # NOTE(review): `name` shadows the function parameter from here on.
    for link in chain.graph.edges(data=True):
        name = link[0].name
        if name not in sdfg.arrays and name not in sdfg.symbols:
            sdfg.add_array(name, shape, link[0].data_type, transient=True)
            input_shapes[name] = tuple(shape)

    # Full-range subset strings ("0:N, ...") per input array.
    input_iterators = {
        k: tuple("0:{}".format(s) for s in v)
        for k, v in input_shapes.items()
    }

    # Enforce dependencies via topological sort
    for node in nx.topological_sort(chain.graph):

        if not isinstance(node, Kernel):
            continue

        state = sdfg.add_state(node.name)
        sdfg.add_edge(prev_state, state, dace.InterstateEdge())

        (stencil_node, input_to_connector,
         output_to_connector) = _generate_stencil(node, chain, shape,
                                                  dimensions_to_skip)
        stencil_node.implementation = "CPU"

        for field, connector in input_to_connector.items():

            if len(input_iterators[field]) == 0:
                continue  # Scalar variable

            # Outer memory read
            read_node = state.add_read(field)
            state.add_memlet_path(read_node,
                                  stencil_node,
                                  dst_conn=connector,
                                  memlet=Memlet.simple(
                                      field,
                                      ", ".join(input_iterators[field])))

        for _, connector in output_to_connector.items():

            # Outer write
            write_node = state.add_write(node.name)
            state.add_memlet_path(stencil_node,
                                  write_node,
                                  src_conn=connector,
                                  memlet=Memlet.simple(
                                      node.name, ", ".join(
                                          "0:{}".format(s) for s in shape)))

        prev_state = state

    return sdfg
def apply(self, outer_state: SDFGState, sdfg: SDFG):
    """Inline a multi-state nested SDFG into its parent SDFG.

    Merges the nested SDFG's metadata (global/init/exit code, environments,
    constants) into the parent, renames clashing transients, state labels,
    and interstate assignments, splices the nested state machine into the
    parent in place of `outer_state`, and removes the nested-SDFG node.

    :param outer_state: The parent state containing the nested SDFG node.
    :param sdfg: The parent SDFG.
    :return: The list of states that were inlined into the parent.
    """
    nsdfg_node = self.nested_sdfg
    nsdfg: SDFG = nsdfg_node.sdfg

    # Propagate the node's schedule into the nested SDFG before inlining.
    if nsdfg_node.schedule is not dtypes.ScheduleType.Default:
        infer_types.set_default_schedule_and_storage_types(
            nsdfg, nsdfg_node.schedule)

    #######################################################
    # Collect and update top-level SDFG metadata

    # Global/init/exit code
    for loc, code in nsdfg.global_code.items():
        sdfg.append_global_code(code.code, loc)
    for loc, code in nsdfg.init_code.items():
        sdfg.append_init_code(code.code, loc)
    for loc, code in nsdfg.exit_code.items():
        sdfg.append_exit_code(code.code, loc)

    # Environments: code nodes inherit the nested-SDFG node's environments.
    for nstate in nsdfg.nodes():
        for node in nstate.nodes():
            if isinstance(node, nodes.CodeNode):
                node.environments |= nsdfg_node.environments

    # Constants: merge, warning (not failing) on conflicting values.
    for cstname, cstval in nsdfg.constants.items():
        if cstname in sdfg.constants:
            if cstval != sdfg.constants[cstname]:
                warnings.warn('Constant value mismatch for "%s" while '
                              'inlining SDFG. Inner = %s != %s = outer' %
                              (cstname, cstval, sdfg.constants[cstname]))
        else:
            sdfg.add_constant(cstname, cstval)

    # Symbols: collect all outer symbols, including those defined by
    # interstate-edge assignments.
    outer_symbols = {str(k): v for k, v in sdfg.symbols.items()}
    for ise in sdfg.edges():
        outer_symbols.update(ise.data.new_symbols(sdfg, outer_symbols))

    # Find original source/destination edges (there is only one edge per
    # connector, according to match)
    inputs: Dict[str, MultiConnectorEdge] = {}
    outputs: Dict[str, MultiConnectorEdge] = {}
    input_set: Dict[str, str] = {}
    output_set: Dict[str, str] = {}
    for e in outer_state.in_edges(nsdfg_node):
        inputs[e.dst_conn] = e
        input_set[e.data.data] = e.dst_conn
    for e in outer_state.out_edges(nsdfg_node):
        outputs[e.src_conn] = e
        output_set[e.data.data] = e.src_conn

    # Replace symbols using invocation symbol mapping
    # Two-step replacement (N -> __dacesym_N --> map[N]) to avoid clashes
    symbolic.safe_replace(nsdfg_node.symbol_mapping, nsdfg.replace_dict)

    # Access nodes that need to be reshaped
    # reshapes: Set(str) = set()
    # for aname, array in nsdfg.arrays.items():
    #     if array.transient:
    #         continue
    #     edge = None
    #     if aname in inputs:
    #         edge = inputs[aname]
    #         if len(array.shape) > len(edge.data.subset):
    #             reshapes.add(aname)
    #             continue
    #     if aname in outputs:
    #         edge = outputs[aname]
    #         if len(array.shape) > len(edge.data.subset):
    #             reshapes.add(aname)
    #             continue
    #     if edge is not None and not InlineMultistateSDFG._check_strides(
    #             array.strides, sdfg.arrays[edge.data.data].strides,
    #             edge.data, nsdfg_node):
    #         reshapes.add(aname)

    # Mapping from nested transient name to top-level name
    transients: Dict[str, str] = {}

    # All transients become transients of the parent (if data already
    # exists, find new name)
    for nstate in nsdfg.nodes():
        for node in nstate.nodes():
            if isinstance(node, nodes.AccessNode):
                datadesc = nsdfg.arrays[node.data]
                if node.data not in transients and datadesc.transient:
                    new_name = node.data
                    # Avoid clashes with outer arrays, symbols, constants.
                    if (new_name in sdfg.arrays
                            or new_name in outer_symbols
                            or new_name in sdfg.constants):
                        new_name = f'{nsdfg.label}_{node.data}'

                    name = sdfg.add_datadesc(new_name,
                                             datadesc,
                                             find_new_name=True)
                    transients[node.data] = name

        # All transients of edges between code nodes are also added to parent
        for edge in nstate.edges():
            if (isinstance(edge.src, nodes.CodeNode)
                    and isinstance(edge.dst, nodes.CodeNode)):
                if edge.data.data is not None:
                    datadesc = nsdfg.arrays[edge.data.data]
                    if (edge.data.data not in transients
                            and datadesc.transient):
                        new_name = edge.data.data
                        if (new_name in sdfg.arrays
                                or new_name in outer_symbols
                                or new_name in sdfg.constants):
                            new_name = f'{nsdfg.label}_{edge.data.data}'

                        name = sdfg.add_datadesc(new_name,
                                                 datadesc,
                                                 find_new_name=True)
                        transients[edge.data.data] = name

    #######################################################
    # Replace data on inlined SDFG nodes/edges

    # Replace data names with their top-level counterparts
    repldict = {}
    repldict.update(transients)
    repldict.update({
        k: v.data.data
        for k, v in itertools.chain(inputs.items(), outputs.items())
    })

    symbolic.safe_replace(repldict,
                          lambda m: replace_datadesc_names(nsdfg, m),
                          value_as_string=True)

    # Add views whenever reshapes are necessary
    # for dname in reshapes:
    #     desc = nsdfg.arrays[dname]
    #     # To avoid potential confusion, rename protected __return keyword
    #     if dname.startswith('__return'):
    #         newname = f'{nsdfg.name}_ret{dname[8:]}'
    #     else:
    #         newname = dname
    #     newname, _ = sdfg.add_view(newname,
    #                                desc.shape,
    #                                desc.dtype,
    #                                storage=desc.storage,
    #                                strides=desc.strides,
    #                                offset=desc.offset,
    #                                debuginfo=desc.debuginfo,
    #                                allow_conflicts=desc.allow_conflicts,
    #                                total_size=desc.total_size,
    #                                alignment=desc.alignment,
    #                                may_alias=desc.may_alias,
    #                                find_new_name=True)
    #     repldict[dname] = newname

    # Add extra access nodes for out/in view nodes
    # inv_reshapes = {repldict[r]: r for r in reshapes}
    # for nstate in nsdfg.nodes():
    #     for node in nstate.nodes():
    #         if isinstance(node,
    #                       nodes.AccessNode) and node.data in inv_reshapes:
    #             if nstate.in_degree(node) > 0 and nstate.out_degree(
    #                     node) > 0:
    #                 # Such a node has to be in the output set
    #                 edge = outputs[inv_reshapes[node.data]]
    #                 # Redirect outgoing edges through access node
    #                 out_edges = list(nstate.out_edges(node))
    #                 anode = nstate.add_access(edge.data.data)
    #                 vnode = nstate.add_access(node.data)
    #                 nstate.add_nedge(node, anode, edge.data)
    #                 nstate.add_nedge(anode, vnode, edge.data)
    #                 for e in out_edges:
    #                     nstate.remove_edge(e)
    #                     nstate.add_edge(vnode, e.src_conn, e.dst,
    #                                     e.dst_conn, e.data)

    # Make unique names for states
    statenames = set(s.label for s in sdfg.nodes())
    for nstate in nsdfg.nodes():
        if nstate.label in statenames:
            newname = data.find_new_name(nstate.label, statenames)
            statenames.add(newname)
            nstate.set_label(newname)

    #######################################################
    # Collect and modify interstate edges as necessary

    outer_assignments = set()
    for e in sdfg.edges():
        outer_assignments |= e.data.assignments.keys()

    inner_assignments = set()
    for e in nsdfg.edges():
        inner_assignments |= e.data.assignments.keys()

    # Rename nested assignments that collide with outer ones.
    assignments_to_replace = inner_assignments & outer_assignments
    sym_replacements: Dict[str, str] = {}
    allnames = set(outer_symbols.keys()) | set(sdfg.arrays.keys())
    for assign in assignments_to_replace:
        newname = data.find_new_name(assign, allnames)
        allnames.add(newname)
        sym_replacements[assign] = newname
    nsdfg.replace_dict(sym_replacements)

    #######################################################
    # Add nested SDFG states into top-level SDFG

    outer_start_state = sdfg.start_state

    sdfg.add_nodes_from(nsdfg.nodes())
    for ise in nsdfg.edges():
        sdfg.add_edge(ise.src, ise.dst, ise.data)

    #######################################################
    # Reconnect inlined SDFG

    source = nsdfg.start_state
    sinks = nsdfg.sink_nodes()

    # Reconnect state machine
    for e in sdfg.in_edges(outer_state):
        sdfg.add_edge(e.src, source, e.data)
    for e in sdfg.out_edges(outer_state):
        for sink in sinks:
            sdfg.add_edge(sink, e.dst, e.data)

    # Modify start state as necessary
    if outer_start_state is outer_state:
        sdfg.start_state = sdfg.node_id(source)

    # TODO: Modify memlets by offsetting
    # If both source and sink nodes are inputs/outputs, reconnect once
    # edges_to_ignore = self._modify_access_to_access(new_incoming_edges,
    #                                                 nsdfg, nstate, state,
    #                                                 orig_data)
    # source_to_outer = {n: e.src for n, e in new_incoming_edges.items()}
    # sink_to_outer = {n: e.dst for n, e in new_outgoing_edges.items()}
    # # If a source/sink node is one of the inputs/outputs, reconnect it,
    # # replacing memlets in outgoing/incoming paths
    # modified_edges = set()
    # modified_edges |= self._modify_memlet_path(new_incoming_edges, nstate,
    #                                            state, sink_to_outer, True,
    #                                            edges_to_ignore)
    # modified_edges |= self._modify_memlet_path(new_outgoing_edges, nstate,
    #                                            state, source_to_outer,
    #                                            False, edges_to_ignore)
    # # Reshape: add connections to viewed data
    # self._modify_reshape_data(reshapes, repldict, inputs, nstate, state,
    #                           True)
    # self._modify_reshape_data(reshapes, repldict, outputs, nstate, state,
    #                           False)

    # Modify all other internal edges pertaining to input/output nodes
    # for nstate in nsdfg.nodes():
    #     for node in nstate.nodes():
    #         if isinstance(node, nodes.AccessNode):
    #             if node.data in input_set or node.data in output_set:
    #                 if node.data in input_set:
    #                     outer_edge = inputs[input_set[node.data]]
    #                 else:
    #                     outer_edge = outputs[output_set[node.data]]
    #
    #                 for edge in state.all_edges(node):
    #                     if (edge not in modified_edges
    #                             and edge.data.data == node.data):
    #                         for e in state.memlet_tree(edge):
    #                             if e.data.data == node.data:
    #                                 e._data = helpers.unsqueeze_memlet(
    #                                     e.data, outer_edge.data)

    # Replace nested SDFG parents with new SDFG
    for nstate in nsdfg.nodes():
        nstate.parent = sdfg
        for node in nstate.nodes():
            if isinstance(node, nodes.NestedSDFG):
                node.sdfg.parent_sdfg = sdfg
                node.sdfg.parent_nsdfg_node = node

    #######################################################
    # Remove nested SDFG and state
    sdfg.remove_node(outer_state)

    return nsdfg.nodes()
def apply(self, sdfg: sd.SDFG):
    """Convert the matched for-loop (guard/body/exit states) into a map.

    Wraps the loop body state in a map scope over the loop's iteration
    range, removes the guard's loop edges and the iteration-variable
    assignments, and connects the body directly to the state after the
    loop. Loop-entry assignments, if any, are moved into a nested SDFG.

    :param sdfg: The SDFG containing the matched loop.
    """
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    body: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    itervar, (start, end, step), _ = find_for_loop(sdfg, guard, body)

    # NOTE: `step < 0` may be a symbolic relational; comparing with
    # `== True` only triggers when negativity is provably known, which is
    # why this is deliberately not a plain truthiness test.
    if (step < 0) == True:
        # If step is negative, we have to flip start and end to produce a
        # correct map with a positive increment
        start, end, step = end, start, -step

    # If necessary, make a nested SDFG with assignments
    isedge = sdfg.edges_between(guard, body)[0]
    symbols_to_remove = set()
    if len(isedge.data.assignments) > 0:
        nsdfg = helpers.nest_state_subgraph(
            sdfg, body, gr.SubgraphView(body, body.nodes()))
        for sym in isedge.data.free_symbols:
            if sym in nsdfg.symbol_mapping or sym in nsdfg.in_connectors:
                continue
            if sym in sdfg.symbols:
                # Forward the outer symbol into the nested SDFG unchanged.
                nsdfg.symbol_mapping[sym] = symbolic.pystr_to_symbolic(sym)
                nsdfg.sdfg.add_symbol(sym, sdfg.symbols[sym])
            elif sym in sdfg.arrays:
                if sym in nsdfg.sdfg.arrays:
                    raise NotImplementedError
                # A data container used in the assignments is fed into the
                # nested SDFG through a read node and an input connector.
                rnode = body.add_read(sym)
                nsdfg.add_in_connector(sym)
                desc = copy.deepcopy(sdfg.arrays[sym])
                desc.transient = False
                nsdfg.sdfg.add_datadesc(sym, desc)
                body.add_edge(rnode, None, nsdfg, sym, memlet.Memlet(sym))

        # Move the loop-entry assignments onto a fresh initialization
        # state inside the nested SDFG.
        nstate = nsdfg.sdfg.node(0)
        init_state = nsdfg.sdfg.add_state_before(nstate)
        nisedge = nsdfg.sdfg.edges_between(init_state, nstate)[0]
        nisedge.data.assignments = isedge.data.assignments
        symbols_to_remove = set(nisedge.data.assignments.keys())
        for k in nisedge.data.assignments.keys():
            if k in nsdfg.symbol_mapping:
                del nsdfg.symbol_mapping[k]
        isedge.data.assignments = {}

    source_nodes = body.source_nodes()
    sink_nodes = body.sink_nodes()

    # Locals renamed from `map`/`exit` to avoid shadowing the builtins.
    loop_map = nodes.Map(body.label + "_map", [itervar],
                         [(start, end, step)])
    map_entry = nodes.MapEntry(loop_map)
    map_exit = nodes.MapExit(loop_map)
    body.add_node(map_entry)
    body.add_node(map_exit)

    # If the map uses symbols from data containers, instantiate reads
    containers_to_read = map_entry.free_symbols & sdfg.arrays.keys()
    for rd in containers_to_read:
        # We are guaranteed that this is always a scalar, because
        # can_be_applied makes sure there are no sympy functions in each of
        # the loop expressions
        access_node = body.add_read(rd)
        body.add_memlet_path(access_node,
                             map_entry,
                             dst_conn=rd,
                             memlet=memlet.Memlet(rd))

    # Reroute all memlets through the entry and exit nodes
    for n in source_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.out_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(map_entry, e.dst, n, e.data,
                                   internal_connector=e.dst_conn)
        else:
            body.add_nedge(map_entry, n, memlet.Memlet())
    for n in sink_nodes:
        if isinstance(n, nodes.AccessNode):
            for e in body.in_edges(n):
                body.remove_edge(e)
                body.add_edge_pair(map_exit, e.src, n, e.data,
                                   internal_connector=e.src_conn)
        else:
            body.add_nedge(n, map_exit, memlet.Memlet())

    # Get rid of the loop exit condition edge
    after_edge = sdfg.edges_between(guard, after)[0]
    sdfg.remove_edge(after_edge)

    # Remove the assignment on the edge to the guard
    for e in sdfg.in_edges(guard):
        if itervar in e.data.assignments:
            del e.data.assignments[itervar]

    # Remove the condition on the entry edge
    condition_edge = sdfg.edges_between(guard, body)[0]
    condition_edge.data.condition = CodeBlock("1")

    # Get rid of backedge to guard
    sdfg.remove_edge(sdfg.edges_between(body, guard)[0])

    # Route body directly to after state, maintaining any other assignments
    # it might have had
    sdfg.add_edge(
        body, after,
        sd.InterstateEdge(assignments=after_edge.data.assignments))

    # If this had made the iteration variable a free symbol, we can remove
    # it from the SDFG symbols
    if itervar in sdfg.free_symbols:
        sdfg.remove_symbol(itervar)
    for sym in symbols_to_remove:
        if helpers.is_symbol_unused(sdfg, sym):
            sdfg.remove_symbol(sym)
def make_write_sdfg():
    """Build the writer SDFG of the filter: an explicit state-machine loop
    over ``i in [0, N + W)`` with stride ``W`` whose body pops one vector
    and one validity flag per iteration and, when the flag is set, stores
    the vector into ``B_mem`` at a running write index kept in an FPGA
    register.

    :return: The constructed ``SDFG``.
    """
    sdfg = SDFG("filter_write")

    # Loop state machine: begin -> entry -> body -> entry ... -> end.
    init_state = sdfg.add_state("loop_begin")
    guard_state = sdfg.add_state("loop_entry")
    body_state = sdfg.add_state("loop_body")
    end_state = sdfg.add_state("loop_end")

    # Zero the running write index before entering the loop.
    write_index_init = init_state.add_scalar(
        "i_write",
        dtype=dace.dtypes.uint32,
        transient=True,
        storage=StorageType.FPGA_Registers)
    zero_tasklet = init_state.add_tasklet("zero", {}, {"i_write_out"},
                                          "i_write_out = 0")
    init_state.add_memlet_path(zero_tasklet,
                               write_index_init,
                               src_conn="i_write_out",
                               memlet=Memlet.simple(write_index_init, "0"))

    # Interstate edges implementing the for-loop over i.
    sdfg.add_edge(init_state, guard_state,
                  dace.sdfg.InterstateEdge(assignments={"i": 0}))
    sdfg.add_edge(
        guard_state, body_state,
        dace.sdfg.InterstateEdge(
            condition=dace.properties.CodeProperty.from_string(
                "i < N + W", language=dace.dtypes.Language.Python)))
    sdfg.add_edge(
        guard_state, end_state,
        dace.sdfg.InterstateEdge(
            condition=dace.properties.CodeProperty.from_string(
                "i >= N + W", language=dace.dtypes.Language.Python)))
    sdfg.add_edge(body_state, guard_state,
                  dace.sdfg.InterstateEdge(assignments={"i": "i + W"}))

    # Body data: output array, input streams, and the write index register
    # (read and written under the same container name).
    output_array = body_state.add_array("B_mem", [N / W],
                                        dtype=vtype,
                                        storage=StorageType.FPGA_Global)
    data_stream = body_state.add_stream("_B_pipe",
                                        dtype=vtype,
                                        buffer_size=buffer_size,
                                        storage=StorageType.FPGA_Local)
    flag_stream = body_state.add_stream("_valid_pipe",
                                        dtype=dace.dtypes.bool,
                                        buffer_size=buffer_size,
                                        storage=StorageType.FPGA_Local)
    write_index_in = body_state.add_scalar(
        "i_write",
        dtype=dace.dtypes.uint32,
        transient=True,
        storage=StorageType.FPGA_Registers)
    write_index_out = body_state.add_scalar(
        "i_write",
        dtype=dace.dtypes.uint32,
        transient=True,
        storage=StorageType.FPGA_Registers)

    # Commit the element and bump the index only when the flag is set.
    write_tasklet = body_state.add_tasklet(
        "write", {"b_in", "valid_in", "i_write_in"},
        {"b_out", "i_write_out"}, "if valid_in:"
        "\n\tb_out[i_write_in] = b_in"
        "\n\ti_write_out = i_write_in + 1"
        "\nelse:"
        "\n\ti_write_out = i_write_in")

    body_state.add_memlet_path(data_stream,
                               write_tasklet,
                               dst_conn="b_in",
                               memlet=Memlet.simple(data_stream, "0"))
    body_state.add_memlet_path(flag_stream,
                               write_tasklet,
                               dst_conn="valid_in",
                               memlet=Memlet.simple(flag_stream, "0"))
    body_state.add_memlet_path(write_index_in,
                               write_tasklet,
                               dst_conn="i_write_in",
                               memlet=Memlet.simple(write_index_in, "0"))
    body_state.add_memlet_path(write_tasklet,
                               write_index_out,
                               src_conn="i_write_out",
                               memlet=Memlet.simple(write_index_out, "0"))
    body_state.add_memlet_path(write_tasklet,
                               output_array,
                               src_conn="b_out",
                               memlet=Memlet.simple(output_array, "0:N"))

    return sdfg
def parse_from_function(function, *compilation_args, strict=None):
    """ Try to parse a DaceProgram object and return the `dace.SDFG` object
        that corresponds to it.
        @param function: DaceProgram object (obtained from the `@dace.program`
                         decorator).
        @param compilation_args: Various compilation arguments e.g. types.
        @param strict: Whether to apply strict transformations or not (None
                       uses configuration-defined value).
        @return: The generated SDFG object.
    """
    if not isinstance(function, DaceProgram):
        raise TypeError(
            'Function must be of type dace.frontend.python.DaceProgram')

    # Obtain parsed DaCe program
    pdp, modules = function.generate_pdp(*compilation_args)

    # Create an empty SDFG
    sdfg = SDFG(pdp.name, pdp.argtypes)

    sdfg.set_sourcecode(pdp.source, 'python')

    # Populate SDFG with states and nodes, according to the parsed DaCe
    # program

    # 1) Inherit dependencies and inject tasklets
    # 2) Traverse program graph and recursively split into states,
    #    annotating edges with their transition conditions.
    # 3) Add arrays, streams, and scalars to the SDFG array store
    # 4) Eliminate empty states with no conditional outgoing transitions
    # 5) Label states in topological order
    # 6) Construct dataflow graph for each state

    # Step 1)
    for primitive in pdp.children:
        depanalysis.inherit_dependencies(primitive)

    # Step 2)
    state_primitives = depanalysis.create_states_simple(pdp, sdfg)

    # Step 3)
    for dataname, datadesc in pdp.all_arrays().items():
        sdfg.add_datadesc(dataname, datadesc)

    # Step 4) Absorb next state into current, if possible
    # Iterate over a snapshot of the topological order, since states are
    # removed from the SDFG while iterating.
    oldstates = list(sdfg.topological_sort(sdfg.start_state))
    for state in oldstates:
        if state not in sdfg.nodes():  # State already removed
            continue
        if sdfg.out_degree(state) == 1:
            edge = sdfg.out_edges(state)[0]
            nextState = edge.dst
            # Only fuse across unconditional transitions into states that
            # have no other predecessors and carry no primitives.
            if not edge.data.is_unconditional():
                continue
            if sdfg.in_degree(nextState) > 1:  # If other edges point to state
                continue
            if len(state_primitives[nextState]) > 0:  # Don't fuse full states
                continue

            outEdges = list(sdfg.out_edges(nextState))
            for e in outEdges:
                # Construct new edge from the current assignments, new
                # assignments, and new conditions
                newEdge = copy.deepcopy(edge.data)
                newEdge.assignments.update(e.data.assignments)
                newEdge.condition = e.data.condition
                sdfg.add_edge(state, e.dst, newEdge)
            sdfg.remove_node(nextState)

    # Step 5) Give unlabeled states a positional label ("s<i>").
    stateList = sdfg.topological_sort(sdfg.start_state)
    for i, state in enumerate(stateList):
        if state.label is None or state.label == "":
            state.set_label("s" + str(i))

    # Step 6)
    for i, state in enumerate(stateList):
        depanalysis.build_dataflow_graph(sdfg, state,
                                         state_primitives[state], modules)

    # Fill in scope entry/exit connectors
    sdfg.fill_scope_connectors()

    # Memlet propagation
    if sdfg.propagate:
        labeling.propagate_labels_sdfg(sdfg)

    # Drawing the SDFG before strict transformations
    sdfg.draw_to_file(recursive=True)

    # Apply strict transformations automatically
    if (strict == True
            or (strict is None
                and Config.get_bool('optimizer', 'automatic_state_fusion'))):
        sdfg.apply_strict_transformations()

    # Drawing the SDFG (again) to a .dot file
    sdfg.draw_to_file(recursive=True)

    # Validate SDFG
    sdfg.validate()

    return sdfg