def _initialize_return_values(self, kwargs):
    """ Create (or reuse) the numpy arrays that hold the SDFG's `__return*`
        outputs.

        On the first call, scans all non-transient `__return*` arrays in the
        SDFG, evaluates their symbolic shapes/strides with the current symbol
        values, and allocates matching numpy arrays. On subsequent calls with
        the same symbol values, either keeps the existing arrays or — when
        ``self._create_new_arrays`` was set — recreates them from the stored
        shape descriptors (fast path, no re-scan of the SDFG).

        :param kwargs: Call arguments; non-array entries are treated as
                       symbol values, and a caller-provided array named like
                       a return array overrides allocation.
    """
    # Obtain symbol values from arguments and constants
    syms = dict()
    syms.update(
        {k: v for k, v in kwargs.items() if k not in self.sdfg.arrays})
    syms.update(self.sdfg.constants)

    if self._initialized:
        if self._return_syms == syms:
            if not self._create_new_arrays:
                # Symbols unchanged and reuse requested: keep current arrays.
                return
            else:
                self._create_new_arrays = False
                # Use stored sizes to recreate arrays (fast path)
                if self._return_arrays is None:
                    return
                elif isinstance(self._return_arrays, tuple):
                    # Multiple return values: rebuild each from its stored
                    # descriptor, preferring caller-supplied arrays.
                    self._return_arrays = tuple(
                        kwargs[desc[0]] if desc[0] in kwargs else self.
                        _create_array(*desc)
                        for desc in self._retarray_shapes)
                    return
                else:  # Single array return value
                    desc = self._retarray_shapes[0]
                    arr = (kwargs[desc[0]]
                           if desc[0] in kwargs else self._create_array(*desc))
                    self._return_arrays = arr
                    return

    # Slow path: symbols changed (or first call) — re-scan the SDFG.
    self._return_syms = syms
    self._create_new_arrays = False

    # Initialize return values with numpy arrays
    self._retarray_shapes = []
    self._return_arrays = []
    for arrname, arr in sorted(self.sdfg.arrays.items()):
        if arrname.startswith('__return') and not arr.transient:
            if arrname in kwargs:
                # Caller provided the output buffer; store only the name so
                # the fast path knows to look it up again.
                self._return_arrays.append(kwargs[arrname])
                self._retarray_shapes.append((arrname, ))
                continue

            if isinstance(arr, dt.Stream):
                raise NotImplementedError('Return streams are unsupported')

            # Concretize symbolic shape/strides with the current symbols.
            shape = tuple(symbolic.evaluate(s, syms) for s in arr.shape)
            dtype = arr.dtype.as_numpy_dtype()
            total_size = int(symbolic.evaluate(arr.total_size, syms))
            strides = tuple(
                symbolic.evaluate(s, syms) * arr.dtype.bytes
                for s in arr.strides)
            shape_desc = (arrname, dtype, arr.storage, shape, strides,
                          total_size)
            self._retarray_shapes.append(shape_desc)

            # Create an array with the properties of the SDFG array
            arr = self._create_array(*shape_desc)
            self._return_arrays.append(arr)

    # Set up return_arrays field: None / single array / tuple of arrays.
    if len(self._return_arrays) == 0:
        self._return_arrays = None
    elif len(self._return_arrays) == 1:
        self._return_arrays = self._return_arrays[0]
    else:
        self._return_arrays = tuple(self._return_arrays)
def _initialize_return_values(self, kwargs):
    """ Allocate numpy arrays for the SDFG's `__return*` outputs and return
        them as a name-to-array dictionary.

        Results are cached: if the symbol values are unchanged since the last
        call, the previously created arrays are returned as-is.

        :param kwargs: Call arguments; non-array entries are treated as
                       symbol values, and a caller-provided array named like
                       a return array is used directly instead of allocating.
        :return: Dictionary mapping return-array names to numpy arrays.
    """
    # Obtain symbol values from arguments and constants
    syms = dict()
    syms.update(
        {k: v for k, v in kwargs.items() if k not in self.sdfg.arrays})
    syms.update(self.sdfg.constants)

    if self._initialized:
        if self._return_syms == syms:
            # Cache hit: same symbols, reuse existing arrays.
            return self._return_kwarrays

    self._return_syms = syms

    # Initialize return values with numpy arrays
    self._return_arrays = []
    self._return_kwarrays = {}
    for arrname, arr in sorted(self.sdfg.arrays.items()):
        if arrname.startswith('__return') and not arr.transient:
            if arrname in kwargs:
                # Caller provided the output buffer.
                self._return_arrays.append(kwargs[arrname])
                self._return_kwarrays[arrname] = kwargs[arrname]
                continue

            if isinstance(arr, dt.Stream):
                raise NotImplementedError('Return streams are unsupported')
            if arr.storage in [
                    dtypes.StorageType.GPU_Global,
                    dtypes.StorageType.FPGA_Global
            ]:
                # Only host-resident return values can be wrapped in numpy.
                raise NotImplementedError('Non-host return values are '
                                          'unsupported')

            # Create an array with the properties of the SDFG array
            # (shape/strides evaluated from symbols; backed by a zeroed
            # buffer of the total allocation size).
            self._return_arrays.append(
                np.ndarray([symbolic.evaluate(s, syms) for s in arr.shape],
                           arr.dtype.as_numpy_dtype(),
                           buffer=np.zeros(
                               [symbolic.evaluate(arr.total_size, syms)],
                               arr.dtype.as_numpy_dtype()),
                           strides=[
                               symbolic.evaluate(s, syms) * arr.dtype.bytes
                               for s in arr.strides
                           ]))
            self._return_kwarrays[arrname] = self._return_arrays[-1]

    # Set up return_arrays field: None / single array / tuple of arrays.
    if len(self._return_arrays) == 0:
        self._return_arrays = None
    elif len(self._return_arrays) == 1:
        self._return_arrays = self._return_arrays[0]
    else:
        self._return_arrays = tuple(self._return_arrays)

    return self._return_kwarrays
def generate_rtl_inputs_outputs(self, sdfg, tasklet):
    """ Construct the SystemVerilog port declaration strings for all input
        and output connectors of a tasklet.

        The input and output loops previously duplicated the vector/element
        index computation; that shared logic now lives in a local helper.

        :param sdfg: SDFG whose constants are used to evaluate symbolic
                     vector lengths and byte sizes of each connector.
        :param tasklet: Tasklet whose in/out connectors become module ports.
        :return: Tuple ``(inputs, outputs)`` of port declaration strings.
        :raises: Whatever ``check_issymbolic`` raises for connectors whose
                 veclen/bytes cannot be resolved at compile time.
    """
    def connector_index(connector):
        # Build the (optional) vector index plus per-element bit index for
        # one connector, e.g. "[3:0][31:0]" for a 4-wide 32-bit vector.
        # catch symbolic (compile time variables)
        check_issymbolic([connector.veclen, connector.bytes], sdfg)
        # extract parameters
        vec_len = int(symbolic.evaluate(connector.veclen, sdfg.constants))
        total_size = int(symbolic.evaluate(connector.bytes, sdfg.constants))
        idx_str = ""
        # generate vector representation
        if vec_len > 1:
            idx_str = "[{}:0]".format(vec_len - 1)
        # add element index (bits per element, zero-based upper bound)
        idx_str += "[{}:0]".format(int(total_size / vec_len) * 8 - 1)
        return idx_str

    # construct input / output module header
    inputs = list()
    for inp in tasklet.in_connectors:
        idx_str = connector_index(tasklet.in_connectors[inp])
        # generate padded string and add to list
        inputs.append(", input{padding}{idx_str} {name}".format(
            padding=" " * (17 - len(idx_str)), idx_str=idx_str, name=inp))

    outputs = list()
    for outp in tasklet.out_connectors:
        idx_str = connector_index(tasklet.out_connectors[outp])
        # generate padded string and add to list
        # (padding of 12 vs. 17 aligns "output reg" with "input")
        outputs.append(", output reg{padding}{idx_str} {name}".format(
            padding=" " * (12 - len(idx_str)), idx_str=idx_str, name=outp))

    return inputs, outputs
def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
    """ The transformation applies only to top-level maps whose entire
        range (begin, end, and step of every dimension) evaluates to
        compile-time constants. """
    map_entry = graph.nodes()[candidate[MapUnroll._map_entry]]

    # Reject maps nested inside another scope
    if graph.scope_dict()[map_entry] is not None:
        return False

    # Every range component must be evaluable with the SDFG's constants;
    # symbolic.evaluate raises TypeError for free symbols.
    try:
        for dim_range in map_entry.map.range:
            for expr in dim_range:
                symbolic.evaluate(expr, sdfg.constants)
    except TypeError:
        return False

    return True
def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: graph.MultiConnectorEdge,
                function_stream: prettycode.CodeIOStream,
                callsite_stream: prettycode.CodeIOStream):
    """ Generate input/output memory copies from the array references to
        local variables (i.e. for the tasklet code).

        Fix: in the scalar-accessor branch, a data container that was
        neither an Array nor a Scalar (e.g. a Stream) previously left
        ``line`` unbound and caused a ``NameError`` at the final write;
        this now raises a descriptive RuntimeError instead.
    """
    if isinstance(edge.src, nodes.AccessNode) and isinstance(
            edge.dst, nodes.Tasklet):  # handle AccessNode->Tasklet
        if isinstance(dst_node.in_connectors[edge.dst_conn],
                      dtypes.pointer):  # pointer accessor
            line: str = "{} {} = &{}[0];".format(
                dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                edge.src.data)
        elif isinstance(dst_node.in_connectors[edge.dst_conn],
                        dtypes.vector):  # vector accessor
            line: str = "{} {} = *({} *)(&{}[0]);".format(
                dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                dst_node.in_connectors[edge.dst_conn].ctype, edge.src.data)
        else:  # scalar accessor
            arr = sdfg.arrays[edge.data.data]
            if isinstance(arr, data.Array):
                line: str = "{}* {} = &{}[0];".format(
                    dst_node.in_connectors[edge.dst_conn].ctype,
                    edge.dst_conn, edge.src.data)
            elif isinstance(arr, data.Scalar):
                line: str = "{} {} = {};".format(
                    dst_node.in_connectors[edge.dst_conn].ctype,
                    edge.dst_conn, edge.src.data)
            else:
                # Fail early with a clear message instead of the NameError
                # the unbound `line` would otherwise produce below.
                raise RuntimeError(
                    "Not handling copy_memory case of data type {}.".format(
                        type(arr)))
    elif isinstance(edge.src, nodes.MapEntry) and isinstance(
            edge.dst, nodes.Tasklet):
        # Unrolled-map input: record unroll factor and index into the
        # flattened buffer by map parameter and per-iteration volume.
        rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
        self.n_unrolled[rtl_name] = symbolic.evaluate(
            edge.src.map.range[0][1] + 1, sdfg.constants)
        line: str = f'{dst_node.in_connectors[edge.dst_conn]} {edge.dst_conn} = &{edge.data.data}[{edge.src.map.params[0]}*{edge.data.volume}];'
    else:
        raise RuntimeError(
            "Not handling copy_memory case of type {} -> {}.".format(
                type(edge.src), type(edge.dst)))

    # write accessor to file
    callsite_stream.write(line)
def move_small_arrays_to_stack(sdfg: SDFG) -> None:
    """ Set all Default storage types that are constant sized and less than
        the auto-tile size to the stack (as StorageType.Register).
        :param sdfg: The SDFG to operate on.
        :note: Operates in-place on the SDFG.
    """
    size_limit = config.Config.get('optimizer', 'autotile_size')
    num_converted = 0
    for subsdfg, _, desc in sdfg.arrays_recursive():
        # Streams are never moved
        if isinstance(desc, dt.Stream):
            continue
        # Only scope-lifetime transients with default storage qualify
        eligible = (desc.transient
                    and desc.storage == dtypes.StorageType.Default
                    and desc.lifetime == dtypes.AllocationLifetime.Scope)
        if not eligible:
            continue
        # Size must be a compile-time constant
        if symbolic.issymbolic(desc.total_size, subsdfg.constants):
            continue
        if symbolic.evaluate(desc.total_size, subsdfg.constants) <= size_limit:
            desc.storage = dtypes.StorageType.Register
            num_converted += 1

    if config.Config.get_bool('debugprint') and num_converted > 0:
        print(f'Statically allocating {num_converted} transient arrays')
def apply(self, sdfg):
    """ Unroll a top-level, constant-range map by replicating its entire
        weakly connected component once per index tuple, then deleting the
        original subgraph.

        Nested SDFGs are serialized to JSON once and deserialized per copy
        (cheaper than deepcopy). Local transient memories used only inside
        the component are replicated per copy with an index suffix.
    """
    from dace.transformation.dataflow import TrivialMapElimination
    state = sdfg.nodes()[self.state_id]
    map_entry = state.nodes()[self.subgraph[MapUnroll._map_entry]]
    map_exit = state.exit_node(map_entry)

    # Collect all nodes in this weakly connected component
    subgraph = sdutil.weakly_connected_component(state, map_entry)

    # Save nested SDFGs to JSON, then deserialize them for every copy we
    # need to make
    nested_sdfgs = {}
    for node in subgraph:
        if isinstance(node, nodes.NestedSDFG):
            nested_sdfgs[node.sdfg] = node.sdfg.to_json()

    # Check for local memories that need to be replicated
    local_memories = [
        name for name in sdutil.local_transients(
            sdfg, subgraph, entry_node=map_entry, include_nested=True)
        if not isinstance(sdfg.arrays[name], dt.Stream)
        and not isinstance(sdfg.arrays[name], dt.View)
    ]

    params = map_entry.map.params
    ranges = map_entry.map.range.ranges
    # Evaluate all ranges to concrete Python ranges (can_be_applied
    # guarantees they are constant).
    constant_ranges = []
    for r in ranges:
        begin = symbolic.evaluate(r[0], sdfg.constants)
        end = symbolic.evaluate(r[1], sdfg.constants)
        step = symbolic.evaluate(r[2], sdfg.constants)
        end += step  # Make non-inclusive
        constant_ranges.append(range(begin, end, step))
    index_tuples = itertools.product(*constant_ranges)
    for t in index_tuples:
        suffix = "_" + "_".join(map(str, t))
        node_to_unrolled = {}
        # Copy all nodes
        for node in subgraph:
            if isinstance(node, nodes.NestedSDFG):
                # Avoid deep-copying the nested SDFG
                nsdfg = node.sdfg
                # Don't copy the nested SDFG, as we will do this separately
                node.sdfg = None
                unrolled_node = copy.deepcopy(node)
                node.sdfg = nsdfg
                # Deserialize into a new SDFG specific to this copy
                nsdfg_json = nested_sdfgs[nsdfg]
                name = nsdfg_json["attributes"]["name"]
                nsdfg_json["attributes"]["name"] += suffix
                unrolled_nsdfg = SDFG.from_json(nsdfg_json)
                nsdfg_json["attributes"]["name"] = name  # Reinstate
                # Set all the references
                unrolled_nsdfg.parent = state
                unrolled_nsdfg.parent_sdfg = sdfg
                unrolled_nsdfg.update_sdfg_list([])
                unrolled_node.sdfg = unrolled_nsdfg
                unrolled_nsdfg.parent_nsdfg_node = unrolled_node
            else:
                unrolled_node = copy.deepcopy(node)
                if node == map_entry:
                    # Fix the map bounds to only this iteration
                    unrolled_node.map.range = [(i, i, 1) for i in t]
                if (isinstance(node, nodes.AccessNode)
                        and node.data in local_memories):
                    # If this is a local memory only used in this subgraph,
                    # we need to replicate it for each new subgraph
                    unrolled_name = node.data + suffix
                    if unrolled_name not in sdfg.arrays:
                        unrolled_desc = copy.deepcopy(
                            sdfg.arrays[node.data])
                        sdfg.add_datadesc(unrolled_name, unrolled_desc)
                    unrolled_node.data = unrolled_name
            state.add_node(unrolled_node)
            node_to_unrolled[node] = unrolled_node  # Remember mapping
        # Copy all edges
        for src, src_conn, dst, dst_conn, memlet in subgraph.edges():
            src = node_to_unrolled[src]
            dst = node_to_unrolled[dst]
            memlet = copy.deepcopy(memlet)
            if memlet.data in local_memories:
                memlet.data = memlet.data + suffix
            state.add_edge(src, src_conn, dst, dst_conn, memlet)
        # Eliminate the now trivial map
        TrivialMapElimination.apply_to(
            sdfg,
            verify=False,
            annotate=False,
            save=False,
            _map_entry=node_to_unrolled[map_entry])

    # Now we can delete the original subgraph. This implicitly also remove
    # memlets between nodes
    state.remove_nodes_from(subgraph)

    # If we added a bunch of new nested SDFGs, reset the internal list
    if len(nested_sdfgs) > 0:
        sdfg.reset_sdfg_list()

    # Remove local memories that were replicated
    for mem in local_memories:
        sdfg.remove_data(mem)
def apply(self, sdfg):
    """ Apply stencil tiling to the subgraph: infer per-parameter outer
        ranges for all outermost maps, detect invariant (non-stencil)
        dimensions, strip-mine each map with appropriately enlarged tiles,
        and optionally unroll the resulting inner maps.

        Fix: the consistency check for per-array variable mappings tested
        ``e.data`` (a Memlet object) for membership in a dict keyed by data
        names, so the assertion branch was unreachable; it now tests
        ``e.data.data``, matching the assignment key.
    """
    graph = sdfg.nodes()[self.state_id]
    subgraph = self.subgraph_view(sdfg)
    map_entries = helpers.get_outermost_scope_maps(sdfg, graph, subgraph)

    result = StencilTiling.topology(sdfg, graph, map_entries)
    (children_dict, parent_dict, sink_maps) = result

    # next up, calculate inferred ranges for each map
    # for each map entry, this contains a tuple of dicts:
    # each of those maps from data_name of the array to
    # inferred outer ranges. An inferred outer range is created
    # by taking the union of ranges of inner subsets corresponding
    # to that data and substituting this subset by the min / max of the
    # parametrized map boundaries
    # finally, from these outer ranges we can easily calculate
    # strides and tile sizes required for every map
    inferred_ranges = defaultdict(dict)

    # create array of reverse topologically sorted map entries
    # to iterate over
    topo_reversed = []
    queue = set(sink_maps.copy())
    while len(queue) > 0:
        # pick any map all of whose children are already placed
        element = next(e for e in queue
                       if not children_dict[e] - set(topo_reversed))
        topo_reversed.append(element)
        queue.remove(element)
        for parent in parent_dict[element]:
            queue.add(parent)

    # main loop
    # first get coverage dicts for each map entry
    # for each map, contains a tuple of two dicts
    # each of those two maps from data name to outer range
    coverage = {}
    for map_entry in map_entries:
        coverage[map_entry] = StencilTiling.coverage_dicts(
            sdfg, graph, map_entry, outer_range=True)

    # we have a mapping from data name to outer range
    # however we want a mapping from map parameters to outer ranges
    # for this we need to find out how all array dimensions map to
    # outer ranges
    variable_mapping = defaultdict(list)
    for map_entry in topo_reversed:
        map = map_entry.map

        # first find out variable mapping
        for e in itertools.chain(
                graph.out_edges(map_entry),
                graph.in_edges(graph.exit_node(map_entry))):
            mapping = []
            for dim in e.data.subset:
                syms = set()
                for d in dim:
                    syms |= symbolic.symlist(d).keys()
                if len(syms) > 1:
                    raise NotImplementedError(
                        "One incoming or outgoing stencil subset is indexed "
                        "by multiple map parameters. "
                        "This is not supported yet.")
                try:
                    mapping.append(syms.pop())
                except KeyError:
                    # just append None if there is no map symbol in it.
                    # we don't care for now.
                    mapping.append(None)

            # FIX: key by data name (e.data.data), matching the assignment
            # below; checking the Memlet object itself always missed.
            if e.data.data in variable_mapping:
                # assert that this is the same everywhere.
                # else we might run into problems
                assert variable_mapping[e.data.data] == mapping
            else:
                variable_mapping[e.data.data] = mapping

        # now do mapping data name -> outer range
        # and from that infer mapping variable -> outer range
        local_ranges = {dn: None for dn in coverage[map_entry][1].keys()}
        for data_name, cov in coverage[map_entry][1].items():
            local_ranges[data_name] = subsets.union(local_ranges[data_name],
                                                    cov)
            # now look at proceeding maps
            # and union those subsets -> could be larger with stencil indent
            for child_map in children_dict[map_entry]:
                if data_name in coverage[child_map][0]:
                    local_ranges[data_name] = subsets.union(
                        local_ranges[data_name],
                        coverage[child_map][0][data_name])

        # final assignent: combine local_ranges and variable_mapping
        # together into inferred_ranges
        inferred_ranges[map_entry] = {p: None for p in map.params}
        for data_name, ranges in local_ranges.items():
            for param, r in zip(variable_mapping[data_name], ranges):
                # create new range from this subset and assign
                rng = subsets.Range((r, ))
                if param:
                    inferred_ranges[map_entry][param] = subsets.union(
                        inferred_ranges[map_entry][param], rng)

    # get parameters -- should all be the same
    params = next(iter(map_entries)).map.params.copy()
    # define reference range as inferred range of one of the sink maps
    self.reference_range = inferred_ranges[next(iter(sink_maps))]
    if self.debug:
        print("StencilTiling::Reference Range", self.reference_range)
    # next up, search for the ranges that don't change
    invariant_dims = []
    for idx, p in enumerate(params):
        different = False
        if self.reference_range[p] is None:
            invariant_dims.append(idx)
            warnings.warn(
                f"StencilTiling::No Stencil pattern detected for parameter {p}"
            )
            continue
        for m in map_entries:
            if inferred_ranges[m][p] != self.reference_range[p]:
                different = True
                break
        if not different:
            invariant_dims.append(idx)
            warnings.warn(
                f"StencilTiling::No Stencil pattern detected for parameter {p}"
            )

    # during stripmining, we will create new outer map entries
    # for easy access
    self._outer_entries = set()
    # with inferred_ranges constructed, we can begin to strip mine
    for map_entry in map_entries:
        # Retrieve map entry and exit nodes.
        map = map_entry.map

        stripmine_subgraph = {
            StripMining._map_entry: graph.nodes().index(map_entry)
        }

        sdfg_id = sdfg.sdfg_id
        last_map_entry = None
        original_schedule = map_entry.schedule
        self.tile_sizes = []
        self.tile_offset_lower = []
        self.tile_offset_upper = []

        # strip mining each dimension where necessary
        removed_maps = 0
        for dim_idx, param in enumerate(map_entry.map.params):
            # get current_node tile size
            if dim_idx >= len(self.strides):
                tile_stride = symbolic.pystr_to_symbolic(self.strides[-1])
            else:
                tile_stride = symbolic.pystr_to_symbolic(
                    self.strides[dim_idx])

            trivial = False

            if dim_idx in invariant_dims:
                # no stencil pattern: tile without any overlap
                self.tile_sizes.append(tile_stride)
                self.tile_offset_lower.append(0)
                self.tile_offset_upper.append(0)
            else:
                # enlarge tile by the offset between this map's range and
                # the reference range so overlapping halos are covered
                target_range_current = inferred_ranges[map_entry][param]
                reference_range_current = self.reference_range[param]

                min_diff = symbolic.SymExpr(reference_range_current.min_element()[0] \
                            - target_range_current.min_element()[0])
                max_diff = symbolic.SymExpr(target_range_current.max_element()[0] \
                            - reference_range_current.max_element()[0])

                try:
                    min_diff = symbolic.evaluate(min_diff, {})
                    max_diff = symbolic.evaluate(max_diff, {})
                except TypeError:
                    raise RuntimeError("Symbolic evaluation of map "
                                       "ranges failed. Please check "
                                       "your parameters and match.")

                self.tile_sizes.append(tile_stride + max_diff + min_diff)
                self.tile_offset_lower.append(
                    symbolic.pystr_to_symbolic(str(min_diff)))
                self.tile_offset_upper.append(
                    symbolic.pystr_to_symbolic(str(max_diff)))

            # get calculated parameters
            tile_size = self.tile_sizes[-1]
            dim_idx -= removed_maps
            # If map or tile sizes are trivial, skip strip-mining map dimension
            # special cases:
            # if tile size is trivial AND we have an invariant dimension, skip
            if tile_size == map.range.size()[dim_idx] and (
                    dim_idx + removed_maps) in invariant_dims:
                continue

            # trivial map: we just continue
            if map.range.size()[dim_idx] in [0, 1]:
                continue

            if tile_size == 1 and tile_stride == 1 and (
                    dim_idx + removed_maps) in invariant_dims:
                trivial = True
                removed_maps += 1

            # indent all map ranges accordingly and then perform
            # strip mining on these. Offset inner maps accordingly afterwards
            range_tuple = (map.range[dim_idx][0] +
                           self.tile_offset_lower[-1],
                           map.range[dim_idx][1] -
                           self.tile_offset_upper[-1],
                           map.range[dim_idx][2])
            map.range[dim_idx] = range_tuple
            stripmine = StripMining(sdfg_id, self.state_id,
                                    stripmine_subgraph, 0)

            stripmine.tiling_type = 'ceilrange'
            stripmine.dim_idx = dim_idx
            stripmine.new_dim_prefix = self.prefix if not trivial else ''
            # use tile_stride for both -- we will extend
            # the inner tiles later
            stripmine.tile_size = str(tile_stride)
            stripmine.tile_stride = str(tile_stride)
            outer_map = stripmine.apply(sdfg)
            outer_map.schedule = original_schedule

            # apply to the new map the schedule of the original one
            map_entry.schedule = self.schedule

            # if tile stride is 1, we can make a nice simplification by just
            # taking the overapproximated inner range as inner range
            # this eliminates the min/max in the range which
            # enables loop unrolling
            if tile_stride == 1:
                map_entry.range[dim_idx] = tuple(
                    symbolic.SymExpr(el._approx_expr) if isinstance(
                        el, symbolic.SymExpr) else el
                    for el in map_entry.range[dim_idx])

            # in map_entry: enlarge tiles by upper and lower offset
            # doing it this way and not via stripmine strides ensures
            # that the max gets changed as well
            old_range = map_entry.range[dim_idx]
            map_entry.range[dim_idx] = ((old_range[0] -
                                         self.tile_offset_lower[-1]),
                                        (old_range[1] +
                                         self.tile_offset_upper[-1]),
                                        old_range[2])

            # We have to propagate here for correct outer volume and subset sizes
            _propagate_node(graph, map_entry)
            _propagate_node(graph, graph.exit_node(map_entry))

            # usual tiling pipeline
            if last_map_entry:
                new_map_entry = graph.in_edges(map_entry)[0].src
                mapcollapse_subgraph = {
                    MapCollapse._outer_map_entry:
                    graph.node_id(last_map_entry),
                    MapCollapse._inner_map_entry:
                    graph.node_id(new_map_entry)
                }
                mapcollapse = MapCollapse(sdfg_id, self.state_id,
                                          mapcollapse_subgraph, 0)
                mapcollapse.apply(sdfg)
            last_map_entry = graph.in_edges(map_entry)[0].src
        # add last instance of map entries to _outer_entries
        if last_map_entry:
            self._outer_entries.add(last_map_entry)

        # Map Unroll Feature: only unroll if conditions are met:
        # Only unroll if at least one of the inner map ranges is strictly larger than 1
        # Only unroll if strides all are one
        if self.unroll_loops and all(s == 1 for s in self.strides) and any(
                s not in [0, 1] for s in map_entry.range.size()):
            l = len(map_entry.params)
            if l > 1:
                subgraph = {
                    MapExpansion.map_entry: graph.nodes().index(map_entry)
                }
                trafo_expansion = MapExpansion(sdfg.sdfg_id,
                                               sdfg.nodes().index(graph),
                                               subgraph, 0)
                trafo_expansion.apply(sdfg)
            maps = [map_entry]
            for _ in range(l - 1):
                map_entry = graph.out_edges(map_entry)[0].dst
                maps.append(map_entry)

            for map in reversed(maps):
                # MapToForLoop
                subgraph = {
                    MapToForLoop._map_entry: graph.nodes().index(map)
                }
                trafo_for_loop = MapToForLoop(sdfg.sdfg_id,
                                              sdfg.nodes().index(graph),
                                              subgraph, 0)
                trafo_for_loop.apply(sdfg)
                nsdfg = trafo_for_loop.nsdfg

                # LoopUnroll
                guard = trafo_for_loop.guard
                end = trafo_for_loop.after_state
                begin = next(e.dst for e in nsdfg.out_edges(guard)
                             if e.dst != end)

                subgraph = {
                    DetectLoop._loop_guard: nsdfg.nodes().index(guard),
                    DetectLoop._loop_begin: nsdfg.nodes().index(begin),
                    DetectLoop._exit_state: nsdfg.nodes().index(end)
                }
                transformation = LoopUnroll(0, 0, subgraph, 0)
                transformation.apply(nsdfg)
        elif self.unroll_loops:
            warnings.warn("Did not unroll loops. Either all ranges are "
                          "equal to one or range difference is symbolic.")

    self._outer_entries = list(self._outer_entries)
def unparse_tasklet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                    state_id: int, node: nodes.Node,
                    function_stream: prettycode.CodeIOStream,
                    callsite_stream: prettycode.CodeIOStream):
    """ Emit the code objects for an RTL tasklet: the SystemVerilog module
        itself, plus either the Xilinx hardware wrapper/scripts (hardware
        target) or the Verilator C++ simulation harness (software target).
    """
    # extract data
    state = sdfg.nodes()[state_id]
    tasklet = node

    # construct variables paths
    unique_name: str = "{}_{}_{}_{}".format(tasklet.name, sdfg.sdfg_id,
                                            sdfg.node_id(state),
                                            state.node_id(tasklet))

    # Collect all of the input and output connectors into buses and scalars
    # buses: conn name -> (is_output, total bytes, vector length)
    # scalars: conn name -> (is_output, total bits)
    buses = {}
    scalars = {}
    for edge in state.in_edges(tasklet):
        arr = sdfg.arrays[edge.src.data]

        # catch symbolic (compile time variables)
        check_issymbolic([
            tasklet.in_connectors[edge.dst_conn].veclen,
            tasklet.in_connectors[edge.dst_conn].bytes
        ], sdfg)

        # extract parameters
        vec_len = int(
            symbolic.evaluate(tasklet.in_connectors[edge.dst_conn].veclen,
                              sdfg.constants))
        total_size = int(
            symbolic.evaluate(tasklet.in_connectors[edge.dst_conn].bytes,
                              sdfg.constants))
        if isinstance(arr, data.Array):
            if self.hardware_target:
                raise NotImplementedError(
                    'Array input for hardware* not implemented')
            else:
                buses[edge.dst_conn] = (False, total_size, vec_len)
        elif isinstance(arr, data.Stream):
            buses[edge.dst_conn] = (False, total_size, vec_len)
        elif isinstance(arr, data.Scalar):
            scalars[edge.dst_conn] = (False, total_size * 8)

    for edge in state.out_edges(tasklet):
        arr = sdfg.arrays[edge.dst.data]

        # catch symbolic (compile time variables)
        check_issymbolic([
            tasklet.out_connectors[edge.src_conn].veclen,
            tasklet.out_connectors[edge.src_conn].bytes
        ], sdfg)

        # extract parameters
        vec_len = int(
            symbolic.evaluate(tasklet.out_connectors[edge.src_conn].veclen,
                              sdfg.constants))
        total_size = int(
            symbolic.evaluate(tasklet.out_connectors[edge.src_conn].bytes,
                              sdfg.constants))
        if isinstance(arr, data.Array):
            if self.hardware_target:
                raise NotImplementedError(
                    'Array input for hardware* not implemented')
            else:
                buses[edge.src_conn] = (True, total_size, vec_len)
        elif isinstance(arr, data.Stream):
            buses[edge.src_conn] = (True, total_size, vec_len)
        elif isinstance(arr, data.Scalar):
            # NOTE(review): scalar outputs are only reported, not emitted —
            # presumably unsupported; confirm before relying on them.
            print('Scalar output not implemented')

    # generate system verilog module components
    parameter_string: str = self.generate_rtl_parameters(sdfg.constants)
    inputs, outputs = self.generate_rtl_inputs_outputs(buses, scalars)

    # create rtl code object (that is later written to file)
    self.code_objects.append(
        codeobject.CodeObject(
            name="{}".format(unique_name),
            code=RTLCodeGen.RTL_HEADER.format(name=unique_name,
                                              parameters=parameter_string,
                                              inputs="\n".join(inputs),
                                              outputs="\n".join(outputs)) +
            tasklet.code.code + RTLCodeGen.RTL_FOOTER,
            language="sv",
            target=RTLCodeGen,
            title="rtl",
            target_type="{}".format(unique_name),
            additional_compiler_kwargs="",
            linkable=True,
            environments=None))

    if self.hardware_target:
        if self.vendor == 'xilinx':
            # Configuration consumed by the rtllib_* template generators
            rtllib_config = {
                "name": unique_name,
                "buses": {
                    name: ('m_axis' if is_output else 's_axis', vec_len)
                    for name, (is_output, _, vec_len) in buses.items()
                },
                "params": {
                    "scalars": {
                        name: total_size
                        for name, (_, total_size) in scalars.items()
                    },
                    "memory": {}
                },
                "ip_cores": tasklet.ip_cores if isinstance(
                    tasklet, nodes.RTLTasklet) else {},
            }
            # Control, top-level wrapper, packaging and synthesis artifacts
            self.code_objects.append(
                codeobject.CodeObject(name=f"{unique_name}_control",
                                      code=rtllib_control(rtllib_config),
                                      language="v",
                                      target=RTLCodeGen,
                                      title="rtl",
                                      target_type="{}".format(unique_name),
                                      additional_compiler_kwargs="",
                                      linkable=True,
                                      environments=None))
            self.code_objects.append(
                codeobject.CodeObject(name=f"{unique_name}_top",
                                      code=rtllib_top(rtllib_config),
                                      language="v",
                                      target=RTLCodeGen,
                                      title="rtl",
                                      target_type="{}".format(unique_name),
                                      additional_compiler_kwargs="",
                                      linkable=True,
                                      environments=None))
            self.code_objects.append(
                codeobject.CodeObject(name=f"{unique_name}_package",
                                      code=rtllib_package(rtllib_config),
                                      language="tcl",
                                      target=RTLCodeGen,
                                      title="rtl",
                                      target_type="scripts",
                                      additional_compiler_kwargs="",
                                      linkable=True,
                                      environments=None))
            self.code_objects.append(
                codeobject.CodeObject(name=f"{unique_name}_synth",
                                      code=rtllib_synth(rtllib_config),
                                      language="tcl",
                                      target=RTLCodeGen,
                                      title="rtl",
                                      target_type="scripts",
                                      additional_compiler_kwargs="",
                                      linkable=True,
                                      environments=None))
        else:  # self.vendor != "xilinx"
            raise NotImplementedError(
                'Only RTL codegen for Xilinx is implemented')
    else:  # not hardware_target
        # generate verilator simulation cpp code components
        inputs, outputs = self.generate_cpp_inputs_outputs(tasklet)
        valid_zeros, ready_zeros = self.generate_cpp_zero_inits(tasklet)
        vector_init = self.generate_cpp_vector_init(tasklet)
        num_elements = self.generate_cpp_num_elements(tasklet)
        internal_state_str, internal_state_var = self.generate_cpp_internal_state(
            tasklet)
        read_input_hs = self.generate_input_hs(tasklet)
        feed_elements = self.generate_feeding(tasklet, inputs)
        in_ptrs, out_ptrs = self.generate_ptrs(tasklet)
        export_elements = self.generate_exporting(tasklet, outputs)
        write_output_hs = self.generate_write_output_hs(tasklet)
        hs_flags = self.generate_hs_flags(tasklet)
        input_hs_toggle = self.generate_input_hs_toggle(tasklet)
        output_hs_toggle = self.generate_output_hs_toggle(tasklet)
        running_condition = self.generate_running_condition(tasklet)

        # add header code to stream (generic part only once per SDFG)
        if not self.cpp_general_header_added:
            sdfg.append_global_code(
                cpp_code=RTLCodeGen.CPP_GENERAL_HEADER_TEMPLATE.format(
                    debug_include="// generic includes\n#include <iostream>"
                    if self.verilator_debug else ""))
            self.cpp_general_header_added = True
        sdfg.append_global_code(
            cpp_code=RTLCodeGen.CPP_MODEL_HEADER_TEMPLATE.format(
                name=unique_name))

        # add main cpp code to stream
        callsite_stream.write(contents=RTLCodeGen.CPP_MAIN_TEMPLATE.format(
            name=unique_name,
            inputs=inputs,
            outputs=outputs,
            num_elements=str.join('\n', num_elements),
            vector_init=vector_init,
            valid_zeros=str.join('\n', valid_zeros),
            ready_zeros=str.join('\n', ready_zeros),
            read_input_hs=str.join('\n', read_input_hs),
            feed_elements=str.join('\n', feed_elements),
            in_ptrs=str.join('\n', in_ptrs),
            out_ptrs=str.join('\n', out_ptrs),
            export_elements=str.join('\n', export_elements),
            write_output_hs=str.join('\n', write_output_hs),
            hs_flags=str.join('\n', hs_flags),
            input_hs_toggle=str.join('\n', input_hs_toggle),
            output_hs_toggle=str.join('\n', output_hs_toggle),
            running_condition=str.join(' && ', running_condition),
            internal_state_str=internal_state_str,
            internal_state_var=internal_state_var,
            debug_sim_start="std::cout << \"SIM {name} START\" << std::endl;"
            if self.verilator_debug else "",
            debug_internal_state="""
// report internal state
VL_PRINTF("[t=%lu] ap_aclk=%u ap_areset=%u valid_i=%u ready_i=%u valid_o=%u ready_o=%u \\n", main_time, model->ap_aclk, model->ap_areset, model->valid_i, model->ready_i, model->valid_o, model->ready_o);
VL_PRINTF("{internal_state_str}\\n", {internal_state_var});
std::cout << std::flush;
""".format(internal_state_str=internal_state_str,
           internal_state_var=internal_state_var)
            if self.verilator_debug else "",
            debug_sim_end="std::cout << \"SIM {name} END\" << std::endl;"
            if self.verilator_debug else ""),
                              sdfg=sdfg,
                              state_id=state_id,
                              node_id=node)
def apply(self, sdfg):
    """ Fully unroll a detected state-machine loop: instantiate the loop
        body once per concrete iteration value, chain the copies with
        unconditional edges, and remove the original guard and body states.
    """
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride
    guard_inedges = sdfg.in_edges(guard)
    condition_edge = sdfg.edges_between(guard, begin)[0]
    itervar = list(guard_inedges[0].data.assignments.keys())[0]
    condition = condition_edge.data.condition_sympy()
    rng = LoopUnroll._loop_range(itervar, guard_inedges, condition)

    # Loop must be unrollable
    if self.count == 0 and any(
            symbolic.issymbolic(r, sdfg.constants) for r in rng):
        raise ValueError('Loop cannot be fully unrolled, size is symbolic')
    if self.count != 0:
        raise NotImplementedError  # TODO(later)

    # Find the state prior to the loop: the guard in-edge carrying the
    # initial assignment comes from before the loop; the other from the
    # last body state.
    if rng[0] == symbolic.pystr_to_symbolic(
            guard_inedges[0].data.assignments[itervar]):
        before_state: sd.SDFGState = guard_inedges[0].src
        last_state: sd.SDFGState = guard_inedges[1].src
    else:
        before_state: sd.SDFGState = guard_inedges[1].src
        last_state: sd.SDFGState = guard_inedges[0].src

    # Get loop states (all states reachable from the body start without
    # passing through the guard)
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    # Evaluate the real values of the loop
    start, end, stride = (symbolic.evaluate(r, sdfg.constants) for r in rng)

    # Create states for loop subgraph
    unrolled_states = []
    for i in range(start, end + 1, stride):
        # Instantiate loop states with iterate value
        new_states = self.instantiate_loop(sdfg, loop_states, loop_subgraph,
                                           itervar, i)

        # Connect iterations with unconditional edges
        if len(unrolled_states) > 0:
            sdfg.add_edge(unrolled_states[-1][1], new_states[first_id],
                          sd.InterstateEdge())

        unrolled_states.append((new_states[first_id], new_states[last_id]))

    # Connect new states to before and after states without conditions
    if unrolled_states:
        sdfg.add_edge(before_state, unrolled_states[0][0],
                      sd.InterstateEdge())
        sdfg.add_edge(unrolled_states[-1][1], after_state,
                      sd.InterstateEdge())

    # Remove old states from SDFG
    sdfg.remove_nodes_from([guard] + loop_states)
def apply(self, sdfg):
    """Fully unroll the detected loop by copying its states once per
    iteration (via JSON round-trip) and rewiring the state machine."""
    # States identified by the DetectLoop pattern
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Recover the iteration variable and its range from the guard's edges
    entry_edges = sdfg.in_edges(guard)
    loop_cond = sdfg.edges_between(guard, begin)[0].data.condition_sympy()
    itervar = next(iter(entry_edges[0].data.assignments))
    rng = LoopUnroll._loop_range(itervar, entry_edges, loop_cond)

    # Full unrolling requires concrete loop bounds
    if self.count == 0 and any(
            symbolic.issymbolic(r, sdfg.constants) for r in rng):
        raise ValueError('Loop cannot be fully unrolled, size is symbolic')
    if self.count != 0:
        raise NotImplementedError  # TODO(later)

    # The incoming edge whose assignment matches the range start comes from
    # before the loop; the other is the back-edge from the last loop state.
    init_expr = symbolic.pystr_to_symbolic(
        entry_edges[0].data.assignments[itervar])
    if rng[0] == init_expr:
        before_state, last_state = entry_edges[0].src, entry_edges[1].src
    else:
        before_state, last_state = entry_edges[1].src, entry_edges[0].src

    # Loop body: every state reachable from begin without crossing the guard
    loop_states = list(
        sdutil.dfs_topological_sort(
            sdfg,
            sources=[begin],
            condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    # Concrete start, end, and stride of the loop
    start, end, stride = (symbolic.evaluate(r, sdfg.constants) for r in rng)

    # Emit one specialized copy of the loop body per iteration value
    unrolled_states = []
    for value in range(start, end + 1, stride):
        # Copy states through JSON: faster than deepcopy (which would also
        # copy the parent SDFG).
        new_states = [
            sd.SDFGState.from_json(s.to_json(), context={'sdfg': sdfg})
            for s in loop_states
        ]

        # Specialize each copied state for this iteration value
        for state in new_states:
            state.set_label(state.label + '_%s_%d' % (itervar, value))
            state.replace(itervar, value)

        # Reconnect the copies like the original body, substituting the
        # iteration variable inside interstate edge conditions.
        for edge in loop_subgraph.edges():
            src = new_states[loop_states.index(edge.src)]
            dst = new_states[loop_states.index(edge.dst)]
            data: sd.InterstateEdge = copy.deepcopy(edge.data)
            if data.condition:
                ASTFindReplace({itervar: str(value)}).visit(data.condition)
            sdfg.add_edge(src, dst, data)

        # Chain this iteration unconditionally after the previous one
        if unrolled_states:
            sdfg.add_edge(unrolled_states[-1][1], new_states[first_id],
                          sd.InterstateEdge())
        unrolled_states.append((new_states[first_id], new_states[last_id]))

    # Splice the unrolled chain between the pre- and post-loop states
    if unrolled_states:
        sdfg.add_edge(before_state, unrolled_states[0][0],
                      sd.InterstateEdge())
        sdfg.add_edge(unrolled_states[-1][1], after_state,
                      sd.InterstateEdge())

    # The guard and the original loop body are now dead
    sdfg.remove_nodes_from([guard] + loop_states)
def apply(self, sdfg):
    """Fully unroll the for-loop found by ``find_for_loop``.

    Replaces the guard and loop body with one specialized copy of the body
    per iteration, chained by unconditional interstate edges, and reattaches
    the pre-/post-loop states (preserving any assignments on the loop's
    exit edge).

    Raises:
        NotImplementedError: If partial unrolling is requested
            (``self.count != 0``).
        TypeError: If the loop trip count or stride cannot be evaluated to
            concrete integers.
    """
    # Obtain loop information
    guard: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_guard])
    begin: sd.SDFGState = sdfg.node(self.subgraph[DetectLoop._loop_begin])
    after_state: sd.SDFGState = sdfg.node(
        self.subgraph[DetectLoop._exit_state])

    # Obtain iteration variable, range, and stride, together with the last
    # state(s) before the loop and the last loop state.
    itervar, rng, loop_struct = find_for_loop(sdfg, guard, begin)

    # Loop must be fully unrollable for now.
    if self.count != 0:
        raise NotImplementedError  # TODO(later)

    # Get loop states: everything reachable from begin without the guard
    loop_states = list(
        sdutil.dfs_conditional(sdfg,
                               sources=[begin],
                               condition=lambda _, child: child != guard))
    first_id = loop_states.index(begin)
    last_state = loop_struct[1]
    last_id = loop_states.index(last_state)
    loop_subgraph = gr.SubgraphView(sdfg, loop_states)

    try:
        # Unpack the range directly (the former generator-expression
        # indirection was redundant and raised the same ValueError on a
        # length mismatch).
        start, end, stride = rng
        stride = symbolic.evaluate(stride, sdfg.constants)
        loop_diff = int(symbolic.evaluate(end - start + 1, sdfg.constants))
        # The trip count may be concrete even when the endpoints themselves
        # are symbolic (e.g., N..N+9); remember that case so copies can be
        # labeled by offset instead of by value.
        is_symbolic = any(symbolic.issymbolic(r) for r in rng[:2])
    except TypeError:
        raise TypeError('Loop difference and strides cannot be symbolic.')

    # Create states for loop subgraph
    unrolled_states = []
    for i in range(0, loop_diff, stride):
        current_index = start + i
        # Instantiate loop states with iterate value
        new_states = self.instantiate_loop(sdfg, loop_states, loop_subgraph,
                                           itervar, current_index,
                                           str(i) if is_symbolic else None)

        # Connect iterations with unconditional edges
        if unrolled_states:
            sdfg.add_edge(unrolled_states[-1][1], new_states[first_id],
                          sd.InterstateEdge())
        unrolled_states.append((new_states[first_id], new_states[last_id]))

    # Get any assignments that might be on the edge to the after state
    after_assignments = (sdfg.edges_between(guard,
                                            after_state)[0].data.assignments)

    # Connect new states to before and after states without conditions
    if unrolled_states:
        before_states = loop_struct[0]
        for before_state in before_states:
            sdfg.add_edge(before_state, unrolled_states[0][0],
                          sd.InterstateEdge())
        sdfg.add_edge(unrolled_states[-1][1], after_state,
                      sd.InterstateEdge(assignments=after_assignments))

    # Remove old states from SDFG
    sdfg.remove_nodes_from([guard] + loop_states)
def _initialize_return_values(self, kwargs):
    """Allocate (or reuse cached) arrays for the SDFG's ``__return*`` values.

    Evaluates array shapes/strides using symbol values taken from ``kwargs``
    and SDFG constants, allocates a numpy array per host return value (a
    cupy array for GPU-resident ones), and caches the result so repeated
    calls with identical symbol values reuse the same buffers. Also sets
    ``self._return_arrays`` to ``None``, a single array, or a tuple,
    depending on the number of return values.

    Returns:
        dict: Mapping from return-array name to its allocated buffer.

    Raises:
        NotImplementedError: For stream or FPGA return values, or GPU
            return values when cupy is not installed.
    """
    # Obtain symbol values from arguments and constants
    syms = dict()
    syms.update(
        {k: v for k, v in kwargs.items() if k not in self.sdfg.arrays})
    syms.update(self.sdfg.constants)

    # Fast path: identical symbol values imply identical shapes/strides,
    # so the previously allocated arrays can be reused as-is.
    if self._initialized:
        if self._return_syms == syms:
            return self._return_kwarrays

    self._return_syms = syms

    # Initialize return values with numpy (or cupy) arrays
    self._return_arrays = []
    self._return_kwarrays = {}
    for arrname, arr in sorted(self.sdfg.arrays.items()):
        if arrname.startswith('__return') and not arr.transient:
            if arrname in kwargs:
                # Caller supplied the output buffer; use it directly
                self._return_arrays.append(kwargs[arrname])
                self._return_kwarrays[arrname] = kwargs[arrname]
                continue
            if isinstance(arr, dt.Stream):
                raise NotImplementedError('Return streams are unsupported')

            ndarray = np.ndarray
            zeros = np.zeros
            if arr.storage is dtypes.StorageType.GPU_Global:
                try:
                    import cupy

                    # Set allocator to GPU. The wrapper adapts numpy's
                    # ``buffer=`` keyword to cupy's ``memptr=``.
                    # NOTE: inner kwargs renamed to avoid shadowing the
                    # method's ``kwargs`` parameter.
                    def ndarray(*args, buffer=None, **nd_kwargs):
                        if buffer is not None:
                            buffer = buffer.data
                        return cupy.ndarray(*args,
                                            memptr=buffer,
                                            **nd_kwargs)

                    zeros = cupy.zeros
                except ImportError:
                    # ModuleNotFoundError is a subclass of ImportError, so
                    # a single clause covers both.
                    raise NotImplementedError('GPU return values are '
                                              'unsupported if cupy is not '
                                              'installed')
            if arr.storage is dtypes.StorageType.FPGA_Global:
                raise NotImplementedError('FPGA return values are '
                                          'unsupported')

            # Create an array with the properties of the SDFG array
            self._return_arrays.append(
                ndarray([symbolic.evaluate(s, syms) for s in arr.shape],
                        arr.dtype.as_numpy_dtype(),
                        buffer=zeros(
                            [symbolic.evaluate(arr.total_size, syms)],
                            arr.dtype.as_numpy_dtype()),
                        strides=[
                            symbolic.evaluate(s, syms) * arr.dtype.bytes
                            for s in arr.strides
                        ]))
            self._return_kwarrays[arrname] = self._return_arrays[-1]

    # Set up return_arrays field: None / single array / tuple of arrays
    if len(self._return_arrays) == 0:
        self._return_arrays = None
    elif len(self._return_arrays) == 1:
        self._return_arrays = self._return_arrays[0]
    else:
        self._return_arrays = tuple(self._return_arrays)

    return self._return_kwarrays