def generate_no_dependence_post(self, kernel_stream, sdfg, state_id, node, var_name):
    """Emit a post-loop HLS pragma that tells the tool to ignore
    loop-carried dependencies on ``var_name``.

    If the variable is an array interface, the pragma must name the raw
    (output) pointer rather than the interface alias.
    """
    var_info = self._dispatcher.defined_vars.get(var_name)
    if var_info[0] == DefinedType.ArrayInterface:
        # Resolve the interface alias to its underlying pointer name.
        var_name = cpp.array_interface_variable(var_name, True, self._dispatcher)
    pragma = "#pragma HLS DEPENDENCE variable={} false".format(var_name)
    kernel_stream.write(pragma, sdfg, state_id, node)
def generate_host_header(self, sdfg, kernel_function_name, parameters, host_code_stream):
    """Write the raw-pointer declaration of the kernel function into the
    host code stream so host-side calls can be argument-matched.

    Array parameters are declared under their array-interface name (with
    an optional interface-ID suffix); all other parameters keep their
    plain argument form.
    """
    kernel_args = []
    for is_output, name, arg, if_id in parameters:
        if not isinstance(arg, dt.Array):
            kernel_args.append(arg.as_arg(with_types=True, name=name))
            continue
        # Arrays are exposed through direction-specific interface names.
        argname = cpp.array_interface_variable(name, is_output, None)
        if if_id is not None:
            argname = f"{argname}_{if_id}"
        kernel_args.append(arg.as_arg(with_types=True, name=argname))
    host_code_stream.write(
        """\
// Signature of kernel function (with raw pointers) for argument matching
DACE_EXPORTED void {kernel_function_name}({kernel_args});\n\n""".format(
            kernel_function_name=kernel_function_name,
            kernel_args=", ".join(kernel_args)), sdfg)
def make_kernel_argument(data, var_name, is_output, with_vectorization, interface_id=None):
    """Return the C++ signature fragment for a single kernel argument.

    :param data: The data descriptor of the argument (Array, Stream, or other).
    :param var_name: Base name of the argument.
    :param is_output: Whether this argument is written by the kernel
        (selects the output array interface name).
    :param with_vectorization: If True, arrays use the (possibly vector)
        dtype directly; otherwise the scalar base type is used.
    :param interface_id: Optional interface ID appended to array names.
    :return: A string such as ``"float *a_out"`` suitable for a signature.
    """
    if isinstance(data, dt.Array):
        # Arrays are passed as raw pointers through direction-specific
        # interface names.
        var_name = cpp.array_interface_variable(var_name, is_output, None)
        # Fix: the original contained a duplicated assignment
        # ("var_name = var_name = ..."); a single assignment suffices.
        if interface_id is not None:
            var_name = f"{var_name}_{interface_id}"
        dtype = data.dtype if with_vectorization else data.dtype.base_type
        return "{} *{}".format(dtype.ctype, var_name)
    if isinstance(data, dt.Stream):
        # Streams map to hlslib FIFO references.
        ctype = "dace::FIFO<{}, {}, {}>".format(data.dtype.base_type.ctype,
                                                data.dtype.veclen,
                                                data.buffer_size)
        return "{} &{}".format(ctype, var_name)
    # Scalars and other descriptors know how to print themselves.
    return data.as_arg(with_types=True, name=var_name)
def generate_nsdfg_arguments(self, sdfg, dfg, state, node):
    """Build the list of memlet reference strings used to call a nested
    SDFG from device code.

    Connectors that are both input and output share the same name, unless
    they are pointers to global memory in device code, in which case they
    are split into explicit input and output interfaces.
    """
    # Connectors present on both sides; their input-side reference is
    # skipped below so the name is only emitted once.
    inout = set(node.in_connectors.keys() & node.out_connectors.keys())
    memlet_references = []
    # Input connectors, in deterministic (connector-name) order.
    for _, _, _, vconn, in_memlet in sorted(
            state.in_edges(node), key=lambda e: e.dst_conn or ""):
        if in_memlet.data is None:
            continue  # Empty memlet: nothing to pass
        is_memory_interface = (self._dispatcher.defined_vars.get(
            in_memlet.data, 1)[0] == DefinedType.ArrayInterface)
        if is_memory_interface:
            # Global-memory interface: emit a reference to the raw input
            # pointer instead of the shared connector name.
            interface_name = cpp.array_interface_variable(
                vconn, False, None)
            # Register the raw pointer as a defined variable (must happen
            # before emitting the reference that uses it).
            self._dispatcher.defined_vars.add(
                interface_name, DefinedType.Pointer,
                node.in_connectors[vconn].ctype)
            interface_ref = cpp.emit_memlet_reference(
                self._dispatcher,
                sdfg,
                in_memlet,
                interface_name,
                conntype=node.in_connectors[vconn],
                is_write=False)
            memlet_references.append(interface_ref)
        if vconn in inout:
            # Shared in/out connector: the output loop below emits it.
            continue
        ref = cpp.emit_memlet_reference(self._dispatcher,
                                        sdfg,
                                        in_memlet,
                                        vconn,
                                        conntype=node.in_connectors[vconn],
                                        is_write=False)
        if not is_memory_interface:
            # For memory interfaces the raw-pointer reference above
            # already covers this connector.
            memlet_references.append(ref)
    # Output connectors, in deterministic (connector-name) order.
    for _, uconn, _, _, out_memlet in sorted(
            state.out_edges(node), key=lambda e: e.src_conn or ""):
        if out_memlet.data is None:
            continue  # Empty memlet: nothing to pass
        ref = cpp.emit_memlet_reference(
            self._dispatcher,
            sdfg,
            out_memlet,
            uconn,
            conntype=node.out_connectors[uconn],
            is_write=True)
        is_memory_interface = (self._dispatcher.defined_vars.get(
            out_memlet.data, 1)[0] == DefinedType.ArrayInterface)
        if is_memory_interface:
            # Output side of a global-memory interface: reference the raw
            # output pointer.
            interface_name = cpp.array_interface_variable(
                uconn, True, None)
            # Register the raw pointer as a defined variable
            self._dispatcher.defined_vars.add(
                interface_name, DefinedType.Pointer,
                node.out_connectors[uconn].ctype)
            memlet_references.append(
                cpp.emit_memlet_reference(
                    self._dispatcher,
                    sdfg,
                    out_memlet,
                    interface_name,
                    conntype=node.out_connectors[uconn],
                    is_write=True))
        else:
            memlet_references.append(ref)
    return memlet_references
def generate_module(self, sdfg, state, name, subgraph, parameters, module_stream, entry_stream, host_stream):
    """Generates a module that will run as a dataflow function in the FPGA
    kernel.

    Builds the module's argument lists, handles the special RTL-tasklet
    case (SystemVerilog), optionally wraps the module call in unrolled
    loops for processing elements, and finally generates the module body
    by dispatching the subgraph.
    """
    state_id = sdfg.node_id(state)
    dfg = sdfg.nodes()[state_id]
    # Two parallel argument lists: names used at the call site, and typed
    # declarations used in the module signature.
    kernel_args_call = []
    kernel_args_module = []
    for is_output, pname, p, interface_id in parameters:
        if isinstance(p, dt.Array):
            arr_name = cpp.array_interface_variable(pname, is_output, None)
            # Add interface ID to called module, but not to the module
            # arguments
            argname = arr_name
            if interface_id is not None:
                argname = f"{arr_name}_{interface_id}"
            kernel_args_call.append(argname)
            dtype = p.dtype
            # Input pointers are const-qualified.
            kernel_args_module.append("{} {}*{}".format(
                dtype.ctype, "const " if not is_output else "", arr_name))
        else:
            if isinstance(p, dt.Stream):
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                if p.is_stream_array():
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> {}[{}]".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname, p.size_string()))
                else:
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> &{}".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname))
            else:
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                kernel_args_module.append(
                    p.as_arg(with_types=True, name=pname))
    # Check if we are generating an RTL module, in which case only the
    # accesses to the streams should be handled
    rtl_tasklet = None
    for n in subgraph.nodes():
        if (isinstance(n, dace.nodes.Tasklet)
                and n.language == dace.dtypes.Language.SystemVerilog):
            rtl_tasklet = n
            break
    if rtl_tasklet:
        # Emit commented-out placeholders; the RTL codegen produces the
        # actual implementation files.
        entry_stream.write(
            f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});'
        )
        module_stream.write(
            f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n')
        # _1 in names are due to vitis
        # Record AXI stream connections for the source (input) streams.
        for node in subgraph.source_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.out_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(
                        edge.dst, sdfg.sdfg_id, sdfg.node_id(state),
                        state.node_id(edge.dst))
                    self._stream_connections[
                        node.data][1] = '{}_top_1.s_axis_{}'.format(
                            rtl_name, edge.dst_conn)
        # Record AXI stream connections for the sink (output) streams.
        for node in subgraph.sink_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.in_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(
                        edge.src, sdfg.sdfg_id, sdfg.node_id(state),
                        state.node_id(edge.src))
                    self._stream_connections[
                        node.data][0] = '{}_top_1.m_axis_{}'.format(
                            rtl_name, edge.src_conn)
        # Make the dispatcher trigger generation of the RTL module, but
        # ignore the generated code, as the RTL codegen will generate the
        # appropriate files.
        ignore_stream = CodeIOStream()
        self._dispatcher.dispatch_subgraph(sdfg,
                                           subgraph,
                                           state_id,
                                           ignore_stream,
                                           ignore_stream,
                                           skip_entry_node=False)
        # Launch the kernel from the host code
        rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg)
        host_stream.write(
            f"  auto kernel_{rtl_name} = program.MakeKernel(\"{rtl_name}_top\", {', '.join([name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskFork();",
            sdfg, state_id, rtl_tasklet)
        return
    # create a unique module name to prevent name clashes
    module_function_name = f"module_{name}_{sdfg.sdfg_id}"
    # Unrolling processing elements: if there first scope of the subgraph
    # is an unrolled map, generate a processing element for each iteration
    scope_children = subgraph.scope_children()
    top_scopes = [
        n for n in scope_children[None]
        if isinstance(n, dace.sdfg.nodes.EntryNode)
    ]
    unrolled_loops = 0
    if len(top_scopes) == 1:
        scope = top_scopes[0]
        if scope.unroll:
            self._unrolled_pes.add(scope.map)
            # NOTE(review): list += str extends the list with the string's
            # individual characters; for multi-character map parameter
            # names this would corrupt the call-argument list — confirm
            # whether this should be `+= scope.map.params` instead.
            kernel_args_call += ", ".join(scope.map.params)
            kernel_args_module += ["int " + p for p in scope.params]
            for p, r in zip(scope.map.params, scope.map.range):
                if len(r) > 3:
                    raise cgx.CodegenError("Strided unroll not supported")
                # Emit the unrolled loop header; the closing braces are
                # written after the dataflow call below.
                entry_stream.write(
                    "for (size_t {param} = {begin}; {param} < {end}; "
                    "{param} += {increment}) {{\n#pragma HLS UNROLL".
                    format(param=p, begin=r[0], end=r[1] + 1,
                           increment=r[2]))
                unrolled_loops += 1
    # Generate caller code in top-level function
    entry_stream.write(
        "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
            module_function_name, ", ".join(kernel_args_call)), sdfg,
        state_id)
    # Close any unrolled loop scopes opened above.
    for _ in range(unrolled_loops):
        entry_stream.write("}")
    # ----------------------------------------------------------------------
    # Generate kernel code
    # ----------------------------------------------------------------------
    self._dispatcher.defined_vars.enter_scope(subgraph)
    module_body_stream = CodeIOStream()
    module_body_stream.write(
        "void {}({}) {{".format(module_function_name,
                                ", ".join(kernel_args_module)), sdfg,
        state_id)
    # Register the array interface as a naked pointer for use inside the
    # FPGA kernel
    interfaces_added = set()
    for is_output, argname, arg, _ in parameters:
        if (not (isinstance(arg, dt.Array)
                 and arg.storage == dace.dtypes.StorageType.FPGA_Global)):
            continue
        ctype = dtypes.pointer(arg.dtype).ctype
        ptr_name = cpp.array_interface_variable(argname, is_output, None)
        if not is_output:
            ctype = f"const {ctype}"
        self._dispatcher.defined_vars.add(ptr_name, DefinedType.Pointer,
                                          ctype)
        # Register the interface alias only once per argument name, even
        # if it appears as both input and output.
        if argname in interfaces_added:
            continue
        interfaces_added.add(argname)
        self._dispatcher.defined_vars.add(argname,
                                          DefinedType.ArrayInterface,
                                          ctype,
                                          allow_shadowing=True)
    module_body_stream.write("\n")
    # Allocate local transients
    data_to_allocate = (set(subgraph.top_level_transients()) -
                        set(sdfg.shared_transients()) -
                        set([p[1] for p in parameters]))
    allocated = set()
    for node in subgraph.nodes():
        if not isinstance(node, dace.sdfg.nodes.AccessNode):
            continue
        if node.data not in data_to_allocate or node.data in allocated:
            continue
        allocated.add(node.data)
        self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                           module_stream,
                                           module_body_stream)
    self._dispatcher.dispatch_subgraph(sdfg,
                                       subgraph,
                                       state_id,
                                       module_stream,
                                       module_body_stream,
                                       skip_entry_node=False)
    module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
    module_stream.write("}\n\n")
    self._dispatcher.defined_vars.exit_scope(subgraph)