def add(self, name: str, dtype: DefinedType, ctype: str, ancestor: int = 0, allow_shadowing: bool = False):
    """Register a variable definition in a tracked scope.

    The variable is stored in the scope ``ancestor`` levels above the
    innermost one. Before storing, enclosing scopes are scanned (innermost
    first) for an existing definition of the same name; shadowing is an
    error unless permitted explicitly via ``allow_shadowing`` or globally
    via the "compiler, allow_shadowing" configuration entry.
    """
    if not isinstance(name, str):
        raise TypeError(f"Variable name type cannot be {type(name).__name__}")
    # Walk scopes from innermost to outermost; stop at the first scope
    # boundary that cannot see its parent.
    for _, symbols, sees_parent in reversed(self._scopes):
        if name in symbols:
            message = f"Shadowing variable {name} from type {symbols[name]} to {dtype}"
            shadowing_permitted = (allow_shadowing or
                                   config.Config.get_bool("compiler", "allow_shadowing"))
            if not shadowing_permitted:
                raise cgx.CodegenError(message)
            if not allow_shadowing:
                # Only the global configuration allowed this: warn instead
                # of failing silently.
                print("WARNING: " + message)
        if not sees_parent:
            break
    # Record (dtype, ctype) in the requested scope's symbol table.
    self._scopes[-1 - ancestor][1][name] = (dtype, ctype)
def get_generated_codeobjects(self):
    """Assemble every code object generated for this Xilinx program.

    Produces:
      * one host-side C++ ``CodeObject`` containing ``__dace_init_xilinx``
        / ``__dace_exit_xilinx`` plus all per-kernel host code collected in
        ``self._host_codes``;
      * one device ``CodeObject`` per entry of ``self._kernel_codes``;
      * auxiliary files from ``self._other_codes``, including a ``link.cfg``
        connectivity file (``sp=``/``stream_connect=`` entries, presumably
        consumed by the Vitis linker — TODO confirm) built here from
        ``self._bank_assignments`` and ``self._stream_connections``.

    :return: ``[host] + kernels + other`` list of CodeObject instances.
    :raises cgx.CodegenError: on an unknown "compiler, xilinx, mode" value.
    :raises RuntimeError: if only a subset of global arrays has bank
                          assignments.
    """
    execution_mode = Config.get("compiler", "xilinx", "mode")
    # The .xclbin path is assembled as a C string using the DACE_BINARY_DIR
    # preprocessor macro; the suffix depends on the execution mode.
    kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
    if execution_mode == "software_emulation":
        kernel_file_name += "_sw_emu.xclbin\""
        xcl_emulation_mode = "\"sw_emu\""
        xilinx_sdx = "DACE_VITIS_DIR"
    elif execution_mode == "hardware_emulation":
        kernel_file_name += "_hw_emu.xclbin\""
        xcl_emulation_mode = "\"hw_emu\""
        xilinx_sdx = "DACE_VITIS_DIR"
    elif execution_mode == "hardware" or execution_mode == "simulation":
        # Hardware and simulation share the _hw binary and need no
        # emulation environment variables.
        kernel_file_name += "_hw.xclbin\""
        xcl_emulation_mode = None
        xilinx_sdx = None
    else:
        raise cgx.CodegenError(
            "Unknown Xilinx execution mode: {}".format(execution_mode))

    # Emit C++ statements that set or unset the emulation-related
    # environment variables at __dace_init_xilinx time.
    set_env_vars = ""
    set_str = "dace::set_environment_variable(\"{}\", {});\n"
    unset_str = "dace::unset_environment_variable(\"{}\");\n"
    set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode)
                     if xcl_emulation_mode is not None else
                     unset_str.format("XCL_EMULATION_MODE"))
    set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx)
                     if xilinx_sdx is not None else
                     unset_str.format("XILINX_SDX"))
    set_env_vars += set_str.format(
        "EMCONFIG_PATH", "DACE_BINARY_DIR"
    ) if execution_mode == 'hardware_emulation' else unset_str.format(
        "EMCONFIG_PATH")

    host_code = CodeIOStream()
    host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
""")
    # Extra headers are only needed when instrumentation is active
    # (the dispatcher's instrumentation map appears to hold more than one
    # entry in that case — TODO confirm the > 1 threshold).
    if len(self._dispatcher.instrumentation) > 1:
        host_code.write("""\
#include "dace/perf/reporting.h"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
""")
    host_code.write("\n\n")

    self._frame.generate_fileheader(self._global_sdfg, host_code,
                                    'xilinx_host')

    params_comma = self._global_sdfg.signature(with_arrays=False)
    if params_comma:
        params_comma = ', ' + params_comma

    # Host entry/exit points plus the concatenated per-kernel host code,
    # each kernel section framed by a separator comment.
    host_code.write("""
DACE_EXPORTED int __dace_init_xilinx({sdfg.name}_t *__state{signature}) {{
    {environment_variables}
    __state->fpga_context = new dace::fpga::Context();
    __state->fpga_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{
    delete __state->fpga_context;
}}

{host_code}""".format(signature=params_comma,
                      sdfg=self._global_sdfg,
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(
                              separator="/" * 79, kernel_name=name, code=code)
                          for (name, code) in self._host_codes
                      ])))

    host_code_obj = CodeObject(self._program_name,
                               host_code.getvalue(),
                               "cpp",
                               XilinxCodeGen,
                               "Xilinx",
                               target_type="host")

    kernel_code_objs = [
        CodeObject(kernel_name,
                   code,
                   "cpp",
                   XilinxCodeGen,
                   "Xilinx",
                   target_type="device")
        for (kernel_name, code) in self._kernel_codes
    ]

    # Memory bank and streaming interfaces connectivity configuration file
    link_cfg = CodeIOStream()
    self._other_codes["link.cfg"] = link_cfg
    link_cfg.write("[connectivity]")
    are_assigned = [v is not None for v in self._bank_assignments.values()]
    if any(are_assigned):
        # Bank assignment is all-or-nothing: a partial assignment is an
        # error rather than silently mixing defaults and explicit banks.
        if not all(are_assigned):
            raise RuntimeError("Some, but not all global memory arrays "
                               "were assigned to memory banks: {}".format(
                                   self._bank_assignments))
        # Emit mapping from kernel memory interfaces to DRAM banks
        for (kernel_name, interface_name), (
                memory_type, memory_bank) in self._bank_assignments.items():
            link_cfg.write(
                f"sp={kernel_name}_1.m_axi_{interface_name}:{memory_type}[{memory_bank}]"
            )
    # Emit mapping between inter-kernel streaming interfaces
    for _, (src, dst) in self._stream_connections.items():
        link_cfg.write(f"stream_connect={src}:{dst}")

    other_objs = []
    for name, code in self._other_codes.items():
        # File names are "<base>.<ext...>"; the extension is everything
        # after the first dot.
        name = name.split(".")
        other_objs.append(
            CodeObject(name[0],
                       code.getvalue(),
                       ".".join(name[1:]),
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device"))

    return [host_code_obj] + kernel_code_objs + other_objs
def get_generated_codeobjects(self):
    """Assemble every code object generated for this Xilinx program.

    Produces one host-side C++ ``CodeObject`` (``__dace_init_xilinx`` /
    ``__dace_exit_xilinx`` plus all per-kernel host code from
    ``self._host_codes``), one device ``CodeObject`` per entry of
    ``self._kernel_codes``, and one ``<kernel>_memory_interfaces`` CSV per
    host-code kernel describing its memory-interface bank assignments.

    :return: ``[host] + kernels (+ per-kernel CSVs)`` list of CodeObject.
    :raises cgx.CodegenError: on an unknown "compiler, xilinx, mode" value.
    :raises RuntimeError: if only a subset of global memory arrays has
                          interface assignments.
    """
    execution_mode = Config.get("compiler", "xilinx", "mode")
    # The .xclbin path is assembled as a C string using the DACE_BINARY_DIR
    # preprocessor macro; the suffix depends on the execution mode.
    kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
    if execution_mode == "software_emulation":
        kernel_file_name += "_sw_emu.xclbin\""
        xcl_emulation_mode = "\"sw_emu\""
        xilinx_sdx = "DACE_VITIS_DIR"
    elif execution_mode == "hardware_emulation":
        kernel_file_name += "_hw_emu.xclbin\""
        xcl_emulation_mode = "\"hw_emu\""
        xilinx_sdx = "DACE_VITIS_DIR"
    elif execution_mode == "hardware" or execution_mode == "simulation":
        # Hardware and simulation share the _hw binary and need no
        # emulation environment variables.
        kernel_file_name += "_hw.xclbin\""
        xcl_emulation_mode = None
        xilinx_sdx = None
    else:
        raise cgx.CodegenError(
            "Unknown Xilinx execution mode: {}".format(execution_mode))

    # Emit C++ statements that set or unset the emulation-related
    # environment variables at __dace_init_xilinx time.
    set_env_vars = ""
    set_str = "dace::set_environment_variable(\"{}\", {});\n"
    unset_str = "dace::unset_environment_variable(\"{}\");\n"
    set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode)
                     if xcl_emulation_mode is not None else
                     unset_str.format("XCL_EMULATION_MODE"))
    set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx)
                     if xilinx_sdx is not None else
                     unset_str.format("XILINX_SDX"))

    host_code = CodeIOStream()
    host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
#include <iostream>\n\n""")

    self._frame.generate_fileheader(self._global_sdfg, host_code,
                                    'xilinx_host')

    params_comma = self._global_sdfg.signature(with_arrays=False)
    if params_comma:
        params_comma = ', ' + params_comma

    # Host entry/exit points plus the concatenated per-kernel host code,
    # each kernel section framed by a separator comment.
    host_code.write("""
DACE_EXPORTED int __dace_init_xilinx({sdfg.name}_t *__state{signature}) {{
    {environment_variables}
    __state->fpga_context = new dace::fpga::Context();
    __state->fpga_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{
    delete __state->fpga_context;
}}

{host_code}""".format(signature=params_comma,
                      sdfg=self._global_sdfg,
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(
                              separator="/" * 79, kernel_name=name, code=code)
                          for (name, code) in self._host_codes
                      ])))

    host_code_obj = CodeObject(self._program_name,
                               host_code.getvalue(),
                               "cpp",
                               XilinxCodeGen,
                               "Xilinx",
                               target_type="host")

    kernel_code_objs = [
        CodeObject(kernel_name,
                   code,
                   "cpp",
                   XilinxCodeGen,
                   "Xilinx",
                   target_type="device")
        for (kernel_name, code) in self._kernel_codes
    ]

    # Configuration file with interface assignments. Assignment is
    # all-or-nothing: a partial assignment is an error rather than
    # silently mixing defaults and explicit banks.
    are_assigned = [
        v is not None for v in self._interface_assignments.values()
    ]
    if any(are_assigned):
        if not all(are_assigned):
            raise RuntimeError("Some, but not all global memory arrays "
                               "were assigned to memory banks: {}".format(
                                   self._interface_assignments))
        are_assigned = True
    else:
        are_assigned = False

    for name, _ in self._host_codes:
        # BUGFIX: reset the assignment rows for every kernel. Previously
        # this list was created once before the loop, so each subsequent
        # kernel's CSV also contained all rows of the kernels before it.
        bank_assignment_code = []
        # Only iterate over assignments if any exist
        if are_assigned:
            for (kernel_name, interface_name), (
                    memory_type,
                    memory_bank) in self._interface_assignments.items():
                if kernel_name != name:
                    continue
                bank_assignment_code.append("{},{},{}".format(
                    interface_name, memory_type.name, memory_bank))
        # Create file even if there are no assignments
        kernel_code_objs.append(
            CodeObject("{}_memory_interfaces".format(name),
                       "\n".join(bank_assignment_code),
                       "csv",
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device"))

    return [host_code_obj] + kernel_code_objs
def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters,
                    module_stream, entry_stream, host_stream,
                    instrumentation_stream):
    """Generates a module that will run as a dataflow function in the FPGA
       kernel.

    Builds the C++ argument lists for the module (call site and signature),
    handles three parameter classes — global arrays (per-HBM-bank pointer
    interfaces), streams (``dace::FIFO``), and everything else — then either:

      * RTL path: if the subgraph contains a SystemVerilog tasklet, only
        records stream connectivity in ``self._stream_connections``, lets the
        dispatcher generate the RTL module into a discarded stream, and emits
        the host-side kernel launch; or
      * HLS path: emits the ``HLSLIB_DATAFLOW_FUNCTION`` caller (optionally
        inside unrolled processing-element loops), then generates the module
        body: pointer/interface registrations, local transient allocation,
        and the dispatched subgraph code.

    Note: ``kernel_name`` and ``dfg`` are currently unused in this body.
    """
    state_id = sdfg.node_id(state)
    dfg = sdfg.nodes()[state_id]

    kernel_args_call = []
    kernel_args_module = []
    for is_output, pname, p, interface_ids in parameters:
        if isinstance(p, dt.Array):
            # One pointer argument per HBM bank/interface of the array.
            for bank, interface_id in fpga.iterate_hbm_interface_ids(
                    p, interface_ids):
                arr_name = fpga.fpga_ptr(pname,
                                         p,
                                         sdfg,
                                         bank,
                                         is_output,
                                         is_array_interface=True)
                # Add interface ID to called module, but not to the module
                # arguments
                argname = fpga.fpga_ptr(pname,
                                        p,
                                        sdfg,
                                        bank,
                                        is_output,
                                        is_array_interface=True,
                                        interface_id=interface_id)
                kernel_args_call.append(argname)
                dtype = p.dtype
                # Inputs are passed as const pointers.
                kernel_args_module.append("{} {}*{}".format(
                    dtype.ctype, "const " if not is_output else "", arr_name))
        else:
            if isinstance(p, dt.Stream):
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                if p.is_stream_array():
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> {}[{}]".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname, p.size_string()))
                else:
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> &{}".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname))
            else:
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                kernel_args_module.append(
                    p.as_arg(with_types=True, name=pname))

    # Check if we are generating an RTL module, in which case only the
    # accesses to the streams should be handled
    rtl_tasklet = None
    for n in subgraph.nodes():
        if (isinstance(n, dace.nodes.Tasklet)
                and n.language == dace.dtypes.Language.SystemVerilog):
            rtl_tasklet = n
            break
    if rtl_tasklet:
        # Emit the would-be caller/signature only as comments; the actual
        # wiring is done through stream connections below.
        entry_stream.write(
            f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});'
        )
        module_stream.write(
            f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n')

        # _1 in names are due to vitis
        # Record the producer/consumer AXI stream endpoints of this RTL
        # module; index 0 = source (m_axis), index 1 = destination (s_axis).
        for node in subgraph.source_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.out_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(edge.dst, sdfg.sdfg_id,
                                                    sdfg.node_id(state),
                                                    state.node_id(edge.dst))
                    self._stream_connections[
                        node.data][1] = '{}_top_1.s_axis_{}'.format(
                            rtl_name, edge.dst_conn)

        for node in subgraph.sink_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.in_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(edge.src, sdfg.sdfg_id,
                                                    sdfg.node_id(state),
                                                    state.node_id(edge.src))
                    self._stream_connections[
                        node.data][0] = '{}_top_1.m_axis_{}'.format(
                            rtl_name, edge.src_conn)

        # Make the dispatcher trigger generation of the RTL module, but
        # ignore the generated code, as the RTL codegen will generate the
        # appropriate files.
        ignore_stream = CodeIOStream()
        self._dispatcher.dispatch_subgraph(sdfg,
                                           subgraph,
                                           state_id,
                                           ignore_stream,
                                           ignore_stream,
                                           skip_entry_node=False)

        # Launch the kernel from the host code (streams are wired on the
        # device and thus excluded from the host-side argument list).
        rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg)
        host_stream.write(
            f" auto kernel_{rtl_name} = program.MakeKernel(\"{rtl_name}_top\"{', '.join([''] + [name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskFork();",
            sdfg, state_id, rtl_tasklet)
        if state.instrument == dtypes.InstrumentationType.FPGA:
            self.instrument_opencl_kernel(rtl_name, state_id, sdfg.sdfg_id,
                                          instrumentation_stream)

        return

    # create a unique module name to prevent name clashes
    module_function_name = f"module_{name}_{sdfg.sdfg_id}"

    # Unrolling processing elements: if there first scope of the subgraph
    # is an unrolled map, generate a processing element for each iteration
    scope_children = subgraph.scope_children()
    top_scopes = [
        n for n in scope_children[None]
        if isinstance(n, dace.sdfg.nodes.EntryNode)
    ]
    unrolled_loops = 0
    if len(top_scopes) == 1:
        scope = top_scopes[0]
        if scope.unroll:
            self._unrolled_pes.add(scope.map)
            # NOTE(review): `+=` with a joined *string* extends the list
            # character-by-character; this only behaves as intended for
            # single-character map parameter names — presumably should be
            # `+= scope.map.params`. TODO confirm.
            kernel_args_call += ", ".join(scope.map.params)
            kernel_args_module += ["int " + p for p in scope.params]
            for p, r in zip(scope.map.params, scope.map.range):
                if len(r) > 3:
                    raise cgx.CodegenError("Strided unroll not supported")
                entry_stream.write(
                    "for (size_t {param} = {begin}; {param} < {end}; "
                    "{param} += {increment}) {{\n#pragma HLS UNROLL".format(
                        param=p, begin=r[0], end=r[1] + 1, increment=r[2]))
                unrolled_loops += 1

    # Generate caller code in top-level function
    entry_stream.write(
        "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
            module_function_name, ", ".join(kernel_args_call)), sdfg,
        state_id)

    # Close the unroll loops opened above.
    for _ in range(unrolled_loops):
        entry_stream.write("}")

    # ----------------------------------------------------------------------
    # Generate kernel code
    # ----------------------------------------------------------------------
    self._dispatcher.defined_vars.enter_scope(subgraph)

    module_body_stream = CodeIOStream()

    module_body_stream.write(
        "void {}({}) {{".format(module_function_name,
                                ", ".join(kernel_args_module)), sdfg,
        state_id)

    # Register the array interface as a naked pointer for use inside the
    # FPGA kernel
    interfaces_added = set()
    for is_output, argname, arg, interface_id in parameters:
        for bank, _ in fpga.iterate_hbm_interface_ids(arg, interface_id):
            if (not (isinstance(arg, dt.Array)
                     and arg.storage == dace.dtypes.StorageType.FPGA_Global)):
                continue
            ctype = dtypes.pointer(arg.dtype).ctype
            ptr_name = fpga.fpga_ptr(argname,
                                     arg,
                                     sdfg,
                                     bank,
                                     is_output,
                                     None,
                                     is_array_interface=True)
            if not is_output:
                ctype = f"const {ctype}"
            self._dispatcher.defined_vars.add(ptr_name, DefinedType.Pointer,
                                              ctype)
            # The plain argument name is registered only once, even when it
            # spans multiple banks/interfaces.
            if argname in interfaces_added:
                continue
            interfaces_added.add(argname)
            self._dispatcher.defined_vars.add(argname,
                                              DefinedType.ArrayInterface,
                                              ctype,
                                              allow_shadowing=True)
    module_body_stream.write("\n")

    # Allocate local transients (top-level transients of this subgraph
    # that are neither SDFG-shared nor passed in as parameters).
    data_to_allocate = (set(subgraph.top_level_transients()) -
                        set(sdfg.shared_transients()) -
                        set([p[1] for p in parameters]))
    allocated = set()
    for node in subgraph.nodes():
        if not isinstance(node, dace.sdfg.nodes.AccessNode):
            continue
        if node.data not in data_to_allocate or node.data in allocated:
            continue
        allocated.add(node.data)
        self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                           node.desc(sdfg), module_stream,
                                           module_body_stream)

    self._dispatcher.dispatch_subgraph(sdfg,
                                       subgraph,
                                       state_id,
                                       module_stream,
                                       module_body_stream,
                                       skip_entry_node=False)

    module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
    module_stream.write("}\n\n")

    self._dispatcher.defined_vars.exit_scope(subgraph)
def generate_module(self, sdfg, state, name, subgraph, parameters,
                    symbol_parameters, module_stream, entry_stream,
                    host_stream):
    """Generates a module that will run as a dataflow function in the FPGA
       kernel.

    Sorts and deduplicates the parameters (arrays first, then scalars and
    symbols), builds the C++ argument lists for the module call site and
    signature, emits the ``HLSLIB_DATAFLOW_FUNCTION`` caller (optionally
    inside unrolled processing-element loops), and then generates the module
    body: ``dace::ArrayInterface`` wrappers pairing the ``_in``/``_out``
    pointers of each global array, local transient allocation, and the
    dispatched subgraph code.

    Note: ``dfg`` and ``host_stream`` are currently unused in this body.
    """
    state_id = sdfg.node_id(state)
    dfg = sdfg.nodes()[state_id]

    kernel_args_call = []
    kernel_args_module = []
    added = set()

    # Order parameters by name; arrays keep per-direction duplicates,
    # scalars/symbols are deduplicated below.
    parameters = list(sorted(parameters, key=lambda t: t[1]))
    arrays = dtypes.deduplicate(
        [p for p in parameters if not isinstance(p[2], dace.data.Scalar)])
    scalars = [p for p in parameters if isinstance(p[2], dace.data.Scalar)]
    # Symbols are treated like scalar inputs (is_output=False, no interface).
    scalars += ((False, k, v, None) for k, v in symbol_parameters.items())
    scalars = dace.dtypes.deduplicate(sorted(scalars, key=lambda t: t[1]))
    for is_output, pname, p, interface_id in itertools.chain(
            arrays, scalars):
        if isinstance(p, dace.data.Array):
            # Arrays get direction-suffixed pointer arguments.
            arr_name = "{}_{}".format(pname, "out" if is_output else "in")
            # Add interface ID to called module, but not to the module
            # arguments
            argname = arr_name
            if interface_id is not None:
                argname = arr_name + "_%d" % interface_id
            kernel_args_call.append(argname)
            dtype = p.dtype
            # Inputs are passed as const pointers.
            kernel_args_module.append("{} {}*{}".format(
                dtype.ctype, "const " if not is_output else "", arr_name))
        else:
            # Don't make duplicate arguments for other types than arrays
            if pname in added:
                continue
            added.add(pname)
            if isinstance(p, dace.data.Stream):
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                if p.is_stream_array():
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> {}[{}]".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname, p.size_string()))
                else:
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> &{}".format(
                            p.dtype.base_type.ctype, p.veclen,
                            p.buffer_size, pname))
            else:
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                kernel_args_module.append(
                    p.as_arg(with_types=True, name=pname))

    # create a unique module name to prevent name clashes
    module_function_name = f"module_{name}_{sdfg.sdfg_id}"

    # Unrolling processing elements: if there first scope of the subgraph
    # is an unrolled map, generate a processing element for each iteration
    scope_children = subgraph.scope_children()
    top_scopes = [
        n for n in scope_children[None]
        if isinstance(n, dace.sdfg.nodes.EntryNode)
    ]
    unrolled_loops = 0
    if len(top_scopes) == 1:
        scope = top_scopes[0]
        if scope.unroll:
            self._unrolled_pes.add(scope.map)
            # NOTE(review): `+=` with a joined *string* extends the list
            # character-by-character; this only behaves as intended for
            # single-character map parameter names — presumably should be
            # `+= scope.map.params`. TODO confirm.
            kernel_args_call += ", ".join(scope.map.params)
            kernel_args_module += ["int " + p for p in scope.params]
            for p, r in zip(scope.map.params, scope.map.range):
                if len(r) > 3:
                    raise cgx.CodegenError("Strided unroll not supported")
                entry_stream.write(
                    "for (size_t {param} = {begin}; {param} < {end}; "
                    "{param} += {increment}) {{\n#pragma HLS UNROLL".format(
                        param=p, begin=r[0], end=r[1] + 1, increment=r[2]))
                unrolled_loops += 1

    # Generate caller code in top-level function
    entry_stream.write(
        "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
            module_function_name, ", ".join(kernel_args_call)), sdfg,
        state_id)

    # Close the unroll loops opened above.
    for _ in range(unrolled_loops):
        entry_stream.write("}")

    # ----------------------------------------------------------------------
    # Generate kernel code
    # ----------------------------------------------------------------------
    self._dispatcher.defined_vars.enter_scope(subgraph)

    module_body_stream = CodeIOStream()

    module_body_stream.write(
        "void {}({}) {{".format(module_function_name,
                                ", ".join(kernel_args_module)), sdfg,
        state_id)

    # Construct ArrayInterface wrappers to pack input and output pointers
    # to the same global array
    in_args = {
        argname
        for out, argname, arg, _ in parameters
        if isinstance(arg, dace.data.Array)
        and arg.storage == dace.dtypes.StorageType.FPGA_Global and not out
    }
    out_args = {
        argname
        for out, argname, arg, _ in parameters
        if isinstance(arg, dace.data.Array)
        and arg.storage == dace.dtypes.StorageType.FPGA_Global and out
    }
    if len(in_args) > 0 or len(out_args) > 0:
        # Add ArrayInterface objects to wrap input and output pointers to
        # the same array
        module_body_stream.write("\n")
        interfaces_added = set()
        for _, argname, arg, _ in parameters:
            if argname in interfaces_added:
                continue
            interfaces_added.add(argname)
            has_in_ptr = argname in in_args
            has_out_ptr = argname in out_args
            if not has_in_ptr and not has_out_ptr:
                continue
            # Missing directions are represented by nullptr in the wrapper.
            in_ptr = ("{}_in".format(argname) if has_in_ptr else "nullptr")
            out_ptr = ("{}_out".format(argname)
                       if has_out_ptr else "nullptr")
            ctype = "dace::ArrayInterface<{}>".format(arg.dtype.ctype)
            module_body_stream.write("{} {}({}, {});".format(
                ctype, argname, in_ptr, out_ptr))
            self._dispatcher.defined_vars.add(argname,
                                              DefinedType.ArrayInterface,
                                              ctype,
                                              allow_shadowing=True)
        module_body_stream.write("\n")

    # Allocate local transients (top-level transients of this subgraph
    # that are neither SDFG-shared nor passed in as parameters).
    data_to_allocate = (set(subgraph.top_level_transients()) -
                        set(sdfg.shared_transients()) -
                        set([p[1] for p in parameters]))
    allocated = set()
    for node in subgraph.nodes():
        if not isinstance(node, dace.sdfg.nodes.AccessNode):
            continue
        if node.data not in data_to_allocate or node.data in allocated:
            continue
        allocated.add(node.data)
        self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                           module_stream,
                                           module_body_stream)

    self._dispatcher.dispatch_subgraph(sdfg,
                                       subgraph,
                                       state_id,
                                       module_stream,
                                       module_body_stream,
                                       skip_entry_node=False)

    module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
    module_stream.write("}\n\n")

    self._dispatcher.defined_vars.exit_scope(subgraph)