def generate_kernel_internal(self, sdfg, state, kernel_name, subgraphs, kernel_stream, function_stream, callsite_stream): """Main entry function for generating a Xilinx kernel.""" (global_data_parameters, top_level_local_data, subgraph_parameters, scalar_parameters, symbol_parameters, nested_global_transients) = self.make_parameters( sdfg, state, subgraphs) # Scalar parameters are never output sc_parameters = [(False, pname, param) for pname, param in scalar_parameters] host_code_stream = CodeIOStream() # Generate host code self.generate_host_header(sdfg, kernel_name, global_data_parameters + sc_parameters, symbol_parameters, host_code_stream) self.generate_host_function_boilerplate( sdfg, state, kernel_name, global_data_parameters + sc_parameters, symbol_parameters, nested_global_transients, host_code_stream, function_stream, callsite_stream) self.generate_host_function_body( sdfg, state, kernel_name, global_data_parameters + sc_parameters, symbol_parameters, host_code_stream) # Store code to be passed to compilation phase self._host_codes.append((kernel_name, host_code_stream.getvalue())) # Now we write the device code module_stream = CodeIOStream() entry_stream = CodeIOStream() state_id = sdfg.node_id(state) self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name, global_data_parameters, scalar_parameters, symbol_parameters, module_stream, entry_stream) # Emit allocations for node in top_level_local_data: self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, module_stream, entry_stream) self._dispatcher.dispatch_initialize(sdfg, state, state_id, node, module_stream, entry_stream) self.generate_modules(sdfg, state, kernel_name, subgraphs, subgraph_parameters, sc_parameters, symbol_parameters, module_stream, entry_stream, host_code_stream) kernel_stream.write(module_stream.getvalue()) kernel_stream.write(entry_stream.getvalue()) self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
def __init__(self, frame_codegen, sdfg): self._frame = frame_codegen self._dispatcher = frame_codegen.dispatcher dispatcher = self._dispatcher fileheader = CodeIOStream() self._frame.generate_fileheader(sdfg, fileheader) self._codeobj = CodeObject( sdfg.name + '_mpi', """ #include <dace/dace.h> #include <mpi.h> MPI_Comm __dace_mpi_comm; int __dace_comm_size = 1; int __dace_comm_rank = 0; {file_header} DACE_EXPORTED int __dace_init_mpi({params}); DACE_EXPORTED void __dace_exit_mpi({params}); int __dace_init_mpi({params}) {{ int isinit = 0; if (MPI_Initialized(&isinit) != MPI_SUCCESS) return 1; if (!isinit) {{ if (MPI_Init(NULL, NULL) != MPI_SUCCESS) return 1; }} MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm); MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank); MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size); printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank, __dace_comm_size); return 0; }} void __dace_exit_mpi({params}) {{ MPI_Comm_free(&__dace_mpi_comm); MPI_Finalize(); printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank, __dace_comm_size); }} """.format(params=sdfg.signature(), file_header=fileheader.getvalue()), 'cpp', MPICodeGen, 'MPI') # Register dispatchers dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self)
def get_generated_codeobjects(self): fileheader = CodeIOStream() sdfg = self._global_sdfg self._frame.generate_fileheader(sdfg, fileheader, 'mpi') params_comma = self._global_sdfg.init_signature( free_symbols=self._frame.free_symbols(self._global_sdfg)) if params_comma: params_comma = ', ' + params_comma codeobj = CodeObject( sdfg.name + '_mpi', """ #include <dace/dace.h> #include <mpi.h> MPI_Comm __dace_mpi_comm; int __dace_comm_size = 1; int __dace_comm_rank = 0; {file_header} DACE_EXPORTED int __dace_init_mpi({sdfg.name}_t *__state{params}); DACE_EXPORTED void __dace_exit_mpi({sdfg.name}_t *__state); int __dace_init_mpi({sdfg.name}_t *__state{params}) {{ int isinit = 0; if (MPI_Initialized(&isinit) != MPI_SUCCESS) return 1; if (!isinit) {{ if (MPI_Init(NULL, NULL) != MPI_SUCCESS) return 1; }} MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm); MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank); MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size); printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank, __dace_comm_size); return 0; }} void __dace_exit_mpi({sdfg.name}_t *__state) {{ MPI_Comm_free(&__dace_mpi_comm); MPI_Finalize(); printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank, __dace_comm_size); }} """.format(params=params_comma, sdfg=sdfg, file_header=fileheader.getvalue()), 'cpp', MPICodeGen, 'MPI') return [codeobj]
def dispatch_allocate(self, sdfg, dfg, state_id, node, function_stream, allocation_stream): """ Dispatches a code generator for data allocation. """ nodedesc = node.desc(sdfg) storage = (nodedesc.storage if not isinstance(node, nodes.Tasklet) else dtypes.StorageType.Register) self._used_targets.add(self._array_dispatchers[storage]) # TODO: Move to central allocator (see PR #434) if nodedesc.lifetime is dtypes.AllocationLifetime.Persistent: declaration_stream = CodeIOStream() allocation_stream = self.frame._initcode else: declaration_stream = allocation_stream self._array_dispatchers[storage].allocate_array( sdfg, dfg, state_id, node, function_stream, declaration_stream, allocation_stream) # TODO: Move to central allocator (see PR #434) if nodedesc.lifetime is dtypes.AllocationLifetime.Persistent: self.frame.statestruct.append(declaration_stream.getvalue())
def get_generated_codeobjects(self): execution_mode = Config.get("compiler", "xilinx", "mode") kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name) if execution_mode == "software_emulation": kernel_file_name += "_sw_emu.xclbin\"" xcl_emulation_mode = "\"sw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware_emulation": kernel_file_name += "_hw_emu.xclbin\"" xcl_emulation_mode = "\"hw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware" or execution_mode == "simulation": kernel_file_name += "_hw.xclbin\"" xcl_emulation_mode = None xilinx_sdx = None else: raise dace.codegen.codegen.CodegenError( "Unknown Xilinx execution mode: {}".format(execution_mode)) set_env_vars = "" set_str = "dace::set_environment_variable(\"{}\", {});\n" unset_str = "dace::unset_environment_variable(\"{}\");\n" set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode) if xcl_emulation_mode is not None else unset_str.format("XCL_EMULATION_MODE")) set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx is not None else unset_str.format("XILINX_SDX")) host_code = CodeIOStream() host_code.write("""\ #include "dace/xilinx/host.h" #include "dace/dace.h" #include <iostream>\n\n""") self._frame.generate_fileheader(self._global_sdfg, host_code) host_code.write(""" dace::fpga::Context *dace::fpga::_context; DACE_EXPORTED int __dace_init_xilinx({signature}) {{ {environment_variables} dace::fpga::_context = new dace::fpga::Context(); dace::fpga::_context->Get().MakeProgram({kernel_file_name}); return 0; }} DACE_EXPORTED void __dace_exit_xilinx({signature}) {{ delete dace::fpga::_context; }} {host_code}""".format(signature=self._global_sdfg.signature(), environment_variables=set_env_vars, kernel_file_name=kernel_file_name, host_code="".join([ "{separator}\n// Kernel: {kernel_name}" "\n{separator}\n\n{code}\n\n".format(separator="/" * 79, kernel_name=name, code=code) for (name, code) in self._host_codes ]))) host_code_obj = CodeObject(self._program_name, host_code.getvalue(), "cpp", XilinxCodeGen, "Xilinx", target_type="host") kernel_code_objs = [ CodeObject(kernel_name, code, "cpp", XilinxCodeGen, "Xilinx", target_type="device") for (kernel_name, code) in self._kernel_codes ] # Configuration file with interface assignments are_assigned = [ v is not None for v in self._interface_assignments.values() ] bank_assignment_code = [] if any(are_assigned): if not all(are_assigned): raise RuntimeError("Some, but not all global memory arrays " "were assigned to memory banks: {}".format( self._interface_assignments)) are_assigned = True else: are_assigned = False for name, _ in self._host_codes: # Only iterate over assignments if any exist if are_assigned: for (kernel_name, interface_name), ( memory_type, memory_bank) in self._interface_assignments.items(): if kernel_name != name: continue bank_assignment_code.append("{},{},{}".format( interface_name, memory_type.name, memory_bank)) # Create file even if there are no assignments kernel_code_objs.append( CodeObject("{}_memory_interfaces".format(name), "\n".join(bank_assignment_code), "csv", XilinxCodeGen, "Xilinx", target_type="device")) return [host_code_obj] + kernel_code_objs
def generate_module(self, sdfg, state, kernel_name, name, subgraph, parameters, module_stream, entry_stream, host_stream, instrumentation_stream): """Generates a module that will run as a dataflow function in the FPGA kernel.""" state_id = sdfg.node_id(state) dfg = sdfg.nodes()[state_id] kernel_args_call = [] kernel_args_module = [] for is_output, pname, p, interface_ids in parameters: if isinstance(p, dt.Array): for bank, interface_id in fpga.iterate_hbm_interface_ids( p, interface_ids): arr_name = fpga.fpga_ptr(pname, p, sdfg, bank, is_output, is_array_interface=True) # Add interface ID to called module, but not to the module # arguments argname = fpga.fpga_ptr(pname, p, sdfg, bank, is_output, is_array_interface=True, interface_id=interface_id) kernel_args_call.append(argname) dtype = p.dtype kernel_args_module.append("{} {}*{}".format( dtype.ctype, "const " if not is_output else "", arr_name)) else: if isinstance(p, dt.Stream): kernel_args_call.append( p.as_arg(with_types=False, name=pname)) if p.is_stream_array(): kernel_args_module.append( "dace::FIFO<{}, {}, {}> {}[{}]".format( p.dtype.base_type.ctype, p.veclen, p.buffer_size, pname, p.size_string())) else: kernel_args_module.append( "dace::FIFO<{}, {}, {}> &{}".format( p.dtype.base_type.ctype, p.veclen, p.buffer_size, pname)) else: kernel_args_call.append( p.as_arg(with_types=False, name=pname)) kernel_args_module.append( p.as_arg(with_types=True, name=pname)) # Check if we are generating an RTL module, in which case only the # accesses to the streams should be handled rtl_tasklet = None for n in subgraph.nodes(): if (isinstance(n, dace.nodes.Tasklet) and n.language == dace.dtypes.Language.SystemVerilog): rtl_tasklet = n break if rtl_tasklet: entry_stream.write( f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});' ) module_stream.write( f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n') # _1 in names are due to vitis for node in subgraph.source_nodes(): if isinstance(sdfg.arrays[node.data], dt.Stream): if node.data not in self._stream_connections: self._stream_connections[node.data] = [None, None] for edge in state.out_edges(node): rtl_name = "{}_{}_{}_{}".format( edge.dst, sdfg.sdfg_id, sdfg.node_id(state), state.node_id(edge.dst)) self._stream_connections[ node.data][1] = '{}_top_1.s_axis_{}'.format( rtl_name, edge.dst_conn) for node in subgraph.sink_nodes(): if isinstance(sdfg.arrays[node.data], dt.Stream): if node.data not in self._stream_connections: self._stream_connections[node.data] = [None, None] for edge in state.in_edges(node): rtl_name = "{}_{}_{}_{}".format( edge.src, sdfg.sdfg_id, sdfg.node_id(state), state.node_id(edge.src)) self._stream_connections[ node.data][0] = '{}_top_1.m_axis_{}'.format( rtl_name, edge.src_conn) # Make the dispatcher trigger generation of the RTL module, but # ignore the generated code, as the RTL codegen will generate the # appropriate files. ignore_stream = CodeIOStream() self._dispatcher.dispatch_subgraph(sdfg, subgraph, state_id, ignore_stream, ignore_stream, skip_entry_node=False) # Launch the kernel from the host code rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg) host_stream.write( f" auto kernel_{rtl_name} = program.MakeKernel(\"{rtl_name}_top\"{', '.join([''] + [name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskFork();", sdfg, state_id, rtl_tasklet) if state.instrument == dtypes.InstrumentationType.FPGA: self.instrument_opencl_kernel(rtl_name, state_id, sdfg.sdfg_id, instrumentation_stream) return # create a unique module name to prevent name clashes module_function_name = f"module_{name}_{sdfg.sdfg_id}" # Unrolling processing elements: if there first scope of the subgraph # is an unrolled map, generate a processing element for each iteration scope_children = subgraph.scope_children() top_scopes = [ n for n in scope_children[None] if isinstance(n, dace.sdfg.nodes.EntryNode) ] unrolled_loops = 0 if len(top_scopes) == 1: scope = top_scopes[0] if scope.unroll: self._unrolled_pes.add(scope.map) kernel_args_call += ", ".join(scope.map.params) kernel_args_module += ["int " + p for p in scope.params] for p, r in zip(scope.map.params, scope.map.range): if len(r) > 3: raise cgx.CodegenError("Strided unroll not supported") entry_stream.write( "for (size_t {param} = {begin}; {param} < {end}; " "{param} += {increment}) {{\n#pragma HLS UNROLL". format(param=p, begin=r[0], end=r[1] + 1, increment=r[2])) unrolled_loops += 1 # Generate caller code in top-level function entry_stream.write( "HLSLIB_DATAFLOW_FUNCTION({}, {});".format( module_function_name, ", ".join(kernel_args_call)), sdfg, state_id) for _ in range(unrolled_loops): entry_stream.write("}") # ---------------------------------------------------------------------- # Generate kernel code # ---------------------------------------------------------------------- self._dispatcher.defined_vars.enter_scope(subgraph) module_body_stream = CodeIOStream() module_body_stream.write( "void {}({}) {{".format(module_function_name, ", ".join(kernel_args_module)), sdfg, state_id) # Register the array interface as a naked pointer for use inside the # FPGA kernel interfaces_added = set() for is_output, argname, arg, interface_id in parameters: for bank, _ in fpga.iterate_hbm_interface_ids(arg, interface_id): if (not (isinstance(arg, dt.Array) and arg.storage == dace.dtypes.StorageType.FPGA_Global)): continue ctype = dtypes.pointer(arg.dtype).ctype ptr_name = fpga.fpga_ptr(argname, arg, sdfg, bank, is_output, None, is_array_interface=True) if not is_output: ctype = f"const {ctype}" self._dispatcher.defined_vars.add(ptr_name, DefinedType.Pointer, ctype) if argname in interfaces_added: continue interfaces_added.add(argname) self._dispatcher.defined_vars.add(argname, DefinedType.ArrayInterface, ctype, allow_shadowing=True) module_body_stream.write("\n") # Allocate local transients data_to_allocate = (set(subgraph.top_level_transients()) - set(sdfg.shared_transients()) - set([p[1] for p in parameters])) allocated = set() for node in subgraph.nodes(): if not isinstance(node, dace.sdfg.nodes.AccessNode): continue if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), module_stream, module_body_stream) self._dispatcher.dispatch_subgraph(sdfg, subgraph, state_id, module_stream, module_body_stream, skip_entry_node=False) module_stream.write(module_body_stream.getvalue(), sdfg, state_id) module_stream.write("}\n\n") self._dispatcher.defined_vars.exit_scope(subgraph)
def generate_code( self, sdfg: SDFG, schedule: Optional[dtypes.ScheduleType], sdfg_id: str = "" ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]: """ Generate frame code for a given SDFG, calling registered targets' code generation callbacks for them to generate their own code. :param sdfg: The SDFG to generate code for. :param schedule: The schedule the SDFG is currently located, or None if the SDFG is top-level. :param sdfg_id: An optional string id given to the SDFG label :return: A tuple of the generated global frame code, local frame code, and a set of targets that have been used in the generation of this SDFG. """ if len(sdfg_id) == 0 and sdfg.sdfg_id != 0: sdfg_id = '_%d' % sdfg.sdfg_id global_stream = CodeIOStream() callsite_stream = CodeIOStream() is_top_level = sdfg.parent is None # Analyze allocation lifetime of SDFG and all nested SDFGs if is_top_level: self.determine_allocation_lifetime(sdfg) # Generate code ########################### # Keep track of allocated variables allocated = set() # Add symbol mappings to allocated variables if sdfg.parent_nsdfg_node is not None: allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys() # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_begin(sdfg, callsite_stream, global_stream) # Allocate outer-level transients self.allocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) # Allocate inter-state variables global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update( {aname: arr.dtype for aname, arr in sdfg.arrays.items()}) interstate_symbols = {} for e in sdfg.edges(): symbols = e.data.new_symbols(sdfg, global_symbols) # Inferred symbols only take precedence if global symbol not defined symbols = { k: v if k not in global_symbols else global_symbols[k] for k, v in symbols.items() } interstate_symbols.update(symbols) global_symbols.update(symbols) for isvarName, isvarType in interstate_symbols.items(): isvar = data.Scalar(isvarType) callsite_stream.write( '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)), sdfg) self.dispatcher.defined_vars.add(isvarName, disp.DefinedType.Scalar, isvarType.ctype) callsite_stream.write('\n', sdfg) ####################################################################### # Generate actual program body states_generated = self.generate_states(sdfg, global_stream, callsite_stream) ####################################################################### # Sanity check if len(states_generated) != len(sdfg.nodes()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format( sdfg.label, [s.label for s in states_generated], [s.label for s in (set(sdfg.nodes()) - states_generated)])) # Deallocate transients self.deallocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) # Now that we have all the information about dependencies, generate # header and footer if is_top_level: # Let each target append code to frame code state before generating # header and footer for target in self._dispatcher.used_targets: target.on_target_used() header_stream = CodeIOStream() header_global_stream = CodeIOStream() footer_stream = CodeIOStream() footer_global_stream = CodeIOStream() # Get all environments used in the generated code, including # dependent environments import dace.library # Avoid import loops self.environments = dace.library.get_environments_and_dependencies( self._dispatcher.used_environments) self.generate_header(sdfg, header_global_stream, header_stream) # Open program function params = sdfg.signature() if params: params = ', ' + params function_signature = ( 'void __program_%s_internal(%s_t *__state%s)\n{\n' % (sdfg.name, sdfg.name, params)) self.generate_footer(sdfg, footer_global_stream, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() all_code = CodeIOStream() all_code.write(function_signature) all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() else: generated_header = global_stream.getvalue() generated_code = callsite_stream.getvalue() # Clean up generated code gotos = re.findall(r'goto (.*?);', generated_code) clean_code = '' for line in generated_code.split('\n'): # Empty line with semicolon if re.match(r'^\s*;\s*', line): continue # Label that might be unused label = re.findall( r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) if len(label) > 0: if label[0] not in gotos: continue clean_code += line + '\n' # Return the generated global and local code strings return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
def generate_module(self, sdfg, state, name, subgraph, parameters, symbol_parameters, module_stream, entry_stream, host_stream): """Generates a module that will run as a dataflow function in the FPGA kernel.""" state_id = sdfg.node_id(state) dfg = sdfg.nodes()[state_id] kernel_args_call = [] kernel_args_module = [] added = set() parameters = list(sorted(parameters, key=lambda t: t[1])) arrays = [ p for p in parameters if not isinstance(p[2], dace.data.Scalar) ] scalars = [p for p in parameters if isinstance(p[2], dace.data.Scalar)] scalars += ((False, k, v) for k, v in symbol_parameters.items()) scalars = dace.dtypes.deduplicate(sorted(scalars, key=lambda t: t[1])) for is_output, pname, p in itertools.chain(parameters, scalars): if isinstance(p, dace.data.Array): arr_name = "{}_{}".format(pname, "out" if is_output else "in") kernel_args_call.append(arr_name) dtype = p.dtype kernel_args_module.append("{} {}*{}".format( dtype.ctype, "const " if not is_output else "", arr_name)) else: # Don't make duplicate arguments for other types than arrays if pname in added: continue added.add(pname) if isinstance(p, dace.data.Stream): kernel_args_call.append( p.as_arg(with_types=False, name=pname)) if p.is_stream_array(): kernel_args_module.append( "dace::FIFO<{}, {}, {}> {}[{}]".format( p.dtype.base_type.ctype, p.veclen, p.buffer_size, pname, p.size_string())) else: kernel_args_module.append( "dace::FIFO<{}, {}, {}> &{}".format( p.dtype.base_type.ctype, p.veclen, p.buffer_size, pname)) else: kernel_args_call.append( p.as_arg(with_types=False, name=pname)) kernel_args_module.append( p.as_arg(with_types=True, name=pname)) module_function_name = "module_" + name # Unrolling processing elements: if there first scope of the subgraph # is an unrolled map, generate a processing element for each iteration scope_dict = subgraph.scope_dict(node_to_children=True) top_scopes = [ n for n in scope_dict[None] if isinstance(n, dace.sdfg.nodes.EntryNode) ] unrolled_loops = 0 if len(top_scopes) == 1: scope = top_scopes[0] if scope.unroll: self._unrolled_pes.add(scope.map) kernel_args_call += ", ".join(scope.map.params) kernel_args_module += ["int " + p for p in scope.params] for p, r in zip(scope.map.params, scope.map.range): if len(r) > 3: raise dace.codegen.codegen.CodegenError( "Strided unroll not supported") entry_stream.write( "for (size_t {param} = {begin}; {param} < {end}; " "{param} += {increment}) {{\n#pragma HLS UNROLL".format( param=p, begin=r[0], end=r[1] + 1, increment=r[2])) unrolled_loops += 1 # Generate caller code in top-level function entry_stream.write( "HLSLIB_DATAFLOW_FUNCTION({}, {});".format( module_function_name, ", ".join(kernel_args_call)), sdfg, state_id) for _ in range(unrolled_loops): entry_stream.write("}") # ---------------------------------------------------------------------- # Generate kernel code # ---------------------------------------------------------------------- self._dispatcher.defined_vars.enter_scope(subgraph) module_body_stream = CodeIOStream() module_body_stream.write( "void {}({}) {{".format(module_function_name, ", ".join(kernel_args_module)), sdfg, state_id) # Construct ArrayInterface wrappers to pack input and output pointers # to the same global array in_args = { argname for out, argname, arg in parameters if isinstance(arg, dace.data.Array) and arg.storage == dace.dtypes.StorageType.FPGA_Global and not out } out_args = { argname for out, argname, arg in parameters if isinstance(arg, dace.data.Array) and arg.storage == dace.dtypes.StorageType.FPGA_Global and out } if len(in_args) > 0 or len(out_args) > 0: # Add ArrayInterface objects to wrap input and output pointers to # the same array module_body_stream.write("\n") interfaces_added = set() for _, argname, arg in parameters: if argname in interfaces_added: continue interfaces_added.add(argname) has_in_ptr = argname in in_args has_out_ptr = argname in out_args if not has_in_ptr and not has_out_ptr: continue in_ptr = ("{}_in".format(argname) if has_in_ptr else "nullptr") out_ptr = ("{}_out".format(argname) if has_out_ptr else "nullptr") ctype = "dace::ArrayInterface<{}>".format(arg.dtype.ctype) module_body_stream.write("{} {}({}, {});".format( ctype, argname, in_ptr, out_ptr)) self._dispatcher.defined_vars.add(argname, DefinedType.ArrayInterface, ctype, allow_shadowing=True) module_body_stream.write("\n") # Allocate local transients data_to_allocate = (set(subgraph.top_level_transients()) - set(sdfg.shared_transients()) - set([p[1] for p in parameters])) allocated = set() for node in subgraph.nodes(): if not isinstance(node, dace.sdfg.nodes.AccessNode): continue if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, module_stream, module_body_stream) self._dispatcher.dispatch_subgraph(sdfg, subgraph, state_id, module_stream, module_body_stream, skip_entry_node=False) module_stream.write(module_body_stream.getvalue(), sdfg, state_id) module_stream.write("}\n\n") self._dispatcher.defined_vars.exit_scope(subgraph)
class DaCeCodeGenerator(object): """ DaCe code generator class that writes the generated code for SDFG state machines, and uses a dispatcher to generate code for individual states based on the target. """ def __init__(self, *args, **kwargs): self._dispatcher = disp.TargetDispatcher(self) self._dispatcher.register_state_dispatcher(self) self._initcode = CodeIOStream() self._exitcode = CodeIOStream() self.statestruct: List[str] = [] self.environments: List[Any] = [] self.to_allocate: DefaultDict[ Union[SDFG, SDFGState, nodes.EntryNode], List[Tuple[int, int, nodes.AccessNode]]] = collections.defaultdict(list) ################################################################## # Target registry @property def dispatcher(self): return self._dispatcher ################################################################## # Code generation def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream): # Write constants for cstname, (csttype, cstval) in sdfg.constants_prop.items(): if isinstance(csttype, data.Array): const_str = "constexpr " + csttype.dtype.ctype + \ " " + cstname + "[" + str(cstval.size) + "] = {" it = np.nditer(cstval, order='C') for i in range(cstval.size - 1): const_str += str(it[0]) + ", " it.iternext() const_str += str(it[0]) + "};\n" callsite_stream.write(const_str, sdfg) else: callsite_stream.write( "constexpr %s %s = %s;\n" % (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg) def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: str = 'frame'): """ Generate a header in every output file that includes custom types and constants. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param backend: Whose backend this header belongs to. """ # Hash file include if backend == 'frame': global_stream.write('#include "../../include/hash.h"\n', sdfg) ######################################################### # Environment-based includes for env in self.environments: if len(env.headers) > 0: global_stream.write( "\n".join("#include \"" + h + "\"" for h in env.headers), sdfg) ######################################################### # Custom types datatypes = set() # Types of this SDFG for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) # Emit unique definitions wrote_something = False for typ in datatypes: if hasattr(typ, 'emit_definition'): if not wrote_something: global_stream.write("", sdfg) wrote_something = True global_stream.write(typ.emit_definition(), sdfg) if wrote_something: global_stream.write("", sdfg) ######################################################### # Write constants self.generate_constants(sdfg, global_stream) ######################################################### # Write state struct structstr = '\n'.join(self.statestruct) global_stream.write( f''' struct {sdfg.name}_t {{ {structstr} }}; ''', sdfg) for sd in sdfg.all_sdfgs_recursive(): if None in sd.global_code: global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd) if backend in sd.global_code: global_stream.write(codeblock_to_cpp(sd.global_code[backend]), sd) def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the header of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ # Write frame code - header global_stream.write( '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' + '#include <dace/dace.h>\n', sdfg) # Write header required by environments for env in self.environments: self.statestruct.extend(env.state_fields) # Instrumentation preamble if len(self._dispatcher.instrumentation) > 1: self.statestruct.append('dace::perf::Report report;') # Reset report if written every invocation if config.Config.get_bool('instrumentation', 'report_each_invocation'): callsite_stream.write('__state->report.reset();', sdfg) self.generate_fileheader(sdfg, global_stream, 'frame') def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the footer of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ import dace.library fname = sdfg.name params = sdfg.signature() paramnames = sdfg.signature(False, for_call=True) initparams = sdfg.signature(with_arrays=False) initparamnames = sdfg.signature(False, for_call=True, with_arrays=False) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_end(sdfg, callsite_stream, global_stream) # Instrumentation saving if (config.Config.get_bool('instrumentation', 'report_each_invocation') and len(self._dispatcher.instrumentation) > 1): callsite_stream.write( '''__state->report.save("{path}/perf", __HASH_{name});'''. format(path=sdfg.build_folder.replace('\\', '/'), name=sdfg.name), sdfg) # Write closing brace of program callsite_stream.write('}', sdfg) # Write awkward footer to avoid 'extern "C"' issues params_comma = (', ' + params) if params else '' initparams_comma = (', ' + initparams) if initparams else '' paramnames_comma = (', ' + paramnames) if paramnames else '' initparamnames_comma = (', ' + initparamnames) if initparamnames else '' callsite_stream.write( f''' DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma}) {{ __program_{fname}_internal(__state{paramnames_comma}); }}''', sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( 'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' % (target.target_name, sdfg.name, initparams_comma), sdfg) if target.has_finalizer: callsite_stream.write( 'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' % (target.target_name, sdfg.name), sdfg) callsite_stream.write( f""" DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams}) {{ int __result = 0; {sdfg.name}_t *__state = new {sdfg.name}_t; """, sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) for env in self.environments: init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg) if init_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(init_code) callsite_stream.write("}") for sd in sdfg.all_sdfgs_recursive(): if None in sd.init_code: callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd) callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd) callsite_stream.write(self._initcode.getvalue(), sdfg) callsite_stream.write( f""" if (__result) {{ delete __state; return nullptr; }} return __state; }} DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state) {{ """, sdfg) # Instrumentation saving if (not config.Config.get_bool('instrumentation', 'report_each_invocation') and len(self._dispatcher.instrumentation) > 1): callsite_stream.write( '__state->report.save("%s/perf", __HASH_%s);' % (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg) callsite_stream.write(self._exitcode.getvalue(), sdfg) for sd in sdfg.all_sdfgs_recursive(): if None in sd.exit_code: callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) for target in self._dispatcher.used_targets: if target.has_finalizer: callsite_stream.write( '__dace_exit_%s(__state);' % target.target_name, sdfg) for env in reversed(self.environments): finalize_code = _get_or_eval_sdfg_first_arg( env.finalize_code, sdfg) if finalize_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(finalize_code) callsite_stream.write("}") callsite_stream.write('delete __state;\n}\n', sdfg) def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): sid = sdfg.node_id(state) # Emit internal transient array allocation self.allocate_arrays_in_scope(sdfg, state, global_stream, callsite_stream) callsite_stream.write('\n') # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_begin(sdfg, state, callsite_stream, global_stream) ##################### # Create dataflow graph for state's children. # DFG to code scheme: Only generate code for nodes whose all # dependencies have been executed (topological sort). # For different connected components, run them concurrently. components = dace.sdfg.concurrent_subgraphs(state) if len(components) == 1: self._dispatcher.dispatch_subgraph(sdfg, state, sid, global_stream, callsite_stream, skip_entry_node=False) else: if sdfg.openmp_sections: callsite_stream.write("#pragma omp parallel sections\n{") for c in components: if sdfg.openmp_sections: callsite_stream.write("#pragma omp section\n{") self._dispatcher.dispatch_subgraph(sdfg, c, sid, global_stream, callsite_stream, skip_entry_node=False) if sdfg.openmp_sections: callsite_stream.write("} // End omp section") if sdfg.openmp_sections: callsite_stream.write("} // End omp sections") ##################### # Write state footer if generate_state_footer: # Emit internal transient array deallocation self.deallocate_arrays_in_scope(sdfg, state, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, global_stream) def generate_states(self, sdfg, global_stream, callsite_stream): states_generated = set() # Create closure + function for state dispatcher def dispatch_state(state: SDFGState) -> str: stream = CodeIOStream() self._dispatcher.dispatch_state(sdfg, state, global_stream, stream) states_generated.add(state) # For sanity check return stream.getvalue() # Handle specialized control flow if config.Config.get_bool('optimizer', 'detect_control_flow'): # Avoid import loop from dace.transformation import helpers as xfh # Clean up the state machine by separating combined condition and assignment # edges. xfh.split_interstate_edges(sdfg) cft = cflow.structured_control_flow_tree(sdfg, dispatch_state) else: # If disabled, generate entire graph as general control flow block states_topological = list(sdfg.topological_sort(sdfg.start_state)) last = states_topological[-1] cft = cflow.GeneralBlock(dispatch_state, [ cflow.SingleState(dispatch_state, s, s is last) for s in states_topological ], [], []) callsite_stream.write( cft.as_cpp(self.dispatcher.defined_vars, sdfg.symbols), sdfg) # Write exit label callsite_stream.write(f'__state_exit_{sdfg.sdfg_id}:;', sdfg) return states_generated def _get_schedule( self, scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> dtypes.ScheduleType: TOP_SCHEDULE = dtypes.ScheduleType.Sequential if scope is None: return TOP_SCHEDULE elif isinstance(scope, nodes.EntryNode): return scope.schedule elif isinstance(scope, (SDFGState, SDFG)): sdfg: SDFG = (scope if isinstance(scope, SDFG) else scope.parent) if sdfg.parent_nsdfg_node is None: return TOP_SCHEDULE return (sdfg.parent_nsdfg_node.schedule or TOP_SCHEDULE) else: raise TypeError def _can_allocate(self, sdfg: SDFG, state: SDFGState, desc: data.Data, scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> bool: schedule = self._get_schedule(scope) # if not dtypes.can_allocate(desc.storage, schedule): # return False if dtypes.can_allocate(desc.storage, schedule): return True # Check for device-level memory recursively node = scope if isinstance(scope, nodes.EntryNode) else None cstate = scope if isinstance(scope, SDFGState) else state csdfg = scope if isinstance(scope, SDFG) else sdfg if desc.storage in dtypes.FPGA_STORAGES: return sdscope.is_devicelevel_fpga(csdfg, cstate, node) elif desc.storage in dtypes.GPU_STORAGES: return sdscope.is_devicelevel_gpu(csdfg, cstate, node) return False def determine_allocation_lifetime(self, top_sdfg: SDFG): """ Determines where (at which scope/state/SDFG) each data descriptor will be allocated/deallocated. :param top_sdfg: The top-level SDFG to determine for. """ # Gather shared transients shared_transients = {} for sdfg in top_sdfg.all_sdfgs_recursive(): shared_transients[sdfg.sdfg_id] = sdfg.shared_transients( check_toplevel=False) for sdfg, name, desc in top_sdfg.arrays_recursive(): if not desc.transient: continue # NOTE: In the code below we infer where a transient should be # declared, allocated, and deallocated. The information is stored # in the `to_allocate` dictionary. The key of each entry is the # scope where one of the above actions must occur, while the value # is a tuple containing the following information: # 1. The SDFG object that containts the transient. # 2. The State id where the action should (approx.) take place. # 3. The Access Node id of the transient in the above State. # 4. True if declaration should take place, otherwise False. # 5. True if allocation should take place, otherwise False. # 6. True if deallocation should take place, otherwise False. # Possibly confusing control flow below finds the first state # and node of the data descriptor, or continues the # arrays_recursive() loop first_state_instance: int = None first_node_instance: nodes.AccessNode = None last_state_instance: int = None last_node_instance: nodes.AccessNode = None first = True for state in sdfg.topological_sort(): id = sdfg.nodes().index(state) for node in state.data_nodes(): if node.data == name: if first: first_state_instance = id first_node_instance = node first = False last_state_instance = id last_node_instance = node # break else: continue break # Cases if desc.lifetime is dtypes.AllocationLifetime.Persistent: # Persistent memory is allocated in initialization code and # exists in the library state structure # If unused, skip if first_node_instance is None: continue definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';' self.statestruct.append(definition) self.to_allocate[sdfg].append( (sdfg, first_state_instance, first_node_instance, True, True, True)) continue elif desc.lifetime is dtypes.AllocationLifetime.Global: # Global memory is allocated in the beginning of the program # exists in the library state structure (to be passed along # to the right SDFG) # If unused, skip if first_node_instance is None: continue definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';' self.statestruct.append(definition) # self.to_allocate[top_sdfg].append( # (sdfg.sdfg_id, sdfg.node_id(state), node)) self.to_allocate[top_sdfg].append( (sdfg, first_state_instance, first_node_instance, True, True, True)) continue # The rest of the cases change the starting scope we attempt to # allocate from, since the descriptors may only be allocated higher # in the hierarchy (e.g., in the case of GPU global memory inside # a kernel). alloc_scope: Union[nodes.EntryNode, SDFGState, SDFG] = None alloc_state: SDFGState = None access_node: nodes.AccessNode = None if (name in shared_transients[sdfg.sdfg_id] or desc.lifetime is dtypes.AllocationLifetime.SDFG): # SDFG memory and shared transients are allocated in the # beginning of their SDFG alloc_scope = sdfg if first_state_instance is not None: alloc_state = sdfg.nodes()[first_state_instance] # If unused, skip if first_node_instance is None: continue elif desc.lifetime is dtypes.AllocationLifetime.State: # State memory is either allocated in the beginning of the # containing state or the SDFG (if used in more than one state) curstate: SDFGState = None multistate = False for state in sdfg.nodes(): if any(n.data == name for n in state.data_nodes()): if curstate is not None: multistate = True break curstate = state if multistate: alloc_scope = sdfg else: alloc_scope = curstate alloc_state = curstate elif desc.lifetime is dtypes.AllocationLifetime.Scope: # Scope memory (default) is either allocated in the innermost # scope (e.g., Map, Consume) it is used in (i.e., greatest # common denominator), or in the SDFG if used in multiple states curscope: Union[nodes.EntryNode, SDFGState] = None curstate: SDFGState = None multistate = False # Does the array appear in inter-state edges? for isedge in sdfg.edges(): if name in isedge.data.free_symbols: multistate = True for state in sdfg.nodes(): if multistate: break sdict = state.scope_dict() for node in state.nodes(): if not isinstance(node, nodes.AccessNode): continue if node.data != name: continue # If already found in another state, set scope to SDFG if curstate is not None and curstate != state: multistate = True break curstate = state # Current scope (or state object if top-level) scope = sdict[node] or state if curscope is None: curscope = scope continue # States always win if isinstance(scope, SDFGState): curscope = scope continue # Lower/Higher/Disjoint scopes: find common denominator if isinstance(curscope, SDFGState): if scope in curscope.nodes(): continue curscope = sdscope.common_parent_scope( sdict, scope, curscope) if multistate: break if multistate: alloc_scope = sdfg else: alloc_scope = curscope alloc_state = curstate else: raise TypeError('Unrecognized allocation lifetime "%s"' % desc.lifetime) if alloc_scope is None: # No allocation necessary continue # If descriptor cannot be allocated in this scope, traverse up the # scope tree until it is possible cursdfg = sdfg curstate = alloc_state curscope = alloc_scope while not self._can_allocate(cursdfg, curstate, desc, curscope): if curscope is None: break if isinstance(curscope, nodes.EntryNode): # Go one scope up curscope = curstate.entry_node(curscope) if curscope is None: curscope = curstate elif isinstance(curscope, (SDFGState, SDFG)): cursdfg: SDFG = (curscope if isinstance(curscope, SDFG) else curscope.parent) # Go one SDFG up if cursdfg.parent_nsdfg_node is None: curscope = None curstate = None else: curstate = cursdfg.parent curscope = curstate.entry_node( cursdfg.parent_nsdfg_node) else: raise TypeError if curscope is None: curscope = top_sdfg # Check if Array/View is dependent on non-free SDFG symbols # NOTE: Tuple is (SDFG, State, Node, declare, allocate, deallocate) fsymbols = sdfg.free_symbols.union(sdfg.constants.keys()) if (not isinstance(curscope, nodes.EntryNode) and utils.is_nonfree_sym_dependent( first_node_instance, desc, alloc_state, fsymbols)): # Declare in current (SDFG) scope self.to_allocate[curscope].append( (sdfg, first_state_instance, first_node_instance, True, False, False)) # Allocate in first State # Deallocate in last State if first_state_instance != last_state_instance: curscope = sdfg.nodes()[first_state_instance] self.to_allocate[curscope].append( (sdfg, first_state_instance, first_node_instance, False, True, False)) curscope = sdfg.nodes()[last_state_instance] self.to_allocate[curscope].append( (sdfg, last_state_instance, last_node_instance, False, False, True)) else: curscope = sdfg.nodes()[first_state_instance] self.to_allocate[curscope].append( (sdfg, first_state_instance, first_node_instance, False, True, True)) else: self.to_allocate[curscope].append( (sdfg, first_state_instance, first_node_instance, True, True, True)) def allocate_arrays_in_scope(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Dispatches allocation of all arrays in the given scope. """ for tsdfg, state_id, node, declare, allocate, _ in self.to_allocate[ scope]: if state_id is not None: state = tsdfg.node(state_id) else: state = None desc = node.desc(tsdfg) self._dispatcher.dispatch_allocate(tsdfg, state, state_id, node, desc, function_stream, callsite_stream, declare, allocate) def deallocate_arrays_in_scope(self, sdfg: SDFG, scope: Union[nodes.EntryNode, SDFGState, SDFG], function_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Dispatches deallocation of all arrays in the given scope. """ for tsdfg, state_id, node, _, _, deallocate in self.to_allocate[scope]: if not deallocate: continue if state_id is not None: state = tsdfg.node(state_id) else: state = None desc = node.desc(tsdfg) self._dispatcher.dispatch_deallocate(tsdfg, state, state_id, node, desc, function_stream, callsite_stream) def generate_code( self, sdfg: SDFG, schedule: Optional[dtypes.ScheduleType], sdfg_id: str = "" ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]: """ Generate frame code for a given SDFG, calling registered targets' code generation callbacks for them to generate their own code. :param sdfg: The SDFG to generate code for. :param schedule: The schedule the SDFG is currently located, or None if the SDFG is top-level. :param sdfg_id: An optional string id given to the SDFG label :return: A tuple of the generated global frame code, local frame code, and a set of targets that have been used in the generation of this SDFG. """ if len(sdfg_id) == 0 and sdfg.sdfg_id != 0: sdfg_id = '_%d' % sdfg.sdfg_id global_stream = CodeIOStream() callsite_stream = CodeIOStream() is_top_level = sdfg.parent is None # Analyze allocation lifetime of SDFG and all nested SDFGs if is_top_level: self.determine_allocation_lifetime(sdfg) # Generate code ########################### # Keep track of allocated variables allocated = set() # Add symbol mappings to allocated variables if sdfg.parent_nsdfg_node is not None: allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys() # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_begin(sdfg, callsite_stream, global_stream) # Allocate outer-level transients self.allocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) # Allocate inter-state variables global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update( {aname: arr.dtype for aname, arr in sdfg.arrays.items()}) interstate_symbols = {} for e in sdfg.edges(): symbols = e.data.new_symbols(sdfg, global_symbols) # Inferred symbols only take precedence if global symbol not defined symbols = { k: v if k not in global_symbols else global_symbols[k] for k, v in symbols.items() } interstate_symbols.update(symbols) global_symbols.update(symbols) for isvarName, isvarType in interstate_symbols.items(): isvar = data.Scalar(isvarType) callsite_stream.write( '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)), sdfg) self.dispatcher.defined_vars.add(isvarName, disp.DefinedType.Scalar, isvarType.ctype) callsite_stream.write('\n', sdfg) ####################################################################### # Generate actual program body states_generated = self.generate_states(sdfg, global_stream, callsite_stream) ####################################################################### # Sanity check if len(states_generated) != len(sdfg.nodes()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format( sdfg.label, [s.label for s in states_generated], [s.label for s in (set(sdfg.nodes()) - states_generated)])) # Deallocate transients self.deallocate_arrays_in_scope(sdfg, sdfg, global_stream, callsite_stream) # Now that we have all the information about dependencies, generate # header and footer if is_top_level: # Let each target append code to frame code state before generating # header and footer for target in self._dispatcher.used_targets: target.on_target_used() header_stream = CodeIOStream() header_global_stream = CodeIOStream() footer_stream = CodeIOStream() footer_global_stream = CodeIOStream() # Get all environments used in the generated code, including # dependent environments import dace.library # Avoid import loops self.environments = dace.library.get_environments_and_dependencies( self._dispatcher.used_environments) self.generate_header(sdfg, header_global_stream, header_stream) # Open program function params = sdfg.signature() if params: params = ', ' + params function_signature = ( 'void __program_%s_internal(%s_t *__state%s)\n{\n' % (sdfg.name, sdfg.name, params)) self.generate_footer(sdfg, footer_global_stream, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() all_code = CodeIOStream() all_code.write(function_signature) all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() else: generated_header = global_stream.getvalue() generated_code = callsite_stream.getvalue() # Clean up generated code gotos = re.findall(r'goto (.*?);', generated_code) clean_code = '' for line in generated_code.split('\n'): # Empty line with semicolon if re.match(r'^\s*;\s*', line): continue # Label that might be unused label = re.findall( r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) if len(label) > 0: if label[0] not in gotos: continue clean_code += line + '\n' # Return the generated global and local code strings return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
def dispatch_state(state: SDFGState) -> str: stream = CodeIOStream() self._dispatcher.dispatch_state(sdfg, state, global_stream, stream) states_generated.add(state) # For sanity check return stream.getvalue()
def get_generated_codeobjects(self): execution_mode = Config.get("compiler", "xilinx", "mode") kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name) if execution_mode == "software_emulation": kernel_file_name += "_sw_emu.xclbin\"" xcl_emulation_mode = "\"sw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware_emulation": kernel_file_name += "_hw_emu.xclbin\"" xcl_emulation_mode = "\"hw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware" or execution_mode == "simulation": kernel_file_name += "_hw.xclbin\"" xcl_emulation_mode = None xilinx_sdx = None else: raise dace.codegen.codegen.CodegenError( "Unknown Xilinx execution mode: {}".format(execution_mode)) set_env_vars = "" set_str = "dace::set_environment_variable(\"{}\", {});\n" unset_str = "dace::unset_environment_variable(\"{}\");\n" set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode) if xcl_emulation_mode is not None else unset_str.format("XCL_EMULATION_MODE")) set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx is not None else unset_str.format("XILINX_SDX")) host_code = CodeIOStream() host_code.write("""\ #include "dace/xilinx/host.h" #include "dace/dace.h" #include <iostream>\n\n""") self._frame.generate_fileheader(self._global_sdfg, host_code) host_code.write(""" dace::fpga::Context *dace::fpga::_context; DACE_EXPORTED int __dace_init_xilinx({signature}) {{ {environment_variables} dace::fpga::_context = new dace::fpga::Context(); dace::fpga::_context->Get().MakeProgram({kernel_file_name}); return 0; }} DACE_EXPORTED void __dace_exit_xilinx({signature}) {{ delete dace::fpga::_context; }} {host_code}""".format(signature=self._global_sdfg.signature(), environment_variables=set_env_vars, kernel_file_name=kernel_file_name, host_code="".join([ "{separator}\n// Kernel: {kernel_name}" "\n{separator}\n\n{code}\n\n".format( separator="/" * 79, kernel_name=name, code=code) for (name, code) in self._host_codes ]))) host_code_obj = CodeObject(self._program_name, host_code.getvalue(), "cpp", XilinxCodeGen, "Xilinx", target_type="host") kernel_code_objs = [ CodeObject(kernel_name, code, "cpp", XilinxCodeGen, "Xilinx", target_type="device") for (kernel_name, code) in self._kernel_codes ] return [host_code_obj] + kernel_code_objs
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() localcode = CodeIOStream() # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) # Obtain some SDFG-related information input_memlet = input_edge.data output_memlet = output_edge.data if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Try to obtain the number of threads in the block, or use the default # configuration block_threads = devicelevel_block_size(sdfg, state, node) if block_threads is not None: block_threads = functools.reduce(lambda a, b: a * b, block_threads, 1) # Checks if block_threads is None: raise ValueError('Block-wide GPU reduction must occur within' ' a GPU kernel') if issymbolic(block_threads, sdfg.constants): raise ValueError('Block size has to be constant for block-wide ' 'reduction (got %s)' % str(block_threads)) if (node.axes is not None and len(node.axes) < input_dims): raise ValueError( 'Only full reduction is supported for block-wide reduce,' ' please use the pure expansion') if (input_data.storage != dtypes.StorageType.Register or output_data.storage != dtypes.StorageType.Register): raise ValueError( 'Block-wise reduction only supports GPU register inputs ' 'and outputs') if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES: raise ValueError('%s block reduction not supported' % redtype) credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] if redtype == dtypes.ReductionType.Custom: redop = '__reduce_%s()' % idstr else: redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type) # Allocate shared memory for block reduce localcode.write(""" typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id}; __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id}; """.format(id=idstr, type=output_data.dtype.ctype, numthreads=block_threads)) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = cpp_array_expr(sdfg, output_memlet) localcode.write(""" {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop}); """.format(id=idstr, redop=redop, input=input_memlet.data, output=output)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode
def get_generated_codeobjects(self): execution_mode = Config.get("compiler", "xilinx", "mode") kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name) if execution_mode == "software_emulation": kernel_file_name += "_sw_emu.xclbin\"" xcl_emulation_mode = "\"sw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware_emulation": kernel_file_name += "_hw_emu.xclbin\"" xcl_emulation_mode = "\"hw_emu\"" xilinx_sdx = "DACE_VITIS_DIR" elif execution_mode == "hardware" or execution_mode == "simulation": kernel_file_name += "_hw.xclbin\"" xcl_emulation_mode = None xilinx_sdx = None else: raise cgx.CodegenError( "Unknown Xilinx execution mode: {}".format(execution_mode)) set_env_vars = "" set_str = "dace::set_environment_variable(\"{}\", {});\n" unset_str = "dace::unset_environment_variable(\"{}\");\n" set_env_vars += (set_str.format("XCL_EMULATION_MODE", xcl_emulation_mode) if xcl_emulation_mode is not None else unset_str.format("XCL_EMULATION_MODE")) set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx is not None else unset_str.format("XILINX_SDX")) set_env_vars += set_str.format( "EMCONFIG_PATH", "DACE_BINARY_DIR" ) if execution_mode == 'hardware_emulation' else unset_str.format( "EMCONFIG_PATH") host_code = CodeIOStream() host_code.write("""\ #include "dace/xilinx/host.h" #include "dace/dace.h" """) if len(self._dispatcher.instrumentation) > 1: host_code.write("""\ #include "dace/perf/reporting.h" #include <chrono> #include <iomanip> #include <iostream> #include <limits> """) host_code.write("\n\n") self._frame.generate_fileheader(self._global_sdfg, host_code, 'xilinx_host') params_comma = self._global_sdfg.signature(with_arrays=False) if params_comma: params_comma = ', ' + params_comma host_code.write(""" DACE_EXPORTED int __dace_init_xilinx({sdfg.name}_t *__state{signature}) {{ {environment_variables} __state->fpga_context = new dace::fpga::Context(); __state->fpga_context->Get().MakeProgram({kernel_file_name}); return 0; }} DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{ delete __state->fpga_context; }} {host_code}""".format(signature=params_comma, sdfg=self._global_sdfg, environment_variables=set_env_vars, kernel_file_name=kernel_file_name, host_code="".join([ "{separator}\n// Kernel: {kernel_name}" "\n{separator}\n\n{code}\n\n".format( separator="/" * 79, kernel_name=name, code=code) for (name, code) in self._host_codes ]))) host_code_obj = CodeObject(self._program_name, host_code.getvalue(), "cpp", XilinxCodeGen, "Xilinx", target_type="host") kernel_code_objs = [ CodeObject(kernel_name, code, "cpp", XilinxCodeGen, "Xilinx", target_type="device") for (kernel_name, code) in self._kernel_codes ] # Memory bank and streaming interfaces connectivity configuration file link_cfg = CodeIOStream() self._other_codes["link.cfg"] = link_cfg link_cfg.write("[connectivity]") are_assigned = [v is not None for v in self._bank_assignments.values()] if any(are_assigned): if not all(are_assigned): raise RuntimeError("Some, but not all global memory arrays " "were assigned to memory banks: {}".format( self._bank_assignments)) # Emit mapping from kernel memory interfaces to DRAM banks for (kernel_name, interface_name), ( memory_type, memory_bank) in self._bank_assignments.items(): link_cfg.write( f"sp={kernel_name}_1.m_axi_{interface_name}:{memory_type}[{memory_bank}]" ) # Emit mapping between inter-kernel streaming interfaces for _, (src, dst) in self._stream_connections.items(): link_cfg.write(f"stream_connect={src}:{dst}") other_objs = [] for name, code in self._other_codes.items(): name = name.split(".") other_objs.append( CodeObject(name[0], code.getvalue(), ".".join(name[1:]), XilinxCodeGen, "Xilinx", target_type="device")) return [host_code_obj] + kernel_code_objs + other_objs
def generate_code( self, sdfg: SDFG, schedule: Optional[dtypes.ScheduleType], sdfg_id: str = "" ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]: """ Generate frame code for a given SDFG, calling registered targets' code generation callbacks for them to generate their own code. :param sdfg: The SDFG to generate code for. :param schedule: The schedule the SDFG is currently located, or None if the SDFG is top-level. :param sdfg_id: An optional string id given to the SDFG label :return: A tuple of the generated global frame code, local frame code, and a set of targets that have been used in the generation of this SDFG. """ sdfg_label = sdfg.name + sdfg_id global_stream = CodeIOStream() callsite_stream = CodeIOStream() # Set default storage/schedule types in SDFG set_default_schedule_and_storage_types(sdfg, schedule) is_top_level = sdfg.parent is None # Generate code ########################### # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_begin(sdfg, callsite_stream, global_stream) # Allocate outer-level transients shared_transients = sdfg.shared_transients() allocated = set() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in allocated): self._dispatcher.dispatch_allocate(sdfg, state, None, node, global_stream, callsite_stream) self._dispatcher.dispatch_initialize( sdfg, state, None, node, global_stream, callsite_stream) allocated.add(node.data) # Allocate inter-state variables assigned, _ = sdfg.interstate_symbols() for isvarName, isvarType in assigned.items(): # Skip symbols that have been declared as outer-level transients if isvarName in allocated: continue callsite_stream.write( '%s;\n' % (isvarType.signature(with_types=True, name=isvarName)), sdfg) # Initialize parameter arrays for argnode in dtypes.deduplicate(sdfg.input_arrays() + sdfg.output_arrays()): # Ignore transient arrays if argnode.desc(sdfg).transient: continue self._dispatcher.dispatch_initialize(sdfg, sdfg, None, argnode, global_stream, callsite_stream) callsite_stream.write('\n', sdfg) states_topological = list(sdfg.topological_sort(sdfg.start_state)) # {edge: [dace.edges.ControlFlow]} control_flow = {e: [] for e in sdfg.edges()} if dace.config.Config.get_bool('optimizer', 'detect_control_flow'): #################################################################### # Loop detection procedure all_cycles = list(sdfg.find_cycles()) # Returns a list of lists # Order according to topological sort all_cycles = [ sorted(c, key=lambda x: states_topological.index(x)) for c in all_cycles ] # Group in terms of starting node starting_nodes = [c[0] for c in all_cycles] # Order cycles according to starting node in topological sort starting_nodes = sorted(starting_nodes, key=lambda x: states_topological.index(x)) cycles_by_node = [[c for c in all_cycles if c[0] == n] for n in starting_nodes] for cycles in cycles_by_node: # Use arbitrary cycle to find the first and last nodes first_node = cycles[0][0] last_node = cycles[0][-1] if not first_node.is_empty(): # The entry node should not contain any computations continue if not all([c[-1] == last_node for c in cycles]): # There are multiple back edges: not a for or while loop continue previous_edge = [ e for e in sdfg.in_edges(first_node) if e.src != last_node ] if len(previous_edge) != 1: # No single starting point: not a for or while continue previous_edge = previous_edge[0] back_edge = sdfg.edges_between(last_node, first_node) if len(back_edge) != 1: raise RuntimeError("Expected exactly one edge in cycle") back_edge = back_edge[0] # Build a set of all nodes in all cycles associated with this # set of start and end node internal_nodes = functools.reduce( lambda a, b: a | b, [set(c) for c in cycles]) - {first_node} exit_edge = [ e for e in sdfg.out_edges(first_node) if e.dst not in internal_nodes | {first_node} ] if len(exit_edge) != 1: # No single stopping condition: not a for or while # (we don't support continue or break) continue exit_edge = exit_edge[0] entry_edge = [ e for e in sdfg.out_edges(first_node) if e != exit_edge ] if len(entry_edge) != 1: # No single starting condition: not a for or while continue entry_edge = entry_edge[0] # Make sure this is not already annotated to be another construct if (len(control_flow[entry_edge]) != 0 or len(control_flow[back_edge]) != 0): continue # Nested loops case I - previous edge of internal loop is a # loop-entry of an external loop (first state in a loop is # another loop) if (len(control_flow[previous_edge]) == 1 and isinstance(control_flow[previous_edge][0], dace.graph.edges.LoopEntry)): # Nested loop, mark parent scope loop_parent = control_flow[previous_edge][0].scope # Nested loops case II - exit edge of internal loop is a # back-edge of an external loop (last state in a loop is another # loop) elif (len(control_flow[exit_edge]) == 1 and isinstance(control_flow[exit_edge][0], dace.graph.edges.LoopBack)): # Nested loop, mark parent scope loop_parent = control_flow[exit_edge][0].scope elif (len(control_flow[exit_edge]) == 0 or len(control_flow[previous_edge]) == 0): loop_parent = None else: continue if entry_edge == back_edge: # No entry check (we don't support do-loops) # TODO: do we want to add some support for self-loops? continue # Now we make sure that there is no other way to exit this # cycle, by checking that there's no reachable node *not* # included in any cycle between the first and last node. if any([len(set(c) - internal_nodes) > 1 for c in cycles]): continue # This is a loop! Generate the necessary annotation objects. loop_scope = dace.graph.edges.LoopScope(internal_nodes) if ((len(previous_edge.data.assignments) > 0 or len(back_edge.data.assignments) > 0) and (len(control_flow[previous_edge]) == 0 or (len(control_flow[previous_edge]) == 1 and control_flow[previous_edge][0].scope == loop_parent))): # Generate assignment edge, if available control_flow[previous_edge].append( dace.graph.edges.LoopAssignment( loop_scope, previous_edge)) # Assign remaining control flow constructs control_flow[entry_edge].append( dace.graph.edges.LoopEntry(loop_scope, entry_edge)) control_flow[exit_edge].append( dace.graph.edges.LoopExit(loop_scope, exit_edge)) control_flow[back_edge].append( dace.graph.edges.LoopBack(loop_scope, back_edge)) ################################################################### # If/then/else detection procedure candidates = [ n for n in states_topological if sdfg.out_degree(n) == 2 ] for candidate in candidates: # A valid if occurs when then are no reachable nodes for either # path that does not pass through a common dominator. dominators = nx.dominance.dominance_frontiers( sdfg.nx, candidate) left_entry, right_entry = sdfg.out_edges(candidate) if (len(control_flow[left_entry]) > 0 or len(control_flow[right_entry]) > 0): # Already assigned to a control flow construct # TODO: carefully allow this in some cases continue left, right = left_entry.dst, right_entry.dst dominator = dominators[left] & dominators[right] if len(dominator) != 1: # There must be a single dominator across both branches, # unless one of the nodes _is_ the next dominator # if (len(dominator) == 0 and dominators[left] == {right} # or dominators[right] == {left}): # dominator = dominators[left] | dominators[right] # else: # continue continue dominator = next(iter(dominator)) # Exactly one dominator exit_edges = sdfg.in_edges(dominator) if len(exit_edges) != 2: # There must be a single entry and a single exit. This # could be relaxed in the future. continue left_exit, right_exit = exit_edges if (len(control_flow[left_exit]) > 0 or len(control_flow[right_exit]) > 0): # Already assigned to a control flow construct # TODO: carefully allow this in some cases continue # Now traverse from the source and verify that all possible paths # pass through the dominator left_nodes = sdfg.all_nodes_between(left, dominator) if left_nodes is None: # Not all paths lead to the next dominator continue right_nodes = sdfg.all_nodes_between(right, dominator) if right_nodes is None: # Not all paths lead to the next dominator continue all_nodes = left_nodes | right_nodes # Make sure there is no overlap between left and right nodes if len(left_nodes & right_nodes) > 0: continue # This is a valid if/then/else construct. Generate annotations if_then_else = dace.graph.edges.IfThenElse( candidate, dominator) # Arbitrarily assign then/else to the two branches. If one edge # has no dominator but leads to the dominator, it means there's # only a then clause (and no else). has_else = False if len(dominators[left]) == 1: then_scope = dace.graph.edges.IfThenScope( if_then_else, left_nodes) else_scope = dace.graph.edges.IfElseScope( if_then_else, right_nodes) control_flow[left_entry].append( dace.graph.edges.IfEntry(then_scope, left_entry)) control_flow[left_exit].append( dace.graph.edges.IfExit(then_scope, left_exit)) control_flow[right_exit].append( dace.graph.edges.IfExit(else_scope, right_exit)) if len(dominators[right]) == 1: control_flow[right_entry].append( dace.graph.edges.IfEntry(else_scope, right_entry)) has_else = True else: then_scope = dace.graph.edges.IfThenScope( if_then_else, right_nodes) else_scope = dace.graph.edges.IfElseScope( if_then_else, left_nodes) control_flow[right_entry].append( dace.graph.edges.IfEntry(then_scope, right_entry)) control_flow[right_exit].append( dace.graph.edges.IfExit(then_scope, right_exit)) control_flow[left_exit].append( dace.graph.edges.IfExit(else_scope, left_exit)) ####################################################################### # Generate actual program body states_generated = set() # For sanity check generated_edges = set() self.generate_states(sdfg, "sdfg", control_flow, global_stream, callsite_stream, set(states_topological), states_generated, generated_edges) ####################################################################### # Sanity check if len(states_generated) != len(sdfg.nodes()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format( sdfg.label, [s.label for s in states_generated], [s.label for s in (set(sdfg.nodes()) - states_generated)])) # Deallocate transients shared_transients = sdfg.shared_transients() deallocated = set() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in deallocated): self._dispatcher.dispatch_deallocate( sdfg, state, None, node, global_stream, callsite_stream) deallocated.add(node.data) # Now that we have all the information about dependencies, generate # header and footer if is_top_level: header_stream = CodeIOStream() header_global_stream = CodeIOStream() footer_stream = CodeIOStream() footer_global_stream = CodeIOStream() self.generate_header(sdfg, self._dispatcher.used_environments, header_global_stream, header_stream) # Open program function function_signature = 'void __program_%s_internal(%s)\n{\n' % ( sdfg.name, sdfg.signature()) self.generate_footer(sdfg, self._dispatcher.used_environments, footer_global_stream, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() all_code = CodeIOStream() all_code.write(function_signature) all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() else: generated_header = global_stream.getvalue() generated_code = callsite_stream.getvalue() # Return the generated global and local code strings return (generated_header, generated_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
class DaCeCodeGenerator(object): """ DaCe code generator class that writes the generated code for SDFG state machines, and uses a dispatcher to generate code for individual states based on the target. """ def __init__(self, *args, **kwargs): self._dispatcher = TargetDispatcher() self._dispatcher.register_state_dispatcher(self) self._initcode = CodeIOStream() self._exitcode = CodeIOStream() ################################################################## # Target registry @property def dispatcher(self): return self._dispatcher ################################################################## # Code generation def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream): # Write constants for cstname, (csttype, cstval) in sdfg.constants_prop.items(): if isinstance(csttype, data.Array): const_str = "constexpr " + csttype.dtype.ctype + \ " " + cstname + "[" + str(cstval.size) + "] = {" it = np.nditer(cstval, order='C') for i in range(cstval.size - 1): const_str += str(it[0]) + ", " it.iternext() const_str += str(it[0]) + "};\n" callsite_stream.write(const_str, sdfg) else: callsite_stream.write( "constexpr %s %s = %s;\n" % (csttype.dtype.ctype, cstname, str(cstval)), sdfg) def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream): """ Generate a header in every output file that includes custom types and constants. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). """ ######################################################### # Custom types datatypes = set() # Types of this SDFG for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) # Emit unique definitions wrote_something = False for typ in datatypes: if hasattr(typ, 'emit_definition'): if not wrote_something: global_stream.write("", sdfg) wrote_something = True global_stream.write(typ.emit_definition(), sdfg) if wrote_something: global_stream.write("", sdfg) ######################################################### # Write constants self.generate_constants(sdfg, global_stream) for sd in sdfg.all_sdfgs_recursive(): global_stream.write(sd.global_code, sd) def generate_header(self, sdfg: SDFG, used_environments: Set[str], global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the header of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ environments = [ dace.library.get_environment(env_name) for env_name in used_environments ] # Write frame code - header global_stream.write( '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' + '#include <dace/dace.h>\n', sdfg) # Write header required by environments for env in environments: if len(env.headers) > 0: global_stream.write( "\n".join("#include \"" + h + "\"" for h in env.headers), sdfg) global_stream.write("\n", sdfg) self.generate_fileheader(sdfg, global_stream) # Instrumentation preamble if len(self._dispatcher.instrumentation) > 1: global_stream.write( 'namespace dace { namespace perf { Report report; } }', sdfg) callsite_stream.write('dace::perf::report.reset();', sdfg) def generate_footer(self, sdfg: SDFG, used_environments: Set[str], global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the footer of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ fname = sdfg.name params = sdfg.signature() paramnames = sdfg.signature(False, for_call=True) environments = [ dace.library.get_environment(env_name) for env_name in used_environments ] # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_end(sdfg, callsite_stream, global_stream) # Instrumentation saving if len(self._dispatcher.instrumentation) > 1: callsite_stream.write( 'dace::perf::report.save(".dacecache/%s/perf");' % sdfg.name, sdfg) # Write closing brace of program callsite_stream.write('}', sdfg) # Write awkward footer to avoid 'extern "C"' issues callsite_stream.write( """ DACE_EXPORTED void __program_%s(%s) { __program_%s_internal(%s); } """ % (fname, params, fname, paramnames), sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( 'DACE_EXPORTED int __dace_init_%s(%s);\n' % (target.target_name, params), sdfg) if target.has_finalizer: callsite_stream.write( 'DACE_EXPORTED int __dace_exit_%s(%s);\n' % (target.target_name, params), sdfg) callsite_stream.write( """ DACE_EXPORTED int __dace_init_%s(%s) { int __result = 0; """ % (sdfg.name, params), sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( '__result |= __dace_init_%s(%s);' % (target.target_name, paramnames), sdfg) for env in environments: if env.init_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(env.init_code) callsite_stream.write("}") callsite_stream.write(sdfg.init_code, sdfg) callsite_stream.write(self._initcode.getvalue(), sdfg) callsite_stream.write( """ return __result; } DACE_EXPORTED void __dace_exit_%s(%s) { """ % (sdfg.name, params), sdfg) callsite_stream.write(self._exitcode.getvalue(), sdfg) callsite_stream.write(sdfg.exit_code, sdfg) for target in self._dispatcher.used_targets: if target.has_finalizer: callsite_stream.write( '__dace_exit_%s(%s);' % (target.target_name, paramnames), sdfg) for env in environments: if env.finalize_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(env.init_code) callsite_stream.write("}") callsite_stream.write('}\n', sdfg) def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): sid = sdfg.node_id(state) # Emit internal transient array allocation # Don't allocate transients shared with another state data_to_allocate = (set(state.top_level_transients()) - set(sdfg.shared_transients())) allocated = set() for node in state.data_nodes(): if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) self._dispatcher.dispatch_allocate(sdfg, state, sid, node, global_stream, callsite_stream) self._dispatcher.dispatch_initialize(sdfg, state, sid, node, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_begin(sdfg, state, callsite_stream, global_stream) ##################### # Create dataflow graph for state's children. # DFG to code scheme: Only generate code for nodes whose all # dependencies have been executed (topological sort). # For different connected components, run them concurrently. components = dace.sdfg.concurrent_subgraphs(state) if len(components) == 1: self._dispatcher.dispatch_subgraph(sdfg, state, sid, global_stream, callsite_stream, skip_entry_node=False) else: callsite_stream.write("#pragma omp parallel sections\n{") for c in components: callsite_stream.write("#pragma omp section\n{") self._dispatcher.dispatch_subgraph(sdfg, c, sid, global_stream, callsite_stream, skip_entry_node=False) callsite_stream.write("} // End omp section") callsite_stream.write("} // End omp sections") ##################### # Write state footer if generate_state_footer: # Emit internal transient array deallocation deallocated = set() for node in state.data_nodes(): if (node.data not in data_to_allocate or node.data in deallocated or (node.data in sdfg.arrays and sdfg.arrays[node.data].transient == False)): continue deallocated.add(node.data) self._dispatcher.dispatch_deallocate(sdfg, state, sid, node, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, global_stream) @staticmethod def _generate_assignments(assignments): return [ "{} = {}".format(variable, value) for variable, value in assignments.items() ] @staticmethod def _is_always_true(condition_string): return condition_string in ["true", "1"] def _generate_transition(self, sdfg, sid, callsite_stream, edge, assignments): condition_string = cppunparse.cppunparse(edge.data.condition, False) always_true = self._is_always_true(condition_string) if not always_true: callsite_stream.write("if ({}) {{".format(condition_string), sdfg, sid) if len(assignments) > 0: callsite_stream.write( ";\n".join( DaCeCodeGenerator._generate_assignments(assignments) + [""]), sdfg, sid) callsite_stream.write( "goto __state_{}_{};".format(sdfg.name, edge.dst.label), sdfg, sid) if not always_true: callsite_stream.write("}") def generate_states(self, sdfg, scope_label, control_flow, global_stream, callsite_stream, scope, states_generated, generated_edges): states_topological = list(sdfg.topological_sort(sdfg.start_state)) states_to_generate = collections.deque([ s for s in states_topological if s in scope and s not in states_generated ]) if len(states_to_generate) == 0: return while len(states_to_generate) > 0: state = states_to_generate.popleft() # When generating control flow constructs, we will not necessarily # move in topological order, so make sure this state has not # already been generated. if state in states_generated or state not in scope: continue states_generated.add(state) sid = sdfg.node_id(state) callsite_stream.write( "__state_{}_{}:\n".format(sdfg.name, state.label), sdfg, sid) # Don't generate brackets and comments for empty states if len([ n for n in state.nodes() if not isinstance(n, dace.graph.nodes.EmptyTasklet) ]) > 0: callsite_stream.write('{', sdfg, sid) self._dispatcher.dispatch_state(sdfg, state, global_stream, callsite_stream) callsite_stream.write('}', sdfg, sid) else: callsite_stream.write(";") out_edges = sdfg.out_edges(state) # Write conditional branches to next states for edge in out_edges: generate_assignments = True generate_transition = True # Handle specialized control flow if (dace.config.Config.get_bool('optimizer', 'detect_control_flow')): for control in control_flow[edge]: if isinstance(control, dace.graph.edges.LoopAssignment): # Generate the transition, but leave the # assignments to the loop generate_transition &= True generate_assignments &= False elif isinstance(control, dace.graph.edges.LoopBack): generate_transition &= False generate_assignments &= False elif isinstance(control, dace.graph.edges.LoopExit): # Need to strip the condition, so generate it from # the loop entry generate_transition &= False generate_assignments &= True elif isinstance(control, dace.graph.edges.LoopEntry): generate_transition &= False generate_assignments &= False if control.scope.assignment is not None: assignment_edge = control.scope.assignment.edge init_assignments = ", ".join( DaCeCodeGenerator._generate_assignments( assignment_edge.data.assignments)) generated_edges.add(assignment_edge) else: init_assignments = "" back_edge = control.scope.back.edge continue_assignments = ", ".join( DaCeCodeGenerator._generate_assignments( back_edge.data.assignments)) generated_edges.add(back_edge) entry_edge = control.scope.entry.edge condition = cppunparse.cppunparse( entry_edge.data.condition, False) generated_edges.add(entry_edge) if (len(init_assignments) > 0 or len(continue_assignments) > 0): callsite_stream.write( "for ({}; {}; {}) {{".format( init_assignments, condition, continue_assignments), sdfg, sid) else: callsite_stream.write( "while ({}) {{".format(condition), sdfg, sid) # Generate loop body self.generate_states( sdfg, entry_edge.src.label + "_loop", control_flow, global_stream, callsite_stream, control.scope, states_generated, generated_edges) callsite_stream.write("}", sdfg, sid) exit_edge = control.scope.exit.edge # Update states to generate after nested call states_to_generate = collections.deque([ s for s in states_to_generate if s not in states_generated ]) # If the next state to be generated is the exit # state, we can omit the goto if (len(states_to_generate) > 0 and states_to_generate[0] == exit_edge.dst and exit_edge.dst not in states_generated): pass elif edge in generated_edges: # This edge has more roles, goto doesn't apply pass else: callsite_stream.write( "goto __state_{}_{};".format( sdfg.name, control.scope.exit.edge.dst)) generated_edges.add(control.scope.exit.edge) elif isinstance(control, dace.graph.edges.IfExit): generate_transition &= True generate_assignments &= True elif isinstance(control, dace.graph.edges.IfEntry): generate_transition &= False generate_assignments &= True if len(set(control.scope) - states_generated) == 0: continue then_scope = control.scope.if_then_else.then_scope else_scope = control.scope.if_then_else.else_scope then_entry = then_scope.entry.edge condition = cppunparse.cppunparse( then_entry.data.condition, False) callsite_stream.write( "if ({}) {{".format(condition), sdfg, sid) generated_edges.add(then_entry) # Generate the then-scope self.generate_states(sdfg, state.label + "_then", control_flow, global_stream, callsite_stream, then_scope, states_generated, generated_edges) callsite_stream.write("} else {", sdfg, sid) generated_edges.add(else_scope.entry.edge) # Generate the else-scope self.generate_states(sdfg, state.label + "_else", control_flow, global_stream, callsite_stream, else_scope, states_generated, generated_edges) callsite_stream.write("}", sdfg, sid) generated_edges.add(else_scope.exit.edge) # Update states to generate after nested call states_to_generate = collections.deque([ s for s in states_to_generate if s not in states_generated ]) if_exit_state = control.scope.exit.edge.dst if ((if_exit_state not in states_generated) and ((len(states_to_generate) > 0) and (states_to_generate[0] == if_exit_state))): pass else: callsite_stream.write( "goto __state_{}_{};".format( sdfg.name, control.scope.exit.edge.dst)) else: raise TypeError( "Unknown control flow \"{}\"".format( type(control).__name__)) if generate_assignments and len(edge.data.assignments) > 0: assignments_to_generate = edge.data.assignments else: assignments_to_generate = {} if generate_transition: if ((len(out_edges) == 1) and (edge.dst not in states_generated) and ((len(states_to_generate) > 0) and (states_to_generate[0] == edge.dst))): # If there is only one outgoing edge, the target will # be generated next, we can omit the goto pass elif (len(out_edges) == 1 and len(states_to_generate) == 0 and (edge.dst not in scope)): # This scope has ended, and we don't need to generate # any output edge pass else: self._generate_transition(sdfg, sid, callsite_stream, edge, assignments_to_generate) # Assignments will be generated in the transition generate_assignments = False if generate_assignments: callsite_stream.write( ";\n".join( DaCeCodeGenerator._generate_assignments( assignments_to_generate) + [""]), sdfg, sid) generated_edges.add(edge) # End of out_edges loop if (((len(out_edges) == 0) or (not isinstance(scope, dace.graph.edges.ControlFlowScope) and (len(states_to_generate) == 0))) and (len(states_generated) != sdfg.number_of_nodes())): callsite_stream.write( "goto __state_exit_{}_{};".format(sdfg.name, scope_label), sdfg, sid) # Write exit state callsite_stream.write( "__state_exit_{}_{}:;".format(sdfg.name, scope_label), sdfg) def generate_code( self, sdfg: SDFG, schedule: Optional[dtypes.ScheduleType], sdfg_id: str = "" ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]: """ Generate frame code for a given SDFG, calling registered targets' code generation callbacks for them to generate their own code. :param sdfg: The SDFG to generate code for. :param schedule: The schedule the SDFG is currently located, or None if the SDFG is top-level. :param sdfg_id: An optional string id given to the SDFG label :return: A tuple of the generated global frame code, local frame code, and a set of targets that have been used in the generation of this SDFG. """ sdfg_label = sdfg.name + sdfg_id global_stream = CodeIOStream() callsite_stream = CodeIOStream() # Set default storage/schedule types in SDFG set_default_schedule_and_storage_types(sdfg, schedule) is_top_level = sdfg.parent is None # Generate code ########################### # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_begin(sdfg, callsite_stream, global_stream) # Allocate outer-level transients shared_transients = sdfg.shared_transients() allocated = set() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in allocated): self._dispatcher.dispatch_allocate(sdfg, state, None, node, global_stream, callsite_stream) self._dispatcher.dispatch_initialize( sdfg, state, None, node, global_stream, callsite_stream) allocated.add(node.data) # Allocate inter-state variables assigned, _ = sdfg.interstate_symbols() for isvarName, isvarType in assigned.items(): # Skip symbols that have been declared as outer-level transients if isvarName in allocated: continue callsite_stream.write( '%s;\n' % (isvarType.signature(with_types=True, name=isvarName)), sdfg) # Initialize parameter arrays for argnode in dtypes.deduplicate(sdfg.input_arrays() + sdfg.output_arrays()): # Ignore transient arrays if argnode.desc(sdfg).transient: continue self._dispatcher.dispatch_initialize(sdfg, sdfg, None, argnode, global_stream, callsite_stream) callsite_stream.write('\n', sdfg) states_topological = list(sdfg.topological_sort(sdfg.start_state)) # {edge: [dace.edges.ControlFlow]} control_flow = {e: [] for e in sdfg.edges()} if dace.config.Config.get_bool('optimizer', 'detect_control_flow'): #################################################################### # Loop detection procedure all_cycles = list(sdfg.find_cycles()) # Returns a list of lists # Order according to topological sort all_cycles = [ sorted(c, key=lambda x: states_topological.index(x)) for c in all_cycles ] # Group in terms of starting node starting_nodes = [c[0] for c in all_cycles] # Order cycles according to starting node in topological sort starting_nodes = sorted(starting_nodes, key=lambda x: states_topological.index(x)) cycles_by_node = [[c for c in all_cycles if c[0] == n] for n in starting_nodes] for cycles in cycles_by_node: # Use arbitrary cycle to find the first and last nodes first_node = cycles[0][0] last_node = cycles[0][-1] if not first_node.is_empty(): # The entry node should not contain any computations continue if not all([c[-1] == last_node for c in cycles]): # There are multiple back edges: not a for or while loop continue previous_edge = [ e for e in sdfg.in_edges(first_node) if e.src != last_node ] if len(previous_edge) != 1: # No single starting point: not a for or while continue previous_edge = previous_edge[0] back_edge = sdfg.edges_between(last_node, first_node) if len(back_edge) != 1: raise RuntimeError("Expected exactly one edge in cycle") back_edge = back_edge[0] # Build a set of all nodes in all cycles associated with this # set of start and end node internal_nodes = functools.reduce( lambda a, b: a | b, [set(c) for c in cycles]) - {first_node} exit_edge = [ e for e in sdfg.out_edges(first_node) if e.dst not in internal_nodes | {first_node} ] if len(exit_edge) != 1: # No single stopping condition: not a for or while # (we don't support continue or break) continue exit_edge = exit_edge[0] entry_edge = [ e for e in sdfg.out_edges(first_node) if e != exit_edge ] if len(entry_edge) != 1: # No single starting condition: not a for or while continue entry_edge = entry_edge[0] # Make sure this is not already annotated to be another construct if (len(control_flow[entry_edge]) != 0 or len(control_flow[back_edge]) != 0): continue # Nested loops case I - previous edge of internal loop is a # loop-entry of an external loop (first state in a loop is # another loop) if (len(control_flow[previous_edge]) == 1 and isinstance(control_flow[previous_edge][0], dace.graph.edges.LoopEntry)): # Nested loop, mark parent scope loop_parent = control_flow[previous_edge][0].scope # Nested loops case II - exit edge of internal loop is a # back-edge of an external loop (last state in a loop is another # loop) elif (len(control_flow[exit_edge]) == 1 and isinstance(control_flow[exit_edge][0], dace.graph.edges.LoopBack)): # Nested loop, mark parent scope loop_parent = control_flow[exit_edge][0].scope elif (len(control_flow[exit_edge]) == 0 or len(control_flow[previous_edge]) == 0): loop_parent = None else: continue if entry_edge == back_edge: # No entry check (we don't support do-loops) # TODO: do we want to add some support for self-loops? continue # Now we make sure that there is no other way to exit this # cycle, by checking that there's no reachable node *not* # included in any cycle between the first and last node. if any([len(set(c) - internal_nodes) > 1 for c in cycles]): continue # This is a loop! Generate the necessary annotation objects. loop_scope = dace.graph.edges.LoopScope(internal_nodes) if ((len(previous_edge.data.assignments) > 0 or len(back_edge.data.assignments) > 0) and (len(control_flow[previous_edge]) == 0 or (len(control_flow[previous_edge]) == 1 and control_flow[previous_edge][0].scope == loop_parent))): # Generate assignment edge, if available control_flow[previous_edge].append( dace.graph.edges.LoopAssignment( loop_scope, previous_edge)) # Assign remaining control flow constructs control_flow[entry_edge].append( dace.graph.edges.LoopEntry(loop_scope, entry_edge)) control_flow[exit_edge].append( dace.graph.edges.LoopExit(loop_scope, exit_edge)) control_flow[back_edge].append( dace.graph.edges.LoopBack(loop_scope, back_edge)) ################################################################### # If/then/else detection procedure candidates = [ n for n in states_topological if sdfg.out_degree(n) == 2 ] for candidate in candidates: # A valid if occurs when then are no reachable nodes for either # path that does not pass through a common dominator. dominators = nx.dominance.dominance_frontiers( sdfg.nx, candidate) left_entry, right_entry = sdfg.out_edges(candidate) if (len(control_flow[left_entry]) > 0 or len(control_flow[right_entry]) > 0): # Already assigned to a control flow construct # TODO: carefully allow this in some cases continue left, right = left_entry.dst, right_entry.dst dominator = dominators[left] & dominators[right] if len(dominator) != 1: # There must be a single dominator across both branches, # unless one of the nodes _is_ the next dominator # if (len(dominator) == 0 and dominators[left] == {right} # or dominators[right] == {left}): # dominator = dominators[left] | dominators[right] # else: # continue continue dominator = next(iter(dominator)) # Exactly one dominator exit_edges = sdfg.in_edges(dominator) if len(exit_edges) != 2: # There must be a single entry and a single exit. This # could be relaxed in the future. continue left_exit, right_exit = exit_edges if (len(control_flow[left_exit]) > 0 or len(control_flow[right_exit]) > 0): # Already assigned to a control flow construct # TODO: carefully allow this in some cases continue # Now traverse from the source and verify that all possible paths # pass through the dominator left_nodes = sdfg.all_nodes_between(left, dominator) if left_nodes is None: # Not all paths lead to the next dominator continue right_nodes = sdfg.all_nodes_between(right, dominator) if right_nodes is None: # Not all paths lead to the next dominator continue all_nodes = left_nodes | right_nodes # Make sure there is no overlap between left and right nodes if len(left_nodes & right_nodes) > 0: continue # This is a valid if/then/else construct. Generate annotations if_then_else = dace.graph.edges.IfThenElse( candidate, dominator) # Arbitrarily assign then/else to the two branches. If one edge # has no dominator but leads to the dominator, it means there's # only a then clause (and no else). has_else = False if len(dominators[left]) == 1: then_scope = dace.graph.edges.IfThenScope( if_then_else, left_nodes) else_scope = dace.graph.edges.IfElseScope( if_then_else, right_nodes) control_flow[left_entry].append( dace.graph.edges.IfEntry(then_scope, left_entry)) control_flow[left_exit].append( dace.graph.edges.IfExit(then_scope, left_exit)) control_flow[right_exit].append( dace.graph.edges.IfExit(else_scope, right_exit)) if len(dominators[right]) == 1: control_flow[right_entry].append( dace.graph.edges.IfEntry(else_scope, right_entry)) has_else = True else: then_scope = dace.graph.edges.IfThenScope( if_then_else, right_nodes) else_scope = dace.graph.edges.IfElseScope( if_then_else, left_nodes) control_flow[right_entry].append( dace.graph.edges.IfEntry(then_scope, right_entry)) control_flow[right_exit].append( dace.graph.edges.IfExit(then_scope, right_exit)) control_flow[left_exit].append( dace.graph.edges.IfExit(else_scope, left_exit)) ####################################################################### # Generate actual program body states_generated = set() # For sanity check generated_edges = set() self.generate_states(sdfg, "sdfg", control_flow, global_stream, callsite_stream, set(states_topological), states_generated, generated_edges) ####################################################################### # Sanity check if len(states_generated) != len(sdfg.nodes()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format( sdfg.label, [s.label for s in states_generated], [s.label for s in (set(sdfg.nodes()) - states_generated)])) # Deallocate transients shared_transients = sdfg.shared_transients() deallocated = set() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in deallocated): self._dispatcher.dispatch_deallocate( sdfg, state, None, node, global_stream, callsite_stream) deallocated.add(node.data) # Now that we have all the information about dependencies, generate # header and footer if is_top_level: header_stream = CodeIOStream() header_global_stream = CodeIOStream() footer_stream = CodeIOStream() footer_global_stream = CodeIOStream() self.generate_header(sdfg, self._dispatcher.used_environments, header_global_stream, header_stream) # Open program function function_signature = 'void __program_%s_internal(%s)\n{\n' % ( sdfg.name, sdfg.signature()) self.generate_footer(sdfg, self._dispatcher.used_environments, footer_global_stream, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() all_code = CodeIOStream() all_code.write(function_signature) all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() else: generated_header = global_stream.getvalue() generated_code = callsite_stream.getvalue() # Return the generated global and local code strings return (generated_header, generated_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
class DaCeCodeGenerator(object): """ DaCe code generator class that writes the generated code for SDFG state machines, and uses a dispatcher to generate code for individual states based on the target. """ def __init__(self, *args, **kwargs): self._dispatcher = disp.TargetDispatcher(self) self._dispatcher.register_state_dispatcher(self) self._initcode = CodeIOStream() self._exitcode = CodeIOStream() self.statestruct: List[str] = [] self.environments: List[Any] = [] ################################################################## # Target registry @property def dispatcher(self): return self._dispatcher ################################################################## # Code generation def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream): # Write constants for cstname, (csttype, cstval) in sdfg.constants_prop.items(): if isinstance(csttype, data.Array): const_str = "constexpr " + csttype.dtype.ctype + \ " " + cstname + "[" + str(cstval.size) + "] = {" it = np.nditer(cstval, order='C') for i in range(cstval.size - 1): const_str += str(it[0]) + ", " it.iternext() const_str += str(it[0]) + "};\n" callsite_stream.write(const_str, sdfg) else: callsite_stream.write( "constexpr %s %s = %s;\n" % (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg) def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream, backend: str = 'frame'): """ Generate a header in every output file that includes custom types and constants. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param backend: Whose backend this header belongs to. """ # Hash file include if backend == 'frame': global_stream.write('#include "../../include/hash.h"\n', sdfg) ######################################################### # Environment-based includes for env in self.environments: if len(env.headers) > 0: global_stream.write( "\n".join("#include \"" + h + "\"" for h in env.headers), sdfg) ######################################################### # Custom types datatypes = set() # Types of this SDFG for _, arrname, arr in sdfg.arrays_recursive(): if arr is not None: datatypes.add(arr.dtype) # Emit unique definitions wrote_something = False for typ in datatypes: if hasattr(typ, 'emit_definition'): if not wrote_something: global_stream.write("", sdfg) wrote_something = True global_stream.write(typ.emit_definition(), sdfg) if wrote_something: global_stream.write("", sdfg) ######################################################### # Write constants self.generate_constants(sdfg, global_stream) ######################################################### # Write state struct structstr = '\n'.join(self.statestruct) global_stream.write( f''' struct {sdfg.name}_t {{ {structstr} }}; ''', sdfg) for sd in sdfg.all_sdfgs_recursive(): if None in sd.global_code: global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd) if backend in sd.global_code: global_stream.write(codeblock_to_cpp(sd.global_code[backend]), sd) def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the header of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ # Write frame code - header global_stream.write( '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' + '#include <dace/dace.h>\n', sdfg) # Write header required by environments for env in self.environments: self.statestruct.extend(env.state_fields) # Instrumentation preamble if len(self._dispatcher.instrumentation) > 1: self.statestruct.append('dace::perf::Report report;') # Reset report if written every invocation if config.Config.get_bool('instrumentation', 'report_each_invocation'): callsite_stream.write('__state->report.reset();', sdfg) self.generate_fileheader(sdfg, global_stream, 'frame') def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the footer of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ import dace.library fname = sdfg.name params = sdfg.signature() paramnames = sdfg.signature(False, for_call=True) initparams = sdfg.signature(with_arrays=False) initparamnames = sdfg.signature(False, for_call=True, with_arrays=False) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_end(sdfg, callsite_stream, global_stream) # Instrumentation saving if (config.Config.get_bool('instrumentation', 'report_each_invocation') and len(self._dispatcher.instrumentation) > 1): callsite_stream.write( '''__state->report.save("{path}/perf", __HASH_{name});'''. format(path=sdfg.build_folder.replace('\\', '/'), name=sdfg.name), sdfg) # Write closing brace of program callsite_stream.write('}', sdfg) # Write awkward footer to avoid 'extern "C"' issues params_comma = (', ' + params) if params else '' initparams_comma = (', ' + initparams) if initparams else '' paramnames_comma = (', ' + paramnames) if paramnames else '' initparamnames_comma = (', ' + initparamnames) if initparamnames else '' callsite_stream.write( f''' DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma}) {{ __program_{fname}_internal(__state{paramnames_comma}); }}''', sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( 'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' % (target.target_name, sdfg.name, initparams_comma), sdfg) if target.has_finalizer: callsite_stream.write( 'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' % (target.target_name, sdfg.name), sdfg) callsite_stream.write( f""" DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams}) {{ int __result = 0; {sdfg.name}_t *__state = new {sdfg.name}_t; """, sdfg) for target in self._dispatcher.used_targets: if target.has_initializer: callsite_stream.write( '__result |= __dace_init_%s(__state%s);' % (target.target_name, initparamnames_comma), sdfg) for env in self.environments: init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg) if init_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(init_code) callsite_stream.write("}") for sd in sdfg.all_sdfgs_recursive(): if None in sd.init_code: callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd) callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd) callsite_stream.write(self._initcode.getvalue(), sdfg) callsite_stream.write( f""" if (__result) {{ delete __state; return nullptr; }} return __state; }} DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state) {{ """, sdfg) # Instrumentation saving if (not config.Config.get_bool('instrumentation', 'report_each_invocation') and len(self._dispatcher.instrumentation) > 1): callsite_stream.write( '__state->report.save("%s/perf", __HASH_%s);' % (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg) callsite_stream.write(self._exitcode.getvalue(), sdfg) for sd in sdfg.all_sdfgs_recursive(): if None in sd.exit_code: callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd) callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd) for target in self._dispatcher.used_targets: if target.has_finalizer: callsite_stream.write( '__dace_exit_%s(__state);' % target.target_name, sdfg) for env in reversed(self.environments): finalize_code = _get_or_eval_sdfg_first_arg( env.finalize_code, sdfg) if finalize_code: callsite_stream.write("{ // Environment: " + env.__name__, sdfg) callsite_stream.write(finalize_code) callsite_stream.write("}") callsite_stream.write('delete __state;\n}\n', sdfg) def generate_state(self, sdfg, state, global_stream, callsite_stream, generate_state_footer=True): sid = sdfg.node_id(state) # Emit internal transient array allocation # Don't allocate transients shared with another state data_to_allocate = (set(state.top_level_transients()) - set(sdfg.shared_transients())) allocated = set() for node in state.data_nodes(): if node.data not in data_to_allocate or node.data in allocated: continue allocated.add(node.data) self._dispatcher.dispatch_allocate(sdfg, state, sid, node, global_stream, callsite_stream) callsite_stream.write('\n') # Emit internal transient array allocation for nested SDFGs # TODO: Replace with global allocation management gpu_persistent_subgraphs = [ state.scope_subgraph(node) for node in state.nodes() if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.ScheduleType.GPU_Persistent ] nested_allocated = set() for sub_graph in gpu_persistent_subgraphs: for nested_sdfg in [ n.sdfg for n in sub_graph.nodes() if isinstance(n, nodes.NestedSDFG) ]: nested_shared_transients = set(nested_sdfg.shared_transients()) for nested_state in nested_sdfg.nodes(): nested_sid = nested_sdfg.node_id(nested_state) nested_to_allocate = ( set(nested_state.top_level_transients()) - nested_shared_transients) nodes_to_allocate = [ n for n in nested_state.data_nodes() if n.data in nested_to_allocate and n.data not in nested_allocated ] for nested_node in nodes_to_allocate: nested_allocated.add(nested_node.data) self._dispatcher.dispatch_allocate( nested_sdfg, nested_state, nested_sid, nested_node, global_stream, callsite_stream) callsite_stream.write('\n') # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_begin(sdfg, state, callsite_stream, global_stream) ##################### # Create dataflow graph for state's children. # DFG to code scheme: Only generate code for nodes whose all # dependencies have been executed (topological sort). # For different connected components, run them concurrently. components = dace.sdfg.concurrent_subgraphs(state) if len(components) == 1: self._dispatcher.dispatch_subgraph(sdfg, state, sid, global_stream, callsite_stream, skip_entry_node=False) else: if sdfg.openmp_sections: callsite_stream.write("#pragma omp parallel sections\n{") for c in components: if sdfg.openmp_sections: callsite_stream.write("#pragma omp section\n{") self._dispatcher.dispatch_subgraph(sdfg, c, sid, global_stream, callsite_stream, skip_entry_node=False) if sdfg.openmp_sections: callsite_stream.write("} // End omp section") if sdfg.openmp_sections: callsite_stream.write("} // End omp sections") ##################### # Write state footer if generate_state_footer: # Emit internal transient array deallocation for nested SDFGs # TODO: Replace with global allocation management gpu_persistent_subgraphs = [ state.scope_subgraph(node) for node in state.nodes() if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.ScheduleType.GPU_Persistent ] nested_deallocated = set() for sub_graph in gpu_persistent_subgraphs: for nested_sdfg in [ n.sdfg for n in sub_graph.nodes() if isinstance(n, nodes.NestedSDFG) ]: nested_shared_transients = \ set(nested_sdfg.shared_transients()) for nested_state in nested_sdfg: nested_sid = nested_sdfg.node_id(nested_state) nested_to_allocate = ( set(nested_state.top_level_transients()) - nested_shared_transients) nodes_to_deallocate = [ n for n in nested_state.data_nodes() if n.data in nested_to_allocate and n.data not in nested_deallocated ] for nested_node in nodes_to_deallocate: nested_deallocated.add(nested_node.data) self._dispatcher.dispatch_deallocate( nested_sdfg, nested_state, nested_sid, nested_node, global_stream, callsite_stream) # Emit internal transient array deallocation deallocated = set() for node in state.data_nodes(): if (node.data not in data_to_allocate or node.data in deallocated or (node.data in sdfg.arrays and sdfg.arrays[node.data].transient == False)): continue deallocated.add(node.data) self._dispatcher.dispatch_deallocate(sdfg, state, sid, node, global_stream, callsite_stream) # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_state_end(sdfg, state, callsite_stream, global_stream) def generate_states(self, sdfg, global_stream, callsite_stream): states_generated = set() # Create closure + function for state dispatcher def dispatch_state(state: SDFGState) -> str: stream = CodeIOStream() self._dispatcher.dispatch_state(sdfg, state, global_stream, stream) states_generated.add(state) # For sanity check return stream.getvalue() # Handle specialized control flow if config.Config.get_bool('optimizer', 'detect_control_flow'): # Avoid import loop from dace.transformation import helpers as xfh # Clean up the state machine by separating combined condition and assignment # edges. xfh.split_interstate_edges(sdfg) cft = cflow.structured_control_flow_tree(sdfg, dispatch_state) else: # If disabled, generate entire graph as general control flow block states_topological = list(sdfg.topological_sort(sdfg.start_state)) last = states_topological[-1] cft = cflow.GeneralBlock(dispatch_state, [ cflow.SingleState(dispatch_state, s, s is last) for s in states_topological ], []) callsite_stream.write( cft.as_cpp(self.dispatcher.defined_vars, sdfg.symbols), sdfg) # Write exit label callsite_stream.write(f'__state_exit_{sdfg.sdfg_id}:;', sdfg) return states_generated def generate_code( self, sdfg: SDFG, schedule: Optional[dtypes.ScheduleType], sdfg_id: str = "" ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]: """ Generate frame code for a given SDFG, calling registered targets' code generation callbacks for them to generate their own code. :param sdfg: The SDFG to generate code for. :param schedule: The schedule the SDFG is currently located, or None if the SDFG is top-level. :param sdfg_id: An optional string id given to the SDFG label :return: A tuple of the generated global frame code, local frame code, and a set of targets that have been used in the generation of this SDFG. """ if len(sdfg_id) == 0 and sdfg.sdfg_id != 0: sdfg_id = '_%d' % sdfg.sdfg_id global_stream = CodeIOStream() callsite_stream = CodeIOStream() is_top_level = sdfg.parent is None # Generate code ########################### # Keep track of allocated variables allocated = set() # Add symbol mappings to allocated variables if sdfg.parent_nsdfg_node is not None: allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys() # Invoke all instrumentation providers for instr in self._dispatcher.instrumentation.values(): if instr is not None: instr.on_sdfg_begin(sdfg, callsite_stream, global_stream) # Allocate outer-level transients shared_transients = sdfg.shared_transients() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in allocated): self._dispatcher.dispatch_allocate(sdfg, state, None, node, global_stream, callsite_stream) allocated.add(node.data) # Allocate inter-state variables global_symbols = copy.deepcopy(sdfg.symbols) global_symbols.update( {aname: arr.dtype for aname, arr in sdfg.arrays.items()}) interstate_symbols = {} for e in sdfg.edges(): symbols = e.data.new_symbols(global_symbols) # Inferred symbols only take precedence if global symbol not defined symbols = { k: v if k not in global_symbols else global_symbols[k] for k, v in symbols.items() } interstate_symbols.update(symbols) global_symbols.update(symbols) for isvarName, isvarType in interstate_symbols.items(): # Skip symbols that have been declared as outer-level transients if isvarName in allocated: continue isvar = data.Scalar(isvarType) callsite_stream.write( '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)), sdfg) self.dispatcher.defined_vars.add(isvarName, isvarType, isvarType.ctype) callsite_stream.write('\n', sdfg) ####################################################################### # Generate actual program body states_generated = self.generate_states(sdfg, global_stream, callsite_stream) ####################################################################### # Sanity check if len(states_generated) != len(sdfg.nodes()): raise RuntimeError( "Not all states were generated in SDFG {}!" "\n Generated: {}\n Missing: {}".format( sdfg.label, [s.label for s in states_generated], [s.label for s in (set(sdfg.nodes()) - states_generated)])) # Deallocate transients shared_transients = sdfg.shared_transients() deallocated = set() for state in sdfg.nodes(): for node in state.data_nodes(): if (node.data in shared_transients and node.data not in deallocated): self._dispatcher.dispatch_deallocate( sdfg, state, None, node, global_stream, callsite_stream) deallocated.add(node.data) # Now that we have all the information about dependencies, generate # header and footer if is_top_level: # Let each target append code to frame code state before generating # header and footer for target in self._dispatcher.used_targets: target.on_target_used() header_stream = CodeIOStream() header_global_stream = CodeIOStream() footer_stream = CodeIOStream() footer_global_stream = CodeIOStream() # Get all environments used in the generated code, including # dependent environments import dace.library # Avoid import loops self.environments = dace.library.get_environments_and_dependencies( self._dispatcher.used_environments) self.generate_header(sdfg, header_global_stream, header_stream) # Open program function params = sdfg.signature() if params: params = ', ' + params function_signature = ( 'void __program_%s_internal(%s_t *__state%s)\n{\n' % (sdfg.name, sdfg.name, params)) self.generate_footer(sdfg, footer_global_stream, footer_stream) header_global_stream.write(global_stream.getvalue()) header_global_stream.write(footer_global_stream.getvalue()) generated_header = header_global_stream.getvalue() all_code = CodeIOStream() all_code.write(function_signature) all_code.write(header_stream.getvalue()) all_code.write(callsite_stream.getvalue()) all_code.write(footer_stream.getvalue()) generated_code = all_code.getvalue() else: generated_header = global_stream.getvalue() generated_code = callsite_stream.getvalue() # Clean up generated code gotos = re.findall(r'goto (.*);', generated_code) clean_code = '' for line in generated_code.split('\n'): # Empty line with semicolon if re.match(r'^\s*;\s*', line): continue # Label that might be unused label = re.findall( r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line) if len(label) > 0: if label[0] not in gotos: continue clean_code += line + '\n' # Return the generated global and local code strings return (generated_header, clean_code, self._dispatcher.used_targets, self._dispatcher.used_environments)
def generate_kernel_internal(self, sdfg, state, kernel_name, subgraphs, kernel_stream, function_stream, callsite_stream): """Main entry function for generating a Xilinx kernel.""" (global_data_parameters, top_level_local_data, subgraph_parameters, nested_global_transients, bank_assignments, external_streams) = self.make_parameters(sdfg, state, subgraphs) # Detect RTL tasklets, which will be launched as individual kernels rtl_tasklet_names = [ self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes() if isinstance(nd, nodes.RTLTasklet) ] host_code_stream = CodeIOStream() # Generate host code self.generate_host_header(sdfg, kernel_name, global_data_parameters, host_code_stream) self.generate_host_function_boilerplate(sdfg, state, kernel_name, global_data_parameters, nested_global_transients, host_code_stream, function_stream, callsite_stream) # Now we write the device code module_stream = CodeIOStream() entry_stream = CodeIOStream() state_id = sdfg.node_id(state) self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name, global_data_parameters, bank_assignments, module_stream, entry_stream, external_streams) # Emit allocations for node in top_level_local_data: self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, module_stream, entry_stream) for is_output, name, node, _ in external_streams: self._dispatcher.defined_vars.add_global(name, DefinedType.Stream, node.ctype) if name not in self._stream_connections: self._stream_connections[name] = [None, None] self._stream_connections[name][ 0 if is_output else 1] = '{}_1.{}'.format(kernel_name, name) self.generate_modules(sdfg, state, kernel_name, subgraphs, subgraph_parameters, module_stream, entry_stream, host_code_stream) self.generate_host_function_body(sdfg, state, kernel_name, global_data_parameters, rtl_tasklet_names, host_code_stream) # Store code to be passed to compilation phase self._host_codes.append((kernel_name, host_code_stream.getvalue())) kernel_stream.write(module_stream.getvalue()) kernel_stream.write(entry_stream.getvalue()) self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list, subgraphs: list, kernel_stream: CodeIOStream, state_host_header_stream: CodeIOStream, state_host_body_stream: CodeIOStream, instrumentation_stream: CodeIOStream, function_stream: CodeIOStream, callsite_stream: CodeIOStream, state_parameters: list): ''' Generates Kernel code, both device and host side. :param sdfg: :param state: :param kernel_name: :param predecessors: list containing all the name of kernels from which this one depends :param subgraphs: :param kernel_stream: Device code stream, contains the kernel code :param state_host_header_stream: Device-specific code stream: contains the host code for the state global declarations. :param state_host_body_stream: Device-specific code stream: contains all the code related to this state, for creating transient buffers, spawning kernels, and synchronizing them. :param instrumentation_stream: Code for profiling kernel execution time. :param function_stream: CPU code stream. :param callsite_stream: CPU code stream. :param state_parameters: list of state parameters. The kernel-specific parameters will be appended to it. ''' (global_data_parameters, top_level_local_data, subgraph_parameters, nested_global_transients, bank_assignments, external_streams) = self.make_parameters(sdfg, state, subgraphs) state_parameters.extend(global_data_parameters) # Detect RTL tasklets, which will be launched as individual kernels rtl_tasklet_names = [ self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes() if isinstance(nd, nodes.RTLTasklet) ] # Generate host code self.generate_host_header(sdfg, kernel_name, global_data_parameters, state_host_header_stream) self.generate_host_function_boilerplate(sdfg, state, nested_global_transients, state_host_body_stream) # Now we write the device code module_stream = CodeIOStream() entry_stream = CodeIOStream() state_id = sdfg.node_id(state) self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name, global_data_parameters, bank_assignments, module_stream, entry_stream, external_streams) # Emit allocations for node in top_level_local_data: self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, node.desc(sdfg), module_stream, entry_stream) for is_output, name, node, _ in external_streams: self._dispatcher.defined_vars.add_global(name, DefinedType.Stream, node.ctype) if name not in self._stream_connections: self._stream_connections[name] = [None, None] key = 0 if is_output else 1 val = '{}_1.{}'.format(kernel_name, name) self._stream_connections[name][key] = val self.generate_modules(sdfg, state, kernel_name, subgraphs, subgraph_parameters, module_stream, entry_stream, state_host_body_stream, instrumentation_stream) self.generate_host_function_body(sdfg, state, kernel_name, predecessors, global_data_parameters, rtl_tasklet_names, state_host_body_stream, instrumentation_stream) # Store code to be passed to compilation phase # self._host_codes.append((kernel_name, host_code_stream.getvalue())) kernel_stream.write(module_stream.getvalue()) kernel_stream.write(entry_stream.getvalue()) self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG): node.validate(sdfg, state) input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0] output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0] input_dims = len(input_edge.data.subset) output_dims = len(output_edge.data.subset) input_data = sdfg.arrays[input_edge.data.data] output_data = sdfg.arrays[output_edge.data.data] # Setup all locations in which code will be written cuda_globalcode = CodeIOStream() cuda_initcode = CodeIOStream() cuda_exitcode = CodeIOStream() host_globalcode = CodeIOStream() host_localcode = CodeIOStream() output_memlet = output_edge.data # Try to autodetect reduction type redtype = detect_reduction_type(node.wcr) node_id = state.node_id(node) state_id = sdfg.node_id(state) idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name, state=state_id, node=node_id) if node.out_connectors: dtype = next(node.out_connectors.values()) else: dtype = sdfg.arrays[output_memlet.data].dtype output_type = dtype.ctype if node.identity is None: raise ValueError('For device reduce nodes, initial value must be ' 'specified') # Create a functor or use an existing one for reduction if redtype == dtypes.ReductionType.Custom: body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr) cuda_globalcode.write( """ struct __reduce_{id} {{ template <typename T> DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{ {contents} }} }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg, state_id, node_id) reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity) elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES: reduce_op = '' else: credtype = 'dace::ReductionType::' + str( redtype)[str(redtype).find('.') + 1:] reduce_op = ((', dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)) + ', ' + symstr(node.identity)) # Obtain some SDFG-related information input_memlet = input_edge.data reduce_shape = input_memlet.subset.bounding_box_size() num_items = ' * '.join(symstr(s) for s in reduce_shape) input = (input_memlet.data + ' + ' + cpp_array_expr(sdfg, input_memlet, with_brackets=False)) output = (output_memlet.data + ' + ' + cpp_array_expr(sdfg, output_memlet, with_brackets=False)) input_dims = input_memlet.subset.dims() output_dims = output_memlet.subset.data_dims() reduce_all_axes = (node.axes is None or len(node.axes) == input_dims) if reduce_all_axes: reduce_last_axes = False else: reduce_last_axes = sorted(node.axes) == list( range(input_dims - len(node.axes), input_dims)) if (not reduce_all_axes) and (not reduce_last_axes): raise NotImplementedError( 'Multiple axis reductions not supported on GPUs. Please use ' 'the pure expansion or make reduce axes the last in the array.' ) # Verify that data is on the GPU if input_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Input of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') if output_data.storage not in [ dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned ]: raise ValueError('Output of GPU reduction must either reside ' ' in global GPU memory or pinned CPU memory') # Determine reduction type kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce') # Create temp memory for this GPU cuda_globalcode.write( """ void *__cub_storage_{sdfg}_{state}_{node} = NULL; size_t __cub_ssize_{sdfg}_{state}_{node} = 0; """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) if reduce_all_axes: reduce_type = 'DeviceReduce' reduce_range = num_items reduce_range_def = 'size_t num_items' reduce_range_use = 'num_items' reduce_range_call = num_items elif reduce_last_axes: num_reduce_axes = len(node.axes) not_reduce_axes = reduce_shape[:-num_reduce_axes] reduce_axes = reduce_shape[-num_reduce_axes:] num_segments = ' * '.join([symstr(s) for s in not_reduce_axes]) segment_size = ' * '.join([symstr(s) for s in reduce_axes]) reduce_type = 'DeviceSegmentedReduce' iterator = 'dace::stridedIterator({size})'.format( size=segment_size) reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments, it=iterator) reduce_range_def = 'size_t num_segments, size_t segment_size' iterator_use = 'dace::stridedIterator(segment_size)' reduce_range_use = 'num_segments, {it}, {it} + 1'.format( it=iterator_use) reduce_range_call = '%s, %s' % (num_segments, segment_size) # Call CUB to get the storage size, allocate and free it cuda_initcode.write( """ cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node}, ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop}); cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node}); """.format(sdfg=sdfg.name, state=state_id, node=node_id, reduce_type=reduce_type, reduce_range=reduce_range, redop=reduce_op, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, kname=kname), sdfg, state_id, node) cuda_exitcode.write( 'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format( sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id, node) # Write reduction function definition cuda_globalcode.write(""" DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream) {{ cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id}, input, output, {reduce_range_use}{redop}, stream); }} """.format(id=idstr, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype, reduce_type=reduce_type, reduce_range_def=reduce_range_def, reduce_range_use=reduce_range_use, kname=kname, redop=reduce_op)) # Write reduction function definition in caller file host_globalcode.write( """ DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream); """.format(id=idstr, reduce_range_def=reduce_range_def, intype=input_data.dtype.ctype, outtype=output_data.dtype.ctype), sdfg, state_id, node) # Call reduction function where necessary host_localcode.write( '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);' .format(id=idstr, input=input, output=output, reduce_range_call=reduce_range_call)) # Make tasklet tnode = dace.nodes.Tasklet('reduce', {'_in': dace.pointer(input_data.dtype)}, {'_out': dace.pointer(output_data.dtype)}, host_localcode.getvalue(), language=dace.Language.CPP) # Add the rest of the code sdfg.append_global_code(host_globalcode.getvalue()) sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda') sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda') sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda') # Rename outer connectors and add to node input_edge._dst_conn = '_in' output_edge._src_conn = '_out' node.add_in_connector('_in') node.add_out_connector('_out') return tnode