def on_tend(self, timer_name: str, stream: CodeIOStream, sdfg=None, state=None, node=None):
    """Emit C++ that stops a wall-clock timer and records the interval in the report.

    Writes the end timestamp, converts the matching begin/end timestamps to
    microseconds since epoch, and registers the completed interval with the
    runtime report under ``timer_name``.

    :param timer_name: Name under which the interval is reported.
    :param stream: Code stream the generated C++ is written to.
    :param sdfg: SDFG containing the instrumented element (used for IDs).
    :param state: Optional state, for state- or node-level timers.
    :param node: Optional node, for node-level timers.
    """
    # Identifier ties this end-marker to the __dace_tbegin_* emitted earlier.
    idstr = self._idstr(sdfg, state, node)
    # -1 marks "not applicable" (e.g., an SDFG-level timer has no state/node).
    state_id = -1
    node_id = -1
    if state is not None:
        state_id = sdfg.node_id(state)
    if node is not None:
        node_id = state.node_id(node)
    stream.write(
        '''auto __dace_tend_{id} = std::chrono::high_resolution_clock::now(); unsigned long int __dace_ts_start_{id} = std::chrono::duration_cast<std::chrono::microseconds>(__dace_tbegin_{id}.time_since_epoch()).count(); unsigned long int __dace_ts_end_{id} = std::chrono::duration_cast<std::chrono::microseconds>(__dace_tend_{id}.time_since_epoch()).count(); __state->report.add_completion("{timer_name}", "Timer", __dace_ts_start_{id}, __dace_ts_end_{id}, {sdfg_id}, {state_id}, {node_id});'''
        .format(timer_name=timer_name,
                id=idstr,
                sdfg_id=sdfg.sdfg_id,
                state_id=state_id,
                node_id=node_id))
def __init__(self, *args, **kwargs):
    """Initialize the frame-code generator's streams, struct fields and dispatcher."""
    # Streams that accumulate global initialization and teardown code.
    self._initcode = CodeIOStream()
    self._exitcode = CodeIOStream()
    # Fields appended to the generated state struct, and environments in use.
    self.statestruct: List[str] = []
    self.environments: List[Any] = []
    # Target dispatcher; this generator handles state code itself.
    self._dispatcher = disp.TargetDispatcher(self)
    self._dispatcher.register_state_dispatcher(self)
def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream):
    """ Generate the header of the frame-code. Code exists in a separate
        function for overriding purposes.

        :param sdfg: The input SDFG.
        :param global_stream: Stream to write to (global).
        :param callsite_stream: Stream to write to (at call site).
    """
    # Global preamble: generated-file marker plus the main DaCe runtime header.
    global_stream.write(
        '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n'
        '#include <dace/dace.h>\n', sdfg)
    self.generate_fileheader(sdfg, callsite_stream)

    # Open the internal program function at the call site.
    fname = sdfg.name
    params = sdfg.signature()
    callsite_stream.write('void __program_%s_internal(%s)\n{\n' % (fname, params), sdfg)

    # Let every registered instrumentation provider emit its SDFG prologue.
    for provider in self._dispatcher.instrumentation.values():
        if provider is not None:
            provider.on_sdfg_begin(sdfg, callsite_stream, global_stream)
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                callsite_stream: CodeIOStream) -> None:
    """Emit a write-conflict-resolved copy for an SVE-vectorized edge.

    Reduces an SVE register to a scalar and applies the WCR through the CPU
    code generator's resolution expression.

    :raises util.NotSupportedError: If the WCR reduction has no SVE equivalent.
    :raises NotImplementedError: For the horizontal non-atomic reduction case.
    """
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')
    # nc == True means the write is not conflicted (no atomics required).
    nc = not is_write_conflicted(dfg, edge)
    desc = edge.src.desc(sdfg)
    if not nc or not isinstance(desc.dtype, (dtypes.pointer, dtypes.vector)):
        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality
        # The '@' placeholder marks where the reduced value is spliced in.
        wcr = self.cpu_codegen.write_and_resolve_expr(sdfg, edge.data, not nc, None, '@', dtype=desc.dtype)
        callsite_stream.write(wcr[:wcr.find('@')] + util.REDUCTION_TYPE_TO_SVE[reduction_type] +
                              f'(svptrue_{util.TYPE_TO_SVE_SUFFIX[desc.dtype]}(), ' + src_node.label +
                              wcr[wcr.find('@') + 1:] + ');')
        return
    else:
        ######################
        # Horizontal non-atomic reduction
        raise NotImplementedError()

    # NOTE(review): unreachable — both branches above return or raise.
    # Presumably kept for the not-yet-implemented case; confirm intent.
    return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream)
def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream, callsite_stream: CodeIOStream): """ Generate the header of the frame-code. Code exists in a separate function for overriding purposes. :param sdfg: The input SDFG. :param global_stream: Stream to write to (global). :param callsite_stream: Stream to write to (at call site). """ # Write frame code - header global_stream.write( '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' + '#include <dace/dace.h>\n', sdfg) # Write header required by environments for env in self.environments: self.statestruct.extend(env.state_fields) # Instrumentation preamble if len(self._dispatcher.instrumentation) > 1: self.statestruct.append('dace::perf::Report report;') # Reset report if written every invocation if config.Config.get_bool('instrumentation', 'report_each_invocation'): callsite_stream.write('__state->report.reset();', sdfg) self.generate_fileheader(sdfg, global_stream, 'frame')
def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int, node: nodes.Node, nodedesc: data.Data,
                   global_stream: CodeIOStream, declaration_stream: CodeIOStream,
                   allocation_stream: CodeIOStream) -> None:
    """Allocate data in an SVE context, falling back to the CPU code generator.

    SVE registers get a register-type definition instead of a memory
    allocation; vector scalars inside an SVE scope are likewise turned into
    SVE registers rather than dace::vec<> values.
    """
    if nodedesc.storage == dtypes.StorageType.SVE_Register:
        # Pure SVE register: only record its defined type; no memory is emitted.
        sve_type = util.TYPE_TO_SVE[nodedesc.dtype]
        self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar, sve_type)
        return
    if util.get_sve_scope(sdfg, dfg, node) is not None and isinstance(
            nodedesc, data.Scalar) and isinstance(nodedesc.dtype, dtypes.vector):
        # Special allocation if vector Code->Code register in SVE scope
        # We prevent dace::vec<>'s and allocate SVE registers instead
        # NOTE(review): declaring only when the name is *already* in
        # defined_vars looks inverted (one would expect `not ... has`) —
        # confirm against the dispatcher's redefinition semantics.
        if self.dispatcher.defined_vars.has(node.data):
            sve_type = util.TYPE_TO_SVE[nodedesc.dtype.vtype]
            self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar, sve_type)
            declaration_stream.write(f'{sve_type} {node.data};')
            return
    # Anything else is regular data: delegate to the CPU code generator.
    self.cpu_codegen.allocate_array(sdfg, dfg, state_id, node, nodedesc, global_stream, declaration_stream,
                                    allocation_stream)
def define_out_memlet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node,
                      dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: prettycode.CodeIOStream,
                      callsite_stream: prettycode.CodeIOStream):
    """ Generate output copy code (handled within the rtl tasklet code). """
    # Only Tasklet -> AccessNode output edges are supported here.
    if not (isinstance(edge.src, nodes.Tasklet) and isinstance(edge.dst, nodes.AccessNode)):
        raise RuntimeError("Not handling define_out_memlet case of type {} -> {}.".format(
            type(edge.src), type(edge.dst)))

    connector_type = src_node.out_connectors[edge.src_conn]
    if isinstance(connector_type, dtypes.pointer):
        # pointer accessor
        line: str = "{} {} = &{}[0];".format(connector_type.ctype, edge.src_conn, edge.dst.data)
    elif isinstance(connector_type, dtypes.vector):
        # vector accessor
        line: str = "{} {} = *({} *)(&{}[0]);".format(connector_type.ctype, edge.src_conn, connector_type.ctype,
                                                      edge.dst.data)
    else:
        # scalar accessor
        line: str = "{}* {} = &{}[0];".format(connector_type.ctype, edge.src_conn, edge.dst.data)

    # write accessor to file
    callsite_stream.write(line)
def on_tbegin(self, stream: CodeIOStream, sdfg=None, state=None, node=None):
    """Emit a std::chrono timestamp marking the start of a timed region."""
    # The identifier ties this begin-marker to the matching on_tend call.
    idstr = self._idstr(sdfg, state, node)
    stream.write(f'auto __dace_tbegin_{idstr} = std::chrono::high_resolution_clock::now();')
def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node,
                dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: prettycode.CodeIOStream,
                callsite_stream: prettycode.CodeIOStream):
    """ Generate input/output memory copies from the array references to local variables (i.e. for the tasklet code). """
    # Only AccessNode -> Tasklet input edges are supported here.
    if not (isinstance(edge.src, nodes.AccessNode) and isinstance(edge.dst, nodes.Tasklet)):
        raise RuntimeError("Not handling copy_memory case of type {} -> {}.".format(
            type(edge.src), type(edge.dst)))

    connector_type = dst_node.in_connectors[edge.dst_conn]
    if isinstance(connector_type, dtypes.pointer):
        # pointer accessor
        line: str = "{} {} = &{}[0];".format(connector_type.ctype, edge.dst_conn, edge.src.data)
    elif isinstance(connector_type, dtypes.vector):
        # vector accessor
        line: str = "{} {} = *({} *)(&{}[0]);".format(connector_type.ctype, edge.dst_conn, connector_type.ctype,
                                                      edge.src.data)
    else:
        # scalar accessor
        line: str = "{}* {} = &{}[0];".format(connector_type.ctype, edge.dst_conn, edge.src.data)

    # write accessor to file
    callsite_stream.write(line)
def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, node: nodes.AccessNode,
                   function_stream: CodeIOStream, declaration_stream: CodeIOStream,
                   allocation_stream: CodeIOStream):
    """Declare the WMMA fragment backing a Tensor Core access node."""
    array_name = node.data
    desc = node.desc(sdfg)

    # Based on the hardware, the total size must be 16^2
    assert desc.total_size == 16 * 16

    # Majority is detected by the strides of the data
    majority = 'row' if desc.strides[-1] == 1 else 'col'

    # Build the fragment declaration according to the storage type.
    if desc.storage == dace.StorageType.TensorCore_Accumulator:
        fragment = 'wmma::fragment<wmma::accumulator, 16, 16, 16, float> {};'.format(array_name)
    else:
        fragment = 'wmma::fragment<wmma::matrix_{mat}, 16, 16, 16, half, wmma::{maj}_major> {name};'.format(
            mat=('a' if 'A' in desc.storage.name else 'b'), maj=majority, name=array_name)

    declaration_stream.write(fragment, sdfg, state_id, node)
def define_out_memlet(self, sdfg: dace.SDFG, dfg: StateSubgraphView, state_id: int, src_node: nodes.Node,
                      dst_node: nodes.Node, edge: MultiConnectorEdge, function_stream: CodeIOStream,
                      callsite_stream: CodeIOStream):
    """Bind an output connector directly to the WMMA fragment it writes."""
    # Output memlets that are directed at WMMA fragments can use the "auto"
    # keyword for simplicity.
    connector = edge.src_conn
    target = edge.data.data
    callsite_stream.write('auto& {} = {};'.format(connector, target))
def add_header(self, function_stream: CodeIOStream):
    """Emit the SVE include and register-size define, at most once per file."""
    if self.has_generated_header:
        return
    self.has_generated_header = True

    function_stream.write('#include <arm_sve.h>\n')
    # TODO: Find this automatically at compile time
    function_stream.write('#define {} 64\n'.format(util.REGISTER_BYTE_SIZE))
def __init__(self, *args, **kwargs):
    """Initialize frame-code generator state: streams, struct fields, allocations, dispatcher."""
    # Streams that accumulate global initialization and teardown code.
    self._initcode = CodeIOStream()
    self._exitcode = CodeIOStream()
    # Fields appended to the generated state struct, and environments in use.
    self.statestruct: List[str] = []
    self.environments: List[Any] = []
    # Maps an SDFG, state, or scope entry to the access nodes that must be
    # allocated upon entering it (keyed tuples per the annotation below).
    self.to_allocate: DefaultDict[Union[SDFG, SDFGState, nodes.EntryNode],
                                  List[Tuple[int, int, nodes.AccessNode]]] = collections.defaultdict(list)
    # Target dispatcher; this generator handles state code itself.
    self._dispatcher = disp.TargetDispatcher(self)
    self._dispatcher.register_state_dispatcher(self)
def generate_kernel_internal(self, sdfg, state, kernel_name, subgraphs, kernel_stream, function_stream,
                             callsite_stream):
    """Main entry function for generating a Xilinx kernel.

    Generates host code (header, boilerplate, body) into an internal stream
    that is stored for the compilation phase, then generates the device-side
    modules and entry points into ``kernel_stream``.
    """
    (global_data_parameters, top_level_local_data, subgraph_parameters, scalar_parameters, symbol_parameters,
     nested_global_transients) = self.make_parameters(sdfg, state, subgraphs)

    # Scalar parameters are never output
    sc_parameters = [(False, pname, param) for pname, param in scalar_parameters]

    host_code_stream = CodeIOStream()

    # Generate host code
    self.generate_host_header(sdfg, kernel_name, global_data_parameters + sc_parameters, symbol_parameters,
                              host_code_stream)
    self.generate_host_function_boilerplate(sdfg, state, kernel_name, global_data_parameters + sc_parameters,
                                            symbol_parameters, nested_global_transients, host_code_stream,
                                            function_stream, callsite_stream)
    self.generate_host_function_body(sdfg, state, kernel_name, global_data_parameters + sc_parameters,
                                     symbol_parameters, host_code_stream)
    # Store code to be passed to compilation phase
    self._host_codes.append((kernel_name, host_code_stream.getvalue()))

    # Now we write the device code
    module_stream = CodeIOStream()
    entry_stream = CodeIOStream()
    state_id = sdfg.node_id(state)
    self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name, global_data_parameters, scalar_parameters,
                                         symbol_parameters, module_stream, entry_stream)

    # Emit allocations (and initializations) for kernel-local data.
    for node in top_level_local_data:
        self._dispatcher.dispatch_allocate(sdfg, state, state_id, node, module_stream, entry_stream)
        self._dispatcher.dispatch_initialize(sdfg, state, state_id, node, module_stream, entry_stream)

    self.generate_modules(sdfg, state, kernel_name, subgraphs, subgraph_parameters, sc_parameters,
                          symbol_parameters, module_stream, entry_stream, host_code_stream)

    # Concatenate modules before the kernel entry point in the output stream.
    kernel_stream.write(module_stream.getvalue())
    kernel_stream.write(entry_stream.getvalue())

    self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream,
                  codegen: 'DaCeCodeGenerator'):
    """Register the data serializer for the top-level SDFG and emit its setter."""
    # Initialize serializer versioning object
    if sdfg.parent is None:  # Only for the top-level SDFG
        self.codegen = codegen
        codegen.statestruct.append('dace::DataSerializer *serializer;')
        sdfg.append_init_code('__state->serializer = new dace::DataSerializer("");\n')

    # Add method that controls serializer input
    global_stream.write(self._generate_report_setter(sdfg))
def __init__(self, frame_codegen, sdfg):
    """Build the MPI code object (init/exit functions) and register the MPI map dispatcher.

    :param frame_codegen: Frame-code generator providing the dispatcher and file header.
    :param sdfg: The SDFG whose signature parameterizes the generated functions.
    """
    self._frame = frame_codegen
    self._dispatcher = frame_codegen.dispatcher
    dispatcher = self._dispatcher

    fileheader = CodeIOStream()
    self._frame.generate_fileheader(sdfg, fileheader)

    # Standalone code object holding MPI setup/teardown; duplicates
    # MPI_COMM_WORLD so the program uses its own communicator.
    self._codeobj = CodeObject(
        sdfg.name + '_mpi', """
#include <dace/dace.h>
#include <mpi.h>

MPI_Comm __dace_mpi_comm;
int __dace_comm_size = 1;
int __dace_comm_rank = 0;

{file_header}

DACE_EXPORTED int __dace_init_mpi({params});
DACE_EXPORTED void __dace_exit_mpi({params});

int __dace_init_mpi({params}) {{
    int isinit = 0;
    if (MPI_Initialized(&isinit) != MPI_SUCCESS)
        return 1;
    if (!isinit) {{
        if (MPI_Init(NULL, NULL) != MPI_SUCCESS)
            return 1;
    }}

    MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm);
    MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank);
    MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size);

    printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
    return 0;
}}

void __dace_exit_mpi({params}) {{
    MPI_Comm_free(&__dace_mpi_comm);
    MPI_Finalize();

    printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
}}
""".format(params=sdfg.signature(), file_header=fileheader.getvalue()), 'cpp', MPICodeGen, 'MPI')

    # Register dispatchers
    dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self)
def get_generated_codeobjects(self):
    """Return the MPI setup/teardown code object for the global SDFG.

    :return: A single-element list with the generated MPI ``CodeObject``.
    """
    fileheader = CodeIOStream()
    sdfg = self._global_sdfg
    self._frame.generate_fileheader(sdfg, fileheader, 'mpi')

    # Extra initialization parameters (free symbols), prefixed with a comma
    # so they can be appended to the __state argument.
    params_comma = self._global_sdfg.init_signature(free_symbols=self._frame.free_symbols(self._global_sdfg))
    if params_comma:
        params_comma = ', ' + params_comma

    codeobj = CodeObject(
        sdfg.name + '_mpi', """
#include <dace/dace.h>
#include <mpi.h>

MPI_Comm __dace_mpi_comm;
int __dace_comm_size = 1;
int __dace_comm_rank = 0;

{file_header}

DACE_EXPORTED int __dace_init_mpi({sdfg.name}_t *__state{params});
DACE_EXPORTED void __dace_exit_mpi({sdfg.name}_t *__state);

int __dace_init_mpi({sdfg.name}_t *__state{params}) {{
    int isinit = 0;
    if (MPI_Initialized(&isinit) != MPI_SUCCESS)
        return 1;
    if (!isinit) {{
        if (MPI_Init(NULL, NULL) != MPI_SUCCESS)
            return 1;
    }}

    MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm);
    MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank);
    MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size);

    printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
    return 0;
}}

void __dace_exit_mpi({sdfg.name}_t *__state) {{
    MPI_Comm_free(&__dace_mpi_comm);
    MPI_Finalize();

    printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
}}
""".format(params=params_comma, sdfg=sdfg, file_header=fileheader.getvalue()), 'cpp', MPICodeGen, 'MPI')
    return [codeobj]
def on_tend(self, timer_name: str, stream: CodeIOStream, sdfg=None, state=None, node=None):
    """Emit C++ that stops a timer and adds the elapsed milliseconds to the report.

    :param timer_name: Name under which the duration is reported (prefixed with ``timer_``).
    :param stream: Code stream the generated C++ is written to.
    :param sdfg: SDFG containing the instrumented element (used for the ID string).
    :param state: Optional state, for state- or node-level timers.
    :param node: Optional node, for node-level timers.
    """
    # Identifier ties this end-marker to the __dace_tbegin_* emitted earlier.
    idstr = self._idstr(sdfg, state, node)
    stream.write(
        '''auto __dace_tend_{id} = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> __dace_tdiff_{id} = __dace_tend_{id} - __dace_tbegin_{id};
dace::perf::report.add("timer_{timer_name}", __dace_tdiff_{id}.count());'''.format(timer_name=timer_name, id=idstr))
def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
    """ Generate a header in every output file that includes custom types
        and constants.

        :param sdfg: The input SDFG.
        :param global_stream: Stream to write to (global).
    """
    #########################################################
    # Custom types

    # Collect the distinct data types used anywhere in this SDFG tree.
    datatypes = set()
    for _, arrname, arr in sdfg.arrays_recursive():
        if arr is not None:
            datatypes.add(arr.dtype)

    # Emit each unique type definition once, padded by empty writes.
    emitted_any = False
    for typ in datatypes:
        if not hasattr(typ, 'emit_definition'):
            continue
        if not emitted_any:
            global_stream.write("", sdfg)
            emitted_any = True
        global_stream.write(typ.emit_definition(), sdfg)
    if emitted_any:
        global_stream.write("", sdfg)

    #########################################################
    # Write constants
    self.generate_constants(sdfg, global_stream)

    # Per-SDFG user-supplied global code.
    for sd in sdfg.all_sdfgs_recursive():
        global_stream.write(sd.global_code, sd)
def dispatch_allocate(self, sdfg: SDFG, dfg: ScopeSubgraphView, state_id: int, node: nodes.AccessNode,
                      datadesc: dt.Data, function_stream: prettycode.CodeIOStream,
                      callsite_stream: prettycode.CodeIOStream, declare: bool = True, allocate: bool = True):
    """ Dispatches a code generator for data allocation.

    :param declare: If True (and ``allocate`` is False), only declare the array.
    :param allocate: If True, declare and allocate the array.
    """
    # Record the responsible target so its dependencies are linked in.
    self._used_targets.add(self._array_dispatchers[datadesc.storage])

    if datadesc.lifetime is dtypes.AllocationLifetime.Persistent:
        # Persistent data is allocated in the init code rather than the call site.
        # NOTE(review): this fresh declaration stream is never read back here,
        # so declarations appear to be discarded for persistent data — confirm
        # this is intentional (declaration may live in the state struct).
        declaration_stream = CodeIOStream()
        callsite_stream = self.frame._initcode
    else:
        declaration_stream = callsite_stream

    if declare and not allocate:
        self._array_dispatchers[datadesc.storage].declare_array(sdfg, dfg, state_id, node, datadesc,
                                                               function_stream, declaration_stream)
    elif allocate:
        self._array_dispatchers[datadesc.storage].allocate_array(sdfg, dfg, state_id, node, datadesc,
                                                                function_stream, declaration_stream,
                                                                callsite_stream)
def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
    """Write the SDFG's compile-time constants as C++ ``constexpr`` definitions.

    :param sdfg: SDFG whose ``constants_prop`` mapping is emitted.
    :param callsite_stream: Stream to write the definitions to.
    """
    # Write constants
    for cstname, (csttype, cstval) in sdfg.constants_prop.items():
        if isinstance(csttype, data.Array):
            # Flatten in C (row-major) order. Joining the iterator avoids the
            # previous quadratic string concatenation and no longer raises on
            # zero-size arrays (which produced an out-of-bounds iterator read).
            values = ", ".join(str(v) for v in np.nditer(cstval, order='C'))
            const_str = ("constexpr " + csttype.dtype.ctype + " " + cstname + "[" + str(cstval.size) +
                         "] = {" + values + "};\n")
            callsite_stream.write(const_str, sdfg)
        else:
            # Scalar constant: rely on sym2cpp for symbolic-aware conversion.
            callsite_stream.write(
                "constexpr %s %s = %s;\n" % (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg)
def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
    """Write the SDFG's compile-time constants as C++ ``constexpr`` definitions.

    :param sdfg: SDFG whose ``constants`` mapping is emitted.
    :param callsite_stream: Stream to write the definitions to.
    """
    # Write constants
    for cstname, cstval in sdfg.constants.items():
        if isinstance(cstval, np.ndarray):
            dtype = dtypes.typeclass(cstval.dtype.type)
            # Flatten in C (row-major) order. Joining the iterator avoids the
            # previous quadratic string concatenation and no longer raises on
            # zero-size arrays (which produced an out-of-bounds iterator read).
            values = ", ".join(str(v) for v in np.nditer(cstval, order='C'))
            const_str = ("constexpr " + dtype.ctype + " " + cstname + "[" + str(cstval.size) +
                         "] = {" + values + "};\n")
            callsite_stream.write(const_str, sdfg)
        else:
            # Scalar constant: let the C++ compiler infer the type.
            callsite_stream.write("constexpr auto %s = %s;\n" % (cstname, str(cstval)), sdfg)
def _setup_gpu_runtime(self, sdfg: SDFG, global_stream: CodeIOStream): if self.gpu_runtime_init: return self.gpu_runtime_init = True self.backend = config.Config.get('compiler', 'cuda', 'backend') if self.backend == 'cuda': header_name = 'cuda_runtime.h' elif self.backend == 'hip': header_name = 'hip/hip_runtime.h' else: raise NameError('GPU backend "%s" not recognized' % self.backend) global_stream.write('#include <%s>' % header_name) # For other file headers sdfg.append_global_code('\n#include <%s>' % header_name, None)
def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list,
                                parameters: list, rtl_tasklet_names: list, kernel_stream: CodeIOStream,
                                instrumentation_stream: CodeIOStream):
    '''
    Generate the host-specific code for spawning and synchronizing the given kernel.

    :param sdfg: The SDFG being generated.
    :param state: The state containing the kernel.
    :param kernel_name: Name of the kernel to spawn.
    :param predecessors: list containing all the name of kernels that must be finished before starting this one
    :param parameters: list containing the kernel parameters (of all kernels in this state)
    :param rtl_tasklet_names: names of RTL tasklets whose kernels must be joined.
    :param kernel_stream: Device-specific code stream
    :param instrumentation_stream: Code for profiling kernel execution time.
    '''
    # Build the kernel argument list; HBM arrays contribute one argument per
    # interfaced bank.
    kernel_args = []
    for _, name, p, interface_ids in parameters:
        if isinstance(p, dt.Array):
            for bank, _ in fpga.iterate_hbm_interface_ids(p, interface_ids):
                kernel_args.append(p.as_arg(False, name=fpga.fpga_ptr(name, p, sdfg, bank)))
        else:
            kernel_args.append(p.as_arg(False, name=name))

    kernel_function_name = kernel_name
    # NOTE(review): kernel_file_name is computed but unused in this function —
    # possibly consumed by an earlier revision; confirm before removing.
    kernel_file_name = "{}.xclbin".format(kernel_name)

    # Check if this kernel depends from other kernels
    needs_synch = len(predecessors) > 0

    if needs_synch:
        # Build a vector containing all the events associated with the kernels from which this one depends
        kernel_deps_name = f"deps_{kernel_name}"
        kernel_stream.write(f"std::vector<cl::Event> {kernel_deps_name};")
        for pred in predecessors:
            # concatenate events from predecessor kernel
            kernel_stream.write(f"{kernel_deps_name}.push_back({pred}_event);")

    # Launch HLS kernel, passing synchronization events (if any)
    kernel_stream.write(
        f"""\
auto {kernel_name}_kernel = program.MakeKernel({kernel_function_name}, "{kernel_function_name}", {", ".join(kernel_args)});
cl::Event {kernel_name}_event = {kernel_name}_kernel.ExecuteTaskFork({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''});
all_events.push_back({kernel_name}_event);""", sdfg, sdfg.node_id(state))

    # Emit instrumentation hooks if FPGA profiling is enabled for this state.
    if state.instrument == dtypes.InstrumentationType.FPGA:
        self.instrument_opencl_kernel(kernel_name, sdfg.node_id(state), sdfg.sdfg_id, instrumentation_stream)

    # Join RTL tasklets
    for name in rtl_tasklet_names:
        kernel_stream.write(f"kernel_{name}.wait();\n", sdfg, sdfg.node_id(state))
def generate_out_register(self, sdfg: SDFG, state: SDFGState, edge: graph.MultiConnectorEdge[mm.Memlet],
                          code: CodeIOStream, use_data_name: bool = False) -> bool:
    """
    Responsible for generating temporary out registers in a Tasklet, given an outgoing edge.
    Returns `True` if a writeback of this register is needed.
    """
    if edge.src_conn is None:
        # Nothing to allocate and no writeback required. (Previously a bare
        # `return`, yielding None despite the declared bool return type.)
        return False

    dst_node = state.memlet_path(edge)[-1].dst

    src_type = edge.src.out_connectors[edge.src_conn]
    src_name = edge.src_conn
    if use_data_name:
        src_name = edge.data.data

    if isinstance(dst_node, nodes.AccessNode) and isinstance(dst_node.desc(sdfg), data.Stream):
        # Streams don't need writeback and are treated differently
        self.stream_associations[edge.src_conn] = (edge.data.data, src_type.base_type)
        return False
    elif edge.data.wcr is not None:
        # WCR is addressed within the unparser to capture conditionals
        self.wcr_associations[edge.src_conn] = (dst_node, edge, src_type.base_type)
        return False

    # Create temporary registers
    ctype = None
    if util.is_vector(src_type):
        ctype = util.TYPE_TO_SVE[src_type.type]
    elif util.is_scalar(src_type):
        ctype = src_type.ctype
    else:
        raise util.NotSupportedError('Unsupported Code->Code edge (pointer)')

    self.dispatcher.defined_vars.add(src_name, DefinedType.Scalar, ctype)
    code.write(f'{ctype} {src_name};')
    return True
def create_empty_definition(self, conn: dace.typeclass, edge: gr.MultiConnectorEdge[mm.Memlet],
                            callsite_stream: CodeIOStream, output: bool = False, is_code_code: bool = False):
    """ Creates a simple variable definition `type name;`, which works for both vectors and regular data types.

    :param conn: Connector type of the variable being defined.
    :param edge: Edge whose connector (or data) names the variable.
    :param output: If True, name after the destination connector instead of the source.
    :param is_code_code: If True, name after the memlet data (Tasklet->Tasklet edge).
    :raises util.NotSupportedError: If the vector element type has no SVE equivalent.
    :raises NotImplementedError: For non-vector (scalar/pointer) connectors.
    """
    var_name = None
    var_type = None
    var_ctype = None

    if output:
        var_name = edge.dst_conn
    else:
        var_name = edge.src_conn

    if is_code_code:
        # For edges between Tasklets (Code->Code), we use the data as name because these registers are temporary and shared
        var_name = edge.data.data

    if isinstance(conn, dtypes.vector):
        # Creates an SVE register

        if conn.type not in util.TYPE_TO_SVE:
            raise util.NotSupportedError('Data type not supported')

        # In case of a WCR, we must initialize it with the identity value.
        # This is to prevent cases in a conditional WCR, where we don't write and it is filled with garbage.
        # Currently, the initial is 0, because product reduction is not supported in SVE.
        init_str = ''
        if edge.data.wcr:
            init_str = ' = {}(0)'.format(util.instr('dup', type=conn.type))

        var_type = conn.type
        var_ctype = util.TYPE_TO_SVE[var_type]

        callsite_stream.write('{} {}{};'.format(var_ctype, var_name, init_str))
    else:
        raise NotImplementedError(f'Output into scalar or pointer is not supported ({var_name})')

    # Record the definition so later code generation can reference it.
    self.dispatcher.defined_vars.add(var_name, var_type, var_ctype)
def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node,
                dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: prettycode.CodeIOStream,
                callsite_stream: prettycode.CodeIOStream):
    """ Generate input/output memory copies from the array references to local variables (i.e. for the tasklet code).

    :raises RuntimeError: For unsupported source/destination node combinations.
    """
    if isinstance(edge.src, nodes.AccessNode) and isinstance(edge.dst, nodes.Tasklet):
        # handle AccessNode->Tasklet
        if isinstance(dst_node.in_connectors[edge.dst_conn], dtypes.pointer):
            # pointer accessor
            line: str = "{} {} = &{}[0];".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                                                 edge.src.data)
        elif isinstance(dst_node.in_connectors[edge.dst_conn], dtypes.vector):
            # vector accessor
            line: str = "{} {} = *({} *)(&{}[0]);".format(dst_node.in_connectors[edge.dst_conn].ctype,
                                                          edge.dst_conn,
                                                          dst_node.in_connectors[edge.dst_conn].ctype,
                                                          edge.src.data)
        else:
            # scalar accessor
            # NOTE(review): if the data is neither Array nor Scalar (e.g., a
            # Stream or View), `line` stays unbound and the write below raises
            # NameError — confirm those cases cannot reach this path.
            arr = sdfg.arrays[edge.data.data]
            if isinstance(arr, data.Array):
                line: str = "{}* {} = &{}[0];".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                                                      edge.src.data)
            elif isinstance(arr, data.Scalar):
                line: str = "{} {} = {};".format(dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                                                 edge.src.data)
    elif isinstance(edge.src, nodes.MapEntry) and isinstance(edge.dst, nodes.Tasklet):
        # Unrolled RTL tasklet inside a map: compute the unroll factor from the
        # map range and offset the data pointer by the map parameter.
        rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
        self.n_unrolled[rtl_name] = symbolic.evaluate(edge.src.map.range[0][1] + 1, sdfg.constants)
        line: str = f'{dst_node.in_connectors[edge.dst_conn]} {edge.dst_conn} = &{edge.data.data}[{edge.src.map.params[0]}*{edge.data.volume}];'
    else:
        raise RuntimeError("Not handling copy_memory case of type {} -> {}.".format(type(edge.src), type(edge.dst)))

    # write accessor to file
    callsite_stream.write(line)
def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int, node: nodes.Node,
                  function_stream: CodeIOStream, callsite_stream: CodeIOStream):
    """Generate code for a Tasklet inside an SVE scope.

    Emits reads for incoming edges, temporary output registers, the tasklet
    body itself, and writebacks for the registers that need them, all within
    a dedicated brace scope. Non-Tasklet nodes are ignored.
    """
    self.add_header(function_stream)

    if not isinstance(node, nodes.Tasklet):
        return

    # Surrounding SVE map scope (provides loop parameter/predicate context).
    scope = util.get_sve_scope(sdfg, state, node)

    # Reset the stream variable mappings
    self.stream_associations = dict()
    self.wcr_associations = dict()

    # Brace scope isolates the tasklet's local definitions.
    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(node)

    ##################
    # Generate tasklet

    # Inputs
    for edge in state.in_edges(node):
        self.generate_read(sdfg, state, scope.map, edge, callsite_stream)

    requires_wb = []

    # Temporary output registers
    for edge in state.out_edges(node):
        if self.generate_out_register(sdfg, state, edge, callsite_stream):
            requires_wb.append(edge)

    # Tasklet code
    self.unparse_tasklet(sdfg, state, state_id, node, function_stream, callsite_stream)

    # Writeback from temporary registers to memory
    for edge in requires_wb:
        self.generate_writeback(sdfg, state, scope, edge, callsite_stream)

    self.dispatcher.defined_vars.exit_scope(node)
    callsite_stream.write('}')
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """Fully unroll a map scope, emitting its body once per index combination."""
    entry_node = scope.source_nodes()[0]

    # Expand every map dimension into an explicit list of index values.
    index_list = []
    for begin, end, stride in entry_node.map.range:
        dim_values = []
        current = begin
        while current <= end:
            dim_values.append(current)
            current += stride
        index_list.append(dim_values)

    # Emit one brace-scoped copy of the subgraph per point in the index space,
    # binding each map parameter to its concrete value.
    for point in product(*index_list):
        callsite_stream.write('{')
        for param, value in zip(entry_node.map.params, point):
            callsite_stream.write(f'auto {param} = {sym2cpp(value)};')
        self._dispatcher.dispatch_subgraph(sdfg,
                                           scope,
                                           state_id,
                                           function_stream,
                                           callsite_stream,
                                           skip_entry_node=True,
                                           skip_exit_node=True)
        callsite_stream.write('}')
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """Fully unroll a map scope, binding each parameter as a compile-time constant.

    For every point in the map's index space, emits the subgraph once with the
    map parameters defined as ``constexpr`` values and registered as SDFG
    constants, preparing/unwinding nested-SDFG unroll state per iteration.
    """
    entry_node: nd.MapEntry = scope.source_nodes()[0]

    # Expand every map dimension into an explicit list of index values.
    index_list = []
    for begin, end, stride in entry_node.map.range:
        l = []
        while begin <= end:
            l.append(begin)
            begin += stride
        index_list.append(l)

    # Save the constants and work on a deep copy, since each iteration adds
    # the map parameters as constants; restored at the end of the function.
    sdfgconsts = sdfg.constants_prop
    sdfg.constants_prop = copy.deepcopy(sdfg.constants_prop)

    mapsymboltypes = entry_node.new_symbols(sdfg, scope, [entry_node.map.params])
    for indices in product(*index_list):
        callsite_stream.write('{')
        nsdfg_unroll_info = None
        for param, index in zip(entry_node.map.params, indices):
            # Prepare nested-SDFG unrolling once per point; subsequent
            # parameters reuse the same unroll context.
            if nsdfg_unroll_info is None:
                nsdfg_unroll_info = self.nsdfg_prepare_unroll(scope, str(param), str(index))
            else:
                self.nsdfg_prepare_unroll(scope, str(param), str(index))
            callsite_stream.write(
                f"constexpr {mapsymboltypes[param]} {param} = "
                f"{dace.codegen.targets.common.sym2cpp(index)};\n", sdfg)
            sdfg.add_constant(param, int(index))
        callsite_stream.write('{')
        self._dispatcher.dispatch_subgraph(
            sdfg,
            scope,
            state_id,
            function_stream,
            callsite_stream,
            skip_entry_node=True,
            skip_exit_node=True,
        )
        callsite_stream.write('}')
        callsite_stream.write('}')
        self.nsdfg_after_unroll(nsdfg_unroll_info)

    # Restore the original constants mapping.
    sdfg.constants_prop = sdfgconsts