Example #1
    def allocate_array(self, sdfg: dace.SDFG, dfg: StateSubgraphView,
                       state_id: int, node: nodes.AccessNode,
                       function_stream: CodeIOStream,
                       declaration_stream: CodeIOStream,
                       allocation_stream: CodeIOStream):
        name = node.data
        nodedesc = node.desc(sdfg)

        # Due to hardware constraints, the total fragment size must be 16x16
        assert nodedesc.total_size == 16 * 16
        # Row- or column-major layout is detected from the data strides
        maj = 'row' if nodedesc.strides[-1] == 1 else 'col'

        # Write a fragment based on the storage type
        if nodedesc.storage == dace.StorageType.TensorCore_Accumulator:
            declaration_stream.write(
                'wmma::fragment<wmma::accumulator, '
                '16, 16, 16, float> {};'.format(name), sdfg, state_id, node)
        else:
            declaration_stream.write(
                'wmma::fragment<wmma::matrix_{mat}, '
                '16, 16, 16, half, wmma::{maj}_major> '
                '{name};'.format(
                    mat=('a' if 'A' in nodedesc.storage.name else 'b'),
                    maj=maj,
                    name=name), sdfg, state_id, node)
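For reference, a minimal sketch (plain Python, using hypothetical fragment names frag_C and frag_A) of the declarations the two branches above emit:

    # Mirrors the format strings in allocate_array; the names are made up for illustration.
    acc_decl = ('wmma::fragment<wmma::accumulator, '
                '16, 16, 16, float> {};'.format('frag_C'))
    mat_decl = ('wmma::fragment<wmma::matrix_{mat}, '
                '16, 16, 16, half, wmma::{maj}_major> '
                '{name};'.format(mat='a', maj='row', name='frag_A'))
    print(acc_decl)  # wmma::fragment<wmma::accumulator, 16, 16, 16, float> frag_C;
    print(mat_decl)  # wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> frag_A;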
Example #2
    def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        fname = sdfg.name
        params = sdfg.signature()

        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        self.generate_fileheader(sdfg, callsite_stream)

        callsite_stream.write(
            'void __program_%s_internal(%s)\n{\n' % (fname, params), sdfg)

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)
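A minimal sketch of the frame-code header text these writes produce, assuming a hypothetical SDFG named foo with signature 'int * __restrict__ A, int N':

    # Hypothetical name and signature; mirrors the strings written by generate_header.
    fname, params = 'foo', 'int * __restrict__ A, int N'
    print('/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n'
          '#include <dace/dace.h>\n')
    print('void __program_%s_internal(%s)\n{' % (fname, params))
    # -> void __program_foo_internal(int * __restrict__ A, int N)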
Example #3
File: rtl.py Project: sscholbe/dace
 def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                 state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                 edge: graph.MultiConnectorEdge,
                 function_stream: prettycode.CodeIOStream,
                 callsite_stream: prettycode.CodeIOStream):
     """
         Generate input/output memory copies from the array references to local variables (i.e. for the tasklet code).
     """
     if isinstance(edge.src, nodes.AccessNode) and isinstance(
             edge.dst, nodes.Tasklet):  # handle AccessNode->Tasklet
         if isinstance(dst_node.in_connectors[edge.dst_conn],
                       dtypes.pointer):  # pointer accessor
             line: str = "{} {} = &{}[0];".format(
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                 edge.src.data)
         elif isinstance(dst_node.in_connectors[edge.dst_conn],
                         dtypes.vector):  # vector accessor
             line: str = "{} {} = *({} *)(&{}[0]);".format(
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.src.data)
         else:  # scalar accessor
             line: str = "{}* {} = &{}[0];".format(
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                 edge.src.data)
     else:
         raise RuntimeError(
             "Not handling copy_memory case of type {} -> {}.".format(
                 type(edge.src), type(edge.dst)))
     # write accessor to file
     callsite_stream.write(line)
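To make the three branches concrete, here is a sketch of the accessor lines they produce for a hypothetical input connector 'a' reading from array 'A' (the ctype values are illustrative, not taken from a real SDFG):

    pointer_line = "{} {} = &{}[0];".format("int *", "a", "A")
    vector_line = "{} {} = *({} *)(&{}[0]);".format("vec4_int", "a", "vec4_int", "A")
    scalar_line = "{}* {} = &{}[0];".format("int", "a", "A")
    print(pointer_line)  # int * a = &A[0];
    print(vector_line)   # vec4_int a = *(vec4_int *)(&A[0]);
    print(scalar_line)   # int* a = &A[0];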
Example #4
File: rtl.py Project: sscholbe/dace
 def define_out_memlet(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                       state_id: int, src_node: nodes.Node,
                       dst_node: nodes.Node, edge: graph.MultiConnectorEdge,
                       function_stream: prettycode.CodeIOStream,
                       callsite_stream: prettycode.CodeIOStream):
     """
         Generate output copy code (handled within the rtl tasklet code).
     """
     if isinstance(edge.src, nodes.Tasklet) and isinstance(
             edge.dst, nodes.AccessNode):
         if isinstance(src_node.out_connectors[edge.src_conn],
                       dtypes.pointer):  # pointer accessor
             line: str = "{} {} = &{}[0];".format(
                 src_node.out_connectors[edge.src_conn].ctype,
                 edge.src_conn, edge.dst.data)
         elif isinstance(src_node.out_connectors[edge.src_conn],
                         dtypes.vector):  # vector accessor
             line: str = "{} {} = *({} *)(&{}[0]);".format(
                 src_node.out_connectors[edge.src_conn].ctype,
                 edge.src_conn,
                 src_node.out_connectors[edge.src_conn].ctype,
                 edge.dst.data)
         else:  # scalar accessor
             line: str = "{}* {} = &{}[0];".format(
                 src_node.out_connectors[edge.src_conn].ctype,
                 edge.src_conn, edge.dst.data)
     else:
         raise RuntimeError(
             "Not handling define_out_memlet case of type {} -> {}.".format(
                 type(edge.src), type(edge.dst)))
     # write accessor to file
     callsite_stream.write(line)
Example #5
    def on_tend(self,
                timer_name: str,
                stream: CodeIOStream,
                sdfg=None,
                state=None,
                node=None):
        idstr = self._idstr(sdfg, state, node)

        state_id = -1
        node_id = -1
        if state is not None:
            state_id = sdfg.node_id(state)
            if node is not None:
                node_id = state.node_id(node)

        stream.write(
            '''auto __dace_tend_{id} = std::chrono::high_resolution_clock::now();
unsigned long int __dace_ts_start_{id} = std::chrono::duration_cast<std::chrono::microseconds>(__dace_tbegin_{id}.time_since_epoch()).count();
unsigned long int __dace_ts_end_{id} = std::chrono::duration_cast<std::chrono::microseconds>(__dace_tend_{id}.time_since_epoch()).count();
__state->report.add_completion("{timer_name}", "Timer", __dace_ts_start_{id}, __dace_ts_end_{id}, {sdfg_id}, {state_id}, {node_id});'''
            .format(timer_name=timer_name,
                    id=idstr,
                    sdfg_id=sdfg.sdfg_id,
                    state_id=state_id,
                    node_id=node_id))
Example #6
    def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                    edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                    callsite_stream: CodeIOStream) -> None:

        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        nc = not is_write_conflicted(dfg, edge)
        desc = edge.src.desc(sdfg)
        if not nc or not isinstance(desc.dtype, (dtypes.pointer, dtypes.vector)):
            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality
            wcr = self.cpu_codegen.write_and_resolve_expr(sdfg, edge.data, not nc, None, '@', dtype=desc.dtype)
            callsite_stream.write(wcr[:wcr.find('@')] + util.REDUCTION_TYPE_TO_SVE[reduction_type] +
                                  f'(svptrue_{util.TYPE_TO_SVE_SUFFIX[desc.dtype]}(), ' + src_node.label +
                                  wcr[wcr.find('@') + 1:] + ');')
            return
        else:
            ######################
            # Horizontal non-atomic reduction
            raise NotImplementedError()

        return super().copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream, callsite_stream)
Example #7
    def allocate_array(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                       node: nodes.Node, nodedesc: data.Data,
                       global_stream: CodeIOStream,
                       declaration_stream: CodeIOStream,
                       allocation_stream: CodeIOStream) -> None:
        if nodedesc.storage == dtypes.StorageType.SVE_Register:
            sve_type = util.TYPE_TO_SVE[nodedesc.dtype]
            self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                             sve_type)
            return

        if util.get_sve_scope(sdfg, dfg, node) is not None and isinstance(
                nodedesc, data.Scalar) and isinstance(nodedesc.dtype,
                                                      dtypes.vector):
            # Special allocation for a vector Code->Code register in an SVE scope:
            # we prevent dace::vec<>'s and allocate SVE registers instead
            if self.dispatcher.defined_vars.has(node.data):
                sve_type = util.TYPE_TO_SVE[nodedesc.dtype.vtype]
                self.dispatcher.defined_vars.add(node.data, DefinedType.Scalar,
                                                 sve_type)
                declaration_stream.write(f'{sve_type} {node.data};')
            return

        self.cpu_codegen.allocate_array(sdfg, dfg, state_id, node, nodedesc,
                                        global_stream, declaration_stream,
                                        allocation_stream)
Example #8
    def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        # Write header required by environments
        for env in self.environments:
            self.statestruct.extend(env.state_fields)

        # Instrumentation preamble
        if len(self._dispatcher.instrumentation) > 1:
            self.statestruct.append('dace::perf::Report report;')
            # Reset the report if it is written on every invocation
            if config.Config.get_bool('instrumentation',
                                      'report_each_invocation'):
                callsite_stream.write('__state->report.reset();', sdfg)

        self.generate_fileheader(sdfg, global_stream, 'frame')
Example #9
    def on_tbegin(self, stream: CodeIOStream, sdfg=None, state=None,
                  node=None):
        idstr = self._idstr(sdfg, state, node)

        stream.write(
            'auto __dace_tbegin_%s = std::chrono::high_resolution_clock::now();'
            % idstr)
Example #10
 def define_out_memlet(self, sdfg: dace.SDFG, dfg: StateSubgraphView,
                       state_id: int, src_node: nodes.Node,
                       dst_node: nodes.Node, edge: MultiConnectorEdge,
                       function_stream: CodeIOStream,
                       callsite_stream: CodeIOStream):
     # Output memlets that are directed at WMMA fragments can use the "auto"
     # keyword for simplicity.
     callsite_stream.write(f'auto& {edge.src_conn} = {edge.data.data};')
Example #11
    def add_header(self, function_stream: CodeIOStream):
        if self.has_generated_header:
            return
        self.has_generated_header = True

        function_stream.write('#include <arm_sve.h>\n')

        # TODO: Find this automatically at compile time
        function_stream.write(f'#define {util.REGISTER_BYTE_SIZE} 64\n')
Example #12
    def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream,
                      global_stream: CodeIOStream,
                      codegen: 'DaCeCodeGenerator'):
        # Initialize serializer versioning object
        if sdfg.parent is None:
            self.codegen = codegen
            codegen.statestruct.append('dace::DataSerializer *serializer;')
            sdfg.append_init_code(
                f'__state->serializer = new dace::DataSerializer("");\n')

            # Add method that controls serializer input
            global_stream.write(self._generate_report_setter(sdfg))
Example #13
    def on_tend(self,
                timer_name: str,
                stream: CodeIOStream,
                sdfg=None,
                state=None,
                node=None):
        idstr = self._idstr(sdfg, state, node)

        stream.write(
            '''auto __dace_tend_{id} = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> __dace_tdiff_{id} = __dace_tend_{id} - __dace_tbegin_{id};
dace::perf::report.add("timer_{timer_name}", __dace_tdiff_{id}.count());'''.
            format(timer_name=timer_name, id=idstr))
Example #14
    def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
        """
        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        for sd in sdfg.all_sdfgs_recursive():
            global_stream.write(sd.global_code, sd)
Example #15
    def _setup_gpu_runtime(self, sdfg: SDFG, global_stream: CodeIOStream):
        if self.gpu_runtime_init:
            return
        self.gpu_runtime_init = True
        self.backend = config.Config.get('compiler', 'cuda', 'backend')
        if self.backend == 'cuda':
            header_name = 'cuda_runtime.h'
        elif self.backend == 'hip':
            header_name = 'hip/hip_runtime.h'
        else:
            raise NameError('GPU backend "%s" not recognized' % self.backend)

        global_stream.write('#include <%s>' % header_name)

        # For other file headers
        sdfg.append_global_code('\n#include <%s>' % header_name, None)
Example #16
 def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
     # Write constants
     for cstname, (csttype, cstval) in sdfg.constants_prop.items():
         if isinstance(csttype, data.Array):
             const_str = "constexpr " + csttype.dtype.ctype + \
                 " " + cstname + "[" + str(cstval.size) + "] = {"
             it = np.nditer(cstval, order='C')
             for i in range(cstval.size - 1):
                 const_str += str(it[0]) + ", "
                 it.iternext()
             const_str += str(it[0]) + "};\n"
             callsite_stream.write(const_str, sdfg)
         else:
             callsite_stream.write(
                 "constexpr %s %s = %s;\n" %
                 (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg)
Example #17
 def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
     # Write constants
     for cstname, cstval in sdfg.constants.items():
         if isinstance(cstval, np.ndarray):
             dtype = dtypes.typeclass(cstval.dtype.type)
             const_str = "constexpr " + dtype.ctype + \
                 " " + cstname + "[" + str(cstval.size) + "] = {"
             it = np.nditer(cstval, order='C')
             for i in range(cstval.size - 1):
                 const_str += str(it[0]) + ", "
                 it.iternext()
             const_str += str(it[0]) + "};\n"
             callsite_stream.write(const_str, sdfg)
         else:
             callsite_stream.write(
                 "constexpr auto %s = %s;\n" % (cstname, str(cstval)), sdfg)
Example #18
    def generate_host_function_body(self, sdfg: dace.SDFG,
                                    state: dace.SDFGState, kernel_name: str,
                                    predecessors: list, parameters: list,
                                    rtl_tasklet_names: list,
                                    kernel_stream: CodeIOStream,
                                    instrumentation_stream: CodeIOStream):
        '''
        Generate the host-specific code for spawning and synchronizing the given kernel.
        :param sdfg: The SDFG containing the kernel.
        :param state: The SDFG state containing the kernel.
        :param predecessors: List containing the names of all kernels that must finish before this one starts.
        :param parameters: List containing the kernel parameters (of all kernels in this state).
        :param rtl_tasklet_names: Names of the RTL tasklets launched as separate kernels, which are joined at the end.
        :param kernel_stream: Device-specific code stream.
        :param instrumentation_stream: Code for profiling kernel execution time.
        '''

        kernel_args = []
        for _, name, p, interface_ids in parameters:
            if isinstance(p, dt.Array):
                for bank, _ in fpga.iterate_hbm_interface_ids(
                        p, interface_ids):
                    kernel_args.append(
                        p.as_arg(False,
                                 name=fpga.fpga_ptr(name, p, sdfg, bank)))
            else:
                kernel_args.append(p.as_arg(False, name=name))

        kernel_function_name = kernel_name
        kernel_file_name = "{}.xclbin".format(kernel_name)

        # Check if this kernel depends on other kernels
        needs_synch = len(predecessors) > 0

        if needs_synch:
            # Build a vector containing all the events associated with the kernels on which this one depends
            kernel_deps_name = f"deps_{kernel_name}"
            kernel_stream.write(f"std::vector<cl::Event> {kernel_deps_name};")
            for pred in predecessors:
                # concatenate events from predecessor kernel
                kernel_stream.write(
                    f"{kernel_deps_name}.push_back({pred}_event);")

        # Launch HLS kernel, passing synchronization events (if any)
        kernel_stream.write(
            f"""\
  auto {kernel_name}_kernel = program.MakeKernel({kernel_function_name}, "{kernel_function_name}", {", ".join(kernel_args)});
  cl::Event {kernel_name}_event = {kernel_name}_kernel.ExecuteTaskFork({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''});
  all_events.push_back({kernel_name}_event);""", sdfg, sdfg.node_id(state))
        if state.instrument == dtypes.InstrumentationType.FPGA:
            self.instrument_opencl_kernel(kernel_name, sdfg.node_id(state),
                                          sdfg.sdfg_id, instrumentation_stream)

        # Join RTL tasklets
        for name in rtl_tasklet_names:
            kernel_stream.write(f"kernel_{name}.wait();\n", sdfg,
                                sdfg.node_id(state))
Example #19
    def generate_out_register(self,
                              sdfg: SDFG,
                              state: SDFGState,
                              edge: graph.MultiConnectorEdge[mm.Memlet],
                              code: CodeIOStream,
                              use_data_name: bool = False) -> bool:
        """
            Responsible for generating temporary out registers in a Tasklet, given an outgoing edge.
            Returns `True` if a writeback of this register is needed.
        """
        if edge.src_conn is None:
            return

        dst_node = state.memlet_path(edge)[-1].dst

        src_type = edge.src.out_connectors[edge.src_conn]
        src_name = edge.src_conn

        if use_data_name:
            src_name = edge.data.data

        if isinstance(dst_node, nodes.AccessNode) and isinstance(
                dst_node.desc(sdfg), data.Stream):
            # Streams don't need writeback and are treated differently
            self.stream_associations[edge.src_conn] = (edge.data.data,
                                                       src_type.base_type)
            return False
        elif edge.data.wcr is not None:
            # WCR is addressed within the unparser to capture conditionals
            self.wcr_associations[edge.src_conn] = (dst_node, edge,
                                                    src_type.base_type)
            return False

        # Create temporary registers
        ctype = None
        if util.is_vector(src_type):
            ctype = util.TYPE_TO_SVE[src_type.type]
        elif util.is_scalar(src_type):
            ctype = src_type.ctype
        else:
            raise util.NotSupportedError(
                'Unsupported Code->Code edge (pointer)')

        self.dispatcher.defined_vars.add(src_name, DefinedType.Scalar, ctype)
        code.write(f'{ctype} {src_name};')

        return True
Example #20
    def create_empty_definition(self,
                                conn: dace.typeclass,
                                edge: gr.MultiConnectorEdge[mm.Memlet],
                                callsite_stream: CodeIOStream,
                                output: bool = False,
                                is_code_code: bool = False):
        """ Creates a simple variable definition `type name;`, which works for both vectors and regular data types. """

        var_name = None
        var_type = None
        var_ctype = None

        if output:
            var_name = edge.dst_conn
        else:
            var_name = edge.src_conn

        if is_code_code:
            # For edges between Tasklets (Code->Code), we use the data as name because these registers are temporary and shared
            var_name = edge.data.data

        if isinstance(conn, dtypes.vector):
            # Creates an SVE register

            if conn.type not in util.TYPE_TO_SVE:
                raise util.NotSupportedError('Data type not supported')

            # In case of a WCR, we must initialize the register with the identity value.
            # This prevents cases in a conditional WCR where no write occurs and the register would be filled with garbage.
            # Currently, the initial value is 0, because product reduction is not supported in SVE.
            init_str = ''
            if edge.data.wcr:
                init_str = ' = {}(0)'.format(util.instr('dup', type=conn.type))

            var_type = conn.type
            var_ctype = util.TYPE_TO_SVE[var_type]

            callsite_stream.write('{} {}{};'.format(var_ctype, var_name,
                                                    init_str))
        else:
            raise NotImplementedError(
                f'Output into scalar or pointer is not supported ({var_name})')

        self.dispatcher.defined_vars.add(var_name, var_type, var_ctype)
Example #21
 def copy_memory(self, sdfg: sdfg.SDFG, dfg: state.StateSubgraphView,
                 state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                 edge: graph.MultiConnectorEdge,
                 function_stream: prettycode.CodeIOStream,
                 callsite_stream: prettycode.CodeIOStream):
     """
         Generate input/output memory copies from the array references to local variables (i.e. for the tasklet code).
     """
     if isinstance(edge.src, nodes.AccessNode) and isinstance(
             edge.dst, nodes.Tasklet):  # handle AccessNode->Tasklet
         if isinstance(dst_node.in_connectors[edge.dst_conn],
                       dtypes.pointer):  # pointer accessor
             line: str = "{} {} = &{}[0];".format(
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                 edge.src.data)
         elif isinstance(dst_node.in_connectors[edge.dst_conn],
                         dtypes.vector):  # vector accessor
             line: str = "{} {} = *({} *)(&{}[0]);".format(
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.dst_conn,
                 dst_node.in_connectors[edge.dst_conn].ctype, edge.src.data)
         else:  # scalar accessor
             arr = sdfg.arrays[edge.data.data]
             if isinstance(arr, data.Array):
                 line: str = "{}* {} = &{}[0];".format(
                     dst_node.in_connectors[edge.dst_conn].ctype,
                     edge.dst_conn, edge.src.data)
             elif isinstance(arr, data.Scalar):
                 line: str = "{} {} = {};".format(
                     dst_node.in_connectors[edge.dst_conn].ctype,
                     edge.dst_conn, edge.src.data)
     elif isinstance(edge.src, nodes.MapEntry) and isinstance(
             edge.dst, nodes.Tasklet):
         rtl_name = self.unique_name(edge.dst, sdfg.nodes()[state_id], sdfg)
         self.n_unrolled[rtl_name] = symbolic.evaluate(
             edge.src.map.range[0][1] + 1, sdfg.constants)
         line: str = f'{dst_node.in_connectors[edge.dst_conn]} {edge.dst_conn} = &{edge.data.data}[{edge.src.map.params[0]}*{edge.data.volume}];'
     else:
         raise RuntimeError(
             "Not handling copy_memory case of type {} -> {}.".format(
                 type(edge.src), type(edge.dst)))
     # write accessor to file
     callsite_stream.write(line)
Example #22
    def generate_node(self, sdfg: SDFG, state: SDFGState, state_id: int,
                      node: nodes.Node, function_stream: CodeIOStream,
                      callsite_stream: CodeIOStream):
        self.add_header(function_stream)

        if not isinstance(node, nodes.Tasklet):
            return

        scope = util.get_sve_scope(sdfg, state, node)

        # Reset the stream variable mappings
        self.stream_associations = dict()
        self.wcr_associations = dict()

        callsite_stream.write('{')
        self.dispatcher.defined_vars.enter_scope(node)

        ##################
        # Generate tasklet

        # Inputs
        for edge in state.in_edges(node):
            self.generate_read(sdfg, state, scope.map, edge, callsite_stream)

        requires_wb = []

        # Temporary output registers
        for edge in state.out_edges(node):
            if self.generate_out_register(sdfg, state, edge, callsite_stream):
                requires_wb.append(edge)

        # Tasklet code
        self.unparse_tasklet(sdfg, state, state_id, node, function_stream,
                             callsite_stream)

        # Writeback from temporary registers to memory
        for edge in requires_wb:
            self.generate_writeback(sdfg, state, scope, edge, callsite_stream)

        self.dispatcher.defined_vars.exit_scope(node)
        callsite_stream.write('}')
Example #23
    def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
                       state_id: int, function_stream: CodeIOStream,
                       callsite_stream: CodeIOStream):

        entry_node = scope.source_nodes()[0]

        index_list = []
        for begin, end, stride in entry_node.map.range:
            l = []
            while begin <= end:
                l.append(begin)
                begin += stride
            index_list.append(l)

        for indices in product(*index_list):
            callsite_stream.write('{')
            for param, index in zip(entry_node.map.params, indices):
                callsite_stream.write(f'auto {param} = {sym2cpp(index)};')
            self._dispatcher.dispatch_subgraph(sdfg,
                                               scope,
                                               state_id,
                                               function_stream,
                                               callsite_stream,
                                               skip_entry_node=True,
                                               skip_exit_node=True)
            callsite_stream.write('}')
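The range expansion and itertools.product drive the unrolling; a standalone sketch with a hypothetical 2-D map range (0..2 step 1) x (0..4 step 2):

    from itertools import product

    map_range = [(0, 2, 1), (0, 4, 2)]  # (begin, end, stride) per map dimension
    index_list = []
    for begin, end, stride in map_range:
        l = []
        while begin <= end:
            l.append(begin)
            begin += stride
        index_list.append(l)

    for indices in product(*index_list):
        print(indices)  # (0, 0), (0, 2), (0, 4), (1, 0), ... one unrolled block each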
Example #24
    def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
                       state_id: int, function_stream: CodeIOStream,
                       callsite_stream: CodeIOStream):

        entry_node: nd.MapEntry = scope.source_nodes()[0]
        index_list = []

        for begin, end, stride in entry_node.map.range:
            l = []
            while begin <= end:
                l.append(begin)
                begin += stride
            index_list.append(l)

        sdfgconsts = sdfg.constants_prop
        sdfg.constants_prop = copy.deepcopy(sdfg.constants_prop)

        mapsymboltypes = entry_node.new_symbols(sdfg, scope,
                                                [entry_node.map.params])
        for indices in product(*index_list):
            callsite_stream.write('{')
            nsdfg_unroll_info = None
            for param, index in zip(entry_node.map.params, indices):
                if nsdfg_unroll_info is None:
                    nsdfg_unroll_info = self.nsdfg_prepare_unroll(
                        scope, str(param), str(index))
                else:
                    self.nsdfg_prepare_unroll(scope, str(param), str(index))
                callsite_stream.write(
                    f"constexpr {mapsymboltypes[param]} {param} = "
                    f"{dace.codegen.targets.common.sym2cpp(index)};\n", sdfg)
                sdfg.add_constant(param, int(index))

            callsite_stream.write('{')
            self._dispatcher.dispatch_subgraph(
                sdfg,
                scope,
                state_id,
                function_stream,
                callsite_stream,
                skip_entry_node=True,
                skip_exit_node=True,
            )
            callsite_stream.write('}')
            callsite_stream.write('}')
            self.nsdfg_after_unroll(nsdfg_unroll_info)

        sdfg.constants_prop = sdfgconsts
Example #25
    def generate_header(self, sdfg: SDFG, used_environments: Set[str],
                        global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """

        environments = [
            dace.library.get_environment(env_name)
            for env_name in used_environments
        ]

        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        # Write header required by environments
        for env in environments:
            if len(env.headers) > 0:
                global_stream.write(
                    "\n".join("#include \"" + h + "\"" for h in env.headers),
                    sdfg)

        global_stream.write("\n", sdfg)

        self.generate_fileheader(sdfg, global_stream)

        # Instrumentation preamble
        if len(self._dispatcher.instrumentation) > 1:
            global_stream.write(
                'namespace dace { namespace perf { Report report; } }', sdfg)
            callsite_stream.write('dace::perf::report.reset();', sdfg)
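A small sketch of the environment include lines written in the loop above, for a hypothetical environment whose headers attribute is ['cublas_v2.h', 'cusparse.h']:

    headers = ['cublas_v2.h', 'cusparse.h']
    print("\n".join("#include \"" + h + "\"" for h in headers))
    # #include "cublas_v2.h"
    # #include "cusparse.h"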
Example #26
    def unparse_tasklet(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                        state_id: int, node: nodes.Node,
                        function_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        state_dfg: SDFGState = sdfg.nodes()[state_id]

        callsite_stream.write('\n///////////////////')
        callsite_stream.write(f'// Tasklet code ({node.label})')

        # Determine all defined symbols for the Unparser (for inference)

        # Constants and other defined symbols
        defined_symbols = state_dfg.symbols_defined_at(node)
        defined_symbols.update({
            k: v.dtype if hasattr(v, 'dtype') else dtypes.typeclass(type(v))
            for k, v in sdfg.constants.items()
        })

        # All memlets of that node
        memlets = {}
        for edge in state_dfg.all_edges(node):
            u, uconn, v, vconn, _ = edge
            if u == node and uconn in u.out_connectors:
                defined_symbols.update({uconn: u.out_connectors[uconn]})
            elif v == node and vconn in v.in_connectors:
                defined_symbols.update({vconn: v.in_connectors[vconn]})

        body = node.code.code
        for stmt in body:
            stmt = copy.deepcopy(stmt)
            result = StringIO()
            dace.codegen.targets.sve.unparse.SVEUnparser(
                sdfg, dfg, self.current_map, self.cpu_codegen,
                stmt, result, body, memlets,
                util.get_loop_predicate(sdfg, dfg, node), self.counter_type,
                defined_symbols, self.stream_associations,
                self.wcr_associations)
            callsite_stream.write(result.getvalue(), sdfg, state_id, node)

        callsite_stream.write('///////////////////\n\n')
Example #27
    def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.AccessNode,
                    outer_stream: CodeIOStream, inner_stream: CodeIOStream,
                    global_stream: CodeIOStream):
        from dace.codegen.dispatcher import DefinedType  # Avoid import loop

        if is_devicelevel_gpu(sdfg, state, node) or is_devicelevel_fpga(
                sdfg, state, node):
            # Only run on host code
            return

        desc = node.desc(sdfg)

        # Obtain a pointer for arrays and scalars
        ptrname = cpp.ptr(node.data, desc, sdfg, self.codegen)
        defined_type, _ = self.codegen.dispatcher.defined_vars.get(ptrname)
        if defined_type == DefinedType.Scalar:
            ptrname = '&' + ptrname

        # Create UUID
        state_id = sdfg.node_id(state)
        node_id = state.node_id(node)
        uuid = f'{sdfg.sdfg_id}_{state_id}_{node_id}'

        # Get optional pre/postamble for instrumenting device data
        preamble, postamble = '', ''
        if desc.storage == dtypes.StorageType.GPU_Global:
            self._setup_gpu_runtime(sdfg, global_stream)
            preamble, postamble, ptrname = self._generate_copy_to_host(
                node, desc, ptrname)

        # Encode runtime shape and strides
        shape = ', '.join(cpp.sym2cpp(s) for s in desc.shape)
        strides = ', '.join(cpp.sym2cpp(s) for s in desc.strides)

        # Write code
        inner_stream.write(preamble, sdfg, state_id, node_id)
        inner_stream.write(
            f'__state->serializer->save({ptrname}, {cpp.sym2cpp(desc.total_size - desc.start_offset)}, '
            f'"{node.data}", "{uuid}", {shape}, {strides});\n', sdfg, state_id,
            node_id)
        inner_stream.write(postamble, sdfg, state_id, node_id)
Example #28
    def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
        """
        #########################################################
        # Custom types
        types = set()
        # Types of this SDFG
        for sdfg, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                types.add(arr.dtype)

        # Emit unique definitions
        global_stream.write('\n')
        for typ in types:
            if hasattr(typ, 'emit_definition'):
                global_stream.write(typ.emit_definition(), sdfg)
        global_stream.write('\n')

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)
Example #29
    def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState,
                                 kernel_name: str, predecessors: list,
                                 subgraphs: list, kernel_stream: CodeIOStream,
                                 state_host_header_stream: CodeIOStream,
                                 state_host_body_stream: CodeIOStream,
                                 instrumentation_stream: CodeIOStream,
                                 function_stream: CodeIOStream,
                                 callsite_stream: CodeIOStream,
                                 state_parameters: list):
        '''
        Generates Kernel code, both device and host side.
        :param sdfg:
        :param state:
        :param kernel_name:
        :param predecessors: List containing the names of all kernels on which this one depends.
        :param subgraphs:
        :param kernel_stream: Device code stream, contains the kernel code
        :param state_host_header_stream: Device-specific code stream: contains the host code
            for the state global declarations.
        :param state_host_body_stream: Device-specific code stream: contains all the code related to
            this state, for creating transient buffers, spawning kernels, and synchronizing them.
        :param instrumentation_stream: Code for profiling kernel execution time.
        :param function_stream: CPU code stream.
        :param callsite_stream: CPU code stream.
        :param state_parameters: list of state parameters. The kernel-specific parameters will be appended to it.
        '''

        (global_data_parameters, top_level_local_data, subgraph_parameters,
         nested_global_transients, bank_assignments,
         external_streams) = self.make_parameters(sdfg, state, subgraphs)

        state_parameters.extend(global_data_parameters)

        # Detect RTL tasklets, which will be launched as individual kernels
        rtl_tasklet_names = [
            self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes()
            if isinstance(nd, nodes.RTLTasklet)
        ]

        # Generate host code
        self.generate_host_header(sdfg, kernel_name, global_data_parameters,
                                  state_host_header_stream)
        self.generate_host_function_boilerplate(sdfg, state,
                                                nested_global_transients,
                                                state_host_body_stream)

        # Now we write the device code
        module_stream = CodeIOStream()
        entry_stream = CodeIOStream()

        state_id = sdfg.node_id(state)

        self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name,
                                             global_data_parameters,
                                             bank_assignments, module_stream,
                                             entry_stream, external_streams)

        # Emit allocations
        for node in top_level_local_data:
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               node.desc(sdfg), module_stream,
                                               entry_stream)
        for is_output, name, node, _ in external_streams:
            self._dispatcher.defined_vars.add_global(name, DefinedType.Stream,
                                                     node.ctype)
            if name not in self._stream_connections:
                self._stream_connections[name] = [None, None]
            key = 0 if is_output else 1
            val = '{}_1.{}'.format(kernel_name, name)
            self._stream_connections[name][key] = val

        self.generate_modules(sdfg, state, kernel_name, subgraphs,
                              subgraph_parameters, module_stream, entry_stream,
                              state_host_body_stream, instrumentation_stream)

        self.generate_host_function_body(sdfg, state, kernel_name,
                                         predecessors, global_data_parameters,
                                         rtl_tasklet_names,
                                         state_host_body_stream,
                                         instrumentation_stream)

        # Store code to be passed to compilation phase
        # self._host_codes.append((kernel_name, host_code_stream.getvalue()))
        kernel_stream.write(module_stream.getvalue())
        kernel_stream.write(entry_stream.getvalue())

        self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
Example #30
    def get_generated_codeobjects(self):

        execution_mode = Config.get("compiler", "xilinx", "mode")

        kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
        if execution_mode == "software_emulation":
            kernel_file_name += "_sw_emu.xclbin\""
            xcl_emulation_mode = "\"sw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware_emulation":
            kernel_file_name += "_hw_emu.xclbin\""
            xcl_emulation_mode = "\"hw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware" or execution_mode == "simulation":
            kernel_file_name += "_hw.xclbin\""
            xcl_emulation_mode = None
            xilinx_sdx = None
        else:
            raise cgx.CodegenError(
                "Unknown Xilinx execution mode: {}".format(execution_mode))

        set_env_vars = ""
        set_str = "dace::set_environment_variable(\"{}\", {});\n"
        unset_str = "dace::unset_environment_variable(\"{}\");\n"
        set_env_vars += (set_str.format("XCL_EMULATION_MODE",
                                        xcl_emulation_mode)
                         if xcl_emulation_mode is not None else
                         unset_str.format("XCL_EMULATION_MODE"))
        set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx
                         is not None else unset_str.format("XILINX_SDX"))
        set_env_vars += set_str.format(
            "EMCONFIG_PATH", "DACE_BINARY_DIR"
        ) if execution_mode == 'hardware_emulation' else unset_str.format(
            "EMCONFIG_PATH")

        host_code = CodeIOStream()
        host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
""")
        if len(self._dispatcher.instrumentation) > 1:
            host_code.write("""\
#include "dace/perf/reporting.h"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
""")
        host_code.write("\n\n")

        self._frame.generate_fileheader(self._global_sdfg, host_code,
                                        'xilinx_host')

        params_comma = self._global_sdfg.signature(with_arrays=False)
        if params_comma:
            params_comma = ', ' + params_comma

        host_code.write("""
DACE_EXPORTED int __dace_init_xilinx({sdfg.name}_t *__state{signature}) {{
    {environment_variables}

    __state->fpga_context = new dace::fpga::Context();
    __state->fpga_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{
    delete __state->fpga_context;
}}

{host_code}""".format(signature=params_comma,
                      sdfg=self._global_sdfg,
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(
                              separator="/" * 79, kernel_name=name, code=code)
                          for (name, code) in self._host_codes
                      ])))

        host_code_obj = CodeObject(self._program_name,
                                   host_code.getvalue(),
                                   "cpp",
                                   XilinxCodeGen,
                                   "Xilinx",
                                   target_type="host")

        kernel_code_objs = [
            CodeObject(kernel_name,
                       code,
                       "cpp",
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device")
            for (kernel_name, code) in self._kernel_codes
        ]

        # Memory bank and streaming interfaces connectivity configuration file
        link_cfg = CodeIOStream()
        self._other_codes["link.cfg"] = link_cfg
        link_cfg.write("[connectivity]")
        are_assigned = [v is not None for v in self._bank_assignments.values()]
        if any(are_assigned):
            if not all(are_assigned):
                raise RuntimeError("Some, but not all global memory arrays "
                                   "were assigned to memory banks: {}".format(
                                       self._bank_assignments))
            # Emit mapping from kernel memory interfaces to DRAM banks
            for (kernel_name, interface_name), (
                    memory_type,
                    memory_bank) in self._bank_assignments.items():
                link_cfg.write(
                    f"sp={kernel_name}_1.m_axi_{interface_name}:{memory_type}[{memory_bank}]"
                )
        # Emit mapping between inter-kernel streaming interfaces
        for _, (src, dst) in self._stream_connections.items():
            link_cfg.write(f"stream_connect={src}:{dst}")

        other_objs = []
        for name, code in self._other_codes.items():
            name = name.split(".")
            other_objs.append(
                CodeObject(name[0],
                           code.getvalue(),
                           ".".join(name[1:]),
                           XilinxCodeGen,
                           "Xilinx",
                           target_type="device"))

        return [host_code_obj] + kernel_code_objs + other_objs
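For illustration, a sketch of the link.cfg contents the code above would emit for one hypothetical kernel 'k0' whose interface 'gmem0' is assigned to DDR bank 0, plus one hypothetical stream connection:

    kernel_name, interface_name, memory_type, memory_bank = 'k0', 'gmem0', 'DDR', 0
    print("[connectivity]")
    print(f"sp={kernel_name}_1.m_axi_{interface_name}:{memory_type}[{memory_bank}]")
    src, dst = 'k0_1.out_stream', 'k1_1.in_stream'
    print(f"stream_connect={src}:{dst}")
    # [connectivity]
    # sp=k0_1.m_axi_gmem0:DDR[0]
    # stream_connect=k0_1.out_stream:k1_1.in_stream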