Example #1
0
    def generate_kernel_internal(self, sdfg, state, kernel_name, subgraphs,
                                 kernel_stream, function_stream,
                                 callsite_stream):
        """Main entry function for generating a Xilinx kernel."""

        (global_data_parameters, top_level_local_data, subgraph_parameters,
         scalar_parameters, symbol_parameters,
         nested_global_transients) = self.make_parameters(
             sdfg, state, subgraphs)

        # Scalar parameters are never output
        sc_parameters = [(False, pname, param)
                         for pname, param in scalar_parameters]

        host_code_stream = CodeIOStream()

        # Generate host code
        self.generate_host_header(sdfg, kernel_name,
                                  global_data_parameters + sc_parameters,
                                  symbol_parameters, host_code_stream)
        self.generate_host_function_boilerplate(
            sdfg, state, kernel_name, global_data_parameters + sc_parameters,
            symbol_parameters, nested_global_transients, host_code_stream,
            function_stream, callsite_stream)
        self.generate_host_function_body(
            sdfg, state, kernel_name, global_data_parameters + sc_parameters,
            symbol_parameters, host_code_stream)
        # Store code to be passed to compilation phase
        self._host_codes.append((kernel_name, host_code_stream.getvalue()))

        # Now we write the device code
        module_stream = CodeIOStream()
        entry_stream = CodeIOStream()

        state_id = sdfg.node_id(state)

        self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name,
                                             global_data_parameters,
                                             scalar_parameters,
                                             symbol_parameters, module_stream,
                                             entry_stream)

        # Emit allocations
        for node in top_level_local_data:
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               module_stream, entry_stream)
            self._dispatcher.dispatch_initialize(sdfg, state, state_id, node,
                                                 module_stream, entry_stream)

        self.generate_modules(sdfg, state, kernel_name, subgraphs,
                              subgraph_parameters, sc_parameters,
                              symbol_parameters, module_stream, entry_stream,
                              host_code_stream)

        kernel_stream.write(module_stream.getvalue())
        kernel_stream.write(entry_stream.getvalue())

        self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
Example #2
0
File: mpi.py Project: targetsm/dace
    def __init__(self, frame_codegen, sdfg):
        self._frame = frame_codegen
        self._dispatcher = frame_codegen.dispatcher
        dispatcher = self._dispatcher

        fileheader = CodeIOStream()
        self._frame.generate_fileheader(sdfg, fileheader)

        self._codeobj = CodeObject(
            sdfg.name + '_mpi', """
#include <dace/dace.h>
#include <mpi.h>

MPI_Comm __dace_mpi_comm;
int __dace_comm_size = 1;
int __dace_comm_rank = 0;

{file_header}

DACE_EXPORTED int __dace_init_mpi({params});
DACE_EXPORTED void __dace_exit_mpi({params});

int __dace_init_mpi({params}) {{
    int isinit = 0;
    if (MPI_Initialized(&isinit) != MPI_SUCCESS)
        return 1;
    if (!isinit) {{
        if (MPI_Init(NULL, NULL) != MPI_SUCCESS)
            return 1;
    }}

    MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm);
    MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank);
    MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size);

    printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
    return 0;
}}

void __dace_exit_mpi({params}) {{
    MPI_Comm_free(&__dace_mpi_comm);
    MPI_Finalize();

    printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
}}
""".format(params=sdfg.signature(), file_header=fileheader.getvalue()), 'cpp',
            MPICodeGen, 'MPI')

        # Register dispatchers
        dispatcher.register_map_dispatcher(dtypes.ScheduleType.MPI, self)
Example #3
0
    def get_generated_codeobjects(self):
        fileheader = CodeIOStream()
        sdfg = self._global_sdfg
        self._frame.generate_fileheader(sdfg, fileheader, 'mpi')

        params_comma = self._global_sdfg.init_signature(
            free_symbols=self._frame.free_symbols(self._global_sdfg))
        if params_comma:
            params_comma = ', ' + params_comma

        codeobj = CodeObject(
            sdfg.name + '_mpi', """
#include <dace/dace.h>
#include <mpi.h>

MPI_Comm __dace_mpi_comm;
int __dace_comm_size = 1;
int __dace_comm_rank = 0;

{file_header}

DACE_EXPORTED int __dace_init_mpi({sdfg.name}_t *__state{params});
DACE_EXPORTED void __dace_exit_mpi({sdfg.name}_t *__state);

int __dace_init_mpi({sdfg.name}_t *__state{params}) {{
    int isinit = 0;
    if (MPI_Initialized(&isinit) != MPI_SUCCESS)
        return 1;
    if (!isinit) {{
        if (MPI_Init(NULL, NULL) != MPI_SUCCESS)
            return 1;
    }}

    MPI_Comm_dup(MPI_COMM_WORLD, &__dace_mpi_comm);
    MPI_Comm_rank(__dace_mpi_comm, &__dace_comm_rank);
    MPI_Comm_size(__dace_mpi_comm, &__dace_comm_size);

    printf(\"MPI was initialized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
    return 0;
}}

void __dace_exit_mpi({sdfg.name}_t *__state) {{
    MPI_Comm_free(&__dace_mpi_comm);
    MPI_Finalize();

    printf(\"MPI was finalized on proc %i of %i\\n\", __dace_comm_rank,
           __dace_comm_size);
}}
""".format(params=params_comma, sdfg=sdfg, file_header=fileheader.getvalue()),
            'cpp', MPICodeGen, 'MPI')
        return [codeobj]
Example #4
0
    def dispatch_allocate(self, sdfg, dfg, state_id, node, function_stream,
                          allocation_stream):
        """ Dispatches a code generator for data allocation. """

        nodedesc = node.desc(sdfg)
        storage = (nodedesc.storage if not isinstance(node, nodes.Tasklet) else
                   dtypes.StorageType.Register)
        self._used_targets.add(self._array_dispatchers[storage])

        # TODO: Move to central allocator (see PR #434)
        if nodedesc.lifetime is dtypes.AllocationLifetime.Persistent:
            declaration_stream = CodeIOStream()
            allocation_stream = self.frame._initcode
        else:
            declaration_stream = allocation_stream

        self._array_dispatchers[storage].allocate_array(
            sdfg, dfg, state_id, node, function_stream, declaration_stream,
            allocation_stream)

        # TODO: Move to central allocator (see PR #434)
        if nodedesc.lifetime is dtypes.AllocationLifetime.Persistent:
            self.frame.statestruct.append(declaration_stream.getvalue())
Example #5
0
    def get_generated_codeobjects(self):

        execution_mode = Config.get("compiler", "xilinx", "mode")

        kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
        if execution_mode == "software_emulation":
            kernel_file_name += "_sw_emu.xclbin\""
            xcl_emulation_mode = "\"sw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware_emulation":
            kernel_file_name += "_hw_emu.xclbin\""
            xcl_emulation_mode = "\"hw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware" or execution_mode == "simulation":
            kernel_file_name += "_hw.xclbin\""
            xcl_emulation_mode = None
            xilinx_sdx = None
        else:
            raise dace.codegen.codegen.CodegenError(
                "Unknown Xilinx execution mode: {}".format(execution_mode))

        set_env_vars = ""
        set_str = "dace::set_environment_variable(\"{}\", {});\n"
        unset_str = "dace::unset_environment_variable(\"{}\");\n"
        set_env_vars += (set_str.format("XCL_EMULATION_MODE",
                                        xcl_emulation_mode)
                         if xcl_emulation_mode is not None else
                         unset_str.format("XCL_EMULATION_MODE"))
        set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx
                         is not None else unset_str.format("XILINX_SDX"))

        host_code = CodeIOStream()
        host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
#include <iostream>\n\n""")

        self._frame.generate_fileheader(self._global_sdfg, host_code)

        host_code.write("""
dace::fpga::Context *dace::fpga::_context;

DACE_EXPORTED int __dace_init_xilinx({signature}) {{
    {environment_variables}
    dace::fpga::_context = new dace::fpga::Context();
    dace::fpga::_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({signature}) {{
    delete dace::fpga::_context;
}}

{host_code}""".format(signature=self._global_sdfg.signature(),
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(separator="/" *
                                                               79,
                                                               kernel_name=name,
                                                               code=code)
                          for (name, code) in self._host_codes
                      ])))

        host_code_obj = CodeObject(self._program_name,
                                   host_code.getvalue(),
                                   "cpp",
                                   XilinxCodeGen,
                                   "Xilinx",
                                   target_type="host")

        kernel_code_objs = [
            CodeObject(kernel_name,
                       code,
                       "cpp",
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device")
            for (kernel_name, code) in self._kernel_codes
        ]

        # Configuration file with interface assignments
        are_assigned = [
            v is not None for v in self._interface_assignments.values()
        ]
        bank_assignment_code = []
        if any(are_assigned):
            if not all(are_assigned):
                raise RuntimeError("Some, but not all global memory arrays "
                                   "were assigned to memory banks: {}".format(
                                       self._interface_assignments))
            are_assigned = True
        else:
            are_assigned = False
        for name, _ in self._host_codes:
            # Only iterate over assignments if any exist
            if are_assigned:
                for (kernel_name, interface_name), (
                        memory_type,
                        memory_bank) in self._interface_assignments.items():
                    if kernel_name != name:
                        continue
                    bank_assignment_code.append("{},{},{}".format(
                        interface_name, memory_type.name, memory_bank))
            # Create file even if there are no assignments
            kernel_code_objs.append(
                CodeObject("{}_memory_interfaces".format(name),
                           "\n".join(bank_assignment_code),
                           "csv",
                           XilinxCodeGen,
                           "Xilinx",
                           target_type="device"))

        return [host_code_obj] + kernel_code_objs
Example #6
0
    def generate_module(self, sdfg, state, kernel_name, name, subgraph,
                        parameters, module_stream, entry_stream, host_stream,
                        instrumentation_stream):
        """Generates a module that will run as a dataflow function in the FPGA
           kernel."""

        state_id = sdfg.node_id(state)
        dfg = sdfg.nodes()[state_id]

        kernel_args_call = []
        kernel_args_module = []
        for is_output, pname, p, interface_ids in parameters:
            if isinstance(p, dt.Array):
                for bank, interface_id in fpga.iterate_hbm_interface_ids(
                        p, interface_ids):
                    arr_name = fpga.fpga_ptr(pname,
                                             p,
                                             sdfg,
                                             bank,
                                             is_output,
                                             is_array_interface=True)
                    # Add interface ID to called module, but not to the module
                    # arguments
                    argname = fpga.fpga_ptr(pname,
                                            p,
                                            sdfg,
                                            bank,
                                            is_output,
                                            is_array_interface=True,
                                            interface_id=interface_id)

                    kernel_args_call.append(argname)
                    dtype = p.dtype
                    kernel_args_module.append("{} {}*{}".format(
                        dtype.ctype, "const " if not is_output else "",
                        arr_name))
            else:
                if isinstance(p, dt.Stream):
                    kernel_args_call.append(
                        p.as_arg(with_types=False, name=pname))
                    if p.is_stream_array():
                        kernel_args_module.append(
                            "dace::FIFO<{}, {}, {}> {}[{}]".format(
                                p.dtype.base_type.ctype, p.veclen,
                                p.buffer_size, pname, p.size_string()))
                    else:
                        kernel_args_module.append(
                            "dace::FIFO<{}, {}, {}> &{}".format(
                                p.dtype.base_type.ctype, p.veclen,
                                p.buffer_size, pname))
                else:
                    kernel_args_call.append(
                        p.as_arg(with_types=False, name=pname))
                    kernel_args_module.append(
                        p.as_arg(with_types=True, name=pname))

        # Check if we are generating an RTL module, in which case only the
        # accesses to the streams should be handled
        rtl_tasklet = None
        for n in subgraph.nodes():
            if (isinstance(n, dace.nodes.Tasklet)
                    and n.language == dace.dtypes.Language.SystemVerilog):
                rtl_tasklet = n
                break
        if rtl_tasklet:
            entry_stream.write(
                f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});'
            )
            module_stream.write(
                f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n')

            # _1 in names are due to vitis
            for node in subgraph.source_nodes():
                if isinstance(sdfg.arrays[node.data], dt.Stream):
                    if node.data not in self._stream_connections:
                        self._stream_connections[node.data] = [None, None]
                    for edge in state.out_edges(node):
                        rtl_name = "{}_{}_{}_{}".format(
                            edge.dst, sdfg.sdfg_id, sdfg.node_id(state),
                            state.node_id(edge.dst))
                        self._stream_connections[
                            node.data][1] = '{}_top_1.s_axis_{}'.format(
                                rtl_name, edge.dst_conn)

            for node in subgraph.sink_nodes():
                if isinstance(sdfg.arrays[node.data], dt.Stream):
                    if node.data not in self._stream_connections:
                        self._stream_connections[node.data] = [None, None]
                    for edge in state.in_edges(node):
                        rtl_name = "{}_{}_{}_{}".format(
                            edge.src, sdfg.sdfg_id, sdfg.node_id(state),
                            state.node_id(edge.src))
                        self._stream_connections[
                            node.data][0] = '{}_top_1.m_axis_{}'.format(
                                rtl_name, edge.src_conn)

            # Make the dispatcher trigger generation of the RTL module, but
            # ignore the generated code, as the RTL codegen will generate the
            # appropriate files.
            ignore_stream = CodeIOStream()
            self._dispatcher.dispatch_subgraph(sdfg,
                                               subgraph,
                                               state_id,
                                               ignore_stream,
                                               ignore_stream,
                                               skip_entry_node=False)

            # Launch the kernel from the host code
            rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg)
            host_stream.write(
                f"  auto kernel_{rtl_name} = program.MakeKernel(\"{rtl_name}_top\"{', '.join([''] + [name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskFork();",
                sdfg, state_id, rtl_tasklet)
            if state.instrument == dtypes.InstrumentationType.FPGA:
                self.instrument_opencl_kernel(rtl_name, state_id, sdfg.sdfg_id,
                                              instrumentation_stream)

            return

        # create a unique module name to prevent name clashes
        module_function_name = f"module_{name}_{sdfg.sdfg_id}"

        # Unrolling processing elements: if there first scope of the subgraph
        # is an unrolled map, generate a processing element for each iteration
        scope_children = subgraph.scope_children()
        top_scopes = [
            n for n in scope_children[None]
            if isinstance(n, dace.sdfg.nodes.EntryNode)
        ]
        unrolled_loops = 0
        if len(top_scopes) == 1:
            scope = top_scopes[0]
            if scope.unroll:
                self._unrolled_pes.add(scope.map)
                kernel_args_call += ", ".join(scope.map.params)
                kernel_args_module += ["int " + p for p in scope.params]
                for p, r in zip(scope.map.params, scope.map.range):
                    if len(r) > 3:
                        raise cgx.CodegenError("Strided unroll not supported")
                    entry_stream.write(
                        "for (size_t {param} = {begin}; {param} < {end}; "
                        "{param} += {increment}) {{\n#pragma HLS UNROLL".
                        format(param=p,
                               begin=r[0],
                               end=r[1] + 1,
                               increment=r[2]))
                    unrolled_loops += 1

        # Generate caller code in top-level function
        entry_stream.write(
            "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
                module_function_name, ", ".join(kernel_args_call)), sdfg,
            state_id)

        for _ in range(unrolled_loops):
            entry_stream.write("}")

        # ----------------------------------------------------------------------
        # Generate kernel code
        # ----------------------------------------------------------------------

        self._dispatcher.defined_vars.enter_scope(subgraph)

        module_body_stream = CodeIOStream()

        module_body_stream.write(
            "void {}({}) {{".format(module_function_name,
                                    ", ".join(kernel_args_module)), sdfg,
            state_id)

        # Register the array interface as a naked pointer for use inside the
        # FPGA kernel
        interfaces_added = set()
        for is_output, argname, arg, interface_id in parameters:
            for bank, _ in fpga.iterate_hbm_interface_ids(arg, interface_id):
                if (not (isinstance(arg, dt.Array) and arg.storage
                         == dace.dtypes.StorageType.FPGA_Global)):
                    continue
                ctype = dtypes.pointer(arg.dtype).ctype
                ptr_name = fpga.fpga_ptr(argname,
                                         arg,
                                         sdfg,
                                         bank,
                                         is_output,
                                         None,
                                         is_array_interface=True)
                if not is_output:
                    ctype = f"const {ctype}"
                self._dispatcher.defined_vars.add(ptr_name,
                                                  DefinedType.Pointer, ctype)
                if argname in interfaces_added:
                    continue
                interfaces_added.add(argname)
                self._dispatcher.defined_vars.add(argname,
                                                  DefinedType.ArrayInterface,
                                                  ctype,
                                                  allow_shadowing=True)
        module_body_stream.write("\n")

        # Allocate local transients
        data_to_allocate = (set(subgraph.top_level_transients()) -
                            set(sdfg.shared_transients()) -
                            set([p[1] for p in parameters]))
        allocated = set()
        for node in subgraph.nodes():
            if not isinstance(node, dace.sdfg.nodes.AccessNode):
                continue
            if node.data not in data_to_allocate or node.data in allocated:
                continue
            allocated.add(node.data)
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               node.desc(sdfg), module_stream,
                                               module_body_stream)

        self._dispatcher.dispatch_subgraph(sdfg,
                                           subgraph,
                                           state_id,
                                           module_stream,
                                           module_body_stream,
                                           skip_entry_node=False)

        module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
        module_stream.write("}\n\n")

        self._dispatcher.defined_vars.exit_scope(subgraph)
Example #7
0
    def generate_code(
        self,
        sdfg: SDFG,
        schedule: Optional[dtypes.ScheduleType],
        sdfg_id: str = ""
    ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]:
        """ Generate frame code for a given SDFG, calling registered targets'
            code generation callbacks for them to generate their own code.
            :param sdfg: The SDFG to generate code for.
            :param schedule: The schedule the SDFG is currently located, or
                             None if the SDFG is top-level.
            :param sdfg_id: An optional string id given to the SDFG label
            :return: A tuple of the generated global frame code, local frame
                     code, and a set of targets that have been used in the
                     generation of this SDFG.
        """

        if len(sdfg_id) == 0 and sdfg.sdfg_id != 0:
            sdfg_id = '_%d' % sdfg.sdfg_id

        global_stream = CodeIOStream()
        callsite_stream = CodeIOStream()

        is_top_level = sdfg.parent is None

        # Analyze allocation lifetime of SDFG and all nested SDFGs
        if is_top_level:
            self.determine_allocation_lifetime(sdfg)

        # Generate code
        ###########################

        # Keep track of allocated variables
        allocated = set()

        # Add symbol mappings to allocated variables
        if sdfg.parent_nsdfg_node is not None:
            allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys()

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)

        # Allocate outer-level transients
        self.allocate_arrays_in_scope(sdfg, sdfg, global_stream,
                                      callsite_stream)

        # Allocate inter-state variables
        global_symbols = copy.deepcopy(sdfg.symbols)
        global_symbols.update(
            {aname: arr.dtype
             for aname, arr in sdfg.arrays.items()})
        interstate_symbols = {}
        for e in sdfg.edges():
            symbols = e.data.new_symbols(sdfg, global_symbols)
            # Inferred symbols only take precedence if global symbol not defined
            symbols = {
                k: v if k not in global_symbols else global_symbols[k]
                for k, v in symbols.items()
            }
            interstate_symbols.update(symbols)
            global_symbols.update(symbols)

        for isvarName, isvarType in interstate_symbols.items():
            isvar = data.Scalar(isvarType)
            callsite_stream.write(
                '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)),
                sdfg)
            self.dispatcher.defined_vars.add(isvarName,
                                             disp.DefinedType.Scalar,
                                             isvarType.ctype)

        callsite_stream.write('\n', sdfg)

        #######################################################################
        # Generate actual program body

        states_generated = self.generate_states(sdfg, global_stream,
                                                callsite_stream)

        #######################################################################

        # Sanity check
        if len(states_generated) != len(sdfg.nodes()):
            raise RuntimeError(
                "Not all states were generated in SDFG {}!"
                "\n  Generated: {}\n  Missing: {}".format(
                    sdfg.label, [s.label for s in states_generated],
                    [s.label for s in (set(sdfg.nodes()) - states_generated)]))

        # Deallocate transients
        self.deallocate_arrays_in_scope(sdfg, sdfg, global_stream,
                                        callsite_stream)

        # Now that we have all the information about dependencies, generate
        # header and footer
        if is_top_level:
            # Let each target append code to frame code state before generating
            # header and footer
            for target in self._dispatcher.used_targets:
                target.on_target_used()

            header_stream = CodeIOStream()
            header_global_stream = CodeIOStream()
            footer_stream = CodeIOStream()
            footer_global_stream = CodeIOStream()

            # Get all environments used in the generated code, including
            # dependent environments
            import dace.library  # Avoid import loops
            self.environments = dace.library.get_environments_and_dependencies(
                self._dispatcher.used_environments)

            self.generate_header(sdfg, header_global_stream, header_stream)

            # Open program function
            params = sdfg.signature()
            if params:
                params = ', ' + params
            function_signature = (
                'void __program_%s_internal(%s_t *__state%s)\n{\n' %
                (sdfg.name, sdfg.name, params))

            self.generate_footer(sdfg, footer_global_stream, footer_stream)

            header_global_stream.write(global_stream.getvalue())
            header_global_stream.write(footer_global_stream.getvalue())
            generated_header = header_global_stream.getvalue()

            all_code = CodeIOStream()
            all_code.write(function_signature)
            all_code.write(header_stream.getvalue())
            all_code.write(callsite_stream.getvalue())
            all_code.write(footer_stream.getvalue())
            generated_code = all_code.getvalue()
        else:
            generated_header = global_stream.getvalue()
            generated_code = callsite_stream.getvalue()

        # Clean up generated code
        gotos = re.findall(r'goto (.*?);', generated_code)
        clean_code = ''
        for line in generated_code.split('\n'):
            # Empty line with semicolon
            if re.match(r'^\s*;\s*', line):
                continue
            # Label that might be unused
            label = re.findall(
                r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line)
            if len(label) > 0:
                if label[0] not in gotos:
                    continue
            clean_code += line + '\n'

        # Return the generated global and local code strings
        return (generated_header, clean_code, self._dispatcher.used_targets,
                self._dispatcher.used_environments)
Example #8
0
    def generate_module(self, sdfg, state, name, subgraph, parameters,
                        symbol_parameters, module_stream, entry_stream,
                        host_stream):
        """Generates a module that will run as a dataflow function in the FPGA
           kernel."""

        state_id = sdfg.node_id(state)
        dfg = sdfg.nodes()[state_id]

        kernel_args_call = []
        kernel_args_module = []
        added = set()

        parameters = list(sorted(parameters, key=lambda t: t[1]))
        arrays = [
            p for p in parameters if not isinstance(p[2], dace.data.Scalar)
        ]
        scalars = [p for p in parameters if isinstance(p[2], dace.data.Scalar)]
        scalars += ((False, k, v) for k, v in symbol_parameters.items())
        scalars = dace.dtypes.deduplicate(sorted(scalars, key=lambda t: t[1]))
        for is_output, pname, p in itertools.chain(parameters, scalars):
            if isinstance(p, dace.data.Array):
                arr_name = "{}_{}".format(pname, "out" if is_output else "in")
                kernel_args_call.append(arr_name)
                dtype = p.dtype
                kernel_args_module.append("{} {}*{}".format(
                    dtype.ctype, "const " if not is_output else "", arr_name))
            else:
                # Don't make duplicate arguments for other types than arrays
                if pname in added:
                    continue
                added.add(pname)
                if isinstance(p, dace.data.Stream):
                    kernel_args_call.append(
                        p.as_arg(with_types=False, name=pname))
                    if p.is_stream_array():
                        kernel_args_module.append(
                            "dace::FIFO<{}, {}, {}> {}[{}]".format(
                                p.dtype.base_type.ctype, p.veclen,
                                p.buffer_size, pname, p.size_string()))
                    else:
                        kernel_args_module.append(
                            "dace::FIFO<{}, {}, {}> &{}".format(
                                p.dtype.base_type.ctype, p.veclen,
                                p.buffer_size, pname))
                else:
                    kernel_args_call.append(
                        p.as_arg(with_types=False, name=pname))
                    kernel_args_module.append(
                        p.as_arg(with_types=True, name=pname))
        module_function_name = "module_" + name
        # Unrolling processing elements: if there first scope of the subgraph
        # is an unrolled map, generate a processing element for each iteration
        scope_dict = subgraph.scope_dict(node_to_children=True)
        top_scopes = [
            n for n in scope_dict[None]
            if isinstance(n, dace.sdfg.nodes.EntryNode)
        ]
        unrolled_loops = 0
        if len(top_scopes) == 1:
            scope = top_scopes[0]
            if scope.unroll:
                self._unrolled_pes.add(scope.map)
                kernel_args_call += ", ".join(scope.map.params)
                kernel_args_module += ["int " + p for p in scope.params]
                for p, r in zip(scope.map.params, scope.map.range):
                    if len(r) > 3:
                        raise dace.codegen.codegen.CodegenError(
                            "Strided unroll not supported")
                    entry_stream.write(
                        "for (size_t {param} = {begin}; {param} < {end}; "
                        "{param} += {increment}) {{\n#pragma HLS UNROLL".format(
                            param=p, begin=r[0], end=r[1] + 1, increment=r[2]))
                    unrolled_loops += 1

        # Generate caller code in top-level function
        entry_stream.write(
            "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
                module_function_name, ", ".join(kernel_args_call)), sdfg,
            state_id)

        for _ in range(unrolled_loops):
            entry_stream.write("}")

        # ----------------------------------------------------------------------
        # Generate kernel code
        # ----------------------------------------------------------------------

        self._dispatcher.defined_vars.enter_scope(subgraph)

        module_body_stream = CodeIOStream()

        module_body_stream.write(
            "void {}({}) {{".format(module_function_name,
                                    ", ".join(kernel_args_module)), sdfg,
            state_id)

        # Construct ArrayInterface wrappers to pack input and output pointers
        # to the same global array
        in_args = {
            argname
            for out, argname, arg in parameters
            if isinstance(arg, dace.data.Array)
            and arg.storage == dace.dtypes.StorageType.FPGA_Global and not out
        }
        out_args = {
            argname
            for out, argname, arg in parameters
            if isinstance(arg, dace.data.Array)
            and arg.storage == dace.dtypes.StorageType.FPGA_Global and out
        }
        if len(in_args) > 0 or len(out_args) > 0:
            # Add ArrayInterface objects to wrap input and output pointers to
            # the same array
            module_body_stream.write("\n")
            interfaces_added = set()
            for _, argname, arg in parameters:
                if argname in interfaces_added:
                    continue
                interfaces_added.add(argname)
                has_in_ptr = argname in in_args
                has_out_ptr = argname in out_args
                if not has_in_ptr and not has_out_ptr:
                    continue
                in_ptr = ("{}_in".format(argname) if has_in_ptr else "nullptr")
                out_ptr = ("{}_out".format(argname)
                           if has_out_ptr else "nullptr")
                ctype = "dace::ArrayInterface<{}>".format(arg.dtype.ctype)
                module_body_stream.write("{} {}({}, {});".format(
                    ctype, argname, in_ptr, out_ptr))
                self._dispatcher.defined_vars.add(argname,
                                                  DefinedType.ArrayInterface,
                                                  ctype,
                                                  allow_shadowing=True)
            module_body_stream.write("\n")

        # Allocate local transients
        data_to_allocate = (set(subgraph.top_level_transients()) -
                            set(sdfg.shared_transients()) -
                            set([p[1] for p in parameters]))
        allocated = set()
        for node in subgraph.nodes():
            if not isinstance(node, dace.sdfg.nodes.AccessNode):
                continue
            if node.data not in data_to_allocate or node.data in allocated:
                continue
            allocated.add(node.data)
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               module_stream,
                                               module_body_stream)

        self._dispatcher.dispatch_subgraph(sdfg,
                                           subgraph,
                                           state_id,
                                           module_stream,
                                           module_body_stream,
                                           skip_entry_node=False)

        module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
        module_stream.write("}\n\n")

        self._dispatcher.defined_vars.exit_scope(subgraph)
Example #9
0
class DaCeCodeGenerator(object):
    """ DaCe code generator class that writes the generated code for SDFG
        state machines, and uses a dispatcher to generate code for
        individual states based on the target. """
    def __init__(self, *args, **kwargs):
        self._dispatcher = disp.TargetDispatcher(self)
        self._dispatcher.register_state_dispatcher(self)
        self._initcode = CodeIOStream()
        self._exitcode = CodeIOStream()
        self.statestruct: List[str] = []
        self.environments: List[Any] = []
        self.to_allocate: DefaultDict[
            Union[SDFG, SDFGState, nodes.EntryNode],
            List[Tuple[int, int,
                       nodes.AccessNode]]] = collections.defaultdict(list)

    ##################################################################
    # Target registry

    @property
    def dispatcher(self):
        return self._dispatcher

    ##################################################################
    # Code generation

    def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
        # Write constants
        for cstname, (csttype, cstval) in sdfg.constants_prop.items():
            if isinstance(csttype, data.Array):
                const_str = "constexpr " + csttype.dtype.ctype + \
                    " " + cstname + "[" + str(cstval.size) + "] = {"
                it = np.nditer(cstval, order='C')
                for i in range(cstval.size - 1):
                    const_str += str(it[0]) + ", "
                    it.iternext()
                const_str += str(it[0]) + "};\n"
                callsite_stream.write(const_str, sdfg)
            else:
                callsite_stream.write(
                    "constexpr %s %s = %s;\n" %
                    (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg)

    def generate_fileheader(self,
                            sdfg: SDFG,
                            global_stream: CodeIOStream,
                            backend: str = 'frame'):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param backend: Whose backend this header belongs to.
        """
        # Hash file include
        if backend == 'frame':
            global_stream.write('#include "../../include/hash.h"\n', sdfg)

        #########################################################
        # Environment-based includes
        for env in self.environments:
            if len(env.headers) > 0:
                global_stream.write(
                    "\n".join("#include \"" + h + "\"" for h in env.headers),
                    sdfg)

        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        #########################################################
        # Write state struct
        structstr = '\n'.join(self.statestruct)
        global_stream.write(
            f'''
struct {sdfg.name}_t {{
    {structstr}
}};

''', sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd)
            if backend in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[backend]),
                                    sd)

    def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        # Write header required by environments
        for env in self.environments:
            self.statestruct.extend(env.state_fields)

        # Instrumentation preamble
        if len(self._dispatcher.instrumentation) > 1:
            self.statestruct.append('dace::perf::Report report;')
            # Reset report if written every invocation
            if config.Config.get_bool('instrumentation',
                                      'report_each_invocation'):
                callsite_stream.write('__state->report.reset();', sdfg)

        self.generate_fileheader(sdfg, global_stream, 'frame')

    def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the footer of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        import dace.library
        fname = sdfg.name
        params = sdfg.signature()
        paramnames = sdfg.signature(False, for_call=True)
        initparams = sdfg.signature(with_arrays=False)
        initparamnames = sdfg.signature(False,
                                        for_call=True,
                                        with_arrays=False)

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

        # Instrumentation saving
        if (config.Config.get_bool('instrumentation', 'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '''__state->report.save("{path}/perf", __HASH_{name});'''.
                format(path=sdfg.build_folder.replace('\\', '/'),
                       name=sdfg.name), sdfg)

        # Write closing brace of program
        callsite_stream.write('}', sdfg)

        # Write awkward footer to avoid 'extern "C"' issues
        params_comma = (', ' + params) if params else ''
        initparams_comma = (', ' + initparams) if initparams else ''
        paramnames_comma = (', ' + paramnames) if paramnames else ''
        initparamnames_comma = (', ' +
                                initparamnames) if initparamnames else ''
        callsite_stream.write(
            f'''
DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma})
{{
    __program_{fname}_internal(__state{paramnames_comma});
}}''', sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' %
                    (target.target_name, sdfg.name, initparams_comma), sdfg)
            if target.has_finalizer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' %
                    (target.target_name, sdfg.name), sdfg)

        callsite_stream.write(
            f"""
DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams})
{{
    int __result = 0;
    {sdfg.name}_t *__state = new {sdfg.name}_t;

            """, sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    '__result |= __dace_init_%s(__state%s);' %
                    (target.target_name, initparamnames_comma), sdfg)
        for env in self.environments:
            init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg)
            if init_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(init_code)
                callsite_stream.write("}")

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.init_code:
                callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

        callsite_stream.write(self._initcode.getvalue(), sdfg)

        callsite_stream.write(
            f"""
    if (__result) {{
        delete __state;
        return nullptr;
    }}
    return __state;
}}

DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state)
{{
""", sdfg)

        # Instrumentation saving
        if (not config.Config.get_bool('instrumentation',
                                       'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '__state->report.save("%s/perf", __HASH_%s);' %
                (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg)

        callsite_stream.write(self._exitcode.getvalue(), sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.exit_code:
                callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

        for target in self._dispatcher.used_targets:
            if target.has_finalizer:
                callsite_stream.write(
                    '__dace_exit_%s(__state);' % target.target_name, sdfg)
        for env in reversed(self.environments):
            finalize_code = _get_or_eval_sdfg_first_arg(
                env.finalize_code, sdfg)
            if finalize_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(finalize_code)
                callsite_stream.write("}")

        callsite_stream.write('delete __state;\n}\n', sdfg)

    def generate_state(self,
                       sdfg,
                       state,
                       global_stream,
                       callsite_stream,
                       generate_state_footer=True):

        sid = sdfg.node_id(state)

        # Emit internal transient array allocation
        self.allocate_arrays_in_scope(sdfg, state, global_stream,
                                      callsite_stream)

        callsite_stream.write('\n')

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_state_begin(sdfg, state, callsite_stream,
                                     global_stream)

        #####################
        # Create dataflow graph for state's children.

        # DFG to code scheme: Only generate code for nodes whose all
        # dependencies have been executed (topological sort).
        # For different connected components, run them concurrently.

        components = dace.sdfg.concurrent_subgraphs(state)

        if len(components) == 1:
            self._dispatcher.dispatch_subgraph(sdfg,
                                               state,
                                               sid,
                                               global_stream,
                                               callsite_stream,
                                               skip_entry_node=False)
        else:
            if sdfg.openmp_sections:
                callsite_stream.write("#pragma omp parallel sections\n{")
            for c in components:
                if sdfg.openmp_sections:
                    callsite_stream.write("#pragma omp section\n{")
                self._dispatcher.dispatch_subgraph(sdfg,
                                                   c,
                                                   sid,
                                                   global_stream,
                                                   callsite_stream,
                                                   skip_entry_node=False)
                if sdfg.openmp_sections:
                    callsite_stream.write("} // End omp section")
            if sdfg.openmp_sections:
                callsite_stream.write("} // End omp sections")

        #####################
        # Write state footer

        if generate_state_footer:
            # Emit internal transient array deallocation
            self.deallocate_arrays_in_scope(sdfg, state, global_stream,
                                            callsite_stream)

            # Invoke all instrumentation providers
            for instr in self._dispatcher.instrumentation.values():
                if instr is not None:
                    instr.on_state_end(sdfg, state, callsite_stream,
                                       global_stream)

    def generate_states(self, sdfg, global_stream, callsite_stream):
        states_generated = set()

        # Create closure + function for state dispatcher
        def dispatch_state(state: SDFGState) -> str:
            stream = CodeIOStream()
            self._dispatcher.dispatch_state(sdfg, state, global_stream, stream)
            states_generated.add(state)  # For sanity check
            return stream.getvalue()

        # Handle specialized control flow
        if config.Config.get_bool('optimizer', 'detect_control_flow'):
            # Avoid import loop
            from dace.transformation import helpers as xfh
            # Clean up the state machine by separating combined condition and assignment
            # edges.
            xfh.split_interstate_edges(sdfg)

            cft = cflow.structured_control_flow_tree(sdfg, dispatch_state)
        else:
            # If disabled, generate entire graph as general control flow block
            states_topological = list(sdfg.topological_sort(sdfg.start_state))
            last = states_topological[-1]
            cft = cflow.GeneralBlock(dispatch_state, [
                cflow.SingleState(dispatch_state, s, s is last)
                for s in states_topological
            ], [], [])

        callsite_stream.write(
            cft.as_cpp(self.dispatcher.defined_vars, sdfg.symbols), sdfg)

        # Write exit label
        callsite_stream.write(f'__state_exit_{sdfg.sdfg_id}:;', sdfg)

        return states_generated

    def _get_schedule(
            self, scope: Union[nodes.EntryNode, SDFGState,
                               SDFG]) -> dtypes.ScheduleType:
        TOP_SCHEDULE = dtypes.ScheduleType.Sequential
        if scope is None:
            return TOP_SCHEDULE
        elif isinstance(scope, nodes.EntryNode):
            return scope.schedule
        elif isinstance(scope, (SDFGState, SDFG)):
            sdfg: SDFG = (scope if isinstance(scope, SDFG) else scope.parent)
            if sdfg.parent_nsdfg_node is None:
                return TOP_SCHEDULE
            return (sdfg.parent_nsdfg_node.schedule or TOP_SCHEDULE)
        else:
            raise TypeError

    def _can_allocate(self, sdfg: SDFG, state: SDFGState, desc: data.Data,
                      scope: Union[nodes.EntryNode, SDFGState, SDFG]) -> bool:
        schedule = self._get_schedule(scope)
        # if not dtypes.can_allocate(desc.storage, schedule):
        #     return False
        if dtypes.can_allocate(desc.storage, schedule):
            return True

        # Check for device-level memory recursively
        node = scope if isinstance(scope, nodes.EntryNode) else None
        cstate = scope if isinstance(scope, SDFGState) else state
        csdfg = scope if isinstance(scope, SDFG) else sdfg

        if desc.storage in dtypes.FPGA_STORAGES:
            return sdscope.is_devicelevel_fpga(csdfg, cstate, node)
        elif desc.storage in dtypes.GPU_STORAGES:
            return sdscope.is_devicelevel_gpu(csdfg, cstate, node)

        return False

    def determine_allocation_lifetime(self, top_sdfg: SDFG):
        """
        Determines where (at which scope/state/SDFG) each data descriptor
        will be allocated/deallocated.
        :param top_sdfg: The top-level SDFG to determine for.
        """
        # Gather shared transients
        shared_transients = {}
        for sdfg in top_sdfg.all_sdfgs_recursive():
            shared_transients[sdfg.sdfg_id] = sdfg.shared_transients(
                check_toplevel=False)

        for sdfg, name, desc in top_sdfg.arrays_recursive():
            if not desc.transient:
                continue

            # NOTE: In the code below we infer where a transient should be
            # declared, allocated, and deallocated. The information is stored
            # in the `to_allocate` dictionary. The key of each entry is the
            # scope where one of the above actions must occur, while the value
            # is a tuple containing the following information:
            # 1. The SDFG object that containts the transient.
            # 2. The State id where the action should (approx.) take place.
            # 3. The Access Node id of the transient in the above State.
            # 4. True if declaration should take place, otherwise False.
            # 5. True if allocation should take place, otherwise False.
            # 6. True if deallocation should take place, otherwise False.

            # Possibly confusing control flow below finds the first state
            # and node of the data descriptor, or continues the
            # arrays_recursive() loop
            first_state_instance: int = None
            first_node_instance: nodes.AccessNode = None
            last_state_instance: int = None
            last_node_instance: nodes.AccessNode = None
            first = True
            for state in sdfg.topological_sort():
                id = sdfg.nodes().index(state)
                for node in state.data_nodes():
                    if node.data == name:
                        if first:
                            first_state_instance = id
                            first_node_instance = node
                            first = False
                        last_state_instance = id
                        last_node_instance = node
                        # break
                else:
                    continue
                break

            # Cases
            if desc.lifetime is dtypes.AllocationLifetime.Persistent:
                # Persistent memory is allocated in initialization code and
                # exists in the library state structure

                # If unused, skip
                if first_node_instance is None:
                    continue

                definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
                self.statestruct.append(definition)

                self.to_allocate[sdfg].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))
                continue
            elif desc.lifetime is dtypes.AllocationLifetime.Global:
                # Global memory is allocated in the beginning of the program
                # exists in the library state structure (to be passed along
                # to the right SDFG)

                # If unused, skip
                if first_node_instance is None:
                    continue

                definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
                self.statestruct.append(definition)

                # self.to_allocate[top_sdfg].append(
                #     (sdfg.sdfg_id, sdfg.node_id(state), node))
                self.to_allocate[top_sdfg].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))
                continue

            # The rest of the cases change the starting scope we attempt to
            # allocate from, since the descriptors may only be allocated higher
            # in the hierarchy (e.g., in the case of GPU global memory inside
            # a kernel).
            alloc_scope: Union[nodes.EntryNode, SDFGState, SDFG] = None
            alloc_state: SDFGState = None
            access_node: nodes.AccessNode = None
            if (name in shared_transients[sdfg.sdfg_id]
                    or desc.lifetime is dtypes.AllocationLifetime.SDFG):
                # SDFG memory and shared transients are allocated in the
                # beginning of their SDFG
                alloc_scope = sdfg
                if first_state_instance is not None:
                    alloc_state = sdfg.nodes()[first_state_instance]
                # If unused, skip
                if first_node_instance is None:
                    continue
            elif desc.lifetime is dtypes.AllocationLifetime.State:
                # State memory is either allocated in the beginning of the
                # containing state or the SDFG (if used in more than one state)
                curstate: SDFGState = None
                multistate = False
                for state in sdfg.nodes():
                    if any(n.data == name for n in state.data_nodes()):
                        if curstate is not None:
                            multistate = True
                            break
                        curstate = state
                if multistate:
                    alloc_scope = sdfg
                else:
                    alloc_scope = curstate
                    alloc_state = curstate
            elif desc.lifetime is dtypes.AllocationLifetime.Scope:
                # Scope memory (default) is either allocated in the innermost
                # scope (e.g., Map, Consume) it is used in (i.e., greatest
                # common denominator), or in the SDFG if used in multiple states
                curscope: Union[nodes.EntryNode, SDFGState] = None
                curstate: SDFGState = None
                multistate = False

                # Does the array appear in inter-state edges?
                for isedge in sdfg.edges():
                    if name in isedge.data.free_symbols:
                        multistate = True

                for state in sdfg.nodes():
                    if multistate:
                        break
                    sdict = state.scope_dict()
                    for node in state.nodes():
                        if not isinstance(node, nodes.AccessNode):
                            continue
                        if node.data != name:
                            continue

                        # If already found in another state, set scope to SDFG
                        if curstate is not None and curstate != state:
                            multistate = True
                            break
                        curstate = state

                        # Current scope (or state object if top-level)
                        scope = sdict[node] or state
                        if curscope is None:
                            curscope = scope
                            continue
                        # States always win
                        if isinstance(scope, SDFGState):
                            curscope = scope
                            continue
                        # Lower/Higher/Disjoint scopes: find common denominator
                        if isinstance(curscope, SDFGState):
                            if scope in curscope.nodes():
                                continue
                        curscope = sdscope.common_parent_scope(
                            sdict, scope, curscope)

                    if multistate:
                        break

                if multistate:
                    alloc_scope = sdfg
                else:
                    alloc_scope = curscope
                    alloc_state = curstate
            else:
                raise TypeError('Unrecognized allocation lifetime "%s"' %
                                desc.lifetime)

            if alloc_scope is None:  # No allocation necessary
                continue

            # If descriptor cannot be allocated in this scope, traverse up the
            # scope tree until it is possible
            cursdfg = sdfg
            curstate = alloc_state
            curscope = alloc_scope
            while not self._can_allocate(cursdfg, curstate, desc, curscope):
                if curscope is None:
                    break
                if isinstance(curscope, nodes.EntryNode):
                    # Go one scope up
                    curscope = curstate.entry_node(curscope)
                    if curscope is None:
                        curscope = curstate
                elif isinstance(curscope, (SDFGState, SDFG)):
                    cursdfg: SDFG = (curscope if isinstance(curscope, SDFG)
                                     else curscope.parent)
                    # Go one SDFG up
                    if cursdfg.parent_nsdfg_node is None:
                        curscope = None
                        curstate = None
                    else:
                        curstate = cursdfg.parent
                        curscope = curstate.entry_node(
                            cursdfg.parent_nsdfg_node)
                else:
                    raise TypeError

            if curscope is None:
                curscope = top_sdfg

            # Check if Array/View is dependent on non-free SDFG symbols
            # NOTE: Tuple is (SDFG, State, Node, declare, allocate, deallocate)
            fsymbols = sdfg.free_symbols.union(sdfg.constants.keys())
            if (not isinstance(curscope, nodes.EntryNode)
                    and utils.is_nonfree_sym_dependent(
                        first_node_instance, desc, alloc_state, fsymbols)):
                # Declare in current (SDFG) scope
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     False, False))
                # Allocate in first State
                # Deallocate in last State
                if first_state_instance != last_state_instance:
                    curscope = sdfg.nodes()[first_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, first_state_instance, first_node_instance,
                         False, True, False))
                    curscope = sdfg.nodes()[last_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, last_state_instance, last_node_instance, False,
                         False, True))
                else:
                    curscope = sdfg.nodes()[first_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, first_state_instance, first_node_instance,
                         False, True, True))
            else:
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))

    def allocate_arrays_in_scope(self, sdfg: SDFG,
                                 scope: Union[nodes.EntryNode, SDFGState,
                                              SDFG],
                                 function_stream: CodeIOStream,
                                 callsite_stream: CodeIOStream):
        """ Dispatches allocation of all arrays in the given scope. """
        for tsdfg, state_id, node, declare, allocate, _ in self.to_allocate[
                scope]:
            if state_id is not None:
                state = tsdfg.node(state_id)
            else:
                state = None
            desc = node.desc(tsdfg)

            self._dispatcher.dispatch_allocate(tsdfg, state, state_id, node,
                                               desc, function_stream,
                                               callsite_stream, declare,
                                               allocate)

    def deallocate_arrays_in_scope(self, sdfg: SDFG,
                                   scope: Union[nodes.EntryNode, SDFGState,
                                                SDFG],
                                   function_stream: CodeIOStream,
                                   callsite_stream: CodeIOStream):
        """ Dispatches deallocation of all arrays in the given scope. """
        for tsdfg, state_id, node, _, _, deallocate in self.to_allocate[scope]:
            if not deallocate:
                continue
            if state_id is not None:
                state = tsdfg.node(state_id)
            else:
                state = None
            desc = node.desc(tsdfg)

            self._dispatcher.dispatch_deallocate(tsdfg, state, state_id, node,
                                                 desc, function_stream,
                                                 callsite_stream)

    def generate_code(
        self,
        sdfg: SDFG,
        schedule: Optional[dtypes.ScheduleType],
        sdfg_id: str = ""
    ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]:
        """ Generate frame code for a given SDFG, calling registered targets'
            code generation callbacks for them to generate their own code.
            :param sdfg: The SDFG to generate code for.
            :param schedule: The schedule the SDFG is currently located, or
                             None if the SDFG is top-level.
            :param sdfg_id: An optional string id given to the SDFG label
            :return: A tuple of the generated global frame code, local frame
                     code, and a set of targets that have been used in the
                     generation of this SDFG.
        """

        if len(sdfg_id) == 0 and sdfg.sdfg_id != 0:
            sdfg_id = '_%d' % sdfg.sdfg_id

        global_stream = CodeIOStream()
        callsite_stream = CodeIOStream()

        is_top_level = sdfg.parent is None

        # Analyze allocation lifetime of SDFG and all nested SDFGs
        if is_top_level:
            self.determine_allocation_lifetime(sdfg)

        # Generate code
        ###########################

        # Keep track of allocated variables
        allocated = set()

        # Add symbol mappings to allocated variables
        if sdfg.parent_nsdfg_node is not None:
            allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys()

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)

        # Allocate outer-level transients
        self.allocate_arrays_in_scope(sdfg, sdfg, global_stream,
                                      callsite_stream)

        # Allocate inter-state variables
        global_symbols = copy.deepcopy(sdfg.symbols)
        global_symbols.update(
            {aname: arr.dtype
             for aname, arr in sdfg.arrays.items()})
        interstate_symbols = {}
        for e in sdfg.edges():
            symbols = e.data.new_symbols(sdfg, global_symbols)
            # Inferred symbols only take precedence if global symbol not defined
            symbols = {
                k: v if k not in global_symbols else global_symbols[k]
                for k, v in symbols.items()
            }
            interstate_symbols.update(symbols)
            global_symbols.update(symbols)

        for isvarName, isvarType in interstate_symbols.items():
            isvar = data.Scalar(isvarType)
            callsite_stream.write(
                '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)),
                sdfg)
            self.dispatcher.defined_vars.add(isvarName,
                                             disp.DefinedType.Scalar,
                                             isvarType.ctype)

        callsite_stream.write('\n', sdfg)

        #######################################################################
        # Generate actual program body

        states_generated = self.generate_states(sdfg, global_stream,
                                                callsite_stream)

        #######################################################################

        # Sanity check
        if len(states_generated) != len(sdfg.nodes()):
            raise RuntimeError(
                "Not all states were generated in SDFG {}!"
                "\n  Generated: {}\n  Missing: {}".format(
                    sdfg.label, [s.label for s in states_generated],
                    [s.label for s in (set(sdfg.nodes()) - states_generated)]))

        # Deallocate transients
        self.deallocate_arrays_in_scope(sdfg, sdfg, global_stream,
                                        callsite_stream)

        # Now that we have all the information about dependencies, generate
        # header and footer
        if is_top_level:
            # Let each target append code to frame code state before generating
            # header and footer
            for target in self._dispatcher.used_targets:
                target.on_target_used()

            header_stream = CodeIOStream()
            header_global_stream = CodeIOStream()
            footer_stream = CodeIOStream()
            footer_global_stream = CodeIOStream()

            # Get all environments used in the generated code, including
            # dependent environments
            import dace.library  # Avoid import loops
            self.environments = dace.library.get_environments_and_dependencies(
                self._dispatcher.used_environments)

            self.generate_header(sdfg, header_global_stream, header_stream)

            # Open program function
            params = sdfg.signature()
            if params:
                params = ', ' + params
            function_signature = (
                'void __program_%s_internal(%s_t *__state%s)\n{\n' %
                (sdfg.name, sdfg.name, params))

            self.generate_footer(sdfg, footer_global_stream, footer_stream)

            header_global_stream.write(global_stream.getvalue())
            header_global_stream.write(footer_global_stream.getvalue())
            generated_header = header_global_stream.getvalue()

            all_code = CodeIOStream()
            all_code.write(function_signature)
            all_code.write(header_stream.getvalue())
            all_code.write(callsite_stream.getvalue())
            all_code.write(footer_stream.getvalue())
            generated_code = all_code.getvalue()
        else:
            generated_header = global_stream.getvalue()
            generated_code = callsite_stream.getvalue()

        # Clean up generated code
        gotos = re.findall(r'goto (.*?);', generated_code)
        clean_code = ''
        for line in generated_code.split('\n'):
            # Empty line with semicolon
            if re.match(r'^\s*;\s*', line):
                continue
            # Label that might be unused
            label = re.findall(
                r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line)
            if len(label) > 0:
                if label[0] not in gotos:
                    continue
            clean_code += line + '\n'

        # Return the generated global and local code strings
        return (generated_header, clean_code, self._dispatcher.used_targets,
                self._dispatcher.used_environments)
Example #10
0
 def dispatch_state(state: SDFGState) -> str:
     stream = CodeIOStream()
     self._dispatcher.dispatch_state(sdfg, state, global_stream, stream)
     states_generated.add(state)  # For sanity check
     return stream.getvalue()
Example #11
0
    def get_generated_codeobjects(self):

        execution_mode = Config.get("compiler", "xilinx", "mode")

        kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
        if execution_mode == "software_emulation":
            kernel_file_name += "_sw_emu.xclbin\""
            xcl_emulation_mode = "\"sw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware_emulation":
            kernel_file_name += "_hw_emu.xclbin\""
            xcl_emulation_mode = "\"hw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware" or execution_mode == "simulation":
            kernel_file_name += "_hw.xclbin\""
            xcl_emulation_mode = None
            xilinx_sdx = None
        else:
            raise dace.codegen.codegen.CodegenError(
                "Unknown Xilinx execution mode: {}".format(execution_mode))

        set_env_vars = ""
        set_str = "dace::set_environment_variable(\"{}\", {});\n"
        unset_str = "dace::unset_environment_variable(\"{}\");\n"
        set_env_vars += (set_str.format("XCL_EMULATION_MODE",
                                        xcl_emulation_mode)
                         if xcl_emulation_mode is not None else
                         unset_str.format("XCL_EMULATION_MODE"))
        set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx
                         is not None else unset_str.format("XILINX_SDX"))

        host_code = CodeIOStream()
        host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
#include <iostream>\n\n""")

        self._frame.generate_fileheader(self._global_sdfg, host_code)

        host_code.write("""
dace::fpga::Context *dace::fpga::_context;

DACE_EXPORTED int __dace_init_xilinx({signature}) {{
    {environment_variables}
    dace::fpga::_context = new dace::fpga::Context();
    dace::fpga::_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({signature}) {{
    delete dace::fpga::_context;
}}

{host_code}""".format(signature=self._global_sdfg.signature(),
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(
                              separator="/" * 79, kernel_name=name, code=code)
                          for (name, code) in self._host_codes
                      ])))

        host_code_obj = CodeObject(self._program_name,
                                   host_code.getvalue(),
                                   "cpp",
                                   XilinxCodeGen,
                                   "Xilinx",
                                   target_type="host")

        kernel_code_objs = [
            CodeObject(kernel_name,
                       code,
                       "cpp",
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device")
            for (kernel_name, code) in self._kernel_codes
        ]

        return [host_code_obj] + kernel_code_objs
Example #12
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        localcode = CodeIOStream()

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        output_memlet = output_edge.data

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype
        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Try to obtain the number of threads in the block, or use the default
        # configuration
        block_threads = devicelevel_block_size(sdfg, state, node)
        if block_threads is not None:
            block_threads = functools.reduce(lambda a, b: a * b, block_threads,
                                             1)

        # Checks
        if block_threads is None:
            raise ValueError('Block-wide GPU reduction must occur within'
                             ' a GPU kernel')
        if issymbolic(block_threads, sdfg.constants):
            raise ValueError('Block size has to be constant for block-wide '
                             'reduction (got %s)' % str(block_threads))
        if (node.axes is not None and len(node.axes) < input_dims):
            raise ValueError(
                'Only full reduction is supported for block-wide reduce,'
                ' please use the pure expansion')
        if (input_data.storage != dtypes.StorageType.Register
                or output_data.storage != dtypes.StorageType.Register):
            raise ValueError(
                'Block-wise reduction only supports GPU register inputs '
                'and outputs')
        if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES:
            raise ValueError('%s block reduction not supported' % redtype)

        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        if redtype == dtypes.ReductionType.Custom:
            redop = '__reduce_%s()' % idstr
        else:
            redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)

        # Allocate shared memory for block reduce
        localcode.write("""
        typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id};
        __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id};
            """.format(id=idstr,
                       type=output_data.dtype.ctype,
                       numthreads=block_threads))

        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = cpp_array_expr(sdfg, output_memlet)
        localcode.write("""
            {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop});
            """.format(id=idstr,
                       redop=redop,
                       input=input_memlet.data,
                       output=output))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Example #13
0
    def get_generated_codeobjects(self):

        execution_mode = Config.get("compiler", "xilinx", "mode")

        kernel_file_name = "DACE_BINARY_DIR \"/{}".format(self._program_name)
        if execution_mode == "software_emulation":
            kernel_file_name += "_sw_emu.xclbin\""
            xcl_emulation_mode = "\"sw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware_emulation":
            kernel_file_name += "_hw_emu.xclbin\""
            xcl_emulation_mode = "\"hw_emu\""
            xilinx_sdx = "DACE_VITIS_DIR"
        elif execution_mode == "hardware" or execution_mode == "simulation":
            kernel_file_name += "_hw.xclbin\""
            xcl_emulation_mode = None
            xilinx_sdx = None
        else:
            raise cgx.CodegenError(
                "Unknown Xilinx execution mode: {}".format(execution_mode))

        set_env_vars = ""
        set_str = "dace::set_environment_variable(\"{}\", {});\n"
        unset_str = "dace::unset_environment_variable(\"{}\");\n"
        set_env_vars += (set_str.format("XCL_EMULATION_MODE",
                                        xcl_emulation_mode)
                         if xcl_emulation_mode is not None else
                         unset_str.format("XCL_EMULATION_MODE"))
        set_env_vars += (set_str.format("XILINX_SDX", xilinx_sdx) if xilinx_sdx
                         is not None else unset_str.format("XILINX_SDX"))
        set_env_vars += set_str.format(
            "EMCONFIG_PATH", "DACE_BINARY_DIR"
        ) if execution_mode == 'hardware_emulation' else unset_str.format(
            "EMCONFIG_PATH")

        host_code = CodeIOStream()
        host_code.write("""\
#include "dace/xilinx/host.h"
#include "dace/dace.h"
""")
        if len(self._dispatcher.instrumentation) > 1:
            host_code.write("""\
#include "dace/perf/reporting.h"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
""")
        host_code.write("\n\n")

        self._frame.generate_fileheader(self._global_sdfg, host_code,
                                        'xilinx_host')

        params_comma = self._global_sdfg.signature(with_arrays=False)
        if params_comma:
            params_comma = ', ' + params_comma

        host_code.write("""
DACE_EXPORTED int __dace_init_xilinx({sdfg.name}_t *__state{signature}) {{
    {environment_variables}

    __state->fpga_context = new dace::fpga::Context();
    __state->fpga_context->Get().MakeProgram({kernel_file_name});
    return 0;
}}

DACE_EXPORTED void __dace_exit_xilinx({sdfg.name}_t *__state) {{
    delete __state->fpga_context;
}}

{host_code}""".format(signature=params_comma,
                      sdfg=self._global_sdfg,
                      environment_variables=set_env_vars,
                      kernel_file_name=kernel_file_name,
                      host_code="".join([
                          "{separator}\n// Kernel: {kernel_name}"
                          "\n{separator}\n\n{code}\n\n".format(
                              separator="/" * 79, kernel_name=name, code=code)
                          for (name, code) in self._host_codes
                      ])))

        host_code_obj = CodeObject(self._program_name,
                                   host_code.getvalue(),
                                   "cpp",
                                   XilinxCodeGen,
                                   "Xilinx",
                                   target_type="host")

        kernel_code_objs = [
            CodeObject(kernel_name,
                       code,
                       "cpp",
                       XilinxCodeGen,
                       "Xilinx",
                       target_type="device")
            for (kernel_name, code) in self._kernel_codes
        ]

        # Memory bank and streaming interfaces connectivity configuration file
        link_cfg = CodeIOStream()
        self._other_codes["link.cfg"] = link_cfg
        link_cfg.write("[connectivity]")
        are_assigned = [v is not None for v in self._bank_assignments.values()]
        if any(are_assigned):
            if not all(are_assigned):
                raise RuntimeError("Some, but not all global memory arrays "
                                   "were assigned to memory banks: {}".format(
                                       self._bank_assignments))
            # Emit mapping from kernel memory interfaces to DRAM banks
            for (kernel_name, interface_name), (
                    memory_type,
                    memory_bank) in self._bank_assignments.items():
                link_cfg.write(
                    f"sp={kernel_name}_1.m_axi_{interface_name}:{memory_type}[{memory_bank}]"
                )
        # Emit mapping between inter-kernel streaming interfaces
        for _, (src, dst) in self._stream_connections.items():
            link_cfg.write(f"stream_connect={src}:{dst}")

        other_objs = []
        for name, code in self._other_codes.items():
            name = name.split(".")
            other_objs.append(
                CodeObject(name[0],
                           code.getvalue(),
                           ".".join(name[1:]),
                           XilinxCodeGen,
                           "Xilinx",
                           target_type="device"))

        return [host_code_obj] + kernel_code_objs + other_objs
Example #14
0
    def generate_code(
        self,
        sdfg: SDFG,
        schedule: Optional[dtypes.ScheduleType],
        sdfg_id: str = ""
    ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]:
        """ Generate frame code for a given SDFG, calling registered targets'
            code generation callbacks for them to generate their own code.
            :param sdfg: The SDFG to generate code for.
            :param schedule: The schedule the SDFG is currently located, or
                             None if the SDFG is top-level.
            :param sdfg_id: An optional string id given to the SDFG label
            :return: A tuple of the generated global frame code, local frame
                     code, and a set of targets that have been used in the
                     generation of this SDFG.
        """

        sdfg_label = sdfg.name + sdfg_id

        global_stream = CodeIOStream()
        callsite_stream = CodeIOStream()

        # Set default storage/schedule types in SDFG
        set_default_schedule_and_storage_types(sdfg, schedule)

        is_top_level = sdfg.parent is None

        # Generate code
        ###########################

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)

        # Allocate outer-level transients
        shared_transients = sdfg.shared_transients()
        allocated = set()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in allocated):
                    self._dispatcher.dispatch_allocate(sdfg, state, None, node,
                                                       global_stream,
                                                       callsite_stream)
                    self._dispatcher.dispatch_initialize(
                        sdfg, state, None, node, global_stream,
                        callsite_stream)
                    allocated.add(node.data)

        # Allocate inter-state variables
        assigned, _ = sdfg.interstate_symbols()
        for isvarName, isvarType in assigned.items():
            # Skip symbols that have been declared as outer-level transients
            if isvarName in allocated:
                continue
            callsite_stream.write(
                '%s;\n' %
                (isvarType.signature(with_types=True, name=isvarName)), sdfg)

        # Initialize parameter arrays
        for argnode in dtypes.deduplicate(sdfg.input_arrays() +
                                          sdfg.output_arrays()):
            # Ignore transient arrays
            if argnode.desc(sdfg).transient: continue
            self._dispatcher.dispatch_initialize(sdfg, sdfg, None, argnode,
                                                 global_stream,
                                                 callsite_stream)

        callsite_stream.write('\n', sdfg)

        states_topological = list(sdfg.topological_sort(sdfg.start_state))

        # {edge: [dace.edges.ControlFlow]}
        control_flow = {e: [] for e in sdfg.edges()}

        if dace.config.Config.get_bool('optimizer', 'detect_control_flow'):

            ####################################################################
            # Loop detection procedure

            all_cycles = list(sdfg.find_cycles())  # Returns a list of lists
            # Order according to topological sort
            all_cycles = [
                sorted(c, key=lambda x: states_topological.index(x))
                for c in all_cycles
            ]
            # Group in terms of starting node
            starting_nodes = [c[0] for c in all_cycles]
            # Order cycles according to starting node in topological sort
            starting_nodes = sorted(starting_nodes,
                                    key=lambda x: states_topological.index(x))
            cycles_by_node = [[c for c in all_cycles if c[0] == n]
                              for n in starting_nodes]
            for cycles in cycles_by_node:

                # Use arbitrary cycle to find the first and last nodes
                first_node = cycles[0][0]
                last_node = cycles[0][-1]

                if not first_node.is_empty():
                    # The entry node should not contain any computations
                    continue

                if not all([c[-1] == last_node for c in cycles]):
                    # There are multiple back edges: not a for or while loop
                    continue

                previous_edge = [
                    e for e in sdfg.in_edges(first_node) if e.src != last_node
                ]
                if len(previous_edge) != 1:
                    # No single starting point: not a for or while
                    continue
                previous_edge = previous_edge[0]

                back_edge = sdfg.edges_between(last_node, first_node)
                if len(back_edge) != 1:
                    raise RuntimeError("Expected exactly one edge in cycle")
                back_edge = back_edge[0]

                # Build a set of all nodes in all cycles associated with this
                # set of start and end node
                internal_nodes = functools.reduce(
                    lambda a, b: a | b, [set(c)
                                         for c in cycles]) - {first_node}

                exit_edge = [
                    e for e in sdfg.out_edges(first_node)
                    if e.dst not in internal_nodes | {first_node}
                ]
                if len(exit_edge) != 1:
                    # No single stopping condition: not a for or while
                    # (we don't support continue or break)
                    continue
                exit_edge = exit_edge[0]

                entry_edge = [
                    e for e in sdfg.out_edges(first_node) if e != exit_edge
                ]
                if len(entry_edge) != 1:
                    # No single starting condition: not a for or while
                    continue
                entry_edge = entry_edge[0]

                # Make sure this is not already annotated to be another construct
                if (len(control_flow[entry_edge]) != 0
                        or len(control_flow[back_edge]) != 0):
                    continue

                # Nested loops case I - previous edge of internal loop is a
                # loop-entry of an external loop (first state in a loop is
                # another loop)
                if (len(control_flow[previous_edge]) == 1
                        and isinstance(control_flow[previous_edge][0],
                                       dace.graph.edges.LoopEntry)):
                    # Nested loop, mark parent scope
                    loop_parent = control_flow[previous_edge][0].scope
                # Nested loops case II - exit edge of internal loop is a
                # back-edge of an external loop (last state in a loop is another
                # loop)
                elif (len(control_flow[exit_edge]) == 1
                      and isinstance(control_flow[exit_edge][0],
                                     dace.graph.edges.LoopBack)):
                    # Nested loop, mark parent scope
                    loop_parent = control_flow[exit_edge][0].scope
                elif (len(control_flow[exit_edge]) == 0
                      or len(control_flow[previous_edge]) == 0):
                    loop_parent = None
                else:
                    continue

                if entry_edge == back_edge:
                    # No entry check (we don't support do-loops)
                    # TODO: do we want to add some support for self-loops?
                    continue

                # Now we make sure that there is no other way to exit this
                # cycle, by checking that there's no reachable node *not*
                # included in any cycle between the first and last node.
                if any([len(set(c) - internal_nodes) > 1 for c in cycles]):
                    continue

                # This is a loop! Generate the necessary annotation objects.
                loop_scope = dace.graph.edges.LoopScope(internal_nodes)

                if ((len(previous_edge.data.assignments) > 0
                     or len(back_edge.data.assignments) > 0) and
                    (len(control_flow[previous_edge]) == 0 or
                     (len(control_flow[previous_edge]) == 1 and
                      control_flow[previous_edge][0].scope == loop_parent))):
                    # Generate assignment edge, if available
                    control_flow[previous_edge].append(
                        dace.graph.edges.LoopAssignment(
                            loop_scope, previous_edge))
                # Assign remaining control flow constructs
                control_flow[entry_edge].append(
                    dace.graph.edges.LoopEntry(loop_scope, entry_edge))
                control_flow[exit_edge].append(
                    dace.graph.edges.LoopExit(loop_scope, exit_edge))
                control_flow[back_edge].append(
                    dace.graph.edges.LoopBack(loop_scope, back_edge))

            ###################################################################
            # If/then/else detection procedure

            candidates = [
                n for n in states_topological if sdfg.out_degree(n) == 2
            ]
            for candidate in candidates:

                # A valid if occurs when then are no reachable nodes for either
                # path that does not pass through a common dominator.
                dominators = nx.dominance.dominance_frontiers(
                    sdfg.nx, candidate)

                left_entry, right_entry = sdfg.out_edges(candidate)
                if (len(control_flow[left_entry]) > 0
                        or len(control_flow[right_entry]) > 0):
                    # Already assigned to a control flow construct
                    # TODO: carefully allow this in some cases
                    continue

                left, right = left_entry.dst, right_entry.dst
                dominator = dominators[left] & dominators[right]
                if len(dominator) != 1:
                    # There must be a single dominator across both branches,
                    # unless one of the nodes _is_ the next dominator
                    # if (len(dominator) == 0 and dominators[left] == {right}
                    #         or dominators[right] == {left}):
                    #     dominator = dominators[left] | dominators[right]
                    # else:
                    #     continue
                    continue
                dominator = next(iter(dominator))  # Exactly one dominator

                exit_edges = sdfg.in_edges(dominator)
                if len(exit_edges) != 2:
                    # There must be a single entry and a single exit. This
                    # could be relaxed in the future.
                    continue

                left_exit, right_exit = exit_edges
                if (len(control_flow[left_exit]) > 0
                        or len(control_flow[right_exit]) > 0):
                    # Already assigned to a control flow construct
                    # TODO: carefully allow this in some cases
                    continue

                # Now traverse from the source and verify that all possible paths
                # pass through the dominator
                left_nodes = sdfg.all_nodes_between(left, dominator)
                if left_nodes is None:
                    # Not all paths lead to the next dominator
                    continue
                right_nodes = sdfg.all_nodes_between(right, dominator)
                if right_nodes is None:
                    # Not all paths lead to the next dominator
                    continue
                all_nodes = left_nodes | right_nodes

                # Make sure there is no overlap between left and right nodes
                if len(left_nodes & right_nodes) > 0:
                    continue

                # This is a valid if/then/else construct. Generate annotations
                if_then_else = dace.graph.edges.IfThenElse(
                    candidate, dominator)

                # Arbitrarily assign then/else to the two branches. If one edge
                # has no dominator but leads to the dominator, it means there's
                # only a then clause (and no else).
                has_else = False
                if len(dominators[left]) == 1:
                    then_scope = dace.graph.edges.IfThenScope(
                        if_then_else, left_nodes)
                    else_scope = dace.graph.edges.IfElseScope(
                        if_then_else, right_nodes)
                    control_flow[left_entry].append(
                        dace.graph.edges.IfEntry(then_scope, left_entry))
                    control_flow[left_exit].append(
                        dace.graph.edges.IfExit(then_scope, left_exit))
                    control_flow[right_exit].append(
                        dace.graph.edges.IfExit(else_scope, right_exit))
                    if len(dominators[right]) == 1:
                        control_flow[right_entry].append(
                            dace.graph.edges.IfEntry(else_scope, right_entry))
                        has_else = True
                else:
                    then_scope = dace.graph.edges.IfThenScope(
                        if_then_else, right_nodes)
                    else_scope = dace.graph.edges.IfElseScope(
                        if_then_else, left_nodes)
                    control_flow[right_entry].append(
                        dace.graph.edges.IfEntry(then_scope, right_entry))
                    control_flow[right_exit].append(
                        dace.graph.edges.IfExit(then_scope, right_exit))
                    control_flow[left_exit].append(
                        dace.graph.edges.IfExit(else_scope, left_exit))

        #######################################################################
        # Generate actual program body

        states_generated = set()  # For sanity check
        generated_edges = set()
        self.generate_states(sdfg, "sdfg", control_flow,
                             global_stream, callsite_stream,
                             set(states_topological), states_generated,
                             generated_edges)

        #######################################################################

        # Sanity check
        if len(states_generated) != len(sdfg.nodes()):
            raise RuntimeError(
                "Not all states were generated in SDFG {}!"
                "\n  Generated: {}\n  Missing: {}".format(
                    sdfg.label, [s.label for s in states_generated],
                    [s.label for s in (set(sdfg.nodes()) - states_generated)]))

        # Deallocate transients
        shared_transients = sdfg.shared_transients()
        deallocated = set()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in deallocated):
                    self._dispatcher.dispatch_deallocate(
                        sdfg, state, None, node, global_stream,
                        callsite_stream)
                    deallocated.add(node.data)

        # Now that we have all the information about dependencies, generate
        # header and footer
        if is_top_level:
            header_stream = CodeIOStream()
            header_global_stream = CodeIOStream()
            footer_stream = CodeIOStream()
            footer_global_stream = CodeIOStream()
            self.generate_header(sdfg, self._dispatcher.used_environments,
                                 header_global_stream, header_stream)

            # Open program function
            function_signature = 'void __program_%s_internal(%s)\n{\n' % (
                sdfg.name, sdfg.signature())

            self.generate_footer(sdfg, self._dispatcher.used_environments,
                                 footer_global_stream, footer_stream)

            header_global_stream.write(global_stream.getvalue())
            header_global_stream.write(footer_global_stream.getvalue())
            generated_header = header_global_stream.getvalue()

            all_code = CodeIOStream()
            all_code.write(function_signature)
            all_code.write(header_stream.getvalue())
            all_code.write(callsite_stream.getvalue())
            all_code.write(footer_stream.getvalue())
            generated_code = all_code.getvalue()
        else:
            generated_header = global_stream.getvalue()
            generated_code = callsite_stream.getvalue()

        # Return the generated global and local code strings
        return (generated_header, generated_code,
                self._dispatcher.used_targets,
                self._dispatcher.used_environments)
Example #15
0
class DaCeCodeGenerator(object):
    """ DaCe code generator class that writes the generated code for SDFG
        state machines, and uses a dispatcher to generate code for
        individual states based on the target. """
    def __init__(self, *args, **kwargs):
        self._dispatcher = TargetDispatcher()
        self._dispatcher.register_state_dispatcher(self)
        self._initcode = CodeIOStream()
        self._exitcode = CodeIOStream()

    ##################################################################
    # Target registry

    @property
    def dispatcher(self):
        return self._dispatcher

    ##################################################################
    # Code generation

    def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
        # Write constants
        for cstname, (csttype, cstval) in sdfg.constants_prop.items():
            if isinstance(csttype, data.Array):
                const_str = "constexpr " + csttype.dtype.ctype + \
                    " " + cstname + "[" + str(cstval.size) + "] = {"
                it = np.nditer(cstval, order='C')
                for i in range(cstval.size - 1):
                    const_str += str(it[0]) + ", "
                    it.iternext()
                const_str += str(it[0]) + "};\n"
                callsite_stream.write(const_str, sdfg)
            else:
                callsite_stream.write(
                    "constexpr %s %s = %s;\n" %
                    (csttype.dtype.ctype, cstname, str(cstval)), sdfg)

    def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
        """
        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        for sd in sdfg.all_sdfgs_recursive():
            global_stream.write(sd.global_code, sd)

    def generate_header(self, sdfg: SDFG, used_environments: Set[str],
                        global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """

        environments = [
            dace.library.get_environment(env_name)
            for env_name in used_environments
        ]

        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        # Write header required by environments
        for env in environments:
            if len(env.headers) > 0:
                global_stream.write(
                    "\n".join("#include \"" + h + "\"" for h in env.headers),
                    sdfg)

        global_stream.write("\n", sdfg)

        self.generate_fileheader(sdfg, global_stream)

        # Instrumentation preamble
        if len(self._dispatcher.instrumentation) > 1:
            global_stream.write(
                'namespace dace { namespace perf { Report report; } }', sdfg)
            callsite_stream.write('dace::perf::report.reset();', sdfg)

    def generate_footer(self, sdfg: SDFG, used_environments: Set[str],
                        global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the footer of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        fname = sdfg.name
        params = sdfg.signature()
        paramnames = sdfg.signature(False, for_call=True)
        environments = [
            dace.library.get_environment(env_name)
            for env_name in used_environments
        ]

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

        # Instrumentation saving
        if len(self._dispatcher.instrumentation) > 1:
            callsite_stream.write(
                'dace::perf::report.save(".dacecache/%s/perf");' % sdfg.name,
                sdfg)

        # Write closing brace of program
        callsite_stream.write('}', sdfg)

        # Write awkward footer to avoid 'extern "C"' issues
        callsite_stream.write(
            """
DACE_EXPORTED void __program_%s(%s)
{
    __program_%s_internal(%s);
}
""" % (fname, params, fname, paramnames), sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_init_%s(%s);\n' %
                    (target.target_name, params), sdfg)
            if target.has_finalizer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_exit_%s(%s);\n' %
                    (target.target_name, params), sdfg)

        callsite_stream.write(
            """
DACE_EXPORTED int __dace_init_%s(%s)
{
    int __result = 0;
""" % (sdfg.name, params), sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    '__result |= __dace_init_%s(%s);' %
                    (target.target_name, paramnames), sdfg)
        for env in environments:
            if env.init_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(env.init_code)
                callsite_stream.write("}")

        callsite_stream.write(sdfg.init_code, sdfg)

        callsite_stream.write(self._initcode.getvalue(), sdfg)

        callsite_stream.write(
            """
    return __result;
}

DACE_EXPORTED void __dace_exit_%s(%s)
{
""" % (sdfg.name, params), sdfg)

        callsite_stream.write(self._exitcode.getvalue(), sdfg)

        callsite_stream.write(sdfg.exit_code, sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_finalizer:
                callsite_stream.write(
                    '__dace_exit_%s(%s);' % (target.target_name, paramnames),
                    sdfg)
        for env in environments:
            if env.finalize_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(env.init_code)
                callsite_stream.write("}")

        callsite_stream.write('}\n', sdfg)

    def generate_state(self,
                       sdfg,
                       state,
                       global_stream,
                       callsite_stream,
                       generate_state_footer=True):

        sid = sdfg.node_id(state)

        # Emit internal transient array allocation
        # Don't allocate transients shared with another state
        data_to_allocate = (set(state.top_level_transients()) -
                            set(sdfg.shared_transients()))
        allocated = set()
        for node in state.data_nodes():
            if node.data not in data_to_allocate or node.data in allocated:
                continue
            allocated.add(node.data)
            self._dispatcher.dispatch_allocate(sdfg, state, sid, node,
                                               global_stream, callsite_stream)
            self._dispatcher.dispatch_initialize(sdfg, state, sid, node,
                                                 global_stream,
                                                 callsite_stream)

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_state_begin(sdfg, state, callsite_stream,
                                     global_stream)

        #####################
        # Create dataflow graph for state's children.

        # DFG to code scheme: Only generate code for nodes whose all
        # dependencies have been executed (topological sort).
        # For different connected components, run them concurrently.

        components = dace.sdfg.concurrent_subgraphs(state)

        if len(components) == 1:
            self._dispatcher.dispatch_subgraph(sdfg,
                                               state,
                                               sid,
                                               global_stream,
                                               callsite_stream,
                                               skip_entry_node=False)
        else:
            callsite_stream.write("#pragma omp parallel sections\n{")
            for c in components:
                callsite_stream.write("#pragma omp section\n{")
                self._dispatcher.dispatch_subgraph(sdfg,
                                                   c,
                                                   sid,
                                                   global_stream,
                                                   callsite_stream,
                                                   skip_entry_node=False)
                callsite_stream.write("} // End omp section")
            callsite_stream.write("} // End omp sections")

        #####################
        # Write state footer

        if generate_state_footer:
            # Emit internal transient array deallocation
            deallocated = set()
            for node in state.data_nodes():
                if (node.data not in data_to_allocate
                        or node.data in deallocated
                        or (node.data in sdfg.arrays
                            and sdfg.arrays[node.data].transient == False)):
                    continue
                deallocated.add(node.data)
                self._dispatcher.dispatch_deallocate(sdfg, state, sid, node,
                                                     global_stream,
                                                     callsite_stream)

            # Invoke all instrumentation providers
            for instr in self._dispatcher.instrumentation.values():
                if instr is not None:
                    instr.on_state_end(sdfg, state, callsite_stream,
                                       global_stream)

    @staticmethod
    def _generate_assignments(assignments):
        return [
            "{} = {}".format(variable, value)
            for variable, value in assignments.items()
        ]

    @staticmethod
    def _is_always_true(condition_string):
        return condition_string in ["true", "1"]

    def _generate_transition(self, sdfg, sid, callsite_stream, edge,
                             assignments):

        condition_string = cppunparse.cppunparse(edge.data.condition, False)
        always_true = self._is_always_true(condition_string)

        if not always_true:
            callsite_stream.write("if ({}) {{".format(condition_string), sdfg,
                                  sid)

        if len(assignments) > 0:
            callsite_stream.write(
                ";\n".join(
                    DaCeCodeGenerator._generate_assignments(assignments) +
                    [""]), sdfg, sid)

        callsite_stream.write(
            "goto __state_{}_{};".format(sdfg.name, edge.dst.label), sdfg, sid)

        if not always_true:
            callsite_stream.write("}")

    def generate_states(self, sdfg, scope_label, control_flow, global_stream,
                        callsite_stream, scope, states_generated,
                        generated_edges):

        states_topological = list(sdfg.topological_sort(sdfg.start_state))
        states_to_generate = collections.deque([
            s for s in states_topological
            if s in scope and s not in states_generated
        ])
        if len(states_to_generate) == 0:
            return

        while len(states_to_generate) > 0:

            state = states_to_generate.popleft()
            # When generating control flow constructs, we will not necessarily
            # move in topological order, so make sure this state has not
            # already been generated.
            if state in states_generated or state not in scope:
                continue
            states_generated.add(state)

            sid = sdfg.node_id(state)

            callsite_stream.write(
                "__state_{}_{}:\n".format(sdfg.name, state.label), sdfg, sid)

            # Don't generate brackets and comments for empty states
            if len([
                    n for n in state.nodes()
                    if not isinstance(n, dace.graph.nodes.EmptyTasklet)
            ]) > 0:

                callsite_stream.write('{', sdfg, sid)

                self._dispatcher.dispatch_state(sdfg, state, global_stream,
                                                callsite_stream)

                callsite_stream.write('}', sdfg, sid)

            else:

                callsite_stream.write(";")

            out_edges = sdfg.out_edges(state)

            # Write conditional branches to next states
            for edge in out_edges:

                generate_assignments = True
                generate_transition = True

                # Handle specialized control flow
                if (dace.config.Config.get_bool('optimizer',
                                                'detect_control_flow')):

                    for control in control_flow[edge]:

                        if isinstance(control,
                                      dace.graph.edges.LoopAssignment):
                            # Generate the transition, but leave the
                            # assignments to the loop
                            generate_transition &= True
                            generate_assignments &= False

                        elif isinstance(control, dace.graph.edges.LoopBack):
                            generate_transition &= False
                            generate_assignments &= False

                        elif isinstance(control, dace.graph.edges.LoopExit):
                            # Need to strip the condition, so generate it from
                            # the loop entry
                            generate_transition &= False
                            generate_assignments &= True

                        elif isinstance(control, dace.graph.edges.LoopEntry):
                            generate_transition &= False
                            generate_assignments &= False

                            if control.scope.assignment is not None:
                                assignment_edge = control.scope.assignment.edge
                                init_assignments = ", ".join(
                                    DaCeCodeGenerator._generate_assignments(
                                        assignment_edge.data.assignments))
                                generated_edges.add(assignment_edge)
                            else:
                                init_assignments = ""

                            back_edge = control.scope.back.edge
                            continue_assignments = ", ".join(
                                DaCeCodeGenerator._generate_assignments(
                                    back_edge.data.assignments))
                            generated_edges.add(back_edge)

                            entry_edge = control.scope.entry.edge
                            condition = cppunparse.cppunparse(
                                entry_edge.data.condition, False)
                            generated_edges.add(entry_edge)

                            if (len(init_assignments) > 0
                                    or len(continue_assignments) > 0):
                                callsite_stream.write(
                                    "for ({}; {}; {}) {{".format(
                                        init_assignments, condition,
                                        continue_assignments), sdfg, sid)
                            else:
                                callsite_stream.write(
                                    "while ({}) {{".format(condition), sdfg,
                                    sid)

                            # Generate loop body
                            self.generate_states(
                                sdfg, entry_edge.src.label + "_loop",
                                control_flow, global_stream, callsite_stream,
                                control.scope, states_generated,
                                generated_edges)

                            callsite_stream.write("}", sdfg, sid)

                            exit_edge = control.scope.exit.edge

                            # Update states to generate after nested call
                            states_to_generate = collections.deque([
                                s for s in states_to_generate
                                if s not in states_generated
                            ])
                            # If the next state to be generated is the exit
                            # state, we can omit the goto
                            if (len(states_to_generate) > 0
                                    and states_to_generate[0] == exit_edge.dst
                                    and exit_edge.dst not in states_generated):
                                pass
                            elif edge in generated_edges:
                                # This edge has more roles, goto doesn't apply
                                pass
                            else:
                                callsite_stream.write(
                                    "goto __state_{}_{};".format(
                                        sdfg.name,
                                        control.scope.exit.edge.dst))
                                generated_edges.add(control.scope.exit.edge)

                        elif isinstance(control, dace.graph.edges.IfExit):
                            generate_transition &= True
                            generate_assignments &= True

                        elif isinstance(control, dace.graph.edges.IfEntry):
                            generate_transition &= False
                            generate_assignments &= True

                            if len(set(control.scope) - states_generated) == 0:
                                continue

                            then_scope = control.scope.if_then_else.then_scope
                            else_scope = control.scope.if_then_else.else_scope

                            then_entry = then_scope.entry.edge

                            condition = cppunparse.cppunparse(
                                then_entry.data.condition, False)

                            callsite_stream.write(
                                "if ({}) {{".format(condition), sdfg, sid)
                            generated_edges.add(then_entry)

                            # Generate the then-scope
                            self.generate_states(sdfg, state.label + "_then",
                                                 control_flow, global_stream,
                                                 callsite_stream, then_scope,
                                                 states_generated,
                                                 generated_edges)

                            callsite_stream.write("} else {", sdfg, sid)
                            generated_edges.add(else_scope.entry.edge)

                            # Generate the else-scope
                            self.generate_states(sdfg, state.label + "_else",
                                                 control_flow, global_stream,
                                                 callsite_stream, else_scope,
                                                 states_generated,
                                                 generated_edges)

                            callsite_stream.write("}", sdfg, sid)
                            generated_edges.add(else_scope.exit.edge)

                            # Update states to generate after nested call
                            states_to_generate = collections.deque([
                                s for s in states_to_generate
                                if s not in states_generated
                            ])

                            if_exit_state = control.scope.exit.edge.dst

                            if ((if_exit_state not in states_generated) and
                                ((len(states_to_generate) > 0) and
                                 (states_to_generate[0] == if_exit_state))):
                                pass
                            else:
                                callsite_stream.write(
                                    "goto __state_{}_{};".format(
                                        sdfg.name,
                                        control.scope.exit.edge.dst))

                        else:

                            raise TypeError(
                                "Unknown control flow \"{}\"".format(
                                    type(control).__name__))

                if generate_assignments and len(edge.data.assignments) > 0:
                    assignments_to_generate = edge.data.assignments
                else:
                    assignments_to_generate = {}

                if generate_transition:

                    if ((len(out_edges) == 1)
                            and (edge.dst not in states_generated)
                            and ((len(states_to_generate) > 0) and
                                 (states_to_generate[0] == edge.dst))):
                        # If there is only one outgoing edge, the target will
                        # be generated next, we can omit the goto
                        pass
                    elif (len(out_edges) == 1 and len(states_to_generate) == 0
                          and (edge.dst not in scope)):
                        # This scope has ended, and we don't need to generate
                        # any output edge
                        pass
                    else:
                        self._generate_transition(sdfg, sid, callsite_stream,
                                                  edge,
                                                  assignments_to_generate)
                        # Assignments will be generated in the transition
                        generate_assignments = False

                if generate_assignments:

                    callsite_stream.write(
                        ";\n".join(
                            DaCeCodeGenerator._generate_assignments(
                                assignments_to_generate) + [""]), sdfg, sid)
                generated_edges.add(edge)
                # End of out_edges loop

            if (((len(out_edges) == 0) or
                 (not isinstance(scope, dace.graph.edges.ControlFlowScope) and
                  (len(states_to_generate) == 0)))
                    and (len(states_generated) != sdfg.number_of_nodes())):
                callsite_stream.write(
                    "goto __state_exit_{}_{};".format(sdfg.name, scope_label),
                    sdfg, sid)

        # Write exit state
        callsite_stream.write(
            "__state_exit_{}_{}:;".format(sdfg.name, scope_label), sdfg)

    def generate_code(
        self,
        sdfg: SDFG,
        schedule: Optional[dtypes.ScheduleType],
        sdfg_id: str = ""
    ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]:
        """ Generate frame code for a given SDFG, calling registered targets'
            code generation callbacks for them to generate their own code.
            :param sdfg: The SDFG to generate code for.
            :param schedule: The schedule the SDFG is currently located, or
                             None if the SDFG is top-level.
            :param sdfg_id: An optional string id given to the SDFG label
            :return: A tuple of the generated global frame code, local frame
                     code, and a set of targets that have been used in the
                     generation of this SDFG.
        """

        sdfg_label = sdfg.name + sdfg_id

        global_stream = CodeIOStream()
        callsite_stream = CodeIOStream()

        # Set default storage/schedule types in SDFG
        set_default_schedule_and_storage_types(sdfg, schedule)

        is_top_level = sdfg.parent is None

        # Generate code
        ###########################

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)

        # Allocate outer-level transients
        shared_transients = sdfg.shared_transients()
        allocated = set()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in allocated):
                    self._dispatcher.dispatch_allocate(sdfg, state, None, node,
                                                       global_stream,
                                                       callsite_stream)
                    self._dispatcher.dispatch_initialize(
                        sdfg, state, None, node, global_stream,
                        callsite_stream)
                    allocated.add(node.data)

        # Allocate inter-state variables
        assigned, _ = sdfg.interstate_symbols()
        for isvarName, isvarType in assigned.items():
            # Skip symbols that have been declared as outer-level transients
            if isvarName in allocated:
                continue
            callsite_stream.write(
                '%s;\n' %
                (isvarType.signature(with_types=True, name=isvarName)), sdfg)

        # Initialize parameter arrays
        for argnode in dtypes.deduplicate(sdfg.input_arrays() +
                                          sdfg.output_arrays()):
            # Ignore transient arrays
            if argnode.desc(sdfg).transient: continue
            self._dispatcher.dispatch_initialize(sdfg, sdfg, None, argnode,
                                                 global_stream,
                                                 callsite_stream)

        callsite_stream.write('\n', sdfg)

        states_topological = list(sdfg.topological_sort(sdfg.start_state))

        # {edge: [dace.edges.ControlFlow]}
        control_flow = {e: [] for e in sdfg.edges()}

        if dace.config.Config.get_bool('optimizer', 'detect_control_flow'):

            ####################################################################
            # Loop detection procedure

            all_cycles = list(sdfg.find_cycles())  # Returns a list of lists
            # Order according to topological sort
            all_cycles = [
                sorted(c, key=lambda x: states_topological.index(x))
                for c in all_cycles
            ]
            # Group in terms of starting node
            starting_nodes = [c[0] for c in all_cycles]
            # Order cycles according to starting node in topological sort
            starting_nodes = sorted(starting_nodes,
                                    key=lambda x: states_topological.index(x))
            cycles_by_node = [[c for c in all_cycles if c[0] == n]
                              for n in starting_nodes]
            for cycles in cycles_by_node:

                # Use arbitrary cycle to find the first and last nodes
                first_node = cycles[0][0]
                last_node = cycles[0][-1]

                if not first_node.is_empty():
                    # The entry node should not contain any computations
                    continue

                if not all([c[-1] == last_node for c in cycles]):
                    # There are multiple back edges: not a for or while loop
                    continue

                previous_edge = [
                    e for e in sdfg.in_edges(first_node) if e.src != last_node
                ]
                if len(previous_edge) != 1:
                    # No single starting point: not a for or while
                    continue
                previous_edge = previous_edge[0]

                back_edge = sdfg.edges_between(last_node, first_node)
                if len(back_edge) != 1:
                    raise RuntimeError("Expected exactly one edge in cycle")
                back_edge = back_edge[0]

                # Build a set of all nodes in all cycles associated with this
                # set of start and end node
                internal_nodes = functools.reduce(
                    lambda a, b: a | b, [set(c)
                                         for c in cycles]) - {first_node}

                exit_edge = [
                    e for e in sdfg.out_edges(first_node)
                    if e.dst not in internal_nodes | {first_node}
                ]
                if len(exit_edge) != 1:
                    # No single stopping condition: not a for or while
                    # (we don't support continue or break)
                    continue
                exit_edge = exit_edge[0]

                entry_edge = [
                    e for e in sdfg.out_edges(first_node) if e != exit_edge
                ]
                if len(entry_edge) != 1:
                    # No single starting condition: not a for or while
                    continue
                entry_edge = entry_edge[0]

                # Make sure this is not already annotated to be another construct
                if (len(control_flow[entry_edge]) != 0
                        or len(control_flow[back_edge]) != 0):
                    continue

                # Nested loops case I - previous edge of internal loop is a
                # loop-entry of an external loop (first state in a loop is
                # another loop)
                if (len(control_flow[previous_edge]) == 1
                        and isinstance(control_flow[previous_edge][0],
                                       dace.graph.edges.LoopEntry)):
                    # Nested loop, mark parent scope
                    loop_parent = control_flow[previous_edge][0].scope
                # Nested loops case II - exit edge of internal loop is a
                # back-edge of an external loop (last state in a loop is another
                # loop)
                elif (len(control_flow[exit_edge]) == 1
                      and isinstance(control_flow[exit_edge][0],
                                     dace.graph.edges.LoopBack)):
                    # Nested loop, mark parent scope
                    loop_parent = control_flow[exit_edge][0].scope
                elif (len(control_flow[exit_edge]) == 0
                      or len(control_flow[previous_edge]) == 0):
                    loop_parent = None
                else:
                    continue

                if entry_edge == back_edge:
                    # No entry check (we don't support do-loops)
                    # TODO: do we want to add some support for self-loops?
                    continue

                # Now we make sure that there is no other way to exit this
                # cycle, by checking that there's no reachable node *not*
                # included in any cycle between the first and last node.
                if any([len(set(c) - internal_nodes) > 1 for c in cycles]):
                    continue

                # This is a loop! Generate the necessary annotation objects.
                loop_scope = dace.graph.edges.LoopScope(internal_nodes)

                if ((len(previous_edge.data.assignments) > 0
                     or len(back_edge.data.assignments) > 0) and
                    (len(control_flow[previous_edge]) == 0 or
                     (len(control_flow[previous_edge]) == 1 and
                      control_flow[previous_edge][0].scope == loop_parent))):
                    # Generate assignment edge, if available
                    control_flow[previous_edge].append(
                        dace.graph.edges.LoopAssignment(
                            loop_scope, previous_edge))
                # Assign remaining control flow constructs
                control_flow[entry_edge].append(
                    dace.graph.edges.LoopEntry(loop_scope, entry_edge))
                control_flow[exit_edge].append(
                    dace.graph.edges.LoopExit(loop_scope, exit_edge))
                control_flow[back_edge].append(
                    dace.graph.edges.LoopBack(loop_scope, back_edge))

            ###################################################################
            # If/then/else detection procedure

            candidates = [
                n for n in states_topological if sdfg.out_degree(n) == 2
            ]
            for candidate in candidates:

                # A valid if occurs when then are no reachable nodes for either
                # path that does not pass through a common dominator.
                dominators = nx.dominance.dominance_frontiers(
                    sdfg.nx, candidate)

                left_entry, right_entry = sdfg.out_edges(candidate)
                if (len(control_flow[left_entry]) > 0
                        or len(control_flow[right_entry]) > 0):
                    # Already assigned to a control flow construct
                    # TODO: carefully allow this in some cases
                    continue

                left, right = left_entry.dst, right_entry.dst
                dominator = dominators[left] & dominators[right]
                if len(dominator) != 1:
                    # There must be a single dominator across both branches,
                    # unless one of the nodes _is_ the next dominator
                    # if (len(dominator) == 0 and dominators[left] == {right}
                    #         or dominators[right] == {left}):
                    #     dominator = dominators[left] | dominators[right]
                    # else:
                    #     continue
                    continue
                dominator = next(iter(dominator))  # Exactly one dominator

                exit_edges = sdfg.in_edges(dominator)
                if len(exit_edges) != 2:
                    # There must be a single entry and a single exit. This
                    # could be relaxed in the future.
                    continue

                left_exit, right_exit = exit_edges
                if (len(control_flow[left_exit]) > 0
                        or len(control_flow[right_exit]) > 0):
                    # Already assigned to a control flow construct
                    # TODO: carefully allow this in some cases
                    continue

                # Now traverse from the source and verify that all possible paths
                # pass through the dominator
                left_nodes = sdfg.all_nodes_between(left, dominator)
                if left_nodes is None:
                    # Not all paths lead to the next dominator
                    continue
                right_nodes = sdfg.all_nodes_between(right, dominator)
                if right_nodes is None:
                    # Not all paths lead to the next dominator
                    continue
                all_nodes = left_nodes | right_nodes

                # Make sure there is no overlap between left and right nodes
                if len(left_nodes & right_nodes) > 0:
                    continue

                # This is a valid if/then/else construct. Generate annotations
                if_then_else = dace.graph.edges.IfThenElse(
                    candidate, dominator)

                # Arbitrarily assign then/else to the two branches. If one edge
                # has no dominator but leads to the dominator, it means there's
                # only a then clause (and no else).
                has_else = False
                if len(dominators[left]) == 1:
                    then_scope = dace.graph.edges.IfThenScope(
                        if_then_else, left_nodes)
                    else_scope = dace.graph.edges.IfElseScope(
                        if_then_else, right_nodes)
                    control_flow[left_entry].append(
                        dace.graph.edges.IfEntry(then_scope, left_entry))
                    control_flow[left_exit].append(
                        dace.graph.edges.IfExit(then_scope, left_exit))
                    control_flow[right_exit].append(
                        dace.graph.edges.IfExit(else_scope, right_exit))
                    if len(dominators[right]) == 1:
                        control_flow[right_entry].append(
                            dace.graph.edges.IfEntry(else_scope, right_entry))
                        has_else = True
                else:
                    then_scope = dace.graph.edges.IfThenScope(
                        if_then_else, right_nodes)
                    else_scope = dace.graph.edges.IfElseScope(
                        if_then_else, left_nodes)
                    control_flow[right_entry].append(
                        dace.graph.edges.IfEntry(then_scope, right_entry))
                    control_flow[right_exit].append(
                        dace.graph.edges.IfExit(then_scope, right_exit))
                    control_flow[left_exit].append(
                        dace.graph.edges.IfExit(else_scope, left_exit))

        #######################################################################
        # Generate actual program body

        states_generated = set()  # For sanity check
        generated_edges = set()
        self.generate_states(sdfg, "sdfg", control_flow,
                             global_stream, callsite_stream,
                             set(states_topological), states_generated,
                             generated_edges)

        #######################################################################

        # Sanity check
        if len(states_generated) != len(sdfg.nodes()):
            raise RuntimeError(
                "Not all states were generated in SDFG {}!"
                "\n  Generated: {}\n  Missing: {}".format(
                    sdfg.label, [s.label for s in states_generated],
                    [s.label for s in (set(sdfg.nodes()) - states_generated)]))

        # Deallocate transients
        shared_transients = sdfg.shared_transients()
        deallocated = set()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in deallocated):
                    self._dispatcher.dispatch_deallocate(
                        sdfg, state, None, node, global_stream,
                        callsite_stream)
                    deallocated.add(node.data)

        # Now that we have all the information about dependencies, generate
        # header and footer
        if is_top_level:
            header_stream = CodeIOStream()
            header_global_stream = CodeIOStream()
            footer_stream = CodeIOStream()
            footer_global_stream = CodeIOStream()
            self.generate_header(sdfg, self._dispatcher.used_environments,
                                 header_global_stream, header_stream)

            # Open program function
            function_signature = 'void __program_%s_internal(%s)\n{\n' % (
                sdfg.name, sdfg.signature())

            self.generate_footer(sdfg, self._dispatcher.used_environments,
                                 footer_global_stream, footer_stream)

            header_global_stream.write(global_stream.getvalue())
            header_global_stream.write(footer_global_stream.getvalue())
            generated_header = header_global_stream.getvalue()

            all_code = CodeIOStream()
            all_code.write(function_signature)
            all_code.write(header_stream.getvalue())
            all_code.write(callsite_stream.getvalue())
            all_code.write(footer_stream.getvalue())
            generated_code = all_code.getvalue()
        else:
            generated_header = global_stream.getvalue()
            generated_code = callsite_stream.getvalue()

        # Return the generated global and local code strings
        return (generated_header, generated_code,
                self._dispatcher.used_targets,
                self._dispatcher.used_environments)
Example #16
0
class DaCeCodeGenerator(object):
    """ DaCe code generator class that writes the generated code for SDFG
        state machines, and uses a dispatcher to generate code for
        individual states based on the target. """
    def __init__(self, *args, **kwargs):
        self._dispatcher = disp.TargetDispatcher(self)
        self._dispatcher.register_state_dispatcher(self)
        self._initcode = CodeIOStream()
        self._exitcode = CodeIOStream()
        self.statestruct: List[str] = []
        self.environments: List[Any] = []

    ##################################################################
    # Target registry

    @property
    def dispatcher(self):
        return self._dispatcher

    ##################################################################
    # Code generation

    def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
        # Write constants
        for cstname, (csttype, cstval) in sdfg.constants_prop.items():
            if isinstance(csttype, data.Array):
                const_str = "constexpr " + csttype.dtype.ctype + \
                    " " + cstname + "[" + str(cstval.size) + "] = {"
                it = np.nditer(cstval, order='C')
                for i in range(cstval.size - 1):
                    const_str += str(it[0]) + ", "
                    it.iternext()
                const_str += str(it[0]) + "};\n"
                callsite_stream.write(const_str, sdfg)
            else:
                callsite_stream.write(
                    "constexpr %s %s = %s;\n" %
                    (csttype.dtype.ctype, cstname, sym2cpp(cstval)), sdfg)

    def generate_fileheader(self,
                            sdfg: SDFG,
                            global_stream: CodeIOStream,
                            backend: str = 'frame'):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param backend: Whose backend this header belongs to.
        """
        # Hash file include
        if backend == 'frame':
            global_stream.write('#include "../../include/hash.h"\n', sdfg)

        #########################################################
        # Environment-based includes
        for env in self.environments:
            if len(env.headers) > 0:
                global_stream.write(
                    "\n".join("#include \"" + h + "\"" for h in env.headers),
                    sdfg)

        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        #########################################################
        # Write state struct
        structstr = '\n'.join(self.statestruct)
        global_stream.write(
            f'''
struct {sdfg.name}_t {{
    {structstr}
}};

''', sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd)
            if backend in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[backend]),
                                    sd)

    def generate_header(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the header of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        # Write frame code - header
        global_stream.write(
            '/* DaCe AUTO-GENERATED FILE. DO NOT MODIFY */\n' +
            '#include <dace/dace.h>\n', sdfg)

        # Write header required by environments
        for env in self.environments:
            self.statestruct.extend(env.state_fields)

        # Instrumentation preamble
        if len(self._dispatcher.instrumentation) > 1:
            self.statestruct.append('dace::perf::Report report;')
            # Reset report if written every invocation
            if config.Config.get_bool('instrumentation',
                                      'report_each_invocation'):
                callsite_stream.write('__state->report.reset();', sdfg)

        self.generate_fileheader(sdfg, global_stream, 'frame')

    def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the footer of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        import dace.library
        fname = sdfg.name
        params = sdfg.signature()
        paramnames = sdfg.signature(False, for_call=True)
        initparams = sdfg.signature(with_arrays=False)
        initparamnames = sdfg.signature(False,
                                        for_call=True,
                                        with_arrays=False)

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

        # Instrumentation saving
        if (config.Config.get_bool('instrumentation', 'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '''__state->report.save("{path}/perf", __HASH_{name});'''.
                format(path=sdfg.build_folder.replace('\\', '/'),
                       name=sdfg.name), sdfg)

        # Write closing brace of program
        callsite_stream.write('}', sdfg)

        # Write awkward footer to avoid 'extern "C"' issues
        params_comma = (', ' + params) if params else ''
        initparams_comma = (', ' + initparams) if initparams else ''
        paramnames_comma = (', ' + paramnames) if paramnames else ''
        initparamnames_comma = (', ' +
                                initparamnames) if initparamnames else ''
        callsite_stream.write(
            f'''
DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma})
{{
    __program_{fname}_internal(__state{paramnames_comma});
}}''', sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' %
                    (target.target_name, sdfg.name, initparams_comma), sdfg)
            if target.has_finalizer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' %
                    (target.target_name, sdfg.name), sdfg)

        callsite_stream.write(
            f"""
DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams})
{{
    int __result = 0;
    {sdfg.name}_t *__state = new {sdfg.name}_t;

            """, sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    '__result |= __dace_init_%s(__state%s);' %
                    (target.target_name, initparamnames_comma), sdfg)
        for env in self.environments:
            init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg)
            if init_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(init_code)
                callsite_stream.write("}")

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.init_code:
                callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

        callsite_stream.write(self._initcode.getvalue(), sdfg)

        callsite_stream.write(
            f"""
    if (__result) {{
        delete __state;
        return nullptr;
    }}
    return __state;
}}

DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state)
{{
""", sdfg)

        # Instrumentation saving
        if (not config.Config.get_bool('instrumentation',
                                       'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '__state->report.save("%s/perf", __HASH_%s);' %
                (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg)

        callsite_stream.write(self._exitcode.getvalue(), sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.exit_code:
                callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

        for target in self._dispatcher.used_targets:
            if target.has_finalizer:
                callsite_stream.write(
                    '__dace_exit_%s(__state);' % target.target_name, sdfg)
        for env in reversed(self.environments):
            finalize_code = _get_or_eval_sdfg_first_arg(
                env.finalize_code, sdfg)
            if finalize_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(finalize_code)
                callsite_stream.write("}")

        callsite_stream.write('delete __state;\n}\n', sdfg)

    def generate_state(self,
                       sdfg,
                       state,
                       global_stream,
                       callsite_stream,
                       generate_state_footer=True):

        sid = sdfg.node_id(state)

        # Emit internal transient array allocation
        # Don't allocate transients shared with another state
        data_to_allocate = (set(state.top_level_transients()) -
                            set(sdfg.shared_transients()))
        allocated = set()
        for node in state.data_nodes():
            if node.data not in data_to_allocate or node.data in allocated:
                continue
            allocated.add(node.data)
            self._dispatcher.dispatch_allocate(sdfg, state, sid, node,
                                               global_stream, callsite_stream)

        callsite_stream.write('\n')

        # Emit internal transient array allocation for nested SDFGs
        # TODO: Replace with global allocation management
        gpu_persistent_subgraphs = [
            state.scope_subgraph(node) for node in state.nodes()
            if isinstance(node, dace.nodes.MapEntry)
            and node.map.schedule == dace.ScheduleType.GPU_Persistent
        ]
        nested_allocated = set()
        for sub_graph in gpu_persistent_subgraphs:
            for nested_sdfg in [
                    n.sdfg for n in sub_graph.nodes()
                    if isinstance(n, nodes.NestedSDFG)
            ]:
                nested_shared_transients = set(nested_sdfg.shared_transients())
                for nested_state in nested_sdfg.nodes():
                    nested_sid = nested_sdfg.node_id(nested_state)
                    nested_to_allocate = (
                        set(nested_state.top_level_transients()) -
                        nested_shared_transients)
                    nodes_to_allocate = [
                        n for n in nested_state.data_nodes()
                        if n.data in nested_to_allocate
                        and n.data not in nested_allocated
                    ]
                    for nested_node in nodes_to_allocate:
                        nested_allocated.add(nested_node.data)
                        self._dispatcher.dispatch_allocate(
                            nested_sdfg, nested_state, nested_sid, nested_node,
                            global_stream, callsite_stream)

        callsite_stream.write('\n')

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_state_begin(sdfg, state, callsite_stream,
                                     global_stream)

        #####################
        # Create dataflow graph for state's children.

        # DFG to code scheme: Only generate code for nodes whose all
        # dependencies have been executed (topological sort).
        # For different connected components, run them concurrently.

        components = dace.sdfg.concurrent_subgraphs(state)

        if len(components) == 1:
            self._dispatcher.dispatch_subgraph(sdfg,
                                               state,
                                               sid,
                                               global_stream,
                                               callsite_stream,
                                               skip_entry_node=False)
        else:
            if sdfg.openmp_sections:
                callsite_stream.write("#pragma omp parallel sections\n{")
            for c in components:
                if sdfg.openmp_sections:
                    callsite_stream.write("#pragma omp section\n{")
                self._dispatcher.dispatch_subgraph(sdfg,
                                                   c,
                                                   sid,
                                                   global_stream,
                                                   callsite_stream,
                                                   skip_entry_node=False)
                if sdfg.openmp_sections:
                    callsite_stream.write("} // End omp section")
            if sdfg.openmp_sections:
                callsite_stream.write("} // End omp sections")

        #####################
        # Write state footer

        if generate_state_footer:

            # Emit internal transient array deallocation for nested SDFGs
            # TODO: Replace with global allocation management
            gpu_persistent_subgraphs = [
                state.scope_subgraph(node) for node in state.nodes()
                if isinstance(node, dace.nodes.MapEntry)
                and node.map.schedule == dace.ScheduleType.GPU_Persistent
            ]
            nested_deallocated = set()
            for sub_graph in gpu_persistent_subgraphs:
                for nested_sdfg in [
                        n.sdfg for n in sub_graph.nodes()
                        if isinstance(n, nodes.NestedSDFG)
                ]:
                    nested_shared_transients = \
                        set(nested_sdfg.shared_transients())
                    for nested_state in nested_sdfg:
                        nested_sid = nested_sdfg.node_id(nested_state)
                        nested_to_allocate = (
                            set(nested_state.top_level_transients()) -
                            nested_shared_transients)
                        nodes_to_deallocate = [
                            n for n in nested_state.data_nodes()
                            if n.data in nested_to_allocate
                            and n.data not in nested_deallocated
                        ]
                        for nested_node in nodes_to_deallocate:
                            nested_deallocated.add(nested_node.data)
                            self._dispatcher.dispatch_deallocate(
                                nested_sdfg, nested_state, nested_sid,
                                nested_node, global_stream, callsite_stream)

            # Emit internal transient array deallocation
            deallocated = set()
            for node in state.data_nodes():
                if (node.data not in data_to_allocate
                        or node.data in deallocated
                        or (node.data in sdfg.arrays
                            and sdfg.arrays[node.data].transient == False)):
                    continue
                deallocated.add(node.data)
                self._dispatcher.dispatch_deallocate(sdfg, state, sid, node,
                                                     global_stream,
                                                     callsite_stream)

            # Invoke all instrumentation providers
            for instr in self._dispatcher.instrumentation.values():
                if instr is not None:
                    instr.on_state_end(sdfg, state, callsite_stream,
                                       global_stream)

    def generate_states(self, sdfg, global_stream, callsite_stream):
        states_generated = set()

        # Create closure + function for state dispatcher
        def dispatch_state(state: SDFGState) -> str:
            stream = CodeIOStream()
            self._dispatcher.dispatch_state(sdfg, state, global_stream, stream)
            states_generated.add(state)  # For sanity check
            return stream.getvalue()

        # Handle specialized control flow
        if config.Config.get_bool('optimizer', 'detect_control_flow'):
            # Avoid import loop
            from dace.transformation import helpers as xfh
            # Clean up the state machine by separating combined condition and assignment
            # edges.
            xfh.split_interstate_edges(sdfg)

            cft = cflow.structured_control_flow_tree(sdfg, dispatch_state)
        else:
            # If disabled, generate entire graph as general control flow block
            states_topological = list(sdfg.topological_sort(sdfg.start_state))
            last = states_topological[-1]
            cft = cflow.GeneralBlock(dispatch_state, [
                cflow.SingleState(dispatch_state, s, s is last)
                for s in states_topological
            ], [])

        callsite_stream.write(
            cft.as_cpp(self.dispatcher.defined_vars, sdfg.symbols), sdfg)

        # Write exit label
        callsite_stream.write(f'__state_exit_{sdfg.sdfg_id}:;', sdfg)

        return states_generated

    def generate_code(
        self,
        sdfg: SDFG,
        schedule: Optional[dtypes.ScheduleType],
        sdfg_id: str = ""
    ) -> Tuple[str, str, Set[TargetCodeGenerator], Set[str]]:
        """ Generate frame code for a given SDFG, calling registered targets'
            code generation callbacks for them to generate their own code.
            :param sdfg: The SDFG to generate code for.
            :param schedule: The schedule the SDFG is currently located, or
                             None if the SDFG is top-level.
            :param sdfg_id: An optional string id given to the SDFG label
            :return: A tuple of the generated global frame code, local frame
                     code, and a set of targets that have been used in the
                     generation of this SDFG.
        """

        if len(sdfg_id) == 0 and sdfg.sdfg_id != 0:
            sdfg_id = '_%d' % sdfg.sdfg_id

        global_stream = CodeIOStream()
        callsite_stream = CodeIOStream()

        is_top_level = sdfg.parent is None

        # Generate code
        ###########################

        # Keep track of allocated variables
        allocated = set()

        # Add symbol mappings to allocated variables
        if sdfg.parent_nsdfg_node is not None:
            allocated |= sdfg.parent_nsdfg_node.symbol_mapping.keys()

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_begin(sdfg, callsite_stream, global_stream)

        # Allocate outer-level transients
        shared_transients = sdfg.shared_transients()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in allocated):
                    self._dispatcher.dispatch_allocate(sdfg, state, None, node,
                                                       global_stream,
                                                       callsite_stream)
                    allocated.add(node.data)

        # Allocate inter-state variables
        global_symbols = copy.deepcopy(sdfg.symbols)
        global_symbols.update(
            {aname: arr.dtype
             for aname, arr in sdfg.arrays.items()})
        interstate_symbols = {}
        for e in sdfg.edges():
            symbols = e.data.new_symbols(global_symbols)
            # Inferred symbols only take precedence if global symbol not defined
            symbols = {
                k: v if k not in global_symbols else global_symbols[k]
                for k, v in symbols.items()
            }
            interstate_symbols.update(symbols)
            global_symbols.update(symbols)

        for isvarName, isvarType in interstate_symbols.items():
            # Skip symbols that have been declared as outer-level transients
            if isvarName in allocated:
                continue
            isvar = data.Scalar(isvarType)
            callsite_stream.write(
                '%s;\n' % (isvar.as_arg(with_types=True, name=isvarName)),
                sdfg)
            self.dispatcher.defined_vars.add(isvarName, isvarType,
                                             isvarType.ctype)

        callsite_stream.write('\n', sdfg)

        #######################################################################
        # Generate actual program body

        states_generated = self.generate_states(sdfg, global_stream,
                                                callsite_stream)

        #######################################################################

        # Sanity check
        if len(states_generated) != len(sdfg.nodes()):
            raise RuntimeError(
                "Not all states were generated in SDFG {}!"
                "\n  Generated: {}\n  Missing: {}".format(
                    sdfg.label, [s.label for s in states_generated],
                    [s.label for s in (set(sdfg.nodes()) - states_generated)]))

        # Deallocate transients
        shared_transients = sdfg.shared_transients()
        deallocated = set()
        for state in sdfg.nodes():
            for node in state.data_nodes():
                if (node.data in shared_transients
                        and node.data not in deallocated):
                    self._dispatcher.dispatch_deallocate(
                        sdfg, state, None, node, global_stream,
                        callsite_stream)
                    deallocated.add(node.data)

        # Now that we have all the information about dependencies, generate
        # header and footer
        if is_top_level:
            # Let each target append code to frame code state before generating
            # header and footer
            for target in self._dispatcher.used_targets:
                target.on_target_used()

            header_stream = CodeIOStream()
            header_global_stream = CodeIOStream()
            footer_stream = CodeIOStream()
            footer_global_stream = CodeIOStream()

            # Get all environments used in the generated code, including
            # dependent environments
            import dace.library  # Avoid import loops
            self.environments = dace.library.get_environments_and_dependencies(
                self._dispatcher.used_environments)

            self.generate_header(sdfg, header_global_stream, header_stream)

            # Open program function
            params = sdfg.signature()
            if params:
                params = ', ' + params
            function_signature = (
                'void __program_%s_internal(%s_t *__state%s)\n{\n' %
                (sdfg.name, sdfg.name, params))

            self.generate_footer(sdfg, footer_global_stream, footer_stream)

            header_global_stream.write(global_stream.getvalue())
            header_global_stream.write(footer_global_stream.getvalue())
            generated_header = header_global_stream.getvalue()

            all_code = CodeIOStream()
            all_code.write(function_signature)
            all_code.write(header_stream.getvalue())
            all_code.write(callsite_stream.getvalue())
            all_code.write(footer_stream.getvalue())
            generated_code = all_code.getvalue()
        else:
            generated_header = global_stream.getvalue()
            generated_code = callsite_stream.getvalue()

        # Clean up generated code
        gotos = re.findall(r'goto (.*);', generated_code)
        clean_code = ''
        for line in generated_code.split('\n'):
            # Empty line with semicolon
            if re.match(r'^\s*;\s*', line):
                continue
            # Label that might be unused
            label = re.findall(
                r'^\s*([a-zA-Z_][a-zA-Z_0-9]*):\s*[;]?\s*////.*$', line)
            if len(label) > 0:
                if label[0] not in gotos:
                    continue
            clean_code += line + '\n'

        # Return the generated global and local code strings
        return (generated_header, clean_code, self._dispatcher.used_targets,
                self._dispatcher.used_environments)
Example #17
0
    def generate_kernel_internal(self, sdfg, state, kernel_name, subgraphs,
                                 kernel_stream, function_stream,
                                 callsite_stream):
        """Main entry function for generating a Xilinx kernel."""

        (global_data_parameters, top_level_local_data, subgraph_parameters,
         nested_global_transients, bank_assignments,
         external_streams) = self.make_parameters(sdfg, state, subgraphs)

        # Detect RTL tasklets, which will be launched as individual kernels
        rtl_tasklet_names = [
            self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes()
            if isinstance(nd, nodes.RTLTasklet)
        ]

        host_code_stream = CodeIOStream()

        # Generate host code
        self.generate_host_header(sdfg, kernel_name, global_data_parameters,
                                  host_code_stream)
        self.generate_host_function_boilerplate(sdfg, state, kernel_name,
                                                global_data_parameters,
                                                nested_global_transients,
                                                host_code_stream,
                                                function_stream,
                                                callsite_stream)

        # Now we write the device code
        module_stream = CodeIOStream()
        entry_stream = CodeIOStream()

        state_id = sdfg.node_id(state)

        self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name,
                                             global_data_parameters,
                                             bank_assignments, module_stream,
                                             entry_stream, external_streams)

        # Emit allocations
        for node in top_level_local_data:
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               module_stream, entry_stream)
        for is_output, name, node, _ in external_streams:
            self._dispatcher.defined_vars.add_global(name, DefinedType.Stream,
                                                     node.ctype)
            if name not in self._stream_connections:
                self._stream_connections[name] = [None, None]
            self._stream_connections[name][
                0 if is_output else 1] = '{}_1.{}'.format(kernel_name, name)

        self.generate_modules(sdfg, state, kernel_name, subgraphs,
                              subgraph_parameters, module_stream, entry_stream,
                              host_code_stream)

        self.generate_host_function_body(sdfg, state, kernel_name,
                                         global_data_parameters,
                                         rtl_tasklet_names, host_code_stream)

        # Store code to be passed to compilation phase
        self._host_codes.append((kernel_name, host_code_stream.getvalue()))
        kernel_stream.write(module_stream.getvalue())
        kernel_stream.write(entry_stream.getvalue())

        self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
Example #18
0
    def generate_kernel_internal(self, sdfg: dace.SDFG, state: dace.SDFGState,
                                 kernel_name: str, predecessors: list,
                                 subgraphs: list, kernel_stream: CodeIOStream,
                                 state_host_header_stream: CodeIOStream,
                                 state_host_body_stream: CodeIOStream,
                                 instrumentation_stream: CodeIOStream,
                                 function_stream: CodeIOStream,
                                 callsite_stream: CodeIOStream,
                                 state_parameters: list):
        '''
        Generates Kernel code, both device and host side.
        :param sdfg:
        :param state:
        :param kernel_name:
        :param predecessors: list containing all the name of kernels from which this one depends
        :param subgraphs:
        :param kernel_stream: Device code stream, contains the kernel code
        :param state_host_header_stream: Device-specific code stream: contains the host code
            for the state global declarations.
        :param state_host_body_stream: Device-specific code stream: contains all the code related to
            this state, for creating transient buffers, spawning kernels, and synchronizing them.
        :param instrumentation_stream: Code for profiling kernel execution time.
        :param function_stream: CPU code stream.
        :param callsite_stream: CPU code stream.
        :param state_parameters: list of state parameters. The kernel-specific parameters will be appended to it.
        '''

        (global_data_parameters, top_level_local_data, subgraph_parameters,
         nested_global_transients, bank_assignments,
         external_streams) = self.make_parameters(sdfg, state, subgraphs)

        state_parameters.extend(global_data_parameters)

        # Detect RTL tasklets, which will be launched as individual kernels
        rtl_tasklet_names = [
            self.rtl_tasklet_name(nd, state, sdfg) for nd in state.nodes()
            if isinstance(nd, nodes.RTLTasklet)
        ]

        # Generate host code
        self.generate_host_header(sdfg, kernel_name, global_data_parameters,
                                  state_host_header_stream)
        self.generate_host_function_boilerplate(sdfg, state,
                                                nested_global_transients,
                                                state_host_body_stream)

        # Now we write the device code
        module_stream = CodeIOStream()
        entry_stream = CodeIOStream()

        state_id = sdfg.node_id(state)

        self.generate_kernel_boilerplate_pre(sdfg, state_id, kernel_name,
                                             global_data_parameters,
                                             bank_assignments, module_stream,
                                             entry_stream, external_streams)

        # Emit allocations
        for node in top_level_local_data:
            self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                               node.desc(sdfg), module_stream,
                                               entry_stream)
        for is_output, name, node, _ in external_streams:
            self._dispatcher.defined_vars.add_global(name, DefinedType.Stream,
                                                     node.ctype)
            if name not in self._stream_connections:
                self._stream_connections[name] = [None, None]
            key = 0 if is_output else 1
            val = '{}_1.{}'.format(kernel_name, name)
            self._stream_connections[name][key] = val

        self.generate_modules(sdfg, state, kernel_name, subgraphs,
                              subgraph_parameters, module_stream, entry_stream,
                              state_host_body_stream, instrumentation_stream)

        self.generate_host_function_body(sdfg, state, kernel_name,
                                         predecessors, global_data_parameters,
                                         rtl_tasklet_names,
                                         state_host_body_stream,
                                         instrumentation_stream)

        # Store code to be passed to compilation phase
        # self._host_codes.append((kernel_name, host_code_stream.getvalue()))
        kernel_stream.write(module_stream.getvalue())
        kernel_stream.write(entry_stream.getvalue())

        self.generate_kernel_boilerplate_post(kernel_stream, sdfg, state_id)
Example #19
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        output_dims = len(output_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        cuda_initcode = CodeIOStream()
        cuda_exitcode = CodeIOStream()
        host_globalcode = CodeIOStream()
        host_localcode = CodeIOStream()
        output_memlet = output_edge.data

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype

        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        reduce_shape = input_memlet.subset.bounding_box_size()
        num_items = ' * '.join(symstr(s) for s in reduce_shape)
        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = (output_memlet.data + ' + ' +
                  cpp_array_expr(sdfg, output_memlet, with_brackets=False))

        input_dims = input_memlet.subset.dims()
        output_dims = output_memlet.subset.data_dims()

        reduce_all_axes = (node.axes is None or len(node.axes) == input_dims)
        if reduce_all_axes:
            reduce_last_axes = False
        else:
            reduce_last_axes = sorted(node.axes) == list(
                range(input_dims - len(node.axes), input_dims))

        if (not reduce_all_axes) and (not reduce_last_axes):
            raise NotImplementedError(
                'Multiple axis reductions not supported on GPUs. Please use '
                'the pure expansion or make reduce axes the last in the array.'
            )

        # Verify that data is on the GPU
        if input_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Input of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')
        if output_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Output of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')

        # Determine reduction type
        kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype
                 in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce')

        # Create temp memory for this GPU
        cuda_globalcode.write(
            """
            void *__cub_storage_{sdfg}_{state}_{node} = NULL;
            size_t __cub_ssize_{sdfg}_{state}_{node} = 0;
        """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg,
            state_id, node)

        if reduce_all_axes:
            reduce_type = 'DeviceReduce'
            reduce_range = num_items
            reduce_range_def = 'size_t num_items'
            reduce_range_use = 'num_items'
            reduce_range_call = num_items
        elif reduce_last_axes:
            num_reduce_axes = len(node.axes)
            not_reduce_axes = reduce_shape[:-num_reduce_axes]
            reduce_axes = reduce_shape[-num_reduce_axes:]

            num_segments = ' * '.join([symstr(s) for s in not_reduce_axes])
            segment_size = ' * '.join([symstr(s) for s in reduce_axes])

            reduce_type = 'DeviceSegmentedReduce'
            iterator = 'dace::stridedIterator({size})'.format(
                size=segment_size)
            reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments,
                                                          it=iterator)
            reduce_range_def = 'size_t num_segments, size_t segment_size'
            iterator_use = 'dace::stridedIterator(segment_size)'
            reduce_range_use = 'num_segments, {it}, {it} + 1'.format(
                it=iterator_use)
            reduce_range_call = '%s, %s' % (num_segments, segment_size)

        # Call CUB to get the storage size, allocate and free it
        cuda_initcode.write(
            """
            cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node},
                                        ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop});
            cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node});
""".format(sdfg=sdfg.name,
           state=state_id,
           node=node_id,
           reduce_type=reduce_type,
           reduce_range=reduce_range,
           redop=reduce_op,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           kname=kname), sdfg, state_id, node)

        cuda_exitcode.write(
            'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format(
                sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
            node)

        # Write reduction function definition
        cuda_globalcode.write("""
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream)
{{
cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id},
                            input, output, {reduce_range_use}{redop}, stream);
}}
        """.format(id=idstr,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype,
                   reduce_type=reduce_type,
                   reduce_range_def=reduce_range_def,
                   reduce_range_use=reduce_range_use,
                   kname=kname,
                   redop=reduce_op))

        # Write reduction function definition in caller file
        host_globalcode.write(
            """
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
        """.format(id=idstr,
                   reduce_range_def=reduce_range_def,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype), sdfg, state_id, node)

        # Call reduction function where necessary
        host_localcode.write(
            '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);'
            .format(id=idstr,
                    input=input,
                    output=output,
                    reduce_range_call=reduce_range_call))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   host_localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(host_globalcode.getvalue())
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')
        sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda')
        sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode