Exemple #1
0
def make_write_C(sdfg, state, vtype):
    """Build the component that accumulates kernel results into ``C_device``.

    Values are taken from the ``C_pipe`` stream. When the kernel vector type
    holds fewer elements than a 64-byte memory vector, a ``Gearbox`` node is
    inserted that repacks them into the ``C_from_converter`` stream first.
    A ``write_C`` map then adds every incoming value to the value currently
    stored in ``C_device`` and writes the sum back to the same location.

    :param sdfg: SDFG in which the ``C_from_converter`` stream is registered
                 when a conversion is required.
    :param state: State that receives all nodes and memlets.
    :param vtype: Vector type used by the kernel; its ``base_type`` and
                  ``veclen`` attributes are read.
    """

    # Element type of the kernel and the matching 64-byte memory vector type
    base = vtype.base_type
    mem_veclen = 64 // base.bytes
    wide_type = dace.vector(base, mem_veclen)

    pipe_node = state.add_read("C_pipe")
    c_in = state.add_read("C_device")
    c_out = state.add_write("C_device")

    if mem_veclen <= vtype.veclen:

        # The kernel already emits memory-width vectors, so the pipe can be
        # consumed by the writer without any conversion
        source_node = pipe_node
        source_subset = "C_pipe[0]"

    else:

        # Widen the kernel vectors to the 512-bit vectors that are written
        # back to memory
        repack = Gearbox(f"(N//TN) * (M//TM) * TN * (TM//{mem_veclen})",
                         "convert_C",
                         schedule=dace.ScheduleType.FPGA_Device)
        sdfg.add_stream("C_from_converter",
                        wide_type,
                        buffer_size=f"TM//{mem_veclen}",
                        storage=dace.StorageType.FPGA_Local,
                        transient=True)
        repacked_sink = state.add_write("C_from_converter")

        narrow_memlet = dace.Memlet("C_pipe[0]", dynamic=True)
        state.add_memlet_path(pipe_node,
                              repack,
                              dst_conn="from_kernel",
                              memlet=narrow_memlet)
        wide_memlet = dace.Memlet("C_from_converter[0]", dynamic=True)
        state.add_memlet_path(repack,
                              repacked_sink,
                              src_conn="to_memory",
                              memlet=wide_memlet)

        source_node = state.add_read("C_from_converter")
        source_subset = "C_from_converter[0]"

    # One iteration per memory-width vector of every tile of C
    tile_ranges = {
        "n0": "0:N//TN",
        "m0": "0:M//TM",
        "n1": "0:TN",
        "m1": f"0:TM//{mem_veclen}"
    }
    map_entry, map_exit = state.add_map(
        "write_C", tile_ranges, schedule=dace.ScheduleType.FPGA_Device)

    accumulate = state.add_tasklet("write_C", {"from_kernel", "prev"},
                                   {"to_memory"},
                                   "to_memory = from_kernel + prev")

    # The same element of C is read as the previous value and written back
    c_subset = f"C_device[n0 * TN + n1, m0 * (TM//{mem_veclen}) + m1]"

    state.add_memlet_path(source_node,
                          map_entry,
                          accumulate,
                          dst_conn="from_kernel",
                          memlet=dace.Memlet(source_subset))
    state.add_memlet_path(c_in,
                          map_entry,
                          accumulate,
                          dst_conn="prev",
                          memlet=dace.Memlet(c_subset))
    state.add_memlet_path(accumulate,
                          map_exit,
                          c_out,
                          src_conn="to_memory",
                          memlet=dace.Memlet(c_subset))
Exemple #2
0
    def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
        """Replace direct accesses to ``self.access`` with stream-based components.

        Every edge on the access node (outgoing edges when
        ``self.expr_index == 0``, i.e. reads; incoming edges otherwise, i.e.
        writes) is rerouted through a newly created transient stream. Separate
        map/tasklet components are then added that copy between the original
        data container and those streams. When ``self.use_memory_buffering``
        is set, a ``Gearbox`` node is placed between two streams for every
        edge, and the data descriptor is changed to a vector type holding
        ``memory_buffering_target_bytes / dtype.bytes`` elements.

        :param state: State containing the access node; modified in place.
        :param sdfg: SDFG owning the data descriptors; new streams are added
                     to it.
        :return: The stream access nodes created for the read/write
                 components. NOTE(review): this is a list, although the
                 declared return annotation names a single access node.
        """
        dnode: nodes.AccessNode = self.access
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern
        components_to_create: Dict[
            Tuple[symbolic.SymbolicType],
            List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                mpath = state.memlet_path(edge)
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                               0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    # An edge whose destination is reachable from another
                    # edge of the same component gets a component of its own
                    if any(
                            nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                            for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [
                        c for c in component if c not in edges_to_remove
                    ]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:

            if self.use_memory_buffering:

                # Add gearbox
                total_size = edge.data.volume
                vector_size = int(self.memory_buffering_target_bytes /
                                  desc.dtype.bytes)

                if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=sdfg.arrays[dnode.data].shape[-1],
                                vec=vector_size))

                for i in sdfg.arrays[dnode.data].strides:
                    if not is_int(i):
                        warnings.warn(
                            "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=i, vec=vector_size))

                if self.expr_index == 0:  # Read
                    # Memory side carries vectors, kernel side carries scalars
                    gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                    gearbox_output_type = desc.dtype
                    gearbox_read_volume = total_size / vector_size
                    gearbox_write_volume = total_size
                else:  # Write
                    # Kernel side carries scalars, memory side carries vectors
                    gearbox_input_type = desc.dtype
                    gearbox_output_type = dtypes.vector(
                        desc.dtype, vector_size)
                    gearbox_read_volume = total_size
                    gearbox_write_volume = total_size / vector_size

                input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_input",
                    gearbox_input_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_output",
                    gearbox_output_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                read_to_gearbox = state.add_read(input_gearbox_name)
                write_from_gearbox = state.add_write(output_gearbox_name)

                gearbox = Gearbox(total_size / vector_size)

                state.add_node(gearbox)

                state.add_memlet_path(read_to_gearbox,
                                      gearbox,
                                      dst_conn="from_memory",
                                      memlet=Memlet(
                                          input_gearbox_name + "[0]",
                                          volume=gearbox_read_volume))
                state.add_memlet_path(gearbox,
                                      write_from_gearbox,
                                      src_conn="to_kernel",
                                      memlet=Memlet(
                                          output_gearbox_name + "[0]",
                                          volume=gearbox_write_volume))

                # ``streams[edge]`` is the stream used by the memory-side
                # component; ``name``/``newdesc`` describe the stream that
                # replaces the data on the original memlet path
                if self.expr_index == 0:
                    streams[edge] = input_gearbox_name
                    name = output_gearbox_name
                    newdesc = output_gearbox_newdesc
                else:
                    streams[edge] = output_gearbox_name
                    name = input_gearbox_name
                    newdesc = input_gearbox_newdesc

            else:
                # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
                stream_name = "stream_" + dnode.data
                name, newdesc = sdfg.add_stream(stream_name,
                                                desc.dtype,
                                                buffer_size=self.buffer_size,
                                                storage=self.storage,
                                                transient=True,
                                                find_new_name=True)
                streams[edge] = name

                # Add these such that we can easily use output_gearbox_name and input_gearbox_name without using if statements
                output_gearbox_name = name
                input_gearbox_name = name

            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            for e in mpath:
                e.data = mm.Memlet(data=name,
                                   subset='0',
                                   other_subset=e.data.other_subset)
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(output_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst,
                               edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(input_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement,
                               edge.dst_conn, edge.data)

        if self.use_memory_buffering:

            arrname = str(self.access)
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

            # Vectorize access to global array.
            dtype = sdfg.arrays[arrname].dtype
            sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
            new_shape = list(sdfg.arrays[arrname].shape)
            contigidx = sdfg.arrays[arrname].strides.index(1)
            new_shape[contigidx] /= vector_size
            try:
                new_shape[contigidx] = int(new_shape[contigidx])
            except TypeError:
                # Symbolic sizes cannot be converted to int and are kept as is
                pass
            sdfg.arrays[arrname].shape = new_shape

            # Change strides
            new_strides: List = list(sdfg.arrays[arrname].strides)

            # Skip last dimension since it is always 1
            for i in range(len(new_strides) - 1):
                new_strides[i] = new_strides[i] / vector_size
            sdfg.arrays[arrname].strides = new_strides

            post_state = get_post_state(sdfg, state)

            if post_state is not None:
                # Change subset in the post state such that the correct amount of memory is copied back from the device
                for e in post_state.edges():
                    if e.data.data == self.access.data:
                        new_subset = list(e.data.subset)
                        i, j, k = new_subset[-1]
                        new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                        e.data = mm.Memlet(data=str(e.src),
                                           subset=subsets.Range(new_subset))

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append(
                        (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                # The single value read is broadcast to every output stream
                code = '\n'.join('__out%d = __inp' % i
                                 for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(
                        f'More than one input found for the same index for {dnode.data}'
                    )
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append(
                        (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            maps = []
            for entry in path:
                entry_map: nodes.Map = entry.map

                # NOTE: from here on ``ranges`` is a list of
                # (parameter, (begin, end, step)) pairs for the current map,
                # no longer the per-edge dictionary built above
                ranges = [(p, (r[0], r[1], r[2]))
                          for p, r in zip(entry_map.params, entry_map.range)]

                # Change ranges of map
                if self.use_memory_buffering:
                    # Find edges from/to map

                    edge_subset = [
                        a_tuple[0]
                        for a_tuple in list(innermost_edge.data.subset)
                    ]

                    # Change range of map
                    if isinstance(edge_subset[-1], symbol) and str(
                            edge_subset[-1]) == entry_map.params[-1]:

                        if not is_int(ranges[-1][1][1]):

                            warnings.warn(
                                "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                .format(sym=ranges[-1][1][1].args[1],
                                        vec=vector_size))

                        # Shrink the inclusive end of the last map dimension
                        # by the vector size
                        ranges[-1] = (ranges[-1][0],
                                      (ranges[-1][1][0],
                                       (ranges[-1][1][1] + 1) / vector_size -
                                       1, ranges[-1][1][2]))

                    elif isinstance(edge_subset[-1], sympy.core.add.Add):

                        for arg in edge_subset[-1].args:
                            if isinstance(
                                    arg, symbol
                            ) and str(arg) == entry_map.params[-1]:

                                if not is_int(ranges[-1][1][1]):
                                    warnings.warn(
                                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                        .format(sym=ranges[-1][1][1].args[1],
                                                vec=vector_size))

                                ranges[-1] = (ranges[-1][0], (
                                    ranges[-1][1][0],
                                    (ranges[-1][1][1] + 1) / vector_size - 1,
                                    ranges[-1][1][2]))

                maps.append(
                    state.add_map(f'__s{opname}_{mapname}', ranges,
                                  entry_map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            # Inputs enter through the map entries from outermost to innermost
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node,
                                      *(me for me, _ in maps),
                                      tasklet,
                                      dst_conn=cname,
                                      memlet=memlet)
            # Outputs leave through the map exits from innermost to outermost
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet,
                                      *(mx for _, mx in reversed(maps)),
                                      node,
                                      src_conn=cname,
                                      memlet=memlet)

        return ionodes
Exemple #3
0
def make_read_B(sdfg, state, vtype):
    """Build the component that streams ``B_device`` into ``B_to_feeder``.

    A ``read_B`` map reads memory-width vectors from ``B_device``. If the
    kernel vector type holds at least as many elements as a 64-byte memory
    vector, they are pushed straight to ``B_to_feeder``. Otherwise they go
    through the ``B_to_converter`` stream and a ``Gearbox`` node that narrows
    them to the kernel's vector width.

    :param sdfg: SDFG in which the ``B_to_converter`` stream is registered
                 when a conversion is required.
    :param state: State that receives all nodes and memlets.
    :param vtype: Vector type used by the kernel; its ``base_type`` and
                  ``veclen`` attributes are read.
    """

    # Element type of the kernel and the matching 64-byte memory vector type
    base = vtype.base_type
    mem_veclen = 64 // base.bytes
    wide_type = dace.vector(base, mem_veclen)

    tile_ranges = {
        "n0": "0:N//TN",
        "m0": "0:M//TM",
        "k": "0:K",
        "m1": f"0:TM//{mem_veclen}"
    }
    map_entry, map_exit = state.add_map(
        "read_B", tile_ranges, schedule=dace.ScheduleType.FPGA_Device)

    b_node = state.add_read("B_device")
    feeder_node = state.add_write("B_to_feeder")
    reader = state.add_tasklet("read_B", {"from_memory"}, {"to_feeder"},
                               "to_feeder = from_memory")

    b_subset = f"B_device[k, m0 * (TM//{mem_veclen}) + m1]"
    state.add_memlet_path(b_node,
                          map_entry,
                          reader,
                          dst_conn="from_memory",
                          memlet=dace.Memlet(b_subset))

    if mem_veclen <= vtype.veclen:
        # The kernel consumes the full memory width, so the data is forwarded
        # to the feeder without any conversion
        state.add_memlet_path(reader,
                              map_exit,
                              feeder_node,
                              src_conn="to_feeder",
                              memlet=dace.Memlet("B_to_feeder[0]"))
        return

    # Memory delivers 512-bit wide vectors; buffer them in a stream before
    # they are narrowed to the kernel's vector length
    sdfg.add_stream("B_to_converter",
                    dtype=wide_type,
                    buffer_size=MINIMUM_CHANNEL_DEPTH,
                    storage=dace.StorageType.FPGA_Local,
                    transient=True)
    converter_sink = state.add_write("B_to_converter")
    state.add_memlet_path(reader,
                          map_exit,
                          converter_sink,
                          src_conn="to_feeder",
                          memlet=dace.Memlet("B_to_converter[0]"))

    # Narrow the 512-bit vectors to whatever width the kernel uses
    converter_source = state.add_read("B_to_converter")
    narrow = Gearbox(f"(N//TN) * (M//TM) * K * (TM//{mem_veclen})",
                     "convert_B", dace.ScheduleType.FPGA_Device)
    wide_memlet = dace.Memlet("B_to_converter[0]", dynamic=True)
    state.add_memlet_path(converter_source,
                          narrow,
                          dst_conn="from_memory",
                          memlet=wide_memlet)
    narrow_memlet = dace.Memlet("B_to_feeder[0]", dynamic=True)
    state.add_memlet_path(narrow,
                          feeder_node,
                          src_conn="to_feeder",
                          memlet=narrow_memlet)
Exemple #4
0
def memory_buffering(vec_width, use_library_node, elementwise):
    """Build and run an SDFG that moves data through two gearboxes.

    The SDFG copies ``input_array_host`` to the device, reads it as wide
    memory vectors, narrows them to the kernel vector width, adds 1 to every
    value, widens the results again, writes them to ``output_array`` and
    copies that back to ``output_array_host``.

    NOTE(review): ``dtype``, ``mem_width``, ``n`` and ``run_program`` are not
    defined in this function and are presumably module-level names.

    :param vec_width: Vector length of the kernel-side type.
    :param use_library_node: If true, the two conversions are ``Gearbox``
                             library nodes; otherwise they are written out as
                             maps and tasklets with a one-element buffer.
    :param elementwise: If true, the memory type is a flat vector of
                        ``mem_width`` elements of ``dtype``; otherwise it is a
                        vector of ``mem_width // vec_width`` kernel vectors.
    :return: The SDFG, after it has been passed to ``run_program``.
    """

    gear_factor = mem_width // vec_width
    kernel_type = dace.vector(dtype, vec_width)
    if elementwise:
        memory_type = dace.vector(dtype, mem_width)
    else:
        memory_type = dace.vector(kernel_type, gear_factor)
    sdfg = dace.SDFG("memory_buffering_library_node")
    state = sdfg.add_state("memory_buffering_library_node")

    sdfg.add_array("input_array", (n / mem_width, ),
                   memory_type,
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
    sdfg.add_array("output_array", (n / mem_width, ),
                   memory_type,
                   transient=True,
                   storage=dace.StorageType.FPGA_Global)
    sdfg.add_stream("read_to_gearbox",
                    memory_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("gearbox_to_kernel",
                    kernel_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("kernel_to_gearbox",
                    kernel_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)
    sdfg.add_stream("gearbox_to_write",
                    memory_type,
                    transient=True,
                    storage=dace.StorageType.FPGA_Local)

    # Read from memory
    memory_read = state.add_read("input_array")
    read_to_gearbox_write = state.add_write("read_to_gearbox")
    read_entry, read_exit = state.add_map(
        "read", {"i": f"0:n/{mem_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    read_tasklet = state.add_tasklet("read", {"mem"}, {"to_gearbox"},
                                     "to_gearbox = mem")
    state.add_memlet_path(memory_read,
                          read_entry,
                          read_tasklet,
                          dst_conn="mem",
                          memlet=dace.Memlet("input_array[i]"))
    state.add_memlet_path(read_tasklet,
                          read_exit,
                          read_to_gearbox_write,
                          src_conn="to_gearbox",
                          memlet=dace.Memlet("read_to_gearbox[0]"))

    # Gearbox input
    read_to_gearbox_read = state.add_read("read_to_gearbox")
    gearbox_to_kernel_write = state.add_write("gearbox_to_kernel")
    if use_library_node:
        read_gearbox = Gearbox(n / mem_width, name="read_gearbox")
        state.add_node(read_gearbox)
        state.add_memlet_path(read_to_gearbox_read,
                              read_gearbox,
                              dst_conn="from_memory",
                              memlet=dace.Memlet("read_to_gearbox[0]",
                                                 volume=n / mem_width))
        state.add_memlet_path(read_gearbox,
                              gearbox_to_kernel_write,
                              src_conn="to_kernel",
                              memlet=dace.Memlet("gearbox_to_kernel[0]",
                                                 volume=n / vec_width))
    else:
        # Hand-written gearbox: a wide vector is popped when j == 0, kept in
        # a one-element buffer and emitted one narrow piece per iteration
        sdfg.add_array("read_buffer", (1, ),
                       memory_type,
                       storage=dace.StorageType.FPGA_Local,
                       transient=True)
        read_buffer_read = state.add_read("read_buffer")
        read_buffer_write = state.add_write("read_buffer")
        read_gearbox_entry, read_gearbox_exit = state.add_map(
            "gearbox_read", {
                "i": f"0:n/{mem_width}",
                "j": f"0:{gear_factor}"
            },
            schedule=dace.ScheduleType.FPGA_Device)
        read_gearbox_tasklet = state.add_tasklet(
            "gearbox_read", {
                "from_memory": memory_type,
                "buffer_in": None
            }, {"to_kernel", "buffer_out"}, """\
wide = from_memory if j == 0 else buffer_in
to_kernel = wide[j]
buffer_out = wide""")
        state.add_memlet_path(read_to_gearbox_read,
                              read_gearbox_entry,
                              read_gearbox_tasklet,
                              dst_conn="from_memory",
                              memlet=dace.Memlet("read_to_gearbox[0]",
                                                 dynamic=True))
        state.add_memlet_path(read_buffer_read,
                              read_gearbox_entry,
                              read_gearbox_tasklet,
                              dst_conn="buffer_in",
                              memlet=dace.Memlet("read_buffer[0]"))
        state.add_memlet_path(read_gearbox_tasklet,
                              read_gearbox_exit,
                              gearbox_to_kernel_write,
                              src_conn="to_kernel",
                              memlet=dace.Memlet("gearbox_to_kernel[0]"))
        state.add_memlet_path(read_gearbox_tasklet,
                              read_gearbox_exit,
                              read_buffer_write,
                              src_conn="buffer_out",
                              memlet=dace.Memlet("read_buffer[0]"))

    # Some fictional compute
    gearbox_to_kernel_read = state.add_read("gearbox_to_kernel")
    kernel_to_gearbox_write = state.add_write("kernel_to_gearbox")
    compute_entry, compute_exit = state.add_map(
        "compute", {"i": f"0:n/{vec_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    compute_tasklet = state.add_tasklet("compute", {"val_in"}, {"val_out"},
                                        "val_out = val_in + 1")
    state.add_memlet_path(gearbox_to_kernel_read,
                          compute_entry,
                          compute_tasklet,
                          dst_conn="val_in",
                          memlet=dace.Memlet("gearbox_to_kernel[0]"))
    state.add_memlet_path(compute_tasklet,
                          compute_exit,
                          kernel_to_gearbox_write,
                          src_conn="val_out",
                          memlet=dace.Memlet("kernel_to_gearbox[0]"))

    # Gearbox output
    # The kernel stream is consumed here and the memory stream is produced,
    # so the former is a read node and the latter a write node (mirroring
    # the "Gearbox input" section above)
    kernel_to_gearbox_read = state.add_read("kernel_to_gearbox")
    gearbox_to_write_write = state.add_write("gearbox_to_write")
    if use_library_node:
        write_gearbox = Gearbox(n / mem_width, name="write_gearbox")
        state.add_node(write_gearbox)
        state.add_memlet_path(kernel_to_gearbox_read,
                              write_gearbox,
                              dst_conn="from_kernel",
                              memlet=dace.Memlet("kernel_to_gearbox[0]",
                                                 volume=n / vec_width))
        state.add_memlet_path(write_gearbox,
                              gearbox_to_write_write,
                              src_conn="to_memory",
                              memlet=dace.Memlet("gearbox_to_write[0]",
                                                 volume=n / mem_width))
    else:
        # Hand-written gearbox: narrow pieces are collected in a one-element
        # buffer and the wide vector is pushed on the last piece only
        sdfg.add_array("write_buffer", (1, ),
                       memory_type,
                       storage=dace.StorageType.FPGA_Local,
                       transient=True)
        write_buffer_read = state.add_read("write_buffer")
        write_buffer_write = state.add_write("write_buffer")
        write_gearbox_entry, write_gearbox_exit = state.add_map(
            "gearbox_write", {
                "i": f"0:n/{mem_width}",
                "j": f"0:{gear_factor}"
            },
            schedule=dace.ScheduleType.FPGA_Device)
        write_gearbox_tasklet = state.add_tasklet(
            "gearbox_write", {"from_kernel", "buffer_in"},
            {"to_memory", "buffer_out"}, f"""\
wide = buffer_in
wide[j] = from_kernel
if j == {gear_factor} - 1:
    to_memory = wide
buffer_out = wide""")
        state.add_memlet_path(kernel_to_gearbox_read,
                              write_gearbox_entry,
                              write_gearbox_tasklet,
                              dst_conn="from_kernel",
                              memlet=dace.Memlet("kernel_to_gearbox[0]"))
        state.add_memlet_path(write_buffer_read,
                              write_gearbox_entry,
                              write_gearbox_tasklet,
                              dst_conn="buffer_in",
                              memlet=dace.Memlet("write_buffer[0]"))
        state.add_memlet_path(write_gearbox_tasklet,
                              write_gearbox_exit,
                              gearbox_to_write_write,
                              src_conn="to_memory",
                              memlet=dace.Memlet("gearbox_to_write[0]",
                                                 dynamic=True))
        state.add_memlet_path(write_gearbox_tasklet,
                              write_gearbox_exit,
                              write_buffer_write,
                              src_conn="buffer_out",
                              memlet=dace.Memlet("write_buffer[0]"))

    # Write memory
    gearbox_to_write_read = state.add_read("gearbox_to_write")
    memory_write = state.add_write("output_array")
    write_entry, write_exit = state.add_map(
        "write", {"i": f"0:n/{mem_width}"},
        schedule=dace.ScheduleType.FPGA_Device)
    write_tasklet = state.add_tasklet("write", {"from_gearbox"}, {"mem"},
                                      "mem = from_gearbox")
    state.add_memlet_path(gearbox_to_write_read,
                          write_entry,
                          write_tasklet,
                          dst_conn="from_gearbox",
                          memlet=dace.Memlet("gearbox_to_write[0]"))
    state.add_memlet_path(write_tasklet,
                          write_exit,
                          memory_write,
                          src_conn="mem",
                          memlet=dace.Memlet("output_array[i]"))

    # Copy data to the FPGA
    sdfg.add_array("input_array_host", (n, ), dtype)
    pre_state = sdfg.add_state("host_to_device")
    host_to_device_read = pre_state.add_read("input_array_host")
    host_to_device_write = pre_state.add_write("input_array")
    pre_state.add_memlet_path(
        host_to_device_read,
        host_to_device_write,
        memlet=dace.Memlet(f"input_array[0:n/{mem_width}]"))

    # Copy data back to the host
    sdfg.add_array("output_array_host", (n, ), dtype)
    post_state = sdfg.add_state("device_to_host")
    device_to_host_read = post_state.add_read("output_array")
    device_to_host_write = post_state.add_write("output_array_host")
    post_state.add_memlet_path(
        device_to_host_read,
        device_to_host_write,
        memlet=dace.Memlet(f"output_array[0:n/{mem_width}]"))

    # Link states
    sdfg.add_edge(pre_state, state, dace.InterstateEdge())
    sdfg.add_edge(state, post_state, dace.InterstateEdge())

    run_program(sdfg)

    return sdfg