Example 1
    def _IfExp(self, t):
        if util.only_scalars_involed(self.get_defined_symbols(), t.test,
                                     t.body, t.orelse):
            return super()._IfExp(t)

        if_type, else_type = self.infer(t.body, t.orelse)
        res_type = dtypes.result_type_of(if_type, else_type)
        if not isinstance(res_type, dtypes.vector):
            res_type = dtypes.vector(res_type, -1)

        self.write('svsel(')
        self.dispatch_expect(t.test, dtypes.vector(dace.bool, -1))
        self.write(', ')
        self.dispatch_expect(t.body, res_type)
        self.write(', ')
        self.dispatch_expect(t.orelse, res_type)
        self.write(')')
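For intuition, `svsel` is an element-wise select driven by a predicate register. A minimal NumPy stand-in (assumption: NumPy installed; not part of the code above) behaves the same way:

import numpy as np

test = np.array([True, False, True, False])   # predicate register
body = np.array([1, 2, 3, 4])                 # "if" branch vector
orelse = np.array([10, 20, 30, 40])           # "else" branch vector
print(np.where(test, body, orelse))           # [ 1 20  3 40]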
Example 2
def _IfExp(t, symbols, inferred_symbols):
    type_test = _dispatch(t.test, symbols, inferred_symbols)
    type_body = _dispatch(t.body, symbols, inferred_symbols)
    type_orelse = _dispatch(t.orelse, symbols, inferred_symbols)
    res_type = dtypes.result_type_of(type_body, type_orelse)
    if isinstance(type_test, dtypes.vector) and not isinstance(res_type, (dtypes.vector, dtypes.pointer)):
        # If we test on a vector, the result should be a vector as well
        # so we can do a selection based on the test predicate
        res_type = dtypes.vector(res_type, type_test.veclen)
    return res_type
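A minimal sketch of the promotion step, assuming a DaCe installation; `result_type_of` and `dtypes.vector` are the same calls used above, the concrete types are illustrative:

import dace
from dace import dtypes

res_type = dtypes.result_type_of(dace.int32, dace.float32)  # expected to promote to float32
vec_type = dtypes.vector(res_type, 4)  # wrap with the test's vector length
print(vec_type.veclen, vec_type.base_type)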
Example 3
def _BoolOp(t, symbols, inferred_symbols):
    # If any vector occurs in the bool op, the inferred type is also a bool vector
    vec_len = None
    for v in t.values:
        inf_type = _dispatch(v, symbols, inferred_symbols)
        if isinstance(inf_type, dtypes.vector):
            # Make sure all occurring vectors are of the same size
            if vec_len is not None and vec_len != inf_type.veclen:
                raise SyntaxError('Inconsistent vector lengths in BoolOp')
            vec_len = inf_type.veclen
    return dtypes.vector(dace.bool, vec_len) if vec_len is not None else dtypes.bool
Example 4
    def generate_case_predicate(self, t: ast.If, acc_pred: str, id: int) -> str:
        test_pred = f'__pg_test_{self.if_depth}_{id}'

        # Compute the test predicate for the current case
        self.fill(f'svbool_t {test_pred} = ')
        self.pred_name = acc_pred
        self.dispatch_expect(t.test, dtypes.vector(dace.bool, -1))
        self.write(';')

        # Update the accumulator to exclude this test (the next case applies only if this test failed)
        # BIC(A, B) = A AND NOT B
        self.fill(f'{acc_pred} = svbic_z({acc_pred}, {acc_pred}, {test_pred});')

        return test_pred
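The accumulator update is plain predicate algebra; a pure-Python stand-in (illustrative only) of BIC(A, B) = A AND NOT B:

acc = [True, True, True, True]      # lanes still searching for a matching case
test = [True, False, True, False]   # lanes taken by the current case
acc = [a and not t for a, t in zip(acc, test)]  # svbic_z: A AND NOT B
print(acc)  # [False, True, False, True]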
Example 5
def _Compare(t, symbols, inferred_symbols):
    # If any vector occurs in the comparison, the inferred type is a bool vector
    inf_type = _dispatch(t.left, symbols, inferred_symbols)
    vec_len = None
    if isinstance(inf_type, dtypes.vector):
        vec_len = inf_type.veclen
    for o, e in zip(t.ops, t.comparators):
        if o.__class__.__name__ not in cppunparse.CPPUnparser.cmpops:
            continue
        inf_type = _dispatch(e, symbols, inferred_symbols)
        if isinstance(inf_type, dtypes.vector):
            # Make sure all occurring vectors are of the same size
            if vec_len is not None and vec_len != inf_type.veclen:
                raise SyntaxError('Inconsistent vector lengths in Compare')
            vec_len = inf_type.veclen
    return dtypes.vector(dace.bool, vec_len) if vec_len is not None else dtypes.bool
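Both _BoolOp and _Compare implement the same rule: any vector operand turns the result into a bool vector of that length. A NumPy stand-in (assumption, for illustration):

import numpy as np

a = np.array([1, 2, 3, 4])
print(a > 2)              # comparison with a vector operand -> bool vector
print((a > 1) & (a < 4))  # bool op over bool vectors keeps the length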
Example 6
    def _as_type(self, dtype: dace.typeclass, inf_type: int) -> dace.typeclass:
        """
        Turns a typeclass into a scalar or vector.
        """
        if isinstance(dtype, dtypes.pointer):
            raise ValueError('Pointer was provided')
        elif isinstance(dtype, dtypes.vector):
            if inf_type == InferenceNode.Vector:
                return dtype
            else:
                raise VectorInferenceException('Cannot make vector into scalar')
        else:
            if inf_type == InferenceNode.Vector:
                return dtypes.vector(dtype, self.vec_len)
            else:
                return dtype
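The four cases above, restated as a standalone sketch with stand-in types (names here are illustrative, not the DaCe API):

def as_type(kind, requested_vector, vec_len=4):
    # kind is one of 'pointer', 'vector', 'scalar'
    if kind == 'pointer':
        raise ValueError('Pointer was provided')
    if kind == 'vector':
        if requested_vector:
            return 'vector'                     # vectors pass through unchanged
        raise RuntimeError('Cannot make vector into scalar')
    return ('vector', vec_len) if requested_vector else 'scalar'

print(as_type('scalar', True))   # ('vector', 4): scalars may be widened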
Example 7
    def push_to_stream(self, t, target):
        target_stream = self.stream_associations[target.id]
        stream_type = target_stream[1]

        self.enter()
        self.fill('\n// === Stream push ===')

        # Casting in case of `long long`
        stream_type = copy.copy(stream_type)
        if stream_type.type == np.int64:
            stream_type.ctype = 'int64_t'
        elif stream_type.type == np.uint64:
            stream_type.ctype = 'uint64_t'

        # Create a temporary array on the stack, into which we will copy the SVE register contents
        self.fill('{} __tmp[{} / {}];'.format(stream_type,
                                              util.REGISTER_BYTE_SIZE,
                                              stream_type.bytes))

        # Count the number of "to push" elements based on the current predicate
        self.fill('size_t __cnt = svcntp_b{}({}, {});'.format(
            self.pred_bits, self.pred_name, self.pred_name))

        # Store the contents of the SVE register in the temporary array
        self.fill(
            f'svst1(svwhilelt_b{self.pred_bits}(0, ({self.counter_type}) __cnt), __tmp, '
        )

        # The contents are compacted first (i.e. all elements where the predicate is true are packed to the front, contiguously)
        self.write(f'svcompact({self.pred_name}, ')
        self.dispatch_expect(t.value, dtypes.vector(stream_type, -1))
        self.write('));')

        ptr_cast = ''
        # Special casting for int64_t back to `long long`
        if stream_type.type == np.int64:
            ptr_cast = '(long long*) '
        elif stream_type.type == np.uint64:
            ptr_cast = '(unsigned long long*) '

        # Push the temporary array onto the stream using DaCe's push
        self.fill(f'{target_stream[0]}.push({ptr_cast}&__tmp[0], __cnt);')
        self.leave()
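The count-compact-store sequence maps cleanly onto NumPy boolean indexing; a stand-in sketch (assumes NumPy, values illustrative):

import numpy as np

pred = np.array([True, False, True, True])  # current loop predicate
reg = np.array([1, 2, 3, 4])                # SVE register contents
cnt = int(pred.sum())                       # svcntp: count of active lanes
tmp = reg[pred]                             # svcompact + svst1: pack active elements
print(cnt, tmp)                             # 3 [1 3 4] -> stream.push(tmp, cnt)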
Example 8
def _Subscript(t, symbols, inferred_symbols):
    value_type = _dispatch(t.value, symbols, inferred_symbols)
    slice_type = _dispatch(t.slice, symbols, inferred_symbols)

    if isinstance(slice_type, dtypes.pointer):
        raise SyntaxError('Invalid syntax (pointer given as slice)')

    # A slice as subscript (e.g. [0:N]) returns a pointer
    if isinstance(t.slice, ast.Slice):
        return value_type

    # A vector as subscript of a pointer returns a vector of the base type
    if isinstance(value_type, dtypes.pointer) and isinstance(slice_type, dtypes.vector):
        if not np.issubdtype(slice_type.type, np.integer):
            raise SyntaxError('Subscript must be some integer type')
        return dtypes.vector(value_type.base_type, slice_type.veclen)

    # Otherwise (some index as subscript) we return the base type
    if isinstance(value_type, dtypes.typeclass):
        return value_type.base_type

    return value_type
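The pointer-with-vector-subscript case corresponds to a gather; a NumPy stand-in (assumption, for illustration):

import numpy as np

data = np.array([10.0, 20.0, 30.0, 40.0])  # stands in for a pointer to float data
idx = np.array([3, 0, 2, 1])               # integer index vector
print(data[idx])                           # gather -> vector of the base type: [40. 10. 30. 20.]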
Example 9
File: dot.py Project: mfkiwl/dace
    def expansion(node, parent_state, parent_sdfg, n=None, **kwargs):
        """
        :param node: The node to expand.
        :param parent_state: The state that the node is in.
        :param parent_sdfg: The SDFG that the node is in.
        :param n: Override the vector dimension. If this is not set, the value
                  specified in the node is used.
        """

        (desc_x, stride_x), (desc_y, stride_y), desc_res, sz = node.validate(
            parent_sdfg, parent_state)

        n = n or node.n or sz

        sdfg = dace.SDFG("dot")

        state = sdfg.add_state("dot")

        dtype = desc_x.dtype.base_type
        veclen = desc_x.veclen
        vtype = dtypes.vector(dtype, veclen)

        desc_x = desc_x.clone()
        desc_x.transient = False
        desc_y = desc_y.clone()
        desc_y.transient = False
        desc_res = desc_res.clone()
        desc_res.transient = False
        sdfg.add_datadesc("_x", desc_x)
        sdfg.add_datadesc("_y", desc_y)
        sdfg.add_datadesc("_result", desc_res)

        x_read = state.add_read("_x")
        y_read = state.add_read("_y")
        res_write = state.add_write("_result")

        input_x_name = "input_x"
        sdfg.add_array(input_x_name, (1, ),
                       vtype,
                       transient=True,
                       storage=dtypes.StorageType.FPGA_Local)
        input_x_access = state.add_access(input_x_name)

        input_y_name = "input_y"
        sdfg.add_array(input_y_name, (1, ),
                       vtype,
                       transient=True,
                       storage=dtypes.StorageType.FPGA_Local)
        input_y_access = state.add_access(input_y_name)

        entry, exit = state.add_map("stream", {"_i_dot": f"0:{n}/{veclen}"},
                                    schedule=dtypes.ScheduleType.FPGA_Device)

        index_x = "0" if isinstance(desc_x, dt.Stream) else "_i_dot"
        index_y = "0" if isinstance(desc_y, dt.Stream) else "_i_dot"

        state.add_memlet_path(x_read,
                              entry,
                              input_x_access,
                              memlet=dace.Memlet(f"{x_read.data}[{index_x}]",
                                                 other_subset="0",
                                                 dynamic=False))
        state.add_memlet_path(y_read,
                              entry,
                              input_y_access,
                              memlet=dace.Memlet(f"{y_read.data}[{index_y}]",
                                                 other_subset="0",
                                                 dynamic=False))

        tasklet = state.add_tasklet("multiply", {"__x", "__y"},
                                    {f"_product": vtype},
                                    f"_product = __x * __y")

        state.add_memlet_path(input_x_access,
                              tasklet,
                              dst_conn="__x",
                              memlet=dace.Memlet(f"{input_x_name}[0]"))
        state.add_memlet_path(input_y_access,
                              tasklet,
                              dst_conn="__y",
                              memlet=dace.Memlet(f"{input_y_name}[0]"))

        product_name = "product"
        sdfg.add_array(product_name, (veclen, ),
                       dtype,
                       transient=True,
                       storage=dtypes.StorageType.FPGA_Local)
        product_access = state.add_access(product_name)

        state.add_memlet_path(
            tasklet,
            product_access,
            src_conn="_product",
            memlet=dace.Memlet(f"{product_name}[0:{veclen}]"))

        collapse_name = "reduce_vector"
        sdfg.add_array(collapse_name, (1, ),
                       dtype,
                       transient=True,
                       storage=dtypes.StorageType.FPGA_Local)
        collapse_read = state.add_read(collapse_name)
        collapse_access = state.add_access(collapse_name)

        unroll_entry, unroll_exit = state.add_map(
            "unroll", {"_j_dot": f"0:{veclen}"},
            unroll=True,
            schedule=dtypes.ScheduleType.FPGA_Device)

        collapse_tasklet = state.add_tasklet(
            "reduce_vector", {"val_in", "reduce_in"}, {"reduce_out"}, """\
prev = reduce_in if _j_dot > 0 else 0
reduce_out = prev + val_in""")

        state.add_memlet_path(collapse_read,
                              unroll_entry,
                              collapse_tasklet,
                              dst_conn="reduce_in",
                              memlet=dace.Memlet(f"{collapse_name}[0]"))
        state.add_memlet_path(entry, collapse_read, memlet=dace.Memlet())
        state.add_memlet_path(collapse_tasklet,
                              unroll_exit,
                              collapse_access,
                              src_conn="reduce_out",
                              memlet=dace.Memlet(f"{collapse_name}[0]"))
        state.add_memlet_path(product_access,
                              unroll_entry,
                              collapse_tasklet,
                              dst_conn="val_in",
                              memlet=dace.Memlet(f"{product_name}[_j_dot]"))

        buffer_name = "reduce_buffer"
        sdfg.add_array(buffer_name, (1, ),
                       dtype,
                       transient=True,
                       storage=dtypes.StorageType.FPGA_Local)
        buffer_read = state.add_read(buffer_name)
        buffer_write = state.add_access(buffer_name)

        zero_tasklet = state.add_tasklet("zero", {}, {"buffer"}, "buffer = 0")
        state.add_memlet_path(zero_tasklet,
                              buffer_read,
                              src_conn="buffer",
                              memlet=dace.Memlet(f"{buffer_name}[0]"))

        reduce_tasklet = state.add_tasklet(
            "sum", {"buffer_in", "result_in"}, {"buffer_out"}, """\
prev = buffer_in if _i_dot > 0 else 0
buffer_out = prev + result_in""")

        state.add_memlet_path(collapse_access,
                              reduce_tasklet,
                              dst_conn="result_in",
                              memlet=dace.Memlet(f"{collapse_access.data}[0]"))
        state.add_memlet_path(buffer_read,
                              entry,
                              reduce_tasklet,
                              dst_conn="buffer_in",
                              memlet=dace.Memlet(f"{buffer_name}[0]"))
        state.add_memlet_path(reduce_tasklet,
                              exit,
                              buffer_write,
                              src_conn=f"buffer_out",
                              memlet=dace.Memlet(f"{buffer_name}[0]"))

        state.add_memlet_path(buffer_write,
                              res_write,
                              memlet=dace.Memlet(f"{buffer_name}[0]",
                                                 other_subset="0"))

        return sdfg
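What the expansion computes, condensed into plain NumPy (illustrative; the loop body mirrors the "multiply", "reduce_vector" and "sum" tasklets):

import numpy as np

n, veclen = 8, 4
x = np.arange(n, dtype=np.float32)
y = np.ones(n, dtype=np.float32)
buffer = np.float32(0)
for _i_dot in range(n // veclen):
    product = x[_i_dot*veclen:(_i_dot+1)*veclen] * y[_i_dot*veclen:(_i_dot+1)*veclen]
    buffer += product.sum()       # unrolled vector reduction, then running total
print(buffer, np.dot(x, y))       # both 28.0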
Example 10
def get_internal_symbols() -> dict:
    """
    Generates all internal symbols by crossing the internal function names with all possible type suffixes.
    Then defines the symbol with the corresponding return type (based on the suffix).
    """
    res = {}

    for func, type in itertools.product(FUSED_OPERATION_TO_SVE,
                                        TYPE_TO_SVE_SUFFIX):
        base = type if isinstance(type, dtypes.typeclass) else dtypes.typeclass(type)
        suffix = TYPE_TO_SVE_SUFFIX[type.type if isinstance(type, dace.dtypes.typeclass) else type]
        res[f'{func}_{suffix}'] = dtypes.vector(base, SVE_LEN)
    return res
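A toy version of the crossing, with stand-in tables (the entries of FUSED_OPERATION_TO_SVE and TYPE_TO_SVE_SUFFIX below are assumptions, only the naming scheme is from the source):

import itertools

FUSED_OPERATION_TO_SVE = {'__svmad': 'svmad'}            # illustrative entry
TYPE_TO_SVE_SUFFIX = {'float32': 'f32', 'int32': 's32'}  # illustrative entries
for func, t in itertools.product(FUSED_OPERATION_TO_SVE, TYPE_TO_SVE_SUFFIX):
    print(f'{func}_{TYPE_TO_SVE_SUFFIX[t]}')  # __svmad_f32, __svmad_s32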
Example 11
    def apply(self, sdfg: SDFG):
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]]
        tasklet: nodes.Tasklet = graph.successors(map_entry)[0]
        param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])

        # Create new vector size.
        vector_size = self.vector_len
        dim_from, dim_to, dim_skip = map_entry.map.range[-1]

        # Determine whether to create preamble or postamble maps
        if self.preamble is not None:
            create_preamble = self.preamble
        else:
            create_preamble = not ((dim_from % vector_size == 0) == True
                                   or dim_from == 0)
        if self.postamble is not None:
            create_postamble = self.postamble
        else:
            if isinstance(dim_to, symbolic.SymExpr):
                create_postamble = (((dim_to.approx + 1) %
                                     vector_size == 0) == False)
            else:
                create_postamble = (((dim_to + 1) % vector_size == 0) == False)

        # Determine new range for vectorized map
        if self.strided_map:
            new_range = [dim_from, dim_to - vector_size + 1, vector_size]
        else:
            new_range = [
                dim_from // vector_size, ((dim_to + 1) // vector_size) - 1,
                dim_skip
            ]

        # Create preamble non-vectorized map (replacing the original map)
        if create_preamble:
            old_scope = graph.scope_subgraph(map_entry, True, True)
            new_scope: ScopeSubgraphView = replicate_scope(
                sdfg, graph, old_scope)
            new_begin = dim_from + (vector_size - (dim_from % vector_size))
            map_entry.map.range[-1] = (dim_from, new_begin - 1, dim_skip)
            # Replace map_entry with the replicated scope (so that the preamble
            # will usually come first in topological sort)
            map_entry = new_scope.entry
            tasklet = new_scope.nodes()[old_scope.nodes().index(tasklet)]
            new_range[0] = new_begin

        # Create postamble non-vectorized map
        if create_postamble:
            new_scope: ScopeSubgraphView = replicate_scope(
                sdfg, graph, graph.scope_subgraph(map_entry, True, True))
            dim_to_ex = dim_to + 1
            new_scope.entry.map.range[-1] = (dim_to_ex -
                                             (dim_to_ex % vector_size), dim_to,
                                             dim_skip)

        # Change the step of the inner-most dimension.
        map_entry.map.range[-1] = tuple(new_range)

        # Vectorize connectors adjacent to the tasklet.
        for edge in graph.all_edges(tasklet):
            connectors = (tasklet.in_connectors
                          if edge.dst == tasklet else tasklet.out_connectors)
            conn = edge.dst_conn if edge.dst == tasklet else edge.src_conn

            if edge.data.data is None:  # Empty memlets
                continue
            desc = sdfg.arrays[edge.data.data]
            contigidx = desc.strides.index(1)

            newlist = []

            lastindex = edge.data.subset[contigidx]
            if isinstance(lastindex, tuple):
                newlist = [(rb, re, rs) for rb, re, rs in edge.data.subset]
                symbols = set()
                for indd in lastindex:
                    symbols.update(
                        symbolic.pystr_to_symbolic(indd).free_symbols)
            else:
                newlist = [(rb, rb, 1) for rb in edge.data.subset]
                symbols = symbolic.pystr_to_symbolic(lastindex).free_symbols

            oldtype = connectors[conn]
            if oldtype is None or oldtype.type is None:
                oldtype = desc.dtype

            # Vector to scalar WCR edge: change connector and continue
            lastedge = graph.memlet_path(edge)[-1]
            if (lastedge.data.subset.num_elements() == 1
                    and edge.data.wcr is not None):
                connectors[conn] = dtypes.vector(oldtype, vector_size)
                continue

            if str(param) not in map(str, symbols):
                continue

            # Vectorize connector, if not already vectorized
            if isinstance(oldtype, dtypes.vector):
                continue

            connectors[conn] = dtypes.vector(oldtype, vector_size)

            # Modify memlet subset to match vector length
            if self.strided_map:
                rb = newlist[contigidx][0]
                if self.propagate_parent:
                    newlist[contigidx] = (rb / self.vector_len,
                                          rb / self.vector_len, 1)
                else:
                    newlist[contigidx] = (rb, rb + self.vector_len - 1, 1)
            else:
                rb = newlist[contigidx][0]
                if self.propagate_parent:
                    newlist[contigidx] = (rb, rb, 1)
                else:
                    newlist[contigidx] = (self.vector_len * rb,
                                          self.vector_len * rb +
                                          self.vector_len - 1, 1)
            edge.data.subset = subsets.Range(newlist)
            edge.data.volume = vector_size

        # Vector length propagation using data descriptors, recursive traversal
        # outwards
        if self.propagate_parent:
            for edge in graph.all_edges(tasklet):
                cursdfg = sdfg
                curedge = edge
                while cursdfg is not None:
                    arrname = curedge.data.data
                    dtype = cursdfg.arrays[arrname].dtype

                    # Change type and shape to vector
                    if not isinstance(dtype, dtypes.vector):
                        cursdfg.arrays[arrname].dtype = dtypes.vector(
                            dtype, vector_size)
                        new_shape = list(cursdfg.arrays[arrname].shape)
                        contigidx = cursdfg.arrays[arrname].strides.index(1)
                        new_shape[contigidx] /= vector_size
                        try:
                            new_shape[contigidx] = int(new_shape[contigidx])
                        except TypeError:
                            pass
                        cursdfg.arrays[arrname].shape = new_shape

                    propagation.propagate_memlets_sdfg(cursdfg)

                    # Find matching edge in parent
                    nsdfg = cursdfg.parent_nsdfg_node
                    if nsdfg is None:
                        break
                    tstate = cursdfg.parent
                    curedge = ([
                        e
                        for e in tstate.in_edges(nsdfg) if e.dst_conn == arrname
                    ] + [
                        e for e in tstate.out_edges(nsdfg)
                        if e.src_conn == arrname
                    ])[0]
                    cursdfg = cursdfg.parent_sdfg
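The preamble/postamble range arithmetic in isolation, for the strided-map case; the concrete numbers are assumptions for illustration:

dim_from, dim_to, vector_size = 3, 17, 4
new_begin = dim_from + (vector_size - dim_from % vector_size)
preamble = (dim_from, new_begin - 1)                       # scalar lanes 3..3
dim_to_ex = dim_to + 1
postamble = (dim_to_ex - dim_to_ex % vector_size, dim_to)  # scalar lanes 16..17
vector_range = (new_begin, dim_to - vector_size + 1, vector_size)  # 4, 8, 12 covering 4..15
print(preamble, vector_range, postamble)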
Example 12
    def vector_reduction_expr(self, edge, dtype, rhs):
        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        nc = not is_write_conflicted(self.dfg, edge)
        if not nc or not isinstance(edge.src.out_connectors[edge.src_conn],
                                    (dtypes.pointer, dtypes.vector)):
            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality
            dst_node = self.dfg.memlet_path(edge)[-1].dst
            if (isinstance(dst_node, nodes.AccessNode) and dst_node.desc(
                    self.sdfg).storage == dtypes.StorageType.SVE_Register):
                return

            wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg,
                                                          edge.data,
                                                          not nc,
                                                          None,
                                                          '@',
                                                          dtype=dtype)
            self.fill(wcr[:wcr.find('@')])
            self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type])
            self.write('(')
            self.write(self.pred_name)
            self.write(', ')
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write(')')
            self.write(wcr[wcr.find('@') + 1:])
            self.write(';')
        else:
            ######################
            # Horizontal non-atomic reduction

            stride = edge.data.get_stride(self.sdfg, self.map)

            # long long fix
            ptr_cast = ''
            src_type = edge.src.out_connectors[edge.src_conn]

            if src_type.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif src_type.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            store_args = '{}, {}'.format(
                self.pred_name,
                ptr_cast +
                cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
            )

            red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'
            if stride == 1:
                self.write(
                    f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
            else:
                store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
                self.write(
                    f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
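The two-step WCR in NumPy terms (a stand-in sketch; an svaddv-style sum reduction is assumed):

import numpy as np

pred = np.array([True, True, False, True])
reg = np.array([1.0, 2.0, 3.0, 4.0])        # SVE register holding the rhs
partial = reg[pred].sum()                   # step 1: predicated reduction to a scalar
mem = 10.0
mem += partial                              # step 2: scalar WCR to memory -> 17.0
print(mem)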
Example 13
    def apply(self, state: SDFGState, sdfg: SDFG) -> nodes.AccessNode:
        dnode: nodes.AccessNode = self.access
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern
        components_to_create: Dict[
            Tuple[symbolic.SymbolicType],
            List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                mpath = state.memlet_path(edge)
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                               0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    if any(
                            nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                            for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [
                        c for c in component if c not in edges_to_remove
                    ]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:

            if self.use_memory_buffering:

                arrname = str(self.access)

                # Add gearbox
                total_size = edge.data.volume
                vector_size = int(self.memory_buffering_target_bytes /
                                  desc.dtype.bytes)

                if not is_int(sdfg.arrays[dnode.data].shape[-1]):
                    warnings.warn(
                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                        .format(sym=sdfg.arrays[dnode.data].shape[-1],
                                vec=vector_size))

                for i in sdfg.arrays[dnode.data].strides:
                    if not is_int(i):
                        warnings.warn(
                            "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                            .format(sym=i, vec=vector_size))

                if self.expr_index == 0:  # Read
                    edges = state.out_edges(dnode)
                    gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                    gearbox_output_type = desc.dtype
                    gearbox_read_volume = total_size / vector_size
                    gearbox_write_volume = total_size
                else:  # Write
                    edges = state.in_edges(dnode)
                    gearbox_input_type = desc.dtype
                    gearbox_output_type = dtypes.vector(
                        desc.dtype, vector_size)
                    gearbox_read_volume = total_size
                    gearbox_write_volume = total_size / vector_size

                input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_input",
                    gearbox_input_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_output",
                    gearbox_output_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                read_to_gearbox = state.add_read(input_gearbox_name)
                write_from_gearbox = state.add_write(output_gearbox_name)

                gearbox = Gearbox(total_size / vector_size)

                state.add_node(gearbox)

                state.add_memlet_path(read_to_gearbox,
                                      gearbox,
                                      dst_conn="from_memory",
                                      memlet=Memlet(
                                          input_gearbox_name + "[0]",
                                          volume=gearbox_read_volume))
                state.add_memlet_path(gearbox,
                                      write_from_gearbox,
                                      src_conn="to_kernel",
                                      memlet=Memlet(
                                          output_gearbox_name + "[0]",
                                          volume=gearbox_write_volume))

                if self.expr_index == 0:
                    streams[edge] = input_gearbox_name
                    name = output_gearbox_name
                    newdesc = output_gearbox_newdesc
                else:
                    streams[edge] = output_gearbox_name
                    name = input_gearbox_name
                    newdesc = input_gearbox_newdesc

            else:
                # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
                stream_name = "stream_" + dnode.data
                name, newdesc = sdfg.add_stream(stream_name,
                                                desc.dtype,
                                                buffer_size=self.buffer_size,
                                                storage=self.storage,
                                                transient=True,
                                                find_new_name=True)
                streams[edge] = name

                # Assign these so that output_gearbox_name and input_gearbox_name can be used below without branching
                output_gearbox_name = name
                input_gearbox_name = name

            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            for e in mpath:
                e.data = mm.Memlet(data=name,
                                   subset='0',
                                   other_subset=e.data.other_subset)
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(output_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst,
                               edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(input_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement,
                               edge.dst_conn, edge.data)

        if self.use_memory_buffering:

            arrname = str(self.access)
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

            # Vectorize access to global array.
            dtype = sdfg.arrays[arrname].dtype
            sdfg.arrays[arrname].dtype = dtypes.vector(dtype, vector_size)
            new_shape = list(sdfg.arrays[arrname].shape)
            contigidx = sdfg.arrays[arrname].strides.index(1)
            new_shape[contigidx] /= vector_size
            try:
                new_shape[contigidx] = int(new_shape[contigidx])
            except TypeError:
                pass
            sdfg.arrays[arrname].shape = new_shape

            # Change strides
            new_strides: List = list(sdfg.arrays[arrname].strides)

            for i in range(len(new_strides)):
                # Skip the last dimension, since its stride is always 1
                if i == len(new_strides) - 1:
                    continue
                new_strides[i] = new_strides[i] / vector_size
            sdfg.arrays[arrname].strides = new_strides

            post_state = get_post_state(sdfg, state)

            if post_state is not None:
                # Change subset in the post state such that the correct amount of memory is copied back from the device
                for e in post_state.edges():
                    if e.data.data == self.access.data:
                        new_subset = list(e.data.subset)
                        i, j, k = new_subset[-1]
                        new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                        e.data = mm.Memlet(data=str(e.src),
                                           subset=subsets.Range(new_subset))

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append(
                        (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '\n'.join('__out%d = __inp' % i
                                 for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(
                        f'More than one input found for the same index for {dnode.data}'
                    )
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append(
                        (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            maps = []
            for entry in path:
                map: nodes.Map = entry.map

                ranges = [(p, (r[0], r[1], r[2]))
                          for p, r in zip(map.params, map.range)]

                # Change ranges of map
                if self.use_memory_buffering:
                    # Find edges from/to map

                    edge_subset = [
                        a_tuple[0]
                        for a_tuple in list(innermost_edge.data.subset)
                    ]

                    # Change range of map
                    if isinstance(edge_subset[-1], symbol) and str(
                            edge_subset[-1]) == map.params[-1]:

                        if not is_int(ranges[-1][1][1]):

                            warnings.warn(
                                "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                .format(sym=ranges[-1][1][1].args[1],
                                        vec=vector_size))

                        ranges[-1] = (ranges[-1][0],
                                      (ranges[-1][1][0],
                                       (ranges[-1][1][1] + 1) / vector_size -
                                       1, ranges[-1][1][2]))

                    elif isinstance(edge_subset[-1], sympy.core.add.Add):

                        for arg in edge_subset[-1].args:
                            if isinstance(
                                    arg,
                                    symbol) and str(arg) == map.params[-1]:

                                if not is_int(ranges[-1][1][1]):
                                    warnings.warn(
                                        "Using the MemoryBuffering transformation is potential unsafe since {sym} is not an integer. There should be no issue if {sym} % {vec} == 0"
                                        .format(sym=ranges[-1][1][1].args[1],
                                                vec=vector_size))

                                ranges[-1] = (ranges[-1][0], (
                                    ranges[-1][1][0],
                                    (ranges[-1][1][1] + 1) / vector_size - 1,
                                    ranges[-1][1][2]))

                maps.append(
                    state.add_map(f'__s{opname}_{mapname}', ranges,
                                  map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node,
                                      *(me for me, _ in maps),
                                      tasklet,
                                      dst_conn=cname,
                                      memlet=memlet)
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet,
                                      *(mx for _, mx in reversed(maps)),
                                      node,
                                      src_conn=cname,
                                      memlet=memlet)

        return ionodes
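The shape and stride rewrite used by the memory-buffering path, in isolation (the concrete values are assumptions for illustration):

vector_size = 4
shape, strides = [8, 64], [64, 1]             # contiguous last dimension (stride 1)
shape[-1] //= vector_size                     # widen the element type -> [8, 16]
strides = [s // vector_size if s != 1 else s for s in strides]  # outer strides shrink -> [16, 1]
print(shape, strides)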