Exemple #1
0
class BlockGather(MPINode):
    """
    MPI library node with a single input connector (``_inp_buffer``) and a
    single output connector (``_out_buffer``). Its only registered
    implementation is ``ExpandBlockGatherMPI``.
    """

    # Global properties
    implementations = {"MPI": ExpandBlockGatherMPI}
    default_implementation = "MPI"

    # String-valued settings of the node; ``reduce_grid`` may be None
    subarray_type = properties.Property(dtype=str, default='tmp')
    gather_grid = properties.Property(dtype=str, default='tmp')
    reduce_grid = properties.Property(dtype=str, allow_none=True, default=None)

    def __init__(self, name, subarray_type='tmp', gather_grid='tmp', reduce_grid=None, *args, **kwargs):
        """
        Creates the node with its fixed connectors and stores the three
        string properties.

        :param name: Name of the node.
        :param subarray_type: Value for the ``subarray_type`` property.
        :param gather_grid: Value for the ``gather_grid`` property.
        :param reduce_grid: Value for the ``reduce_grid`` property (or None).
        """
        super().__init__(
            name,
            *args,
            inputs={"_inp_buffer"},
            outputs={"_out_buffer"},
            **kwargs,
        )
        self.subarray_type = subarray_type
        self.gather_grid = gather_grid
        self.reduce_grid = reduce_grid

    def validate(self, sdfg, state):
        """
        Looks up the data descriptors attached to the node's connectors.

        :param sdfg: The SDFG whose ``arrays`` hold the descriptors.
        :param state: The state that contains this node.
        :return: A two-tuple (inbuffer, outbuffer) of the data descriptors
                 in the parent SDFG. An entry is None if no edge is attached
                 to the corresponding connector.
        """
        inp_buffer = None
        out_buffer = None

        # If several edges use the same connector, the last one wins
        for edge in state.out_edges(self):
            if edge.src_conn != "_out_buffer":
                continue
            out_buffer = sdfg.arrays[edge.data.data]

        for edge in state.in_edges(self):
            if edge.dst_conn != "_inp_buffer":
                continue
            inp_buffer = sdfg.arrays[edge.data.data]

        return inp_buffer, out_buffer
Exemple #2
0
class Redistribute(MPINode):
    """
    MPI library node with a single input connector (``_inp_buffer``) and a
    single output connector (``_out_buffer``). Its only registered
    implementation is ``ExpandRedistribute``.
    """

    # Global properties
    implementations = {"MPI": ExpandRedistribute}
    default_implementation = "MPI"

    # String-valued setting of the node
    redistr = properties.Property(dtype=str, default='tmp')

    def __init__(self, name, redistr='tmp', *args, **kwargs):
        """
        Creates the node with its fixed connectors and stores ``redistr``.

        :param name: Name of the node.
        :param redistr: Value for the ``redistr`` property.
        """
        super().__init__(name, *args, inputs={"_inp_buffer"}, outputs={"_out_buffer"}, **kwargs)
        self.redistr = redistr

    def validate(self, sdfg, state):
        """
        Looks up the data descriptors attached to the node's connectors.

        :param sdfg: The SDFG whose ``arrays`` hold the descriptors.
        :param state: The state that contains this node.
        :return: A two-tuple (inbuffer, outbuffer) of the data descriptors
                 in the parent SDFG. An entry is None if no edge is attached
                 to the corresponding connector.
        """
        inp_buffer = None
        out_buffer = None

        # If several edges use the same connector, the last one wins
        for edge in state.out_edges(self):
            if edge.src_conn != "_out_buffer":
                continue
            out_buffer = sdfg.arrays[edge.data.data]

        for edge in state.in_edges(self):
            if edge.dst_conn != "_inp_buffer":
                continue
            inp_buffer = sdfg.arrays[edge.data.data]

        return inp_buffer, out_buffer
class FPGATransformSDFG(transformation.Transformation):
    """ Implements the FPGATransformSDFG transformation, which takes an entire
        SDFG and transforms it into an FPGA-capable SDFG.

        The transformation first nests the SDFG (``NestSDFG``) and then
        applies ``FPGATransformState`` to state 0 of the result.
    """

    promote_global_trans = properties.Property(
        dtype=bool,
        default=True,
        desc="If True, transient arrays that are fully internal are pulled out so "
        "that they can be allocated on the host.")

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @staticmethod
    def expressions():
        """ Returns a single empty pattern graph, i.e., matches anything. """
        return [nx.DiGraph()]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        """
        The match is valid only if ``FPGATransformState`` can be applied to
        every state of ``sdfg``.

        :param graph: The matched graph (not used by the check).
        :param candidate: The incoming candidate (not used; a fresh candidate
                          is built per state).
        :param expr_index: Forwarded to ``FPGATransformState.can_be_applied``.
        :param sdfg: The SDFG whose states are checked.
        :param permissive: Accepted for interface compatibility; not
                           forwarded.
        :return: True if every state passes, False otherwise.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState

        return all(
            FPGATransformState.can_be_applied(
                sdfg, {FPGATransformState._state: index}, expr_index, sdfg)
            for index, _ in enumerate(sdfg.nodes()))

    @staticmethod
    def match_to_str(graph, candidate):
        """ A match is displayed using the label of the matched graph. """
        return graph.label

    def apply(self, sdfg):
        """
        Nests ``sdfg`` and then runs ``FPGATransformState`` on state 0.

        :param sdfg: The SDFG to transform.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState
        from dace.transformation.interstate import NestSDFG

        target_id = sdfg.sdfg_id

        nest_xform = NestSDFG(target_id, -1, {}, self.expr_index)
        nest_xform.promote_global_trans = self.promote_global_trans
        nest_xform.apply(sdfg)

        state_xform = FPGATransformState(target_id, -1,
                                         {FPGATransformState._state: 0},
                                         self.expr_index)
        state_xform.apply(sdfg)
Exemple #4
0
class FPGATransformSDFG(transformation.MultiStateTransformation):
    """ Implements the FPGATransformSDFG transformation, which takes an entire
        SDFG and transforms it into an FPGA-capable SDFG.

        The transformation first nests the SDFG (``NestSDFG``) and then
        applies ``FPGATransformState`` to state 0 of the result.
    """

    promote_global_trans = properties.Property(
        dtype=bool,
        default=True,
        desc="If True, transient arrays that are fully internal are pulled out so "
        "that they can be allocated on the host.")

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @classmethod
    def expressions(cls):
        """ Returns a single empty pattern graph, i.e., matches anything. """
        return [nx.DiGraph()]

    def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
        """
        The match is valid only if ``FPGATransformState`` can be applied to
        every state of ``sdfg``.

        :param graph: The matched graph; its ``sdfg_id`` is used to set up
                      the per-state match.
        :param expr_index: Forwarded to ``FPGATransformState.can_be_applied``.
        :param sdfg: The SDFG whose states are checked.
        :param permissive: Accepted for interface compatibility; not
                           forwarded.
        :return: True if every state passes, False otherwise.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState

        def state_is_transformable(index):
            # One fresh transformation object per state
            probe = FPGATransformState()
            probe.setup_match(sdfg, graph.sdfg_id, -1,
                              {FPGATransformState.state: index}, 0)
            return probe.can_be_applied(sdfg, expr_index, sdfg)

        return all(
            state_is_transformable(index)
            for index, _ in enumerate(sdfg.nodes()))

    def apply(self, _, sdfg):
        """
        Nests ``sdfg`` and then runs ``FPGATransformState`` on state 0.

        :param _: Ignored.
        :param sdfg: The SDFG to transform.
        """
        # Imported locally to avoid import loops
        from dace.transformation.interstate import FPGATransformState
        from dace.transformation.interstate import NestSDFG

        target_id = sdfg.sdfg_id

        nest_xform = NestSDFG()
        nest_xform.setup_match(sdfg, target_id, -1, {}, self.expr_index)
        nest_xform.promote_global_trans = self.promote_global_trans
        nest_xform.apply(sdfg, sdfg)

        # After NestSDFG the new SDFG has a single state, hence state ID zero
        state_xform = FPGATransformState()
        state_xform.setup_match(sdfg, target_id, -1,
                                {FPGATransformState.state: 0},
                                self.expr_index)
        state_xform.apply(sdfg, sdfg)
Exemple #5
0
class Gemm(dace.sdfg.nodes.LibraryNode):
    """Executes alpha * (A @ B) + beta * C. C should be unidirectionally
       broadcastable (ONNX terminology) to A @ B.
    """

    # Global properties
    implementations = {
        "pure": ExpandGemmPure,
        "MKL": ExpandGemmMKL,
        "OpenBLAS": ExpandGemmOpenBLAS,
        "cuBLAS": ExpandGemmCuBLAS,
        "PBLAS": ExpandGemmPBLAS,
        "FPGA1DSystolic": ExpandGemmFPGA1DSystolic
    }
    default_implementation = None

    # Object fields
    transA = properties.Property(
        dtype=bool, desc="Whether to transpose A before multiplying")
    transB = properties.Property(
        dtype=bool, desc="Whether to transpose B before multiplying")
    alpha = properties.Property(
        allow_none=False,
        default=1,
        desc="A scalar which will be multiplied with A @ B before adding C")
    # NOTE(review): the description below says "before adding C"; given the
    # class docstring it presumably means "before adding it to
    # alpha * (A @ B)". The text is left unchanged here.
    beta = properties.Property(
        allow_none=False,
        default=0,
        desc="A scalar which will be multiplied with C before adding C")

    def __init__(self,
                 name,
                 location=None,
                 transA=False,
                 transB=False,
                 alpha=1,
                 beta=0):
        """
        Creates the node. The ``_cin`` input connector only exists when
        ``beta`` is nonzero.

        :param name: Name of the node.
        :param location: Forwarded to the library node constructor.
        :param transA: Whether to transpose A before multiplying.
        :param transB: Whether to transpose B before multiplying.
        :param alpha: Scalar factor for A @ B.
        :param beta: Scalar factor for C.
        """
        super().__init__(
            name,
            location=location,
            inputs=({"_a", "_b", "_cin"} if beta != 0 else {"_a", "_b"}),
            outputs={"_c"})
        self.transA = transA
        self.transB = transB
        self.alpha = alpha
        self.beta = beta

    def validate(self, sdfg, state):
        """
        Checks that the memlets attached to this node describe a valid
        matrix-matrix product.

        :param sdfg: The SDFG containing the node (not used by the checks).
        :param state: The state containing the node.
        :raises ValueError: If the number of inputs is not 2 or 3, if the
                            ``_a`` or ``_b`` input is missing, if there is
                            not exactly one output, if an operand or the
                            output is not two-dimensional after squeezing,
                            or if the operand and output sizes do not agree.
        """
        def squeezed_size(memlet):
            # Squeeze a copy so the memlet's own subset is left untouched
            subset = dc(memlet.subset)
            subset.squeeze()
            return subset.size()

        in_edges = state.in_edges(self)
        if len(in_edges) not in [2, 3]:
            raise ValueError("Expected 2 or 3 inputs to gemm")

        size0 = size1 = size2 = None
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn == '_a':
                size0 = squeezed_size(memlet)
            elif dst_conn == '_b':
                size1 = squeezed_size(memlet)
            elif dst_conn == '_c':
                # NOTE(review): the input connector declared in __init__ is
                # '_cin' ('_c' is the output connector), so this branch does
                # not match the declared C input. Kept as-is because checking
                # '_cin' against the output size would reject a broadcast C,
                # which the class docstring allows -- confirm intent.
                size2 = squeezed_size(memlet)

        # Without this check a missing operand surfaces as an
        # UnboundLocalError/TypeError instead of a validation error
        if size0 is None or size1 is None:
            raise ValueError(
                "Expected inputs on connectors '_a' and '_b' of gemm")

        if self.transA:
            size0 = list(reversed(size0))
        if self.transB:
            size1 = list(reversed(size1))

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError(
                "Expected exactly one output from matrix-matrix product")
        out_memlet = out_edges[0].data

        if len(size0) != 2 or len(size1) != 2:
            raise ValueError(
                "matrix-matrix product only supported on matrices")
        if size0[1] != size1[0]:
            raise ValueError("Inputs to matrix-matrix product "
                             "must agree in the k-dimension")

        size3 = squeezed_size(out_memlet)
        if size2 is not None and size2 != size3:
            raise ValueError("Input C matrix must match output matrix.")
        if len(size3) != 2:
            raise ValueError(
                "matrix-matrix product only supported on matrices")
        if list(size3) != [size0[-2], size1[-1]]:
            raise ValueError(
                "Output to matrix-matrix product must agree in the m and n "
                "dimensions")
Exemple #6
0
class StreamingComposition(xf.Transformation):
    """
    Converts two connected computations (nodes, map scopes) into two separate
    processing elements, with a stream connecting the results. Only applies
    if the memory access patterns of the two computations match.

    The matched pattern is the path ``first -> access -> second``. On
    application, the access node is replaced by a write node and a read node
    of a newly created transient stream.
    """
    # Pattern nodes: producer, intermediate access node, consumer
    first = xf.PatternNode(nodes.Node)
    access = xf.PatternNode(nodes.AccessNode)
    second = xf.PatternNode(nodes.Node)

    buffer_size = properties.Property(
        dtype=int,
        default=1,
        desc='Set buffer size for the newly-created stream')

    storage = properties.EnumProperty(
        dtype=dtypes.StorageType,
        desc='Set storage type for the newly-created stream',
        default=dtypes.StorageType.Default)

    @staticmethod
    def expressions() -> List[gr.SubgraphView]:
        """ Returns the single pattern ``first -> access -> second``. """
        return [
            sdutil.node_path_graph(StreamingComposition.first,
                                   StreamingComposition.access,
                                   StreamingComposition.second)
        ]

    @staticmethod
    def can_be_applied(graph: SDFGState,
                       candidate: Dict[xf.PatternNode, int],
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Checks whether the matched access node can be turned into a stream.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: Index of the matched expression (not used here).
        :param sdfg: The SDFG containing ``graph``.
        :param permissive: Not used by this check.
        :return: True if all conditions below hold, False otherwise.
        """
        access = graph.node(candidate[StreamingComposition.access])
        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.in_degree(access) > 1 or graph.out_degree(access) > 1:
            return False

        # If already a stream, skip
        if isinstance(sdfg.arrays[access.data], data.Stream):
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            # Reached the top-level SDFG: nothing further to check
            if curstate.parent.parent_nsdfg_node is None:
                break
            # Continue with the nested SDFG node in its parent state
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Array must not be used anywhere else in the state
        if any(n is not access and n.data == access.data
               for n in graph.data_nodes()):
            return False

        # Only one memlet path on each direction is allowed
        # TODO: Relax so that repeated application of
        # transformation would yield additional streams
        first_edge = graph.in_edges(access)[0]
        second_edge = graph.out_edges(access)[0]
        first_mpath = graph.memlet_path(first_edge)
        second_mpath = graph.memlet_path(second_edge)
        # A memlet tree with more edges than the path means the tree branches
        if len(first_mpath) != len(list(graph.memlet_tree(first_edge))):
            return False
        if len(second_mpath) != len(list(graph.memlet_tree(second_edge))):
            return False

        # The innermost ends of the paths must have a clearly defined memory
        # access pattern and no WCR
        # (WCR is only checked on the writing side, i.e., first_iedge)
        first_iedge = first_mpath[0]
        second_iedge = second_mpath[-1]
        if first_iedge.data.subset.num_elements() != 1:
            return False
        if first_iedge.data.volume != 1:
            return False
        if first_iedge.data.wcr is not None:
            return False
        if second_iedge.data.subset.num_elements() != 1:
            return False
        if second_iedge.data.volume != 1:
            return False

        ##################################################################
        # The memory access pattern must be exactly the same

        # Collect all maps and ranges
        ranges_first = _collect_map_ranges(graph, first_mpath)
        ranges_second = _collect_map_ranges(graph, second_mpath)

        # Check map ranges
        # NOTE(review): zip stops at the shorter list, so a difference in the
        # number of collected ranges is not detected by this loop -- confirm
        # whether _do_memlets_correspond covers that case.
        for (_, frng), (_, srng) in zip(ranges_first, ranges_second):
            if frng != srng:
                return False

        # Check memlets for equivalence
        if len(first_iedge.data.subset) != len(second_iedge.data.subset):
            return False
        if not _do_memlets_correspond(first_iedge.data, second_iedge.data,
                                      ranges_first, ranges_second):
            return False

        return True

    def apply(self, sdfg: SDFG) -> nodes.AccessNode:
        """
        Replaces the matched access node with a stream write node and a
        stream read node, and rewrites the memlets along both memlet paths.

        :param sdfg: The SDFG to modify.
        :return: The tuple ``(wnode, rnode)`` of the new stream write and
                 read access nodes (note that the annotation names a single
                 access node).
        """
        state = sdfg.node(self.state_id)
        access: nodes.AccessNode = self.access(sdfg)

        # Get memlet paths
        first_edge = state.in_edges(access)[0]
        second_edge = state.out_edges(access)[0]
        first_mpath = state.memlet_path(first_edge)
        second_mpath = state.memlet_path(second_edge)

        # Create new stream of shape 1
        desc = sdfg.arrays[access.data]
        name, newdesc = sdfg.add_stream(access.data,
                                        desc.dtype,
                                        buffer_size=self.buffer_size,
                                        storage=self.storage,
                                        transient=True,
                                        find_new_name=True)

        # Remove transient array if possible
        # (for-else: the descriptor is deleted only if no other state
        # contains an access node for the same data)
        for ostate in sdfg.nodes():
            if ostate is state:
                continue
            if any(n.data == access.data for n in ostate.data_nodes()):
                break
        else:
            del sdfg.arrays[access.data]

        # Replace memlets in path with stream access
        # Edges touching a nested SDFG are marked dynamic and the matching
        # connector is converted inside the nested SDFG as well
        for e in first_mpath:
            e.data = mm.Memlet(data=name, subset='0')
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)
        for e in second_mpath:
            e.data = mm.Memlet(data=name, subset='0')
            if isinstance(e.src, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.src, e.src_conn, newdesc)
            if isinstance(e.dst, nodes.NestedSDFG):
                e.data.dynamic = True
                _streamify_recursive(e.dst, e.dst_conn, newdesc)

        # Replace array access node with two stream access nodes
        wnode = state.add_write(name)
        rnode = state.add_read(name)
        state.remove_edge(first_edge)
        state.add_edge(first_edge.src, first_edge.src_conn, wnode,
                       first_edge.dst_conn, first_edge.data)
        state.remove_edge(second_edge)
        state.add_edge(rnode, second_edge.src_conn, second_edge.dst,
                       second_edge.dst_conn, second_edge.data)

        # Remove original access node
        state.remove_node(access)

        return wnode, rnode
Exemple #7
0
class StreamingMemory(xf.Transformation):
    """
    Converts a read or a write to streaming memory access, where data is
    read/written to/from a stream in a separate connected component than the
    computation.

    Two patterns are matched: ``access -> entry`` (expression 0, a read) and
    ``exit -> access`` (expression 1, a write).
    """
    # Pattern nodes: the data access and the scope entry/exit next to it
    access = xf.PatternNode(nodes.AccessNode)
    entry = xf.PatternNode(nodes.EntryNode)
    exit = xf.PatternNode(nodes.ExitNode)

    buffer_size = properties.Property(
        dtype=int,
        default=1,
        desc='Set buffer size for the newly-created stream')

    storage = properties.EnumProperty(
        dtype=dtypes.StorageType,
        desc='Set storage type for the newly-created stream',
        default=dtypes.StorageType.Default)

    @staticmethod
    def expressions() -> List[gr.SubgraphView]:
        """
        Returns the two patterns: index 0 is ``access -> entry``, index 1 is
        ``exit -> access``.
        """
        return [
            sdutil.node_path_graph(StreamingMemory.access,
                                   StreamingMemory.entry),
            sdutil.node_path_graph(StreamingMemory.exit,
                                   StreamingMemory.access),
        ]

    @staticmethod
    def can_be_applied(graph: SDFGState,
                       candidate: Dict[xf.PatternNode, int],
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Checks whether the matched access node can be converted to streaming
        memory access.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: 0 for the read pattern, otherwise the write
                           pattern.
        :param sdfg: The SDFG containing ``graph``.
        :param permissive: Not used by this check.
        :return: True if all conditions below hold, False otherwise.
        """
        access = graph.node(candidate[StreamingMemory.access])
        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.out_degree(access) > 0 and graph.in_degree(access) > 0:
            return False

        # If already a stream, skip
        if isinstance(sdfg.arrays[access.data], data.Stream):
            return False
        # If does not exist on off-chip memory, skip
        if sdfg.arrays[access.data].storage not in [
                dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_Pinned,
                dtypes.StorageType.GPU_Global, dtypes.StorageType.FPGA_Global
        ]:
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            # Reached the top-level SDFG: nothing further to check
            if curstate.parent.parent_nsdfg_node is None:
                break
            # Continue with the nested SDFG node in its parent state
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Only one memlet path is allowed per outgoing/incoming edge
        edges = (graph.out_edges(access)
                 if expr_index == 0 else graph.in_edges(access))
        for edge in edges:
            mpath = graph.memlet_path(edge)
            # A memlet tree with more edges than the path means it branches
            if len(mpath) != len(list(graph.memlet_tree(edge))):
                return False

            # The innermost end of the path must have a clearly defined memory
            # access pattern
            innermost_edge = mpath[-1] if expr_index == 0 else mpath[0]
            if (innermost_edge.data.subset.num_elements() != 1
                    or innermost_edge.data.dynamic
                    or innermost_edge.data.volume != 1):
                return False

            # Check if any of the maps has a dynamic range
            # These cases can potentially work but some nodes (and perhaps
            # tasklets) need to be replicated, which are difficult to track.
            for pe in mpath:
                node = pe.dst if expr_index == 0 else graph.entry_node(pe.src)
                if isinstance(
                        node,
                        nodes.MapEntry) and sdutil.has_dynamic_map_inputs(
                            graph, node):
                    return False

        # If already applied on this memlet and this is the I/O component, skip
        # (maps created by apply() are named with the '__s' prefix)
        if expr_index == 0:
            other_node = graph.node(candidate[StreamingMemory.entry])
        else:
            other_node = graph.node(candidate[StreamingMemory.exit])
            other_node = graph.entry_node(other_node)
        if other_node.label.startswith('__s'):
            return False

        return True

    def apply(self, sdfg: SDFG) -> List[nodes.AccessNode]:
        """
        Replaces every edge of the matched access node with a stream, and
        creates separate map/tasklet components that copy between the
        original data and the new streams.

        :param sdfg: The SDFG to modify.
        :return: The list of stream access nodes created in the new
                 read/write components.
        """
        state = sdfg.node(self.state_id)
        dnode: nodes.AccessNode = self.access(sdfg)
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern
        # (the stored values are pairs of
        # (copy of innermost edge, original outermost edge))
        components_to_create: Dict[
            Tuple[symbolic.SymbolicType],
            List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                mpath = state.memlet_path(edge)
                # Deep copy, taken before the path memlets are replaced below
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                               0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    # If the destination of this edge is reachable from the
                    # destination of another edge in the same component,
                    # move this edge into a component of its own
                    if any(
                            nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                            for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [
                        c for c in component if c not in edges_to_remove
                    ]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:
            name, newdesc = sdfg.add_stream(dnode.data,
                                            desc.dtype,
                                            buffer_size=self.buffer_size,
                                            storage=self.storage,
                                            transient=True,
                                            find_new_name=True)
            streams[edge] = name
            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            # Edges touching a nested SDFG are marked dynamic and the
            # matching connector is converted inside the nested SDFG as well
            for e in mpath:
                e.data = mm.Memlet(data=name,
                                   subset='0',
                                   other_subset=e.data.other_subset)
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst,
                               edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement,
                               edge.dst_conn, edge.data)

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                # Read component: one read of the original data is written
                # to every stream of the component
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append(
                        (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '\n'.join('__out%d = __inp' % i
                                 for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(
                        f'More than one input found for the same index for {dnode.data}'
                    )
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append(
                        (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            # (one new map per scope entry on the path, with the same
            # parameters, ranges and schedule; the '__s' name prefix is what
            # can_be_applied tests for)
            maps = []
            for entry in path:
                map: nodes.Map = entry.map
                maps.append(
                    state.add_map(f'__s{opname}_{mapname}',
                                  [(p, r)
                                   for p, r in zip(map.params, map.range)],
                                  map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            # Connect inputs through all map entries and outputs through all
            # map exits (in reverse order)
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node,
                                      *(me for me, _ in maps),
                                      tasklet,
                                      dst_conn=cname,
                                      memlet=memlet)
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet,
                                      *(mx for _, mx in reversed(maps)),
                                      node,
                                      src_conn=cname,
                                      memlet=memlet)

        return ionodes
Exemple #8
0
class CopyToDevice(pattern_matching.Transformation):
    """ Implements the copy-to-device transformation, which copies a nested
        SDFG and its dependencies to a given device.

        The transformation changes all data storage types of a nested SDFG to
        the given `storage` property, and creates new arrays and copies around
        the nested SDFG to that storage.
    """

    _nested_sdfg = nodes.NestedSDFG("", graph.OrderedDiGraph(), {}, {})

    storage = properties.Property(dtype=dtypes.StorageType,
                                  desc="Nested SDFG storage",
                                  choices=dtypes.StorageType,
                                  from_string=lambda x: dtypes.StorageType[x],
                                  default=dtypes.StorageType.Default)

    @staticmethod
    def annotates_memlets():
        """ This transformation reports that it annotates memlets itself. """
        return True

    @staticmethod
    def expressions():
        """ Returns the single pattern: one nested SDFG node. """
        return [sdutil.node_path_graph(CopyToDevice._nested_sdfg)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """
        Rejects nested SDFGs that are connected to streams or that have
        write-conflict-resolution edges covering more than one element.

        :param graph: The state in which the pattern was matched.
        :param candidate: Mapping from pattern nodes to node IDs in ``graph``.
        :param expr_index: Index of the matched expression (not used here).
        :param sdfg: The SDFG containing ``graph``.
        :param strict: Not used by this check.
        :return: True if the transformation can be applied.
        """
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]

        for edge in graph.all_edges(nested_sdfg):
            # Stream inputs/outputs not allowed
            path = graph.memlet_path(edge)
            if ((isinstance(path[0].src, nodes.AccessNode)
                 and isinstance(sdfg.arrays[path[0].src.data], data.Stream)) or
                (isinstance(path[-1].dst, nodes.AccessNode)
                 and isinstance(sdfg.arrays[path[-1].dst.data], data.Stream))):
                return False
            # WCR outputs with arrays are not allowed
            if (edge.data.wcr is not None
                    and edge.data.subset.num_elements() != 1):
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ A match is displayed using the label of the nested SDFG node. """
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]
        return nested_sdfg.label

    @staticmethod
    def _add_device_container(sdfg, name, memdata, memlet, storage):
        """
        Registers a transient container on ``storage`` that mirrors
        ``memdata`` and returns the name it was registered under.

        :param sdfg: The SDFG to add the container to.
        :param name: Requested container name; a new name is chosen if it is
                     already taken.
        :param memdata: Descriptor of the original data (array or scalar).
        :param memlet: Memlet whose bounding box determines the array shape.
        :param storage: Storage type of the new container.
        :raises NotImplementedError: If ``memdata`` is neither an array nor a
                                     scalar.
        """
        if isinstance(memdata, data.Array):
            name, _ = sdfg.add_array(name,
                                     shape=[
                                         symbolic.overapproximate(r)
                                         for r in memlet.bounding_box_size()
                                     ],
                                     dtype=memdata.dtype,
                                     transient=True,
                                     storage=storage,
                                     find_new_name=True)
        elif isinstance(memdata, data.Scalar):
            # find_new_name is passed on both the input and output side so
            # that a name clash never aborts the transformation
            name, _ = sdfg.add_scalar(name,
                                      dtype=memdata.dtype,
                                      transient=True,
                                      storage=storage,
                                      find_new_name=True)
        else:
            raise NotImplementedError
        return name

    @staticmethod
    def _rebase_subset(target_memlet, memlet):
        """
        Shifts every dimension of ``target_memlet.subset`` by the first
        element of the corresponding dimension of ``memlet.subset``, so that
        the subset is expressed relative to the start of the copied region.
        """
        for ind, r in enumerate(memlet.subset):
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                target_memlet.subset[ind] = (begin, end, step)
            else:
                target_memlet.subset[ind] -= r[0]

    def apply(self, sdfg):
        """
        Inserts a device-side access node on every data edge entering or
        leaving the nested SDFG, then changes the storage of all data inside
        the nested SDFG.

        :param sdfg: The SDFG to modify.
        """
        state = sdfg.nodes()[self.state_id]
        nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]]
        storage = self.storage
        created_arrays = set()

        for edge in state.in_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                # Edge carries no data; leave it untouched
                continue
            memdata = sdfg.arrays[dataname]

            name = 'device_' + dataname + '_in'
            if name not in created_arrays:
                name = self._add_device_container(sdfg, name, memdata, memlet,
                                                  storage)
                created_arrays.add(name)

            data_node = nodes.AccessNode(name)

            # Original data -> device copy keeps the original memlet; device
            # copy -> nested SDFG refers to the new container, rebased
            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            from_data_mm.data = name
            self._rebase_subset(from_data_mm, memlet)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        for edge in state.out_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                # Edge carries no data; leave it untouched
                continue
            memdata = sdfg.arrays[dataname]

            name = 'device_' + dataname + '_out'
            if name not in created_arrays:
                name = self._add_device_container(sdfg, name, memdata, memlet,
                                                  storage)
                created_arrays.add(name)

            data_node = nodes.AccessNode(name)

            # Nested SDFG -> device copy refers to the new container,
            # rebased; device copy -> original data keeps the original memlet
            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            to_data_mm.data = name
            self._rebase_subset(to_data_mm, memlet)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        # Change storage for all data inside nested SDFG to device.
        change_storage(nested_sdfg.sdfg, storage)
Exemple #9
0
class BatchedMatMul(dace.sdfg.nodes.LibraryNode):
    """Library node describing a batched matrix-matrix product.

    Operands are read from the ``_a`` and ``_b`` input connectors and the
    result is written to the ``_c`` output connector.
    """

    # Global properties
    implementations = {
        "pure": ExpandBatchedMatMulPure,
        "MKL": ExpandBatchedMatMulMKL,
        "OpenBLAS": ExpandBatchedMatMulOpenBLAS,
        "cuBLAS": ExpandBatchedMatMulCuBLAS
    }
    transA = properties.Property(
        dtype=bool, desc="Whether to transpose A before multiplying")
    transB = properties.Property(
        dtype=bool, desc="Whether to transpose B before multiplying")
    alpha = properties.Property(
        allow_none=False,
        default=1,
        desc="A scalar which will be multiplied with A @ B before adding C")
    beta = properties.Property(
        allow_none=False,
        default=0,
        desc="A scalar which will be multiplied with C before adding C")
    algorithm = properties.Property(
        dtype=str,
        allow_none=True,
        default=None,
        desc="If applicable, chooses the vendor-provided implementation "
        "(algorithm) for the multiplication")
    accumulator_type = properties.TypeClassProperty(
        default=None,
        choices=dtypes.Typeclasses,
        allow_none=True,
        desc="Accumulator or intermediate storage type used in multiplication")
    compute_type = properties.Property(
        default=None,
        dtype=str,
        allow_none=True,
        desc="If applicable, overrides computation type (CUBLAS-specific, see "
        "``cublasComputeType_t``)")

    default_implementation = None

    def __init__(self, name, location=None):
        """Create the node with inputs ``_a``/``_b`` and output ``_c``.

        :param name: Name forwarded to the library-node constructor.
        :param location: Location forwarded to the library-node constructor.
        """
        super().__init__(name,
                         location=location,
                         inputs={'_a', '_b'},
                         outputs={'_c'})

    @staticmethod
    def _squeezed_size(memlet):
        """Return the size of ``memlet``'s subset after calling ``squeeze()``
        on a copy of it (the memlet itself is left untouched)."""
        subset = dc(memlet.subset)
        subset.squeeze()
        return subset.size()

    def validate(self, sdfg, state):
        """Check that the memlets connected to this node are compatible.

        The checks performed, in order, are: exactly two incoming edges,
        exactly one outgoing edge, both ``_a`` and ``_b`` are connected,
        the squeezed ``_a`` subset has 2 or 3 dimensions, the squeezed ``_b``
        subset has 3 dimensions, the last dimension of ``_a`` equals the
        second-to-last dimension of ``_b``, and the squeezed output subset
        has 3 dimensions.

        :param sdfg: The SDFG containing this node (not used by the checks).
        :param state: The state containing this node.
        :raises ValueError: If any of the checks above fails.
        """
        in_edges = state.in_edges(self)
        if len(in_edges) != 2:
            raise ValueError(
                "Expected exactly two inputs to batched matrix-matrix product")

        # Sizes stay None if the corresponding connector is not wired, so the
        # problem can be reported as a validation error further down.
        size0, size1 = None, None
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn == '_a':
                size0 = self._squeezed_size(memlet)
            if dst_conn == '_b':
                size1 = self._squeezed_size(memlet)

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError("Expected exactly one output from "
                             "batched matrix-matrix product")
        out_memlet = out_edges[0].data

        if size0 is None or size1 is None:
            raise ValueError("Batched matrix-matrix product requires inputs "
                             "on both the '_a' and '_b' connectors")

        if len(size0) not in [2, 3]:
            raise ValueError(
                "Batched matrix-matrix product only supported on matrices")
        if len(size1) != 3:
            raise ValueError(
                "Batched matrix-matrix product only supported on matrices")
        if size0[-1] != size1[-2]:
            raise ValueError("Inputs to matrix-matrix product "
                             "must agree in the k-dimension")

        size2 = self._squeezed_size(out_memlet)
        if len(size2) != 3:
            raise ValueError(
                "batched matrix-matrix product only supported on matrices")
Exemple #10
0
class Gemv(dace.sdfg.nodes.LibraryNode):
    """Library node describing a matrix-vector product.

    The matrix arrives on ``_A`` and the vector on ``_x``; the result leaves
    on ``_y``. When ``beta`` is non-zero at construction time, ``_y`` is also
    registered as an input connector.
    """

    # Global properties
    implementations = {
        "pure": ExpandGemvPure,
        "OpenBLAS": ExpandGemvOpenBLAS,
        "MKL": ExpandGemvMKL,
        "cuBLAS": ExpandGemvCuBLAS,
        "FPGA_Accumulate": ExpandGemvFpgaAccumulate,
        "FPGA_TilesByColumn": ExpandGemvFpgaTilesByColumn,
        "PBLAS": ExpandGemvPBLAS
    }
    default_implementation = None

    # Object fields
    alpha = properties.SymbolicProperty(allow_none=False, default=1)
    beta = properties.SymbolicProperty(allow_none=False, default=0)

    transA = properties.Property(
        dtype=bool, desc="Whether to transpose A before multiplying")

    n = properties.SymbolicProperty(allow_none=True, default=None)
    m = properties.SymbolicProperty(allow_none=True, default=None)

    def __init__(self, name, location=None, transA=False, alpha=1, beta=0):
        """Create the node and store its scaling/transposition settings.

        :param name: Name forwarded to the library-node constructor.
        :param location: Location forwarded to the library-node constructor.
        :param transA: Stored in the ``transA`` property.
        :param alpha: Stored in the ``alpha`` property.
        :param beta: Stored in the ``beta`` property; a non-zero value adds
                     ``_y`` to the set of input connectors.
        """
        super().__init__(
            name,
            location=location,
            inputs={"_A", "_x", "_y"} if beta != 0 else {"_A", "_x"},
            outputs={"_y"})
        self.transA = transA
        self.alpha = alpha
        self.beta = beta

    @staticmethod
    def _squeezed_size(memlet):
        """Return the size of ``memlet``'s subset after calling ``squeeze()``
        on a copy of it (the memlet itself is left untouched)."""
        subset = copy.deepcopy(memlet.subset)
        subset.squeeze()
        return subset.size()

    def validate(self, sdfg, state):
        """Check that the memlets connected to this node are compatible.

        Verifies the number of incoming (2 or 3) and outgoing (1) edges, that
        ``_A`` is two-dimensional and ``_x`` one-dimensional after squeezing,
        that the (possibly transposed) column count of ``_A`` equals the size
        of ``_x``, that an incoming ``_y`` has the same size as the outgoing
        one, and that the output is one-dimensional with as many entries as
        the (possibly transposed) matrix has rows.

        :param sdfg: The SDFG containing this node (not used by the checks).
        :param state: The state containing this node.
        :raises ValueError: If any of the checks above fails.
        """
        in_edges = state.in_edges(self)
        if len(in_edges) not in [2, 3]:
            raise ValueError("Expected 2 or 3 inputs to GEMV")

        # Sizes stay None when the corresponding connector is not wired.
        size_a, size_x, size_y_in = None, None, None
        for _, _, _, dst_conn, memlet in in_edges:
            if dst_conn == "_A":
                size_a = self._squeezed_size(memlet)
            if dst_conn == "_x":
                size_x = self._squeezed_size(memlet)
            if dst_conn == "_y":
                size_y_in = self._squeezed_size(memlet)

        # Report missing mandatory inputs as a validation error rather than
        # failing later on an unbound local.
        if size_a is None or size_x is None:
            raise ValueError(
                "GEMV requires inputs on both the '_A' and '_x' connectors")

        if len(size_a) != 2 or len(size_x) != 1:
            raise ValueError(
                "Matrix-vector product only supported on matrix-vector input")

        # Transposition swaps which dimension of A plays the row/column role.
        a_cols = size_a[1] if not self.transA else size_a[0]
        a_rows = size_a[0] if not self.transA else size_a[1]

        if a_cols != size_x[0]:
            raise ValueError(f"Columns of A ({a_cols}) don't match "
                             f"size of x ({size_x[0]}).")

        out_edges = state.out_edges(self)
        if len(out_edges) != 1:
            raise ValueError(
                "Expected exactly one output from matrix-vector product")
        out_memlet = out_edges[0].data

        size_y_out = self._squeezed_size(out_memlet)
        if size_y_in is not None and size_y_in != size_y_out:
            raise ValueError("Input y-vector must match output y-vector.")
        if (len(size_y_out) != 1 or size_y_out[0] != a_rows):
            raise ValueError("Vector input to GEMV must match matrix rows.")
Exemple #11
0
class PruneConnectors(pm.SingleStateTransformation, pm.SimplifyPass):
    """ Removes unused connectors from nested SDFGs, as well as their memlets
        in the outer scope, replacing them with empty memlets if necessary.

        Optionally: after pruning, removes the unused containers from parent SDFG.
    """

    nsdfg = pm.PatternNode(nodes.NestedSDFG)

    remove_unused_containers = properties.Property(
        dtype=bool,
        default=False,
        desc='If True, remove unused containers from parent SDFG.')

    @classmethod
    def expressions(cls):
        """Match a single nested-SDFG node."""
        return [utils.node_path_graph(cls.nsdfg)]

    def _prune_candidates(self, state):
        """Compute which connectors of the matched nested SDFG may be pruned.

        Shared by ``can_be_applied`` and ``apply`` so that both operate on
        the same candidate sets.

        :param state: The state containing the nested SDFG node.
        :return: A four-tuple ``(read_set, write_set, prune_in, prune_out)``
                 where the first two are the nested SDFG's read and write
                 sets, and the last two are the input/output connector names
                 that are candidates for removal.
        """
        nsdfg = self.nsdfg

        read_set, write_set = nsdfg.sdfg.read_and_write_sets()
        prune_in = nsdfg.in_connectors.keys() - read_set
        prune_out = nsdfg.out_connectors.keys() - write_set

        # Take into account symbol mappings: a connector referenced by the
        # symbol mapping (or declared as a symbol inside) must be kept.
        strs = tuple(nsdfg.symbol_mapping.values())
        syms = tuple(symbolic.pystr_to_symbolic(s) for s in strs)
        symnames = tuple(s.name if hasattr(s, 'name') else '' for s in syms)
        for conn in list(prune_in):
            if conn in syms or conn in symnames or conn in nsdfg.sdfg.symbols:
                prune_in.remove(conn)

        # Add WCR outputs to "do not prune" input list
        for e in state.out_edges(nsdfg):
            if e.data.wcr is not None and e.src_conn in prune_in:
                producer = next(
                    iter(state.in_edges_by_connector(nsdfg, e.src_conn))).src
                if state.in_degree(producer) > 0:
                    prune_in.remove(e.src_conn)

        return read_set, write_set, prune_in, prune_out

    def can_be_applied(self,
                       graph: SDFGState,
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """Return True if at least one connector can be pruned.

        The transformation is rejected when every candidate input's memlet
        path starts at a node with incoming edges *and* every candidate
        output's memlet path ends at a node with outgoing edges (this also
        covers the case of no candidates at all).
        """
        _, _, prune_in, prune_out = self._prune_candidates(graph)
        nsdfg = self.nsdfg

        has_before = all(
            graph.in_degree(graph.memlet_path(e)[0].src) > 0
            for e in graph.in_edges(nsdfg) if e.dst_conn in prune_in)
        has_after = all(
            graph.out_degree(graph.memlet_path(e)[-1].dst) > 0
            for e in graph.out_edges(nsdfg) if e.src_conn in prune_out)
        if has_before and has_after:
            return False

        return len(prune_in) > 0 or len(prune_out) > 0

    def apply(self, state: SDFGState, sdfg: SDFG):
        """Remove the prunable connectors and their outer memlet paths.

        Connectors whose outer memlet path touches a node that has further
        producers (inputs) or consumers (outputs) are left in place. Data
        containers of removed connectors are deleted from the nested SDFG
        when they are neither read nor written inside it. If
        ``remove_unused_containers`` is set, removal of every container of
        ``sdfg`` is additionally attempted on the parent SDFGs.
        """
        nsdfg = self.nsdfg

        # Uses the same candidate computation as can_be_applied, so that
        # connectors referenced by the symbol mapping are never pruned.
        read_set, write_set, prune_in, prune_out = self._prune_candidates(
            state)

        # Detect which nodes are used, so we can delete unused nodes after the
        # connectors have been pruned
        all_data_used = read_set | write_set

        do_not_prune = set()
        for conn in prune_in:
            if any(
                    state.in_degree(state.memlet_path(e)[0].src) > 0
                    for e in state.in_edges(nsdfg) if e.dst_conn == conn):
                do_not_prune.add(conn)
                continue
            for e in state.in_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)

        for conn in prune_out:
            if any(
                    state.out_degree(state.memlet_path(e)[-1].dst) > 0
                    for e in state.out_edges(nsdfg) if e.src_conn == conn):
                do_not_prune.add(conn)
                continue
            for e in state.out_edges_by_connector(nsdfg, conn):
                state.remove_memlet_path(e, remove_orphans=True)

        for conn in prune_in:
            if conn in nsdfg.sdfg.arrays and conn not in all_data_used and conn not in do_not_prune:
                # If the data is now unused, we can purge it from the SDFG
                nsdfg.sdfg.remove_data(conn)
        for conn in prune_out:
            if conn in nsdfg.sdfg.arrays and conn not in all_data_used and conn not in do_not_prune:
                # If the data is now unused, we can purge it from the SDFG
                nsdfg.sdfg.remove_data(conn)

        if self.remove_unused_containers:
            # Remove unused containers from parent SDFGs
            containers = list(sdfg.arrays.keys())
            for name in containers:
                s = nsdfg.sdfg
                while s.parent_sdfg:
                    s = s.parent_sdfg
                    try:
                        s.remove_data(name)
                    except ValueError:
                        # Stop climbing as soon as a parent refuses removal.
                        break
Exemple #12
0
class CopyToDevice(pattern_matching.Transformation):
    """ Implements the copy-to-device transformation, which copies a nested
        SDFG and its dependencies to a given device.

        The transformation changes all data storage types of a nested SDFG to
        the given `storage` property, and creates new arrays and copies around
        the nested SDFG to that storage.
    """

    _nested_sdfg = nodes.NestedSDFG("", graph.OrderedDiGraph(), set(), set())

    storage = properties.Property(
        dtype=dtypes.StorageType,
        desc="Nested SDFG storage",
        choices=dtypes.StorageType,
        from_string=lambda x: dtypes.StorageType[x],
        default=dtypes.StorageType.Default)

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        """Match a single nested-SDFG node."""
        return [nxutil.node_path_graph(CopyToDevice._nested_sdfg)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """Every matched nested SDFG is accepted."""
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """Describe a match by the label of the matched nested SDFG."""
        nested_sdfg = graph.nodes()[candidate[CopyToDevice._nested_sdfg]]
        return nested_sdfg.label

    @staticmethod
    def _add_device_container(sdfg, name, memlet, memdata, storage):
        """Register a transient container called ``name`` in ``sdfg``.

        Arrays get the over-approximated bounding-box size of ``memlet`` as
        their shape; scalars only inherit the data type.

        :raises NotImplementedError: If ``memdata`` is neither an array nor
                                     a scalar descriptor.
        """
        if isinstance(memdata, data.Array):
            # Keyword arguments avoid any dependence on the positional order
            # of ``shape`` and ``dtype``.
            sdfg.add_array(name,
                           shape=[
                               symbolic.overapproximate(r)
                               for r in memlet.bounding_box_size()
                           ],
                           dtype=memdata.dtype,
                           transient=True,
                           storage=storage)
        elif isinstance(memdata, data.Scalar):
            sdfg.add_scalar(name,
                            dtype=memdata.dtype,
                            transient=True,
                            storage=storage)
        else:
            raise NotImplementedError

    @staticmethod
    def _rebase_subset(memlet, target):
        """Shift ``target.subset`` so each dimension starts at the origin of
        the corresponding dimension of ``memlet.subset``."""
        for ind, r in enumerate(memlet.subset):
            if isinstance(memlet.subset[ind], tuple):
                begin = memlet.subset[ind][0] - r[0]
                end = memlet.subset[ind][1] - r[0]
                step = memlet.subset[ind][2]
                target.subset[ind] = (begin, end, step)
            else:
                target.subset[ind] -= r[0]

    def apply(self, sdfg):
        """Insert device-side copies around the matched nested SDFG.

        Every non-empty incoming and outgoing memlet of the nested SDFG is
        split in two by a new access node (``device_<data>_in`` or
        ``device_<data>_out``) backed by a transient container with the
        configured storage. Finally the storage of all data inside the
        nested SDFG is changed to that storage.

        :param sdfg: The SDFG containing the matched state.
        :raises NotImplementedError: If an edge refers to data that is
                                     neither an array nor a scalar.
        """
        state = sdfg.nodes()[self.state_id]
        nested_sdfg = state.nodes()[self.subgraph[CopyToDevice._nested_sdfg]]
        storage = self.storage

        for edge in state.in_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                # Empty memlets carry no data, so there is nothing to copy.
                continue
            memdata = sdfg.arrays[dataname]

            name = 'device_' + dataname + '_in'
            self._add_device_container(sdfg, name, memlet, memdata, storage)
            data_node = nodes.AccessNode(name)

            # The host-side memlet keeps addressing the original data; the
            # device-side memlet addresses the new, zero-based container.
            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            from_data_mm.data = name
            self._rebase_subset(memlet, from_data_mm)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        for edge in state.out_edges(nested_sdfg):
            src, src_conn, dst, dst_conn, memlet = edge
            dataname = memlet.data
            if dataname is None:
                # Empty memlets carry no data, so there is nothing to copy.
                continue
            memdata = sdfg.arrays[dataname]

            name = 'device_' + dataname + '_out'
            # The container must be registered in the SDFG (not merely
            # constructed), otherwise the access node below would refer to
            # data that does not exist.
            self._add_device_container(sdfg, name, memlet, memdata, storage)
            data_node = nodes.AccessNode(name)

            to_data_mm = dcpy(memlet)
            from_data_mm = dcpy(memlet)
            to_data_mm.data = name
            self._rebase_subset(memlet, to_data_mm)

            state.remove_edge(edge)
            state.add_edge(src, src_conn, data_node, None, to_data_mm)
            state.add_edge(data_node, None, dst, dst_conn, from_data_mm)

        # Change storage for all data inside nested SDFG to device.
        change_storage(nested_sdfg.sdfg, storage)
Exemple #13
0
class BankSplit(transformation.SingleStateTransformation):
    """
    A transformation that allows splitting an array and distributing it on another
    array with one dimension more, or vice versa. Works with arbitrary arrays,
    but its intended use case is to distribute data on many HBM-banks.
    Matches any 2 AccessNodes connected by an edge, if the dimensionality of the two accessed
    arrays differ by exactly one. The sizes of the arrays have to be large enough with
    respect to the split executed, but this is not verified. While it is allowed to use symbolics
    for the shapes of the array, it is expected that each dimension is divisible by the number
    of splits specified.

    When applying, an unrolled map is generated around the accessnodes, which copies the parts of
    the array to the target array.

    Examples:
    Distribute: Suppose for example we copy from A to B, where A has shape [100, 100] and B shape
    [10, 100, 10]. We can distribute A in that case to B using the transformation by setting
    split_array_info=[1, 10]. A will then be divided along its second dimension into 10 parts
    of size [100, 10] and distributed on B.
    Gather: Suppose A has shape [4, 50, 50] and B has shape [100, 100]. If one sets
    split_array_info to [2, 2] and applies the transformation, it will split
    equally in all dimensions.
    Therefore A[0] will be copied to B[0:50, 0:50], A[1] to B[0:50, 50:100], A[2] to B[50:100, 0:50] and
    A[3] to B[50:100, 50:100].

    Note that simply reversing the AccessNodes for the arrays in the above examples would
    have led to the inverse operation, i.e. the gather would become a distribute and
    the other way around.
    """

    src_node = transformation.PatternNode(nd.AccessNode)
    dst_node = transformation.PatternNode(nd.AccessNode)

    # dtype=List[int]
    split_array_info = properties.Property(
        dtype=List,
        default=None,
        allow_none=True,
        desc="Describes how many times this array is split in each dimension, "
        "where the k-th number describes how many times dimension k is split. "
        "If the k-th number is 1 this means that the array is not split in "
        "the k-th dimension at all. "
        "If None, then the transform will split the first dimension exactly shape[0] times.")

    default_to_storage = properties.Property(
        dtype=dtypes.StorageType,
        default=dtypes.StorageType.CPU_Heap,
        allow_none=False,
        desc="The storage type of involved arrays will be set to the value of this property if "
        "they have Default storage type. ")

    def _get_split_size(self, virtual_shape: Iterable, split_count: List[int]) -> List[int]:
        """
        :param virtual_shape: Shape of the full (unsplit) array.
        :param split_count: Number of splits per dimension; must provide an
                            entry for every dimension of ``virtual_shape``.
        :return: the shape of a part-array on one HBMbank
        """
        # Dimensions with a split count of 1 are passed through untouched so
        # that no floor-division is applied to them.
        return [
            extent // split_count[d] if split_count[d] != 1 else extent
            for d, extent in enumerate(virtual_shape)
        ]

    def can_be_applied(self, graph: SDFGState, expr_index: int, sdfg: SDFG, permissive: bool = False) -> bool:
        """
        Accept two plain (non-view) arrays whose dimensionality differs by
        exactly one and whose bank dimension (the first dimension of the
        array with more dimensions) is not symbolic.
        """
        src = self.src_node
        dst = self.dst_node
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]

        plain_array = lambda array: isinstance(array, data.Array) and not isinstance(array, data.View)

        if not plain_array(src_array):
            return False
        if not plain_array(dst_array):
            return False

        # same dimensions means HBM-array needs 1 dimension more
        collect_src = len(src_array.shape) - 1 == len(dst_array.shape)
        distribute_dst = len(src_array.shape) + 1 == len(dst_array.shape)
        if collect_src and symbolic.issymbolic(src_array.shape[0], sdfg.constants):
            return False
        elif distribute_dst and symbolic.issymbolic(dst_array.shape[0], sdfg.constants):
            return False
        return collect_src or distribute_dst

    @classmethod
    def expressions(cls):
        """Match a source access node directly connected to a destination access node."""
        return [utils.node_path_graph(cls.src_node, cls.dst_node)]

    def apply(self, graph: SDFGState, sdfg: SDFG) -> Union[Any, None]:
        """
        Replace the direct copy between the two access nodes by an unrolled
        map that copies one part per bank.

        :raises RuntimeError: If ``split_array_info`` does not have one entry
                              per dimension, or if the product of its entries
                              differs from the number of banks.
        """
        # Load/parse infos from the SDFG
        src = self.src_node
        dst = self.dst_node
        src_array = sdfg.arrays[src.data]
        dst_array = sdfg.arrays[dst.data]
        collect_src = len(src_array.shape) - 1 == len(
            dst_array.shape)  # If this is not true we have to distribute to dst (checked in can_apply)
        if collect_src:
            bank_count = int(src_array.shape[0])
            true_size = dst_array.shape
        else:
            bank_count = int(dst_array.shape[0])
            true_size = src_array.shape
        ndim = len(true_size)

        # Move Default storage
        if sdfg.arrays[src.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[src.data].storage = self.default_to_storage
        if sdfg.arrays[dst.data].storage == dtypes.StorageType.Default:
            sdfg.arrays[dst.data].storage = self.default_to_storage

        # Figure out how to split
        if self.split_array_info is None:
            split_info = [1] * ndim
            split_info[0] = bank_count
        else:
            split_info = self.split_array_info
            if len(split_info) != ndim:
                raise RuntimeError("Length of split_array_info must match number of dimensions")
        # The initial value keeps the product well-defined for an empty list.
        if functools.reduce(lambda a, b: a * b, split_info, 1) != bank_count:
            raise RuntimeError("Splitting is not possible with the selected splits "
                               "and this number of HBM-banks (required number of banks "
                               "!= actual number of banks)")

        # create the copy-subgraph
        usable_params = [f"i{i}" for i in range(ndim)]
        ndrange = {param: f"0:{count}" for param, count in zip(usable_params, split_info)}
        graph.remove_edge_and_connectors(graph.edges_between(src, dst)[0])
        copy_map_enter, copy_map_exit = graph.add_map("hbm_bank_split", ndrange, dtypes.ScheduleType.Unrolled)
        graph.add_edge(copy_map_enter, None, src, None, memlet.Memlet())
        graph.add_edge(dst, None, copy_map_exit, None, memlet.Memlet())

        target_size = [str(x) for x in self._get_split_size(true_size, split_info)]

        # Build the terms of the linearized bank index: every earlier
        # parameter is multiplied by the split counts of all later dimensions.
        target_hbm_bank = []
        for i in range(ndim):
            target_hbm_bank.append(usable_params[i])
            for j in range(i):
                target_hbm_bank[j] = f"{split_info[i]}*{target_hbm_bank[j]}"
        target_offset = [f"{param}*{size}" for param, size in zip(usable_params, target_size)]

        target_size_str = ", ".join([f"{x}:{y}" for x, y in zip([0] * ndim, target_size)])
        target_hbm_bank_str = "+ ".join(target_hbm_bank)
        target_offset_str = ", ".join([f"({x}):({x}+{y})" for x, y in zip(target_offset, target_size)])
        if collect_src:
            copy_memlet = memlet.Memlet(f"{src.data}[{target_hbm_bank_str}, {target_size_str}]->"
                                        f"{target_offset_str}")
        else:
            copy_memlet = memlet.Memlet(f"{src.data}[{target_offset_str}]->{target_hbm_bank_str}, "
                                        f"{target_size_str}")
        graph.add_edge(src, None, dst, None, copy_memlet)
Exemple #14
0
class WarpTiling(xf.SingleStateTransformation):
    """ 
    Implements a GPU specialization tiling that takes a GPU kernel map (with 
    nested maps, but without explicit block sizes) and divides its work across
    a warp. Specifically, it tiles its contents by a configurable warp size 
    (default: 32), and optionally preferring recomputation (map replication) 
    over local storage within the kernel. If write-conflicted reductions happen 
    within the given map, the transformation adds warp reductions to the tiles.
    """

    warp_size = properties.Property(dtype=int,
                                    default=32,
                                    desc='Hardware warp size')
    replicate_maps = properties.Property(
        dtype=bool,
        default=True,
        desc='Replicate tiled maps that lead to multiple other tiled maps')

    mapentry = xf.PatternNode(nodes.MapEntry)

    @classmethod
    def expressions(cls):
        """Match a single map entry node."""
        return [sdutil.node_path_graph(cls.mapentry)]

    def can_be_applied(self, graph: SDFGState, expr_index, sdfg: SDFG,
                       permissive) -> bool:
        """
        Accept a map with the ``GPU_Device`` schedule that contains at least
        one immediate internal scope and has no explicit thread-block maps.
        """
        me = self.mapentry

        if len(xfh.get_internal_scopes(graph, me, immediate=True)) == 0:
            return False

        # GPU map that has no predefined thread-block maps
        return (me.schedule == dtypes.ScheduleType.GPU_Device
                and not xfh.gpu_map_has_explicit_threadblocks(graph, me))

    def apply(self, graph: SDFGState, sdfg: SDFG) -> nodes.MapEntry:
        """
        Insert a ``warp_tile`` thread-block map (parameter ``__tid``) inside
        the matched map, stride the immediate internal maps by the warp size,
        optionally replicate maps feeding several consumers, turn
        single-element write-conflict outputs into warp reductions, and
        finally nest the new scope into a nested SDFG.

        :param graph: The state containing the matched map.
        :param sdfg: The SDFG containing ``graph``.
        :return: The entry node of the newly created ``warp_tile`` map.
        :raises NotImplementedError: If a write-conflict resolution is
                                     detected as a custom reduction.
        """
        me = self.mapentry

        # Add new map within map
        mx = graph.exit_node(me)
        new_me, new_mx = graph.add_map('warp_tile',
                                       dict(__tid=f'0:{self.warp_size}'),
                                       dtypes.ScheduleType.GPU_ThreadBlock)
        __tid = symbolic.pystr_to_symbolic('__tid')
        for e in graph.out_edges(me):
            xfh.reconnect_edge_through_map(graph, e, new_me, True)
        for e in graph.in_edges(mx):
            xfh.reconnect_edge_through_map(graph, e, new_mx, False)

        # Stride and offset all internal maps
        maps_to_stride = xfh.get_internal_scopes(graph, new_me, immediate=True)
        for nstate, nmap in maps_to_stride:
            nsdfg = nstate.parent
            nsdfg_node = nsdfg.parent_nsdfg_node

            # Map cannot be partitioned across a warp
            # NOTE(review): the explicit `== True` presumably guards against a
            # symbolic (non-boolean) comparison result -- confirm before
            # simplifying.
            if (nmap.range.size()[-1] < self.warp_size) == True:
                continue

            if nsdfg is not sdfg and nsdfg_node is not None:
                # Make the thread index visible inside the nested SDFG
                nsdfg_node.symbol_mapping['__tid'] = __tid
                if '__tid' not in nsdfg.symbols:
                    nsdfg.add_symbol('__tid', dtypes.int32)
            # Shrink the end of the last map dimension by __tid and multiply
            # its step by the warp size
            nmap.range[-1] = (nmap.range[-1][0], nmap.range[-1][1] - __tid,
                              nmap.range[-1][2] * self.warp_size)
            subgraph = nstate.scope_subgraph(nmap)
            subgraph.replace(nmap.params[-1], f'{nmap.params[-1]} + __tid')
            inner_map_exit = nstate.exit_node(nmap)
            # If requested, replicate maps with multiple dependent maps
            if self.replicate_maps:
                destinations = [
                    nstate.memlet_path(edge)[-1].dst
                    for edge in nstate.out_edges(inner_map_exit)
                ]

                for dst in destinations:
                    # Transformation will not replicate map with more than one
                    # output
                    if len(destinations) != 1:
                        break
                    if not isinstance(dst, nodes.AccessNode):
                        continue  # Not leading to access node
                    if not xfh.contained_in(nstate, dst, new_me):
                        continue  # Memlet path goes out of map
                    if not nsdfg.arrays[dst.data].transient:
                        continue  # Cannot modify non-transients
                    # The first consumer keeps the original map; every further
                    # consumer gets its own replica and its own data copy.
                    for edge in nstate.out_edges(dst)[1:]:
                        rep_subgraph = xfh.replicate_scope(
                            nsdfg, nstate, subgraph)
                        rep_edge = nstate.out_edges(
                            rep_subgraph.sink_nodes()[0])[0]
                        # Add copy of data. The descriptor is looked up in the
                        # SDFG that owns `nstate`, which may be a nested SDFG
                        # rather than the top-level `sdfg`.
                        newdesc = copy.deepcopy(nsdfg.arrays[dst.data])
                        newname = nsdfg.add_datadesc(dst.data,
                                                     newdesc,
                                                     find_new_name=True)
                        newaccess = nstate.add_access(newname)
                        # Redirect edges
                        xfh.redirect_edge(nstate,
                                          rep_edge,
                                          new_dst=newaccess,
                                          new_data=newname)
                        xfh.redirect_edge(nstate,
                                          edge,
                                          new_src=newaccess,
                                          new_data=newname)

            # If has WCR, add warp-collaborative reduction on outputs
            for out_edge in nstate.out_edges(inner_map_exit):
                dst = nstate.memlet_path(out_edge)[-1].dst
                if not xfh.contained_in(nstate, dst, new_me):
                    # Skip edges going out of map
                    continue
                if dst.desc(nsdfg).storage == dtypes.StorageType.GPU_Global:
                    # Skip destinations stored in GPU global memory
                    continue
                if out_edge.data.wcr is not None:
                    ctype = nsdfg.arrays[out_edge.data.data].dtype.ctype
                    redtype = detect_reduction_type(out_edge.data.wcr)
                    if redtype == dtypes.ReductionType.Custom:
                        raise NotImplementedError
                    # Use the part of the enum's string form after the first
                    # '.' as the C++ reduction type name
                    credtype = ('dace::ReductionType::' +
                                str(redtype)[str(redtype).find('.') + 1:])

                    # One element: tasklet
                    if out_edge.data.subset.num_elements() == 1:
                        # Add local access between thread-local and warp reduction
                        name = nsdfg._find_new_name(out_edge.data.data)
                        nsdfg.add_scalar(
                            name,
                            nsdfg.arrays[out_edge.data.data].dtype,
                            transient=True)

                        # Initialize thread-local to global value
                        read = nstate.add_read(out_edge.data.data)
                        write = nstate.add_write(name)
                        edge = nstate.add_nedge(read, write,
                                                copy.deepcopy(out_edge.data))
                        edge.data.wcr = None
                        xfh.state_fission(nsdfg,
                                          SubgraphView(nstate, [read, write]))

                        # Route the map output through the new local scalar
                        newnode = nstate.add_access(name)
                        nstate.remove_edge(out_edge)
                        edge = nstate.add_edge(out_edge.src, out_edge.src_conn,
                                               newnode, None,
                                               copy.deepcopy(out_edge.data))
                        for e in nstate.memlet_path(edge):
                            e.data.data = name
                            e.data.subset = subsets.Range([(0, 0, 1)])

                        # Reduce the local scalar and write the result to the
                        # original destination without conflict resolution
                        wrt = nstate.add_tasklet(
                            'warpreduce', {'__a'}, {'__out'},
                            f'__out = dace::warpReduce<{credtype}, {ctype}>::reduce(__a);',
                            dtypes.Language.CPP)
                        nstate.add_edge(newnode, None, wrt, '__a',
                                        Memlet(name))
                        out_edge.data.wcr = None
                        nstate.add_edge(wrt, '__out', out_edge.dst, None,
                                        out_edge.data)
                    else:  # More than one element: mapped tasklet
                        # Could be a parallel summation
                        # TODO(later): Check if reduction
                        continue
            # End of WCR to warp reduction

        # Make nested SDFG out of new scope
        xfh.nest_state_subgraph(sdfg, graph,
                                graph.scope_subgraph(new_me, False, False))

        return new_me
Exemple #15
0
class StreamingMemory(xf.SingleStateTransformation):
    """
    Converts a read or a write to streaming memory access, where data is
    read/written to/from a stream in a separate connected component than the
    computation.

    If ``use_memory_buffering`` is True, the transformation reads/writes data
    from/to memory using a wider data format (e.g. 512 bits) and converts it
    on the fly to the data type used by the computation.
    """
    # Pattern nodes used by ``expressions()``: the access node to be streamed
    # and the scope entry (read pattern) or scope exit (write pattern) that it
    # is directly connected to.
    access = xf.PatternNode(nodes.AccessNode)
    entry = xf.PatternNode(nodes.EntryNode)
    exit = xf.PatternNode(nodes.ExitNode)

    # Buffer size passed to every stream created by ``apply``.
    buffer_size = properties.Property(
        dtype=int,
        default=1,
        desc='Set buffer size for the newly-created stream')

    # Storage type passed to every stream created by ``apply``.
    storage = properties.EnumProperty(
        dtype=dtypes.StorageType,
        desc='Set storage type for the newly-created stream',
        default=dtypes.StorageType.Default)

    # Enables the extra checks in ``can_be_applied`` and the gearbox /
    # vectorized-array rewriting in ``apply``.
    use_memory_buffering = properties.Property(
        dtype=bool,
        default=False,
        desc='Set if memory buffering should be used.')

    # Width of one memory access in bytes; divided by the element size of the
    # array's dtype to obtain the vector length used when buffering.
    memory_buffering_target_bytes = properties.Property(
        dtype=int,
        default=64,
        desc=
        'Set bytes read/written from memory if memory buffering is enabled.')

    @classmethod
    def expressions(cls) -> List[gr.SubgraphView]:
        """
        Return the two subgraph patterns this transformation can match.

        Index 0 is the path ``access -> entry`` and index 1 is the path
        ``exit -> access``; the index of the matched pattern is what the
        other methods receive as ``expr_index``.
        """
        access_to_entry = sdutil.node_path_graph(cls.access, cls.entry)
        exit_to_access = sdutil.node_path_graph(cls.exit, cls.access)
        return [access_to_entry, exit_to_access]

    def can_be_applied(self,
                       graph: SDFGState,
                       expr_index: int,
                       sdfg: SDFG,
                       permissive: bool = False) -> bool:
        """
        Check whether the matched access node can be turned into a stream.

        :param graph: State in which the pattern was matched.
        :param expr_index: Index of the matched pattern in ``expressions()``:
                           0 for ``access -> entry``, 1 for
                           ``exit -> access``.
        :param sdfg: SDFG whose ``arrays`` contain the matched data.
        :param permissive: Not consulted by this method.
        :return: True if all checks below pass, False as soon as one fails.
        """
        access = self.access

        # Make sure the access node is only accessed once (read or write),
        # and not at the same time
        if graph.out_degree(access) > 0 and graph.in_degree(access) > 0:
            return False

        desc = sdfg.arrays[access.data]

        # If already a stream, skip
        if isinstance(desc, data.Stream):
            return False
        # If does not exist on off-chip memory, skip
        if desc.storage not in [
                dtypes.StorageType.CPU_Heap, dtypes.StorageType.CPU_Pinned,
                dtypes.StorageType.GPU_Global, dtypes.StorageType.FPGA_Global
        ]:
            return False

        # Only free nodes are allowed (search up the SDFG tree)
        curstate = graph
        node = access
        while curstate is not None:
            if curstate.entry_node(node) is not None:
                return False
            if curstate.parent.parent_nsdfg_node is None:
                break
            node = curstate.parent.parent_nsdfg_node
            curstate = curstate.parent.parent

        # Only one memlet path is allowed per outgoing/incoming edge
        edges = (graph.out_edges(access)
                 if expr_index == 0 else graph.in_edges(access))
        for edge in edges:
            mpath = graph.memlet_path(edge)
            if len(mpath) != len(list(graph.memlet_tree(edge))):
                return False

            # The innermost end of the path must have a clearly defined memory
            # access pattern
            innermost_edge = mpath[-1] if expr_index == 0 else mpath[0]
            if (innermost_edge.data.subset.num_elements() != 1
                    or innermost_edge.data.dynamic
                    or innermost_edge.data.volume != 1):
                return False

            # Check if any of the maps has a dynamic range
            # These cases can potentially work but some nodes (and perhaps
            # tasklets) need to be replicated, which are difficult to track.
            for pe in mpath:
                scope = pe.dst if expr_index == 0 else graph.entry_node(pe.src)
                if (isinstance(scope, nodes.MapEntry)
                        and sdutil.has_dynamic_map_inputs(graph, scope)):
                    return False

        # If already applied on this memlet and this is the I/O component, skip
        # (maps created by ``apply`` are labeled with a '__s' prefix)
        if expr_index == 0:
            other_node = self.entry
        else:
            other_node = graph.entry_node(self.exit)
        if other_node.label.startswith('__s'):
            return False

        # Everything below only concerns memory buffering
        if not self.use_memory_buffering:
            return True

        # Array has to be global array
        if desc.storage != dtypes.StorageType.FPGA_Global:
            return False

        target_bytes = self.memory_buffering_target_bytes
        element_bytes = desc.dtype.bytes

        # Type has to divide target bytes
        if target_bytes % element_bytes != 0:
            return False

        # Target bytes has to be >= size of data type
        if target_bytes < element_bytes:
            return False

        strides = list(desc.strides)

        # Last stride has to be one
        if strides[-1] != 1:
            return False

        # Exact: divisibility was verified above
        vector_size = target_bytes // element_bytes

        # Other strides have to be divisible by vector size (symbolic strides
        # cannot be checked here and are let through)
        for stride in strides[:-1]:
            if is_int(stride) and stride % vector_size != 0:
                return False

        # Check if map has the right access pattern
        # Stride 1 access by innermost loop, innermost loop counter has to be
        # divisible by vector size.
        # The edges are grouped by map ranges in the same way ``apply`` does;
        # the grouping only determines the order in which they are visited.
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        for edge in edges:
            path_ranges = _collect_map_ranges(graph, graph.memlet_path(edge))
            mapping[tuple(r[1] for r in path_ranges)].append(edge)

        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge. Nothing is modified
                # here, so the edge is inspected in place (no copy needed).
                mpath = graph.memlet_path(edge)
                innermost_edge = mpath[-1] if expr_index == 0 else mpath[0]

                # Start index of every dimension of the innermost access
                edge_subset = [
                    dim_range[0] for dim_range in innermost_edge.data.subset
                ]

                # Scope node adjacent to the innermost edge
                scope_node = (innermost_edge.src
                              if expr_index == 0 else innermost_edge.dst)
                map_params = scope_node.map.params
                map_ranges = list(scope_node.map.range)

                # Check is correct access pattern
                # Correct ranges in map: the number of iterations of the last
                # map dimension must be a multiple of the vector size (only
                # checkable when the end of the range is an integer) ...
                last_range = map_ranges[-1]
                if (is_int(last_range[1])
                        and (last_range[1] + 1) % vector_size != 0):
                    return False

                # ... and it must advance in steps of one
                if last_range[2] != 1:
                    return False

                # Correct access in array: the last index is either exactly
                # the last map parameter ...
                last_index = edge_subset[-1]
                if (isinstance(last_index, symbol)
                        and str(last_index) == map_params[-1]):
                    continue

                # ... or a sum in which that parameter appears exactly once
                # as a bare term
                if not isinstance(last_index, sympy.core.add.Add):
                    return False

                occurrences = sum(
                    1 for arg in last_index.args
                    if isinstance(arg, symbol) and str(arg) == map_params[-1])
                if occurrences != 1:
                    return False

        return True

    def apply(self, state: SDFGState, sdfg: SDFG) -> List[nodes.AccessNode]:
        """
        Replace the matched array access with stream accesses and build the
        separate read/write components that move data between the array and
        the newly created streams.

        If ``use_memory_buffering`` is set, a gearbox node is additionally
        placed between two streams per edge, the array's dtype is replaced by
        a vector type, and its shape/strides (and the matching memlets in the
        post state, if any) are divided by the vector size.

        :param state: State that contains the matched access node.
        :param sdfg: SDFG that owns ``state``.
        :return: The stream access nodes added to the read/write components.
        """
        dnode: nodes.AccessNode = self.access
        if self.expr_index == 0:
            edges = state.out_edges(dnode)
        else:
            edges = state.in_edges(dnode)

        # Shared text of the warnings emitted for symbolic sizes
        unsafe_msg = (
            "Using the MemoryBuffering transformation is potential unsafe "
            "since {sym} is not an integer. There should be no issue if "
            "{sym} % {vec} == 0")

        # To understand how many components we need to create, all map ranges
        # throughout memlet paths must match exactly. We thus create a
        # dictionary of unique ranges
        mapping: Dict[Tuple[subsets.Range],
                      List[gr.MultiConnectorEdge[mm.Memlet]]] = defaultdict(
                          list)
        ranges = {}
        for edge in edges:
            mpath = state.memlet_path(edge)
            ranges[edge] = _collect_map_ranges(state, mpath)
            mapping[tuple(r[1] for r in ranges[edge])].append(edge)

        # Collect all edges with the same memory access pattern. Each entry is
        # a pair (copy of the innermost edge, original edge at the access node)
        components_to_create: Dict[
            Tuple[symbolic.SymbolicType],
            List[Tuple[gr.MultiConnectorEdge[mm.Memlet],
                       gr.MultiConnectorEdge[mm.Memlet]]]] = defaultdict(list)
        for edges_with_same_range in mapping.values():
            for edge in edges_with_same_range:
                # Get memlet path and innermost edge
                mpath = state.memlet_path(edge)
                innermost_edge = copy.deepcopy(mpath[-1] if self.expr_index ==
                                               0 else mpath[0])

                # Store memlets of the same access in the same component
                expr = _canonicalize_memlet(innermost_edge.data, ranges[edge])
                components_to_create[expr].append((innermost_edge, edge))
        components = list(components_to_create.values())

        # Split out components that have dependencies between them to avoid
        # deadlocks
        if self.expr_index == 0:
            ccs_to_add = []
            for i, component in enumerate(components):
                edges_to_remove = set()
                for cedge in component:
                    if any(
                            nx.has_path(state.nx, o[1].dst, cedge[1].dst)
                            for o in component if o is not cedge):
                        ccs_to_add.append([cedge])
                        edges_to_remove.add(cedge)
                if edges_to_remove:
                    components[i] = [
                        c for c in component if c not in edges_to_remove
                    ]
            components.extend(ccs_to_add)
        # End of split

        desc = sdfg.arrays[dnode.data]

        # Number of array elements per wide memory access. Computed once,
        # before the array's dtype is replaced by the vector type below.
        vector_size = None
        if self.use_memory_buffering:
            vector_size = int(self.memory_buffering_target_bytes /
                              desc.dtype.bytes)

        def shrink_last_range(map_ranges):
            """ Divide the iteration count of the last map dimension in
                ``map_ranges`` (a list of ``(param, (begin, end, step))``)
                by the vector size, in place. """
            param, (begin, end, step) = map_ranges[-1]
            if not is_int(end):
                # NOTE(review): ``.args[1]`` presumably assumes a compound
                # bound such as ``N - 1``; confirm bare symbols cannot occur.
                warnings.warn(
                    unsafe_msg.format(sym=end.args[1], vec=vector_size))
            map_ranges[-1] = (param, (begin, (end + 1) / vector_size - 1,
                                      step))

        # Create new streams of shape 1
        streams = {}
        mpaths = {}
        for edge in edges:

            if self.use_memory_buffering:

                # Add gearbox
                total_size = edge.data.volume

                if not is_int(desc.shape[-1]):
                    warnings.warn(
                        unsafe_msg.format(sym=desc.shape[-1],
                                          vec=vector_size))

                for stride in desc.strides:
                    if not is_int(stride):
                        warnings.warn(
                            unsafe_msg.format(sym=stride, vec=vector_size))

                if self.expr_index == 0:  # Read
                    gearbox_input_type = dtypes.vector(desc.dtype, vector_size)
                    gearbox_output_type = desc.dtype
                    gearbox_read_volume = total_size / vector_size
                    gearbox_write_volume = total_size
                else:  # Write
                    gearbox_input_type = desc.dtype
                    gearbox_output_type = dtypes.vector(
                        desc.dtype, vector_size)
                    gearbox_read_volume = total_size
                    gearbox_write_volume = total_size / vector_size

                input_gearbox_name, input_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_input",
                    gearbox_input_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                output_gearbox_name, output_gearbox_newdesc = sdfg.add_stream(
                    "gearbox_output",
                    gearbox_output_type,
                    buffer_size=self.buffer_size,
                    storage=self.storage,
                    transient=True,
                    find_new_name=True)

                read_to_gearbox = state.add_read(input_gearbox_name)
                write_from_gearbox = state.add_write(output_gearbox_name)

                gearbox = Gearbox(total_size / vector_size)

                state.add_node(gearbox)

                state.add_memlet_path(read_to_gearbox,
                                      gearbox,
                                      dst_conn="from_memory",
                                      memlet=Memlet(
                                          input_gearbox_name + "[0]",
                                          volume=gearbox_read_volume))
                state.add_memlet_path(gearbox,
                                      write_from_gearbox,
                                      src_conn="to_kernel",
                                      memlet=Memlet(
                                          output_gearbox_name + "[0]",
                                          volume=gearbox_write_volume))

                # ``streams[edge]`` is the stream on the memory side of the
                # gearbox; ``name`` is the one on the computation side.
                if self.expr_index == 0:
                    streams[edge] = input_gearbox_name
                    name = output_gearbox_name
                    newdesc = output_gearbox_newdesc
                else:
                    streams[edge] = output_gearbox_name
                    name = input_gearbox_name
                    newdesc = input_gearbox_newdesc

            else:
                # Qualify name to avoid name clashes if memory interfaces are not decoupled for Xilinx
                stream_name = "stream_" + dnode.data
                name, newdesc = sdfg.add_stream(stream_name,
                                                desc.dtype,
                                                buffer_size=self.buffer_size,
                                                storage=self.storage,
                                                transient=True,
                                                find_new_name=True)
                streams[edge] = name

                # Add these such that we can easily use output_gearbox_name and input_gearbox_name without using if statements
                output_gearbox_name = name
                input_gearbox_name = name

            mpath = state.memlet_path(edge)
            mpaths[edge] = mpath

            # Replace memlets in path with stream access
            for e in mpath:
                e.data = mm.Memlet(data=name,
                                   subset='0',
                                   other_subset=e.data.other_subset)
                if isinstance(e.src, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.src, e.src_conn, newdesc)
                if isinstance(e.dst, nodes.NestedSDFG):
                    e.data.dynamic = True
                    _streamify_recursive(e.dst, e.dst_conn, newdesc)

            # Replace access node and memlet tree with one access
            if self.expr_index == 0:
                replacement = state.add_read(output_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(replacement, edge.src_conn, edge.dst,
                               edge.dst_conn, edge.data)
            else:
                replacement = state.add_write(input_gearbox_name)
                state.remove_edge(edge)
                state.add_edge(edge.src, edge.src_conn, replacement,
                               edge.dst_conn, edge.data)

        if self.use_memory_buffering:

            arrname = str(self.access)
            array_desc = sdfg.arrays[arrname]

            # Vectorize access to global array.
            array_desc.dtype = dtypes.vector(array_desc.dtype, vector_size)
            new_shape = list(array_desc.shape)
            contigidx = array_desc.strides.index(1)
            new_shape[contigidx] /= vector_size
            try:
                new_shape[contigidx] = int(new_shape[contigidx])
            except TypeError:
                # Size could not be converted to int; keep the divided value
                pass
            array_desc.shape = new_shape

            # Change strides: every stride but the last is divided by the
            # vector size (can_be_applied requires the last stride to be 1)
            old_strides: List = list(array_desc.strides)
            array_desc.strides = ([s / vector_size
                                   for s in old_strides[:-1]] +
                                  old_strides[-1:])

            post_state = get_post_state(sdfg, state)

            if post_state is not None:
                # Change subset in the post state such that the correct amount of memory is copied back from the device
                for e in post_state.edges():
                    if e.data.data == self.access.data:
                        new_subset = list(e.data.subset)
                        i, j, k = new_subset[-1]
                        new_subset[-1] = (i, (j + 1) / vector_size - 1, k)
                        e.data = mm.Memlet(data=str(e.src),
                                           subset=subsets.Range(new_subset))

        # Make read/write components
        ionodes = []
        for component in components:

            # Pick the first edge as the edge to make the component from
            innermost_edge, outermost_edge = component[0]
            mpath = mpaths[outermost_edge]
            mapname = streams[outermost_edge]
            innermost_edge.data.other_subset = None

            # Get edge data and streams
            if self.expr_index == 0:
                opname = 'read'
                path = [e.dst for e in mpath[:-1]]
                rmemlets = [(dnode, '__inp', innermost_edge.data)]
                wmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_write(name)
                    ionodes.append(ionode)
                    wmemlets.append(
                        (ionode, '__out%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '\n'.join('__out%d = __inp' % i
                                 for i in range(len(component)))
            else:
                # More than one input stream might mean a data race, so we only
                # address the first one in the tasklet code
                if len(component) > 1:
                    warnings.warn(
                        f'More than one input found for the same index for {dnode.data}'
                    )
                opname = 'write'
                path = [state.entry_node(e.src) for e in reversed(mpath[1:])]
                wmemlets = [(dnode, '__out', innermost_edge.data)]
                rmemlets = []
                for i, (_, edge) in enumerate(component):
                    name = streams[edge]
                    ionode = state.add_read(name)
                    ionodes.append(ionode)
                    rmemlets.append(
                        (ionode, '__inp%d' % i, mm.Memlet(data=name,
                                                          subset='0')))
                code = '__out = __inp0'

            # Create map structure for read/write component
            maps = []
            for entry in path:
                scope_map: nodes.Map = entry.map

                map_ranges = [(p, (r[0], r[1], r[2]))
                              for p, r in zip(scope_map.params,
                                              scope_map.range)]

                # Change ranges of map
                if self.use_memory_buffering:
                    # Start index of every dimension of the innermost access
                    edge_subset = [
                        a_tuple[0]
                        for a_tuple in list(innermost_edge.data.subset)
                    ]
                    last_index = edge_subset[-1]

                    # Shrink the last map dimension if the last array index
                    # is this map's last parameter, either directly ...
                    if isinstance(last_index, symbol) and str(
                            last_index) == scope_map.params[-1]:
                        shrink_last_range(map_ranges)

                    # ... or as a bare term of a sum
                    elif isinstance(last_index, sympy.core.add.Add):
                        for arg in last_index.args:
                            if isinstance(arg, symbol) and str(
                                    arg) == scope_map.params[-1]:
                                shrink_last_range(map_ranges)

                # The '__s' label prefix is what can_be_applied looks for to
                # avoid re-applying the transformation on the I/O component
                maps.append(
                    state.add_map(f'__s{opname}_{mapname}', map_ranges,
                                  scope_map.schedule))
            tasklet = state.add_tasklet(
                f'{opname}_{mapname}',
                {m[1]
                 for m in rmemlets},
                {m[1]
                 for m in wmemlets},
                code,
            )
            for node, cname, memlet in rmemlets:
                state.add_memlet_path(node,
                                      *(me for me, _ in maps),
                                      tasklet,
                                      dst_conn=cname,
                                      memlet=memlet)
            for node, cname, memlet in wmemlets:
                state.add_memlet_path(tasklet,
                                      *(mx for _, mx in reversed(maps)),
                                      node,
                                      src_conn=cname,
                                      memlet=memlet)

        return ionodes