def _make_sdfg(name, storage=dace.dtypes.StorageType.CPU_Heap):

    N = dace.symbol('N', dtype=dace.int32, integer=True, positive=True)
    i = dace.symbol('i', dtype=dace.int32, integer=True)

    sdfg = dace.SDFG(name)
    _, A = sdfg.add_array('A', [N, N, N], dtype=dace.float64)
    _, B = sdfg.add_array('B', [N], dtype=dace.float64)
    _, tmp1 = sdfg.add_transient('tmp1', [N - 4, N - 4, N - i],
                                 dtype=dace.float64,
                                 storage=storage)
    _, tmp2 = sdfg.add_transient('tmp2', [1],
                                 dtype=dace.float64,
                                 storage=storage)

    begin_state = sdfg.add_state("begin", is_start_state=True)
    guard_state = sdfg.add_state("guard")
    body1_state = sdfg.add_state("body1")
    body2_state = sdfg.add_state("body2")
    body3_state = sdfg.add_state("body3")
    end_state = sdfg.add_state("end")

    sdfg.add_edge(begin_state, guard_state,
                  dace.InterstateEdge(assignments=dict(i='0')))
    sdfg.add_edge(guard_state, body1_state,
                  dace.InterstateEdge(condition=f'i<{N}'))
    sdfg.add_edge(guard_state, end_state,
                  dace.InterstateEdge(condition=f'i>={N}'))
    sdfg.add_edge(body1_state, body2_state, dace.InterstateEdge())
    sdfg.add_edge(body2_state, body3_state, dace.InterstateEdge())
    sdfg.add_edge(body3_state, guard_state,
                  dace.InterstateEdge(assignments=dict(i='i+1')))

    read_a = body1_state.add_read('A')
    write_tmp1 = body1_state.add_write('tmp1')
    body1_state.add_nedge(read_a, write_tmp1,
                          dace.Memlet(f'A[2:{N}-2, 2:{N}-2, i:{N}]'))

    read_tmp1 = body2_state.add_read('tmp1')
    rednode = standard.Reduce(wcr='lambda a, b : a + b', identity=0)
    if storage == dace.dtypes.StorageType.GPU_Global:
        rednode.implementation = 'CUDA (device)'
    elif storage == dace.dtypes.StorageType.FPGA_Global:
        rednode.implementation = 'FPGAPartialReduction'
    body2_state.add_node(rednode)
    write_tmp2 = body2_state.add_write('tmp2')
    body2_state.add_nedge(read_tmp1, rednode,
                          dace.Memlet.from_array('tmp1', tmp1))
    body2_state.add_nedge(rednode, write_tmp2, dace.Memlet('tmp2[0]'))

    read_tmp2 = body3_state.add_read('tmp2')
    write_b = body3_state.add_write('B')
    body3_state.add_nedge(read_tmp2, write_b, dace.Memlet('B[i]'))

    return sdfg
Example #2
0
class ReduceExpansion(transformation.Transformation):
    """ Implements the ReduceExpansion transformation.
        Expands a Reduce node into inner and outer map components,
        where the outer map consists of the axes not being reduced.
        A new reduce node is created inside the inner map.
        Special cases where e.g reduction identities are not defined
        and arrays being reduced to already exist are handled
        on the fly.
    """

    _reduce = stdlib.Reduce()

    debug = Property(desc="Debug Info", dtype=bool, default=False)

    create_in_transient = Property(desc="Create local in-transient"
                                   "in registers",
                                   dtype=bool,
                                   default=False)

    create_out_transient = Property(desc="Create local out-transient"
                                    "in registers",
                                    dtype=bool,
                                    default=False)

    reduce_implementation = Property(
        desc="Reduce implementation of inner reduce. If specified,"
        "overrides any existing implementations",
        dtype=str,
        default=None,
        choices=[
            'pure', 'OpenMP', 'CUDA (device)', 'CUDA (block)',
            'CUDA (block allreduce)', 'CUDA (warp)', 'CUDA (warp allreduce)'
        ],
        allow_none=True)

    reduction_type_update = {
        dtypes.ReductionType.Max: 'out = max(reduction_in, array_in)',
        dtypes.ReductionType.Min: 'out = min(reduction_in, array_in)',
        dtypes.ReductionType.Sum: 'out = reduction_in + array_in',
        dtypes.ReductionType.Product: 'out = reduction_in * array_in',
        dtypes.ReductionType.Bitwise_And: 'out = reduction_in & array_in',
        dtypes.ReductionType.Bitwise_Or: 'out = reduction_in | array_in',
        dtypes.ReductionType.Bitwise_Xor: 'out = reduction_in ^ array_in',
        dtypes.ReductionType.Logical_And: 'out = reduction_in and array_in',
        dtypes.ReductionType.Logical_Or: 'out = reduction_in or array_in',
        dtypes.ReductionType.Logical_Xor: 'out = reduction_in != array_in'
    }

    reduction_type_identity = {
        dtypes.ReductionType.Sum: 0,
        dtypes.ReductionType.Product: 1,
        dtypes.ReductionType.Bitwise_Or: 0,
        dtypes.ReductionType.Logical_And: True,
        dtypes.ReductionType.Logical_Or: False
    }

    @staticmethod
    def expressions():
        return [utils.node_path_graph(ReduceExpansion._reduce)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        reduce_node = graph.nodes()[candidate[ReduceExpansion._reduce]]
        inedge = graph.in_edges(reduce_node)[0]
        input_dims = inedge.data.subset.data_dims()
        axes = reduce_node.axes
        if axes is None:
            # axes = None -> full reduction, can't expand
            return False
        if len(axes) == input_dims:
            # axes = all  -> full reduction, can't expand
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        reduce = candidate[ReduceExpansion._reduce]
        return str(reduce)

    def apply(self, sdfg: SDFG, strict=False):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.
        """

        graph = sdfg.nodes()[self.state_id]
        reduce_node = graph.nodes()[self.subgraph[ReduceExpansion._reduce]]
        self.expand(sdfg, graph, reduce_node)

    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        out_storage_node = graph.out_edges(reduce_node)[0].dst
        in_storage_node = graph.in_edges(reduce_node)[0].src
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        schedule = reduce_node.schedule
        implementation = reduce_node.implementation
        if implementation and 'warp' in implementation:
            raise NotImplementedError(
                "WIP: Warp Reductions are not Implemented yet.")

        # remove the reduce identity
        # we will reassign it later after expanding
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which
        # we perform the reduction:
        # do BFS, best complexity O(V+E)

        queue = [nsdfg]
        array_closest_ancestor = None
        while len(queue) > 0:
            current = queue.pop(0)
            if isinstance(current, nodes.AccessNode):
                if current.data == out_storage_node.data:
                    # it suffices to find the first node
                    # no matter what access (ReadWrite or Read)
                    array_closest_ancestor = current
                    break
            queue.extend([in_edge.src for in_edge in graph.in_edges(current)])

        # if ancestor doesn't exist:
        #           if non-transient: create data node accessing it
        #           if transient: ancestor_node = none, set_zero on outer node

        shortcut = False
        if (not array_closest_ancestor and sdfg.data(out_storage_node.data).transient) \
                                        or identity is not None:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map")
            # we are lucky
            shortcut = True
            nstate.out_edges(outer_exit)[0].data.wcr = None

        else:
            if self.debug:
                print("ReduceExpansion::Expanding Reduction into Map "
                      "and introducing update Tasklet, "
                      "connecting with ancestor.")
            if not array_closest_ancestor:
                array_closest_ancestor = nodes.AccessNode(
                    out_storage_node.data, access=dtypes.AccessType.ReadOnly)
                graph.add_node(array_closest_ancestor)
                # array_closest_ancestor now points to the node we want to connect
                # to the map entry

            # always have to create out transient in this case
            self.create_out_transient = True

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register
            if shortcut:
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

            if shortcut:
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to shared memory / default
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # first, inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)
        if not shortcut:
            reduction_type = detect_reduction_type(wcr)
            try:
                code = ReduceExpansion.reduction_type_update[reduction_type]
            except KeyError:
                raise NotImplementedError(
                    "Not yet implemented for custom reduction")

            new_tasklet = graph.add_tasklet(
                name="reduction_transient_update",
                inputs={"reduction_in", "array_in"},
                outputs={"out"},
                code=code)

            edge_to_remove = graph.out_edges(out_transient_node_inner)[0] \
                             if self.create_out_transient \
                             else graph.out_edges(inner_exit)[0]

            new_memlet_array_inner = Memlet(data=out_storage_node.data,
                                            volume=1,
                                            subset=edge_to_remove.data.subset)
            new_memlet_array_outer = Memlet(
                data=array_closest_ancestor.data,
                volume=graph.in_edges(outer_entry)[0].data.volume,
                subset=subsets.Range.from_array(
                    sdfg.data(out_storage_node.data)))

            new_memlet_reduction = Memlet(
                data=graph.out_edges(inner_exit)[0].data.data,
                volume=1,
                subset=graph.out_edges(inner_exit)[0].data.subset)
            new_memlet_out_inner = Memlet(data=edge_to_remove.data.data,
                                          volume=1,
                                          subset=edge_to_remove.data.subset)
            new_memlet_out_outer = dcpy(new_memlet_array_outer)

            # remove old edges

            outer_edge_to_remove = None
            for edge in graph.out_edges(outer_exit):
                if edge.src == edge_to_remove.dst:
                    outer_edge_to_remove = edge

            graph.remove_edge_and_connectors(edge_to_remove)
            graph.remove_edge_and_connectors(outer_edge_to_remove)


            graph.add_edge(out_transient_node_inner if self.create_out_transient \
                                                    else inner_exit,
                           None,
                           new_tasklet,
                           "reduction_in",
                           new_memlet_reduction)

            graph.add_edge(outer_entry, None, new_tasklet, "array_in",
                           new_memlet_array_inner)
            graph.add_edge(array_closest_ancestor, None, outer_entry, None,
                           new_memlet_array_outer)
            graph.add_edge(new_tasklet, "out", outer_exit, None,
                           new_memlet_out_inner)
            graph.add_edge(outer_exit, None, out_storage_node, None,
                           new_memlet_out_outer)

            # fill map scope connectors
            graph.fill_scope_connectors()
            graph._clear_scopedict_cache()
            # wcr is already removed

        # FORNOW: choose default schedule and implementation
        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)
        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._new_reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deducted from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return

    def _expand_reduce(self, sdfg, state, node):
        # expands a reduce into two nested maps
        # taken from legacy expand_reduce.py

        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)

        if node.identity is not None:
            raise ValueError("Node identity has to be None at this point.")
        else:
            nstate = nsdfg.add_state()
        # END OF INIT

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append('_i%d' % ictr)
                    ictr += 1
                else:
                    input_subset.append('_o%d' % octr)
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    '_o%d' % i: '0:%s' % symstr(sz)
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm = Memlet.simple('_out',
                                 ','.join(
                                     ['_o%d' % i for i in range(output_dims)]),
                                 wcr_str=node.wcr)
            inmm = Memlet.simple('_in', ','.join(input_subset))
        else:
            ome, omx = None, None
            outm = Memlet.simple('_out', '0', wcr_str=node.wcr)
            inmm = Memlet.simple(
                '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

        # Add inner map, which corresponds to the range to reduce, containing
        # an identity tasklet
        ime, imx = nstate.add_map(
            'reduce_values', {
                '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
                for i, axis in enumerate(sorted(axes))
            })

        # Add identity tasklet for reduction
        t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

        # Connect everything
        r = nstate.add_read('_in')
        w = nstate.add_read('_out')

        if ome:
            nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
        else:
            nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        nsdfg = state.add_nested_sdfg(nsdfg,
                                      sdfg,
                                      node.in_connectors,
                                      node.out_connectors,
                                      schedule=node.schedule,
                                      name=node.name)

        utils.change_edge_dest(state, node, nsdfg)
        utils.change_edge_src(state, node, nsdfg)
        state.remove_node(node)

        return nsdfg
Example #3
0
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.
    """

    no_init = Property(dtype=bool,
                       default=False,
                       desc='If enabled, does not create initialization states '
                       'for reduce nodes with identity')

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce()

    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(MapReduceFusion._tasklet,
                                   MapReduceFusion._tmap_exit,
                                   MapReduceFusion._in_array,
                                   MapReduceFusion._reduce,
                                   MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]]
        tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]]

        # Make sure that the array is only accessed by the map and the reduce
        if any([
                src != tmap_exit
                for src, _, _, _, memlet in graph.in_edges(in_array)
        ]):
            return False
        if any([
                dest != reduce_node
                for _, _, dest, _, memlet in graph.out_edges(in_array)
        ]):
            return False

        tmem = next(e for e in graph.edges_between(tasklet, tmap_exit)
                    if e.data.data == in_array.data).data

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict and (len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
        ]) > 1 or in_array.data in sdfg.shared_transients()):
            return False

        # If memlet already has WCR and it is different from reduce node,
        # do not match
        if tmem.wcr is not None and tmem.wcr != reduce_node.wcr:
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        reduce = candidate[MapReduceFusion._reduce]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    def apply(self, sdfg: SDFG):
        graph = sdfg.nodes()[self.state_id]
        tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
        out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]]

        # Set nodes to remove according to the expression index
        nodes_to_remove = [in_array]
        nodes_to_remove.append(reduce_node)

        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        # Find which indices should be removed from new memlet
        input_edge = graph.in_edges(reduce_node)[0]
        axes = reduce_node.axes or list(range(len(input_edge.data.subset)))
        array_edge = graph.out_edges(reduce_node)[0]

        # Delete relevant edges and nodes
        graph.remove_nodes_from(nodes_to_remove)

        # Filter out reduced dimensions from subset
        filtered_subset = [
            dim for i, dim in enumerate(memlet_edge.data.subset)
            if i not in axes
        ]
        if len(filtered_subset) == 0:  # Output is a scalar
            filtered_subset = [(0, 0, 1)]

        # Modify edge from tasklet to map exit
        memlet_edge.data.data = out_array.data
        memlet_edge.data.wcr = reduce_node.wcr
        memlet_edge.data.subset = type(memlet_edge.data.subset)(filtered_subset)

        # Add edge from map exit to output array
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:], array_edge.dst,
            array_edge.dst_conn,
            Memlet.simple(array_edge.data.data,
                          array_edge.data.subset,
                          num_accesses=array_edge.data.num_accesses,
                          wcr_str=reduce_node.wcr))

        # Add initialization state as necessary
        if reduce_node.identity is not None:
            init_state = sdfg.add_state_before(graph)
            init_state.add_mapped_tasklet(
                'freduce_init',
                [('o%d' % i, '%s:%s:%s' % (r[0], r[1] + 1, r[2]))
                 for i, r in enumerate(array_edge.data.subset)], {},
                'out = %s' % reduce_node.identity, {
                    'out':
                    Memlet.simple(
                        array_edge.data.data, ','.join([
                            'o%d' % i
                            for i in range(len(array_edge.data.subset))
                        ]))
                },
                external_edges=True)
Example #4
0
class ReduceExpansion(transformation.Transformation):
    """ Implements the ReduceExpansion transformation.
        Expands a Reduce node into inner and outer map components,
        where the outer map consists of the axes not being reduced.
        A new reduce node is created inside the inner map.
        Special cases where e.g reduction identities are not defined
        and arrays being reduced to already exist are handled
        on the fly.
    """

    _reduce = stdlib.Reduce()

    debug = Property(desc="Debug Info", dtype=bool, default=False)

    create_in_transient = Property(desc="Create local in-transient"
                                   "in registers",
                                   dtype=bool,
                                   default=False)

    create_out_transient = Property(desc="Create local out-transient"
                                    "in registers",
                                    dtype=bool,
                                    default=False)

    reduce_implementation = Property(
        desc="Reduce implementation of inner reduce. If specified,"
        "overrides any existing implementations",
        dtype=str,
        default=None,
        choices=[
            'pure', 'OpenMP', 'CUDA (device)', 'CUDA (block)',
            'CUDA (block allreduce)', 'CUDA (warp)', 'CUDA (warp allreduce)'
        ],
        allow_none=True)

    reduction_type_update = {
        dtypes.ReductionType.Max: 'out = max(reduction_in, array_in)',
        dtypes.ReductionType.Min: 'out = min(reduction_in, array_in)',
        dtypes.ReductionType.Sum: 'out = reduction_in + array_in',
        dtypes.ReductionType.Product: 'out = reduction_in * array_in',
        dtypes.ReductionType.Bitwise_And: 'out = reduction_in & array_in',
        dtypes.ReductionType.Bitwise_Or: 'out = reduction_in | array_in',
        dtypes.ReductionType.Bitwise_Xor: 'out = reduction_in ^ array_in',
        dtypes.ReductionType.Logical_And: 'out = reduction_in and array_in',
        dtypes.ReductionType.Logical_Or: 'out = reduction_in or array_in',
        dtypes.ReductionType.Logical_Xor: 'out = reduction_in != array_in'
    }

    reduction_type_identity = {
        dtypes.ReductionType.Sum: 0,
        dtypes.ReductionType.Product: 1,
        dtypes.ReductionType.Bitwise_Or: 0,
        dtypes.ReductionType.Logical_And: True,
        dtypes.ReductionType.Logical_Or: False
    }

    @staticmethod
    def expressions():
        return [utils.node_path_graph(ReduceExpansion._reduce)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        reduce_node = graph.nodes()[candidate[ReduceExpansion._reduce]]
        inedge = graph.in_edges(reduce_node)[0]
        input_dims = inedge.data.subset.dims()
        axes = reduce_node.axes
        # we cannot expand upon full reduction
        if axes is None:
            return False

        # we cannot expand if there are not outer dimensions
        for (i, sz) in enumerate(inedge.data.subset.size()):
            if i not in reduce_node.axes and sz != 1:
                break
        else:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        reduce = candidate[ReduceExpansion._reduce]
        return str(reduce)

    def apply(self, sdfg: SDFG):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.
        """

        graph = sdfg.nodes()[self.state_id]
        reduce_node = graph.nodes()[self.subgraph[ReduceExpansion._reduce]]
        self.expand(sdfg, graph, reduce_node)

    def expand(self, sdfg, graph, reduce_node):
        """ Splits the data dimension into an inner and outer dimension,
            where the inner dimension are the reduction axes and the
            outer axes the complement. Pushes the reduce inside a new
            map consisting of the complement axes.

        """

        # get out storage node, might be hidden behind view node
        out_data = graph.out_edges(reduce_node)[0].data
        out_storage_node = reduce_node
        while not isinstance(out_storage_node, nodes.AccessNode):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst

        if isinstance(sdfg.data(out_storage_node.data), View):
            out_storage_node = graph.out_edges(out_storage_node)[0].dst
            while not isinstance(out_storage_node, nodes.AccessNode):
                out_storage_node = graph.out_edges(out_storage_node)[0].dst

        # get other useful quantities from the original reduce node
        wcr = reduce_node.wcr
        identity = reduce_node.identity
        implementation = reduce_node.implementation

        # remove the reduce identity, will get reassigned after expansion
        reduce_node.identity = None
        # expand the reduce node
        in_edge = graph.in_edges(reduce_node)[0]
        nsdfg = self._expand_reduce(sdfg, graph, reduce_node)
        # find the new nodes in the nested sdfg created
        nstate = nsdfg.sdfg.nodes()[0]
        for node, scope in nstate.scope_dict().items():
            if isinstance(node, nodes.MapEntry):
                if scope is None:
                    outer_entry = node
                else:
                    inner_entry = node
            if isinstance(node, nodes.Tasklet):
                tasklet_node = node

        inner_exit = nstate.exit_node(inner_entry)
        outer_exit = nstate.exit_node(outer_entry)

        # find earliest parent read-write occurrence of array onto which the reduction is performed: BFS

        if self.create_out_transient:
            queue = [nsdfg]
            enqueued = set()
            array_closest_ancestor = None

            while len(queue) > 0:
                current = queue.pop()
                if isinstance(current, nodes.AccessNode):
                    if current.data == out_storage_node.data:
                        # it suffices to find the first node
                        # no matter what access (ReadWrite or Read)
                        array_closest_ancestor = current
                        break
                for in_edge in graph.in_edges(current):
                    if in_edge.src not in enqueued:
                        queue.append(in_edge.src)
                        enqueued.add(in_edge.src)

            if self.debug and array_closest_ancestor:
                print(
                    f"ReduceExpansion::Closest ancestor={array_closest_ancestor}"
                )
            elif self.debug:
                print("ReduceExpansion::No closest ancestor found")

        if self.create_out_transient:
            # create an out transient between inner and outer map exit
            array_out = nstate.out_edges(outer_exit)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_exit),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_exit)
            }
            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_out
            local_storage.apply(nsdfg.sdfg)
            out_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(out_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

            # remove WCRs from all edges where possible if there is no
            # prior occurrence
            if array_closest_ancestor is None:
                nstate.out_edges(outer_exit)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.wcr = None
                nstate.out_edges(out_transient_node_inner)[0].data.volume = 1
        else:

            # remove WCR from outer exit
            nstate.out_edges(outer_exit)[0].data.wcr = None

        if self.create_in_transient:
            # create an in-transient between inner and outer map entry
            array_in = nstate.in_edges(outer_entry)[0].data.data

            from dace.transformation.dataflow.local_storage import LocalStorage
            local_storage_subgraph = {
                LocalStorage.node_a:
                nsdfg.sdfg.nodes()[0].nodes().index(outer_entry),
                LocalStorage.node_b:
                nsdfg.sdfg.nodes()[0].nodes().index(inner_entry)
            }

            nsdfg_id = nsdfg.sdfg.sdfg_list.index(nsdfg.sdfg)
            nstate_id = 0
            local_storage = LocalStorage(nsdfg_id, nstate_id,
                                         local_storage_subgraph, 0)
            local_storage.array = array_in
            local_storage.apply(nsdfg.sdfg)
            in_transient_node_inner = local_storage._data_node

            # push to register
            nsdfg.sdfg.data(in_transient_node_inner.data
                            ).storage = dtypes.StorageType.Register

        # inline fuse back our nested SDFG
        from dace.transformation.interstate import InlineSDFG
        inline_sdfg = InlineSDFG(
            sdfg.sdfg_list.index(sdfg),
            sdfg.nodes().index(graph),
            {InlineSDFG._nested_sdfg: graph.nodes().index(nsdfg)}, 0)
        inline_sdfg.apply(sdfg)

        new_schedule = dtypes.ScheduleType.Default
        new_implementation = self.reduce_implementation \
                             if self.reduce_implementation is not None \
                             else implementation
        new_axes = dcpy(reduce_node.axes)

        reduce_node_new = graph.add_reduce(wcr=wcr,
                                           axes=new_axes,
                                           schedule=new_schedule,
                                           identity=identity)
        reduce_node_new.implementation = new_implementation
        # replace inner map with new reduction node
        edge_tmp = graph.in_edges(inner_entry)[0]
        memlet_src_reduce = dcpy(edge_tmp.data)
        graph.add_edge(edge_tmp.src, edge_tmp.src_conn, reduce_node_new, None,
                       memlet_src_reduce)

        edge_tmp = graph.out_edges(inner_exit)[0]
        memlet_reduce_dst = Memlet(data=edge_tmp.data.data,
                                   volume=1,
                                   subset=edge_tmp.data.subset)

        graph.add_edge(reduce_node_new, None, edge_tmp.dst, edge_tmp.dst_conn,
                       memlet_reduce_dst)

        identity_tasklet = graph.out_edges(inner_entry)[0].dst
        graph.remove_node(inner_entry)
        graph.remove_node(inner_exit)
        graph.remove_node(identity_tasklet)

        # propagate scope for correct volumes
        scope_tree = ScopeTree(outer_entry, outer_exit)
        scope_tree.parent = ScopeTree(None, None)
        propagate_memlets_scope(sdfg, graph, scope_tree)
        sdfg.validate()

        # create variables for outside access
        self._reduce = reduce_node_new
        self._outer_entry = outer_entry

        if identity is None and self.create_out_transient:
            if self.debug:
                print(
                    "ReduceExpansion::Trying to infer reduction WCR type due to out transient created"
                )
            # set the reduction identity accordingly so that the correct
            # blank result is written to the out_transient node
            # we use default values deducted from the reduction type
            reduction_type = detect_reduction_type(wcr)
            try:
                reduce_node_new.identity = self.reduction_type_identity[
                    reduction_type]
            except KeyError:

                if reduction_type == dtypes.ReductionType.Min:
                    reduce_node_new.identity = dtypes.max_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                elif reduction_type == dtypes.ReductionType.Max:
                    reduce_node_new.identity = dtypes.min_value(
                        sdfg.arrays[out_storage_node.data].dtype)
                else:
                    raise ValueError(f"Cannot infer reduction identity."
                                     "Please specify the identity of node"
                                     "{reduce_node_new}")

        return

    def _expand_reduce(self, sdfg, state, node):
        # expands a reduce into two nested maps
        # taken from legacy expand_reduce.py

        node.validate(sdfg, state)
        inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(inedge.data.subset)
        output_dims = len(outedge.data.subset)
        input_data = sdfg.arrays[inedge.data.data]
        output_data = sdfg.arrays[outedge.data.data]

        # Standardize axes
        axes = node.axes if node.axes else [i for i in range(input_dims)]

        # Create nested SDFG
        nsdfg = SDFG('reduce')

        nsdfg.add_array('_in',
                        inedge.data.subset.size(),
                        input_data.dtype,
                        strides=input_data.strides,
                        storage=input_data.storage)

        nsdfg.add_array('_out',
                        outedge.data.subset.size(),
                        output_data.dtype,
                        strides=output_data.strides,
                        storage=output_data.storage)

        if node.identity is not None:
            raise ValueError("Node identity has to be None at this point.")
        else:
            nstate = nsdfg.add_state()
        # END OF INIT

        # (If axes != all) Add outer map, which corresponds to the output range
        if len(axes) != input_dims:
            # Interleave input and output axes to match input memlet
            ictr, octr = 0, 0
            input_subset = []
            for i in range(input_dims):
                if i in axes:
                    input_subset.append('_i%d' % ictr)
                    ictr += 1
                else:
                    input_subset.append('_o%d' % octr)
                    octr += 1

            output_size = outedge.data.subset.size()

            ome, omx = nstate.add_map(
                'reduce_output', {
                    '_o%d' % i: '0:%s' % symstr(sz)
                    for i, sz in enumerate(outedge.data.subset.size())
                })
            outm = Memlet.simple('_out',
                                 ','.join(
                                     ['_o%d' % i for i in range(output_dims)]),
                                 wcr_str=node.wcr)
            inmm = Memlet.simple('_in', ','.join(input_subset))
        else:
            ome, omx = None, None
            outm = Memlet.simple('_out', '0', wcr_str=node.wcr)
            inmm = Memlet.simple(
                '_in', ','.join(['_i%d' % i for i in range(len(axes))]))

        # Add inner map, which corresponds to the range to reduce, containing
        # an identity tasklet
        ime, imx = nstate.add_map(
            'reduce_values', {
                '_i%d' % i: '0:%s' % symstr(inedge.data.subset.size()[axis])
                for i, axis in enumerate(sorted(axes))
            })

        # Add identity tasklet for reduction
        t = nstate.add_tasklet('identity', {'inp'}, {'out'}, 'out = inp')

        # Connect everything
        r = nstate.add_read('_in')
        w = nstate.add_read('_out')

        if ome:
            nstate.add_memlet_path(r, ome, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, omx, w, src_conn='out', memlet=outm)
        else:
            nstate.add_memlet_path(r, ime, t, dst_conn='inp', memlet=inmm)
            nstate.add_memlet_path(t, imx, w, src_conn='out', memlet=outm)

        # Rename outer connectors and add to node
        inedge._dst_conn = '_in'
        outedge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        nsdfg = state.add_nested_sdfg(nsdfg,
                                      sdfg,
                                      node.in_connectors,
                                      node.out_connectors,
                                      schedule=node.schedule,
                                      name=node.name)

        utils.change_edge_dest(state, node, nsdfg)
        utils.change_edge_src(state, node, nsdfg)
        state.remove_node(node)

        return nsdfg
Example #5
0
class GPUTransformMap(transformation.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    toplevel_trans = Property(desc="Make all GPU transients top-level",
                              dtype=bool,
                              default=False)

    register_trans = Property(
        desc="Make all transients inside GPU maps registers",
        dtype=bool,
        default=False)

    gpu_id = SymbolicProperty(default=None,
                              allow_none=True,
                              desc="Selects which gpu the map should run on")

    sequential_innermaps = Property(desc="Make all internal maps Sequential",
                                    dtype=bool,
                                    default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(GPUTransformMap._map_entry),
            sdutil.node_path_graph(GPUTransformMap._reduce)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]
            candidate_map = map_entry.map

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule
                    in [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES):
                return False
            if sd.is_devicelevel_gpu(sdfg, graph, map_entry):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and
                        node.desc(sdfg).storage != dtypes.StorageType.Register):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_node(map_entry)
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Disallow GPU transformation if already in device-level code
            if sd.is_devicelevel_gpu(sdfg, graph, reduce):
                return False

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformMap._reduce in candidate:
            return str(graph.nodes()[candidate[GPUTransformMap._reduce]])
        else:
            return str(graph.nodes()[candidate[GPUTransformMap._map_entry]])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            map_entry = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            nsdfg_node = helpers.nest_state_subgraph(
                sdfg,
                graph,
                graph.scope_subgraph(map_entry),
                full_data=self.fullcopy)
        else:
            cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]]
            nsdfg_node = helpers.nest_state_subgraph(sdfg,
                                                     graph,
                                                     SubgraphView(
                                                         graph, [cnode]),
                                                     full_data=self.fullcopy)

        # Avoiding import loops
        from dace.transformation.interstate import GPUTransformSDFG
        transformation = GPUTransformSDFG(0, 0, {}, 0)
        transformation.register_trans = self.register_trans
        transformation.sequential_innermaps = self.sequential_innermaps
        transformation.toplevel_trans = self.toplevel_trans
        transformation.gpu_id = self.gpu_id

        transformation.apply(nsdfg_node.sdfg)

        # Inline back as necessary
        sdfg.apply_strict_transformations()
class GPUTransformLocalStorage(transformation.Transformation):
    """Implements the GPUTransformLocalStorage transformation.

        Similar to GPUTransformMap, but takes multiple maps leading from the
        same data node into account, creating a local storage for each range.

        @see: GPUTransformMap
    """

    _arrays_removed = 0
    _maps_transformed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    nested_seq = Property(
        desc="Makes nested code semantically-equivalent to single-core code,"
        "transforming nested maps and memory into sequential and "
        "local memory respectively.",
        dtype=bool,
        default=True,
    )

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce("lambda: None", None)

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(GPUTransformLocalStorage._map_entry),
            sdutil.node_path_graph(GPUTransformLocalStorage._reduce),
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            candidate_map = map_entry.map

            # Disallow GPUTransform on nested maps in permissive mode
            if not permissive:
                if graph.entry_node(map_entry) is not None:
                    return False

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule == dtypes.ScheduleType.MPI
                    or candidate_map.schedule == dtypes.ScheduleType.GPU_Device
                    or candidate_map.schedule
                    == dtypes.ScheduleType.GPU_ThreadBlock or
                    candidate_map.schedule == dtypes.ScheduleType.Sequential):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and
                        node.desc(sdfg).storage != dtypes.StorageType.Register):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_node(map_entry)
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformLocalStorage._reduce]]

            # Recursively check parent for GPU schedules
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        if GPUTransformLocalStorage._reduce in candidate:
            return str(
                graph.nodes()[candidate[GPUTransformLocalStorage._reduce]])
        else:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            return str(map_entry)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode: nodes.MapEntry = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._map_entry]]
            # Change schedule
            cnode.schedule = dtypes.ScheduleType.GPU_Device
            exit_node = graph.exit_node(cnode)
        else:
            cnode: nodes.LibraryNode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._reduce]]
            # Change schedule
            cnode.schedule = dtypes.ScheduleType.GPU_Default
            exit_node = cnode

        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._maps_transformed += 1
        # If nested graph is designated as sequential, transform schedules and
        # storage from Default to Sequential/Register
        if self.nested_seq and self.expr_index == 0:
            for node in graph.scope_subgraph(cnode).nodes():
                if isinstance(node, nodes.AccessNode):
                    arr = node.desc(sdfg)
                    if arr.storage == dtypes.StorageType.Default:
                        arr.storage = dtypes.StorageType.Register
                elif isinstance(node, nodes.MapEntry):
                    if node.map.schedule == dtypes.ScheduleType.Default:
                        node.map.schedule = dtypes.ScheduleType.Sequential

        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        all_out_edges.extend(list(graph.out_edges(exit_node)))
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add((data_node, e.data))
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add((data_node, e.data))

        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._arrays_removed += len(
                in_arrays_to_clone) + len(out_arrays_to_clone)

        # Second, create a GPU clone of each array
        # TODO: Overapproximate union of memlets
        cloned_arrays = {}
        in_cloned_arraynodes = {}
        out_cloned_arraynodes = {}
        for array_node, memlet in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                except:
                    continue
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except:
                        full_shape.append(size)
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)

            in_cloned_arraynodes[array_node.data] = cloned_node
        for array_node, memlet in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                except:
                    continue
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except:
                        full_shape.append(size)
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)
            cloned_node.setzero = True

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] -= r[0]
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   newmemlet)

                    for e in graph.bfs_edges(edge.dst, reverse=False):
                        parent, _, _child, _, memlet = e
                        if parent != edge.dst and not in_scope(
                                graph, parent, edge.dst):
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[-1].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.ExitNode, forward=True):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   edge.data)
                    graph.remove_edge(edge)

        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                newsubset[ind] -= r[0]
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   newmemlet)

                    end_node = graph.entry_node(edge.src)
                    for e in graph.bfs_edges(edge.src, reverse=True):
                        parent, _, _child, _, memlet = e
                        if parent == end_node:
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[0].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.EntryNode, forward=False):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   edge.data)
                    graph.remove_edge(edge)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if edge.data.data is not None and edge.data.data in cloned_arrays:
                    edge.data.data = cloned_arrays[edge.data.data]