Example #1
def set_fast_implementations(sdfg: SDFG,
                             device: dtypes.DeviceType,
                             blocklist: List[str] = None):
    """
    Set fast library node implementations for the given device

    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param blocklist: list of disallowed implementations.
    :note: Operates in-place on the given SDFG.
    """
    if blocklist is None:
        implementation_prio = find_fast_library(device)
    else:
        implementation_prio = [
            i for i in find_fast_library(device) if i not in blocklist
        ]

    # specialized nodes: pre-expand
    for current_sdfg in sdfg.all_sdfgs_recursive():
        for state in current_sdfg.nodes():
            for node in state.nodes():
                if isinstance(node, nodes.LibraryNode):
                    if (node.default_implementation == 'specialize'
                            and len(set(node.implementations)
                                    & set(implementation_prio)) == 0):
                        node.expand(current_sdfg, state)

    # general nodes
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.LibraryNode):
            for impl in implementation_prio:
                if impl in node.implementations:
                    if (isinstance(
                            node, dace.libraries.standard.nodes.reduce.Reduce)
                            and node.implementation == 'CUDA (block)'):
                        continue
                    node.implementation = impl
                    break

    # reduce nodes
    if device == dtypes.DeviceType.GPU:
        for node, state in sdfg.all_nodes_recursive():
            if isinstance(node, dace.nodes.LibraryNode):
                # Use CUB for device-level reductions
                if ('CUDA (device)' in node.implementations
                        and not is_devicelevel_gpu(state.parent, state, node)
                        and state.scope_dict()[node] is None):
                    node.implementation = 'CUDA (device)'
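
For context, a minimal usage sketch of the function above; the matmul program and the 'pure' blocklist entry are illustrative assumptions, not part of the original listing:

import dace
from dace import dtypes

@dace.program
def matmul(A: dace.float64[64, 64], B: dace.float64[64, 64]):
    return A @ B

sdfg = matmul.to_sdfg()
# Pick the fastest available implementations for CPU, disallowing one by name
set_fast_implementations(sdfg, dtypes.DeviceType.CPU, blocklist=['pure'])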
Example #2
    def apply_pass(
        self,
        sdfg: SDFG,
        pipeline_results: Dict[str, Any],
    ) -> Optional[Dict[nodes.EntryNode, Optional[Any]]]:
        """
        Applies the pass to the scopes of the given SDFG by calling ``apply`` on each scope entry node.
        :param sdfg: The SDFG to apply the pass to.
        :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                                 results as ``{Pass subclass name: returned object from pass}``. If not run in a
                                 pipeline, an empty dictionary is expected.
        :return: A dictionary of ``{entry node: return value}`` for visited scopes with a non-None return value, or None
                 if nothing was returned.
        """
        result = {}
        for node, state in sdfg.all_nodes_recursive():
            if not isinstance(node, nodes.EntryNode):
                continue
            retval = self.apply(node, state, pipeline_results)
            if retval is not None:
                result[node] = retval

        if not result:
            return None
        return result
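
Assuming this method belongs to DaCe's ScopePass base class, a minimal concrete pass might look like the sketch below; the CountMapDimensions class is illustrative:

from typing import Any, Dict, Optional
from dace import SDFGState, nodes
from dace.transformation import pass_pipeline as ppl

class CountMapDimensions(ppl.ScopePass):
    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.Nothing

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return bool(modified & ppl.Modifies.Scopes)

    def apply(self, scope: nodes.EntryNode, state: SDFGState,
              pipeline_results: Dict[str, Any]) -> Optional[int]:
        # Return the dimensionality of every map scope; other scopes yield
        # None and are therefore omitted from the result dictionary
        if isinstance(scope, nodes.MapEntry):
            return len(scope.map.range)
        return None

# Usage: CountMapDimensions().apply_pass(sdfg, {})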
Example #3
def set_fast_implementations(sdfg: SDFG,
                             device: dtypes.DeviceType,
                             blocklist: List[str] = None):
    """
    Set fast library node implementations for the given device

    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param blocklist: list of disallowed implementations.
    :note: Operates in-place on the given SDFG.
    """
    if blocklist is None:
        implementation_prio = find_fast_library(device)
    else:
        implementation_prio = [
            i for i in find_fast_library(device) if i not in blocklist
        ]

    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.LibraryNode):
            for impl in implementation_prio:
                if impl in node.implementations:
                    node.implementation = impl
                    break
            else:
                warnings.warn('No fast library implementation found for "%s", '
                              'falling back to default.' % node.name)
Example #4
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType) -> None:
    '''
    Helper function to change several storage and scheduling properties:

    - Makes non-view array lifetimes persistent, with some
      restrictions depending on the device.
    - Resets nonatomic WCR edges on GPU.

    :param sdfg: The SDFG to modify.
    :param device: The device type to optimize for.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        for aname, arr in nsdfg.arrays.items():
            if arr.transient and not isinstance(
                    arr, dt.View) and not symbolic.issymbolic(arr.total_size):
                if arr.storage != dtypes.StorageType.Register:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        for aname, arr in sdfg.arrays.items():
            if arr.transient and not isinstance(
                    arr, dt.View):  # and size only depends on SDFG params
                if arr.storage == dtypes.StorageType.GPU_Global:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
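
A brief usage sketch, assuming the helper is importable; the doubler program is illustrative and creates a fixed-size transient (tmp) that the function can mark persistent:

import dace
from dace import dtypes

@dace.program
def doubler(x: dace.float64[1024]):
    tmp = 2.0 * x
    x[:] = tmp

sdfg = doubler.to_sdfg()
make_transients_persistent(sdfg, dtypes.DeviceType.CPU)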
Example #5
File: fpga.py Project: mfkiwl/dace
def fpga_global_to_local(sdfg: SDFG, max_size: int = 1048576) -> None:
    """ Takes an entire  SDFG and changes the storage type of a global FPGA data container
        to Local in the following situation:
           - the data is transient,
           - the data is not a transient shared with other states, and
           - the data has a compile-time known size.
        :param: sdfg: The SDFG to operate on. It must be a top-level SDFG.
        :param: max_size: maximum size (in bytes) that a container can have to be considered for
            storage type change
        :note: Operates in-place on the SDFG.
    """
    converted = []

    for name, desc in sdfg.arrays.items():
        if desc.transient and name not in sdfg.shared_transients(
        ) and desc.storage == dtypes.StorageType.FPGA_Global:

            # Get the total size, trying to resolve it to constant if it is a symbol
            total_size = symbolic.resolve_symbol_to_constant(
                desc.total_size, sdfg)

            if total_size is not None and total_size * desc.dtype.bytes <= max_size:
                desc.storage = dtypes.StorageType.FPGA_Local
                converted.append(name)

                # update all access nodes that refer to this container
                for node, graph in sdfg.all_nodes_recursive():
                    if isinstance(node, nodes.AccessNode):
                        trace = trace_nested_access(node, graph, graph.parent)

                        for (_, candidate
                             ), memlet_trace, state_trace, sdfg_trace in trace:
                            if candidate is not None and candidate.data == name:
                                nodedesc = node.desc(graph)
                                nodedesc.storage = dtypes.StorageType.FPGA_Local
    if config.Config.get_bool('debugprint'):
        if converted:
            print(f'Applied {len(converted)} Global-To-Local: '
                  f'{", ".join(converted)}')
        else:
            print('Applied 0 Global-To-Local.')
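
A hedged usage sketch: the helper assumes containers already use FPGA_Global storage, so it is typically run after FPGA transformations; the vadd program and the 64 KiB cap are illustrative:

import dace

@dace.program
def vadd(x: dace.float32[256], y: dace.float32[256]):
    y[:] = x + y

sdfg = vadd.to_sdfg()
sdfg.apply_fpga_transformations()
# Lower small transient FPGA_Global containers to FPGA_Local (64 KiB cap)
fpga_global_to_local(sdfg, max_size=64 * 1024)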
Example #6
    def apply_pass(
            self, sdfg: SDFG,
            pipeline_results: Dict[str, Any]) -> Optional[Dict[Any, Any]]:
        """
        Visits the given SDFG recursively, calling defined ``visit_*`` methods for each element.
        :param sdfg: The SDFG to recursively visit.
        :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                                 results as ``{Pass subclass name: returned object from pass}``. If not run in a
                                 pipeline, an empty dictionary is expected.
        :return: A dictionary of ``{element: return value}`` for visited elements with a non-None return value, or None
                 if nothing was returned.
        """
        results = {}
        for node, parent in sdfg.all_nodes_recursive():
            # Visit node (SDFGState, AccessNode, ...)
            f = getattr(self, f'visit_{type(node).__name__}',
                        self.generic_visit)
            res = f(node, parent, pipeline_results)
            if res is not None:
                results[node] = res
        for edge, parent in sdfg.all_edges_recursive():
            # Visit edge (Edge, MultiConnectorEdge)
            f = getattr(self, f'visit_{type(edge).__name__}',
                        self.generic_visit)
            res = f(edge, parent, pipeline_results)
            if res is not None:
                results[edge] = res

            # Visit edge data (Memlet, InterstateEdge)
            f = getattr(self, f'visit_{type(edge.data).__name__}',
                        self.generic_visit)
            res = f(edge.data, parent, pipeline_results)
            if res is not None:
                results[edge.data] = res

        if not results:
            return None
        return results
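
Assuming this method belongs to DaCe's VisitorPass, a minimal visitor could collect the data container name behind every access node; the class below is illustrative:

from typing import Any, Dict, Optional
from dace import nodes
from dace.transformation import pass_pipeline as ppl

class CollectAccessNames(ppl.VisitorPass):
    def modifies(self) -> ppl.Modifies:
        return ppl.Modifies.Nothing

    def should_reapply(self, modified: ppl.Modifies) -> bool:
        return False

    def visit_AccessNode(self, node: nodes.AccessNode, parent,
                         pipeline_results: Dict[str, Any]) -> Optional[str]:
        # Dispatched by the getattr-based lookup in apply_pass above
        return node.data

# Usage: names = CollectAccessNames().apply_pass(sdfg, {})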
Example #7
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Strict transformations
        * Strict auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    # Strict transformations and loop parallelization
    transformed = True
    while transformed:
        sdfg.apply_strict_transformations(validate=False,
                                          validate_all=validate_all)

        xfh.split_interstate_edges(sdfg)

        # Try to parallelize loops
        l2ms = sdfg.apply_transformations_repeated(LoopToMap,
                                                   strict=True,
                                                   validate=False,
                                                   validate_all=validate_all)
        transformed = l2ms > 0

    # Map fusion
    greedy_fuse(sdfg, validate_all)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_aopt.fpga_global_to_local(sdfg)
        fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        strict=True,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry):
            node.map.collapse = len(node.map.range)

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections
    # TODO(later): Set on a per-SDFG basis
    config.Config.set('compiler', 'cpu', 'openmp_sections', value=False)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
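
A hedged usage sketch for this routine; the axpy program is an illustrative assumption:

import dace
from dace import dtypes

@dace.program
def axpy(a: dace.float64, x: dace.float64[1024], y: dace.float64[1024]):
    y[:] = a * x + y

sdfg = axpy.to_sdfg()
sdfg = auto_optimize(sdfg, dtypes.DeviceType.CPU, validate=True)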
Example #8
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: The device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to int/float.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination,
                                        validate=validate,
                                        validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated(
            (LoopToMap, RefineNestedAccess),
            validate=False,
            validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)

    # Apply GPU transformations and set library node implementations
    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # fuse subgraphs greedily
    sdfg.simplify()

    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # fuse stencils greedily
    greedy_fuse(sdfg,
                device=device,
                validate_all=validate_all,
                recursive=False,
                stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        # Set OMP collapse property to map length
        if isinstance(node, nodes.MapEntry):
            # FORNOW: Leave out
            # node.map.collapse = len(node.map.range)
            pass

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols
        known_symbols = {}
        for (s, v) in symbols.items():
            if s in sdfg.free_symbols:
                if isinstance(v, (int, float)):
                    known_symbols[s] = v
                elif isinstance(v, sympy.core.numbers.Integer):
                    try:
                        known_symbols[s] = int(v)
                    except TypeError:
                        pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)
    # Fix storage and allocation properties, e.g., for benchmarking purposes
    # FORNOW: Leave out
    # make_transients_persistent(sdfg, device)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
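
This variant additionally accepts known symbol values for specialization; a hedged sketch in which the symbol N and its value are illustrative:

import dace
from dace import dtypes

N = dace.symbol('N')

@dace.program
def scale(x: dace.float64[N]):
    x[:] = 2.0 * x

sdfg = scale.to_sdfg()
# Specialize the free symbol N to a concrete size during optimization
sdfg = auto_optimize(sdfg, dtypes.DeviceType.CPU, symbols={'N': 4096})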
Example #9
def contains_any_sve(sdfg: SDFG):
    # all_nodes_recursive yields graph nodes, so check for MapEntry (whose
    # schedule property proxies the map's schedule) rather than Map itself
    for node, _ in sdfg.all_nodes_recursive():
        if (isinstance(node, nodes.MapEntry)
                and node.schedule == dace.ScheduleType.SVE_Map):
            return True
    return False
Example #10
    def apply(self, sdfg: sd.SDFG):

        #######################################################
        # Step 0: SDFG metadata

        # Find all input and output data descriptors
        input_nodes = []
        output_nodes = []
        # Maps each state to the list of free (top-level) code nodes within it
        global_code_nodes: Dict[sd.SDFGState,
                                List[nodes.Tasklet]] = defaultdict(list)

        for state in sdfg.nodes():
            sdict = state.scope_dict()
            for node in state.nodes():
                if (isinstance(node, nodes.AccessNode)
                        and not node.desc(sdfg).transient):
                    if (state.out_degree(node) > 0
                            and node.data not in input_nodes):
                        # Special case: nodes that lead to top-level dynamic
                        # map ranges must stay on host
                        for e in state.out_edges(node):
                            last_edge = state.memlet_path(e)[-1]
                            if (isinstance(last_edge.dst, nodes.EntryNode)
                                    and last_edge.dst_conn
                                    and not last_edge.dst_conn.startswith('IN_')
                                    and sdict[last_edge.dst] is None):
                                break
                        else:
                            input_nodes.append((node.data, node.desc(sdfg)))
                    if (state.in_degree(node) > 0
                            and node.data not in output_nodes):
                        output_nodes.append((node.data, node.desc(sdfg)))

            # Input nodes may also be nodes with WCR memlets and no identity
            for e in state.edges():
                if e.data.wcr is not None:
                    if (e.data.data not in input_nodes
                            and not sdfg.arrays[e.data.data].transient):
                        input_nodes.append(
                            (e.data.data, sdfg.arrays[e.data.data]))

        start_state = sdfg.start_state
        end_states = sdfg.sink_nodes()

        #######################################################
        # Step 1: Create cloned GPU arrays and replace originals

        cloned_arrays = {}
        for inodename, inode in set(input_nodes):
            if isinstance(inode, data.Scalar):  # Scalars can remain on host
                continue
            if inode.storage == dtypes.StorageType.GPU_Global:
                continue
            newdesc = inode.clone()
            newdesc.storage = dtypes.StorageType.GPU_Global
            newdesc.transient = True
            name = sdfg.add_datadesc('gpu_' + inodename,
                                     newdesc,
                                     find_new_name=True)
            cloned_arrays[inodename] = name

        for onodename, onode in set(output_nodes):
            if onodename in cloned_arrays:
                continue
            if onode.storage == dtypes.StorageType.GPU_Global:
                continue
            newdesc = onode.clone()
            newdesc.storage = dtypes.StorageType.GPU_Global
            newdesc.transient = True
            name = sdfg.add_datadesc('gpu_' + onodename,
                                     newdesc,
                                     find_new_name=True)
            cloned_arrays[onodename] = name

        # Replace nodes
        for state in sdfg.nodes():
            for node in state.nodes():
                if (isinstance(node, nodes.AccessNode)
                        and node.data in cloned_arrays):
                    node.data = cloned_arrays[node.data]

        # Replace memlets
        for state in sdfg.nodes():
            for edge in state.edges():
                if edge.data.data in cloned_arrays:
                    edge.data.data = cloned_arrays[edge.data.data]

        #######################################################
        # Step 2: Create copy-in state
        excluded_copyin = self.exclude_copyin.split(',')

        copyin_state = sdfg.add_state(sdfg.label + '_copyin')
        sdfg.add_edge(copyin_state, start_state, sd.InterstateEdge())

        for nname, desc in dtypes.deduplicate(input_nodes):
            if nname in excluded_copyin or nname not in cloned_arrays:
                continue
            src_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
            dst_array = nodes.AccessNode(cloned_arrays[nname],
                                         debuginfo=desc.debuginfo)
            copyin_state.add_node(src_array)
            copyin_state.add_node(dst_array)
            copyin_state.add_nedge(
                src_array, dst_array,
                memlet.Memlet.from_array(src_array.data, src_array.desc(sdfg)))

        #######################################################
        # Step 3: Create copy-out state
        excluded_copyout = self.exclude_copyout.split(',')

        copyout_state = sdfg.add_state(sdfg.label + '_copyout')
        for state in end_states:
            sdfg.add_edge(state, copyout_state, sd.InterstateEdge())

        for nname, desc in dtypes.deduplicate(output_nodes):
            if nname in excluded_copyout or nname not in cloned_arrays:
                continue
            src_array = nodes.AccessNode(cloned_arrays[nname],
                                         debuginfo=desc.debuginfo)
            dst_array = nodes.AccessNode(nname, debuginfo=desc.debuginfo)
            copyout_state.add_node(src_array)
            copyout_state.add_node(dst_array)
            copyout_state.add_nedge(
                src_array, dst_array,
                memlet.Memlet.from_array(dst_array.data, dst_array.desc(sdfg)))

        #######################################################
        # Step 4: Modify transient data storage

        const_syms = xfh.constant_symbols(sdfg)

        for state in sdfg.nodes():
            sdict = state.scope_dict()
            for node in state.nodes():
                if isinstance(node,
                              nodes.AccessNode) and node.desc(sdfg).transient:
                    nodedesc = node.desc(sdfg)

                    # Special case: nodes that lead to dynamic map ranges must
                    # stay on host
                    if any(
                            isinstance(
                                state.memlet_path(e)[-1].dst, nodes.EntryNode)
                            for e in state.out_edges(node)):
                        continue

                    gpu_storage = [
                        dtypes.StorageType.GPU_Global,
                        dtypes.StorageType.GPU_Shared,
                        dtypes.StorageType.CPU_Pinned
                    ]
                    if (sdict[node] is None
                            and nodedesc.storage not in gpu_storage):
                        # NOTE: the cloned arrays match too but it's the same
                        # storage so we don't care
                        nodedesc.storage = dtypes.StorageType.GPU_Global

                        # Try to move allocation/deallocation out of loops
                        dsyms = set(map(str, nodedesc.free_symbols))
                        if (self.toplevel_trans
                                and not isinstance(nodedesc, (data.Stream,
                                                              data.View))
                                and len(dsyms - const_syms) == 0):
                            nodedesc.lifetime = dtypes.AllocationLifetime.SDFG
                    elif nodedesc.storage not in gpu_storage:
                        # Make internal transients registers
                        if self.register_trans:
                            nodedesc.storage = dtypes.StorageType.Register

        #######################################################
        # Step 5: Change all top-level maps and library nodes to GPU schedule

        for state in sdfg.nodes():
            sdict = state.scope_dict()
            for node in state.nodes():
                if sdict[node] is None:
                    if isinstance(node, (nodes.LibraryNode, nodes.NestedSDFG)):
                        node.schedule = dtypes.ScheduleType.GPU_Default
                    elif isinstance(node, nodes.EntryNode):
                        node.schedule = dtypes.ScheduleType.GPU_Device
                elif self.sequential_innermaps:
                    if isinstance(node, (nodes.EntryNode, nodes.LibraryNode)):
                        node.schedule = dtypes.ScheduleType.Sequential
                    elif isinstance(node, nodes.NestedSDFG):
                        for nnode, _ in node.sdfg.all_nodes_recursive():
                            if isinstance(nnode,
                                          (nodes.EntryNode, nodes.LibraryNode)):
                                nnode.schedule = dtypes.ScheduleType.Sequential

        #######################################################
        # Step 6: Wrap free tasklets and nested SDFGs with a GPU map

        # Collect free tasklets
        for node, state in sdfg.all_nodes_recursive():
            if isinstance(node, nodes.Tasklet):
                if (state.entry_node(node) is None
                        and not scope.is_devicelevel_gpu(
                            state.parent, state, node, with_gpu_default=True)):
                    global_code_nodes[state].append(node)

        for state, gcodes in global_code_nodes.items():
            for gcode in gcodes:
                if gcode.label in self.exclude_tasklets.split(','):
                    continue
                # Create map and connectors
                me, mx = state.add_map(gcode.label + '_gmap',
                                       {gcode.label + '__gmapi': '0:1'},
                                       schedule=dtypes.ScheduleType.GPU_Device)
                # Store in/out edges in lists so that they don't get corrupted
                # when they are removed from the graph
                in_edges = list(state.in_edges(gcode))
                out_edges = list(state.out_edges(gcode))
                me.in_connectors = {('IN_' + e.dst_conn): None
                                    for e in in_edges}
                me.out_connectors = {('OUT_' + e.dst_conn): None
                                     for e in in_edges}
                mx.in_connectors = {('IN_' + e.src_conn): None
                                    for e in out_edges}
                mx.out_connectors = {('OUT_' + e.src_conn): None
                                     for e in out_edges}

                # Create memlets through map
                for e in in_edges:
                    state.remove_edge(e)
                    state.add_edge(e.src, e.src_conn, me, 'IN_' + e.dst_conn,
                                   e.data)
                    state.add_edge(me, 'OUT_' + e.dst_conn, e.dst, e.dst_conn,
                                   e.data)
                for e in out_edges:
                    state.remove_edge(e)
                    state.add_edge(e.src, e.src_conn, mx, 'IN_' + e.src_conn,
                                   e.data)
                    state.add_edge(mx, 'OUT_' + e.src_conn, e.dst, e.dst_conn,
                                   e.data)

                # Map without inputs
                if len(in_edges) == 0:
                    state.add_nedge(me, gcode, memlet.Memlet())
        #######################################################
        # Step 7: Introduce copy-out if data used in outgoing interstate edges

        for state in list(sdfg.nodes()):
            arrays_used = set()
            for e in sdfg.out_edges(state):
                # Used arrays = intersection between symbols and cloned arrays
                arrays_used.update(
                    set(e.data.free_symbols)
                    & set(cloned_arrays.keys()))

            # Create a state and copy out used arrays
            if len(arrays_used) > 0:
                co_state = sdfg.add_state(state.label + '_icopyout')

                # Reconnect outgoing edges to after interim copyout state
                sdutil.change_edge_src(sdfg, state, co_state)
                # Add unconditional edge to interim state
                sdfg.add_edge(state, co_state, sd.InterstateEdge())

                # Add copy-out nodes
                for nname in arrays_used:
                    desc = sdfg.arrays[nname]
                    src_array = nodes.AccessNode(cloned_arrays[nname],
                                                 debuginfo=desc.debuginfo)
                    dst_array = nodes.AccessNode(nname,
                                                 debuginfo=desc.debuginfo)
                    co_state.add_node(src_array)
                    co_state.add_node(dst_array)
                    co_state.add_nedge(
                        src_array, dst_array,
                        memlet.Memlet.from_array(dst_array.data,
                                                 dst_array.desc(sdfg)))

        #######################################################
        # Step 8: Strict transformations
        if not self.strict_transform:
            return

        # Apply strict state fusions greedily.
        sdfg.apply_strict_transformations()
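
This apply method matches DaCe's GPUTransformSDFG transformation; in practice it is invoked through the SDFG transformation API rather than called directly. A hedged sketch with an illustrative saxpy program:

import dace
from dace.transformation.interstate import GPUTransformSDFG

@dace.program
def saxpy(x: dace.float32[8192], y: dace.float32[8192]):
    y[:] = 2.0 * x + y

sdfg = saxpy.to_sdfg()
# Clones data to GPU_Global, adds copy-in/copy-out states, and sets
# top-level maps to GPU schedules, per the steps above
sdfg.apply_transformations(GPUTransformSDFG)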
Example #11
def _get_codegen_targets(sdfg: SDFG, frame: framecode.DaCeCodeGenerator):
    """
    Queries all code generation targets in this SDFG and all nested SDFGs,
    as well as instrumentation providers, and stores them in the frame code generator.
    """
    disp = frame._dispatcher
    provider_mapping = InstrumentationProvider.get_provider_mapping()
    disp.instrumentation[dtypes.InstrumentationType.No_Instrumentation] = None
    for node, parent in sdfg.all_nodes_recursive():
        # Query nodes and scopes
        if isinstance(node, SDFGState):
            frame.targets.add(disp.get_state_dispatcher(parent, node))
        elif isinstance(node, dace.nodes.EntryNode):
            frame.targets.add(disp.get_scope_dispatcher(node.schedule))
        elif isinstance(node, dace.nodes.Node):
            state: SDFGState = parent
            nsdfg = state.parent
            frame.targets.add(disp.get_node_dispatcher(nsdfg, state, node))

        # Array allocation
        if isinstance(node, dace.nodes.AccessNode):
            state: SDFGState = parent
            nsdfg = state.parent
            desc = node.desc(nsdfg)
            frame.targets.add(disp.get_array_dispatcher(desc.storage))

        # Copies and memlets - via access nodes and tasklets
        # To avoid duplicate checks, only look at outgoing edges of access nodes and tasklets
        if isinstance(node, (dace.nodes.AccessNode, dace.nodes.Tasklet)):
            state: SDFGState = parent
            for e in state.out_edges(node):
                if e.data.is_empty():
                    continue
                mtree = state.memlet_tree(e)
                if mtree.downwards:
                    # Rooted at src_node
                    for leaf_e in mtree.leaves():
                        dst_node = leaf_e.dst
                        if leaf_e.data.is_empty():
                            continue
                        tgt = disp.get_copy_dispatcher(node, dst_node, leaf_e,
                                                       state.parent, state)
                        if tgt is not None:
                            frame.targets.add(tgt)
                else:
                    # Rooted at dst_node
                    dst_node = mtree.root().edge.dst
                    tgt = disp.get_copy_dispatcher(node, dst_node, e,
                                                   state.parent, state)
                    if tgt is not None:
                        frame.targets.add(tgt)

        # Instrumentation-related query
        if hasattr(node, 'instrument'):
            disp.instrumentation[node.instrument] = provider_mapping[
                node.instrument]
        elif hasattr(node, 'consume'):
            disp.instrumentation[node.consume.instrument] = provider_mapping[
                node.consume.instrument]
        elif hasattr(node, 'map'):
            disp.instrumentation[node.map.instrument] = provider_mapping[
                node.map.instrument]

    # Query instrumentation provider of SDFG
    if sdfg.instrument != dtypes.InstrumentationType.No_Instrumentation:
        disp.instrumentation[sdfg.instrument] = provider_mapping[
            sdfg.instrument]
Example #12
def make_transients_persistent(sdfg: SDFG,
                               device: dtypes.DeviceType,
                               toplevel_only: bool = True) -> None:
    '''
    Helper function to change several storage and scheduling properties:

    - Makes non-view array lifetimes persistent, with some
      restrictions depending on the device.
    - Resets nonatomic WCR edges on GPU.

    The only arrays made persistent by default are ones that do not exist inside a scope (arrays inside scopes may
    be allocated multiple times), and whose symbols are always given as parameters to the SDFG (so that they can be
    allocated in a persistent manner).

    :param sdfg: The SDFG to modify.
    :param device: The device type to optimize for.
    :param toplevel_only: If True, only converts access nodes that do not appear in any scope.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        fsyms: Set[str] = nsdfg.free_symbols
        persistent: Set[str] = set()
        not_persistent: Set[str] = set()

        for state in nsdfg.nodes():
            for dnode in state.data_nodes():
                if dnode.data in not_persistent:
                    continue
                desc = dnode.desc(nsdfg)
                # Only convert arrays and scalars that are not registers
                if not desc.transient or type(desc) not in {
                        dt.Array, dt.Scalar
                }:
                    not_persistent.add(dnode.data)
                    continue
                if desc.storage == dtypes.StorageType.Register:
                    not_persistent.add(dnode.data)
                    continue
                # Only convert arrays where the size depends on SDFG parameters
                try:
                    if set(map(str, desc.total_size.free_symbols)) - fsyms:
                        not_persistent.add(dnode.data)
                        continue
                except AttributeError:  # total_size is an integer / has no free symbols
                    pass

                # Only convert arrays with top-level access nodes
                if xfh.get_parent_map(state, dnode) is not None:
                    if toplevel_only:
                        not_persistent.add(dnode.data)
                        continue
                    elif desc.lifetime == dtypes.AllocationLifetime.Scope:
                        not_persistent.add(dnode.data)
                        continue

                persistent.add(dnode.data)

        for aname in (persistent - not_persistent):
            nsdfg.arrays[aname].lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
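
Usage mirrors the earlier variant, with the extra flag controlling scoped transients; a hedged sketch with an illustrative accum program:

import dace
from dace import dtypes

@dace.program
def accum(x: dace.float64[512]):
    tmp = x + 1.0
    x[:] = tmp

sdfg = accum.to_sdfg()
# Also consider transients inside scopes, not only top-level access nodes
make_transients_persistent(sdfg, dtypes.DeviceType.CPU, toplevel_only=False)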