Example #1
    def generate_fileheader(self, sdfg: SDFG, global_stream: CodeIOStream):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
        """
        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        for sd in sdfg.all_sdfgs_recursive():
            global_stream.write(sd.global_code, sd)
Example #2
def make_transients_persistent(sdfg: SDFG, device: dtypes.DeviceType) -> None:
    '''
    Helper function that changes several storage and scheduling properties:

    - Makes non-view array lifetimes persistent, with some restrictions
      depending on the device.
    - Resets nonatomic WCR edges on GPU.

    :param sdfg: The SDFG to modify in-place.
    :param device: Device type to optimize for.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        for aname, arr in nsdfg.arrays.items():
            if arr.transient and not isinstance(
                    arr, dt.View) and not symbolic.issymbolic(arr.total_size):
                if arr.storage != dtypes.StorageType.Register:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        for aname, arr in sdfg.arrays.items():
            if arr.transient and not isinstance(
                    arr, dt.View):  # and size only depends on SDFG parameters
                if arr.storage == dtypes.StorageType.GPU_Global:
                    arr.lifetime = dtypes.AllocationLifetime.Persistent

        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False
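
Below is a minimal usage sketch for the helper above; it is not part of the original snippet and assumes DaCe is installed and that make_transients_persistent is importable from the surrounding module. The vecadd program and the symbol N are illustrative only.

import dace
from dace import dtypes

N = dace.symbol('N')

@dace.program
def vecadd(A: dace.float64[N], B: dace.float64[N]):
    tmp = A + B  # temporary array becomes a transient in the SDFG
    B[:] = tmp * 2.0

sdfg = vecadd.to_sdfg()
# Make eligible transient allocations persistent for a CPU build
make_transients_persistent(sdfg, dtypes.DeviceType.CPU)
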
Example #3
    def generate_fileheader(self,
                            sdfg: SDFG,
                            global_stream: CodeIOStream,
                            backend: str = 'frame'):
        """ Generate a header in every output file that includes custom types
            and constants.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param backend: The backend this header belongs to.
        """
        #########################################################
        # Environment-based includes
        for env in self.environments:
            if len(env.headers) > 0:
                global_stream.write(
                    "\n".join("#include \"" + h + "\"" for h in env.headers),
                    sdfg)

        #########################################################
        # Custom types
        datatypes = set()
        # Types of this SDFG
        for _, arrname, arr in sdfg.arrays_recursive():
            if arr is not None:
                datatypes.add(arr.dtype)

        # Emit unique definitions
        wrote_something = False
        for typ in datatypes:
            if hasattr(typ, 'emit_definition'):
                if not wrote_something:
                    global_stream.write("", sdfg)
                wrote_something = True
                global_stream.write(typ.emit_definition(), sdfg)
        if wrote_something:
            global_stream.write("", sdfg)

        #########################################################
        # Write constants
        self.generate_constants(sdfg, global_stream)

        #########################################################
        # Write state struct
        structstr = '\n'.join(self.statestruct)
        global_stream.write(
            f'''
struct {sdfg.name}_t {{
    {structstr}
}};

''', sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[None]), sd)
            if backend in sd.global_code:
                global_stream.write(codeblock_to_cpp(sd.global_code[backend]),
                                    sd)
Example #4
def get_post_state(sdfg: SDFG, state: SDFGState):
    """ 
    Returns the post state (the state that copies the data a back from the FGPA device) if there is one.
    """
    for s in sdfg.all_sdfgs_recursive():
        for post_state in s.states():
            if 'post_' + str(state) == str(post_state):
                return post_state

    return None
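
A hedged usage sketch (not from the original source): assuming sdfg has already gone through the FPGA transformations that create copy-back states named post_<state>, the lookup could be driven as follows.

# Report, for every state, which state copies its results back from the device
for state in sdfg.states():
    post = get_post_state(sdfg, state)
    if post is not None:
        print(f'{state}: results copied back in {post}')
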
Example #5
def set_fast_implementations(sdfg: SDFG,
                             device: dtypes.DeviceType,
                             blocklist: List[str] = None):
    """
    Set fast library node implementations for the given device

    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param blocklist: list of disallowed implementations.
    :note: Operates in-place on the given SDFG.
    """
    if blocklist is None:
        implementation_prio = find_fast_library(device)
    else:
        implementation_prio = [
            i for i in find_fast_library(device) if i not in blocklist
        ]

    # specialized nodes: pre-expand
    for current_sdfg in sdfg.all_sdfgs_recursive():
        for state in current_sdfg.nodes():
            for node in state.nodes():
                if isinstance(node, nodes.LibraryNode):
                    if (node.default_implementation == 'specialize' and (len(
                            set(node.implementations)
                            & set(implementation_prio))) == 0):
                        node.expand(current_sdfg, state)

    # general nodes
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.LibraryNode):
            for impl in implementation_prio:
                if impl in node.implementations:
                    if isinstance(node,
                                  dace.libraries.standard.nodes.reduce.Reduce
                                  ) and node.implementation == 'CUDA (block)':
                        continue
                    node.implementation = impl
                    break

    # reduce nodes
    if device == dtypes.DeviceType.GPU:
        for node, state in sdfg.all_nodes_recursive():
            if isinstance(node, dace.nodes.LibraryNode):
                # Use CUB for device-level reductions
                if ('CUDA (device)' in node.implementations
                        and not is_devicelevel_gpu(state.parent, state, node)
                        and state.scope_dict()[node] is None):
                    node.implementation = 'CUDA (device)'
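
As a brief illustration (a sketch under assumptions, not from the original source): given a DaCe program whose matrix product is lowered to a library node, the function above selects implementations before expansion. The matmul program and the blocklisted name 'MKL' are illustrative.

import dace
from dace import dtypes

@dace.program
def matmul(A: dace.float64[64, 64], B: dace.float64[64, 64],
           C: dace.float64[64, 64]):
    C[:] = A @ B  # lowered to a MatMul library node

sdfg = matmul.to_sdfg()
# Choose the fastest available implementations for CPU, excluding 'MKL'
set_fast_implementations(sdfg, dtypes.DeviceType.CPU, blocklist=['MKL'])
sdfg.expand_library_nodes()
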
Example #6
def enumerate_matches(sdfg: SDFG,
                      pattern: gr.Graph,
                      node_match=type_or_class_match,
                      edge_match=None) -> Iterator[gr.SubgraphView]:
    """
    Returns a generator of subgraphs that match the given subgraph pattern.
    :param sdfg: The SDFG to search in.
    :param pattern: A subgraph to look for.
    :param node_match: An optional function to use for matching nodes.
    :param edge_match: An optional function to use for matching edges.
    :return: Yields SDFG subgraph view objects.
    """
    if len(pattern.nodes()) == 0:
        raise ValueError('Subgraph pattern cannot be empty')

    # Find if the subgraph is within states or SDFGs
    is_interstate = (isinstance(pattern.node(0), SDFGState)
                     or (isinstance(pattern.node(0), type)
                         and pattern.node(0) is SDFGState))

    # Collapse multigraphs into directed graphs
    pattern_digraph = collapse_multigraph_to_nx(pattern)

    # Find matches in all SDFGs and nested SDFGs
    for graph in sdfg.all_sdfgs_recursive():
        if is_interstate:
            graph_matcher = iso.DiGraphMatcher(
                collapse_multigraph_to_nx(graph),
                pattern_digraph,
                node_match=node_match,
                edge_match=edge_match)
            for subgraph in graph_matcher.subgraph_isomorphisms_iter():
                yield gr.SubgraphView(graph,
                                      [graph.node(i) for i in subgraph.keys()])
        else:
            for state in graph.nodes():
                graph_matcher = iso.DiGraphMatcher(
                    collapse_multigraph_to_nx(state),
                    pattern_digraph,
                    node_match=node_match,
                    edge_match=edge_match)
                for subgraph in graph_matcher.subgraph_isomorphisms_iter():
                    yield gr.SubgraphView(
                        state, [state.node(i) for i in subgraph.keys()])
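
A usage sketch under the assumption that the node_path_graph helper from dace.sdfg.utils is available for building patterns and that an sdfg object already exists; the matched node types are illustrative.

from dace.sdfg import nodes, utils as sdutil

# Pattern: a map exit writing to an access node that feeds another map entry
pattern = sdutil.node_path_graph(nodes.MapExit, nodes.AccessNode,
                                 nodes.MapEntry)

for subgraph in enumerate_matches(sdfg, pattern):
    print('Match found in', subgraph.graph, 'with nodes', subgraph.nodes())
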
Example #7
    def apply_pass(
        self, sdfg: SDFG, pipeline_results: Dict[str, Any]
    ) -> Optional[Dict[SDFGState, Optional[Any]]]:
        """
        Applies the pass to states of the given SDFG by calling ``apply`` on each state.
        :param sdfg: The SDFG to apply the pass to.
        :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                                 results as ``{Pass subclass name: returned object from pass}``. If not run in a
                                 pipeline, an empty dictionary is expected.
        :return: A dictionary of ``{state: return value}`` for visited states with a non-None return value, or None
                 if nothing was returned.
        """
        result = {}
        for sd in sdfg.all_sdfgs_recursive():
            for state in sd.nodes():
                retval = self.apply(state, pipeline_results)
                if retval is not None:
                    result[state] = retval

        if not result:
            return None
        return result
Example #8
    def determine_allocation_lifetime(self, top_sdfg: SDFG):
        """
        Determines where (at which scope/state/SDFG) each data descriptor
        will be allocated/deallocated.
        :param top_sdfg: The top-level SDFG to determine for.
        """
        # Gather shared transients
        shared_transients = {}
        for sdfg in top_sdfg.all_sdfgs_recursive():
            shared_transients[sdfg.sdfg_id] = sdfg.shared_transients(
                check_toplevel=False)

        for sdfg, name, desc in top_sdfg.arrays_recursive():
            if not desc.transient:
                continue

            # NOTE: In the code below we infer where a transient should be
            # declared, allocated, and deallocated. The information is stored
            # in the `to_allocate` dictionary. The key of each entry is the
            # scope where one of the above actions must occur, while the value
            # is a tuple containing the following information:
            # 1. The SDFG object that contains the transient.
            # 2. The State id where the action should (approx.) take place.
            # 3. The Access Node id of the transient in the above State.
            # 4. True if declaration should take place, otherwise False.
            # 5. True if allocation should take place, otherwise False.
            # 6. True if deallocation should take place, otherwise False.

            # Possibly confusing control flow below finds the first and last
            # state and access node of the data descriptor, or continues the
            # arrays_recursive() loop
            first_state_instance: int = None
            first_node_instance: nodes.AccessNode = None
            last_state_instance: int = None
            last_node_instance: nodes.AccessNode = None
            first = True
            for state in sdfg.topological_sort():
                id = sdfg.nodes().index(state)
                for node in state.data_nodes():
                    if node.data == name:
                        if first:
                            first_state_instance = id
                            first_node_instance = node
                            first = False
                        last_state_instance = id
                        last_node_instance = node

            # Cases
            if desc.lifetime is dtypes.AllocationLifetime.Persistent:
                # Persistent memory is allocated in initialization code and
                # exists in the library state structure

                # If unused, skip
                if first_node_instance is None:
                    continue

                definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
                self.statestruct.append(definition)

                self.to_allocate[sdfg].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))
                continue
            elif desc.lifetime is dtypes.AllocationLifetime.Global:
                # Global memory is allocated in the beginning of the program
                # exists in the library state structure (to be passed along
                # to the right SDFG)

                # If unused, skip
                if first_node_instance is None:
                    continue

                definition = desc.as_arg(name=f'__{sdfg.sdfg_id}_{name}') + ';'
                self.statestruct.append(definition)

                # self.to_allocate[top_sdfg].append(
                #     (sdfg.sdfg_id, sdfg.node_id(state), node))
                self.to_allocate[top_sdfg].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))
                continue

            # The rest of the cases change the starting scope we attempt to
            # allocate from, since the descriptors may only be allocated higher
            # in the hierarchy (e.g., in the case of GPU global memory inside
            # a kernel).
            alloc_scope: Union[nodes.EntryNode, SDFGState, SDFG] = None
            alloc_state: SDFGState = None
            access_node: nodes.AccessNode = None
            if (name in shared_transients[sdfg.sdfg_id]
                    or desc.lifetime is dtypes.AllocationLifetime.SDFG):
                # SDFG memory and shared transients are allocated in the
                # beginning of their SDFG
                alloc_scope = sdfg
                if first_state_instance is not None:
                    alloc_state = sdfg.nodes()[first_state_instance]
                # If unused, skip
                if first_node_instance is None:
                    continue
            elif desc.lifetime is dtypes.AllocationLifetime.State:
                # State memory is either allocated in the beginning of the
                # containing state or the SDFG (if used in more than one state)
                curstate: SDFGState = None
                multistate = False
                for state in sdfg.nodes():
                    if any(n.data == name for n in state.data_nodes()):
                        if curstate is not None:
                            multistate = True
                            break
                        curstate = state
                if multistate:
                    alloc_scope = sdfg
                else:
                    alloc_scope = curstate
                    alloc_state = curstate
            elif desc.lifetime is dtypes.AllocationLifetime.Scope:
                # Scope memory (default) is either allocated in the innermost
                # scope (e.g., Map, Consume) it is used in (i.e., greatest
                # common denominator), or in the SDFG if used in multiple states
                curscope: Union[nodes.EntryNode, SDFGState] = None
                curstate: SDFGState = None
                multistate = False

                # Does the array appear in inter-state edges?
                for isedge in sdfg.edges():
                    if name in isedge.data.free_symbols:
                        multistate = True

                for state in sdfg.nodes():
                    if multistate:
                        break
                    sdict = state.scope_dict()
                    for node in state.nodes():
                        if not isinstance(node, nodes.AccessNode):
                            continue
                        if node.data != name:
                            continue

                        # If already found in another state, set scope to SDFG
                        if curstate is not None and curstate != state:
                            multistate = True
                            break
                        curstate = state

                        # Current scope (or state object if top-level)
                        scope = sdict[node] or state
                        if curscope is None:
                            curscope = scope
                            continue
                        # States always win
                        if isinstance(scope, SDFGState):
                            curscope = scope
                            continue
                        # Lower/Higher/Disjoint scopes: find common denominator
                        if isinstance(curscope, SDFGState):
                            if scope in curscope.nodes():
                                continue
                        curscope = sdscope.common_parent_scope(
                            sdict, scope, curscope)

                    if multistate:
                        break

                if multistate:
                    alloc_scope = sdfg
                else:
                    alloc_scope = curscope
                    alloc_state = curstate
            else:
                raise TypeError('Unrecognized allocation lifetime "%s"' %
                                desc.lifetime)

            if alloc_scope is None:  # No allocation necessary
                continue

            # If descriptor cannot be allocated in this scope, traverse up the
            # scope tree until it is possible
            cursdfg = sdfg
            curstate = alloc_state
            curscope = alloc_scope
            while not self._can_allocate(cursdfg, curstate, desc, curscope):
                if curscope is None:
                    break
                if isinstance(curscope, nodes.EntryNode):
                    # Go one scope up
                    curscope = curstate.entry_node(curscope)
                    if curscope is None:
                        curscope = curstate
                elif isinstance(curscope, (SDFGState, SDFG)):
                    cursdfg: SDFG = (curscope if isinstance(curscope, SDFG)
                                     else curscope.parent)
                    # Go one SDFG up
                    if cursdfg.parent_nsdfg_node is None:
                        curscope = None
                        curstate = None
                    else:
                        curstate = cursdfg.parent
                        curscope = curstate.entry_node(
                            cursdfg.parent_nsdfg_node)
                else:
                    raise TypeError

            if curscope is None:
                curscope = top_sdfg

            # Check if Array/View is dependent on non-free SDFG symbols
            # NOTE: Tuple is (SDFG, State, Node, declare, allocate, deallocate)
            fsymbols = sdfg.free_symbols.union(sdfg.constants.keys())
            if (not isinstance(curscope, nodes.EntryNode)
                    and utils.is_nonfree_sym_dependent(
                        first_node_instance, desc, alloc_state, fsymbols)):
                # Declare in current (SDFG) scope
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     False, False))
                # Allocate in first State
                # Deallocate in last State
                if first_state_instance != last_state_instance:
                    curscope = sdfg.nodes()[first_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, first_state_instance, first_node_instance,
                         False, True, False))
                    curscope = sdfg.nodes()[last_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, last_state_instance, last_node_instance, False,
                         False, True))
                else:
                    curscope = sdfg.nodes()[first_state_instance]
                    self.to_allocate[curscope].append(
                        (sdfg, first_state_instance, first_node_instance,
                         False, True, True))
            else:
                self.to_allocate[curscope].append(
                    (sdfg, first_state_instance, first_node_instance, True,
                     True, True))
Example #9
    def generate_footer(self, sdfg: SDFG, global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the footer of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        import dace.library
        fname = sdfg.name
        params = sdfg.signature()
        paramnames = sdfg.signature(False, for_call=True)
        initparams = sdfg.signature(with_arrays=False)
        initparamnames = sdfg.signature(False,
                                        for_call=True,
                                        with_arrays=False)

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

        # Instrumentation saving
        if (config.Config.get_bool('instrumentation', 'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '''__state->report.save("{path}/perf", __HASH_{name});'''.
                format(path=sdfg.build_folder.replace('\\', '/'),
                       name=sdfg.name), sdfg)

        # Write closing brace of program
        callsite_stream.write('}', sdfg)

        # Write awkward footer to avoid 'extern "C"' issues
        params_comma = (', ' + params) if params else ''
        initparams_comma = (', ' + initparams) if initparams else ''
        paramnames_comma = (', ' + paramnames) if paramnames else ''
        initparamnames_comma = (', ' +
                                initparamnames) if initparamnames else ''
        callsite_stream.write(
            f'''
DACE_EXPORTED void __program_{fname}({fname}_t *__state{params_comma})
{{
    __program_{fname}_internal(__state{paramnames_comma});
}}''', sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_init_%s(%s_t *__state%s);\n' %
                    (target.target_name, sdfg.name, initparams_comma), sdfg)
            if target.has_finalizer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_exit_%s(%s_t *__state);\n' %
                    (target.target_name, sdfg.name), sdfg)

        callsite_stream.write(
            f"""
DACE_EXPORTED {sdfg.name}_t *__dace_init_{sdfg.name}({initparams})
{{
    int __result = 0;
    {sdfg.name}_t *__state = new {sdfg.name}_t;

            """, sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    '__result |= __dace_init_%s(__state%s);' %
                    (target.target_name, initparamnames_comma), sdfg)
        for env in self.environments:
            init_code = _get_or_eval_sdfg_first_arg(env.init_code, sdfg)
            if init_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(init_code)
                callsite_stream.write("}")

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.init_code:
                callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

        callsite_stream.write(self._initcode.getvalue(), sdfg)

        callsite_stream.write(
            f"""
    if (__result) {{
        delete __state;
        return nullptr;
    }}
    return __state;
}}

DACE_EXPORTED void __dace_exit_{sdfg.name}({sdfg.name}_t *__state)
{{
""", sdfg)

        # Instrumentation saving
        if (not config.Config.get_bool('instrumentation',
                                       'report_each_invocation')
                and len(self._dispatcher.instrumentation) > 1):
            callsite_stream.write(
                '__state->report.save("%s/perf", __HASH_%s);' %
                (sdfg.build_folder.replace('\\', '/'), sdfg.name), sdfg)

        callsite_stream.write(self._exitcode.getvalue(), sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.exit_code:
                callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

        for target in self._dispatcher.used_targets:
            if target.has_finalizer:
                callsite_stream.write(
                    '__dace_exit_%s(__state);' % target.target_name, sdfg)
        for env in reversed(self.environments):
            finalize_code = _get_or_eval_sdfg_first_arg(
                env.finalize_code, sdfg)
            if finalize_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(finalize_code)
                callsite_stream.write("}")

        callsite_stream.write('delete __state;\n}\n', sdfg)
Example #10
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Strict transformations
        * Strict auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    # Strict transformations and loop parallelization
    transformed = True
    while transformed:
        sdfg.apply_strict_transformations(validate=False,
                                          validate_all=validate_all)

        xfh.split_interstate_edges(sdfg)

        # Try to parallelize loops
        l2ms = sdfg.apply_transformations_repeated(LoopToMap,
                                                   strict=True,
                                                   validate=False,
                                                   validate_all=validate_all)
        transformed = l2ms > 0

    # Map fusion
    greedy_fuse(sdfg, validate_all)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_aopt.fpga_global_to_local(sdfg)
        fpga_aopt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        strict=True,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        if isinstance(node, nodes.MapEntry):
            node.map.collapse = len(node.map.range)

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections
    # TODO(later): Set on a per-SDFG basis
    config.Config.set('compiler', 'cpu', 'openmp_sections', value=False)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
Example #11
def match_patterns(sdfg: SDFG,
                   patterns: Union[Type[xf.Transformation],
                                   List[Type[xf.Transformation]]],
                   node_match: Callable[[Any, Any], bool] = type_match,
                   edge_match: Optional[Callable[[Any, Any], bool]] = None,
                   strict: bool = False,
                   metadata: Optional[PatternMetadataType] = None,
                   states: Optional[List[SDFGState]] = None):
    """ Returns a generator of Transformations that match the input SDFG. 
        Ordered by SDFG ID.
        :param sdfg: The SDFG to match in.
        :param patterns: Transformation type (or list thereof) to match.
        :param node_match: Function for checking whether two nodes match.
        :param edge_match: Function for checking whether two edges match.
        :param strict: Only match transformation if strict (i.e., can only
                       improve the performance/reduce complexity of the SDFG).
        :param metadata: Transformation metadata that can be reused.
        :param states: If given, only tries to match single-state 
                       transformations on this list.
        :return: A list of Transformation objects that match.
    """

    if isinstance(patterns, type):
        patterns = [patterns]

    # Collect transformation metadata
    if metadata is not None:
        # Transformation metadata can be evaluated once per apply loop
        interstate_transformations, singlestate_transformations = metadata
    else:
        # Otherwise, precompute all transformation data once
        (interstate_transformations,
         singlestate_transformations) = get_transformation_metadata(patterns)

    # Collect SDFG and nested SDFGs
    sdfgs = sdfg.all_sdfgs_recursive()

    # Try to find transformations on each SDFG
    for tsdfg in sdfgs:
        ###################################
        # Match inter-state transformations
        if len(interstate_transformations) > 0:
            # Collapse multigraph into directed graph in order to use VF2
            digraph = collapse_multigraph_to_nx(tsdfg)

        for xform, expr_idx, nxpattern, matcher in interstate_transformations:
            for subgraph in matcher(digraph, nxpattern, node_match,
                                    edge_match):
                match = _try_to_match_transformation(tsdfg, digraph, subgraph,
                                                     tsdfg, xform, expr_idx,
                                                     nxpattern, -1, strict)
                if match is not None:
                    yield match

        ####################################
        # Match single-state transformations
        if len(singlestate_transformations) == 0:
            continue
        for state_id, state in enumerate(tsdfg.nodes()):
            if states is not None and state not in states:
                continue

            # Collapse multigraph into directed graph in order to use VF2
            digraph = collapse_multigraph_to_nx(state)

            for xform, expr_idx, nxpattern, matcher in singlestate_transformations:
                for subgraph in matcher(digraph, nxpattern, node_match,
                                        edge_match):
                    match = _try_to_match_transformation(
                        state, digraph, subgraph, tsdfg, xform, expr_idx,
                        nxpattern, state_id, strict)
                    if match is not None:
                        yield match
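
A short usage sketch, assuming an sdfg object exists; the transformation classes chosen here are illustrative and the matches are only enumerated, not applied.

from dace.transformation.dataflow import MapCollapse, MapFusion

# Enumerate candidate matches across the SDFG and its nested SDFGs
matches = list(match_patterns(sdfg, [MapFusion, MapCollapse], strict=True))
print(f'Found {len(matches)} candidate transformations')
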
Example #12
def auto_optimize(sdfg: SDFG,
                  device: dtypes.DeviceType,
                  validate: bool = True,
                  validate_all: bool = False,
                  symbols: Dict[str, int] = None) -> SDFG:
    """
    Runs a basic sequence of transformations to optimize a given SDFG to decent
    performance. In particular, performs the following:
        * Simplify
        * Auto-parallelization (loop-to-map)
        * Greedy application of SubgraphFusion
        * Tiled write-conflict resolution (MapTiling -> AccumulateTransient)
        * Tiled stream accumulation (MapTiling -> AccumulateTransient)
        * Collapse all maps to parallelize across all dimensions
        * Set all library nodes to expand to ``fast`` expansion, which calls
          the fastest library on the target device
    :param sdfg: The SDFG to optimize.
    :param device: the device to optimize for.
    :param validate: If True, validates the SDFG after all transformations
                     have been applied.
    :param validate_all: If True, validates the SDFG after every step.
    :param symbols: Optional dict that maps symbols (str/symbolic) to int/float
    :return: The optimized SDFG.
    :note: Operates in-place on the given SDFG.
    :note: This function is still experimental and may harm correctness in
           certain cases. Please report an issue if it does.
    """
    debugprint = config.Config.get_bool('debugprint')

    # Simplification and loop parallelization
    transformed = True
    sdfg.apply_transformations_repeated(TrivialMapElimination,
                                        validate=validate,
                                        validate_all=validate_all)
    while transformed:
        sdfg.simplify(validate=False, validate_all=validate_all)
        for s in sdfg.sdfg_list:
            xfh.split_interstate_edges(s)
        l2ms = sdfg.apply_transformations_repeated(
            (LoopToMap, RefineNestedAccess),
            validate=False,
            validate_all=validate_all)
        transformed = l2ms > 0

    # Collapse maps and eliminate trivial dimensions
    sdfg.simplify()
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)

    # Apply GPU transformations and set library node implementations

    if device == dtypes.DeviceType.GPU:
        sdfg.apply_gpu_transformations()
        sdfg.simplify()

    # fuse subgraphs greedily
    sdfg.simplify()

    greedy_fuse(sdfg, device=device, validate_all=validate_all)

    # fuse stencils greedily
    greedy_fuse(sdfg,
                device=device,
                validate_all=validate_all,
                recursive=False,
                stencil=True)

    if device == dtypes.DeviceType.FPGA:
        # apply FPGA Transformations
        sdfg.apply_fpga_transformations()
        fpga_auto_opt.fpga_global_to_local(sdfg)
        fpga_auto_opt.fpga_rr_interleave_containers_to_banks(sdfg)

        # Set all library nodes to expand to fast library calls
        set_fast_implementations(sdfg, device)
        return sdfg

    # Tiled WCR and streams
    for nsdfg in list(sdfg.all_sdfgs_recursive()):
        tile_wcrs(nsdfg, validate_all)

    # Collapse maps
    sdfg.apply_transformations_repeated(MapCollapse,
                                        validate=False,
                                        validate_all=validate_all)
    for node, _ in sdfg.all_nodes_recursive():
        # Set OMP collapse property to map length
        if isinstance(node, nodes.MapEntry):
            # FORNOW: Leave out
            # node.map.collapse = len(node.map.range)
            pass

    # Set all library nodes to expand to fast library calls
    set_fast_implementations(sdfg, device)

    sdfg.expand_library_nodes()

    # TODO(later): Safe vectorization

    # Disable OpenMP parallel sections on a per-SDFG basis
    for nsdfg in sdfg.all_sdfgs_recursive():
        nsdfg.openmp_sections = False

    if symbols:
        # Specialize for all known symbols
        known_symbols = {}
        for (s, v) in symbols.items():
            if s in sdfg.free_symbols:
                if isinstance(v, (int, float)):
                    known_symbols[s] = v
                elif isinstance(v, sympy.core.numbers.Integer):
                    try:
                        known_symbols[s] = int(v)
                    except TypeError:
                        pass

        if debugprint and len(known_symbols) > 0:
            print("Specializing the SDFG for symbols", known_symbols)
        sdfg.specialize(known_symbols)

    # Set all Default storage types that are constant sized to registers
    move_small_arrays_to_stack(sdfg)
    # Fix storage and allocation properties, e.g., for benchmarking purposes
    # FORNOW: Leave out
    # make_transients_persistent(sdfg, device)

    # Validate at the end
    if validate or validate_all:
        sdfg.validate()

    return sdfg
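
A minimal end-to-end sketch of calling this variant (an assumption-based example, not from the original source; the smooth program, the symbol N, and its value are illustrative):

import dace
from dace import dtypes

N = dace.symbol('N')

@dace.program
def smooth(A: dace.float64[N], B: dace.float64[N]):
    for _ in range(8):
        B[:] = 0.5 * (A + B)

sdfg = smooth.to_sdfg()
sdfg = auto_optimize(sdfg, dtypes.DeviceType.CPU, symbols={'N': 4096})
csdfg = sdfg.compile()
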
Example #13
    def generate_footer(self, sdfg: SDFG, used_environments: Set[str],
                        global_stream: CodeIOStream,
                        callsite_stream: CodeIOStream):
        """ Generate the footer of the frame-code. Code exists in a separate
            function for overriding purposes.
            :param sdfg: The input SDFG.
            :param global_stream: Stream to write to (global).
            :param callsite_stream: Stream to write to (at call site).
        """
        fname = sdfg.name
        params = sdfg.signature()
        paramnames = sdfg.signature(False, for_call=True)
        environments = [
            dace.library.get_environment(env_name)
            for env_name in used_environments
        ]

        # Invoke all instrumentation providers
        for instr in self._dispatcher.instrumentation.values():
            if instr is not None:
                instr.on_sdfg_end(sdfg, callsite_stream, global_stream)

        # Instrumentation saving
        if len(self._dispatcher.instrumentation) > 1:
            callsite_stream.write(
                'dace::perf::report.save("%s/perf");' %
                sdfg.build_folder.replace('\\', '/'), sdfg)

        # Write closing brace of program
        callsite_stream.write('}', sdfg)

        # Write awkward footer to avoid 'extern "C"' issues
        callsite_stream.write(
            """
DACE_EXPORTED void __program_%s(%s)
{
    __program_%s_internal(%s);
}
""" % (fname, params, fname, paramnames), sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_init_%s(%s);\n' %
                    (target.target_name, params), sdfg)
            if target.has_finalizer:
                callsite_stream.write(
                    'DACE_EXPORTED int __dace_exit_%s(%s);\n' %
                    (target.target_name, params), sdfg)

        callsite_stream.write(
            """
DACE_EXPORTED int __dace_init_%s(%s)
{
    int __result = 0;
""" % (sdfg.name, params), sdfg)

        for target in self._dispatcher.used_targets:
            if target.has_initializer:
                callsite_stream.write(
                    '__result |= __dace_init_%s(%s);' %
                    (target.target_name, paramnames), sdfg)
        for env in environments:
            if env.init_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(env.init_code)
                callsite_stream.write("}")

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.init_code:
                callsite_stream.write(codeblock_to_cpp(sd.init_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.init_code['frame']), sd)

        callsite_stream.write(self._initcode.getvalue(), sdfg)

        callsite_stream.write(
            """
    return __result;
}

DACE_EXPORTED void __dace_exit_%s(%s)
{
""" % (sdfg.name, params), sdfg)

        callsite_stream.write(self._exitcode.getvalue(), sdfg)

        for sd in sdfg.all_sdfgs_recursive():
            if None in sd.exit_code:
                callsite_stream.write(codeblock_to_cpp(sd.exit_code[None]), sd)
            callsite_stream.write(codeblock_to_cpp(sd.exit_code['frame']), sd)

        for target in self._dispatcher.used_targets:
            if target.has_finalizer:
                callsite_stream.write(
                    '__dace_exit_%s(%s);' % (target.target_name, paramnames),
                    sdfg)
        for env in environments:
            if env.finalize_code:
                callsite_stream.write("{  // Environment: " + env.__name__,
                                      sdfg)
                callsite_stream.write(env.finalize_code)
                callsite_stream.write("}")

        callsite_stream.write('}\n', sdfg)
Example #14
def make_transients_persistent(sdfg: SDFG,
                               device: dtypes.DeviceType,
                               toplevel_only: bool = True) -> None:
    '''
    Helper function that changes several storage and scheduling properties:

    - Makes non-view array lifetimes persistent, with some restrictions
      depending on the device.
    - Resets nonatomic WCR edges on GPU.

    The only arrays that are made persistent by default are ones that do not
    exist inside a scope (and thus may be allocated multiple times), and whose
    symbols are always given as parameters to the SDFG (so that they can be
    allocated in a persistent manner).

    :param sdfg: The SDFG to modify in-place.
    :param device: Device type to optimize for.
    :param toplevel_only: If True, only converts access nodes that do not
                          appear in any scope.
    '''
    for nsdfg in sdfg.all_sdfgs_recursive():
        fsyms: Set[str] = nsdfg.free_symbols
        persistent: Set[str] = set()
        not_persistent: Set[str] = set()

        for state in nsdfg.nodes():
            for dnode in state.data_nodes():
                if dnode.data in not_persistent:
                    continue
                desc = dnode.desc(nsdfg)
                # Only convert arrays and scalars that are not registers
                if not desc.transient or type(desc) not in {
                        dt.Array, dt.Scalar
                }:
                    not_persistent.add(dnode.data)
                    continue
                if desc.storage == dtypes.StorageType.Register:
                    not_persistent.add(dnode.data)
                    continue
                # Only convert arrays whose size depends only on SDFG parameters
                try:
                    if set(map(str, desc.total_size.free_symbols)) - fsyms:
                        not_persistent.add(dnode.data)
                        continue
                except AttributeError:  # total_size is an integer / has no free symbols
                    pass

                # Only convert arrays with top-level access nodes
                if xfh.get_parent_map(state, dnode) is not None:
                    if toplevel_only:
                        not_persistent.add(dnode.data)
                        continue
                    elif desc.lifetime == dtypes.AllocationLifetime.Scope:
                        not_persistent.add(dnode.data)
                        continue

                persistent.add(dnode.data)

        for aname in (persistent - not_persistent):
            nsdfg.arrays[aname].lifetime = dtypes.AllocationLifetime.Persistent

    if device == dtypes.DeviceType.GPU:
        # Reset nonatomic WCR edges
        for n, _ in sdfg.all_nodes_recursive():
            if isinstance(n, SDFGState):
                for edge in n.edges():
                    edge.data.wcr_nonatomic = False