Exemple #1
0
def tile_wcrs(graph_or_subgraph: GraphViewType,
              validate_all: bool,
              prefer_partial_parallelism: bool = None) -> None:
    """
    Tiles parallel write-conflict resolution maps in an SDFG, state,
    or subgraphs thereof. Reduces the number of atomic operations by tiling
    and introducing transient arrays to accumulate atomics on.
    :param graph_or_subgraph: The SDFG/state/subgraph to optimize within.
    :param validate_all: If True, runs SDFG validation after every tiling.
    :param prefer_partial_parallelism: If set, prefers extracting non-conflicted
                                       map dimensions over tiling WCR map (may
                                       not perform well if parallel dimensions
                                       are small).
    :note: This function operates in-place.
    """
    # Avoid import loops
    from dace.codegen.targets import cpp
    from dace.frontend import operations
    from dace.transformation import dataflow, helpers as xfh

    # Determine on which nodes to run the operation
    graph = graph_or_subgraph
    if isinstance(graph_or_subgraph, gr.SubgraphView):
        graph = graph_or_subgraph.graph
    if isinstance(graph, SDFG):
        for state in graph_or_subgraph.nodes():
            tile_wcrs(state, validate_all)
        return
    if not isinstance(graph, SDFGState):
        raise TypeError(
            'Graph must be a state, an SDFG, or a subgraph of either')
    sdfg = graph.parent

    edges_to_consider: Set[Tuple[gr.MultiConnectorEdge[Memlet],
                                 nodes.MapEntry]] = set()
    for edge in graph_or_subgraph.edges():
        if edge.data.wcr is not None:
            if (isinstance(edge.src, (nodes.MapExit, nodes.NestedSDFG))
                    or isinstance(edge.dst, nodes.MapEntry)):
                # Do not consider intermediate edges
                continue
            reason = cpp.is_write_conflicted_with_reason(graph, edge)
            if reason is None or not isinstance(reason, nodes.MapEntry):
                # Do not consider edges that will not generate atomics or
                # atomics we cannot transform
                continue
            if reason not in graph_or_subgraph.nodes():
                # Skip if conflict exists outside of nested SDFG
                continue

            # Check if identity value can be inferred
            redtype = operations.detect_reduction_type(edge.data.wcr)
            dtype = sdfg.arrays[edge.data.data].dtype
            identity = dtypes.reduction_identity(dtype, redtype)
            if identity is None:  # Cannot infer identity value
                continue

            edges_to_consider.add((edge, reason))

    tile_size = config.Config.get('optimizer', 'autotile_size')
    debugprint = config.Config.get_bool('debugprint')
    if prefer_partial_parallelism is None:
        prefer_partial_parallelism = config.Config.get_bool(
            'optimizer', 'autotile_partial_parallelism')

    maps_to_consider: Set[nodes.MapEntry] = set(me
                                                for _, me in edges_to_consider)

    transformed: Set[nodes.MapEntry] = set()

    # Heuristic: If the map is only partially conflicted, extract
    # parallel dimensions instead of tiling
    if prefer_partial_parallelism:
        for mapentry in maps_to_consider:
            # Check the write-conflicts of all WCR edges in map
            conflicts: Set[str] = set()
            for edge, me in edges_to_consider:
                if me is not mapentry:
                    continue
                conflicts |= set(
                    cpp.write_conflicted_map_params(mapentry, edge))

            nonconflicted_dims = set(mapentry.params) - conflicts
            if nonconflicted_dims:
                dims = [
                    i for i, p in enumerate(mapentry.params)
                    if p in nonconflicted_dims
                ]
                if ((dt._prod(s for i, s in enumerate(mapentry.range.size())
                              if i in dims) < tile_size) == True):
                    # Map has a small range, extracting parallelism may not be
                    # beneficial
                    continue
                xfh.extract_map_dims(sdfg, mapentry, dims)
                transformed.add(mapentry)

    # Tile and accumulate other not-transformed maps
    for edge, mapentry in edges_to_consider:
        if mapentry in transformed:
            continue
        transformed.add(mapentry)

        # NOTE: The test "(x < y) == True" below is crafted for SymPy
        # to be "definitely True"
        if all((s < tile_size) == True for s in mapentry.map.range.size()):
            # If smaller than tile size, don't transform and instead
            # make map sequential
            if debugprint:
                print(f'Making map "{mapentry}" sequential due to being '
                      'smaller than tile size')
            mapentry.map.schedule = dtypes.ScheduleType.Sequential
            continue

        # MapTiling -> AccumulateTransient / AccumulateStream
        outer_mapentry = dataflow.MapTiling.apply_to(
            sdfg, dict(tile_sizes=(tile_size, )), map_entry=mapentry)

        # Transform all outgoing WCR and stream edges
        mapexit = graph.exit_node(mapentry)
        outer_mapexit = graph.exit_node(outer_mapentry)

        # Tuple of (transformation type, options, pattern)
        to_apply: Tuple[Union[dataflow.StreamTransient,
                              dataflow.AccumulateTransient], Dict[str, Any],
                        Dict[str, nodes.Node]] = None
        for e in graph.out_edges(mapexit):
            if isinstance(sdfg.arrays[e.data.data], dt.Stream):
                mpath = graph.memlet_path(e)
                tasklet = mpath[0].src
                if not isinstance(tasklet, nodes.Tasklet) or len(mpath) != 3:
                    # TODO(later): Implement StreamTransient independently of tasklet
                    continue

                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.StreamTransient, {},
                            dict(tasklet=tasklet,
                                 map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
            else:
                if (e.data.is_empty() or e.data.wcr is None
                        or e.data.wcr_nonatomic
                        or (e.data.dst_subset is not None
                            and e.data.dst_subset.num_elements() > 0
                            and e.data.dynamic)):
                    continue

                dtype = sdfg.arrays[e.data.data].dtype
                redtype = operations.detect_reduction_type(e.data.wcr)
                identity = dtypes.reduction_identity(dtype, redtype)
                if identity is None:  # Cannot infer identity value
                    continue
                # Make transient only if there is one WCR/stream
                if to_apply is not None:
                    to_apply = None
                    break

                to_apply = (dataflow.AccumulateTransient,
                            dict(identity=identity, array=e.data.data),
                            dict(map_exit=mapexit,
                                 outer_map_exit=outer_mapexit))
        if to_apply is not None:
            xform, opts, pattern = to_apply
            xform.apply_to(sdfg, options=opts, **pattern)

    if debugprint and len(transformed) > 0:
        print(f'Optimized {len(transformed)} write-conflicted maps')
    def apply(self, sdfg: SDFG) -> None:
        graph: SDFGState = sdfg.nodes()[self.state_id]

        inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
            GPUMultiTransformMap._map_entry]]

        number_of_gpus = self.number_of_gpus
        ngpus = Config.get("compiler", "cuda", "max_number_gpus")
        if (number_of_gpus == None):
            number_of_gpus = ngpus
        if number_of_gpus > ngpus:
            raise ValueError(
                'Requesting more gpus than specified in the dace config')

        # Avoiding import loops
        from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                                  OutLocalStorage,
                                                  AccumulateTransient)

        # The user has responsibility for the implementation of a Library node.
        scope_subgraph = graph.scope_subgraph(inner_map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                warnings.warn(
                    'Node %s is a library node, make sure to manually set the '
                    'implementation to a GPU compliant specialization.' % node)

        # Tile map into number_of_gpus tiles
        outer_map: nodes.Map = StripMining.apply_to(
            sdfg,
            dict(dim_idx=-1,
                 new_dim_prefix=self.new_dim_prefix,
                 tile_size=number_of_gpus,
                 tiling_type=dtypes.TilingType.NumberOfTiles),
            _map_entry=inner_map_entry)

        outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
        inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
        outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

        # Change map schedules
        inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
        outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

        symbolic_gpu_id = outer_map.params[0]

        # Add the parameter of the outer map
        for node in graph.successors(inner_map_entry):
            if isinstance(node, nodes.NestedSDFG):
                map_syms = inner_map_entry.range.free_symbols
                for sym in map_syms:
                    symname = str(sym)
                    if symname not in node.symbol_mapping.keys():
                        node.symbol_mapping[symname] = sym
                        node.sdfg.symbols[symname] = graph.symbols_defined_at(
                            node)[symname]

        # Add transient Data leading to the inner map
        prefix = self.new_transient_prefix
        for node in graph.predecessors(outer_map_entry):
            # Only AccessNodes are relevant
            if (isinstance(node, nodes.AccessNode)
                    and not (self.skip_scalar
                             and isinstance(node.desc(sdfg), Scalar))):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                in_data_node = InLocalStorage.apply_to(sdfg,
                                                       dict(array=node.data,
                                                            prefix=prefix),
                                                       verify=False,
                                                       save=False,
                                                       node_a=outer_map_entry,
                                                       node_b=inner_map_entry)
                in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        wcr_data: Dict[str, Any] = {}
        # Add transient Data leading to the outer map
        for edge in graph.in_edges(outer_map_exit):
            node = graph.memlet_path(edge)[-1].dst
            if isinstance(node, nodes.AccessNode):
                data_name = node.data
                # Transients with write-conflict resolution need to be
                # collected first as AccumulateTransient creates a nestedSDFG
                if edge.data.wcr is not None:
                    dtype = sdfg.arrays[data_name].dtype
                    redtype = operations.detect_reduction_type(edge.data.wcr)
                    # Custom reduction can not have an accumulate transient,
                    # as the accumulation from the transient to the outer
                    # storage is not defined.
                    if redtype == dtypes.ReductionType.Custom:
                        warnings.warn(
                            'Using custom reductions in a GPUMultitransformed '
                            'Map only works for a small data volume. For large '
                            'volume there is no guarantee.')
                        continue
                    identity = dtypes.reduction_identity(dtype, redtype)
                    wcr_data[data_name] = identity
                elif (not isinstance(node.desc(sdfg), Scalar)
                      or not self.skip_scalar):
                    if self.use_p2p and node.desc(
                            sdfg).storage is dtypes.StorageType.GPU_Global:
                        continue
                    # Transients without write-conflict resolution
                    if prefix + '_' + data_name in sdfg.arrays:
                        create_array = False
                    else:
                        create_array = True
                    out_data_node = OutLocalStorage.apply_to(
                        sdfg,
                        dict(array=data_name,
                             prefix=prefix,
                             create_array=create_array),
                        verify=False,
                        save=False,
                        node_a=inner_map_exit,
                        node_b=outer_map_exit)
                    out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    out_data_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global

        # Add Transients for write-conflict resolution
        if len(wcr_data) != 0:
            nsdfg = AccumulateTransient.apply_to(
                sdfg,
                options=dict(array_identity_dict=wcr_data, prefix=prefix),
                map_exit=inner_map_exit,
                outer_map_exit=outer_map_exit)
            nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
            nsdfg.location['gpu'] = symbolic_gpu_id
            for transient_node in graph.successors(nsdfg):
                if isinstance(transient_node, nodes.AccessNode):
                    transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    transient_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global
                    nsdfg.sdfg.arrays[
                        transient_node.label].location['gpu'] = symbolic_gpu_id
                    nsdfg.sdfg.arrays[
                        transient_node.
                        label].storage = dtypes.StorageType.GPU_Global
            infer_types.set_default_schedule_storage_types_and_location(
                nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
                symbolic_gpu_id)

        # Remove the parameter of the outer_map from the sdfg symbols,
        # as it got added as a symbol in StripMining.
        if outer_map.params[0] in sdfg.free_symbols:
            sdfg.remove_symbol(outer_map.params[0])