def optimize_for_gpu(sdfg: dace.SDFG, m: int, n: int, k: int):
    """
    Optimize the matrix multiplication example for GPUs.

    The SDFG is transformed in place. As a side effect, the global
    ``compiler.default_data_types`` configuration entry is set to ``'C'``.

    :param sdfg: The matrix multiplication SDFG to transform.
    :param m: Problem size; only used to check divisibility by 64.
    :param n: Problem size; only used to check divisibility by 64.
    :param k: Problem size; only used to check divisibility by 8.
    """
    # Ensure integers are 32-bit by default
    dace.Config.set('compiler', 'default_data_types', value='C')

    # Fuse the map and reduce nodes, then apply the GPU transformation
    sdfg.apply_transformations(MapReduceFusion)
    sdfg.apply_gpu_transformations()

    # Tile the multiplication map twice: first with (64, 64, 8), then (8, 4)
    mult_map = find_map_by_param(sdfg, 'k')
    even_tiles = (m % 64 == 0) and (n % 64 == 0) and (k % 8 == 0)
    for tile_sizes in (dict(i=64, j=64, k=8), dict(i=8, j=4)):
        xfutil.tile(sdfg, mult_map, even_tiles, True, **tile_sizes)

    # Create kernel schedule by collapsing and reordering maps.
    # All four tile maps are looked up before any of them is collapsed.
    grid_i, grid_j, block_i, block_j = [
        find_map_by_param(sdfg, param)
        for param in ('tile_i', 'tile_j', 'tile1_i', 'tile1_j')
    ]
    for outer, inner in ((grid_i, grid_j), (block_i, block_j)):
        MapCollapse.apply_to(sdfg,
                             outer_map_entry=outer,
                             inner_map_entry=inner,
                             permissive=True)
    # Look the collapsed map up again before assigning its schedule
    block_map = find_map_by_param(sdfg, 'tile1_i')
    block_map.map.schedule = dace.ScheduleType.GPU_ThreadBlock

    # Add local storage (shared memory) for A and B on GPU.
    # Both transients are created first; their storage is assigned afterwards.
    k_tile_map = find_map_by_param(sdfg, 'tile_k')
    shared_a, shared_b = [
        InLocalStorage.apply_to(sdfg,
                                dict(array=name),
                                node_a=k_tile_map,
                                node_b=block_map) for name in ('A', 'B')
    ]
    for shared in (shared_a, shared_b):
        sdfg.arrays[shared.data].storage = dace.StorageType.GPU_Shared

    # Add local storage (registers) for A and B
    warp_map, thread_map = xfutil.extract_map_dims(
        sdfg, find_map_by_param(sdfg, 'k'), [2])
    for name in ('trans_gpu_A', 'trans_gpu_B'):
        InLocalStorage.apply_to(sdfg,
                                dict(array=name),
                                node_a=warp_map,
                                node_b=thread_map)

    # Add local storage (registers) for C
    kernel_state = next(st for st in sdfg.nodes() if warp_map in st.nodes())
    AccumulateTransient.apply_to(
        sdfg,
        map_exit=kernel_state.exit_node(warp_map),
        outer_map_exit=kernel_state.exit_node(block_map))

    # Set C tile to zero on allocation
    c_tile = next(node for node in kernel_state.data_nodes()
                  if node.data == 'trans_gpu_C')
    c_tile.setzero = True

    # Unroll microkernel maps
    thread_map.map.unroll = True

    # Apply double-buffering on shared memory
    DoubleBuffering.apply_to(sdfg, map_entry=k_tile_map, transient=shared_a)
def tile_wcrs(graph_or_subgraph: GraphViewType, validate_all: bool, prefer_partial_parallelism: bool = None) -> None:
    """
    Tiles parallel write-conflict resolution maps in an SDFG, state,
    or subgraphs thereof. Reduces the number of atomic operations by tiling
    and introducing transient arrays to accumulate atomics on.

    :param graph_or_subgraph: The SDFG/state/subgraph to optimize within.
    :param validate_all: If True, runs SDFG validation after every tiling.
    :param prefer_partial_parallelism: If set, prefers extracting
                                       non-conflicted map dimensions over
                                       tiling WCR map (may not perform well
                                       if parallel dimensions are small).
                                       If None, the value is read from the
                                       ``optimizer.autotile_partial_parallelism``
                                       configuration entry.
    :raises TypeError: If the given graph is neither an SDFG, a state, nor a
                       subgraph view of either.
    :note: This function operates in-place.
    """
    # Avoid import loops
    from dace.codegen.targets import cpp
    from dace.frontend import operations
    from dace.transformation import dataflow, helpers as xfh

    # Determine on which nodes to run the operation
    graph = graph_or_subgraph
    if isinstance(graph_or_subgraph, gr.SubgraphView):
        graph = graph_or_subgraph.graph
    if isinstance(graph, SDFG):
        for state in graph_or_subgraph.nodes():
            # Forward every option so that an explicit caller preference is
            # not silently replaced by the configuration default
            tile_wcrs(state, validate_all, prefer_partial_parallelism)
        return
    if not isinstance(graph, SDFGState):
        raise TypeError('Graph must be a state, an SDFG, or a subgraph of either')
    sdfg = graph.parent

    def wcr_identity(memlet):
        """ Returns the identity value of the memlet's WCR, or None if it
            cannot be inferred. """
        redtype = operations.detect_reduction_type(memlet.wcr)
        dtype = sdfg.arrays[memlet.data].dtype
        return dtypes.reduction_identity(dtype, redtype)

    edges_to_consider: Set[Tuple[gr.MultiConnectorEdge[Memlet], nodes.MapEntry]] = set()
    for edge in graph_or_subgraph.edges():
        if edge.data.wcr is None:
            continue
        if (isinstance(edge.src, (nodes.MapExit, nodes.NestedSDFG)) or isinstance(edge.dst, nodes.MapEntry)):
            # Do not consider intermediate edges
            continue
        reason = cpp.is_write_conflicted_with_reason(graph, edge)
        if reason is None or not isinstance(reason, nodes.MapEntry):
            # Do not consider edges that will not generate atomics or
            # atomics we cannot transform
            continue
        if reason not in graph_or_subgraph.nodes():
            # Skip if conflict exists outside of nested SDFG
            continue

        # Check if identity value can be inferred
        if wcr_identity(edge.data) is None:
            continue

        edges_to_consider.add((edge, reason))

    tile_size = config.Config.get('optimizer', 'autotile_size')
    debugprint = config.Config.get_bool('debugprint')
    if prefer_partial_parallelism is None:
        prefer_partial_parallelism = config.Config.get_bool('optimizer', 'autotile_partial_parallelism')

    maps_to_consider: Set[nodes.MapEntry] = {me for _, me in edges_to_consider}

    transformed: Set[nodes.MapEntry] = set()

    # Heuristic: If the map is only partially conflicted, extract
    # parallel dimensions instead of tiling
    if prefer_partial_parallelism:
        for mapentry in maps_to_consider:
            # Check the write-conflicts of all WCR edges in map
            conflicts: Set[str] = set()
            for edge, me in edges_to_consider:
                if me is not mapentry:
                    continue
                conflicts |= set(cpp.write_conflicted_map_params(mapentry, edge))

            nonconflicted_dims = set(mapentry.params) - conflicts
            if not nonconflicted_dims:
                continue

            dims = [i for i, p in enumerate(mapentry.params) if p in nonconflicted_dims]
            # NOTE: "(x < y) == True" is deliberate; the sizes may be symbolic
            # and the test must only pass when it is "definitely True"
            if ((dt._prod(s for i, s in enumerate(mapentry.range.size()) if i in dims) < tile_size) == True):
                # Map has a small range, extracting parallelism may not be
                # beneficial
                continue
            xfh.extract_map_dims(sdfg, mapentry, dims)
            transformed.add(mapentry)
            if validate_all:
                sdfg.validate()

    # Tile and accumulate other not-transformed maps
    for edge, mapentry in edges_to_consider:
        if mapentry in transformed:
            continue
        transformed.add(mapentry)

        # NOTE: The test "(x < y) == True" below is crafted for SymPy
        # to be "definitely True"
        if all((s < tile_size) == True for s in mapentry.map.range.size()):
            # If smaller than tile size, don't transform and instead
            # make map sequential
            if debugprint:
                print(f'Making map "{mapentry}" sequential due to being '
                      'smaller than tile size')
            mapentry.map.schedule = dtypes.ScheduleType.Sequential
            continue

        # MapTiling -> AccumulateTransient / AccumulateStream
        outer_mapentry = dataflow.MapTiling.apply_to(sdfg, dict(tile_sizes=(tile_size, )), map_entry=mapentry)

        # Transform all outgoing WCR and stream edges
        mapexit = graph.exit_node(mapentry)
        outer_mapexit = graph.exit_node(outer_mapentry)

        # Tuple of (transformation type, options, pattern)
        to_apply: Tuple[Union[dataflow.StreamTransient, dataflow.AccumulateTransient], Dict[str, Any],
                        Dict[str, nodes.Node]] = None
        for e in graph.out_edges(mapexit):
            if e.data.is_empty():
                # Skip empty memlets before the data descriptor lookup below,
                # which would otherwise fail for them
                continue

            if isinstance(sdfg.arrays[e.data.data], dt.Stream):
                mpath = graph.memlet_path(e)
                tasklet = mpath[0].src
                if not isinstance(tasklet, nodes.Tasklet) or len(mpath) != 3:
                    # TODO(later): Implement StreamTransient independently of tasklet
                    continue
                candidate = (dataflow.StreamTransient, {},
                             dict(tasklet=tasklet, map_exit=mapexit, outer_map_exit=outer_mapexit))
            else:
                if (e.data.wcr is None or e.data.wcr_nonatomic
                        or (e.data.dst_subset is not None and e.data.dst_subset.num_elements() > 0
                            and e.data.dynamic)):
                    continue

                identity = wcr_identity(e.data)
                if identity is None:  # Cannot infer identity value
                    continue
                candidate = (dataflow.AccumulateTransient, dict(identity=identity, array=e.data.data),
                             dict(map_exit=mapexit, outer_map_exit=outer_mapexit))

            # Make transient only if there is one WCR/stream
            if to_apply is not None:
                to_apply = None
                break
            to_apply = candidate

        if to_apply is not None:
            xform, opts, pattern = to_apply
            xform.apply_to(sdfg, options=opts, **pattern)

        if validate_all:
            sdfg.validate()

    if debugprint and transformed:
        print(f'Optimized {len(transformed)} write-conflicted maps')