def tile(sdfg: SDFG, map_entry: nodes.MapEntry, divides_evenly: bool, skew: bool,
         **tile_sizes: symbolic.SymbolicType):
    """
    Tile a Map scope by applying one StripMining pass per requested
    parameter, in the order the tile sizes were passed.

    :param sdfg: The SDFG where the map resides.
    :param map_entry: The map entry node to tile.
    :param divides_evenly: If True, skips pre/postamble for cases where the
                           map dimension is not a multiple of the tile size.
    :param skew: If True, skews the tiled map to start from zero. Helps
                 compilers improve performance in certain cases.
    :param tile_sizes: An ordered dictionary of the map parameter names to
                       tile and their respective tile size (which can be
                       symbolic expressions).
    """
    # Imported at function scope to avoid an import loop
    from dace.transformation.dataflow import StripMining

    for param_name, size in tile_sizes.items():
        # The dimension index is looked up by name on every pass, and the
        # tile size is handed over in its string form.
        options = {
            'dim_idx': map_entry.params.index(param_name),
            'tile_size': str(size),
            'divides_evenly': divides_evenly,
            'skew': skew,
        }
        StripMining.apply_to(sdfg, options, map_entry=map_entry)
def tile(sdfg: dace.SDFG, map_entry: dace.nodes.MapEntry, divides_evenly: bool, skew: bool,
         **tile_sizes: dace.symbolic.SymbolicType):
    """
    Tile a Map scope by the given sizes, applying StripMining once per
    entry of ``tile_sizes`` in iteration order.

    :param sdfg: The SDFG passed to ``StripMining.apply_to``.
    :param map_entry: The map entry node to tile; its ``params`` list is
                      used to resolve each parameter name to an index.
    :param divides_evenly: Forwarded unchanged as the ``divides_evenly``
                           option of StripMining.
    :param skew: Forwarded unchanged as the ``skew`` option of StripMining.
    :param tile_sizes: Mapping of map parameter names to tile sizes. Each
                       size is converted with ``str`` before being passed on.
    """
    for param_name, size in tile_sizes.items():
        options = {
            'dim_idx': map_entry.params.index(param_name),
            'tile_size': str(size),
            'divides_evenly': divides_evenly,
            'skew': skew,
        }
        StripMining.apply_to(sdfg, options, _map_entry=map_entry)
def apply(self, sdfg: SDFG) -> None:
    """
    Split the matched map into one tile per GPU and stage its data through
    per-GPU transients.

    Steps, in order:

    1. Resolve the number of GPUs (falling back to the
       ``compiler.cuda.max_number_gpus`` configuration entry when
       ``self.number_of_gpus`` is None).
    2. Strip-mine the matched map into that many tiles; the new outer map
       is scheduled ``GPU_Multidevice`` and the matched (inner) map
       ``GPU_Device``.
    3. Forward free symbols of the inner map range to nested SDFGs that
       directly follow the inner map entry.
    4. Insert ``GPU_Global`` input transients (InLocalStorage) and output
       transients (OutLocalStorage) located on the GPU identified by the
       outer map parameter.
    5. Route outputs with a non-custom write-conflict resolution through
       AccumulateTransient.
    6. Remove the outer map parameter from the SDFG symbols if it shows up
       as a free symbol.

    :param sdfg: The SDFG containing the matched map.
    :raises ValueError: If more GPUs are requested than the configuration
                        entry allows.
    """
    graph: SDFGState = sdfg.nodes()[self.state_id]
    inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[GPUMultiTransformMap._map_entry]]

    number_of_gpus = self.number_of_gpus
    ngpus = Config.get("compiler", "cuda", "max_number_gpus")
    # Identity comparison: None is a singleton and ``==`` may be overloaded
    if number_of_gpus is None:
        number_of_gpus = ngpus
    if number_of_gpus > ngpus:
        raise ValueError('Requesting more gpus than specified in the dace config')

    # Avoiding import loops
    from dace.transformation.dataflow import (StripMining, InLocalStorage, OutLocalStorage, AccumulateTransient)

    # The user has responsibility for the implementation of a Library node.
    scope_subgraph = graph.scope_subgraph(inner_map_entry)
    for node in scope_subgraph.nodes():
        if isinstance(node, nodes.LibraryNode):
            warnings.warn('Node %s is a library node, make sure to manually set the '
                          'implementation to a GPU compliant specialization.' % node)

    # Tile map into number_of_gpus tiles
    outer_map: nodes.Map = StripMining.apply_to(sdfg,
                                                dict(dim_idx=-1,
                                                     new_dim_prefix=self.new_dim_prefix,
                                                     tile_size=number_of_gpus,
                                                     tiling_type=dtypes.TilingType.NumberOfTiles),
                                                _map_entry=inner_map_entry)

    outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
    inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
    outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

    # Change map schedules
    inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
    outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

    # The first parameter of the outer map serves as the GPU identifier
    symbolic_gpu_id = outer_map.params[0]

    # Add the parameter of the outer map
    for node in graph.successors(inner_map_entry):
        if isinstance(node, nodes.NestedSDFG):
            map_syms = inner_map_entry.range.free_symbols
            for sym in map_syms:
                symname = str(sym)
                if symname not in node.symbol_mapping:
                    node.symbol_mapping[symname] = sym
                    node.sdfg.symbols[symname] = graph.symbols_defined_at(node)[symname]

    # Add transient Data leading to the inner map
    prefix = self.new_transient_prefix
    for node in graph.predecessors(outer_map_entry):
        # Only AccessNodes are relevant
        if (isinstance(node, nodes.AccessNode)
                and not (self.skip_scalar and isinstance(node.desc(sdfg), Scalar))):
            # With peer-to-peer access enabled, data already in GPU global
            # memory is left untouched.
            if self.use_p2p and node.desc(sdfg).storage is dtypes.StorageType.GPU_Global:
                continue

            in_data_node = InLocalStorage.apply_to(sdfg,
                                                   dict(array=node.data, prefix=prefix),
                                                   verify=False,
                                                   save=False,
                                                   node_a=outer_map_entry,
                                                   node_b=inner_map_entry)
            in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
            in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    wcr_data: Dict[str, Any] = {}
    # Add transient Data leading to the outer map
    for edge in graph.in_edges(outer_map_exit):
        node = graph.memlet_path(edge)[-1].dst
        if isinstance(node, nodes.AccessNode):
            data_name = node.data
            # Transients with write-conflict resolution need to be
            # collected first as AccumulateTransient creates a nestedSDFG
            if edge.data.wcr is not None:
                dtype = sdfg.arrays[data_name].dtype
                redtype = operations.detect_reduction_type(edge.data.wcr)
                # Custom reduction can not have an accumulate transient,
                # as the accumulation from the transient to the outer
                # storage is not defined.
                if redtype == dtypes.ReductionType.Custom:
                    warnings.warn('Using custom reductions in a GPUMultitransformed '
                                  'Map only works for a small data volume. For large '
                                  'volume there is no guarantee.')
                    continue
                identity = dtypes.reduction_identity(dtype, redtype)
                wcr_data[data_name] = identity
            elif (not isinstance(node.desc(sdfg), Scalar) or not self.skip_scalar):
                if self.use_p2p and node.desc(sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue
                # Transients without write-conflict resolution; only
                # create the array if it is not already registered.
                create_array = prefix + '_' + data_name not in sdfg.arrays
                out_data_node = OutLocalStorage.apply_to(sdfg,
                                                         dict(array=data_name,
                                                              prefix=prefix,
                                                              create_array=create_array),
                                                         verify=False,
                                                         save=False,
                                                         node_a=inner_map_exit,
                                                         node_b=outer_map_exit)
                out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                out_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

    # Add Transients for write-conflict resolution
    if wcr_data:
        nsdfg = AccumulateTransient.apply_to(sdfg,
                                             options=dict(array_identity_dict=wcr_data, prefix=prefix),
                                             map_exit=inner_map_exit,
                                             outer_map_exit=outer_map_exit)
        nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
        nsdfg.location['gpu'] = symbolic_gpu_id
        for transient_node in graph.successors(nsdfg):
            if isinstance(transient_node, nodes.AccessNode):
                # Update both the outer descriptor and the one of the same
                # name inside the nested SDFG.
                transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                transient_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

                nsdfg.sdfg.arrays[transient_node.label].location['gpu'] = symbolic_gpu_id
                nsdfg.sdfg.arrays[transient_node.label].storage = dtypes.StorageType.GPU_Global
        infer_types.set_default_schedule_storage_types_and_location(nsdfg.sdfg,
                                                                    dtypes.ScheduleType.GPU_Multidevice,
                                                                    symbolic_gpu_id)

    # Remove the parameter of the outer_map from the sdfg symbols,
    # as it got added as a symbol in StripMining.
    if outer_map.params[0] in sdfg.free_symbols:
        sdfg.remove_symbol(outer_map.params[0])
def apply(self, sdfg):
    """
    Strip-mine the matched map for MPI and stage its inputs and outputs
    through local storage.

    The matched map is strip-mined along its last dimension with a tile
    size of ``(<number of elements>/__dace_comm_size)``. The enclosing map
    that results is given the MPI schedule. An InLocalStorage transformation
    is then applied for every edge that reaches the matched map entry from
    an entry node, and an OutLocalStorage transformation for every edge
    leaving the outer map exit.

    :param sdfg: The SDFG containing the matched map.
    :raises ValueError: If no code node is directly connected to the
                        outputs of the matched map entry.
    """
    state = sdfg.nodes()[self.state_id]
    map_entry = state.nodes()[self.subgraph[MPITransformMap._map_entry]]

    # Avoiding import loops
    from dace.transformation.dataflow import StripMining
    from dace.transformation.dataflow.stream_transient import (InLocalStorage, OutLocalStorage)
    # NOTE(review): ``labeling`` is not referenced below; kept because it is
    # unclear from here whether the import is needed for its side effects.
    from dace.graph import labeling

    num_elements_expr = str(map_entry.map.range.num_elements())

    # Tile the last map dimension by the communicator size
    strip_mining = StripMining(sdfg.sdfg_list.index(sdfg), self.state_id,
                               {StripMining._map_entry: self.subgraph[MPITransformMap._map_entry]},
                               self.expr_index)
    strip_mining.dim_idx = -1
    strip_mining.new_dim_prefix = "mpi"
    strip_mining.tile_size = "(" + num_elements_expr + "/__dace_comm_size)"
    strip_mining.divides_evenly = True
    strip_mining.apply(sdfg)

    # Find all in-edges that lead to the matched map entry from an entry
    # node; the source of the first one is taken as the outer map.
    entry_edges = [edge for edge in state.in_edges(map_entry) if isinstance(edge.src, nodes.EntryNode)]
    outer_map = entry_edges[0].src

    # We need a tasklet for InLocalStorage
    tasklet = next((edge.dst for edge in state.out_edges(map_entry) if isinstance(edge.dst, nodes.CodeNode)),
                   None)
    if tasklet is None:
        raise ValueError("Tasklet not found")

    # Add MPI schedule attribute to outer map
    outer_map.map._schedule = dtypes.ScheduleType.MPI

    # Now create a transient for each array
    for edge in entry_edges:
        in_subgraph = {
            InLocalStorage._outer_map_entry: state.node_id(outer_map),
            InLocalStorage._inner_map_entry: self.subgraph[MPITransformMap._map_entry],
        }
        in_storage = InLocalStorage(sdfg.sdfg_list.index(sdfg), self.state_id, in_subgraph, self.expr_index)
        in_storage.array = edge.data.data
        in_storage.apply(sdfg)

    # Transform OutLocalStorage for each output of the MPI map
    inner_exits, outer_exits = state.exit_nodes(map_entry), state.exit_nodes(outer_map)
    inner_exit, outer_exit = inner_exits[0], outer_exits[0]
    for edge in state.out_edges(outer_exit):
        array_name = edge.data.data
        out_subgraph = {
            OutLocalStorage._inner_map_exit: state.node_id(inner_exit),
            OutLocalStorage._outer_map_exit: state.node_id(outer_exit),
        }
        out_storage = OutLocalStorage(sdfg.sdfg_list.index(sdfg), self.state_id, out_subgraph, self.expr_index)
        out_storage.array = array_name
        out_storage.apply(sdfg)