def _create_strided_range(self, sdfg: SDFG, state: SDFGState,
                          map_entry: nodes.MapEntry):
    """
    Builds the outer (tile) map and the rewritten inner range for the
    default tiling scheme, where the outer map walks the original range in
    steps of ``tile_size``.

    :param sdfg: The SDFG containing the map.
    :param state: The state containing the map.
    :param map_entry: Entry node of the map to strip-mine.
    :return: A 3-tuple ``(new_dim, new_map, (begin, end, step))``: the name
             of the new outer parameter, the new outer ``Map`` object, and
             the range meant for dimension ``self.dim_idx`` of the
             original map.
    :raises NotImplementedError: If ``tile_stride`` is nonzero and differs
                                 from ``tile_size``.
    """
    # The exit node is not used below; the lookup is retained so this
    # function queries the state exactly as it did before.
    state.exit_node(map_entry)

    size = self.tile_size
    stride = self.tile_stride
    # A zero stride is shorthand for "same as the tile size".
    if stride == 0:
        stride = size
    if stride != size:
        raise NotImplementedError

    # Parameter and range of the dimension being strip-mined.
    axis = self.dim_idx
    inner_param = map_entry.map.params[axis]
    begin, end, step = map_entry.map.range[axis]

    # Outer map: same bounds as the original dimension, stepping by tile.
    outer_param = self._find_new_dim(sdfg, state, map_entry,
                                     self.new_dim_prefix, inner_param)
    outer_map = nodes.Map(map_entry.map.label, [outer_param],
                          subsets.Range([(begin, end, size)]))

    # Inner range: one tile starting at the outer parameter.
    outer_sym = dace.symbolic.pystr_to_symbolic(outer_param)
    tile_last = outer_sym + size - 1
    if self.divides_evenly:
        inner_end = tile_last
    else:
        # Exact end is clamped to the original bound; the unclamped value
        # is kept as the approximation.
        bound = end.expr if isinstance(end, dace.symbolic.SymExpr) else end
        inner_end = dace.symbolic.SymExpr(sympy.Min(tile_last, bound),
                                          tile_last)

    return outer_param, outer_map, (outer_sym, inner_end, step)
class RedundantArrayCopying3(pm.Transformation):
    """ Redundant array removal for the pattern MapEntry -> B.

        When a map entry feeds several access nodes that refer to the same
        data container, the duplicates are removed and their outgoing
        edges are re-attached to the matched access node.
    """

    _arrays_removed = 0
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        pattern = sdutil.node_path_graph(RedundantArrayCopying3._map_entry,
                                         RedundantArrayCopying3._out_array)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_entry = graph.nodes()[candidate[RedundantArrayCopying3._map_entry]]
        out_array = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]

        # Applicable as soon as the map entry also feeds at least one
        # *other* access node that refers to the same data.
        return any(
            isinstance(dst, nodes.AccessNode) and dst != out_array
            and dst.data == out_array.data
            for _, _, dst, _, _ in graph.out_edges(map_entry))

    @staticmethod
    def match_to_str(graph, candidate):
        matched = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]
        return "Remove " + str(matched)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[
            RedundantArrayCopying3._map_entry]]
        out_array = graph.nodes()[self.subgraph[
            RedundantArrayCopying3._out_array]]

        for entry_edge in graph.out_edges(map_entry):
            duplicate = entry_edge.dst
            if not isinstance(duplicate, nodes.AccessNode):
                continue
            if duplicate == out_array or duplicate.data != out_array.data:
                continue

            # Move every consumer of the duplicate over to the kept node,
            # then drop the duplicate and the edge that fed it.
            for consumer_edge in graph.out_edges(duplicate):
                graph.add_edge(out_array, None, consumer_edge.dst,
                               consumer_edge.dst_conn, consumer_edge.data)
                graph.remove_edge(consumer_edge)
            graph.remove_edge(entry_edge)
            graph.remove_node(duplicate)

        # NOTE(review): the counter is bumped once per application here;
        # confirm against the original layout that it was not meant to be
        # incremented once per removed node.
        if Config.get_bool("debugprint"):
            RedundantArrayCopying3._arrays_removed += 1
class TrivialMapRangeElimination(transformation.Transformation):
    """ Trivial Map Range Elimination.

        Drops every dimension of a multi-dimensional map whose range
        starts and ends at the same value, substituting that value for the
        map parameter inside the map scope.

        Example: Map[i=0:I,j=0] -> Map[i=0:I]
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        return [sdutil.node_path_graph(TrivialMapRangeElimination._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        entry = graph.nodes()[candidate[
            TrivialMapRangeElimination._map_entry]]
        # Only multi-dimensional maps are handled.
        if len(entry.map.range) <= 1:
            return False
        # At least one dimension must contain a single element.
        return any(begin == end for begin, end, _ in entry.map.range)

    @staticmethod
    def match_to_str(graph, candidate):
        entry = graph.nodes()[candidate[
            TrivialMapRangeElimination._map_entry]]
        return entry.map.label + ': ' + str(entry.map.params)

    def apply(self, sdfg):
        state = sdfg.nodes()[self.state_id]
        entry = state.nodes()[self.subgraph[
            TrivialMapRangeElimination._map_entry]]

        kept = []  # (parameter, range) pairs that stay in the map
        for param, rng in zip(entry.map.params, entry.map.range.ranges):
            begin, end, _ = rng
            if begin == end:
                # Single-element dimension: substitute its value for the
                # parameter throughout the map scope.
                state.scope_subgraph(entry).replace(param, begin)
                continue
            kept.append((param, rng))

        entry.map.range.ranges = [rng for _, rng in kept]
        entry.map.params = [param for param, _ in kept]
def _create_from_tile_numbers(self, sdfg: SDFG, state: SDFGState,
                              map_entry: nodes.MapEntry):
    """
    Builds the outer (tile) map and the rewritten inner range when tiling
    by a number of tiles: the tile size is derived as the exact size of
    the selected dimension divided by the requested tile count.

    :param sdfg: The SDFG containing the map.
    :param state: The state containing the map.
    :param map_entry: Entry node of the map to strip-mine.
    :return: A 3-tuple ``(new_dim, new_map, (begin, end, step))``: the name
             of the new outer parameter, the new outer ``Map`` object, and
             the range meant for dimension ``self.dim_idx`` of the
             original map.
    :raises NotImplementedError: If ``tile_stride`` is nonzero and differs
                                 from the derived tile size.
    """
    # The exit node is not used below; the lookup is retained so this
    # function queries the state exactly as it did before.
    state.exit_node(map_entry)

    axis = self.dim_idx
    # In this mode the ``tile_size`` property carries the number of tiles.
    tile_count = dace.symbolic.pystr_to_symbolic(self.tile_size)
    stride = self.tile_stride

    # Parameter and range of the dimension being strip-mined.
    inner_param = map_entry.map.params[axis]
    begin, end, step = map_entry.map.range[axis]
    size = map_entry.map.range.size_exact()[axis] / tile_count

    # A zero stride is shorthand for "same as the tile size".
    if stride == 0:
        stride = size
    if stride != size:
        raise NotImplementedError

    outer_param = self._find_new_dim(sdfg, state, map_entry,
                                     self.new_dim_prefix, inner_param)
    outer_map = nodes.Map(map_entry.map.label, [outer_param],
                          subsets.Range([(begin, tile_count - 1, 1)]))

    # Inner range: the tile selected by the outer parameter.
    outer_sym = dace.symbolic.pystr_to_symbolic(outer_param)
    tile_first = outer_sym * size
    tile_last = (outer_sym + 1) * size - 1
    if self.divides_evenly:
        inner_end = tile_last
    else:
        # Exact end is clamped to the original bound; the unclamped value
        # is kept as the approximation.
        bound = end.expr if isinstance(end, dace.symbolic.SymExpr) else end
        inner_end = dace.symbolic.SymExpr(sympy.Min(tile_last, bound),
                                          tile_last)

    return outer_param, outer_map, (tile_first, inner_end, step)
class MapReduceFusion(pm.Transformation):
    """ Implements the map-reduce-fusion transformation.
        Fuses a map with an immediately following reduction, where the array
        between the map and the reduction is not used anywhere else.

        The reduction node and the intermediate array are removed; the
        tasklet's output memlet is redirected to the reduction output and
        given the reduction's write-conflict resolution (WCR).
    """

    no_init = Property(dtype=bool,
                       default=False,
                       desc='If enabled, does not create initialization states '
                       'for reduce nodes with identity')

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce()
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(MapReduceFusion._tasklet,
                                   MapReduceFusion._tmap_exit,
                                   MapReduceFusion._in_array,
                                   MapReduceFusion._reduce,
                                   MapReduceFusion._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        tmap_exit = graph.nodes()[candidate[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[candidate[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[candidate[MapReduceFusion._reduce]]
        tasklet = graph.nodes()[candidate[MapReduceFusion._tasklet]]

        # Make sure that the array is only accessed by the map and the reduce
        if any(src != tmap_exit
               for src, _, _, _, _ in graph.in_edges(in_array)):
            return False
        if any(dest != reduce_node
               for _, _, dest, _, _ in graph.out_edges(in_array)):
            return False

        # Find the tasklet output that writes the intermediate array. If no
        # such edge exists there is nothing to fuse, so report "no match"
        # rather than letting StopIteration escape from the matcher.
        tedge = next((e for e in graph.edges_between(tasklet, tmap_exit)
                      if e.data.data == in_array.data), None)
        if tedge is None:
            return False
        tmem = tedge.data

        # (strict) Make sure that the transient is not accessed anywhere else
        # in this state or other states
        if strict and (len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == in_array.data
        ]) > 1 or in_array.data in sdfg.shared_transients()):
            return False

        # If memlet already has WCR and it is different from reduce node,
        # do not match
        if tmem.wcr is not None and tmem.wcr != reduce_node.wcr:
            return False

        # Verify that reduction ranges match tasklet map
        tout_memlet = graph.in_edges(in_array)[0].data
        rin_memlet = graph.out_edges(in_array)[0].data
        if tout_memlet.subset != rin_memlet.subset:
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        # The candidate maps pattern nodes to node IDs; the IDs themselves
        # are what gets printed.
        tasklet = candidate[MapReduceFusion._tasklet]
        map_exit = candidate[MapReduceFusion._tmap_exit]
        reduce = candidate[MapReduceFusion._reduce]

        return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce])

    def apply(self, sdfg: SDFG):
        graph = sdfg.nodes()[self.state_id]
        tmap_exit = graph.nodes()[self.subgraph[MapReduceFusion._tmap_exit]]
        in_array = graph.nodes()[self.subgraph[MapReduceFusion._in_array]]
        reduce_node = graph.nodes()[self.subgraph[MapReduceFusion._reduce]]
        out_array = graph.nodes()[self.subgraph[MapReduceFusion._out_array]]

        # The intermediate array and the reduction node are fused away
        nodes_to_remove = [in_array, reduce_node]

        # Edge into the map exit that carries the intermediate array
        memlet_edge = None
        for edge in graph.in_edges(tmap_exit):
            if edge.data.data == in_array.data:
                memlet_edge = edge
                break
        if memlet_edge is None:
            raise RuntimeError('Reduction memlet cannot be None')

        # Find which indices should be removed from new memlet. Both edges
        # must be read before the reduction node is removed below.
        input_edge = graph.in_edges(reduce_node)[0]
        axes = reduce_node.axes or list(range(len(input_edge.data.subset)))
        array_edge = graph.out_edges(reduce_node)[0]

        # Delete relevant edges and nodes
        graph.remove_nodes_from(nodes_to_remove)

        # Filter out reduced dimensions from subset
        filtered_subset = [
            dim for i, dim in enumerate(memlet_edge.data.subset)
            if i not in axes
        ]
        if len(filtered_subset) == 0:  # Output is a scalar
            filtered_subset = [(0, 0, 1)]

        # Modify edge from tasklet to map exit
        memlet_edge.data.data = out_array.data
        memlet_edge.data.wcr = reduce_node.wcr
        memlet_edge.data.subset = type(
            memlet_edge.data.subset)(filtered_subset)

        # Add edge from map exit to output array
        graph.add_edge(
            memlet_edge.dst, 'OUT_' + memlet_edge.dst_conn[3:],
            array_edge.dst, array_edge.dst_conn,
            Memlet.simple(array_edge.data.data,
                          array_edge.data.subset,
                          num_accesses=array_edge.data.num_accesses,
                          wcr_str=reduce_node.wcr))

        # Add initialization state as necessary, unless the user opted out
        # through the ``no_init`` property
        if not self.no_init and reduce_node.identity is not None:
            init_state = sdfg.add_state_before(graph)
            init_state.add_mapped_tasklet(
                'freduce_init',
                [('o%d' % i, '%s:%s:%s' % (r[0], r[1] + 1, r[2]))
                 for i, r in enumerate(array_edge.data.subset)], {},
                'out = %s' % reduce_node.identity, {
                    'out':
                    Memlet.simple(
                        array_edge.data.data, ','.join([
                            'o%d' % i
                            for i in range(len(array_edge.data.subset))
                        ]))
                },
                external_edges=True)
class MapWCRFusion(pm.Transformation):
    """ Map / expanded-reduce fusion.

        Matches a map followed by a reduction that has been expanded into
        two nested maps whose inner output carries a WCR (a partial
        reduction), with an intermediate array in between that is used by
        nothing else. Application collapses the two reduction maps and
        then fuses the result with the first map.
    """

    _tasklet = nodes.Tasklet('_')
    _tmap_exit = nodes.MapExit(nodes.Map("", [], []))
    _in_array = nodes.AccessNode('_')
    _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_in_tasklet = nodes.Tasklet('_')
    _rmap_in_cr = nodes.MapExit(nodes.Map("", [], []))
    _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], []))
    _rmap_out_exit = nodes.MapExit(nodes.Map("", [], []))
    _out_array = nodes.AccessNode('_')

    @staticmethod
    def expressions():
        # Map, then partial reduction of axes
        pattern = sdutil.node_path_graph(
            MapWCRFusion._tasklet, MapWCRFusion._tmap_exit,
            MapWCRFusion._in_array, MapWCRFusion._rmap_out_entry,
            MapWCRFusion._rmap_in_entry, MapWCRFusion._rmap_in_tasklet,
            MapWCRFusion._rmap_in_cr, MapWCRFusion._rmap_out_exit,
            MapWCRFusion._out_array)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        first_exit = graph.nodes()[candidate[MapWCRFusion._tmap_exit]]
        mid_array = graph.nodes()[candidate[MapWCRFusion._in_array]]
        outer_entry = graph.nodes()[candidate[MapWCRFusion._rmap_out_entry]]

        # The intermediate array may only be written by the first map ...
        for src, _, _, _, _ in graph.in_edges(mid_array):
            if src != first_exit:
                return False
        # ... and only be read by the reduction's outer map.
        for _, _, dst, _, _ in graph.out_edges(mid_array):
            if dst != outer_entry:
                return False

        # The second map must actually perform a reduction (WCR).
        inner_exit = graph.nodes()[candidate[MapWCRFusion._rmap_in_cr]]
        if graph.in_edges(inner_exit)[0].data.wcr is None:
            return False

        # (strict) The transient must not be accessed anywhere else in this
        # state or in other states.
        if strict:
            same_data = [
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == mid_array.data
            ]
            if (len(same_data) > 1
                    or mid_array.data in sdfg.shared_transients()):
                return False

        # The subset written by the first map must equal the subset read
        # by the reduction.
        written = graph.in_edges(mid_array)[0].data
        read = graph.out_edges(mid_array)[0].data
        return not (written.subset != read.subset)

    @staticmethod
    def match_to_str(graph, candidate):
        # The candidate maps pattern nodes to node IDs; the IDs themselves
        # are what gets printed.
        ids = (candidate[MapWCRFusion._tasklet],
               candidate[MapWCRFusion._tmap_exit],
               candidate[MapWCRFusion._rmap_in_cr])
        return ' -> '.join(str(node) for node in ids)

    def apply(self, sdfg):
        graph = sdfg.node(self.state_id)

        # Step 1: collapse the two nested reduction maps into one.
        collapse_match = {
            MapCollapse._outer_map_entry:
            self.subgraph[MapWCRFusion._rmap_out_entry],
            MapCollapse._inner_map_entry:
            self.subgraph[MapWCRFusion._rmap_in_entry]
        }
        collapse = MapCollapse(self.sdfg_id, self.state_id, collapse_match, 0)
        collapsed_entry, _ = collapse.apply(sdfg)

        # Step 2: fuse the first map with the collapsed reduction map.
        fusion_match = {
            MapFusion._first_map_exit: self.subgraph[MapWCRFusion._tmap_exit],
            MapFusion._second_map_entry: graph.node_id(collapsed_entry)
        }
        fusion = MapFusion(self.sdfg_id, self.state_id, fusion_match, 0)
        fusion.apply(sdfg)
class AccumulateTransient(transformation.Transformation):
    """ Implements the AccumulateTransient transformation, which adds
        transient stream and data nodes between nested maps that lead to a
        stream. The transient data nodes then act as a local accumulator.

        If an ``identity`` value is given, the map scope is nested into an
        SDFG and an initialization state that sets every element of the new
        transient to that value is placed before it.
    """

    _map_exit = nodes.MapExit(nodes.Map("", [], []))
    _outer_map_exit = nodes.MapExit(nodes.Map("", [], []))

    array = Property(
        dtype=str,
        desc="Array to create local storage for (if empty, first available)",
        default=None,
        allow_none=True)

    identity = SymbolicProperty(desc="Identity value to set",
                                default=None,
                                allow_none=True)

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(AccumulateTransient._map_exit,
                                   AccumulateTransient._outer_map_exit)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_exit = graph.nodes()[candidate[AccumulateTransient._map_exit]]
        outer_map_exit = graph.nodes()[candidate[
            AccumulateTransient._outer_map_exit]]

        # Check if there is an accumulation output
        return any(e.data.wcr is not None
                   for e in graph.edges_between(map_exit, outer_map_exit))

    @staticmethod
    def match_to_str(graph, candidate):
        # The candidate maps pattern nodes to node IDs; the IDs themselves
        # are what gets printed.
        map_exit = candidate[AccumulateTransient._map_exit]
        outer_map_exit = candidate[AccumulateTransient._outer_map_exit]

        return ' -> '.join(str(node) for node in [map_exit, outer_map_exit])

    def apply(self, sdfg: SDFG):
        graph: SDFGState = sdfg.node(self.state_id)
        map_exit = graph.node(self.subgraph[AccumulateTransient._map_exit])
        outer_map_exit = graph.node(
            self.subgraph[AccumulateTransient._outer_map_exit])

        # Choose array: fall back to the first accumulated (WCR) output
        array = self.array
        if not array:
            array = next(e.data.data
                         for e in graph.edges_between(map_exit, outer_map_exit)
                         if e.data.wcr is not None)

        # Avoid import loop
        from dace.transformation.dataflow.local_storage import OutLocalStorage

        data_node: nodes.AccessNode = OutLocalStorage.apply_to(
            sdfg,
            dict(array=array),
            verify=False,
            save=False,
            node_a=map_exit,
            node_b=outer_map_exit)

        # Without an identity value there is nothing to initialize with
        if self.identity is None:
            warnings.warn('AccumulateTransient did not properly initialize '
                          'newly-created transient!')
            return

        # Nest the inner map scope so that an initialization state can be
        # placed before it
        map_entry = graph.entry_node(map_exit)

        nested_sdfg: NestedSDFG = nest_state_subgraph(
            sdfg=sdfg,
            state=graph,
            subgraph=SubgraphView(
                graph, {map_entry, map_exit}
                | graph.all_nodes_between(map_entry, map_exit)))

        nested_sdfg_state: SDFGState = nested_sdfg.sdfg.nodes()[0]

        init_state = nested_sdfg.sdfg.add_state_before(nested_sdfg_state)

        temp_array: Array = sdfg.arrays[data_node.data]
        ndims = len(temp_array.shape)

        # One map parameter ``_o<i>`` per dimension; each tasklet instance
        # writes the single element addressed by those parameters. Indexing
        # by parameter (instead of formatting the shape with ``%d``) also
        # keeps this working for symbolic shapes.
        init_state.add_mapped_tasklet(
            name='acctrans_init',
            map_ranges={
                '_o%d' % i: '0:%s' % symstr(d)
                for i, d in enumerate(temp_array.shape)
            },
            inputs={},
            code='out = %s' % self.identity,
            outputs={
                'out':
                dace.Memlet.simple(data=data_node.data,
                                   subset_str=','.join(
                                       '_o%d' % i for i in range(ndims)))
            },
            external_edges=True)
class MapExpansion(pm.Transformation):
    """ Map-expansion pattern.

        Turns an N-dimensional map into N nested unidimensional maps. The
        matched map keeps its first dimension; every further dimension
        becomes a new, sequentially scheduled map nested inside it.

        New edges abide by the following rules:
          1. If there are no edges coming from the outside, use empty memlets
          2. Edges with IN_* connectors replicate along the maps
          3. Edges for dynamic map ranges replicate until reaching range(s)
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        return [sdutil.node_path_graph(MapExpansion._map_entry)]

    @staticmethod
    def can_be_applied(graph: dace.sdfg.graph.OrderedMultiDiConnectorGraph,
                       candidate: Dict[dace.sdfg.nodes.Node, int],
                       expr_index: int,
                       sdfg: dace.SDFG,
                       strict: bool = False):
        # Only maps with more than one parameter can be expanded.
        entry = graph.nodes()[candidate[MapExpansion._map_entry]]
        return entry.map.get_param_num() > 1

    @staticmethod
    def match_to_str(graph: dace.sdfg.graph.OrderedMultiDiConnectorGraph,
                     candidate: Dict[dace.sdfg.nodes.Node, int]):
        entry = graph.nodes()[candidate[MapExpansion._map_entry]]
        return entry.map.label + ': ' + str(entry.map.params)

    def apply(self, sdfg: dace.SDFG):
        # Locate the map with its entry and exit nodes.
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[MapExpansion._map_entry]]
        map_exit = graph.exit_node(map_entry)
        outer_map = map_entry.map

        # One new sequential map per dimension after the first.
        inner_maps = []
        for param, param_range in zip(outer_map.params[1:],
                                      outer_map.range[1:]):
            inner_maps.append(
                nodes.Map(outer_map.label + '_' + str(param), [param],
                          subsets.Range([param_range]),
                          schedule=dtypes.ScheduleType.Sequential))

        # The original map is reduced to its first dimension.
        outer_map.params = [outer_map.params[0]]
        outer_map.range = subsets.Range([outer_map.range[0]])

        # Scope nodes of the new maps, ordered outermost to innermost.
        entries = [nodes.MapEntry(m) for m in inner_maps]
        exits = [nodes.MapExit(m) for m in inner_maps]

        # Re-route every edge leaving the original entry through all new
        # entries (rules 1 and 2 of the class description).
        for edge in graph.out_edges(map_entry):
            graph.remove_edge(edge)
            graph.add_memlet_path(map_entry,
                                  *entries,
                                  edge.dst,
                                  src_conn=edge.src_conn,
                                  memlet=edge.data,
                                  dst_conn=edge.dst_conn)

        # Dynamic map range inputs (rule 3): drop the old edge and
        # connector, then add a path down to each map whose range refers
        # to the connector's symbol.
        scope_chain = [map_entry] + entries
        for edge in dace.sdfg.dynamic_map_inputs(graph, map_entry):
            graph.remove_edge(edge)
            edge.dst.remove_in_connector(edge.dst_conn)

            for depth, scope_entry in enumerate(scope_chain, start=1):
                uses_symbol = any(
                    edge.dst_conn in map(str, symbolic.symlist(r))
                    for r in scope_entry.map.range)
                if uses_symbol:
                    graph.add_memlet_path(edge.src,
                                          *scope_chain[:depth],
                                          memlet=edge.data,
                                          src_conn=edge.src_conn,
                                          dst_conn=edge.dst_conn)

        # Re-route every edge entering the original exit through the new
        # exits, innermost first.
        inner_to_outer_exits = exits[::-1]
        for edge in graph.in_edges(map_exit):
            graph.remove_edge(edge)
            graph.add_memlet_path(edge.src,
                                  *inner_to_outer_exits,
                                  map_exit,
                                  memlet=edge.data,
                                  src_conn=edge.src_conn,
                                  dst_conn=edge.dst_conn)
def _create_ceil_range(self, sdfg: SDFG, graph: SDFGState,
                       map_entry: nodes.MapEntry):
    """
    Builds the outer (tile) map and the rewritten inner range for the
    "ceilrange" tiling scheme, where the outer map counts tiles from 0 to
    ``int_ceil(range size, tile_stride) - 1``.

    :param sdfg: The SDFG containing the map.
    :param graph: The state containing the map.
    :param map_entry: Entry node of the map to strip-mine.
    :return: A 3-tuple ``(new_dim, new_map, (begin, end, step))``: the name
             of the new outer parameter, the new outer ``Map`` object, and
             the range meant for dimension ``self.dim_idx`` of the
             original map.
    """
    # Retrieve transformation properties.
    dim_idx = self.dim_idx
    new_dim_prefix = self.new_dim_prefix
    tile_size = self.tile_size
    divides_evenly = self.divides_evenly
    strided = self.strided
    offset = self.tile_offset

    # A zero stride is shorthand for "same as the tile size".
    tile_stride = self.tile_stride
    if tile_stride == 0:
        tile_stride = tile_size

    # Retrieve parameter and range of dimension to be strip-mined.
    target_dim = map_entry.map.params[dim_idx]
    td_from, td_to, td_step = map_entry.map.range[dim_idx]

    # Create new map. Replace by cloning map object?
    new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix,
                                 target_dim)
    nd_from = 0
    if tile_stride == 1:
        nd_to = td_to - td_from
    else:
        nd_to = symbolic.pystr_to_symbolic(
            'int_ceil(%s + 1 - %s, %s) - 1' %
            (symbolic.symstr(td_to), symbolic.symstr(td_from),
             symbolic.symstr(tile_stride)))
    nd_step = 1
    new_dim_range = (nd_from, nd_to, nd_step)
    new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim],
                        subsets.Range([new_dim_range]))

    # Change the range of the selected dimension to iterate over a single
    # tile
    if strided:
        # Only an approximate end exists in this branch; it is selected
        # below because ``strided`` is set.
        td_from_new = symbolic.pystr_to_symbolic(new_dim)
        td_to_new_approx = td_to
        td_step = tile_size
    elif offset == 0:
        td_from_new = symbolic.pystr_to_symbolic(
            '%s + %s * %s' %
            (symbolic.symstr(td_from), symbolic.symstr(new_dim),
             symbolic.symstr(tile_stride)))
        td_to_new_exact = symbolic.pystr_to_symbolic(
            'min(%s + 1, %s + %s * %s + %s) - 1' %
            (symbolic.symstr(td_to), symbolic.symstr(td_from),
             symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
             symbolic.symstr(tile_size)))
        td_to_new_approx = symbolic.pystr_to_symbolic(
            '%s + %s * %s + %s - 1' %
            (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
             symbolic.symstr(new_dim), symbolic.symstr(tile_size)))
    else:
        # include offset
        # (the original called the non-existent ``symbolic.symstrtr`` here)
        td_from_new_exact = symbolic.pystr_to_symbolic(
            'max(%s,%s + %s * %s - %s)' %
            (symbolic.symstr(td_from), symbolic.symstr(td_from),
             symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
             symbolic.symstr(offset)))
        td_from_new_approx = symbolic.pystr_to_symbolic(
            '%s + %s * %s - %s ' %
            (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
             symbolic.symstr(new_dim), symbolic.symstr(offset)))
        td_from_new = dace.symbolic.SymExpr(td_from_new_exact,
                                            td_from_new_approx)

        td_to_new_exact = symbolic.pystr_to_symbolic(
            'min(%s + 1, %s + %s * %s + %s - %s) -1' %
            (symbolic.symstr(td_to), symbolic.symstr(td_from),
             symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
             symbolic.symstr(tile_size), symbolic.symstr(offset)))
        td_to_new_approx = symbolic.pystr_to_symbolic(
            '%s + %s * %s + %s - %s - 1' %
            (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
             symbolic.symstr(new_dim), symbolic.symstr(tile_size),
             symbolic.symstr(offset)))

    if divides_evenly or strided:
        td_to_new = td_to_new_approx
    else:
        td_to_new = dace.symbolic.SymExpr(td_to_new_exact, td_to_new_approx)
    return new_dim, new_map, (td_from_new, td_to_new, td_step)
class StripMining(transformation.Transformation):
    """ Implements the strip-mining transformation.

        Strip-mining takes as input a map dimension and splits it into
        two dimensions. The new dimension iterates over the range of
        the original one with a parameterizable step, called the tile
        size. The original dimension is changed to iterates over the
        range of the tile size, with the same step as before.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    # Properties
    dim_idx = Property(dtype=int,
                       default=-1,
                       desc="Index of dimension to be strip-mined")
    new_dim_prefix = Property(dtype=str,
                              default="tile",
                              desc="Prefix for new dimension name")
    tile_size = SymbolicProperty(
        default=64,
        desc="Tile size of strip-mined dimension, "
        "or number of tiles if tiling_type=number_of_tiles")
    tile_stride = SymbolicProperty(default=0,
                                   desc="Stride between two tiles of the "
                                   "strip-mined dimension. If zero, it is set "
                                   "equal to the tile size.")
    tile_offset = SymbolicProperty(default=0,
                                   desc="Tile stride offset (negative)")
    divides_evenly = Property(dtype=bool,
                              default=False,
                              desc="Tile size divides dimension range evenly?")
    strided = Property(
        dtype=bool,
        default=False,
        desc="Continuous (false) or strided (true) elements in tile")

    tiling_type = Property(
        dtype=str,
        default='normal',
        choices=['normal', 'ceilrange', 'number_of_tiles'],
        allow_none=True,
        desc="normal: the outerloop increments with tile_size, "
        "ceilrange: uses ceiling(N/tile_size) in outer range, "
        "number_of_tiles: tiles the map into the number of provided tiles, "
        "provide the number of tiles over tile_size")

    skew = Property(
        dtype=bool,
        default=False,
        desc="If True, offsets inner tile back such that it starts with zero")

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        return [sdutil.node_path_graph(StripMining._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[StripMining._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        """ Strip-mines the selected dimension and returns the new outer
            map object. """
        graph = sdfg.nodes()[self.state_id]
        # Strip-mine selected dimension.
        _, _, new_map = self._stripmine(sdfg, graph, self.subgraph)
        return new_map

    def __init__(self, *args, **kwargs):
        self._entry = nodes.EntryNode()
        self._tasklet = nodes.Tasklet('_')
        self._exit = nodes.ExitNode()
        super().__init__(*args, **kwargs)

    @property
    def entry(self):
        return self._entry

    @property
    def exit(self):
        return self._exit

    @property
    def tasklet(self):
        return self._tasklet

    def print_match_pattern(self, candidate):
        gentry = candidate[self.entry]
        return str(gentry.map.params[-1])

    def _find_new_dim(self, sdfg: SDFG, state: SDFGState,
                      entry: nodes.MapEntry, prefix: str, target_dim: str):
        """ Finds a variable that is not already defined in scope.

            :param sdfg: The SDFG containing the map (unused here).
            :param state: The state whose scope tree is consulted.
            :param entry: Map entry whose defined variables must be avoided.
            :param prefix: Prefix for the new name; if empty, ``target_dim``
                           itself is returned.
            :param target_dim: Name of the dimension being strip-mined.
            :return: ``<prefix>_<target_dim>`` or, if that is taken,
                     ``<prefix><k>_<target_dim>`` for the first free
                     ``k >= 1``.
        """
        if len(prefix) == 0:
            return target_dim

        # Nothing in the loop changes the scope, so the set of taken names
        # is computed once.
        taken = set(map(str, state.scope_tree()[entry].defined_vars))
        candidate = '%s_%s' % (prefix, target_dim)
        index = 1
        while candidate in taken:
            candidate = '%s%d_%s' % (prefix, index, target_dim)
            index += 1
        return candidate

    def _create_strided_range(self, sdfg: SDFG, state: SDFGState,
                              map_entry: nodes.MapEntry):
        """ Builds the outer map and inner range for the default tiling
            scheme, where the outer map walks the original range in steps
            of ``tile_size``.

            :return: ``(new_dim, new_map, (begin, end, step))``.
            :raises NotImplementedError: If ``tile_stride`` is nonzero and
                                         differs from ``tile_size``.
        """
        dim_idx = self.dim_idx
        new_dim_prefix = self.new_dim_prefix
        tile_size = self.tile_size
        divides_evenly = self.divides_evenly
        tile_stride = self.tile_stride
        if tile_stride == 0:
            tile_stride = tile_size
        if tile_stride != tile_size:
            raise NotImplementedError

        # Retrieve parameter and range of dimension to be strip-mined.
        target_dim = map_entry.map.params[dim_idx]
        td_from, td_to, td_step = map_entry.map.range[dim_idx]
        new_dim = self._find_new_dim(sdfg, state, map_entry, new_dim_prefix,
                                     target_dim)
        new_dim_range = (td_from, td_to, tile_size)
        new_map = nodes.Map(map_entry.map.label, [new_dim],
                            subsets.Range([new_dim_range]))

        dimsym = dace.symbolic.pystr_to_symbolic(new_dim)
        td_from_new = dimsym
        if divides_evenly:
            td_to_new = dimsym + tile_size - 1
        else:
            # Exact end is clamped to the original bound; the unclamped
            # value is kept as the approximation.
            if isinstance(td_to, dace.symbolic.SymExpr):
                td_to = td_to.expr
            td_to_new = dace.symbolic.SymExpr(
                sympy.Min(dimsym + tile_size - 1, td_to),
                dimsym + tile_size - 1)
        td_step_new = td_step
        return new_dim, new_map, (td_from_new, td_to_new, td_step_new)

    def _create_ceil_range(self, sdfg: SDFG, graph: SDFGState,
                           map_entry: nodes.MapEntry):
        """ Builds the outer map and inner range for the "ceilrange"
            tiling scheme, where the outer map counts tiles from 0 to
            ``int_ceil(range size, tile_stride) - 1``.

            :return: ``(new_dim, new_map, (begin, end, step))``.
        """
        # Retrieve transformation properties.
        dim_idx = self.dim_idx
        new_dim_prefix = self.new_dim_prefix
        tile_size = self.tile_size
        divides_evenly = self.divides_evenly
        strided = self.strided
        offset = self.tile_offset
        tile_stride = self.tile_stride
        if tile_stride == 0:
            tile_stride = tile_size

        # Retrieve parameter and range of dimension to be strip-mined.
        target_dim = map_entry.map.params[dim_idx]
        td_from, td_to, td_step = map_entry.map.range[dim_idx]

        # Create new map. Replace by cloning map object?
        new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix,
                                     target_dim)
        nd_from = 0
        if tile_stride == 1:
            nd_to = td_to - td_from
        else:
            nd_to = symbolic.pystr_to_symbolic(
                'int_ceil(%s + 1 - %s, %s) - 1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride)))
        nd_step = 1
        new_dim_range = (nd_from, nd_to, nd_step)
        new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim],
                            subsets.Range([new_dim_range]))

        # Change the range of the selected dimension to iterate over a single
        # tile
        if strided:
            # Only an approximate end exists in this branch; it is selected
            # below because ``strided`` is set.
            td_from_new = symbolic.pystr_to_symbolic(new_dim)
            td_to_new_approx = td_to
            td_step = tile_size
        elif offset == 0:
            td_from_new = symbolic.pystr_to_symbolic(
                '%s + %s * %s' %
                (symbolic.symstr(td_from), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_stride)))
            td_to_new_exact = symbolic.pystr_to_symbolic(
                'min(%s + 1, %s + %s * %s + %s) - 1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_size)))
            td_to_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s + %s - 1' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(tile_size)))
        else:
            # include offset
            # (the original called the non-existent ``symbolic.symstrtr``)
            td_from_new_exact = symbolic.pystr_to_symbolic(
                'max(%s,%s + %s * %s - %s)' %
                (symbolic.symstr(td_from), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(offset)))
            td_from_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s - %s ' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(offset)))
            td_from_new = dace.symbolic.SymExpr(td_from_new_exact,
                                                td_from_new_approx)

            td_to_new_exact = symbolic.pystr_to_symbolic(
                'min(%s + 1, %s + %s * %s + %s - %s) -1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_size), symbolic.symstr(offset)))
            td_to_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s + %s - %s - 1' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(tile_size),
                 symbolic.symstr(offset)))

        if divides_evenly or strided:
            td_to_new = td_to_new_approx
        else:
            td_to_new = dace.symbolic.SymExpr(td_to_new_exact,
                                              td_to_new_approx)
        return new_dim, new_map, (td_from_new, td_to_new, td_step)

    def _create_from_tile_numbers(self, sdfg: SDFG, state: SDFGState,
                                  map_entry: nodes.MapEntry):
        """ Builds the outer map and inner range when tiling by a number
            of tiles; the tile size is the exact size of the selected
            dimension divided by that number.

            :return: ``(new_dim, new_map, (begin, end, step))``.
            :raises NotImplementedError: If ``tile_stride`` is nonzero and
                                         differs from the derived tile size.
        """
        # Retrieve transformation properties.
        dim_idx = self.dim_idx
        new_dim_prefix = self.new_dim_prefix
        divides_evenly = self.divides_evenly
        # In this mode the ``tile_size`` property carries the tile count.
        number_of_tiles = dace.symbolic.pystr_to_symbolic(self.tile_size)
        tile_stride = self.tile_stride

        # Retrieve parameter and range of dimension to be strip-mined.
        target_dim = map_entry.map.params[dim_idx]
        td_from, td_to, td_step = map_entry.map.range[dim_idx]
        tile_size = map_entry.map.range.size_exact()[dim_idx] / number_of_tiles

        if tile_stride == 0:
            tile_stride = tile_size
        if tile_stride != tile_size:
            raise NotImplementedError

        new_dim = self._find_new_dim(sdfg, state, map_entry, new_dim_prefix,
                                     target_dim)
        new_dim_range = (td_from, number_of_tiles - 1, 1)
        new_map = nodes.Map(map_entry.map.label, [new_dim],
                            subsets.Range([new_dim_range]))

        dimsym = dace.symbolic.pystr_to_symbolic(new_dim)
        td_from_new = dimsym * tile_size
        if divides_evenly:
            td_to_new = (dimsym + 1) * tile_size - 1
        else:
            # Exact end is clamped to the original bound; the unclamped
            # value is kept as the approximation.
            if isinstance(td_to, dace.symbolic.SymExpr):
                td_to = td_to.expr
            td_to_new = dace.symbolic.SymExpr(
                sympy.Min((dimsym + 1) * tile_size - 1, td_to),
                (dimsym + 1) * tile_size - 1)
        td_step_new = td_step
        return new_dim, new_map, (td_from_new, td_to_new, td_step_new)

    def _stripmine(self, sdfg, graph, candidate):
        """ Performs the strip-mining on the matched map.

            Creates the outer map according to ``tiling_type``, rewrites
            the range of the selected dimension, moves the original outer
            edges onto the new map's scope nodes and connects the new and
            the original scope nodes with fresh memlets.

            :param sdfg: The SDFG containing the map.
            :param graph: The state containing the map.
            :param candidate: Mapping from pattern nodes to node IDs.
            :return: ``(target_dim, new_dim, new_map)``.
            :raises ValueError: If the special case that drops the selected
                                dimension would leave the map without
                                parameters.
        """
        # Retrieve map entry and exit nodes.
        map_entry = graph.nodes()[candidate[StripMining._map_entry]]
        map_exit = graph.exit_node(map_entry)

        # Retrieve transformation properties. Negative indices (such as the
        # default of -1) are normalized so that the positional comparison
        # in the special case below selects the intended dimension.
        dim_idx = self.dim_idx
        if dim_idx < 0:
            dim_idx += len(map_entry.map.params)
        target_dim = map_entry.map.params[dim_idx]

        if self.tiling_type == 'ceilrange':
            new_dim, new_map, td_rng = self._create_ceil_range(
                sdfg, graph, map_entry)
        elif self.tiling_type == 'number_of_tiles':
            new_dim, new_map, td_rng = self._create_from_tile_numbers(
                sdfg, graph, map_entry)
        else:
            new_dim, new_map, td_rng = self._create_strided_range(
                sdfg, graph, map_entry)

        new_map_entry = nodes.MapEntry(new_map)
        new_map_exit = nodes.MapExit(new_map)

        td_to_new_approx = td_rng[1]
        if isinstance(td_to_new_approx, dace.symbolic.SymExpr):
            td_to_new_approx = td_to_new_approx.approx

        # Special case: If range is 1 and no prefix was specified, skip range
        if td_rng[0] == td_to_new_approx and target_dim == new_dim:
            map_entry.map.range = subsets.Range(
                [r for i, r in enumerate(map_entry.map.range) if i != dim_idx])
            map_entry.map.params = [
                p for i, p in enumerate(map_entry.map.params) if i != dim_idx
            ]
            if len(map_entry.map.params) == 0:
                raise ValueError('Strip-mining all dimensions of the map with '
                                 'empty tiles is disallowed')
        else:
            map_entry.map.range[dim_idx] = td_rng

        # Make internal map's schedule to "not parallel"
        new_map.schedule = map_entry.map.schedule
        map_entry.map.schedule = dtypes.ScheduleType.Sequential

        # Redirect edges
        new_map_entry.in_connectors = dcpy(map_entry.in_connectors)
        sdutil.change_edge_dest(graph, map_entry, new_map_entry)
        new_map_exit.out_connectors = dcpy(map_exit.out_connectors)
        sdutil.change_edge_src(graph, map_exit, new_map_exit)

        # Create new entry edges
        new_in_edges = dict()
        entry_in_conn = {}
        entry_out_conn = {}
        for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry):
            if (src_conn is not None and src_conn[:4] == 'OUT_'
                    and not isinstance(sdfg.arrays[memlet.data],
                                       dace.data.Scalar)):
                # Image of the memlet subset under the inner map range
                new_subset = calc_set_image(
                    map_entry.map.params,
                    map_entry.map.range,
                    memlet.subset,
                )
                conn = src_conn[4:]
                key = (memlet.data, 'IN_' + conn, 'OUT_' + conn)
                if key in new_in_edges:
                    # Same data through the same connector: merge subsets
                    old_subset = new_in_edges[key].subset
                    new_in_edges[key].subset = calc_set_union(
                        old_subset, new_subset)
                else:
                    entry_in_conn['IN_' + conn] = None
                    entry_out_conn['OUT_' + conn] = None
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    if memlet.dynamic:
                        new_memlet.num_accesses = memlet.num_accesses
                    else:
                        new_memlet.num_accesses = new_memlet.num_elements()
                    new_in_edges[key] = new_memlet
            else:
                # Scalars and non-'OUT_' connectors: copy the memlet as is
                if src_conn is not None and src_conn[:4] == 'OUT_':
                    conn = src_conn[4:]
                    in_conn = 'IN_' + conn
                    out_conn = 'OUT_' + conn
                else:
                    in_conn = src_conn
                    out_conn = src_conn
                if in_conn:
                    entry_in_conn[in_conn] = None
                if out_conn:
                    entry_out_conn[out_conn] = None
                new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
        new_map_entry.out_connectors = entry_out_conn
        map_entry.in_connectors = entry_in_conn
        for (_, in_conn, out_conn), memlet in new_in_edges.items():
            graph.add_edge(new_map_entry, out_conn, map_entry, in_conn, memlet)

        # Create new exit edges
        new_out_edges = dict()
        exit_in_conn = {}
        exit_out_conn = {}
        for _src, _, _dst, dst_conn, memlet in graph.in_edges(map_exit):
            if (dst_conn is not None and dst_conn[:3] == 'IN_'
                    and not isinstance(sdfg.arrays[memlet.data],
                                       dace.data.Scalar)):
                # Image of the memlet subset under the inner map range
                new_subset = calc_set_image(
                    map_entry.map.params,
                    map_entry.map.range,
                    memlet.subset,
                )
                conn = dst_conn[3:]
                key = (memlet.data, 'IN_' + conn, 'OUT_' + conn)
                if key in new_out_edges:
                    # Same data through the same connector: merge subsets
                    old_subset = new_out_edges[key].subset
                    new_out_edges[key].subset = calc_set_union(
                        old_subset, new_subset)
                else:
                    exit_in_conn['IN_' + conn] = None
                    exit_out_conn['OUT_' + conn] = None
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    if memlet.dynamic:
                        new_memlet.num_accesses = memlet.num_accesses
                    else:
                        new_memlet.num_accesses = new_memlet.num_elements()
                    new_out_edges[key] = new_memlet
            else:
                # Scalars and non-'IN_' connectors: copy the memlet as is
                if dst_conn is not None and dst_conn[:3] == 'IN_':
                    conn = dst_conn[3:]
                    in_conn = 'IN_' + conn
                    out_conn = 'OUT_' + conn
                else:
                    in_conn = dst_conn
                    out_conn = dst_conn
                if in_conn:
                    exit_in_conn[in_conn] = None
                if out_conn:
                    exit_out_conn[out_conn] = None
                # Must be recorded with the *exit* edges. The entry edges
                # have already been added to the graph at this point, so an
                # item stored in ``new_in_edges`` would never become an edge.
                new_out_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
        new_map_exit.in_connectors = exit_in_conn
        map_exit.out_connectors = exit_out_conn
        for (_, in_conn, out_conn), memlet in new_out_edges.items():
            graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet)

        # Skew if necessary
        if self.skew:
            xfh.offset_map(sdfg, graph, map_entry, dim_idx, td_rng[0])

        # Return strip-mined dimension.
        return target_dim, new_dim, new_map
class MapToForLoop(pattern_matching.Transformation):
    """ Implements the Map to for-loop transformation.

        Takes a map and enforces a sequential schedule by transforming it into
        a state-machine of a for-loop. Creates a nested SDFG, if necessary.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def annotates_memlets():
        """ Always returns True for this transformation. """
        return True

    @staticmethod
    def expressions():
        """ Returns the pattern to match: a single map entry node. """
        return [sdutil.node_path_graph(MapToForLoop._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Accepts the match only if the map has at most one parameter.

            :param graph: The state in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              ``graph``.
            :param expr_index: Index of the matched expression (unused).
            :param sdfg: The SDFG containing ``graph`` (unused).
            :param strict: Strict-mode flag (unused).
            :return: True if the transformation can be applied.
        """
        # Only uni-dimensional maps are accepted.
        map_entry = graph.nodes()[candidate[MapToForLoop._map_entry]]
        return len(map_entry.map.params) <= 1

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns ``"<map label>: <map params>"`` for the matched map. """
        map_entry = graph.nodes()[candidate[MapToForLoop._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg) -> Tuple[nodes.NestedSDFG, SDFGState]:
        """ Applies the transformation and returns a tuple with the new nested
            SDFG node and the main state in the for-loop.

            :param sdfg: The SDFG to apply the transformation to.
            :return: A 2-tuple of the nested SDFG node that replaced the map
                     scope and the state that holds the former map body.
        """

        # Retrieve map entry and exit nodes.
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[MapToForLoop._map_entry]]
        map_exit = graph.exit_node(map_entry)

        loop_idx = map_entry.map.params[0]
        loop_from, loop_to, loop_step = map_entry.map.range[0]

        # Turn the map scope into a nested SDFG
        node = nest_state_subgraph(sdfg, graph,
                                   graph.scope_subgraph(map_entry))

        nsdfg: SDFG = node.sdfg
        nstate: SDFGState = nsdfg.nodes()[0]

        # If map range is dynamic, replace loop expressions with memlets.
        # Every map-entry connector that is not an 'IN_*' pass-through is
        # substituted by a unique placeholder symbol in the range expressions.
        param_to_edge = {}
        for edge in nstate.in_edges(map_entry):
            if edge.dst_conn and not edge.dst_conn.startswith('IN_'):
                param = '__DACE_P%d' % len(param_to_edge)
                repldict = {symbolic.pystr_to_symbolic(edge.dst_conn): param}
                param_to_edge[param] = edge
                loop_from = loop_from.subs(repldict)
                loop_to = loop_to.subs(repldict)
                loop_step = loop_step.subs(repldict)

        # Avoiding import loop
        from dace.codegen.targets.cpp import cpp_array_expr

        def replace_param(param):
            """ Converts a symbolic expression to a string in which every
                placeholder is replaced by the C++ array expression of the
                memlet that feeds it. """
            param = symbolic.symstr(param)
            # Substitute placeholders in descending index order: with plain
            # string replacement, handling '__DACE_P1' before '__DACE_P10'
            # would corrupt the latter, since the former is its prefix.
            for p, pval in reversed(list(param_to_edge.items())):
                # TODO: Correct w.r.t. connector type
                param = param.replace(p, cpp_array_expr(nsdfg, pval.data))
            return param

        # End of dynamic input range

        # Create a loop inside the nested SDFG. The map range is inclusive,
        # hence the loop condition compares against ``loop_to + 1``.
        nsdfg.add_loop(None, nstate, None, loop_idx, replace_param(loop_from),
                       '%s < %s' % (loop_idx, replace_param(loop_to + 1)),
                       '%s + %s' % (loop_idx, replace_param(loop_step)))

        # Skip map in input edges
        for edge in nstate.out_edges(map_entry):
            src_node = nstate.memlet_path(edge)[0].src
            nstate.add_edge(src_node, None, edge.dst, edge.dst_conn, edge.data)
            nstate.remove_edge(edge)

        # Skip map in output edges
        for edge in nstate.in_edges(map_exit):
            dst_node = nstate.memlet_path(edge)[-1].dst
            nstate.add_edge(edge.src, edge.src_conn, dst_node, None, edge.data)
            nstate.remove_edge(edge)

        # Remove nodes from dynamic map range
        nstate.remove_nodes_from(
            [e.src for e in dace.sdfg.dynamic_map_inputs(nstate, map_entry)])
        # Remove scope nodes
        nstate.remove_nodes_from([map_entry, map_exit])

        return node, nstate
class GPUTransformMap(transformation.Transformation):
    """ Implements the GPUTransformMap transformation.

        Converts a single map to a GPU-scheduled map and creates GPU arrays
        outside it, generating CPU<->GPU memory copies automatically.
    """

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    toplevel_trans = Property(desc="Make all GPU transients top-level",
                              dtype=bool,
                              default=False)

    register_trans = Property(
        desc="Make all transients inside GPU maps registers",
        dtype=bool,
        default=False)

    gpu_id = SymbolicProperty(default=None,
                              allow_none=True,
                              desc="Selects which gpu the map should run on")

    sequential_innermaps = Property(desc="Make all internal maps Sequential",
                                    dtype=bool,
                                    default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce('lambda: None', None)

    @staticmethod
    def expressions():
        """ Two alternative patterns: a lone map entry (index 0) or a lone
            Reduce library node (index 1). """
        map_pattern = sdutil.node_path_graph(GPUTransformMap._map_entry)
        reduce_pattern = sdutil.node_path_graph(GPUTransformMap._reduce)
        return [map_pattern, reduce_pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Decides whether the matched map (``expr_index == 0``) or reduce
            node (``expr_index == 1``) may be moved to the GPU. """
        if expr_index == 1:
            reduce_node = graph.nodes()[candidate[GPUTransformMap._reduce]]

            # Already inside device-level code: nothing to transform
            return not sd.is_devicelevel_gpu(sdfg, graph, reduce_node)

        if expr_index == 0:
            entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]

            # MPI maps and maps that already have a GPU schedule are rejected
            forbidden_schedules = ([dtypes.ScheduleType.MPI] +
                                   dtypes.GPU_SCHEDULES)
            if entry.map.schedule in forbidden_schedules:
                return False
            if sd.is_devicelevel_gpu(sdfg, graph, entry):
                return False

            # A kernel cannot have a dynamic map range
            if sd.has_dynamic_map_inputs(graph, entry):
                return False

            # Arrays inside the map must live in Default or Register storage
            allowed_storage = (dtypes.StorageType.Default,
                               dtypes.StorageType.Register)
            for inner in graph.scope_subgraph(entry).nodes():
                if not isinstance(inner, nodes.AccessNode):
                    continue
                if inner.desc(sdfg).storage not in allowed_storage:
                    return False

            # Maps that write to a stream are rejected
            for out_edge in graph.out_edges(graph.exit_node(entry)):
                sink = graph.memlet_path(out_edge)[-1].dst
                if not isinstance(sink, nodes.AccessNode):
                    continue
                if isinstance(sdfg.arrays[sink.data], data.Stream):
                    return False

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ String form of the matched reduce node if present, otherwise of
            the matched map entry. """
        if GPUTransformMap._reduce in candidate:
            pattern_node = GPUTransformMap._reduce
        else:
            pattern_node = GPUTransformMap._map_entry
        return str(graph.nodes()[candidate[pattern_node]])

    def apply(self, sdfg):
        """ Nests the matched scope (or reduce node) into its own SDFG, runs
            GPUTransformSDFG on that nested SDFG, and finally applies strict
            transformations on ``sdfg``. """
        state = sdfg.nodes()[self.state_id]

        # Select the subgraph that will become the nested SDFG
        if self.expr_index == 0:
            entry = state.nodes()[self.subgraph[GPUTransformMap._map_entry]]
            to_nest = state.scope_subgraph(entry)
        else:
            reduce_node = state.nodes()[self.subgraph[
                GPUTransformMap._reduce]]
            to_nest = SubgraphView(state, [reduce_node])
        nsdfg_node = helpers.nest_state_subgraph(sdfg,
                                                 state,
                                                 to_nest,
                                                 full_data=self.fullcopy)

        # Avoiding import loops
        from dace.transformation.interstate import GPUTransformSDFG

        # Forward this transformation's options to the SDFG-wide transform
        gpu_xform = GPUTransformSDFG(0, 0, {}, 0)
        gpu_xform.register_trans = self.register_trans
        gpu_xform.sequential_innermaps = self.sequential_innermaps
        gpu_xform.toplevel_trans = self.toplevel_trans
        gpu_xform.gpu_id = self.gpu_id

        gpu_xform.apply(nsdfg_node.sdfg)

        # Inline back as necessary
        sdfg.apply_strict_transformations()
class TrivialMapElimination(transformation.Transformation):
    """ Implements the Trivial-Map Elimination pattern.

        Trivial-Map Elimination takes a map with a range containing one element
        and removes the map.

        Example: Map[i=0] -> nothing
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ The pattern is a single map entry node. """
        pattern = sdutil.node_path_graph(TrivialMapElimination._map_entry)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Matches one-dimensional maps whose range begin equals its end. """
        entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]]
        map_range = entry.map.range
        first, last, _ = map_range[0]
        return len(map_range) == 1 and last == first

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns ``"<map label>: <map params>"`` for the matched map. """
        entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]]
        return entry.map.label + ': ' + str(entry.map.params)

    def apply(self, sdfg):
        """ Substitutes the map parameter by its single value inside the
            scope, reconnects the edges around the map entry/exit, and removes
            both scope nodes. """
        state = sdfg.nodes()[self.state_id]
        entry = state.nodes()[self.subgraph[
            TrivialMapElimination._map_entry]]
        exit_node = state.exit_node(entry)

        index_name = entry.map.params[0]
        first, last, _ = entry.map.range[0]
        assert first == last

        # The index can only take one value: substitute it in the scope
        state.scope_subgraph(entry).replace(index_name, first)

        # Bypass the map entry: connect the preceding edge's source on the
        # memlet path straight to each inner destination.
        for inner in state.out_edges(entry):
            mpath = state.memlet_path(inner)
            before = mpath[mpath.index(inner) - 1]
            state.add_edge(before.src, before.src_conn, inner.dst,
                           inner.dst_conn, inner.data)

        # Bypass the map exit: connect each inner source straight to the
        # following edge's destination on the memlet path.
        for inner in state.in_edges(exit_node):
            mpath = state.memlet_path(inner)
            after = mpath[mpath.index(inner) + 1]
            state.add_edge(inner.src, inner.src_conn, after.dst,
                           after.dst_conn, inner.data)

        # Clean-up
        state.remove_nodes_from([entry, exit_node])
class Vectorization(pattern_matching.Transformation):
    """ Implements the vectorization transformation.

        Vectorization matches when all the input and output memlets of a
        tasklet inside a map access the inner-most loop variable in their last
        dimension. The transformation changes the step of the inner-most loop
        to be equal to the length of the vector and vectorizes the memlets.
    """

    vector_len = Property(desc="Vector length", dtype=int, default=4)
    propagate_parent = Property(desc="Propagate vector length through "
                                "parent SDFGs",
                                dtype=bool,
                                default=False)
    strided_map = Property(desc="Use strided map range (jump by vector length)"
                           " instead of modifying memlets",
                           dtype=bool,
                           default=False)

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _tasklet = nodes.Tasklet('_')
    _map_exit = nodes.MapExit(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the pattern to match: map entry -> tasklet -> map exit. """
        return [
            sdutil.node_path_graph(Vectorization._map_entry,
                                   Vectorization._tasklet,
                                   Vectorization._map_exit)
        ]

    @staticmethod
    def _free_symbols(expr):
        """ Returns the free symbols of one subset element, which is either a
            single expression or a tuple of expressions. """
        components = expr if isinstance(expr, tuple) else (expr, )
        symbols = set()
        for component in components:
            symbols |= symbolic.pystr_to_symbolic(component).free_symbols
        return symbols

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Checks that every relevant memlet adjacent to the tasklet uses the
            inner-most map parameter in its last dimension only.

            :param graph: The state in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              ``graph``.
            :param expr_index: Index of the matched expression (unused).
            :param sdfg: The SDFG containing ``graph``.
            :param strict: Strict-mode flag (unused).
            :return: True if at least one memlet uses the parameter and no
                     memlet disqualifies the match.
        """
        map_entry = graph.nodes()[candidate[Vectorization._map_entry]]
        tasklet = graph.nodes()[candidate[Vectorization._tasklet]]
        param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])
        found = False

        # Check if all edges, adjacent to the tasklet,
        # use the parameter in their last dimension.
        for edge in graph.all_edges(tasklet):
            memlet = edge.data

            # Cases that do not matter for vectorization
            if memlet.data is None:  # Empty memlets
                continue
            if isinstance(sdfg.arrays[memlet.data], data.Stream):  # Streams
                continue

            # Vectorization can not be applied in WCR
            if memlet.wcr is not None:
                return False

            try:
                subset = memlet.subset
                veclen = memlet.veclen
            except AttributeError:
                return False

            if subset is None:
                return False

            try:
                # Already vectorized memlets are rejected
                if veclen > symbolic.pystr_to_symbolic('1'):
                    return False

                last_dim = subset.dims() - 1
                for idx, expr in enumerate(subset):
                    if param in Vectorization._free_symbols(expr):
                        if idx != last_dim:
                            return False
                        found = True
            except TypeError:  # cannot determine truth value of Relational
                return False

        return found

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns ``"<map entry> -> <tasklet> -> <map exit>"`` built from
            the matched graph nodes. """
        # ``candidate`` holds node indices; resolve them to the actual nodes
        # so that the match is described by node names rather than numbers.
        graph_nodes = graph.nodes()
        matched = [
            graph_nodes[candidate[pattern_node]]
            for pattern_node in (Vectorization._map_entry,
                                 Vectorization._tasklet,
                                 Vectorization._map_exit)
        ]
        return ' -> '.join(str(node) for node in matched)

    def apply(self, sdfg):
        """ Adjusts the inner-most map dimension for vector accesses and sets
            the vector length on all memlets that use the map parameter.

            :param sdfg: The SDFG to apply the transformation to.
        """
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]]
        tasklet = graph.nodes()[self.subgraph[Vectorization._tasklet]]
        param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])

        # Create new vector size.
        vector_size = self.vector_len

        # Change the step of the inner-most dimension.
        dim_from, dim_to, dim_step = map_entry.map.range[-1]
        if self.strided_map:
            # Keep the bounds, jump by the vector length
            map_entry.map.range[-1] = (dim_from, dim_to, vector_size)
        else:
            # Shrink the (inclusive) upper bound; memlets are scaled below
            map_entry.map.range[-1] = (dim_from,
                                       (dim_to + 1) / vector_size - 1,
                                       dim_step)

        # TODO: Postamble and/or preamble non-vectorized map

        # Vectorize memlets adjacent to the tasklet.
        # ``processed_edges`` ensures each subset is scaled only once.
        processed_edges = set()
        for edge in graph.all_edges(tasklet):
            memlet = edge.data

            if memlet.data is None:  # Empty memlets
                continue

            # Only memlets whose last dimension uses the parameter change
            if param not in Vectorization._free_symbols(memlet.subset[-1]):
                continue

            # propagate vector length inside this SDFG
            for e in graph.memlet_tree(edge):
                e.data.veclen = vector_size
                if not self.strided_map and e not in processed_edges:
                    e.data.subset.replace({param: vector_size * param})
                    processed_edges.add(e)

            # propagate to the parent (TODO: handle multiple level of nestings)
            if self.propagate_parent and sdfg.parent is not None:
                source_edge = graph.memlet_path(edge)[0]
                sink_edge = graph.memlet_path(edge)[-1]

                # Find parent Nested SDFG node
                parent_node = next(n for n in sdfg.parent.nodes()
                                   if isinstance(n, nodes.NestedSDFG)
                                   and n.sdfg.name == sdfg.name)

                # continue in propagating the vector length following the
                # path that arrives to source_edge or starts from sink_edge
                for pe in sdfg.parent.all_edges(parent_node):
                    if str(pe.dst_conn) == str(source_edge.src) or str(
                            pe.src_conn) == str(sink_edge.dst):
                        for ppe in sdfg.parent.memlet_tree(pe):
                            ppe.data.veclen = vector_size
                            if (not self.strided_map
                                    and ppe not in processed_edges):
                                ppe.data.subset.replace(
                                    {param: vector_size * param})
                                processed_edges.add(ppe)
class MapCollapse(pattern_matching.Transformation):
    """ Implements the Map Collapse pattern.

        Map-collapse takes two nested maps with M and N dimensions respectively,
        and collapses them to a single M+N dimensional map.
    """

    _outer_map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _inner_map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ The pattern is an outer map entry directly followed by an inner
            map entry. """
        pattern = sdutil.node_path_graph(MapCollapse._outer_map_entry,
                                         MapCollapse._inner_map_entry)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Matches when the inner map is independent of the outer map's
            parameters and the two scopes are connected only to each other
            (entry-to-entry and exit-to-exit). """
        outer_entry = graph.nodes()[candidate[MapCollapse._outer_map_entry]]
        inner_entry = graph.nodes()[candidate[MapCollapse._inner_map_entry]]

        # The inner range may not refer to any outer map parameter
        range_symbols = set()
        for rng in inner_entry.map.range:
            range_symbols.update(str(sym) for sym in symlist(rng))
        for name in range_symbols:
            if name in outer_entry.map.params:
                return False

        # Everything leaving the outer entry must go to the inner entry
        for edge in graph.out_edges(outer_entry):
            if edge.dst != inner_entry:
                return False

        # Everything entering the inner entry must come from the outer entry
        for edge in graph.in_edges(inner_entry):
            if edge.src != outer_entry:
                return False

            # Connectors without the 'IN_' prefix feed a dynamic map range;
            # their memlets may not depend on the outer parameters either.
            conn = edge.dst_conn
            if conn is None or conn.startswith('IN_'):
                continue
            subset_symbols = set()
            for rng in edge.data.subset:
                subset_symbols.update(str(sym) for sym in symlist(rng))
            for name in subset_symbols:
                if name in outer_entry.map.params:
                    return False

        # Same connectivity requirement for the two exit nodes
        inner_exit = graph.exit_node(inner_entry)
        outer_exit = graph.exit_node(outer_entry)

        # Everything leaving the inner exit must go to the outer exit
        for edge in graph.out_edges(inner_exit):
            if edge.dst != outer_exit:
                return False

        # Everything entering the outer exit must come from the inner exit
        for edge in graph.in_edges(outer_exit):
            if edge.src != inner_exit:
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns ``"<label>: <params>"`` of the outer and inner maps,
            joined by ``" -> "``. """
        descriptions = []
        for pattern_node in (MapCollapse._outer_map_entry,
                             MapCollapse._inner_map_entry):
            entry = graph.nodes()[candidate[pattern_node]]
            descriptions.append(entry.map.label + ': ' +
                                str(entry.map.params))
        return ' -> '.join(descriptions)

    def apply(self, sdfg) -> Tuple[nodes.MapEntry, nodes.MapExit]:
        """
        Collapses two maps into one.
        :param sdfg: The SDFG to apply the transformation to.
        :return: A 2-tuple of the new map entry and exit nodes.
        """
        state = sdfg.nodes()[self.state_id]

        # Resolve the matched entries and look up their exit nodes
        outer_entry = state.nodes()[self.subgraph[
            MapCollapse._outer_map_entry]]
        inner_entry = state.nodes()[self.subgraph[
            MapCollapse._inner_map_entry]]
        inner_exit = state.exit_node(inner_entry)
        outer_exit = state.exit_node(outer_entry)

        # The merge itself is delegated to the shared utility
        return sdutil.merge_maps(state, outer_entry, outer_exit, inner_entry,
                                 inner_exit)
class GPUTransformLocalStorage(transformation.Transformation):
    """Implements the GPUTransformLocalStorage transformation.

        Similar to GPUTransformMap, but takes multiple maps leading from the
        same data node into account, creating a local storage for each range.

        The matched map (or Reduce library node) is given a GPU schedule, and
        every input/output data container that is not already in GPU storage
        is routed through a transient ``gpu_*`` clone in GPU global storage.

        @see: GPUTransformMap
    """

    # Counters that are only updated when the "debugprint" config is enabled
    _arrays_removed = 0
    _maps_transformed = 0

    fullcopy = Property(desc="Copy whole arrays rather than used subset",
                        dtype=bool,
                        default=False)

    nested_seq = Property(
        desc="Makes nested code semantically-equivalent to single-core code,"
        "transforming nested maps and memory into sequential and "
        "local memory respectively.",
        dtype=bool,
        default=True,
    )

    # Pattern nodes used for matching
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    import dace.libraries.standard as stdlib  # Avoid import loop
    _reduce = stdlib.Reduce("lambda: None", None)

    @staticmethod
    def expressions():
        """ Returns the two alternative patterns: a single map entry
            (index 0) or a single Reduce library node (index 1). """
        return [
            sdutil.node_path_graph(GPUTransformLocalStorage._map_entry),
            sdutil.node_path_graph(GPUTransformLocalStorage._reduce),
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        """ Checks whether the matched node may be transformed.

            :param graph: The state in which the pattern was matched.
            :param candidate: Mapping from pattern nodes to node indices in
                              ``graph``.
            :param expr_index: 0 for the map pattern, 1 for the reduce
                               pattern.
            :param sdfg: The SDFG containing ``graph``.
            :param permissive: If True, nested maps are not rejected.
            :return: True if the transformation can be applied, False
                     otherwise. For any other ``expr_index`` the method falls
                     through without an explicit return value.
        """
        if expr_index == 0:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            candidate_map = map_entry.map

            # Disallow GPUTransform on nested maps unless in permissive mode
            if not permissive:
                if graph.entry_node(map_entry) is not None:
                    return False

            # Map schedules that are disallowed to transform to GPUs
            if (candidate_map.schedule == dtypes.ScheduleType.MPI
                    or candidate_map.schedule == dtypes.ScheduleType.GPU_Device
                    or candidate_map.schedule
                    == dtypes.ScheduleType.GPU_ThreadBlock or
                    candidate_map.schedule == dtypes.ScheduleType.Sequential):
                return False

            # Dynamic map ranges cannot become kernels
            if sd.has_dynamic_map_inputs(graph, map_entry):
                return False

            # Recursively check parent for GPU schedules
            # (walks from the map itself up through its enclosing scopes)
            sdict = graph.scope_dict()
            current_node = map_entry
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            # Ensure that map does not include internal arrays that are
            # allocated on non-default space
            subgraph = graph.scope_subgraph(map_entry)
            for node in subgraph.nodes():
                if (isinstance(node, nodes.AccessNode) and
                        node.desc(sdfg).storage != dtypes.StorageType.Default
                        and node.desc(sdfg).storage !=
                        dtypes.StorageType.Register):
                    return False

            # If one of the outputs is a stream, do not match
            map_exit = graph.exit_node(map_entry)
            for edge in graph.out_edges(map_exit):
                dst = graph.memlet_path(edge)[-1].dst
                if (isinstance(dst, nodes.AccessNode)
                        and isinstance(sdfg.arrays[dst.data], data.Stream)):
                    return False

            return True
        elif expr_index == 1:
            reduce = graph.nodes()[candidate[GPUTransformLocalStorage._reduce]]

            # Recursively check parent for GPU schedules
            # (starts at the scope enclosing the reduce node)
            sdict = graph.scope_dict()
            current_node = sdict[reduce]
            while current_node is not None:
                if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device
                        or current_node.map.schedule
                        == dtypes.ScheduleType.GPU_ThreadBlock):
                    return False
                current_node = sdict[current_node]

            return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the string form of the matched reduce node if the
            candidate contains one, otherwise of the matched map entry. """
        if GPUTransformLocalStorage._reduce in candidate:
            return str(
                graph.nodes()[candidate[GPUTransformLocalStorage._reduce]])
        else:
            map_entry = graph.nodes()[candidate[
                GPUTransformLocalStorage._map_entry]]
            return str(map_entry)

    def apply(self, sdfg):
        """ Applies the transformation in place.

            Steps: (0) set the GPU schedule on the matched node and, if
            ``nested_seq`` is set, make nested Default maps/arrays
            Sequential/Register; (1) collect the non-GPU arrays read and
            written by the node; (2) register a GPU_Global transient clone for
            each; (3) insert the clone access nodes between the original
            arrays and the node, rewriting memlets; (4) rename remaining
            memlets inside the map scope to the clones.

            :param sdfg: The SDFG to apply the transformation to.
        """
        graph = sdfg.nodes()[self.state_id]
        if self.expr_index == 0:
            cnode: nodes.MapEntry = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._map_entry]]
            # Change schedule
            cnode.schedule = dtypes.ScheduleType.GPU_Device
            exit_node = graph.exit_node(cnode)
        else:
            cnode: nodes.LibraryNode = graph.nodes()[self.subgraph[
                GPUTransformLocalStorage._reduce]]
            # Change schedule
            cnode.schedule = dtypes.ScheduleType.GPU_Default
            # A library node has no separate exit node; it is its own "exit"
            exit_node = cnode

        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._maps_transformed += 1
        # If nested graph is designated as sequential, transform schedules and
        # storage from Default to Sequential/Register
        if self.nested_seq and self.expr_index == 0:
            for node in graph.scope_subgraph(cnode).nodes():
                if isinstance(node, nodes.AccessNode):
                    arr = node.desc(sdfg)
                    if arr.storage == dtypes.StorageType.Default:
                        arr.storage = dtypes.StorageType.Register
                elif isinstance(node, nodes.MapEntry):
                    if node.map.schedule == dtypes.ScheduleType.Default:
                        node.map.schedule = dtypes.ScheduleType.Sequential

        # Storage types that are considered "already on the GPU"
        gpu_storage_types = [
            dtypes.StorageType.GPU_Global,
            dtypes.StorageType.GPU_Shared,
        ]

        #######################################################
        # Add GPU copies of CPU arrays (i.e., not already on GPU)

        # First, understand which arrays to clone
        all_out_edges = []
        all_out_edges.extend(list(graph.out_edges(exit_node)))
        # Sets of (access node, memlet) pairs
        in_arrays_to_clone = set()
        out_arrays_to_clone = set()
        for e in graph.in_edges(cnode):
            data_node = sd.find_input_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                in_arrays_to_clone.add((data_node, e.data))
        for e in all_out_edges:
            data_node = sd.find_output_arraynode(graph, e)
            if data_node.desc(sdfg).storage not in gpu_storage_types:
                out_arrays_to_clone.add((data_node, e.data))
        if Config.get_bool("debugprint"):
            GPUTransformLocalStorage._arrays_removed += len(
                in_arrays_to_clone) + len(out_arrays_to_clone)

        # Second, create a GPU clone of each array
        # TODO: Overapproximate union of memlets
        cloned_arrays = {}  # original array name -> clone name
        in_cloned_arraynodes = {}  # original array name -> clone access node
        out_cloned_arraynodes = {}  # original array name -> clone access node
        for array_node, memlet in in_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            # For every dimension whose overapproximated extent is exactly 1,
            # append a sanitized form of its start index to the clone name
            # (operators +, -, *, / become p, m, t, d; other non-identifier
            # characters are dropped).
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                # NOTE(review): bare except; presumably meant to skip sizes
                # that cannot be converted to int (symbolic) - confirm and
                # narrow the exception type.
                except:
                    continue
            # NOTE(review): ``cloned_array`` is assigned in the first two
            # branches but not read again in this method.
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                # Shape of the clone: bounding box of the memlet, with
                # concrete integers where the size can be converted
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except:
                        full_shape.append(size)
                # Dimensions of extent 1 are dropped from the clone
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                # If every dimension has extent 1, keep the last one
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    # Scalars are cloned as single-element arrays
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            # New access node of the same type as the original, for the clone
            cloned_node = type(array_node)(cloned_name)

            in_cloned_arraynodes[array_node.data] = cloned_node
        # Same procedure for the output arrays
        for array_node, memlet in out_arrays_to_clone:
            array = array_node.desc(sdfg)
            cloned_name = "gpu_" + array_node.data
            # Clone name suffix from the start indices of extent-1 dimensions
            for i, r in enumerate(memlet.bounding_box_size()):
                size = symbolic.overapproximate(r)
                try:
                    if int(size) == 1:
                        suffix = []
                        for c in str(memlet.subset[i][0]):
                            if c.isalpha() or c.isdigit() or c == "_":
                                suffix.append(c)
                            elif c == "+":
                                suffix.append("p")
                            elif c == "-":
                                suffix.append("m")
                            elif c == "*":
                                suffix.append("t")
                            elif c == "/":
                                suffix.append("d")
                        cloned_name += "_" + "".join(suffix)
                # NOTE(review): bare except, see the input loop above in this
                # method - confirm the intended exception type.
                except:
                    continue
            if cloned_name in sdfg.arrays.keys():
                cloned_array = sdfg.arrays[cloned_name]
            elif array_node.data in cloned_arrays:
                cloned_array = cloned_arrays[array_node.data]
            else:
                full_shape = []
                for r in memlet.bounding_box_size():
                    size = symbolic.overapproximate(r)
                    try:
                        full_shape.append(int(size))
                    except:
                        full_shape.append(size)
                actual_dims = [
                    idx for idx, r in enumerate(full_shape)
                    if not (isinstance(r, int) and r == 1)
                ]
                if len(actual_dims) == 0:  # abort
                    actual_dims = [len(full_shape) - 1]
                if isinstance(array, data.Scalar):
                    sdfg.add_array(name=cloned_name,
                                   shape=[1],
                                   dtype=array.dtype,
                                   transient=True,
                                   storage=dtypes.StorageType.GPU_Global)
                elif isinstance(array, data.Stream):
                    sdfg.add_stream(
                        name=cloned_name,
                        dtype=array.dtype,
                        shape=[full_shape[d] for d in actual_dims],
                        veclen=array.veclen,
                        buffer_size=array.buffer_size,
                        storage=dtypes.StorageType.GPU_Global,
                        transient=True,
                        offset=[array.offset[d] for d in actual_dims])
                else:
                    sdfg.add_array(
                        name=cloned_name,
                        shape=[full_shape[d] for d in actual_dims],
                        dtype=array.dtype,
                        transient=True,
                        storage=dtypes.StorageType.GPU_Global,
                        allow_conflicts=array.allow_conflicts,
                        strides=[array.strides[d] for d in actual_dims],
                        offset=[array.offset[d] for d in actual_dims],
                    )
                cloned_arrays[array_node.data] = cloned_name
            cloned_node = type(array_node)(cloned_name)
            # Unlike input clones, output clones are flagged with setzero
            cloned_node.setzero = True

            out_cloned_arraynodes[array_node.data] = cloned_node

        # Third, connect the cloned arrays to the originals
        for array_name, node in in_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in graph.in_edges(cnode):
                if edge.data.data == array_name:
                    # Memlet for the new edge clone -> cnode
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        # Re-base the subset so it starts at zero; dimensions
                        # that collapse to a single index are recorded in
                        # ``lost_dims`` and dropped from the new subset.
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # NOTE(review): newsubset[ind] is still None
                                # at this point, so this subtraction would
                                # raise TypeError if the branch is reached -
                                # confirm whether non-tuple elements occur.
                                newsubset[ind] -= r[0]
                        if len(lost_dims) == len(edge.data.subset):
                            # All dimensions collapsed: keep the last one
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   newmemlet)

                    # Rewrite memlets of the same array that are reached from
                    # the node, so that they refer to the clone.
                    for e in graph.bfs_edges(edge.dst, reverse=False):
                        parent, _, _child, _, memlet = e
                        # Presumably stops once the traversal leaves the
                        # scope of edge.dst - verify against in_scope()
                        if parent != edge.dst and not in_scope(
                                graph, parent, edge.dst):
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[-1].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.ExitNode, forward=True):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            # Shift by the offsets computed above and drop
                            # the collapsed dimensions
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    # Replace the original edge by original -> clone, reusing
                    # the original memlet object
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   edge.data)
                    graph.remove_edge(edge)

        for array_name, node in out_cloned_arraynodes.items():
            graph.add_node(node)
            is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar)
            for edge in all_out_edges:
                if edge.data.data == array_name:
                    # Memlet for the new edge exit node -> clone
                    newmemlet = copy.deepcopy(edge.data)
                    newmemlet.data = node.data

                    if is_scalar:
                        newmemlet.subset = sbs.Indices([0])
                    else:
                        # Re-base the subset to start at zero, as for inputs
                        offset = []
                        lost_dims = []
                        lost_ranges = []
                        newsubset = [None] * len(edge.data.subset)
                        for ind, r in enumerate(edge.data.subset):
                            offset.append(r[0])
                            if isinstance(edge.data.subset[ind], tuple):
                                begin = edge.data.subset[ind][0] - r[0]
                                end = edge.data.subset[ind][1] - r[0]
                                step = edge.data.subset[ind][2]
                                if begin == end:
                                    lost_dims.append(ind)
                                    lost_ranges.append((begin, end, step))
                                else:
                                    newsubset[ind] = (begin, end, step)
                            else:
                                # NOTE(review): newsubset[ind] is None here;
                                # this would raise TypeError if reached -
                                # confirm.
                                newsubset[ind] -= r[0]
                        if len(lost_dims) == len(edge.data.subset):
                            lost_dims.pop()
                            newmemlet.subset = type(
                                edge.data.subset)([lost_ranges[-1]])
                        else:
                            newmemlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])

                    graph.add_edge(edge.src, edge.src_conn, node, None,
                                   newmemlet)

                    # Walk backwards from the exit node and rewrite memlets
                    # of the same array, stopping at the enclosing entry node
                    end_node = graph.entry_node(edge.src)
                    for e in graph.bfs_edges(edge.src, reverse=True):
                        parent, _, _child, _, memlet = e
                        if parent == end_node:
                            break
                        if memlet.data != edge.data.data:
                            continue
                        path = graph.memlet_path(e)
                        if not isinstance(path[0].dst, nodes.CodeNode):
                            if in_path(path, e, nodes.EntryNode,
                                       forward=False):
                                if isinstance(parent, nodes.CodeNode):
                                    # Output edge
                                    break
                                else:
                                    continue
                        if is_scalar:
                            memlet.subset = sbs.Indices([0])
                        else:
                            newsubset = [None] * len(memlet.subset)
                            for ind, r in enumerate(memlet.subset):
                                if ind in lost_dims:
                                    continue
                                if isinstance(memlet.subset[ind], tuple):
                                    begin = r[0] - offset[ind]
                                    end = r[1] - offset[ind]
                                    step = r[2]
                                    newsubset[ind] = (begin, end, step)
                                else:
                                    newsubset[ind] = (
                                        r - offset[ind],
                                        r - offset[ind],
                                        1,
                                    )
                            memlet.subset = type(edge.data.subset)(
                                [r for r in newsubset if r is not None])
                        memlet.data = node.data

                    # The clone -> original edge carries no write-conflict
                    # resolution
                    edge.data.wcr = None
                    if self.fullcopy:
                        edge.data.subset = sbs.Range.from_array(
                            node.desc(sdfg))
                    edge.data.other_subset = newmemlet.subset
                    graph.add_edge(node, None, edge.dst, edge.dst_conn,
                                   edge.data)
                    graph.remove_edge(edge)

        # Fourth, replace memlet arrays as necessary
        if self.expr_index == 0:
            scope_subgraph = graph.scope_subgraph(cnode)
            for edge in scope_subgraph.edges():
                if edge.data.data is not None and edge.data.data in cloned_arrays:
                    edge.data.data = cloned_arrays[edge.data.data]
class Vectorization(transformation.Transformation):
    """ Implements the vectorization transformation.

        Vectorization matches when all the input and output memlets of a
        tasklet inside a map access the inner-most loop variable in their last
        dimension. The transformation changes the step of the inner-most loop
        to be equal to the length of the vector and vectorizes the memlets.
    """

    vector_len = Property(desc="Vector length", dtype=int, default=4)
    propagate_parent = Property(desc="Propagate vector length through "
                                "parent SDFGs",
                                dtype=bool,
                                default=False)
    strided_map = Property(desc="Use strided map range (jump by vector length)"
                           " instead of modifying memlets",
                           dtype=bool,
                           default=True)
    preamble = Property(
        dtype=bool,
        default=None,
        allow_none=True,
        desc='Force creation or skipping a preamble map without vectors')
    postamble = Property(
        dtype=bool,
        default=None,
        allow_none=True,
        desc='Force creation or skipping a postamble map without vectors')

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the pattern to match: a single map entry node. """
        return [sdutil.node_path_graph(Vectorization._map_entry)]

    def can_be_applied(self,
                       graph: SDFGState,
                       candidate,
                       expr_index,
                       sdfg,
                       strict=False):
        """ Checks whether the matched map can be vectorized.

            The map scope must contain exactly one node, a tasklet, and at
            least one non-empty, non-stream memlet adjacent to it must use the
            last map parameter in a dimension whose stride is 1. Any use of
            the parameter in a dimension with another stride rejects the
            match, as does an already vectorized or pointer connector.

            :return: True if the transformation can be applied.
        """
        map_entry = graph.nodes()[candidate[Vectorization._map_entry]]

        # Only accept scopes that have one internal tasklet
        scope = graph.scope_subgraph(map_entry, False, False)
        if len(scope.nodes()) != 1:
            return False
        tasklet = scope.nodes()[0]
        if not isinstance(tasklet, nodes.Tasklet):
            return False

        param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])
        found = False

        # Strided maps cannot be vectorized
        if map_entry.map.range[-1][2] != 1 and self.strided_map:
            return False

        # Check if all edges, adjacent to the tasklet,
        # use the parameter in their contiguous dimension.
        for e, conntype in graph.all_edges_and_connectors(tasklet):

            # Cases that do not matter for vectorization
            if e.data.data is None:  # Empty memlets
                continue
            if isinstance(sdfg.arrays[e.data.data], data.Stream):  # Streams
                continue

            # Vectorization can not be applied in WCR
            # if e.data.wcr is not None:
            #     return False

            subset = e.data.subset
            array = sdfg.arrays[e.data.data]

            # If already vectorized or a pointer, do not apply
            if isinstance(conntype, (dtypes.vector, dtypes.pointer)):
                return False

            try:
                for idx, expr in enumerate(subset):
                    if isinstance(expr, tuple):
                        for ex in expr:
                            ex = symbolic.pystr_to_symbolic(ex)
                            symbols = ex.free_symbols
                            if param in symbols:
                                if array.strides[idx] == 1:
                                    found = True
                                else:
                                    return False
                    else:
                        expr = symbolic.pystr_to_symbolic(expr)
                        symbols = expr.free_symbols
                        if param in symbols:
                            if array.strides[idx] == 1:
                                found = True
                            else:
                                return False
            except TypeError:  # cannot determine truth value of Relational
                return False

        return found

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the string form of the matched candidate entry. """
        map_entry = candidate[Vectorization._map_entry]
        return str(map_entry)

    def apply(self, sdfg: SDFG):
        """ Vectorizes the matched map and the memlets around its tasklet.

            Optionally replicates the scope into non-vectorized preamble and
            postamble maps, rewrites the last map dimension, changes the
            adjacent tasklet connectors to vector types and adapts the memlet
            subsets. If ``propagate_parent`` is set, the data descriptors are
            converted to vector types in this SDFG and its parent SDFGs.
        """
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]]
        tasklet: nodes.Tasklet = graph.successors(map_entry)[0]
        param = symbolic.pystr_to_symbolic(map_entry.map.params[-1])

        # Create new vector size.
        vector_size = self.vector_len
        dim_from, dim_to, dim_skip = map_entry.map.range[-1]

        # Determine whether to create preamble or postamble maps.
        # The explicit comparisons with True/False are deliberate: the
        # operands may be symbolic, and only a definite answer should count.
        if self.preamble is not None:
            create_preamble = self.preamble
        else:
            create_preamble = not ((dim_from % vector_size == 0) == True
                                   or dim_from == 0)
        if self.postamble is not None:
            create_postamble = self.postamble
        else:
            if isinstance(dim_to, symbolic.SymExpr):
                create_postamble = (((dim_to.approx + 1) %
                                     vector_size == 0) == False)
            else:
                create_postamble = (((dim_to + 1) % vector_size == 0) == False)

        # Determine new range for vectorized map
        if self.strided_map:
            new_range = [dim_from, dim_to - vector_size + 1, vector_size]
        else:
            new_range = [
                dim_from // vector_size, ((dim_to + 1) // vector_size) - 1,
                dim_skip
            ]

        # Create preamble non-vectorized map (replacing the original map)
        if create_preamble:
            old_scope = graph.scope_subgraph(map_entry, True, True)
            new_scope: ScopeSubgraphView = replicate_scope(
                sdfg, graph, old_scope)
            new_begin = dim_from + (vector_size - (dim_from % vector_size))
            map_entry.map.range[-1] = (dim_from, new_begin - 1, dim_skip)
            # Replace map_entry with the replicated scope (so that the preamble
            # will usually come first in topological sort)
            map_entry = new_scope.entry
            tasklet = new_scope.nodes()[old_scope.nodes().index(tasklet)]
            new_range[0] = new_begin

        # Create postamble non-vectorized map
        if create_postamble:
            new_scope: ScopeSubgraphView = replicate_scope(
                sdfg, graph, graph.scope_subgraph(map_entry, True, True))
            dim_to_ex = dim_to + 1
            new_scope.entry.map.range[-1] = (dim_to_ex -
                                             (dim_to_ex % vector_size), dim_to,
                                             dim_skip)

        # Change the step of the inner-most dimension.
        map_entry.map.range[-1] = tuple(new_range)

        # Vectorize connectors adjacent to the tasklet.
        for edge in graph.all_edges(tasklet):
            connectors = (tasklet.in_connectors
                          if edge.dst == tasklet else tasklet.out_connectors)
            conn = edge.dst_conn if edge.dst == tasklet else edge.src_conn

            if edge.data.data is None:  # Empty memlets
                continue
            desc = sdfg.arrays[edge.data.data]
            contigidx = desc.strides.index(1)

            newlist = []

            lastindex = edge.data.subset[contigidx]
            if isinstance(lastindex, tuple):
                newlist = [(rb, re, rs) for rb, re, rs in edge.data.subset]
                symbols = set()
                for indd in lastindex:
                    symbols.update(
                        symbolic.pystr_to_symbolic(indd).free_symbols)
            else:
                newlist = [(rb, rb, 1) for rb in edge.data.subset]
                symbols = symbolic.pystr_to_symbolic(lastindex).free_symbols

            oldtype = connectors[conn]
            if oldtype is None or oldtype.type is None:
                oldtype = desc.dtype

            # Vector to scalar WCR edge: change connector and continue
            lastedge = graph.memlet_path(edge)[-1]
            if (lastedge.data.subset.num_elements() == 1
                    and edge.data.wcr is not None):
                connectors[conn] = dtypes.vector(oldtype, vector_size)
                continue

            if str(param) not in map(str, symbols):
                continue

            # Vectorize connector, if not already vectorized
            if isinstance(oldtype, dtypes.vector):
                continue

            connectors[conn] = dtypes.vector(oldtype, vector_size)

            # Modify memlet subset to match vector length
            if self.strided_map:
                rb = newlist[contigidx][0]
                if self.propagate_parent:
                    newlist[contigidx] = (rb / self.vector_len,
                                          rb / self.vector_len, 1)
                else:
                    newlist[contigidx] = (rb, rb + self.vector_len - 1, 1)
            else:
                rb = newlist[contigidx][0]
                if self.propagate_parent:
                    newlist[contigidx] = (rb, rb, 1)
                else:
                    newlist[contigidx] = (self.vector_len * rb,
                                          self.vector_len * rb +
                                          self.vector_len - 1, 1)
            edge.data.subset = subsets.Range(newlist)
            edge.data.volume = vector_size

        # Vector length propagation using data descriptors, recursive traversal
        # outwards
        if self.propagate_parent:
            for edge in graph.all_edges(tasklet):
                # Empty memlets reference no data container, so there is
                # nothing to retype (and ``arrays[None]`` would raise).
                if edge.data.data is None:
                    continue

                cursdfg = sdfg
                curedge = edge
                while cursdfg is not None:
                    arrname = curedge.data.data
                    dtype = cursdfg.arrays[arrname].dtype

                    # Change type and shape to vector
                    if not isinstance(dtype, dtypes.vector):
                        cursdfg.arrays[arrname].dtype = dtypes.vector(
                            dtype, vector_size)
                        new_shape = list(cursdfg.arrays[arrname].shape)
                        contigidx = cursdfg.arrays[arrname].strides.index(1)
                        new_shape[contigidx] /= vector_size
                        # Keep the quotient as-is if it cannot be converted
                        # to a Python int (e.g., symbolic sizes).
                        try:
                            new_shape[contigidx] = int(new_shape[contigidx])
                        except TypeError:
                            pass
                        cursdfg.arrays[arrname].shape = new_shape

                    propagation.propagate_memlets_sdfg(cursdfg)

                    # Find matching edge in parent
                    nsdfg = cursdfg.parent_nsdfg_node
                    if nsdfg is None:
                        break
                    tstate = cursdfg.parent
                    curedge = ([
                        e for e in tstate.in_edges(nsdfg)
                        if e.dst_conn == arrname
                    ] + [
                        e for e in tstate.out_edges(nsdfg)
                        if e.src_conn == arrname
                    ])[0]
                    cursdfg = cursdfg.parent_sdfg
class TrivialMapElimination(transformation.Transformation):
    """ Implements the Trivial-Map Elimination pattern.

        Trivial-Map Elimination removes all dimensions containing only one
        element from a map. If this applies to all ranges the map is removed.
        Example: Map[i=0:I,j=7] -> Map[i=0:I]
        Example: Map[i=0  ,j=7] -> nothing
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the pattern to match: a single map entry node. """
        return [sdutil.node_path_graph(TrivialMapElimination._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False):
        """ Matches if at least one map dimension has equal begin and end. """
        map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]]
        return any(r[0] == r[1] for r in map_entry.map.range)

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the map label followed by its parameter list. """
        map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        """ Removes all single-element dimensions from the matched map.

            Each removed parameter is replaced within the map scope by the
            single value it takes. If no dimension remains, the map entry and
            exit nodes are removed and their edges are reconnected around
            them.
        """
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[
            TrivialMapElimination._map_entry]]
        map_exit = graph.exit_node(map_entry)

        remaining_ranges = []
        remaining_params = []
        for map_param, ranges in zip(map_entry.map.params,
                                     map_entry.map.range.ranges):
            map_from, map_to, _ = ranges
            if map_from == map_to:
                # Replace the map index variable with the value it obtained
                scope = graph.scope_subgraph(map_entry)
                scope.replace(map_param, map_from)
            else:
                remaining_ranges.append(ranges)
                remaining_params.append(map_param)

        map_entry.map.range.ranges = remaining_ranges
        map_entry.map.params = remaining_params

        if len(remaining_ranges) == 0:
            # Redirect map entry's out edges
            for edge in graph.out_edges(map_entry):
                path = graph.memlet_path(edge)
                index = path.index(edge)

                # Add an edge directly from the previous source connector to
                # the destination. If the edge is the first one on its path
                # there is no previous edge; ``path[index - 1]`` would wrap
                # around to the *last* edge of the path and create a bogus
                # connection, so such edges are dropped with the map entry.
                if index > 0:
                    graph.add_edge(path[index - 1].src,
                                   path[index - 1].src_conn, edge.dst,
                                   edge.dst_conn, edge.data)

            # Redirect map exit's in edges.
            for edge in graph.in_edges(map_exit):
                path = graph.memlet_path(edge)
                index = path.index(edge)

                # Add an edge directly from the source to the next
                # destination connector
                if len(path) > index + 1:
                    graph.add_edge(edge.src, edge.src_conn,
                                   path[index + 1].dst,
                                   path[index + 1].dst_conn, edge.data)

            # Remove map
            graph.remove_nodes_from([map_entry, map_exit])
def apply(self, sdfg: dace.SDFG):
    """ Splits the matched multi-dimensional map into nested 1D maps.

        The matched map keeps only its first parameter; every further
        parameter gets its own map with a sequential schedule, nested inside
        the previous one. Edges through the original entry/exit are re-routed
        through the new entries/exits, and dynamic map range inputs are
        forwarded to every map whose range mentions them.

        :param sdfg: The SDFG to apply the transformation on.
        :return: A list with the original map entry followed by the entries
                 of the newly created maps, outermost first.
        :raise ValueError: If the scope of the innermost new map cannot be
                           found in the state.
    """
    state = sdfg.node(self.state_id)
    outer_entry = self.map_entry(sdfg)
    outer_exit = state.exit_node(outer_entry)
    outer_map = outer_entry.map

    # Build one sequential, single-parameter map for each trailing dimension
    inner_maps = []
    for param, param_range in zip(outer_map.params[1:], outer_map.range[1:]):
        inner_maps.append(
            nodes.Map(outer_map.label + '_' + str(param), [param],
                      subsets.Range([param_range]),
                      schedule=dtypes.ScheduleType.Sequential))

    # The original map retains only its first dimension
    outer_map.params = [outer_map.params[0]]
    outer_map.range = subsets.Range([outer_map.range[0]])

    # Scope nodes of the new maps
    entries = list(map(nodes.MapEntry, inner_maps))
    exits = list(map(nodes.MapExit, inner_maps))
    map_chain = [outer_entry] + entries

    # Re-route edges according to the following rules:
    # 1. If there are no edges coming from the outside, use empty memlets
    # 2. Edges with IN_* connectors replicate along the maps
    # 3. Edges for dynamic map ranges replicate until reaching range(s)
    for inner_edge in state.out_edges(outer_entry):
        state.remove_edge(inner_edge)
        state.add_memlet_path(outer_entry,
                              *entries,
                              inner_edge.dst,
                              src_conn=inner_edge.src_conn,
                              memlet=inner_edge.data,
                              dst_conn=inner_edge.dst_conn)

    # Dynamic map ranges: drop the old edge and connector, then forward the
    # input to each map (through all maps enclosing it) that uses the symbol
    for dyn_edge in dace.sdfg.dynamic_map_inputs(state, outer_entry):
        state.remove_edge(dyn_edge)
        dyn_edge.dst.remove_in_connector(dyn_edge.dst_conn)

        for depth, mapnode in enumerate(map_chain, start=1):
            uses_symbol = any(
                dyn_edge.dst_conn in map(str, symbolic.symlist(rng))
                for rng in mapnode.map.range)
            if uses_symbol:
                state.add_memlet_path(dyn_edge.src,
                                      *map_chain[:depth],
                                      memlet=dyn_edge.data,
                                      src_conn=dyn_edge.src_conn,
                                      dst_conn=dyn_edge.dst_conn)

    # Route outputs through the new exits, innermost first
    for outgoing in state.in_edges(outer_exit):
        state.remove_edge(outgoing)
        state.add_memlet_path(outgoing.src,
                              *reversed(exits),
                              outer_exit,
                              memlet=outgoing.data,
                              src_conn=outgoing.src_conn,
                              dst_conn=outgoing.dst_conn)

    # Walk up from the scope leaves until the innermost new map is found
    from dace.sdfg.scope import ScopeTree
    scope = None
    pending: List[ScopeTree] = state.scope_leaves()
    while pending:
        current = pending.pop()
        if current.entry == entries[-1]:
            scope = current
            break
        if current.parent is not None:
            pending.append(current.parent)
    else:
        raise ValueError('Cannot find scope in state')

    consolidate_edges(sdfg, scope)

    return map_chain
class MPITransformMap(transformation.Transformation):
    r""" Implements the MPI parallelization pattern.

        Takes a map and makes it an MPI-scheduled map, introduces transients
        that keep locally accessed data.

        Original SDFG
        =============
        ```
        Input1 -                                            Output1
                \                                          /
        Input2 --- MapEntry -- Arbitrary R  -- MapExit -- Output2
                /                                          \
        InputN -                                            OutputN
        ```

        Nothing in R may access other inputs/outputs that are not defined in R
        itself and do not go through MapEntry/MapExit
        Map must be a one-dimensional map for now.
        The range of the map must be a Range object.

        Output:
        =======

        * Add transients for the accessed parts
        * The schedule property of Map is set to MPI
        * The range of Map is changed to
          var = startexpr + p * chunksize ... startexpr + (p + 1) * chunksize
          where p is the current rank and P is the total number of ranks,
          and chunksize is defined as (endexpr - startexpr) / P, adding the
          remaining K iterations to the first K procs.
        * For each input InputI, create a new transient transInputI, which
          has an attribute that specifies that it needs to be filled with
          (possibly) remote data
        * Collect all accesses to InputI within R, assume their convex hull is
          InputI[rs ... re]
        * The transInputI transient will contain InputI[rs ... re]
        * Change all accesses to InputI within R to accesses to transInputI
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def annotates_memlets():
        """ Returns True. """
        return True

    @staticmethod
    def expressions():
        """ Returns the pattern to match: a single map entry node. """
        return [sdutil.node_path_graph(MPITransformMap._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Matches one-dimensional, non-MPI maps without dynamic inputs.

            The map is rejected if it is already MPI-scheduled, if any
            enclosing scope has a schedule other than Default or Sequential,
            if it has dynamic map inputs, or if any out-edge of its exit node
            has a write-conflict resolution.
        """
        map_entry = graph.nodes()[candidate[MPITransformMap._map_entry]]

        # Check if the map is one-dimensional
        if map_entry.map.range.dims() != 1:
            return False

        # We cannot transform a map which is already of schedule type MPI
        if map_entry.map.schedule == dtypes.ScheduleType.MPI:
            return False

        # We cannot transform a map which is already inside a MPI map, or in
        # another device
        schedule_whitelist = [
            dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential
        ]
        sdict = graph.scope_dict()
        parent = sdict[map_entry]
        while parent is not None:
            if parent.map.schedule not in schedule_whitelist:
                return False
            parent = sdict[parent]

        # Dynamic map ranges not supported (will allocate dynamic memory)
        if has_dynamic_map_inputs(graph, map_entry):
            return False

        # MPI schedules currently do not support WCR
        map_exit = graph.exit_node(map_entry)
        if any(e.data.wcr for e in graph.out_edges(map_exit)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the label of the matched map. """
        map_entry = graph.nodes()[candidate[MPITransformMap._map_entry]]
        return map_entry.map.label

    def apply(self, sdfg):
        """ Strip-mines the map over MPI ranks and adds local storage.

            The map is strip-mined with a tile size of
            ``(<number of elements>/__dace_comm_size)`` (marked as dividing
            evenly), the resulting outer map is given the MPI schedule, and
            LocalStorage is applied for every edge between the outer and
            inner map entries as well as for every output of the outer map
            exit.
        """
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[MPITransformMap._map_entry]]

        # Avoiding import loops
        from dace.transformation.dataflow.strip_mining import StripMining
        from dace.transformation.dataflow.local_storage import LocalStorage

        rangeexpr = str(map_entry.map.range.num_elements())

        stripmine_subgraph = {
            StripMining._map_entry: self.subgraph[MPITransformMap._map_entry]
        }
        sdfg_id = sdfg.sdfg_id
        stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph,
                                self.expr_index)
        stripmine.dim_idx = -1
        stripmine.new_dim_prefix = "mpi"
        stripmine.tile_size = "(" + rangeexpr + "/__dace_comm_size)"
        stripmine.divides_evenly = True
        stripmine.apply(sdfg)

        # Find all in-edges that lead to candidate[MPITransformMap._map_entry]
        edges = [
            e for e in graph.in_edges(map_entry)
            if isinstance(e.src, nodes.EntryNode)
        ]

        # The source of these edges is the outer map created by strip-mining
        outer_map = edges[0].src

        # Add MPI schedule attribute to outer map
        outer_map.map._schedule = dtypes.ScheduleType.MPI

        # Now create a transient for each array
        for e in edges:
            in_local_storage_subgraph = {
                LocalStorage.node_a: graph.node_id(outer_map),
                LocalStorage.node_b: self.subgraph[MPITransformMap._map_entry]
            }
            sdfg_id = sdfg.sdfg_id
            in_local_storage = LocalStorage(sdfg_id, self.state_id,
                                            in_local_storage_subgraph,
                                            self.expr_index)
            in_local_storage.array = e.data.data
            in_local_storage.apply(sdfg)

        # Transform OutLocalStorage for each output of the MPI map
        in_map_exit = graph.exit_node(map_entry)
        out_map_exit = graph.exit_node(outer_map)

        for e in graph.out_edges(out_map_exit):
            name = e.data.data
            outlocalstorage_subgraph = {
                LocalStorage.node_a: graph.node_id(in_map_exit),
                LocalStorage.node_b: graph.node_id(out_map_exit)
            }
            sdfg_id = sdfg.sdfg_id
            outlocalstorage = LocalStorage(sdfg_id, self.state_id,
                                           outlocalstorage_subgraph,
                                           self.expr_index)
            outlocalstorage.array = name
            outlocalstorage.apply(sdfg)
class MapUnroll(transformation.Transformation):
    """
    Unrolls a map with constant ranges in the top-level scope of an SDFG by
    replicating its subgraph for each iteration. If there are local data
    containers only used in this map, they will also be replicated, as will
    nested SDFGs found within.

    This transformation can be useful for forming weakly connected components
    that will be inferred as processing elements in an FPGA kernel.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the pattern to match: a single map entry node. """
        return [sdutil.node_path_graph(MapUnroll._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Matches top-level maps whose ranges evaluate to constants. """
        map_entry = graph.nodes()[candidate[MapUnroll._map_entry]]
        # Must be top-level map
        if graph.scope_dict()[map_entry] is not None:
            return False
        # All map ranges must be constant
        try:
            for begin, end, step in map_entry.map.range:
                symbolic.evaluate(begin, sdfg.constants)
                symbolic.evaluate(end, sdfg.constants)
                symbolic.evaluate(step, sdfg.constants)
        except TypeError:
            return False
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the map label followed by its parameter list. """
        map_entry = graph.nodes()[candidate[MapUnroll._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        """ Replicates the map's connected component once per iteration.

            For every index tuple of the (constant) map ranges, all nodes and
            edges of the weakly connected component containing the map are
            copied, the copied map is restricted to that single iteration and
            then removed through TrivialMapElimination. Local memories and
            nested SDFGs are duplicated with a per-iteration name suffix.
            Finally, the original component and the original local memories
            are removed.
        """
        from dace.transformation.dataflow import TrivialMapElimination

        state = sdfg.nodes()[self.state_id]
        map_entry = state.nodes()[self.subgraph[MapUnroll._map_entry]]

        # Collect all nodes in this weakly connected component
        subgraph = sdutil.weakly_connected_component(state, map_entry)

        # Save nested SDFGs to JSON, then deserialize them for every copy we
        # need to make
        nested_sdfgs = {}
        for node in subgraph:
            if isinstance(node, nodes.NestedSDFG):
                nested_sdfgs[node.sdfg] = node.sdfg.to_json()

        # Check for local memories that need to be replicated
        local_memories = [
            name for name in sdutil.local_transients(
                sdfg, subgraph, entry_node=map_entry, include_nested=True)
            if not isinstance(sdfg.arrays[name], dt.Stream)
            and not isinstance(sdfg.arrays[name], dt.View)
        ]

        ranges = map_entry.map.range.ranges
        constant_ranges = []
        for r in ranges:
            begin = symbolic.evaluate(r[0], sdfg.constants)
            end = symbolic.evaluate(r[1], sdfg.constants)
            step = symbolic.evaluate(r[2], sdfg.constants)
            # Map ranges include their end point whereas Python ranges do
            # not. Move the bound by one unit in the iteration direction
            # (rather than by a whole step, which would add an out-of-range
            # iteration whenever the extent is not a multiple of the step).
            end += 1 if step > 0 else -1
            constant_ranges.append(range(begin, end, step))
        index_tuples = itertools.product(*constant_ranges)
        for t in index_tuples:
            suffix = "_" + "_".join(map(str, t))
            node_to_unrolled = {}
            # Copy all nodes
            for node in subgraph:
                if isinstance(node, nodes.NestedSDFG):
                    # Avoid deep-copying the nested SDFG
                    nsdfg = node.sdfg
                    # Don't copy the nested SDFG, as we will do this separately
                    node.sdfg = None
                    unrolled_node = copy.deepcopy(node)
                    node.sdfg = nsdfg
                    # Deserialize into a new SDFG specific to this copy
                    nsdfg_json = nested_sdfgs[nsdfg]
                    name = nsdfg_json["attributes"]["name"]
                    nsdfg_json["attributes"]["name"] += suffix
                    unrolled_nsdfg = SDFG.from_json(nsdfg_json)
                    nsdfg_json["attributes"]["name"] = name  # Reinstate
                    # Set all the references
                    unrolled_nsdfg.parent = state
                    unrolled_nsdfg.parent_sdfg = sdfg
                    unrolled_nsdfg.update_sdfg_list([])
                    unrolled_node.sdfg = unrolled_nsdfg
                    unrolled_nsdfg.parent_nsdfg_node = unrolled_node
                else:
                    unrolled_node = copy.deepcopy(node)
                    if node == map_entry:
                        # Fix the map bounds to only this iteration
                        unrolled_node.map.range = [(i, i, 1) for i in t]
                    if (isinstance(node, nodes.AccessNode)
                            and node.data in local_memories):
                        # If this is a local memory only used in this subgraph,
                        # we need to replicate it for each new subgraph
                        unrolled_name = node.data + suffix
                        if unrolled_name not in sdfg.arrays:
                            unrolled_desc = copy.deepcopy(
                                sdfg.arrays[node.data])
                            sdfg.add_datadesc(unrolled_name, unrolled_desc)
                        unrolled_node.data = unrolled_name
                state.add_node(unrolled_node)
                node_to_unrolled[node] = unrolled_node  # Remember mapping
            # Copy all edges
            for src, src_conn, dst, dst_conn, memlet in subgraph.edges():
                src = node_to_unrolled[src]
                dst = node_to_unrolled[dst]
                memlet = copy.deepcopy(memlet)
                if memlet.data in local_memories:
                    memlet.data = memlet.data + suffix
                state.add_edge(src, src_conn, dst, dst_conn, memlet)
            # Eliminate the now trivial map
            TrivialMapElimination.apply_to(
                sdfg,
                verify=False,
                annotate=False,
                save=False,
                _map_entry=node_to_unrolled[map_entry])

        # Now we can delete the original subgraph. This implicitly also removes
        # memlets between nodes
        state.remove_nodes_from(subgraph)

        # If we added a bunch of new nested SDFGs, reset the internal list
        if len(nested_sdfgs) > 0:
            sdfg.reset_sdfg_list()

        # Remove local memories that were replicated
        for mem in local_memories:
            sdfg.remove_data(mem)
class AccumulateTransient(pattern_matching.Transformation):
    """ Implements the AccumulateTransient transformation, which adds a
        transient data node between two nested map exits for an array that is
        written with write-conflict resolution (WCR). The transient data node
        then acts as a local accumulator.
    """

    _tasklet = nodes.Tasklet('_')
    _map_exit = nodes.MapExit(nodes.Map("", [], []))
    _outer_map_exit = nodes.MapExit(nodes.Map("", [], []))

    array = Property(
        dtype=str,
        desc="Array to create local storage for (if empty, first available)",
        default=None,
        allow_none=True)

    @staticmethod
    def expressions():
        """ Returns the pattern: tasklet -> map exit -> outer map exit. """
        return [
            sdutil.node_path_graph(AccumulateTransient._tasklet,
                                   AccumulateTransient._map_exit,
                                   AccumulateTransient._outer_map_exit)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Matches if the tasklet has a WCR edge into the inner map exit. """
        tasklet = graph.nodes()[candidate[AccumulateTransient._tasklet]]
        map_exit = graph.nodes()[candidate[AccumulateTransient._map_exit]]

        # Check if there is an accumulation output
        for _src, _, dest, _, memlet in graph.out_edges(tasklet):
            if memlet.wcr is not None and dest == map_exit:
                return True

        return False

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns the matched candidate entries joined by ' -> '. """
        tasklet = candidate[AccumulateTransient._tasklet]
        map_exit = candidate[AccumulateTransient._map_exit]
        outer_map_exit = candidate[AccumulateTransient._outer_map_exit]

        return ' -> '.join(
            str(node) for node in [tasklet, map_exit, outer_map_exit])

    def apply(self, sdfg):
        """ Applies LocalStorage between the two map exits for the array.

            If no array was chosen, the first WCR edge between the inner and
            outer map exit determines it. If the reduction is a sum, the new
            transient is marked to be zero-initialized; otherwise a warning
            is issued because the transient is left uninitialized.
        """
        graph = sdfg.node(self.state_id)

        # Choose array
        array = self.array
        if array is None or len(array) == 0:
            map_exit = graph.node(self.subgraph[AccumulateTransient._map_exit])
            outer_map_exit = graph.node(
                self.subgraph[AccumulateTransient._outer_map_exit])
            array = next(e.data.data
                         for e in graph.edges_between(map_exit, outer_map_exit)
                         if e.data.wcr is not None)

        # Avoid import loop
        from dace.transformation.dataflow.local_storage import LocalStorage

        local_storage_subgraph = {
            LocalStorage._node_a:
            self.subgraph[AccumulateTransient._map_exit],
            LocalStorage._node_b:
            self.subgraph[AccumulateTransient._outer_map_exit]
        }
        sdfg_id = sdfg.sdfg_list.index(sdfg)
        in_local_storage = LocalStorage(sdfg_id, self.state_id,
                                        local_storage_subgraph,
                                        self.expr_index)
        in_local_storage.array = array
        in_local_storage.apply(sdfg)

        # Initialize transient to zero in case of summation
        # TODO: Initialize transient in other WCR types
        memlet = graph.in_edges(in_local_storage._data_node)[0].data
        if detect_reduction_type(memlet.wcr) == dtypes.ReductionType.Sum:
            in_local_storage._data_node.setzero = True
        else:
            # The two literals are concatenated, so the separating space
            # must be part of the first one.
            warnings.warn('AccumulateTransient did not properly initialize '
                          'newly-created transient!')
def fuse(self, sdfg, graph, map_entries, do_not_override=None, **kwargs):
    """ takes the map_entries specified and tries to fuse maps.

        all maps have to be extended into outer and inner map
        (use MapExpansion as a pre-pass)

        Arrays that don't exist outside the subgraph get pushed
        into the map and their data dimension gets cropped.
        Otherwise the original array is taken.

        For every output respective connections are created automatically.

        The entry node of the fused map is stored in
        ``self._global_map_entry``; nothing is returned.

        :param sdfg: SDFG
        :param graph: State
        :param map_entries: Map Entries (class MapEntry) of the outer maps
                            which we want to fuse
        :param do_not_override: List of data names whose corresponding nodes
                                are fully contained within the subgraph
                                but should not be augmented/transformed
                                nevertheless.
    """

    # if there are no maps, return immediately
    if len(map_entries) == 0:
        return

    do_not_override = do_not_override or []

    # get maps and map exits
    maps = [map_entry.map for map_entry in map_entries]
    map_exits = [graph.exit_node(map_entry) for map_entry in map_entries]

    # See function documentation for an explanation of these variables
    node_config = SubgraphFusion.get_adjacent_nodes(sdfg, graph, map_entries)
    (in_nodes, intermediate_nodes, out_nodes) = node_config

    if self.debug:
        print("SubgraphFusion::In_nodes", in_nodes)
        print("SubgraphFusion::Out_nodes", out_nodes)
        print("SubgraphFusion::Intermediate_nodes", intermediate_nodes)

    # all maps are assumed to have the same params and range in order
    global_map = nodes.Map(label="outer_fused",
                           params=maps[0].params,
                           ndrange=maps[0].range)
    global_map_entry = nodes.MapEntry(global_map)
    global_map_exit = nodes.MapExit(global_map)

    schedule = map_entries[0].schedule
    global_map_entry.schedule = schedule
    graph.add_node(global_map_entry)
    graph.add_node(global_map_exit)

    # next up, for any intermediate node, find whether it only appears
    # in the subgraph or also somewhere else / as an input
    # create new transients for nodes that are in out_nodes and
    # intermediate_nodes simultaneously
    # also check which dimensions of each transient data element correspond
    # to map axes and write this information into a dict.
    node_info = self.prepare_intermediate_nodes(sdfg, graph, in_nodes,
                                                out_nodes,
                                                intermediate_nodes,
                                                map_entries, map_exits,
                                                do_not_override)

    (subgraph_contains_data, transients_created,
     invariant_dimensions) = node_info
    if self.debug:
        print(
            "SubgraphFusion:: {Intermediate_node: subgraph_contains_data} dict"
        )
        print(subgraph_contains_data)

    inconnectors_dict = {}
    # Dict for saving incoming nodes and their assigned connectors
    # Format: {access_node: (edge, in_conn, out_conn)}

    for map_entry, map_exit in zip(map_entries, map_exits):
        # handle inputs
        # TODO: dynamic map range -- this is fairly unrealistic in such a setting
        for edge in graph.in_edges(map_entry):
            src = edge.src
            mmt = graph.memlet_tree(edge)
            out_edges = [child.edge for child in mmt.root().children]

            if src in in_nodes:
                in_conn = None
                out_conn = None
                if src in inconnectors_dict:
                    # no need to augment subset of outer edge.
                    # will do this at the end in one pass.

                    in_conn = inconnectors_dict[src][1]
                    out_conn = inconnectors_dict[src][2]

                else:
                    next_conn = global_map_entry.next_connector()
                    in_conn = 'IN_' + next_conn
                    out_conn = 'OUT_' + next_conn
                    global_map_entry.add_in_connector(in_conn)
                    global_map_entry.add_out_connector(out_conn)

                    inconnectors_dict[src] = (edge, in_conn, out_conn)

                    # reroute in edge via global_map_entry
                    self.copy_edge(graph,
                                   edge,
                                   new_dst=global_map_entry,
                                   new_dst_conn=in_conn)

                # map out edges to new map
                for out_edge in out_edges:
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=global_map_entry,
                                   new_src_conn=out_conn)

            else:
                # connect directly
                for out_edge in out_edges:
                    mm = dcpy(out_edge.data)
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=src,
                                   new_src_conn=None,
                                   new_data=mm)

        for edge in graph.out_edges(map_entry):
            # special case: for nodes that have no data connections
            if not edge.src_conn:
                self.copy_edge(graph, edge, new_src=global_map_entry)

        ######################################

        for edge in graph.in_edges(map_exit):
            if not edge.dst_conn:
                # no destination connector, path ends here.
                self.copy_edge(graph, edge, new_dst=global_map_exit)
                continue
            # find corresponding out_edges for current edge, cannot use mmt anymore
            # ('OUT_x'[3:] and 'IN_x'[2:] both yield '_x')
            out_edges = [
                oedge for oedge in graph.out_edges(map_exit)
                if oedge.src_conn[3:] == edge.dst_conn[2:]
            ]

            # Tuple to store in/out connector port that might be created
            port_created = None

            for out_edge in out_edges:
                dst = out_edge.dst

                if dst in intermediate_nodes & out_nodes:

                    # create connection through global map from
                    # dst to dst_transient that was created
                    dst_transient = transients_created[dst]
                    next_conn = global_map_exit.next_connector()
                    in_conn = 'IN_' + next_conn
                    out_conn = 'OUT_' + next_conn
                    global_map_exit.add_in_connector(in_conn)
                    global_map_exit.add_out_connector(out_conn)

                    # for each transient created, create a union
                    # of outgoing memlets' subsets. this is
                    # a cheap fix to override assignments in invariant
                    # dimensions
                    union = None
                    for oe in graph.out_edges(transients_created[dst]):
                        union = subsets.union(union, oe.data.subset)
                    inner_memlet = dcpy(edge.data)
                    for i, s in enumerate(edge.data.subset):
                        if i in invariant_dimensions[dst.label]:
                            inner_memlet.subset[i] = union[i]

                    inner_memlet.other_subset = dcpy(inner_memlet.subset)

                    e_inner = graph.add_edge(dst, None, global_map_exit,
                                             in_conn, inner_memlet)
                    mm_outer = propagate_memlet(graph,
                                                inner_memlet,
                                                global_map_entry,
                                                union_inner_edges=False)

                    e_outer = graph.add_edge(global_map_exit, out_conn,
                                             dst_transient, None, mm_outer)

                    # remove edge from dst to dst_transient that was created
                    # in intermediate preparation.
                    for e in graph.out_edges(dst):
                        if e.dst == dst_transient:
                            graph.remove_edge(e)
                            break

                # handle separately: intermediate_nodes and pure out nodes
                # case 1: intermediate_nodes: can just redirect edge
                if dst in intermediate_nodes:
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=edge.src,
                                   new_src_conn=edge.src_conn,
                                   new_data=dcpy(edge.data))

                # case 2: pure out node: connect to outer array node
                if dst in (out_nodes - intermediate_nodes):
                    if edge.dst != global_map_exit:
                        next_conn = global_map_exit.next_connector()
                        in_conn = 'IN_' + next_conn
                        out_conn = 'OUT_' + next_conn
                        global_map_exit.add_in_connector(in_conn)
                        global_map_exit.add_out_connector(out_conn)
                        self.copy_edge(graph,
                                       edge,
                                       new_dst=global_map_exit,
                                       new_dst_conn=in_conn)
                        port_created = (in_conn, out_conn)

                    else:
                        # Reuse the connector pair stored above.
                        # ``port_created`` is a plain tuple, so it has to be
                        # unpacked rather than accessed by attribute.
                        in_conn, out_conn = port_created

                    # map
                    graph.add_edge(global_map_exit, out_conn, dst, None,
                                   dcpy(out_edge.data))

        # maps are now ready to be discarded
        # all connected edges will be finally removed as well
        graph.remove_node(map_entry)
        graph.remove_node(map_exit)

    # create a mapping from data arrays to offsets
    # for later memlet adjustments
    min_offsets = dict()

    # do one pass to augment all transient arrays
    data_intermediate = set([node.data for node in intermediate_nodes])
    for data_name in data_intermediate:
        if subgraph_contains_data[data_name]:
            all_nodes = [
                n for n in intermediate_nodes if n.data == data_name
            ]
            in_edges = list(chain(*(graph.in_edges(n) for n in all_nodes)))

            in_edges_iter = iter(in_edges)
            in_edge = next(in_edges_iter)
            target_subset = dcpy(in_edge.data.subset)
            target_subset.pop(invariant_dimensions[data_name])
            ######
            while True:
                try:  # executed if there are multiple in_edges
                    in_edge = next(in_edges_iter)
                    target_subset_curr = dcpy(in_edge.data.subset)
                    target_subset_curr.pop(invariant_dimensions[data_name])
                    target_subset = subsets.union(target_subset,
                                                  target_subset_curr)
                except StopIteration:
                    break

            min_offsets_cropped = target_subset.min_element_approx()
            # calculate the new transient array size.
            target_subset.offset(min_offsets_cropped, True)

            # re-add invariant dimensions with offset 0 and save to min_offsets
            min_offset = []
            index = 0
            for i in range(len(sdfg.data(data_name).shape)):
                if i in invariant_dimensions[data_name]:
                    min_offset.append(0)
                else:
                    min_offset.append(min_offsets_cropped[index])
                    index += 1

            min_offsets[data_name] = min_offset

            # determine the shape of the new array.
            new_data_shape = []
            index = 0
            for i, sz in enumerate(sdfg.data(data_name).shape):
                if i in invariant_dimensions[data_name]:
                    new_data_shape.append(sz)
                else:
                    new_data_shape.append(target_subset.size()[index])
                    index += 1

            new_data_strides = [
                data._prod(new_data_shape[i + 1:])
                for i in range(len(new_data_shape))
            ]

            new_data_totalsize = data._prod(new_data_shape)
            new_data_offset = [0] * len(new_data_shape)
            # augment.
            transient_to_transform = sdfg.data(data_name)
            transient_to_transform.shape = new_data_shape
            transient_to_transform.strides = new_data_strides
            transient_to_transform.total_size = new_data_totalsize
            transient_to_transform.offset = new_data_offset
            transient_to_transform.lifetime = dtypes.AllocationLifetime.Scope
            transient_to_transform.storage = self.transient_allocation

        else:
            # don't modify data container - array is needed outside
            # of subgraph.

            # hack: set lifetime to State if allocation has only been
            # scope so far to avoid allocation issues
            if sdfg.data(
                    data_name).lifetime == dtypes.AllocationLifetime.Scope:
                sdfg.data(
                    data_name).lifetime = dtypes.AllocationLifetime.State

    # do one pass to adjust the memlets of in-between transients
    for node in intermediate_nodes:
        # all incoming edges to node
        in_edges = graph.in_edges(node)
        # outgoing edges going to another fused part
        out_edges = graph.out_edges(node)

        # memlets of created transient:
        # correct data names
        if node in transients_created:
            transient_in_edges = graph.in_edges(transients_created[node])
            transient_out_edges = graph.out_edges(transients_created[node])
            for edge in chain(transient_in_edges, transient_out_edges):
                for e in graph.memlet_tree(edge):
                    if e.data.data == node.data:
                        e.data.data += '_OUT'

        # memlets of all in between transients:
        # offset memlets if array has been augmented
        if subgraph_contains_data[node.data]:
            # get min_offset
            min_offset = min_offsets[node.data]
            # re-add invariant dimensions with offset 0
            for iedge in in_edges:
                for edge in graph.memlet_tree(iedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)

                # nested SDFG: adjust arrays connected
                # NOTE(review): ``edge`` here is the last edge visited by
                # the memlet-tree loop above -- confirm this is intended.
                if isinstance(iedge.src, nodes.NestedSDFG):
                    nsdfg = iedge.src.sdfg
                    nested_data_name = edge.src_conn
                    self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                             nested_data_name)

            for cedge in out_edges:
                for edge in graph.memlet_tree(cedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)

                    # nested SDFG: adjust arrays connected
                    if isinstance(edge.dst, nodes.NestedSDFG):
                        nsdfg = edge.dst.sdfg
                        nested_data_name = edge.dst_conn
                        self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                                 nested_data_name)

            # if in_edges has several entries:
            # put other_subset into out_edges for correctness
            if len(in_edges) > 1:
                for oedge in out_edges:
                    if oedge.dst == global_map_exit and \
                            oedge.data.other_subset is None:
                        oedge.data.other_subset = dcpy(oedge.data.subset)
                        oedge.data.other_subset.offset(min_offset, True)

    # consolidate edges if desired
    if self.consolidate:
        consolidate_edges_scope(graph, global_map_entry)
        consolidate_edges_scope(graph, global_map_exit)

    # propagate edges adjacent to global map entry and exit
    # if desired
    if self.propagate:
        _propagate_node(graph, global_map_entry)
        _propagate_node(graph, global_map_exit)

    # create a hook for outside access to global_map
    self._global_map_entry = global_map_entry
    if self.schedule_innermaps is not None:
        for node in graph.scope_children()[global_map_entry]:
            if isinstance(node, nodes.MapEntry):
                node.map.schedule = self.schedule_innermaps
class StreamTransient(pattern_matching.Transformation):
    """ Implements the StreamTransient transformation, which adds a transient
        and stream nodes between nested maps that lead to a stream. The
        transient then acts as a local buffer.

        The matched pattern is the node path
        ``tasklet -> map exit -> outer map exit``.
    """

    with_buffer = Property(dtype=bool,
                           default=True,
                           desc="Use an intermediate buffer for accumulation")

    # Pattern nodes: used both to build the match expression and as keys
    # into the candidate / subgraph dictionaries.
    _tasklet = nodes.Tasklet('_')
    _map_exit = nodes.MapExit(nodes.Map("", [], []))
    _outer_map_exit = nodes.MapExit(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the single pattern graph this transformation matches. """
        return [
            sdutil.node_path_graph(StreamTransient._tasklet,
                                   StreamTransient._map_exit,
                                   StreamTransient._outer_map_exit)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Returns True iff some edge leaving the inner map exit carries
            stream data and ends at the outer map exit.
        """
        map_exit = graph.nodes()[candidate[StreamTransient._map_exit]]
        outer_map_exit = graph.nodes()[candidate[
            StreamTransient._outer_map_exit]]

        # Check if there is a streaming output
        for _src, _, dest, _, memlet in graph.out_edges(map_exit):
            if isinstance(sdfg.arrays[memlet.data],
                          data.Stream) and dest == outer_map_exit:
                return True

        return False

    @staticmethod
    def match_to_str(graph, candidate):
        """ Returns a readable ``a -> b -> c`` description of the match. """
        # The candidate maps pattern nodes to node *indices*; resolve them to
        # the actual graph nodes so the description shows the nodes rather
        # than bare integers (as the other transformations do).
        matched = [
            graph.nodes()[candidate[pattern_node]]
            for pattern_node in (StreamTransient._tasklet,
                                 StreamTransient._map_exit,
                                 StreamTransient._outer_map_exit)
        ]
        return ' -> '.join(str(node) for node in matched)

    def apply(self, sdfg: SDFG):
        """ Inserts a transient stream (and, if ``with_buffer`` is set, a
            transient array) between the inner and the outer map exit.

            :param sdfg: The SDFG that contains the matched state.
        """
        graph = sdfg.nodes()[self.state_id]
        tasklet = graph.nodes()[self.subgraph[StreamTransient._tasklet]]
        map_exit = graph.nodes()[self.subgraph[StreamTransient._map_exit]]
        outer_map_exit = graph.nodes()[self.subgraph[
            StreamTransient._outer_map_exit]]

        # Find the stream edge between the two map exits. If none matches,
        # ``memlet`` is left at the last inspected edge and ``edge`` stays
        # None (``can_be_applied`` checks that a matching edge exists).
        memlet = None
        edge = None
        for e in graph.out_edges(map_exit):
            memlet = e.data
            # TODO: What if there's more than one?
            if e.dst == outer_map_exit and isinstance(
                    sdfg.arrays[memlet.data], data.Stream):
                edge = e
                break

        # Find the tasklet output that writes to the same data container.
        tasklet_memlet = None
        for e in graph.out_edges(tasklet):
            tasklet_memlet = e.data
            if tasklet_memlet.data == memlet.data:
                break

        bbox = map_exit.map.range.bounding_box_size()
        bbox_approx = [symbolic.overapproximate(dim) for dim in bbox]
        dataname = memlet.data

        # Create the new node: Temporary stream and an access node
        newname, _ = sdfg.add_stream('trans_' + dataname,
                                     sdfg.arrays[memlet.data].dtype,
                                     1,
                                     bbox_approx[0], [1],
                                     transient=True,
                                     find_new_name=True)
        snode = graph.add_access(newname)

        to_stream_mm = copy.deepcopy(memlet)
        to_stream_mm.data = snode.data
        # The tasklet now writes into the new transient stream.
        tasklet_memlet.data = snode.data

        if self.with_buffer:
            newname_arr, _ = sdfg.add_transient('strans_' + dataname,
                                                [bbox_approx[0]],
                                                sdfg.arrays[memlet.data].dtype,
                                                find_new_name=True)
            anode = graph.add_access(newname_arr)
            to_array_mm = copy.deepcopy(memlet)
            to_array_mm.data = anode.data
            graph.add_edge(snode, None, anode, None, to_array_mm)
        else:
            # Without a buffer the stream node feeds the outer exit directly.
            anode = snode

        # Reconnect, assuming one edge to the stream
        graph.remove_edge(edge)
        graph.add_edge(map_exit, edge.src_conn, snode, None, to_stream_mm)
        graph.add_edge(anode, None, outer_map_exit, edge.dst_conn, memlet)

    def modifies_graph(self):
        return True
def expand(self, sdfg, graph, map_entries, map_base_variables=None):
    """ Expansion into outer and inner maps for each map in a specified set.
        The resulting outer maps all have same range and indices, corresponding
        variables and memlets get changed accordingly. The inner map contains
        the leftover dimensions

        :param sdfg: Underlying SDFG
        :param graph: Graph in which we expand
        :param map_entries: List of Map Entries(Type MapEntry) that we want
                            to expand
        :param map_base_variables: Optional parameter. List of strings
                                   If None, then expand() searches for the
                                   maximal amount of equal map ranges and
                                   pushes those and their corresponding loop
                                   variables into the outer loop.
                                   If specified, then expand() pushes the
                                   ranges belonging to the loop iteration
                                   variables specified into the outer loop
                                   (For instance map_base_variables =
                                   ['i','j'] assumes that all maps have
                                   common iteration indices i and j with
                                   corresponding correct ranges)
    """
    # NOTE: the loop variables below are deliberately not called ``map``:
    # the dynamic-range handling further down needs the builtin ``map``.
    maps = [entry.map for entry in map_entries]

    if not map_base_variables:
        # find the maximal subset of variables to expand
        # greedy if there exist multiple ranges that are equal in a map
        map_base_ranges = helpers.common_map_base_ranges(maps)
        reassignments = helpers.find_reassignment(maps, map_base_ranges)

        ##### first, regroup and reassign
        # create params_dict for every map
        # first, let us define the outer iteration variable names,
        # just take the first map and their indices at common ranges
        first_map = maps[0]
        map_base_variables = []
        for rng in map_base_ranges:
            for i, param in enumerate(first_map.params):
                if (first_map.range[i] == rng
                        and param not in map_base_variables):
                    map_base_variables.append(param)
                    break

        params_dict = {}
        if self.debug:
            print("MultiExpansion::Map_base_variables:", map_base_variables)
            print("MultiExpansion::Map_base_ranges:", map_base_ranges)

        for cur_map in maps:
            # for each map create param dict, first assign identity
            params_dict_map = {param: param for param in cur_map.params}
            # now look for the correct reassignment
            # for every element neq -1, need to change param to
            # map_base_variables[]
            # if param already appears in own dict, do a swap
            # else we just replace it
            for i, reassignment in enumerate(reassignments[cur_map]):
                if reassignment == -1:
                    # nothing to do
                    continue

                current_var = cur_map.params[i]
                current_assignment = params_dict_map[current_var]
                target_assignment = map_base_variables[reassignment]
                if current_assignment == target_assignment:
                    continue

                if target_assignment in params_dict_map.values():
                    # The target name is already taken by another parameter:
                    # swap the two *assigned names* so that the mapping
                    # stays one-to-one and current_var really ends up on
                    # target_assignment.
                    holder = next(key
                                  for key, value in params_dict_map.items()
                                  if value == target_assignment)
                    (params_dict_map[current_var],
                     params_dict_map[holder]) = (params_dict_map[holder],
                                                 params_dict_map[current_var])
                else:
                    # just reassign
                    params_dict_map[current_var] = target_assignment

            # done, assign params_dict_map to the global one
            params_dict[cur_map] = params_dict_map

        for cur_map, map_entry in zip(maps, map_entries):
            map_scope = graph.scope_subgraph(map_entry)
            params_dict_map = params_dict[cur_map]
            # Rename in two passes through temporary names so that swapped
            # parameters do not overwrite each other.
            for firstp, secondp in params_dict_map.items():
                if firstp != secondp:
                    replace(map_scope, firstp, '__' + firstp + '_fused')
            for firstp, secondp in params_dict_map.items():
                if firstp != secondp:
                    replace(map_scope, '__' + firstp + '_fused', secondp)

            # now also replace the map variables inside maps
            for i, param in enumerate(cur_map.params):
                cur_map.params[i] = params_dict_map[param]

        if self.debug:
            print("MultiExpansion::Params replaced")

    else:
        # just calculate map_base_ranges
        # do a check whether all maps correct
        map0 = maps[0]
        map_base_ranges = [
            map0.range[map0.params.index(var)] for var in map_base_variables
        ]
        for cur_map in maps:
            for var, rng in zip(map_base_variables, map_base_ranges):
                assert cur_map.range[cur_map.params.index(var)] == rng

    # then expand all the maps
    for cur_map, map_entry in zip(maps, map_entries):
        if cur_map.get_param_num() == len(map_base_variables):
            # nothing to expand, continue
            continue

        map_exit = graph.exit_node(map_entry)

        # create two new maps, outer and inner
        params_outer = map_base_variables
        ranges_outer = map_base_ranges

        # everything that is not a base variable goes into the inner map
        params_inner = []
        init_ranges_inner = []
        for param, rng in zip(cur_map.params, cur_map.range):
            if param not in map_base_variables:
                params_inner.append(param)
                init_ranges_inner.append(rng)
        ranges_inner = subsets.Range(init_ranges_inner)

        if self.sequential_innermaps:
            inner_schedule = dtypes.ScheduleType.Sequential
        else:
            inner_schedule = dtypes.ScheduleType.Default
        inner_map = nodes.Map(label=cur_map.label + '_inner',
                              params=params_inner,
                              ndrange=ranges_inner,
                              schedule=inner_schedule)

        cur_map.label = cur_map.label + '_outer'
        cur_map.params = params_outer
        cur_map.range = ranges_outer

        # create new map entries and exits
        map_entry_inner = nodes.MapEntry(inner_map)
        map_exit_inner = nodes.MapExit(inner_map)

        # analogously to Map_Expansion
        for edge in graph.out_edges(map_entry):
            graph.remove_edge(edge)
            graph.add_memlet_path(map_entry,
                                  map_entry_inner,
                                  edge.dst,
                                  src_conn=edge.src_conn,
                                  memlet=edge.data,
                                  dst_conn=edge.dst_conn)

        dynamic_edges = dynamic_map_inputs(graph, map_entry)
        for edge in dynamic_edges:
            # Remove old edge and connector
            graph.remove_edge(edge)
            edge.dst.remove_in_connector(edge.dst_conn)

            # Propagate to each range it belongs to
            path = []
            for mapnode in [map_entry, map_entry_inner]:
                path.append(mapnode)
                if any(edge.dst_conn in map(str, symbolic.symlist(r))
                       for r in mapnode.map.range):
                    graph.add_memlet_path(edge.src,
                                          *path,
                                          memlet=edge.data,
                                          src_conn=edge.src_conn,
                                          dst_conn=edge.dst_conn)

        for edge in graph.in_edges(map_exit):
            graph.remove_edge(edge)
            graph.add_memlet_path(edge.src,
                                  map_exit_inner,
                                  map_exit,
                                  memlet=edge.data,
                                  src_conn=edge.src_conn,
                                  dst_conn=edge.dst_conn)
class GPUMultiTransformMap(transformation.Transformation):
    """ Implements the GPUMultiTransformMap transformation.

        Tiles a single map into 2 maps. The outer map is of schedule
        GPU_Multidevice and loops over the GPUs, while the inner map
        is a GPU-scheduled map. It also creates GPU transient arrays
        between the two maps.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    dim_idx = Property(dtype=int,
                       default=-1,
                       desc="Index of dimension to be distributed.")

    new_dim_prefix = Property(dtype=str,
                              default="gpu",
                              allow_none=True,
                              desc="Prefix for new dimension name")

    new_transient_prefix = Property(dtype=str,
                                    default="gpu_multi",
                                    allow_none=True,
                                    desc="Prefix for the transient name")

    skip_scalar = Property(
        dtype=bool,
        default=True,
        allow_none=True,
        desc="If True: skips the scalar data nodes. "
        "If False: creates localstorage for scalar transients.")

    use_p2p = Property(
        dtype=bool,
        default=False,
        allow_none=True,
        desc="If True: uses peer-to-peer access if a data container is already "
        "located on a GPU. "
        "If False: creates transient localstorage for data located on GPU.")

    number_of_gpus = SymbolicProperty(
        default=None,
        allow_none=True,
        desc="number of gpus to divide the map onto,"
        " if not used, uses the amount specified"
        " in the dace.config in max_number_gpus.")

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        """ Returns the single-node pattern (one map entry). """
        return [sdutil.node_path_graph(GPUMultiTransformMap._map_entry)]

    @staticmethod
    def can_be_applied(graph: SDFGState,
                       candidate,
                       expr_index,
                       sdfg,
                       strict=False):
        """ Returns True iff the matched map passes all checks below. """
        map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]

        # Check if there is more than one GPU available:
        if Config.get("compiler", "cuda", "max_number_gpus") < 2:
            return False

        # Dynamic map ranges not supported
        if has_dynamic_map_inputs(graph, map_entry):
            return False

        # Only accept maps with a default schedule
        schedule_whitelist = [dtypes.ScheduleType.Default]
        sdict = graph.scope_dict()
        parent = sdict[map_entry]
        while parent is not None:
            if parent.map.schedule not in schedule_whitelist:
                return False
            parent = sdict[parent]

        # Library nodes inside the scope are not supported
        scope_subgraph = graph.scope_subgraph(map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                return False

        # Custom reductions can not have an accumulate transient, as the
        # reduction would have to be split up for the ingoing memlet of the
        # accumulate transient and the outgoing memlet. Not using GPU local
        # accumulate transient only works for a small volume of data.
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            if edge.data.wcr is not None and operations.detect_reduction_type(
                    edge.data.wcr) == dtypes.ReductionType.Custom:
                return False

        storage_whitelist = [
            dtypes.StorageType.Default,
            dtypes.StorageType.CPU_Pinned,
            dtypes.StorageType.CPU_Heap,
            dtypes.StorageType.GPU_Global,
        ]

        # Inputs and outputs of the map must be access nodes in one of the
        # whitelisted storage types.
        for node in graph.predecessors(map_entry):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        for node in graph.successors(map_exit):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]
        return map_entry.map.label

    def apply(self, sdfg: SDFG) -> None:
        """ Strip-mines the matched map, schedules the new outer map as
            GPU_Multidevice and the inner map as GPU_Device, and adds
            GPU_Global transients between the two.

            :param sdfg: The SDFG that contains the matched state.
            :raises ValueError: If ``number_of_gpus`` exceeds the configured
                                ``max_number_gpus``.
        """
        graph: SDFGState = sdfg.nodes()[self.state_id]

        inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
            GPUMultiTransformMap._map_entry]]

        number_of_gpus = self.number_of_gpus
        ngpus = Config.get("compiler", "cuda", "max_number_gpus")
        if number_of_gpus is None:
            number_of_gpus = ngpus
        if number_of_gpus > ngpus:
            raise ValueError(
                'Requesting more gpus than specified in the dace config')

        # Avoiding import loops
        from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                                  OutLocalStorage,
                                                  AccumulateTransient)

        # The user has responsibility for the implementation of a Library node.
        scope_subgraph = graph.scope_subgraph(inner_map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                warnings.warn(
                    'Node %s is a library node, make sure to manually set the '
                    'implementation to a GPU compliant specialization.' %
                    node)

        # Tile map into number_of_gpus tiles. The dimension to distribute is
        # taken from the ``dim_idx`` property (default -1).
        outer_map: nodes.Map = StripMining.apply_to(
            sdfg,
            dict(dim_idx=self.dim_idx,
                 new_dim_prefix=self.new_dim_prefix,
                 tile_size=number_of_gpus,
                 tiling_type=dtypes.TilingType.NumberOfTiles),
            _map_entry=inner_map_entry)

        outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
        inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
        outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

        # Change map schedules
        inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
        outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

        symbolic_gpu_id = outer_map.params[0]

        # Add the parameter of the outer map
        for node in graph.successors(inner_map_entry):
            if isinstance(node, nodes.NestedSDFG):
                map_syms = inner_map_entry.range.free_symbols
                for sym in map_syms:
                    symname = str(sym)
                    if symname not in node.symbol_mapping:
                        node.symbol_mapping[symname] = sym
                        node.sdfg.symbols[symname] = graph.symbols_defined_at(
                            node)[symname]

        # Add transient Data leading to the inner map
        prefix = self.new_transient_prefix
        for node in graph.predecessors(outer_map_entry):
            # Only AccessNodes are relevant
            if (isinstance(node, nodes.AccessNode)
                    and not (self.skip_scalar
                             and isinstance(node.desc(sdfg), Scalar))):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                in_data_node = InLocalStorage.apply_to(
                    sdfg,
                    dict(array=node.data, prefix=prefix),
                    verify=False,
                    save=False,
                    node_a=outer_map_entry,
                    node_b=inner_map_entry)
                in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        wcr_data: Dict[str, Any] = {}
        # Add transient Data leading to the outer map
        for edge in graph.in_edges(outer_map_exit):
            node = graph.memlet_path(edge)[-1].dst
            if not isinstance(node, nodes.AccessNode):
                continue

            data_name = node.data
            # Transients with write-conflict resolution need to be
            # collected first as AccumulateTransient creates a nestedSDFG
            if edge.data.wcr is not None:
                dtype = sdfg.arrays[data_name].dtype
                redtype = operations.detect_reduction_type(edge.data.wcr)
                # Custom reduction can not have an accumulate transient,
                # as the accumulation from the transient to the outer
                # storage is not defined.
                if redtype == dtypes.ReductionType.Custom:
                    warnings.warn(
                        'Using custom reductions in a GPUMultitransformed '
                        'Map only works for a small data volume. For large '
                        'volume there is no guarantee.')
                    continue
                identity = dtypes.reduction_identity(dtype, redtype)
                wcr_data[data_name] = identity
            elif (not isinstance(node.desc(sdfg), Scalar)
                  or not self.skip_scalar):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue
                # Transients without write-conflict resolution.
                # Only create the array if it does not exist yet.
                create_array = prefix + '_' + data_name not in sdfg.arrays
                out_data_node = OutLocalStorage.apply_to(
                    sdfg,
                    dict(array=data_name,
                         prefix=prefix,
                         create_array=create_array),
                    verify=False,
                    save=False,
                    node_a=inner_map_exit,
                    node_b=outer_map_exit)
                out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                out_data_node.desc(
                    sdfg).storage = dtypes.StorageType.GPU_Global

        # Add Transients for write-conflict resolution
        if len(wcr_data) != 0:
            nsdfg = AccumulateTransient.apply_to(
                sdfg,
                options=dict(array_identity_dict=wcr_data, prefix=prefix),
                map_exit=inner_map_exit,
                outer_map_exit=outer_map_exit)
            nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
            nsdfg.location['gpu'] = symbolic_gpu_id
            for transient_node in graph.successors(nsdfg):
                if isinstance(transient_node, nodes.AccessNode):
                    transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    transient_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global

                    # Mirror location and storage on the descriptor of the
                    # same name inside the nested SDFG.
                    nested_desc = nsdfg.sdfg.arrays[transient_node.label]
                    nested_desc.location['gpu'] = symbolic_gpu_id
                    nested_desc.storage = dtypes.StorageType.GPU_Global
            infer_types.set_default_schedule_storage_types_and_location(
                nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
                symbolic_gpu_id)

        # Remove the parameter of the outer_map from the sdfg symbols,
        # as it got added as a symbol in StripMining.
        if outer_map.params[0] in sdfg.free_symbols:
            sdfg.remove_symbol(outer_map.params[0])
def apply(self, sdfg: dace.SDFG):
    """ Splits the matched multi-dimensional map into a chain of nested
        one-dimensional maps: the original map keeps its first parameter,
        and every further parameter gets its own sequential map.

        :param sdfg: The SDFG that contains the matched state.
    """
    # Look up the matched map together with its entry and exit nodes.
    state = sdfg.nodes()[self.state_id]
    outer_entry = state.nodes()[self.subgraph[MapExpansion._map_entry]]
    outer_exit = state.exit_node(outer_entry)
    outer_map = outer_entry.map

    # One new sequential map per parameter after the first one.
    inner_maps = []
    for param, param_range in zip(outer_map.params[1:], outer_map.range[1:]):
        inner_maps.append(
            nodes.Map(outer_map.label + '_' + str(param), [param],
                      subsets.Range([param_range]),
                      schedule=dtypes.ScheduleType.Sequential))

    # The original map is reduced to its first dimension.
    outer_map.params = [outer_map.params[0]]
    outer_map.range = subsets.Range([outer_map.range[0]])

    # Entry and exit nodes of the new maps, outermost first.
    entries = []
    exits = []
    for inner_map in inner_maps:
        entries.append(nodes.MapEntry(inner_map))
        exits.append(nodes.MapExit(inner_map))

    # Create edges, abiding by the following rules:
    # 1. If there are no edges coming from the outside, use empty memlets
    # 2. Edges with IN_* connectors replicate along the maps
    # 3. Edges for dynamic map ranges replicate until reaching range(s)
    for out_edge in state.out_edges(outer_entry):
        state.remove_edge(out_edge)
        state.add_memlet_path(outer_entry,
                              *entries,
                              out_edge.dst,
                              src_conn=out_edge.src_conn,
                              memlet=out_edge.data,
                              dst_conn=out_edge.dst_conn)

    # Re-route the inputs that feed dynamic map ranges.
    for dyn_edge in dace.sdfg.dynamic_map_inputs(state, outer_entry):
        # Drop the old edge and its connector.
        state.remove_edge(dyn_edge)
        dyn_edge.dst.remove_in_connector(dyn_edge.dst_conn)

        # Add a path to every map whose range uses the connector's symbol.
        chain_so_far = []
        for map_node in [outer_entry] + entries:
            chain_so_far.append(map_node)
            uses_symbol = False
            for rng in map_node.map.range:
                if dyn_edge.dst_conn in map(str, symbolic.symlist(rng)):
                    uses_symbol = True
                    break
            if uses_symbol:
                state.add_memlet_path(dyn_edge.src,
                                      *chain_so_far,
                                      memlet=dyn_edge.data,
                                      src_conn=dyn_edge.src_conn,
                                      dst_conn=dyn_edge.dst_conn)

    # Route everything that entered the old exit through the new exits,
    # innermost first.
    exits_inner_first = exits[::-1]
    for in_edge in state.in_edges(outer_exit):
        state.remove_edge(in_edge)
        state.add_memlet_path(in_edge.src,
                              *exits_inner_first,
                              outer_exit,
                              memlet=in_edge.data,
                              src_conn=in_edge.src_conn,
                              dst_conn=in_edge.dst_conn)
class MapTiling(transformation.Transformation):
    """ Implements the orthogonal tiling transformation.

        Orthogonal tiling is a type of nested map fission that creates tiles
        in every dimension of the matched Map.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    # Properties
    prefix = Property(dtype=str,
                      default="tile",
                      desc="Prefix for new range symbols")
    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")

    strides = ShapeProperty(
        dtype=tuple,
        default=tuple(),
        desc="Tile stride (enables overlapping tiles). If empty, matches tile")

    tile_offset = ShapeProperty(dtype=tuple,
                                default=None,
                                desc="Negative Stride offset per dimension",
                                allow_none=True)

    divides_evenly = Property(dtype=bool,
                              default=False,
                              desc="Tile size divides dimension length evenly")

    # BufferTiling passes this option when it tiles its second map; with the
    # default (False) the behavior is the same as before the property existed.
    tile_trivial = Property(dtype=bool,
                            default=False,
                            desc="Tiles even if tile_size is 1")

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        """ Returns the single-node pattern (one map entry). """
        return [sdutil.node_path_graph(MapTiling._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[MapTiling._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        """ Strip-mines the dimensions of the matched map one by one and
            collapses the newly created tile maps into each other.

            :param sdfg: The SDFG that contains the matched state.
        """
        graph = sdfg.nodes()[self.state_id]

        # Strides default to the tile sizes unless one stride per tile size
        # was given.
        tile_strides = self.tile_sizes
        if self.strides is not None and len(
                self.strides) == len(tile_strides):
            tile_strides = self.strides

        # Retrieve map entry and exit nodes.
        map_entry = graph.nodes()[self.subgraph[MapTiling._map_entry]]
        from dace.transformation.dataflow.map_collapse import MapCollapse
        from dace.transformation.dataflow.strip_mining import StripMining
        stripmine_subgraph = {
            StripMining._map_entry: self.subgraph[MapTiling._map_entry]
        }
        sdfg_id = sdfg.sdfg_id
        last_map_entry = None
        removed_maps = 0

        original_schedule = map_entry.schedule

        for orig_idx in range(len(map_entry.map.params)):
            # Dimensions beyond the given tuples reuse the last entry.
            if orig_idx >= len(self.tile_sizes):
                tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[-1])
                tile_stride = symbolic.pystr_to_symbolic(tile_strides[-1])
            else:
                tile_size = symbolic.pystr_to_symbolic(
                    self.tile_sizes[orig_idx])
                tile_stride = symbolic.pystr_to_symbolic(
                    tile_strides[orig_idx])

            # handle offsets
            if self.tile_offset and orig_idx >= len(self.tile_offset):
                offset = self.tile_offset[-1]
            elif self.tile_offset:
                offset = self.tile_offset[orig_idx]
            else:
                offset = 0

            # Shift the index by the number of dimensions already omitted
            # from the inner map by earlier iterations.
            dim_idx = orig_idx - removed_maps

            # If tile size is trivial, skip strip-mining map dimension
            if tile_size == map_entry.map.range.size()[dim_idx]:
                continue

            stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph,
                                    self.expr_index)

            # Special case: Tile size of 1 should be omitted from inner map
            # (unless trivial tiles were explicitly requested).
            omit_from_inner = (tile_size == 1 and tile_stride == 1
                               and not self.tile_trivial)

            stripmine.dim_idx = dim_idx
            stripmine.new_dim_prefix = '' if omit_from_inner else self.prefix
            stripmine.tile_size = str(tile_size)
            stripmine.tile_stride = str(tile_stride)
            stripmine.divides_evenly = (True if omit_from_inner else
                                        self.divides_evenly)
            stripmine.tile_offset = str(offset)
            stripmine.apply(sdfg)
            if omit_from_inner:
                removed_maps += 1

            # apply to the new map the schedule of the original one
            map_entry.schedule = original_schedule

            # Merge the tile map just created into the previous one.
            if last_map_entry:
                new_map_entry = graph.in_edges(map_entry)[0].src
                mapcollapse_subgraph = {
                    MapCollapse._outer_map_entry:
                    graph.node_id(last_map_entry),
                    MapCollapse._inner_map_entry: graph.node_id(new_map_entry)
                }
                mapcollapse = MapCollapse(sdfg_id, self.state_id,
                                          mapcollapse_subgraph, 0)
                mapcollapse.apply(sdfg)
            last_map_entry = graph.in_edges(map_entry)[0].src
class BufferTiling(transformation.Transformation):
    """ Implements the buffer tiling transformation.

        BufferTiling tiles a buffer that is in between two maps, where the
        preceding map writes to the buffer and the succeeding map reads from
        it. It introduces additional computations in exchange for reduced
        memory footprint. Commonly used to make use of shared memory on GPUs.
    """

    _map1_exit = nodes.MapExit(nodes.Map('', [], []))
    _array = nodes.AccessNode('')
    _map2_entry = nodes.MapEntry(nodes.Map('', [], []))

    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")

    # Returns a list of graphs that represent the pattern
    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                BufferTiling._map1_exit,
                BufferTiling._array,
                BufferTiling._map2_entry,
            )
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Returns True iff every node between the first map's exit and the
            second map's entry passes the buffer checks below.
        """
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]

        for buf in graph.all_nodes_between(map1_exit, map2_entry):
            # Check that buffers are AccessNodes.
            if not isinstance(buf, nodes.AccessNode):
                return False

            # Check that buffers are transient.
            if not sdfg.arrays[buf.data].transient:
                return False

            # Check that buffers have exactly 1 input and 1 output edge.
            if graph.in_degree(buf) != 1:
                return False
            if graph.out_degree(buf) != 1:
                return False

            # Check that buffers are next to the maps.
            if graph.in_edges(buf)[0].src != map1_exit:
                return False
            if graph.out_edges(buf)[0].dst != map2_entry:
                return False

            # Check that the data consumed is provided.
            provided = graph.in_edges(buf)[0].data.subset
            consumed = graph.out_edges(buf)[0].data.subset
            if not provided.covers(consumed):
                return False

            # Check that buffers occur only once in this state.
            # Compare data *names*: comparing a name against the node object
            # itself never matches and would disable this check.
            num_occurrences = sum(
                1 for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == buf.data)
            if num_occurrences > 1:
                return False
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]

        return " -> ".join(entry.map.label + ": " + str(entry.map.params)
                           for entry in [map1_exit, map2_entry])

    def apply(self, sdfg):
        """ Tiles both maps around the buffers and fuses the two resulting
            tile maps.

            :param sdfg: The SDFG that contains the matched state.
        """
        graph = sdfg.nodes()[self.state_id]

        map1_exit = graph.nodes()[self.subgraph[self._map1_exit]]
        map1_entry = graph.entry_node(map1_exit)
        map2_entry = graph.nodes()[self.subgraph[self._map2_entry]]
        buffers = graph.all_nodes_between(map1_exit, map2_entry)
        # Situation:
        # -> map1_entry -> ... -> map1_exit -> buffers -> map2_entry -> ...

        # Per-dimension difference of the two maps' ranges at the lower and
        # the upper end; passed as overlaps to the first tiling below.
        lower_extents = tuple(b - a for a, b in zip(
            map1_entry.range.min_element(), map2_entry.range.min_element()))
        upper_extents = tuple(a - b for a, b in zip(
            map1_entry.range.max_element(), map2_entry.range.max_element()))

        # Tile the first map with overlap
        MapTilingWithOverlap.apply_to(sdfg,
                                      map_entry=map1_entry,
                                      options={
                                          'tile_sizes': self.tile_sizes,
                                          'lower_overlap': lower_extents,
                                          'upper_overlap': upper_extents
                                      })
        tile_map1_exit = graph.out_edges(map1_exit)[0].dst
        tile_map1_entry = graph.entry_node(tile_map1_exit)
        tile_map1_entry.label = 'BufferTiling'

        # Tile the second map
        MapTiling.apply_to(sdfg,
                           map_entry=map2_entry,
                           options={
                               'tile_sizes': self.tile_sizes,
                               'tile_trivial': True
                           })
        tile_map2_entry = graph.in_edges(map2_entry)[0].src

        # Fuse maps
        some_buffer = next(
            iter(buffers))  # some dummy to pass to MapFusion.apply_to()
        MapFusion.apply_to(sdfg,
                           first_map_exit=tile_map1_exit,
                           array=some_buffer,
                           second_map_entry=tile_map2_entry)

        # Optimize the simple cases: with a tile size of 1 (and, for the
        # first map, no overlap) the inner range is reduced to one element.
        map1_entry.range.ranges = [
            (r[0], r[0], r[2]) if l_ext == 0 and u_ext == 0 and ts == 1 else r
            for r, l_ext, u_ext, ts in zip(map1_entry.range.ranges,
                                           lower_extents, upper_extents,
                                           self.tile_sizes)
        ]

        map2_entry.range.ranges = [
            (r[0], r[0], r[2]) if ts == 1 else r
            for r, ts in zip(map2_entry.range.ranges, self.tile_sizes)
        ]

        if any(ts == 1 for ts in self.tile_sizes):
            if any(r[0] == r[1] for r in map1_entry.map.range):
                TrivialMapElimination.apply_to(sdfg, _map_entry=map1_entry)
            if any(r[0] == r[1] for r in map2_entry.map.range):
                TrivialMapElimination.apply_to(sdfg, _map_entry=map2_entry)
class MapInterchange(transformation.Transformation):
    """ Implements the map-interchange transformation.

        Map-interchange takes two nested maps and interchanges their position.
    """

    _outer_map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _inner_map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def expressions():
        """ Returns the pattern: an outer map entry followed by an inner one. """
        pattern = sdutil.node_path_graph(MapInterchange._outer_map_entry,
                                         MapInterchange._inner_map_entry)
        return [pattern]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        """ Returns True iff the two matched maps are directly nested and the
            inner map does not depend on the outer map's parameters.
        """
        # TODO: Assuming that the subsets on the edges between the two map
        # entries/exits are the union of separate inner subsets, is it possible
        # that inverting these edges breaks the continuity of union? What about
        # the opposite?

        all_nodes = graph.nodes()
        outer_map_entry = all_nodes[candidate[MapInterchange._outer_map_entry]]
        inner_map_entry = all_nodes[candidate[MapInterchange._inner_map_entry]]

        # The inner map's range must not use any outer map parameter.
        range_symbols = set()
        for rng in inner_map_entry.map.range:
            range_symbols.update(map(str, symlist(rng)))
        for symname in range_symbols:
            if symname in outer_map_entry.map.params:
                return False

        # Every edge leaving the outer entry must end at the inner entry.
        if any(e.dst != inner_map_entry
               for e in graph.out_edges(outer_map_entry)):
            return False

        for e in graph.in_edges(inner_map_entry):
            # Every edge entering the inner entry must come from the outer
            # entry.
            if e.src != outer_map_entry:
                return False

            # Connectors that do not start with 'IN_' feed dynamic map
            # ranges: their memlets must not use outer map parameters.
            if e.dst_conn and not e.dst_conn.startswith('IN_'):
                subset_symbols = set()
                for rng in e.data.subset:
                    subset_symbols.update(map(str, symlist(rng)))
                for symname in subset_symbols:
                    if symname in outer_map_entry.map.params:
                        return False

        # Same adjacency requirement for the two map exits.
        inner_map_exit = graph.exit_node(inner_map_entry)
        outer_map_exit = graph.exit_node(outer_map_entry)

        # Every edge leaving the inner exit must end at the outer exit.
        if any(e.dst != outer_map_exit
               for e in graph.out_edges(inner_map_exit)):
            return False

        # Every edge entering the outer exit must come from the inner exit.
        if any(e.src != inner_map_exit
               for e in graph.in_edges(outer_map_exit)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        all_nodes = graph.nodes()
        descriptions = []
        for pattern_node in (MapInterchange._outer_map_entry,
                             MapInterchange._inner_map_entry):
            entry = all_nodes[candidate[pattern_node]]
            descriptions.append(entry.map.label + ': ' +
                                str(entry.map.params))
        return ' -> '.join(descriptions)

    def apply(self, sdfg: SDFG):
        """ Swaps the positions of the two matched maps in the state.

            :param sdfg: The SDFG that contains the matched state.
        """
        # Look up the entry and exit nodes of both maps.
        graph: SDFGState = sdfg.nodes()[self.state_id]
        all_nodes = graph.nodes()
        outer_map_entry = all_nodes[self.subgraph[
            MapInterchange._outer_map_entry]]
        inner_map_entry = all_nodes[self.subgraph[
            MapInterchange._inner_map_entry]]
        inner_map_exit = graph.exit_node(inner_map_entry)
        outer_map_exit = graph.exit_node(outer_map_entry)

        # Switch connectors: read both sides first, then assign outer
        # before inner.
        old_outer, old_inner = (outer_map_entry.in_connectors,
                                inner_map_entry.in_connectors)
        outer_map_entry.in_connectors = old_inner
        inner_map_entry.in_connectors = old_outer

        old_outer, old_inner = (outer_map_entry.out_connectors,
                                inner_map_entry.out_connectors)
        outer_map_entry.out_connectors = old_inner
        inner_map_entry.out_connectors = old_outer

        old_outer, old_inner = (outer_map_exit.in_connectors,
                                inner_map_exit.in_connectors)
        outer_map_exit.in_connectors = old_inner
        inner_map_exit.in_connectors = old_outer

        old_outer, old_inner = (outer_map_exit.out_connectors,
                                inner_map_exit.out_connectors)
        outer_map_exit.out_connectors = old_inner
        inner_map_exit.out_connectors = old_outer

        # Detach the edges that connect the two maps to each other.
        entry_edges = graph.edges_between(outer_map_entry, inner_map_entry)
        exit_edges = graph.edges_between(inner_map_exit, outer_map_exit)
        for e in entry_edges + exit_edges:
            graph.remove_edge(e)

        # Change source and destination of edges.
        sdutil.change_edge_dest(graph, outer_map_entry, inner_map_entry)
        sdutil.change_edge_src(graph, inner_map_entry, outer_map_entry)
        sdutil.change_edge_dest(graph, inner_map_exit, outer_map_exit)
        sdutil.change_edge_src(graph, outer_map_exit, inner_map_exit)

        # Re-add the detached edges in the opposite direction.
        new_entry_edges = []
        for old in entry_edges:
            reversed_edge = graph.add_edge(old.dst, old.src_conn, old.src,
                                           old.dst_conn, old.data)
            new_entry_edges.append(reversed_edge)
        new_exit_edges = []
        for old in exit_edges:
            reversed_edge = graph.add_edge(old.dst, old.src_conn, old.src,
                                           old.dst_conn, old.data)
            new_exit_edges.append(reversed_edge)

        # Repropagate memlets in modified region: entry edges from the next
        # edge on their memlet path, exit edges from the previous one.
        for e in new_entry_edges:
            path = graph.memlet_path(e)
            index = next(i for i, edge in enumerate(path) if e is edge)
            following = path[index + 1].data
            e.data.subset = propagate_memlet(graph, following,
                                             outer_map_entry, True).subset
        for e in new_exit_edges:
            path = graph.memlet_path(e)
            index = next(i for i, edge in enumerate(path) if e is edge)
            preceding = path[index - 1].data
            e.data.subset = propagate_memlet(graph, preceding,
                                             outer_map_exit, True).subset

    @staticmethod
    def annotates_memlets():
        return True