class RedundantArrayCopying3(pm.Transformation):
    """ Implements the redundant array removal transformation. Removes
        redundant copies of array B in the pattern MapEntry -> B.
    """

    _arrays_removed = 0
    _map_entry = nodes.MapEntry(nodes.Map("", [], []))
    _out_array = nodes.AccessNode("_")

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(RedundantArrayCopying3._map_entry,
                                   RedundantArrayCopying3._out_array)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_entry = graph.nodes()[candidate[RedundantArrayCopying3._map_entry]]
        out_array = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]

        # Count other successor access nodes that carry the same data as
        # out_array (each one is a redundant copy that can be removed)
        found = 0
        for _, _, dst, _, _ in graph.out_edges(map_entry):
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                found += 1

        return found > 0

    @staticmethod
    def match_to_str(graph, candidate):
        out_array = graph.nodes()[candidate[RedundantArrayCopying3._out_array]]
        return "Remove " + str(out_array)

    def apply(self, sdfg):
        def gnode(nname):
            return graph.nodes()[self.subgraph[nname]]

        graph = sdfg.nodes()[self.state_id]
        map_entry = gnode(RedundantArrayCopying3._map_entry)
        out_array = gnode(RedundantArrayCopying3._out_array)

        for e1 in graph.out_edges(map_entry):
            dst = e1.dst
            if (isinstance(dst, nodes.AccessNode) and dst != out_array
                    and dst.data == out_array.data):
                for e2 in graph.out_edges(dst):
                    graph.add_edge(out_array, None, e2.dst, e2.dst_conn,
                                   e2.data)
                    graph.remove_edge(e2)
                graph.remove_edge(e1)
                graph.remove_node(dst)
                if Config.get_bool("debugprint"):
                    RedundantArrayCopying3._arrays_removed += 1
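# --- Illustration (not part of the transformation) -----------------------
# A pure-Python sketch of the matching condition above, using hypothetical
# stand-in objects instead of DaCe access nodes: the pattern applies as
# soon as the map entry has at least one successor access node that is a
# different node object but carries the same data as `out_array`.
def _sketch_redundant_copy_condition():
    class FakeAccess:
        def __init__(self, data):
            self.data = data

    out_array = FakeAccess('B')
    successors = [out_array, FakeAccess('B'), FakeAccess('C')]
    found = sum(1 for dst in successors
                if dst is not out_array and dst.data == out_array.data)
    assert found == 1  # exactly one redundant copy of 'B' would be removed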
class TrivialMapRangeElimination(transformation.Transformation): """ Implements the Trivial Map Range Elimination pattern. Trivial Map Range Elimination takes a multi-dimensional map with a range containing one element and removes the corresponding dimension. Example: Map[i=0:I,j=0] -> Map[i=0:I] """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [sdutil.node_path_graph(TrivialMapRangeElimination._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[ TrivialMapRangeElimination._map_entry]] if len(map_entry.map.range) <= 1: return False # only acts on multi-dimensional maps return any(frm == to for frm, to, _ in map_entry.map.range) @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[ TrivialMapRangeElimination._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[ TrivialMapRangeElimination._map_entry]] remaining_ranges = [] remaining_params = [] for map_param, ranges in zip(map_entry.map.params, map_entry.map.range.ranges): map_from, map_to, _ = ranges if map_from == map_to: # Replace the map index variable with the value it obtained scope = graph.scope_subgraph(map_entry) scope.replace(map_param, map_from) else: remaining_ranges.append(ranges) remaining_params.append(map_param) map_entry.map.range.ranges = remaining_ranges map_entry.map.params = remaining_params
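# --- Usage sketch (illustrative only; the program below is hypothetical) --
# A minimal demo of TrivialMapRangeElimination, assuming the standard dace
# Python frontend. The j-dimension spans the single value 0 (dace ranges
# are inclusive, so 0:1 yields the trivial range (0, 0, 1)), which the
# transformation removes: Map[i=0:20, j=0] -> Map[i=0:20].
def _example_trivial_map_range_elimination():
    import dace

    @dace.program
    def copy_col(A: dace.float64[20, 1], B: dace.float64[20, 1]):
        for i, j in dace.map[0:20, 0:1]:
            B[i, j] = A[i, j]

    sdfg = copy_col.to_sdfg()
    sdfg.apply_transformations(TrivialMapRangeElimination)
    return sdfg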
def _stripmine(self, sdfg, graph, candidate):
    # Retrieve map entry and exit nodes.
    map_entry = graph.nodes()[candidate[StripMining._map_entry]]
    map_exit = graph.exit_node(map_entry)

    # Retrieve transformation properties.
    dim_idx = self.dim_idx
    new_dim_prefix = self.new_dim_prefix
    tile_size = self.tile_size
    divides_evenly = self.divides_evenly
    strided = self.strided

    tile_stride = self.tile_stride
    if tile_stride is None or tile_stride == 0:
        tile_stride = tile_size

    # Retrieve parameter and range of dimension to be strip-mined.
    target_dim = map_entry.map.params[dim_idx]
    td_from, td_to, td_step = map_entry.map.range[dim_idx]

    # Create new map. Replace by cloning map object?
    new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix,
                                 target_dim)
    nd_from = 0
    if symbolic.pystr_to_symbolic(tile_stride) == 1:
        nd_to = td_to - td_from
    else:
        nd_to = symbolic.pystr_to_symbolic(
            'int_ceil(%s + 1 - %s, %s) - 1' %
            (symbolic.symstr(td_to), symbolic.symstr(td_from), tile_stride))
    nd_step = 1
    new_dim_range = (nd_from, nd_to, nd_step)
    new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim],
                        subsets.Range([new_dim_range]))
    new_map_entry = nodes.MapEntry(new_map)
    new_map_exit = nodes.MapExit(new_map)

    # Change the range of the selected dimension to iterate over a single
    # tile
    if strided:
        td_from_new = symbolic.pystr_to_symbolic(new_dim)
        td_to_new_approx = td_to
        td_step = symbolic.pystr_to_symbolic(tile_size)
    else:
        td_from_new = symbolic.pystr_to_symbolic(
            '%s + %s * %s' %
            (symbolic.symstr(td_from), str(new_dim), tile_stride))
        td_to_new_exact = symbolic.pystr_to_symbolic(
            'min(%s + 1, %s + %s * %s + %s) - 1' %
            (symbolic.symstr(td_to), symbolic.symstr(td_from), tile_stride,
             str(new_dim), tile_size))
        td_to_new_approx = symbolic.pystr_to_symbolic(
            '%s + %s * %s + %s - 1' %
            (symbolic.symstr(td_from), tile_stride, str(new_dim), tile_size))
    if divides_evenly or strided:
        td_to_new = td_to_new_approx
    else:
        td_to_new = dace.symbolic.SymExpr(td_to_new_exact, td_to_new_approx)

    # Special case: If range is 1 and no prefix was specified, skip range
    if td_from_new == td_to_new_approx and target_dim == new_dim:
        map_entry.map.range = subsets.Range(
            [r for i, r in enumerate(map_entry.map.range) if i != dim_idx])
        map_entry.map.params = [
            p for i, p in enumerate(map_entry.map.params) if i != dim_idx
        ]
        if len(map_entry.map.params) == 0:
            raise ValueError('Strip-mining all dimensions of the map with '
                             'empty tiles is disallowed')
    else:
        map_entry.map.range[dim_idx] = (td_from_new, td_to_new, td_step)

    # Make internal map's schedule to "not parallel"
    new_map.schedule = map_entry.map.schedule
    map_entry.map.schedule = dtypes.ScheduleType.Sequential

    # Redirect edges
    new_map_entry.in_connectors = dcpy(map_entry.in_connectors)
    sdutil.change_edge_dest(graph, map_entry, new_map_entry)
    new_map_exit.out_connectors = dcpy(map_exit.out_connectors)
    sdutil.change_edge_src(graph, map_exit, new_map_exit)

    # Create new entry edges
    new_in_edges = dict()
    entry_in_conn = {}
    entry_out_conn = {}
    for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry):
        if (src_conn is not None and src_conn[:4] == 'OUT_'
                and not isinstance(sdfg.arrays[memlet.data],
                                   dace.data.Scalar)):
            new_subset = calc_set_image(
                map_entry.map.params,
                map_entry.map.range,
                memlet.subset,
            )
            conn = src_conn[4:]
            key = (memlet.data, 'IN_' + conn, 'OUT_' + conn)
            if key in new_in_edges.keys():
                old_subset = new_in_edges[key].subset
                new_in_edges[key].subset = calc_set_union(
                    old_subset, new_subset)
            else:
                entry_in_conn['IN_' + conn] = None
                entry_out_conn['OUT_' + conn] = None
                new_memlet = dcpy(memlet)
                new_memlet.subset = new_subset
                if memlet.dynamic:
                    new_memlet.num_accesses = memlet.num_accesses
                else:
                    new_memlet.num_accesses = new_memlet.num_elements()
                new_in_edges[key] = new_memlet
        else:
            if src_conn is not None and src_conn[:4] == 'OUT_':
                conn = src_conn[4:]
                in_conn = 'IN_' + conn
                out_conn = 'OUT_' + conn
            else:
                in_conn = src_conn
                out_conn = src_conn
            if in_conn:
                entry_in_conn[in_conn] = None
            if out_conn:
                entry_out_conn[out_conn] = None
            new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
    new_map_entry.out_connectors = entry_out_conn
    map_entry.in_connectors = entry_in_conn
    for (_, in_conn, out_conn), memlet in new_in_edges.items():
        graph.add_edge(new_map_entry, out_conn, map_entry, in_conn, memlet)

    # Create new exit edges
    new_out_edges = dict()
    exit_in_conn = {}
    exit_out_conn = {}
    for _src, _, _dst, dst_conn, memlet in graph.in_edges(map_exit):
        if (dst_conn is not None and dst_conn[:3] == 'IN_'
                and not isinstance(sdfg.arrays[memlet.data],
                                   dace.data.Scalar)):
            new_subset = calc_set_image(
                map_entry.map.params,
                map_entry.map.range,
                memlet.subset,
            )
            conn = dst_conn[3:]
            key = (memlet.data, 'IN_' + conn, 'OUT_' + conn)
            if key in new_out_edges.keys():
                old_subset = new_out_edges[key].subset
                new_out_edges[key].subset = calc_set_union(
                    old_subset, new_subset)
            else:
                exit_in_conn['IN_' + conn] = None
                exit_out_conn['OUT_' + conn] = None
                new_memlet = dcpy(memlet)
                new_memlet.subset = new_subset
                if memlet.dynamic:
                    new_memlet.num_accesses = memlet.num_accesses
                else:
                    new_memlet.num_accesses = new_memlet.num_elements()
                new_out_edges[key] = new_memlet
        else:
            if dst_conn is not None and dst_conn[:3] == 'IN_':
                conn = dst_conn[3:]
                in_conn = 'IN_' + conn
                out_conn = 'OUT_' + conn
            else:
                in_conn = dst_conn
                out_conn = dst_conn
            if in_conn:
                exit_in_conn[in_conn] = None
            if out_conn:
                exit_out_conn[out_conn] = None
            # Exit-side memlets are collected in new_out_edges so they are
            # re-added between map_exit and new_map_exit below
            new_out_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
    new_map_exit.in_connectors = exit_in_conn
    map_exit.out_connectors = exit_out_conn
    for (_, in_conn, out_conn), memlet in new_out_edges.items():
        graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet)

    # Return strip-mined dimension.
    return target_dim, new_dim, new_map
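# --- Illustration (not part of the transformation) -----------------------
# Pure-Python sketch of the iteration space produced by _stripmine above,
# with hypothetical sizes, tile_stride == tile_size and strided == False:
# the new outer dimension runs over int_ceil(N / ts) tiles, and the inner
# (original) dimension covers one tile, clamped at the original upper
# bound (the "exact" end); with divides_evenly, the unclamped
# "approximate" end is safe to use directly.
def _sketch_stripmined_iteration(N=100, ts=32):
    visited = []
    for tile in range((N + ts - 1) // ts):        # new outer dimension
        lo = tile * ts                            # td_from_new
        hi = min(N - 1, lo + ts - 1)              # exact td_to_new
        for i in range(lo, hi + 1):               # original dimension
            visited.append(i)                     # original map body runs here
    assert visited == list(range(N))              # same points, same order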
def expand(self, sdfg, graph, map_entries, map_base_variables=None):
    """ Expansion into outer and inner maps for each map in a specified set.
        The resulting outer maps all have the same range and indices;
        corresponding variables and memlets get changed accordingly. The
        inner map contains the leftover dimensions.

        :param sdfg: Underlying SDFG
        :param graph: Graph in which we expand
        :param map_entries: List of map entries (type MapEntry) that we
                            want to expand
        :param map_base_variables: Optional parameter. List of strings.
                                   If None, expand() searches for the
                                   maximal set of equal map ranges and
                                   pushes those and their corresponding
                                   loop variables into the outer loop.
                                   If specified, expand() pushes the ranges
                                   belonging to the given loop iteration
                                   variables into the outer loop (for
                                   instance, map_base_variables = ['i', 'j']
                                   assumes that all maps have common
                                   iteration indices i and j with
                                   corresponding correct ranges).
    """

    maps = [entry.map for entry in map_entries]

    if not map_base_variables:
        # find the maximal subset of variables to expand
        # greedy if there exist multiple ranges that are equal in a map
        map_base_ranges = helpers.common_map_base_ranges(maps)
        reassignments = helpers.find_reassignment(maps, map_base_ranges)

        ##### first, regroup and reassign
        # create params_dict for every map
        # first, let us define the outer iteration variable names,
        # just take the first map and their indices at common ranges
        map_base_variables = []
        for rng in map_base_ranges:
            for i in range(len(maps[0].params)):
                if maps[0].range[i] == rng and maps[0].params[
                        i] not in map_base_variables:
                    map_base_variables.append(maps[0].params[i])
                    break

        params_dict = {}
        if self.debug:
            print("MultiExpansion::Map_base_variables:", map_base_variables)
            print("MultiExpansion::Map_base_ranges:", map_base_ranges)
        for map in maps:
            # for each map create param dict, first assign identity
            params_dict_map = {param: param for param in map.params}
            # now look for the correct reassignment
            # for every element neq -1, need to change param to map_base_variables[]
            # if param already appears in own dict, do a swap
            # else we just replace it
            for i, reassignment in enumerate(reassignments[map]):
                if reassignment == -1:
                    # nothing to do
                    pass
                else:
                    current_var = map.params[i]
                    current_assignment = params_dict_map[current_var]
                    target_assignment = map_base_variables[reassignment]
                    if current_assignment != target_assignment:
                        if target_assignment in params_dict_map.values():
                            # do a swap
                            key1 = current_var
                            for key, value in params_dict_map.items():
                                if value == target_assignment:
                                    key2 = key

                            value1 = params_dict_map[key1]
                            value2 = params_dict_map[key2]

                            # swap the assigned values of the two keys
                            params_dict_map[key1] = value2
                            params_dict_map[key2] = value1
                        else:
                            # just reassign
                            params_dict_map[current_var] = target_assignment

            # done, assign params_dict_map to the global one
            params_dict[map] = params_dict_map

        for map, map_entry in zip(maps, map_entries):
            map_scope = graph.scope_subgraph(map_entry)
            params_dict_map = params_dict[map]
            for firstp, secondp in params_dict_map.items():
                if firstp != secondp:
                    replace(map_scope, firstp, '__' + firstp + '_fused')
            for firstp, secondp in params_dict_map.items():
                if firstp != secondp:
                    replace(map_scope, '__' + firstp + '_fused', secondp)

            # now also replace the map variables inside maps
            for i in range(len(map.params)):
                map.params[i] = params_dict_map[map.params[i]]

        if self.debug:
            print("MultiExpansion::Params replaced")

    else:
        # just calculate map_base_ranges
        # and check whether all maps are correct
        map_base_ranges = []

        map0 = maps[0]
        for var in map_base_variables:
            index = map0.params.index(var)
            map_base_ranges.append(map0.range[index])

        for map in maps:
            for var, rng in zip(map_base_variables, map_base_ranges):
                assert map.range[map.params.index(var)] == rng

    # then expand all the maps
    for map, map_entry in zip(maps, map_entries):
        if map.get_param_num() == len(map_base_variables):
            # nothing to expand, continue
            continue

        map_exit = graph.exit_node(map_entry)
        # create two new maps, outer and inner
        params_outer = map_base_variables
        ranges_outer = map_base_ranges

        init_params_inner = []
        init_ranges_inner = []
        for param, rng in zip(map.params, map.range):
            if param in map_base_variables:
                continue
            else:
                init_params_inner.append(param)
                init_ranges_inner.append(rng)

        params_inner = init_params_inner
        ranges_inner = subsets.Range(init_ranges_inner)
        inner_map = nodes.Map(label=map.label + '_inner',
                              params=params_inner,
                              ndrange=ranges_inner,
                              schedule=dtypes.ScheduleType.Sequential
                              if self.sequential_innermaps
                              else dtypes.ScheduleType.Default)

        map.label = map.label + '_outer'
        map.params = params_outer
        map.range = ranges_outer

        # create new map entries and exits
        map_entry_inner = nodes.MapEntry(inner_map)
        map_exit_inner = nodes.MapExit(inner_map)

        # analogously to Map_Expansion
        for edge in graph.out_edges(map_entry):
            graph.remove_edge(edge)
            graph.add_memlet_path(map_entry,
                                  map_entry_inner,
                                  edge.dst,
                                  src_conn=edge.src_conn,
                                  memlet=edge.data,
                                  dst_conn=edge.dst_conn)

        dynamic_edges = dynamic_map_inputs(graph, map_entry)
        for edge in dynamic_edges:
            # Remove old edge and connector
            graph.remove_edge(edge)
            edge.dst.remove_in_connector(edge.dst_conn)

            # Propagate to each range it belongs to
            path = []
            for mapnode in [map_entry, map_entry_inner]:
                path.append(mapnode)
                # use a comprehension here: the loop variable 'map' shadows
                # the builtin of the same name in this scope
                if any(edge.dst_conn in [str(s) for s in symbolic.symlist(r)]
                       for r in mapnode.map.range):
                    graph.add_memlet_path(edge.src,
                                          *path,
                                          memlet=edge.data,
                                          src_conn=edge.src_conn,
                                          dst_conn=edge.dst_conn)

        for edge in graph.in_edges(map_exit):
            graph.remove_edge(edge)
            graph.add_memlet_path(edge.src,
                                  map_exit_inner,
                                  map_exit,
                                  memlet=edge.data,
                                  src_conn=edge.src_conn,
                                  dst_conn=edge.dst_conn)
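# --- Illustration (not part of the transformation) -----------------------
# Pure-Python sketch of the regrouping step in expand() with hypothetical
# maps; ranges are written as (begin, end, step) triples. Ranges shared by
# all maps form the common outer map, the rest stays in the inner maps.
def _sketch_common_base_ranges():
    map_a = {'i': (0, 9, 1), 'k': (0, 3, 1)}
    map_b = {'j': (0, 9, 1), 'm': (0, 7, 1)}
    common = set(map_a.values()) & set(map_b.values())
    assert common == {(0, 9, 1)}
    # 'i' and 'j' both cover (0, 9, 1) and are unified into one outer
    # iteration variable; 'k' and 'm' remain in the respective inner maps.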
class MapExpansion(pm.Transformation): """ Implements the map-expansion pattern. Map-expansion takes an N-dimensional map and expands it to N unidimensional maps. New edges abide by the following rules: 1. If there are no edges coming from the outside, use empty memlets 2. Edges with IN_* connectors replicate along the maps 3. Edges for dynamic map ranges replicate until reaching range(s) """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [sdutil.node_path_graph(MapExpansion._map_entry)] @staticmethod def can_be_applied(graph: dace.sdfg.graph.OrderedMultiDiConnectorGraph, candidate: Dict[dace.sdfg.nodes.Node, int], expr_index: int, sdfg: dace.SDFG, strict: bool = False): # A candidate subgraph matches the map-expansion pattern when it # includes an N-dimensional map, with N greater than one. map_entry = graph.nodes()[candidate[MapExpansion._map_entry]] return map_entry.map.get_param_num() > 1 @staticmethod def match_to_str(graph: dace.sdfg.graph.OrderedMultiDiConnectorGraph, candidate: Dict[dace.sdfg.nodes.Node, int]): map_entry = graph.nodes()[candidate[MapExpansion._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg: dace.SDFG): # Extract the map and its entry and exit nodes. graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[MapExpansion._map_entry]] map_exit = graph.exit_node(map_entry) current_map = map_entry.map # Create new maps new_maps = [ nodes.Map(current_map.label + '_' + str(param), [param], subsets.Range([param_range]), schedule=dtypes.ScheduleType.Sequential) for param, param_range in zip(current_map.params[1:], current_map.range[1:]) ] current_map.params = [current_map.params[0]] current_map.range = subsets.Range([current_map.range[0]]) # Create new map entries and exits entries = [nodes.MapEntry(new_map) for new_map in new_maps] exits = [nodes.MapExit(new_map) for new_map in new_maps] # Create edges, abiding by the following rules: # 1. If there are no edges coming from the outside, use empty memlets # 2. Edges with IN_* connectors replicate along the maps # 3. Edges for dynamic map ranges replicate until reaching range(s) for edge in graph.out_edges(map_entry): graph.remove_edge(edge) graph.add_memlet_path(map_entry, *entries, edge.dst, src_conn=edge.src_conn, memlet=edge.data, dst_conn=edge.dst_conn) # Modify dynamic map ranges dynamic_edges = dace.sdfg.dynamic_map_inputs(graph, map_entry) for edge in dynamic_edges: # Remove old edge and connector graph.remove_edge(edge) edge.dst.remove_in_connector(edge.dst_conn) # Propagate to each range it belongs to path = [] for mapnode in [map_entry] + entries: path.append(mapnode) if any(edge.dst_conn in map(str, symbolic.symlist(r)) for r in mapnode.map.range): graph.add_memlet_path(edge.src, *path, memlet=edge.data, src_conn=edge.src_conn, dst_conn=edge.dst_conn) # Create new map exits for edge in graph.in_edges(map_exit): graph.remove_edge(edge) graph.add_memlet_path(edge.src, *exits[::-1], map_exit, memlet=edge.data, src_conn=edge.src_conn, dst_conn=edge.dst_conn)
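# --- Usage sketch (illustrative only; the program below is hypothetical) --
# A minimal demo of MapExpansion: the single 2-D map is expanded into two
# nested 1-D maps, one per parameter.
def _example_map_expansion():
    import dace

    @dace.program
    def add_one(A: dace.float64[16, 16], B: dace.float64[16, 16]):
        for i, j in dace.map[0:16, 0:16]:
            B[i, j] = A[i, j] + 1

    sdfg = add_one.to_sdfg()
    sdfg.apply_transformations(MapExpansion)
    return sdfg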
class StripMining(transformation.Transformation):
    """ Implements the strip-mining transformation.

        Strip-mining takes as input a map dimension and splits it into two
        dimensions. The new dimension iterates over the range of the
        original one with a parameterizable step, called the tile size.
        The original dimension is changed to iterate over the range of the
        tile size, with the same step as before.
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    # Properties
    dim_idx = Property(dtype=int,
                       default=-1,
                       desc="Index of dimension to be strip-mined")
    new_dim_prefix = Property(dtype=str,
                              default="tile",
                              desc="Prefix for new dimension name")
    tile_size = SymbolicProperty(
        default=64,
        desc="Tile size of strip-mined dimension, "
        "or number of tiles if tiling_type=number_of_tiles")
    tile_stride = SymbolicProperty(default=0,
                                   desc="Stride between two tiles of the "
                                   "strip-mined dimension. If zero, it is set "
                                   "equal to the tile size.")
    tile_offset = SymbolicProperty(default=0,
                                   desc="Tile stride offset (negative)")
    divides_evenly = Property(dtype=bool,
                              default=False,
                              desc="Tile size divides dimension range evenly?")
    strided = Property(
        dtype=bool,
        default=False,
        desc="Continuous (false) or strided (true) elements in tile")

    tiling_type = Property(
        dtype=str,
        default='normal',
        choices=['normal', 'ceilrange', 'number_of_tiles'],
        allow_none=True,
        desc="normal: the outer loop increments with tile_size, "
        "ceilrange: uses ceiling(N/tile_size) in outer range, "
        "number_of_tiles: tiles the map into the number of provided tiles, "
        "provide the number of tiles over tile_size")

    skew = Property(
        dtype=bool,
        default=False,
        desc="If True, offsets inner tile back such that it starts with zero")

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(StripMining._map_entry)
            # , StripMining._tasklet, StripMining._map_exit)
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[StripMining._map_entry]]
        return map_entry.map.label + ': ' + str(map_entry.map.params)

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        # Strip-mine selected dimension.
        _, _, new_map = self._stripmine(sdfg, graph, self.subgraph)
        return new_map

    # def __init__(self, tag=True):
    def __init__(self, *args, **kwargs):
        self._entry = nodes.EntryNode()
        self._tasklet = nodes.Tasklet('_')
        self._exit = nodes.ExitNode()
        super().__init__(*args, **kwargs)
        # self.tag = tag

    @property
    def entry(self):
        return self._entry

    @property
    def exit(self):
        return self._exit

    @property
    def tasklet(self):
        return self._tasklet

    def print_match_pattern(self, candidate):
        gentry = candidate[self.entry]
        return str(gentry.map.params[-1])

    def _find_new_dim(self, sdfg: SDFG, state: SDFGState,
                      entry: nodes.MapEntry, prefix: str, target_dim: str):
        """ Finds a variable that is not already defined in scope.
""" stree = state.scope_tree() if len(prefix) == 0: return target_dim candidate = '%s_%s' % (prefix, target_dim) index = 1 while candidate in map(str, stree[entry].defined_vars): candidate = '%s%d_%s' % (prefix, index, target_dim) index += 1 return candidate def _create_strided_range(self, sdfg: SDFG, state: SDFGState, map_entry: nodes.MapEntry): map_exit = state.exit_node(map_entry) dim_idx = self.dim_idx new_dim_prefix = self.new_dim_prefix tile_size = self.tile_size divides_evenly = self.divides_evenly tile_stride = self.tile_stride if tile_stride == 0: tile_stride = tile_size if tile_stride != tile_size: raise NotImplementedError # Retrieve parameter and range of dimension to be strip-mined. target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] new_dim = self._find_new_dim(sdfg, state, map_entry, new_dim_prefix, target_dim) new_dim_range = (td_from, td_to, tile_size) new_map = nodes.Map(map_entry.map.label, [new_dim], subsets.Range([new_dim_range])) dimsym = dace.symbolic.pystr_to_symbolic(new_dim) td_from_new = dimsym if divides_evenly: td_to_new = dimsym + tile_size - 1 else: if isinstance(td_to, dace.symbolic.SymExpr): td_to = td_to.expr td_to_new = dace.symbolic.SymExpr( sympy.Min(dimsym + tile_size - 1, td_to), dimsym + tile_size - 1) td_step_new = td_step return new_dim, new_map, (td_from_new, td_to_new, td_step_new) def _create_ceil_range(self, sdfg: SDFG, graph: SDFGState, map_entry: nodes.MapEntry): map_exit = graph.exit_node(map_entry) # Retrieve transformation properties. dim_idx = self.dim_idx new_dim_prefix = self.new_dim_prefix tile_size = self.tile_size divides_evenly = self.divides_evenly strided = self.strided offset = self.tile_offset tile_stride = self.tile_stride if tile_stride == 0: tile_stride = tile_size # Retrieve parameter and range of dimension to be strip-mined. target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] # Create new map. Replace by cloning map object? 
        new_dim = self._find_new_dim(sdfg, graph, map_entry, new_dim_prefix,
                                     target_dim)
        nd_from = 0
        if tile_stride == 1:
            nd_to = td_to - td_from
        else:
            nd_to = symbolic.pystr_to_symbolic(
                'int_ceil(%s + 1 - %s, %s) - 1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride)))
        nd_step = 1
        new_dim_range = (nd_from, nd_to, nd_step)
        new_map = nodes.Map(new_dim + '_' + map_entry.map.label, [new_dim],
                            subsets.Range([new_dim_range]))

        # Change the range of the selected dimension to iterate over a single
        # tile
        if strided:
            td_from_new = symbolic.pystr_to_symbolic(new_dim)
            td_to_new_approx = td_to
            td_step = tile_size
        elif offset == 0:
            td_from_new = symbolic.pystr_to_symbolic(
                '%s + %s * %s' %
                (symbolic.symstr(td_from), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_stride)))
            td_to_new_exact = symbolic.pystr_to_symbolic(
                'min(%s + 1, %s + %s * %s + %s) - 1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_size)))
            td_to_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s + %s - 1' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(tile_size)))
        else:
            # include offset
            td_from_new_exact = symbolic.pystr_to_symbolic(
                'max(%s, %s + %s * %s - %s)' %
                (symbolic.symstr(td_from), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(offset)))
            td_from_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s - %s' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(offset)))
            td_from_new = dace.symbolic.SymExpr(td_from_new_exact,
                                                td_from_new_approx)

            td_to_new_exact = symbolic.pystr_to_symbolic(
                'min(%s + 1, %s + %s * %s + %s - %s) - 1' %
                (symbolic.symstr(td_to), symbolic.symstr(td_from),
                 symbolic.symstr(tile_stride), symbolic.symstr(new_dim),
                 symbolic.symstr(tile_size), symbolic.symstr(offset)))
            td_to_new_approx = symbolic.pystr_to_symbolic(
                '%s + %s * %s + %s - %s - 1' %
                (symbolic.symstr(td_from), symbolic.symstr(tile_stride),
                 symbolic.symstr(new_dim), symbolic.symstr(tile_size),
                 symbolic.symstr(offset)))
        if divides_evenly or strided:
            td_to_new = td_to_new_approx
        else:
            td_to_new = dace.symbolic.SymExpr(td_to_new_exact,
                                              td_to_new_approx)
        return new_dim, new_map, (td_from_new, td_to_new, td_step)

    def _create_from_tile_numbers(self, sdfg: SDFG, state: SDFGState,
                                  map_entry: nodes.MapEntry):
        map_exit = state.exit_node(map_entry)

        # Retrieve transformation properties.
        dim_idx = self.dim_idx
        new_dim_prefix = self.new_dim_prefix
        divides_evenly = self.divides_evenly
        number_of_tiles = self.tile_size
        tile_stride = self.tile_stride

        number_of_tiles = dace.symbolic.pystr_to_symbolic(number_of_tiles)

        # Retrieve parameter and range of dimension to be strip-mined.
target_dim = map_entry.map.params[dim_idx] td_from, td_to, td_step = map_entry.map.range[dim_idx] tile_size = map_entry.map.range.size_exact()[dim_idx] / number_of_tiles if tile_stride == 0: tile_stride = tile_size if tile_stride != tile_size: raise NotImplementedError new_dim = self._find_new_dim(sdfg, state, map_entry, new_dim_prefix, target_dim) new_dim_range = (td_from, number_of_tiles - 1, 1) new_map = nodes.Map(map_entry.map.label, [new_dim], subsets.Range([new_dim_range])) dimsym = dace.symbolic.pystr_to_symbolic(new_dim) td_from_new = dimsym * tile_size if divides_evenly: td_to_new = (dimsym + 1) * tile_size - 1 else: if isinstance(td_to, dace.symbolic.SymExpr): td_to = td_to.expr td_to_new = dace.symbolic.SymExpr( sympy.Min((dimsym + 1) * tile_size - 1, td_to), (dimsym + 1) * tile_size - 1) td_step_new = td_step return new_dim, new_map, (td_from_new, td_to_new, td_step_new) def _stripmine(self, sdfg, graph, candidate): # Retrieve map entry and exit nodes. map_entry = graph.nodes()[candidate[StripMining._map_entry]] map_exit = graph.exit_node(map_entry) # Retrieve transformation properties. dim_idx = self.dim_idx target_dim = map_entry.map.params[dim_idx] if self.tiling_type == 'ceilrange': new_dim, new_map, td_rng = self._create_ceil_range( sdfg, graph, map_entry) elif self.tiling_type == 'number_of_tiles': new_dim, new_map, td_rng = self._create_from_tile_numbers( sdfg, graph, map_entry) else: new_dim, new_map, td_rng = self._create_strided_range( sdfg, graph, map_entry) new_map_entry = nodes.MapEntry(new_map) new_map_exit = nodes.MapExit(new_map) td_to_new_approx = td_rng[1] if isinstance(td_to_new_approx, dace.symbolic.SymExpr): td_to_new_approx = td_to_new_approx.approx # Special case: If range is 1 and no prefix was specified, skip range if td_rng[0] == td_to_new_approx and target_dim == new_dim: map_entry.map.range = subsets.Range( [r for i, r in enumerate(map_entry.map.range) if i != dim_idx]) map_entry.map.params = [ p for i, p in enumerate(map_entry.map.params) if i != dim_idx ] if len(map_entry.map.params) == 0: raise ValueError('Strip-mining all dimensions of the map with ' 'empty tiles is disallowed') else: map_entry.map.range[dim_idx] = td_rng # Make internal map's schedule to "not parallel" new_map.schedule = map_entry.map.schedule map_entry.map.schedule = dtypes.ScheduleType.Sequential # Redirect edges new_map_entry.in_connectors = dcpy(map_entry.in_connectors) sdutil.change_edge_dest(graph, map_entry, new_map_entry) new_map_exit.out_connectors = dcpy(map_exit.out_connectors) sdutil.change_edge_src(graph, map_exit, new_map_exit) # Create new entry edges new_in_edges = dict() entry_in_conn = {} entry_out_conn = {} for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry): if (src_conn is not None and src_conn[:4] == 'OUT_' and not isinstance( sdfg.arrays[memlet.data], dace.data.Scalar)): new_subset = calc_set_image( map_entry.map.params, map_entry.map.range, memlet.subset, ) conn = src_conn[4:] key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) if key in new_in_edges.keys(): old_subset = new_in_edges[key].subset new_in_edges[key].subset = calc_set_union( old_subset, new_subset) else: entry_in_conn['IN_' + conn] = None entry_out_conn['OUT_' + conn] = None new_memlet = dcpy(memlet) new_memlet.subset = new_subset if memlet.dynamic: new_memlet.num_accesses = memlet.num_accesses else: new_memlet.num_accesses = new_memlet.num_elements() new_in_edges[key] = new_memlet else: if src_conn is not None and src_conn[:4] == 'OUT_': conn = src_conn[4:] in_conn = 'IN_' + conn 
                    out_conn = 'OUT_' + conn
                else:
                    in_conn = src_conn
                    out_conn = src_conn
                if in_conn:
                    entry_in_conn[in_conn] = None
                if out_conn:
                    entry_out_conn[out_conn] = None
                new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
        new_map_entry.out_connectors = entry_out_conn
        map_entry.in_connectors = entry_in_conn
        for (_, in_conn, out_conn), memlet in new_in_edges.items():
            graph.add_edge(new_map_entry, out_conn, map_entry, in_conn,
                           memlet)

        # Create new exit edges
        new_out_edges = dict()
        exit_in_conn = {}
        exit_out_conn = {}
        for _src, _, _dst, dst_conn, memlet in graph.in_edges(map_exit):
            if (dst_conn is not None and dst_conn[:3] == 'IN_'
                    and not isinstance(sdfg.arrays[memlet.data],
                                       dace.data.Scalar)):
                new_subset = calc_set_image(
                    map_entry.map.params,
                    map_entry.map.range,
                    memlet.subset,
                )
                conn = dst_conn[3:]
                key = (memlet.data, 'IN_' + conn, 'OUT_' + conn)
                if key in new_out_edges.keys():
                    old_subset = new_out_edges[key].subset
                    new_out_edges[key].subset = calc_set_union(
                        old_subset, new_subset)
                else:
                    exit_in_conn['IN_' + conn] = None
                    exit_out_conn['OUT_' + conn] = None
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    if memlet.dynamic:
                        new_memlet.num_accesses = memlet.num_accesses
                    else:
                        new_memlet.num_accesses = new_memlet.num_elements()
                    new_out_edges[key] = new_memlet
            else:
                if dst_conn is not None and dst_conn[:3] == 'IN_':
                    conn = dst_conn[3:]
                    in_conn = 'IN_' + conn
                    out_conn = 'OUT_' + conn
                else:
                    in_conn = dst_conn
                    out_conn = dst_conn
                if in_conn:
                    exit_in_conn[in_conn] = None
                if out_conn:
                    exit_out_conn[out_conn] = None
                # Exit-side memlets are collected in new_out_edges so they
                # are re-added between map_exit and new_map_exit below
                new_out_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
        new_map_exit.in_connectors = exit_in_conn
        map_exit.out_connectors = exit_out_conn
        for (_, in_conn, out_conn), memlet in new_out_edges.items():
            graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet)

        # Skew if necessary
        if self.skew:
            xfh.offset_map(sdfg, graph, map_entry, dim_idx, td_rng[0])

        # Return strip-mined dimension.
        return target_dim, new_dim, new_map
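# --- Usage sketch (illustrative only; the program below is hypothetical) --
# A minimal demo of StripMining through apply_transformations; the option
# keys correspond to the properties declared on the class above.
def _example_strip_mining():
    import dace

    @dace.program
    def saxpy(A: dace.float64[256], B: dace.float64[256]):
        for i in dace.map[0:256]:
            B[i] = 2.0 * A[i] + B[i]

    sdfg = saxpy.to_sdfg()
    # 64 divides 256 evenly, so the exact/approximate tile ends coincide.
    sdfg.apply_transformations(StripMining,
                               options={
                                   'tile_size': 64,
                                   'divides_evenly': True
                               })
    return sdfg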
class MapToForLoop(pattern_matching.Transformation): """ Implements the Map to for-loop transformation. Takes a map and enforces a sequential schedule by transforming it into a state-machine of a for-loop. Creates a nested SDFG, if necessary. """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def annotates_memlets(): return True @staticmethod def expressions(): return [sdutil.node_path_graph(MapToForLoop._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): # Only uni-dimensional maps are accepted. map_entry = graph.nodes()[candidate[MapToForLoop._map_entry]] if len(map_entry.map.params) > 1: return False return True @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[MapToForLoop._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg) -> Tuple[nodes.NestedSDFG, SDFGState]: """ Applies the transformation and returns a tuple with the new nested SDFG node and the main state in the for-loop. """ # Retrieve map entry and exit nodes. graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[MapToForLoop._map_entry]] map_exit = graph.exit_node(map_entry) loop_idx = map_entry.map.params[0] loop_from, loop_to, loop_step = map_entry.map.range[0] # Turn the map scope into a nested SDFG node = nest_state_subgraph(sdfg, graph, graph.scope_subgraph(map_entry)) nsdfg: SDFG = node.sdfg nstate: SDFGState = nsdfg.nodes()[0] # If map range is dynamic, replace loop expressions with memlets param_to_edge = {} for edge in nstate.in_edges(map_entry): if edge.dst_conn and not edge.dst_conn.startswith('IN_'): param = '__DACE_P%d' % len(param_to_edge) repldict = {symbolic.pystr_to_symbolic(edge.dst_conn): param} param_to_edge[param] = edge loop_from = loop_from.subs(repldict) loop_to = loop_to.subs(repldict) loop_step = loop_step.subs(repldict) # Avoiding import loop from dace.codegen.targets.cpp import cpp_array_expr def replace_param(param): param = symbolic.symstr(param) for p, pval in param_to_edge.items(): # TODO: Correct w.r.t. connector type param = param.replace(p, cpp_array_expr(nsdfg, pval.data)) return param # End of dynamic input range # Create a loop inside the nested SDFG nsdfg.add_loop(None, nstate, None, loop_idx, replace_param(loop_from), '%s < %s' % (loop_idx, replace_param(loop_to + 1)), '%s + %s' % (loop_idx, replace_param(loop_step))) # Skip map in input edges for edge in nstate.out_edges(map_entry): src_node = nstate.memlet_path(edge)[0].src nstate.add_edge(src_node, None, edge.dst, edge.dst_conn, edge.data) nstate.remove_edge(edge) # Skip map in output edges for edge in nstate.in_edges(map_exit): dst_node = nstate.memlet_path(edge)[-1].dst nstate.add_edge(edge.src, edge.src_conn, dst_node, None, edge.data) nstate.remove_edge(edge) # Remove nodes from dynamic map range nstate.remove_nodes_from( [e.src for e in dace.sdfg.dynamic_map_inputs(nstate, map_entry)]) # Remove scope nodes nstate.remove_nodes_from([map_entry, map_exit]) return node, nstate
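# --- Usage sketch (illustrative only; the program below is hypothetical) --
# A minimal demo of MapToForLoop: the one-dimensional map is replaced by a
# for-loop state machine inside a nested SDFG.
def _example_map_to_for_loop():
    import dace

    @dace.program
    def scale(A: dace.float64[32]):
        for i in dace.map[0:32]:
            A[i] = 2.0 * A[i]

    sdfg = scale.to_sdfg()
    sdfg.apply_transformations(MapToForLoop)
    return sdfg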
def merge_maps( graph: SDFGState, outer_map_entry: nd.MapEntry, outer_map_exit: nd.MapExit, inner_map_entry: nd.MapEntry, inner_map_exit: nd.MapExit, param_merge: Callable[[ParamsType, ParamsType], ParamsType] = lambda p1, p2: p1 + p2, range_merge: Callable[[RangesType, RangesType], RangesType] = lambda r1, r2: type(r1) (r1.ranges + r2.ranges) ) -> (nd.MapEntry, nd.MapExit): """ Merges two maps (their entries and exits). It is assumed that the operation is valid. """ outer_map = outer_map_entry.map inner_map = inner_map_entry.map # Create merged map by inheriting attributes from outer map and using # the merge functions for parameters and ranges. merged_map = copy.deepcopy(outer_map) merged_map.label = outer_map.label merged_map.params = param_merge(outer_map.params, inner_map.params) merged_map.range = range_merge(outer_map.range, inner_map.range) merged_entry = nd.MapEntry(merged_map) merged_entry.in_connectors = outer_map_entry.in_connectors merged_entry.out_connectors = outer_map_entry.out_connectors merged_exit = nd.MapExit(merged_map) merged_exit.in_connectors = outer_map_exit.in_connectors merged_exit.out_connectors = outer_map_exit.out_connectors graph.add_nodes_from([merged_entry, merged_exit]) # Handle the case of dynamic map inputs in the inner map inner_dynamic_map_inputs = dynamic_map_inputs(graph, inner_map_entry) for edge in inner_dynamic_map_inputs: remove_conn = (len( list(graph.out_edges_by_connector(edge.src, edge.src_conn))) == 1) conn_to_remove = edge.src_conn[4:] if remove_conn: merged_entry.remove_in_connector('IN_' + conn_to_remove) merged_entry.remove_out_connector('OUT_' + conn_to_remove) merged_entry.add_in_connector( edge.dst_conn, inner_map_entry.in_connectors[edge.dst_conn]) outer_edge = next( graph.in_edges_by_connector(outer_map_entry, 'IN_' + conn_to_remove)) graph.add_edge(outer_edge.src, outer_edge.src_conn, merged_entry, edge.dst_conn, outer_edge.data) if remove_conn: graph.remove_edge(outer_edge) # Redirect inner in edges. for edge in graph.out_edges(inner_map_entry): if edge.src_conn is None: # Empty memlets graph.add_edge(merged_entry, edge.src_conn, edge.dst, edge.dst_conn, edge.data) continue # Get memlet path and edge path = graph.memlet_path(edge) ind = path.index(edge) # Add an edge directly from the previous source connector to the # destination graph.add_edge(merged_entry, path[ind - 1].src_conn, edge.dst, edge.dst_conn, edge.data) # Redirect inner out edges. for edge in graph.in_edges(inner_map_exit): if edge.dst_conn is None: # Empty memlets graph.add_edge(edge.src, edge.src_conn, merged_exit, edge.dst_conn, edge.data) continue # Get memlet path and edge path = graph.memlet_path(edge) ind = path.index(edge) # Add an edge directly from the source to the next destination # connector graph.add_edge(edge.src, edge.src_conn, merged_exit, path[ind + 1].dst_conn, edge.data) # Redirect outer edges. change_edge_dest(graph, outer_map_entry, merged_entry) change_edge_src(graph, outer_map_exit, merged_exit) # Clean-up graph.remove_nodes_from( [outer_map_entry, outer_map_exit, inner_map_entry, inner_map_exit]) return merged_entry, merged_exit
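# --- Illustration (not part of merge_maps) --------------------------------
# Pure-Python sketch of the default merge callbacks with hypothetical
# values: parameters and ranges simply concatenate, so merging the inner
# Map[j=0:J] into the outer Map[i=0:I] yields Map[i=0:I, j=0:J].
def _sketch_default_map_merge():
    p_outer, p_inner = ['i'], ['j']
    r_outer, r_inner = [(0, 9, 1)], [(0, 4, 1)]
    merged_params = p_outer + p_inner
    merged_ranges = r_outer + r_inner
    assert merged_params == ['i', 'j']
    assert merged_ranges == [(0, 9, 1), (0, 4, 1)]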
class MapWCRFusion(pm.Transformation): """ Implements the map expanded-reduce fusion transformation. Fuses a map with an immediately following reduction, where the array between the map and the reduction is not used anywhere else, and the reduction is divided to two maps with a WCR, denoting partial reduction. """ _tasklet = nodes.Tasklet('_') _tmap_exit = nodes.MapExit(nodes.Map("", [], [])) _in_array = nodes.AccessNode('_') _rmap_in_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_in_tasklet = nodes.Tasklet('_') _rmap_in_cr = nodes.MapExit(nodes.Map("", [], [])) _rmap_out_entry = nodes.MapEntry(nodes.Map("", [], [])) _rmap_out_exit = nodes.MapExit(nodes.Map("", [], [])) _out_array = nodes.AccessNode('_') @staticmethod def expressions(): return [ # Map, then partial reduction of axes sdutil.node_path_graph( MapWCRFusion._tasklet, MapWCRFusion._tmap_exit, MapWCRFusion._in_array, MapWCRFusion._rmap_out_entry, MapWCRFusion._rmap_in_entry, MapWCRFusion._rmap_in_tasklet, MapWCRFusion._rmap_in_cr, MapWCRFusion._rmap_out_exit, MapWCRFusion._out_array) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): tmap_exit = graph.nodes()[candidate[MapWCRFusion._tmap_exit]] in_array = graph.nodes()[candidate[MapWCRFusion._in_array]] rmap_entry = graph.nodes()[candidate[MapWCRFusion._rmap_out_entry]] # Make sure that the array is only accessed by the map and the reduce if any([ src != tmap_exit for src, _, _, _, memlet in graph.in_edges(in_array) ]): return False if any([ dest != rmap_entry for _, _, dest, _, memlet in graph.out_edges(in_array) ]): return False # Make sure that there is a reduction in the second map rmap_cr = graph.nodes()[candidate[MapWCRFusion._rmap_in_cr]] reduce_edge = graph.in_edges(rmap_cr)[0] if reduce_edge.data.wcr is None: return False # (strict) Make sure that the transient is not accessed anywhere else # in this state or other states if strict and (len([ n for n in graph.nodes() if isinstance(n, nodes.AccessNode) and n.data == in_array.data ]) > 1 or in_array.data in sdfg.shared_transients()): return False # Verify that reduction ranges match tasklet map tout_memlet = graph.in_edges(in_array)[0].data rin_memlet = graph.out_edges(in_array)[0].data if tout_memlet.subset != rin_memlet.subset: return False return True @staticmethod def match_to_str(graph, candidate): tasklet = candidate[MapWCRFusion._tasklet] map_exit = candidate[MapWCRFusion._tmap_exit] reduce = candidate[MapWCRFusion._rmap_in_cr] return ' -> '.join(str(node) for node in [tasklet, map_exit, reduce]) def apply(self, sdfg): graph = sdfg.node(self.state_id) # To apply, collapse the second map and then fuse the two resulting maps map_collapse = MapCollapse( self.sdfg_id, self.state_id, { MapCollapse._outer_map_entry: self.subgraph[MapWCRFusion._rmap_out_entry], MapCollapse._inner_map_entry: self.subgraph[MapWCRFusion._rmap_in_entry] }, 0) map_entry, _ = map_collapse.apply(sdfg) map_fusion = MapFusion( self.sdfg_id, self.state_id, { MapFusion._first_map_exit: self.subgraph[MapWCRFusion._tmap_exit], MapFusion._second_map_entry: graph.node_id(map_entry) }, 0) map_fusion.apply(sdfg)
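# --- Usage sketch (illustrative only) --------------------------------------
# MapWCRFusion matches code of the following shape (names hypothetical)
# once a reduction has been expanded into its pure two-map form with
# write-conflict resolution (WCR):
#
#     tmp[i, j] = f(A[i, j])       # producer map + tasklet
#     out[i]   += tmp[i, j]        # two nested maps writing with WCR
#
# Given an `sdfg` containing that pattern, it is applied like any other
# pattern transformation, e.g. sdfg.apply_transformations(MapWCRFusion).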
class MapTiling(transformation.Transformation): """ Implements the orthogonal tiling transformation. Orthogonal tiling is a type of nested map fission that creates tiles in every dimension of the matched Map. """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) # Properties prefix = Property(dtype=str, default="tile", desc="Prefix for new range symbols") tile_sizes = ShapeProperty(dtype=tuple, default=(128, 128, 128), desc="Tile size per dimension") strides = ShapeProperty( dtype=tuple, default=tuple(), desc="Tile stride (enables overlapping tiles). If empty, matches tile") tile_offset = ShapeProperty(dtype=tuple, default=None, desc="Negative Stride offset per dimension", allow_none=True) divides_evenly = Property(dtype=bool, default=False, desc="Tile size divides dimension length evenly") @staticmethod def annotates_memlets(): return True @staticmethod def expressions(): return [sdutil.node_path_graph(MapTiling._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): return True @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[MapTiling._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] tile_strides = self.tile_sizes if self.strides is not None and len(self.strides) == len(tile_strides): tile_strides = self.strides # Retrieve map entry and exit nodes. map_entry = graph.nodes()[self.subgraph[MapTiling._map_entry]] from dace.transformation.dataflow.map_collapse import MapCollapse from dace.transformation.dataflow.strip_mining import StripMining stripmine_subgraph = { StripMining._map_entry: self.subgraph[MapTiling._map_entry] } sdfg_id = sdfg.sdfg_id last_map_entry = None removed_maps = 0 original_schedule = map_entry.schedule for dim_idx in range(len(map_entry.map.params)): if dim_idx >= len(self.tile_sizes): tile_size = symbolic.pystr_to_symbolic(self.tile_sizes[-1]) tile_stride = symbolic.pystr_to_symbolic(tile_strides[-1]) else: tile_size = symbolic.pystr_to_symbolic( self.tile_sizes[dim_idx]) tile_stride = symbolic.pystr_to_symbolic(tile_strides[dim_idx]) # handle offsets if self.tile_offset and dim_idx >= len(self.tile_offset): offset = self.tile_offset[-1] elif self.tile_offset: offset = self.tile_offset[dim_idx] else: offset = 0 dim_idx -= removed_maps # If tile size is trivial, skip strip-mining map dimension if tile_size == map_entry.map.range.size()[dim_idx]: continue stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph, self.expr_index) # Special case: Tile size of 1 should be omitted from inner map if tile_size == 1 and tile_stride == 1: stripmine.dim_idx = dim_idx stripmine.new_dim_prefix = '' stripmine.tile_size = str(tile_size) stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = True stripmine.tile_offset = str(offset) stripmine.apply(sdfg) removed_maps += 1 else: stripmine.dim_idx = dim_idx stripmine.new_dim_prefix = self.prefix stripmine.tile_size = str(tile_size) stripmine.tile_stride = str(tile_stride) stripmine.divides_evenly = self.divides_evenly stripmine.tile_offset = str(offset) stripmine.apply(sdfg) # apply to the new map the schedule of the original one map_entry.schedule = original_schedule if last_map_entry: new_map_entry = graph.in_edges(map_entry)[0].src mapcollapse_subgraph = { MapCollapse._outer_map_entry: graph.node_id(last_map_entry), MapCollapse._inner_map_entry: graph.node_id(new_map_entry) } mapcollapse = MapCollapse(sdfg_id, self.state_id, mapcollapse_subgraph, 0) 
mapcollapse.apply(sdfg) last_map_entry = graph.in_edges(map_entry)[0].src
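# --- Usage sketch (illustrative only; the program below is hypothetical) --
# A minimal demo of MapTiling: tile both dimensions of the map with 32x32
# tiles; option keys follow the properties declared on the class above.
def _example_map_tiling():
    import dace

    @dace.program
    def transpose(A: dace.float64[128, 128], B: dace.float64[128, 128]):
        for i, j in dace.map[0:128, 0:128]:
            B[j, i] = A[i, j]

    sdfg = transpose.to_sdfg()
    sdfg.apply_transformations(MapTiling, options={'tile_sizes': (32, 32)})
    return sdfg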
def fuse(self, sdfg, graph, map_entries, do_not_override=None, **kwargs):
    """ Takes the specified map entries and tries to fuse the
        corresponding maps. All maps have to be extended into an outer
        and inner map (use MapExpansion as a pre-pass).

        Arrays that do not exist outside the subgraph get pushed into the
        map and their data dimension gets cropped. Otherwise the original
        array is taken.

        For every output, the respective connections are created
        automatically.

        :param sdfg: SDFG
        :param graph: State
        :param map_entries: Map entries (class MapEntry) of the outer maps
                            which we want to fuse
        :param do_not_override: List of data names whose corresponding
                                nodes are fully contained within the
                                subgraph but should not be
                                augmented/transformed nevertheless.
    """

    # if there are no maps, return immediately
    if len(map_entries) == 0:
        return

    do_not_override = do_not_override or []

    # get maps and map exits
    maps = [map_entry.map for map_entry in map_entries]
    map_exits = [graph.exit_node(map_entry) for map_entry in map_entries]

    # See function documentation for an explanation of these variables
    node_config = SubgraphFusion.get_adjacent_nodes(sdfg, graph, map_entries)
    (in_nodes, intermediate_nodes, out_nodes) = node_config

    if self.debug:
        print("SubgraphFusion::In_nodes", in_nodes)
        print("SubgraphFusion::Out_nodes", out_nodes)
        print("SubgraphFusion::Intermediate_nodes", intermediate_nodes)

    # all maps are assumed to have the same params and range in order
    global_map = nodes.Map(label="outer_fused",
                           params=maps[0].params,
                           ndrange=maps[0].range)
    global_map_entry = nodes.MapEntry(global_map)
    global_map_exit = nodes.MapExit(global_map)

    schedule = map_entries[0].schedule
    global_map_entry.schedule = schedule
    graph.add_node(global_map_entry)
    graph.add_node(global_map_exit)

    # next up, for any intermediate node, find whether it only appears
    # in the subgraph or also somewhere else / as an input
    # create new transients for nodes that are in out_nodes and
    # intermediate_nodes simultaneously
    # also check which dimensions of each transient data element correspond
    # to map axes and write this information into a dict.
    node_info = self.prepare_intermediate_nodes(sdfg, graph, in_nodes,
                                                out_nodes,
                                                intermediate_nodes,
                                                map_entries, map_exits,
                                                do_not_override)

    (subgraph_contains_data, transients_created,
     invariant_dimensions) = node_info
    if self.debug:
        print(
            "SubgraphFusion:: {Intermediate_node: subgraph_contains_data} dict"
        )
        print(subgraph_contains_data)

    inconnectors_dict = {}
    # Dict for saving incoming nodes and their assigned connectors
    # Format: {access_node: (edge, in_conn, out_conn)}

    for map_entry, map_exit in zip(map_entries, map_exits):
        # handle inputs
        # TODO: dynamic map range -- this is fairly unrealistic in such a setting
        for edge in graph.in_edges(map_entry):
            src = edge.src
            mmt = graph.memlet_tree(edge)
            out_edges = [child.edge for child in mmt.root().children]

            if src in in_nodes:
                in_conn = None
                out_conn = None
                if src in inconnectors_dict:
                    # no need to augment subset of outer edge.
                    # will do this at the end in one pass.
in_conn = inconnectors_dict[src][1] out_conn = inconnectors_dict[src][2] else: next_conn = global_map_entry.next_connector() in_conn = 'IN_' + next_conn out_conn = 'OUT_' + next_conn global_map_entry.add_in_connector(in_conn) global_map_entry.add_out_connector(out_conn) inconnectors_dict[src] = (edge, in_conn, out_conn) # reroute in edge via global_map_entry self.copy_edge(graph, edge, new_dst = global_map_entry, \ new_dst_conn = in_conn) # map out edges to new map for out_edge in out_edges: self.copy_edge(graph, out_edge, new_src = global_map_entry, \ new_src_conn = out_conn) else: # connect directly for out_edge in out_edges: mm = dcpy(out_edge.data) self.copy_edge(graph, out_edge, new_src=src, new_src_conn=None, new_data=mm) for edge in graph.out_edges(map_entry): # special case: for nodes that have no data connections if not edge.src_conn: self.copy_edge(graph, edge, new_src=global_map_entry) ###################################### for edge in graph.in_edges(map_exit): if not edge.dst_conn: # no destination connector, path ends here. self.copy_edge(graph, edge, new_dst=global_map_exit) continue # find corresponding out_edges for current edge, cannot use mmt anymore out_edges = [ oedge for oedge in graph.out_edges(map_exit) if oedge.src_conn[3:] == edge.dst_conn[2:] ] # Tuple to store in/out connector port that might be created port_created = None for out_edge in out_edges: dst = out_edge.dst if dst in intermediate_nodes & out_nodes: # create connection through global map from # dst to dst_transient that was created dst_transient = transients_created[dst] next_conn = global_map_exit.next_connector() in_conn = 'IN_' + next_conn out_conn = 'OUT_' + next_conn global_map_exit.add_in_connector(in_conn) global_map_exit.add_out_connector(out_conn) # for each transient created, create a union # of outgoing memlets' subsets. this is # a cheap fix to override assignments in invariant # dimensions union = None for oe in graph.out_edges(transients_created[dst]): union = subsets.union(union, oe.data.subset) inner_memlet = dcpy(edge.data) for i, s in enumerate(edge.data.subset): if i in invariant_dimensions[dst.label]: inner_memlet.subset[i] = union[i] inner_memlet.other_subset = dcpy(inner_memlet.subset) e_inner = graph.add_edge(dst, None, global_map_exit, in_conn, inner_memlet) mm_outer = propagate_memlet(graph, inner_memlet, global_map_entry, \ union_inner_edges = False) e_outer = graph.add_edge(global_map_exit, out_conn, dst_transient, None, mm_outer) # remove edge from dst to dst_transient that was created # in intermediate preparation. 
                    for e in graph.out_edges(dst):
                        if e.dst == dst_transient:
                            graph.remove_edge(e)
                            break

                # handle separately: intermediate_nodes and pure out nodes
                # case 1: intermediate_nodes: can just redirect edge
                if dst in intermediate_nodes:
                    self.copy_edge(graph,
                                   out_edge,
                                   new_src=edge.src,
                                   new_src_conn=edge.src_conn,
                                   new_data=dcpy(edge.data))

                # case 2: pure out node: connect to outer array node
                if dst in (out_nodes - intermediate_nodes):
                    if edge.dst != global_map_exit:
                        next_conn = global_map_exit.next_connector()
                        in_conn = 'IN_' + next_conn
                        out_conn = 'OUT_' + next_conn
                        global_map_exit.add_in_connector(in_conn)
                        global_map_exit.add_out_connector(out_conn)
                        self.copy_edge(graph,
                                       edge,
                                       new_dst=global_map_exit,
                                       new_dst_conn=in_conn)
                        port_created = (in_conn, out_conn)
                    else:
                        # the connector pair was already created in an
                        # earlier iteration; reuse it
                        in_conn, out_conn = port_created

                    # map
                    graph.add_edge(global_map_exit, out_conn, dst, None,
                                   dcpy(out_edge.data))

        # maps are now ready to be discarded
        # all connected edges will be finally removed as well
        graph.remove_node(map_entry)
        graph.remove_node(map_exit)

    # create a mapping from data arrays to offsets
    # for later memlet adjustments
    min_offsets = dict()

    # do one pass to augment all transient arrays
    data_intermediate = set([node.data for node in intermediate_nodes])
    for data_name in data_intermediate:
        if subgraph_contains_data[data_name]:
            all_nodes = [n for n in intermediate_nodes if n.data == data_name]
            in_edges = list(chain(*(graph.in_edges(n) for n in all_nodes)))

            in_edges_iter = iter(in_edges)
            in_edge = next(in_edges_iter)
            target_subset = dcpy(in_edge.data.subset)
            target_subset.pop(invariant_dimensions[data_name])
            ######
            while True:
                try:  # executed if there are multiple in_edges
                    in_edge = next(in_edges_iter)
                    target_subset_curr = dcpy(in_edge.data.subset)
                    target_subset_curr.pop(invariant_dimensions[data_name])
                    target_subset = subsets.union(target_subset,
                                                  target_subset_curr)
                except StopIteration:
                    break

            min_offsets_cropped = target_subset.min_element_approx()
            # calculate the new transient array size.
            target_subset.offset(min_offsets_cropped, True)

            # re-add invariant dimensions with offset 0 and save to min_offsets
            min_offset = []
            index = 0
            for i in range(len(sdfg.data(data_name).shape)):
                if i in invariant_dimensions[data_name]:
                    min_offset.append(0)
                else:
                    min_offset.append(min_offsets_cropped[index])
                    index += 1

            min_offsets[data_name] = min_offset

            # determine the shape of the new array.
            new_data_shape = []
            index = 0
            for i, sz in enumerate(sdfg.data(data_name).shape):
                if i in invariant_dimensions[data_name]:
                    new_data_shape.append(sz)
                else:
                    new_data_shape.append(target_subset.size()[index])
                    index += 1

            new_data_strides = [
                data._prod(new_data_shape[i + 1:])
                for i in range(len(new_data_shape))
            ]

            new_data_totalsize = data._prod(new_data_shape)
            new_data_offset = [0] * len(new_data_shape)

            # augment.
            transient_to_transform = sdfg.data(data_name)
            transient_to_transform.shape = new_data_shape
            transient_to_transform.strides = new_data_strides
            transient_to_transform.total_size = new_data_totalsize
            transient_to_transform.offset = new_data_offset
            transient_to_transform.lifetime = dtypes.AllocationLifetime.Scope
            transient_to_transform.storage = self.transient_allocation

        else:
            # don't modify data container - array is needed outside
            # of subgraph.
            # hack: set lifetime to State if allocation has only been
            # scope so far to avoid allocation issues
            if sdfg.data(data_name).lifetime == \
                    dtypes.AllocationLifetime.Scope:
                sdfg.data(data_name).lifetime = \
                    dtypes.AllocationLifetime.State

    # do one pass to adjust the memlets of in-between transients
    for node in intermediate_nodes:
        # all incoming edges to node
        in_edges = graph.in_edges(node)
        # outgoing edges going to another fused part
        out_edges = graph.out_edges(node)

        # memlets of created transient:
        # correct data names
        if node in transients_created:
            transient_in_edges = graph.in_edges(transients_created[node])
            transient_out_edges = graph.out_edges(transients_created[node])
            for edge in chain(transient_in_edges, transient_out_edges):
                for e in graph.memlet_tree(edge):
                    if e.data.data == node.data:
                        e.data.data += '_OUT'

        # memlets of all in between transients:
        # offset memlets if array has been augmented
        if subgraph_contains_data[node.data]:
            # get min_offset
            min_offset = min_offsets[node.data]
            # re-add invariant dimensions with offset 0
            for iedge in in_edges:
                for edge in graph.memlet_tree(iedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)
                # nested SDFG: adjust arrays connected
                if isinstance(iedge.src, nodes.NestedSDFG):
                    nsdfg = iedge.src.sdfg
                    nested_data_name = edge.src_conn
                    self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                             nested_data_name)

            for cedge in out_edges:
                for edge in graph.memlet_tree(cedge):
                    if edge.data.data == node.data:
                        edge.data.subset.offset(min_offset, True)
                    elif edge.data.other_subset:
                        edge.data.other_subset.offset(min_offset, True)
                    # nested SDFG: adjust arrays connected
                    if isinstance(edge.dst, nodes.NestedSDFG):
                        nsdfg = edge.dst.sdfg
                        nested_data_name = edge.dst_conn
                        self.adjust_arrays_nsdfg(sdfg, nsdfg, node.data,
                                                 nested_data_name)

        # if in_edges has several entries:
        # put other_subset into out_edges for correctness
        if len(in_edges) > 1:
            for oedge in out_edges:
                if oedge.dst == global_map_exit and \
                   oedge.data.other_subset is None:
                    oedge.data.other_subset = dcpy(oedge.data.subset)
                    oedge.data.other_subset.offset(min_offset, True)

    # consolidate edges if desired
    if self.consolidate:
        consolidate_edges_scope(graph, global_map_entry)
        consolidate_edges_scope(graph, global_map_exit)

    # propagate edges adjacent to global map entry and exit
    # if desired
    if self.propagate:
        _propagate_node(graph, global_map_entry)
        _propagate_node(graph, global_map_exit)

    # create a hook for outside access to global_map
    self._global_map_entry = global_map_entry
    if self.schedule_innermaps is not None:
        for node in graph.scope_children()[global_map_entry]:
            if isinstance(node, nodes.MapEntry):
                node.map.schedule = self.schedule_innermaps
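# --- Illustration (not part of fuse) ----------------------------------------
# Pure-Python sketch of the transient cropping performed above, for one
# dimension and hypothetical accesses: the union of all incoming subsets
# determines the new (cropped) size, and its minimum element becomes the
# offset subtracted from every memlet that touches the array.
def _sketch_transient_cropping():
    accesses = [(8, 12), (10, 15)]        # inclusive (begin, end) per in-edge
    lo = min(b for b, _ in accesses)      # min offset -> 8
    hi = max(e for _, e in accesses)      # union upper bound -> 15
    new_size = hi - lo + 1                # cropped size: 8 instead of e.g. 64
    shifted = [(b - lo, e - lo) for b, e in accesses]
    assert new_size == 8
    assert shifted == [(0, 4), (2, 7)]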
class GPUMultiTransformMap(transformation.Transformation): """ Implements the GPUMultiTransformMap transformation. Tiles a single map into 2 maps. The outer map is of schedule GPU_Multidevice and loops over the GPUs, while the inner map is a GPU-scheduled map. It also creates GPU transient arrays between the two maps. """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) dim_idx = Property(dtype=int, default=-1, desc="Index of dimension to be distributed.") new_dim_prefix = Property(dtype=str, default="gpu", allow_none=True, desc="Prefix for new dimension name") new_transient_prefix = Property(dtype=str, default="gpu_multi", allow_none=True, desc="Prefix for the transient name") skip_scalar = Property( dtype=bool, default=True, allow_none=True, desc="If True: skips the scalar data nodes. " "If False: creates localstorage for scalar transients.") use_p2p = Property( dtype=bool, default=False, allow_none=True, desc="If True: uses peer-to-peer access if a data container is already " "located on a GPU. " "If False: creates transient localstorage for data located on GPU.") number_of_gpus = SymbolicProperty( default=None, allow_none=True, desc="number of gpus to divide the map onto," " if not used, uses the amount specified" " in the dace.config in max_number_gpus.") @staticmethod def annotates_memlets(): return True @staticmethod def expressions(): return [sdutil.node_path_graph(GPUMultiTransformMap._map_entry)] @staticmethod def can_be_applied(graph: SDFGState, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]] # Check if there is more than one GPU available: if (Config.get("compiler", "cuda", "max_number_gpus") < 2): return False # Dynamic map ranges not supported if has_dynamic_map_inputs(graph, map_entry): return False # Only accept maps with a default schedule schedule_whitelist = [dtypes.ScheduleType.Default] sdict = graph.scope_dict() parent = sdict[map_entry] while parent is not None: if parent.map.schedule not in schedule_whitelist: return False parent = sdict[parent] # Library nodes inside the scope are not supported scope_subgraph = graph.scope_subgraph(map_entry) for node in scope_subgraph.nodes(): if isinstance(node, nodes.LibraryNode): return False # Custom reductions can not have an accumulate transient, as the # reduction would have to be split up for the ingoing memlet of the # accumulate transient and the outgoing memlet. Not using GPU local # accumulate transient only works for a small volume of data. 
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            if edge.data.wcr is not None and operations.detect_reduction_type(
                    edge.data.wcr) == dtypes.ReductionType.Custom:
                return False

        storage_whitelist = [
            dtypes.StorageType.Default,
            dtypes.StorageType.CPU_Pinned,
            dtypes.StorageType.CPU_Heap,
            dtypes.StorageType.GPU_Global,
        ]
        for node in graph.predecessors(map_entry):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        for node in graph.successors(map_exit):
            if not isinstance(node, nodes.AccessNode):
                return False
            if node.desc(graph).storage not in storage_whitelist:
                return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]
        return map_entry.map.label

    def apply(self, sdfg: SDFG) -> None:
        graph: SDFGState = sdfg.nodes()[self.state_id]

        inner_map_entry: nodes.MapEntry = graph.nodes()[self.subgraph[
            GPUMultiTransformMap._map_entry]]

        number_of_gpus = self.number_of_gpus
        ngpus = Config.get("compiler", "cuda", "max_number_gpus")
        if number_of_gpus is None:
            number_of_gpus = ngpus
        if number_of_gpus > ngpus:
            raise ValueError(
                'Requesting more GPUs than specified in the dace config')

        # Avoiding import loops
        from dace.transformation.dataflow import (StripMining, InLocalStorage,
                                                  OutLocalStorage,
                                                  AccumulateTransient)

        # The user is responsible for the implementation of library nodes.
        scope_subgraph = graph.scope_subgraph(inner_map_entry)
        for node in scope_subgraph.nodes():
            if isinstance(node, nodes.LibraryNode):
                warnings.warn(
                    'Node %s is a library node, make sure to manually set '
                    'the implementation to a GPU-compliant specialization.' %
                    node)

        # Tile the map into number_of_gpus tiles
        outer_map: nodes.Map = StripMining.apply_to(
            sdfg,
            dict(dim_idx=-1,
                 new_dim_prefix=self.new_dim_prefix,
                 tile_size=number_of_gpus,
                 tiling_type=dtypes.TilingType.NumberOfTiles),
            _map_entry=inner_map_entry)

        outer_map_entry: nodes.MapEntry = graph.scope_dict()[inner_map_entry]
        inner_map_exit: nodes.MapExit = graph.exit_node(inner_map_entry)
        outer_map_exit: nodes.MapExit = graph.exit_node(outer_map_entry)

        # Change map schedules
        inner_map_entry.map.schedule = dtypes.ScheduleType.GPU_Device
        outer_map.schedule = dtypes.ScheduleType.GPU_Multidevice

        symbolic_gpu_id = outer_map.params[0]

        # Add the parameter of the outer map
        for node in graph.successors(inner_map_entry):
            if isinstance(node, nodes.NestedSDFG):
                map_syms = inner_map_entry.range.free_symbols
                for sym in map_syms:
                    symname = str(sym)
                    if symname not in node.symbol_mapping.keys():
                        node.symbol_mapping[symname] = sym
                        node.sdfg.symbols[symname] = graph.symbols_defined_at(
                            node)[symname]

        # Add transient data leading to the inner map
        prefix = self.new_transient_prefix
        for node in graph.predecessors(outer_map_entry):
            # Only AccessNodes are relevant
            if (isinstance(node, nodes.AccessNode)
                    and not (self.skip_scalar
                             and isinstance(node.desc(sdfg), Scalar))):
                if self.use_p2p and node.desc(
                        sdfg).storage is dtypes.StorageType.GPU_Global:
                    continue

                in_data_node = InLocalStorage.apply_to(sdfg,
                                                       dict(array=node.data,
                                                            prefix=prefix),
                                                       verify=False,
                                                       save=False,
                                                       node_a=outer_map_entry,
                                                       node_b=inner_map_entry)
                in_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                in_data_node.desc(sdfg).storage = dtypes.StorageType.GPU_Global

        wcr_data: Dict[str, Any] = {}
        # Add transient data leading to the outer map
        for edge in graph.in_edges(outer_map_exit):
            node = graph.memlet_path(edge)[-1].dst
            if isinstance(node, nodes.AccessNode):
                data_name = node.data
                # Transients with write-conflict resolution need to be
                # collected first, as AccumulateTransient creates a nested SDFG
                if edge.data.wcr is not None:
                    dtype = sdfg.arrays[data_name].dtype
                    redtype = operations.detect_reduction_type(edge.data.wcr)
                    # Custom reductions cannot have an accumulate transient,
                    # as the accumulation from the transient to the outer
                    # storage is not defined.
                    if redtype == dtypes.ReductionType.Custom:
                        warnings.warn(
                            'Using custom reductions in a GPUMultiTransformMap '
                            'only works for small data volumes; for large '
                            'volumes there is no guarantee.')
                        continue
                    identity = dtypes.reduction_identity(dtype, redtype)
                    wcr_data[data_name] = identity
                elif (not isinstance(node.desc(sdfg), Scalar)
                      or not self.skip_scalar):
                    if self.use_p2p and node.desc(
                            sdfg).storage is dtypes.StorageType.GPU_Global:
                        continue
                    # Transients without write-conflict resolution
                    create_array = (prefix + '_' + data_name) not in sdfg.arrays
                    out_data_node = OutLocalStorage.apply_to(
                        sdfg,
                        dict(array=data_name,
                             prefix=prefix,
                             create_array=create_array),
                        verify=False,
                        save=False,
                        node_a=inner_map_exit,
                        node_b=outer_map_exit)
                    out_data_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    out_data_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global

        # Add transients for write-conflict resolution
        if len(wcr_data) != 0:
            nsdfg = AccumulateTransient.apply_to(
                sdfg,
                options=dict(array_identity_dict=wcr_data, prefix=prefix),
                map_exit=inner_map_exit,
                outer_map_exit=outer_map_exit)
            nsdfg.schedule = dtypes.ScheduleType.GPU_Multidevice
            nsdfg.location['gpu'] = symbolic_gpu_id
            for transient_node in graph.successors(nsdfg):
                if isinstance(transient_node, nodes.AccessNode):
                    transient_node.desc(sdfg).location['gpu'] = symbolic_gpu_id
                    transient_node.desc(
                        sdfg).storage = dtypes.StorageType.GPU_Global
                    nsdfg.sdfg.arrays[
                        transient_node.label].location['gpu'] = symbolic_gpu_id
                    nsdfg.sdfg.arrays[
                        transient_node.
                        label].storage = dtypes.StorageType.GPU_Global

            infer_types.set_default_schedule_storage_types_and_location(
                nsdfg.sdfg, dtypes.ScheduleType.GPU_Multidevice,
                symbolic_gpu_id)

        # Remove the parameter of the outer map from the SDFG symbols,
        # as it was added as a symbol in StripMining.
        if outer_map.params[0] in sdfg.free_symbols:
            sdfg.remove_symbol(outer_map.params[0])
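# Hedged usage sketch: distributing a top-level map over two GPUs through
# the pattern-node keyword `_map_entry` declared on the class. `apply_to`
# is the standard transformation entry point also used inside `apply`
# above; `some_map_entry` is an assumed MapEntry node of the target state.
def _example_gpu_multi(sdfg, some_map_entry):
    GPUMultiTransformMap.apply_to(sdfg,
                                  options=dict(number_of_gpus=2,
                                               use_p2p=False),
                                  _map_entry=some_map_entry)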
class DeduplicateAccess(xf.Transformation): """ This transformation takes a node that is connected to multiple destinations with overlapping memlets, and consolidates those accesses through a transient array or scalar. """ _map_entry = nodes.MapEntry(nodes.Map('_', [], [])) _node1 = nodes.Node() _node2 = nodes.Node() @staticmethod def expressions(): state = sd.SDFGState() state.add_nedge(DeduplicateAccess._map_entry, DeduplicateAccess._node1, Memlet()) state.add_nedge(DeduplicateAccess._map_entry, DeduplicateAccess._node2, Memlet()) return [state] @staticmethod def can_be_applied(graph: sd.SDFGState, candidate, expr_index, sdfg, strict=False): map_entry = graph.node(candidate[DeduplicateAccess._map_entry]) nid1 = candidate[DeduplicateAccess._node1] node1 = graph.node(nid1) nid2 = candidate[DeduplicateAccess._node2] node2 = graph.node(nid2) # Two nodes must be ordered (avoid duplicates/nondeterminism) if nid1 >= nid2: return False # Two nodes must belong to same connector edges1 = set(e.src_conn for e in graph.edges_between(map_entry, node1)) edges2 = set(e.src_conn for e in graph.edges_between(map_entry, node2)) if len(edges1 & edges2) == 0: return False # For each common connector for conn in (edges1 & edges2): # Deduplication: Only apply to first pair of edges node_ids = [ graph.node_id(e.dst) for e in graph.out_edges(map_entry) if e.src_conn == conn ] if any(nid < nid1 for nid in node_ids): return False if any(nid < nid2 for nid in node_ids if nid != nid1): return False # Matching condition: Bounding box union of subsets is smaller than # adding the subset sizes memlets: List[Memlet] = [ e.data for e in graph.out_edges(map_entry) if e.src_conn == conn ] union_subset = memlets[0].subset for memlet in memlets[1:]: union_subset = subsets.bounding_box_union( union_subset, memlet.subset) # TODO: Enhance me! # NOTE: This does not always result in correct behaviour for certain # ranges whose volume is not comparable by "<", # e.g "2*K" >? "K+1" > "K-1" >? "1" if not strict: try: if union_subset.num_elements() < sum( m.subset.num_elements() for m in memlets): return True except TypeError: pass return False @staticmethod def match_to_str(graph, candidate): return str(graph.node(candidate[DeduplicateAccess._map_entry])) def apply(self, sdfg: sd.SDFG): graph: sd.SDFGState = sdfg.nodes()[self.state_id] map_entry = graph.node(self.subgraph[DeduplicateAccess._map_entry]) node1 = graph.node(self.subgraph[DeduplicateAccess._node1]) node2 = graph.node(self.subgraph[DeduplicateAccess._node2]) # Steps: # 1. Find unique subsets # 2. Find sets of contiguous subsets # 3. Create transients for subsets # 4. 
Redirect edges through new transients
        edges1 = set(e.src_conn for e in graph.edges_between(map_entry, node1))
        edges2 = set(e.src_conn for e in graph.edges_between(map_entry, node2))

        # Only apply to first connector (determinism)
        conn = sorted(edges1 & edges2)[0]

        edges = [e for e in graph.out_edges(map_entry) if e.src_conn == conn]

        # Get original data descriptor
        dname = edges[0].data.data
        desc = sdfg.arrays[edges[0].data.data]

        # Get unique subsets
        unique_subsets = set(e.data.subset for e in edges)

        # Find largest contiguous subsets
        try:
            # Start from stride-1 dimension
            contiguous_subsets = helpers.find_contiguous_subsets(
                unique_subsets,
                dim=next(i for i, s in enumerate(desc.strides) if s == 1))
        except (StopIteration, NotImplementedError):
            warnings.warn(
                "DeduplicateAccess: not operating on the stride-1 dimension")
            contiguous_subsets = unique_subsets

        # Then find subsets for the rest of the dimensions
        contiguous_subsets = helpers.find_contiguous_subsets(
            contiguous_subsets)

        # Map original edges to subsets
        edge_mapping = defaultdict(list)
        for e in edges:
            for ind, subset in enumerate(contiguous_subsets):
                if subset.covers(e.data.subset):
                    edge_mapping[ind].append(e)
                    break
            else:
                raise ValueError(
                    "Failed to find contiguous subset for edge %s" % e.data)

        # Create transients for subsets and redirect edges
        for ind, subset in enumerate(contiguous_subsets):
            name, _ = sdfg.add_temp_transient(subset.size(), desc.dtype)
            anode = graph.add_access(name)
            graph.add_edge(map_entry, conn, anode, None,
                           Memlet(data=dname, subset=subset))
            for e in edge_mapping[ind]:
                graph.remove_edge(e)
                new_memlet = copy.deepcopy(e.data)
                new_edge = graph.add_edge(anode, None, e.dst, e.dst_conn,
                                          new_memlet)
                for pe in graph.memlet_tree(new_edge):
                    # Rename data on memlet
                    pe.data.data = name
                    # Offset memlets to match new transient
                    pe.data.subset.offset(subset, True)
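# Sketch of the matching condition in `can_be_applied` above on concrete
# subsets: the bounding-box union of two overlapping reads is smaller than
# the sum of their sizes, so routing them through one transient saves
# accesses. The subset strings are assumptions for illustration only.
def _example_dedup_condition():
    from dace import subsets as _subsets
    s1 = _subsets.Range.from_string('i:i+3')     # 3 elements
    s2 = _subsets.Range.from_string('i+1:i+4')   # 3 elements
    union = _subsets.bounding_box_union(s1, s2)  # i:i+4 -> 4 elements
    # 4 < 6, hence the transformation would match on these memlets
    return union.num_elements() < (s1.num_elements() + s2.num_elements())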
class DeduplicateAccess(pattern_matching.Transformation): """ This transformation takes a node that is connected to multiple destinations with overlapping memlets, and consolidates those accesses through a transient array or scalar. """ _map_entry = nodes.MapEntry(nodes.Map('_', [], [])) _node1 = nodes.Node() _node2 = nodes.Node() @staticmethod def expressions(): state = sd.SDFGState() state.add_nedge(DeduplicateAccess._map_entry, DeduplicateAccess._node1, Memlet()) state.add_nedge(DeduplicateAccess._map_entry, DeduplicateAccess._node2, Memlet()) return [state] @staticmethod def can_be_applied(graph: sd.SDFGState, candidate, expr_index, sdfg, strict=False): map_entry = graph.node(candidate[DeduplicateAccess._map_entry]) nid1 = candidate[DeduplicateAccess._node1] node1 = graph.node(nid1) nid2 = candidate[DeduplicateAccess._node2] node2 = graph.node(nid2) # Two nodes must be ordered (avoid duplicates/nondeterminism) if nid1 >= nid2: return False # Two nodes must belong to same connector edges1 = set(e.src_conn for e in graph.edges_between(map_entry, node1)) edges2 = set(e.src_conn for e in graph.edges_between(map_entry, node2)) if len(edges1 & edges2) == 0: return False # For each common connector for conn in (edges1 & edges2): # Deduplication: Only apply to first pair of edges node_ids = [ graph.node_id(e.dst) for e in graph.out_edges(map_entry) if e.src_conn == conn ] if any(nid < nid1 for nid in node_ids): return False if any(nid < nid2 for nid in node_ids if nid != nid1): return False # Matching condition: Bounding box union of subsets is smaller than # adding the subset sizes memlets: List[Memlet] = [ e.data for e in graph.out_edges(map_entry) if e.src_conn == conn ] union_subset = memlets[0].subset for memlet in memlets[1:]: union_subset = subsets.bounding_box_union( union_subset, memlet.subset) if union_subset.num_elements() < sum(m.subset.num_elements() for m in memlets): return True return False @staticmethod def match_to_str(graph, candidate): return str(graph.node(candidate[DeduplicateAccess._map_entry])) @staticmethod def are_subsets_contiguous(subset_a: subsets.Subset, subset_b: subsets.Subset, dim: int = None) -> bool: if dim is not None: # A version that only checks for contiguity in certain # dimension (e.g., to prioritize stride-1 range) if (not isinstance(subset_a, subsets.Range) or not isinstance(subset_b, subsets.Range)): raise NotImplementedError('Contiguous subset check only ' 'implemented for ranges') # Other dimensions must be equal for i, (s1, s2) in enumerate(zip(subset_a.ranges, subset_b.ranges)): if i == dim: continue if s1[0] != s2[0] or s1[1] != s2[1] or s1[2] != s2[2]: return False # Set of conditions for contiguous dimension ab = (subset_a[dim][1] + 1) == subset_b[dim][0] a_overlap_b = subset_a[dim][1] >= subset_b[dim][0] ba = (subset_b[dim][1] + 1) == subset_a[dim][0] b_overlap_a = subset_b[dim][1] >= subset_a[dim][0] # NOTE: Must check with "==" due to sympy using special types return (ab == True or a_overlap_b == True or ba == True or b_overlap_a == True) # General case bbunion = subsets.bounding_box_union(subset_a, subset_b) return bbunion.num_elements() == (subset_a.num_elements() + subset_b.num_elements()) @staticmethod def find_contiguous_subsets(subset_list: List[subsets.Subset], dim: int = None) -> Set[subsets.Subset]: """ Finds the set of largest contiguous subsets in a list of subsets. :param subsets: Iterable of subset objects. :param dim: Check for contiguity only for the specified dimension. :return: A list of contiguous subsets. 
""" # Currently O(n^3) worst case. TODO: improve subset_set = set( subsets.Range.from_indices(s) if isinstance(s, subsets.Indices ) else s for s in subset_list) while True: for sa, sb in itertools.product(subset_set, subset_set): if sa is sb: continue if sa.covers(sb): subset_set.remove(sb) break elif sb.covers(sa): subset_set.remove(sa) break elif DeduplicateAccess.are_subsets_contiguous(sa, sb, dim): subset_set.remove(sa) subset_set.remove(sb) subset_set.add(subsets.bounding_box_union(sa, sb)) break else: # No modification performed break return subset_set def apply(self, sdfg: sd.SDFG): graph: sd.SDFGState = sdfg.nodes()[self.state_id] map_entry = graph.node(self.subgraph[DeduplicateAccess._map_entry]) node1 = graph.node(self.subgraph[DeduplicateAccess._node1]) node2 = graph.node(self.subgraph[DeduplicateAccess._node2]) # Steps: # 1. Find unique subsets # 2. Find sets of contiguous subsets # 3. Create transients for subsets # 4. Redirect edges through new transients edges1 = set(e.src_conn for e in graph.edges_between(map_entry, node1)) edges2 = set(e.src_conn for e in graph.edges_between(map_entry, node2)) # Only apply to first connector (determinism) conn = sorted(edges1 & edges2)[0] edges = [e for e in graph.out_edges(map_entry) if e.src_conn == conn] # Get original data descriptor dname = edges[0].data.data desc = sdfg.arrays[edges[0].data.data] # Get unique subsets unique_subsets = set(e.data.subset for e in edges) # Find largest contiguous subsets try: # Start from stride-1 dimension contiguous_subsets = self.find_contiguous_subsets( unique_subsets, dim=next(i for i, s in enumerate(desc.strides) if s == 1)) except (StopIteration, NotImplementedError): contiguous_subsets = unique_subsets # Then find subsets for rest of the dimensions contiguous_subsets = self.find_contiguous_subsets(contiguous_subsets) # Map original edges to subsets edge_mapping = defaultdict(list) for e in edges: for ind, subset in enumerate(contiguous_subsets): if subset.covers(e.data.subset): edge_mapping[ind].append(e) break else: raise ValueError( "Failed to find contiguous subset for edge %s" % e.data) # Create transients for subsets and redirect edges for ind, subset in enumerate(contiguous_subsets): name, _ = sdfg.add_temp_transient(subset.size(), desc.dtype) anode = graph.add_access(name) graph.add_edge(map_entry, conn, anode, None, Memlet(data=dname, subset=subset)) for e in edge_mapping[ind]: graph.remove_edge(e) new_memlet = copy.deepcopy(e.data) new_edge = graph.add_edge(anode, None, e.dst, e.dst_conn, new_memlet) for pe in graph.memlet_tree(new_edge): # Rename data on memlet pe.data.data = name # Offset memlets to match new transient pe.data.subset.offset(subset, True)
class DoubleBuffering(transformation.Transformation): """ Implements the double buffering pattern, which pipelines reading and processing data by creating a second copy of the memory. In particular, the transformation takes a 1D map and all internal (directly connected) transients, adds an additional dimension of size 2, and turns the map into a for loop that processes and reads the data in a double-buffered manner. Other memlets will not be transformed. """ _map_entry = nodes.MapEntry(nodes.Map('_', [], [])) _transient = nodes.AccessNode('_') @staticmethod def expressions(): return [ sdutil.node_path_graph(DoubleBuffering._map_entry, DoubleBuffering._transient) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[DoubleBuffering._map_entry]] transient = graph.nodes()[candidate[DoubleBuffering._transient]] # Only one dimensional maps are allowed if len(map_entry.map.params) != 1: return False # Verify the map can be transformed to a for-loop if not MapToForLoop.can_be_applied( graph, {MapToForLoop._map_entry: candidate[DoubleBuffering._map_entry]}, expr_index, sdfg, strict): return False # Verify that all directly-connected internal access nodes point to # transient arrays first = True for edge in graph.out_edges(map_entry): if isinstance(edge.dst, nodes.AccessNode): desc = sdfg.arrays[edge.dst.data] if not isinstance(desc, data.Array) or not desc.transient: return False else: # To avoid duplicate matches, only match the first transient if first and edge.dst != transient: return False first = False return True @staticmethod def match_to_str(graph, candidate): return str(graph.node(candidate[DoubleBuffering._map_entry])) def apply(self, sdfg: sd.SDFG): graph: sd.SDFGState = sdfg.nodes()[self.state_id] map_entry = graph.node(self.subgraph[DoubleBuffering._map_entry]) map_param = map_entry.map.params[0] # Assuming one dimensional ############################## # Change condition of loop to one fewer iteration (so that the # final one reads from the last buffer) map_rstart, map_rend, map_rstride = map_entry.map.range[0] map_rend = symbolic.pystr_to_symbolic('(%s) - (%s)' % (map_rend, map_rstride)) map_entry.map.range = subsets.Range([(map_rstart, map_rend, map_rstride)]) ############################## # Gather transients to modify transients_to_modify = set(edge.dst.data for edge in graph.out_edges(map_entry) if isinstance(edge.dst, nodes.AccessNode)) # Add dimension to transients and modify memlets for transient in transients_to_modify: desc: data.Array = sdfg.arrays[transient] # Using non-python syntax to ensure properties change desc.strides = [desc.total_size] + list(desc.strides) desc.shape = [2] + list(desc.shape) desc.offset = [0] + list(desc.offset) desc.total_size = desc.total_size * 2 ############################## # Modify memlets to use map parameter as buffer index modified_subsets = [] # Store modified memlets for final state for edge in graph.scope_subgraph(map_entry).edges(): if edge.data.data in transients_to_modify: edge.data.subset = self._modify_memlet(sdfg, edge.data.subset, edge.data.data) modified_subsets.append(edge.data.subset) else: # Could be other_subset path = graph.memlet_path(edge) src_node = path[0].src dst_node = path[-1].dst # other_subset could be None. 
In that case, recreate from array dataname = None if (isinstance(src_node, nodes.AccessNode) and src_node.data in transients_to_modify): dataname = src_node.data elif (isinstance(dst_node, nodes.AccessNode) and dst_node.data in transients_to_modify): dataname = dst_node.data if dataname is not None: subset = (edge.data.other_subset or subsets.Range.from_array(sdfg.arrays[dataname])) edge.data.other_subset = self._modify_memlet( sdfg, subset, dataname) modified_subsets.append(edge.data.other_subset) ############################## # Turn map into for loop map_to_for = MapToForLoop(self.sdfg_id, self.state_id, { MapToForLoop._map_entry: self.subgraph[DoubleBuffering._map_entry] }, self.expr_index) nsdfg_node, nstate = map_to_for.apply(sdfg) ############################## # Gather node copies and remove memlets edges_to_replace = [] for node in nstate.source_nodes(): for edge in nstate.out_edges(node): if (isinstance(edge.dst, nodes.AccessNode) and edge.dst.data in transients_to_modify): edges_to_replace.append(edge) nstate.remove_edge(edge) if nstate.out_degree(node) == 0: nstate.remove_node(node) ############################## # Add initial reads to initial nested state initial_state: sd.SDFGState = nsdfg_node.sdfg.start_state initial_state.set_label('%s_init' % map_entry.map.label) for edge in edges_to_replace: initial_state.add_node(edge.src) rnode = edge.src wnode = initial_state.add_write(edge.dst.data) initial_state.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn, copy.deepcopy(edge.data)) # All instances of the map parameter in this state become the loop start sd.replace(initial_state, map_param, map_rstart) # Initial writes go to the first buffer sd.replace(initial_state, '__dace_db_param', 0) ############################## # Modify main state's memlets # Divide by loop stride new_expr = symbolic.pystr_to_symbolic('(%s / %s) %% 2' % (map_param, map_rstride)) sd.replace(nstate, '__dace_db_param', new_expr) ############################## # Add the main state's contents to the last state, modifying # memlets appropriately. 
final_state: sd.SDFGState = nsdfg_node.sdfg.sink_nodes()[0] final_state.set_label('%s_final_computation' % map_entry.map.label) dup_nstate = copy.deepcopy(nstate) final_state.add_nodes_from(dup_nstate.nodes()) for e in dup_nstate.edges(): final_state.add_edge(e.src, e.src_conn, e.dst, e.dst_conn, e.data) # If there is a WCR output with transient, only output in last state nstate: sd.SDFGState for node in nstate.sink_nodes(): for e in list(nstate.in_edges(node)): if e.data.wcr is not None: path = nstate.memlet_path(e) if isinstance(path[0].src, nodes.AccessNode): nstate.remove_memlet_path(e) ############################## # Add reads into next buffers to main state for edge in edges_to_replace: rnode = copy.deepcopy(edge.src) nstate.add_node(rnode) wnode = nstate.add_write(edge.dst.data) new_memlet = copy.deepcopy(edge.data) if new_memlet.data in transients_to_modify: new_memlet.other_subset = self._replace_in_subset( new_memlet.other_subset, map_param, '(%s + %s)' % (map_param, map_rstride)) else: new_memlet.subset = self._replace_in_subset( new_memlet.subset, map_param, '(%s + %s)' % (map_param, map_rstride)) nstate.add_edge(rnode, edge.src_conn, wnode, edge.dst_conn, new_memlet) nstate.set_label('%s_double_buffered' % map_entry.map.label) # Divide by loop stride new_expr = symbolic.pystr_to_symbolic('((%s / %s) + 1) %% 2' % (map_param, map_rstride)) sd.replace(nstate, '__dace_db_param', new_expr) # Remove symbol once done del nsdfg_node.sdfg.symbols['__dace_db_param'] del nsdfg_node.symbol_mapping['__dace_db_param'] return nsdfg_node @staticmethod def _modify_memlet(sdfg, subset, data_name): desc = sdfg.arrays[data_name] if len(subset) == len(desc.shape): # Already in the right shape, modify new dimension subset = list(subset)[1:] new_subset = subsets.Range([('__dace_db_param', '__dace_db_param', 1)] + list(subset)) return new_subset @staticmethod def _replace_in_subset(subset, string_or_symbol, new_string_or_symbol): new_subset = copy.deepcopy(subset) repldict = { symbolic.pystr_to_symbolic(string_or_symbol): symbolic.pystr_to_symbolic(new_string_or_symbol) } for i, dim in enumerate(new_subset): try: new_subset[i] = tuple(d.subs(repldict) for d in dim) except TypeError: new_subset[i] = (dim.subs(repldict) if symbolic.issymbolic(dim) else dim) return new_subset
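# Sketch of the buffer-index arithmetic installed above through
# '__dace_db_param': in iteration i of the resulting for-loop, computation
# reads buffer (i / stride) % 2 while the prefetch for the next iteration
# writes ((i / stride) + 1) % 2. The loop bounds below are illustrative.
def _example_double_buffer_indices(start=0, end=128, stride=32):
    for i in range(start, end, stride):
        compute_buffer = (i // stride) % 2         # buffer being processed
        prefetch_buffer = ((i // stride) + 1) % 2  # buffer being filled
        assert compute_buffer != prefetch_buffer   # the two always alternate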
    def fuse(self, sdfg, graph, map_entries, do_not_override=None, **kwargs):
        """ Takes the specified map entries and tries to fuse the
            corresponding maps. All maps have to be expanded into an outer
            and an inner map (use MapExpansion as a pre-pass).

            Arrays that do not exist outside the subgraph get pushed
            into the map and their data dimensions get cropped.
            Otherwise the original array is used.

            For every output, the respective connections are created
            automatically.

            :param sdfg: SDFG
            :param graph: State
            :param map_entries: Map Entries (class MapEntry) of the outer maps
                                which we want to fuse
            :param do_not_override: List of data names whose corresponding nodes
                                    are fully contained within the subgraph
                                    but should nevertheless not be
                                    augmented/transformed.
        """
        # avoid a mutable default argument
        do_not_override = do_not_override or []

        # if there are no maps, return immediately
        if len(map_entries) == 0:
            return

        # get maps and map exits
        maps = [map_entry.map for map_entry in map_entries]
        map_exits = [graph.exit_node(map_entry) for map_entry in map_entries]

        # re-construct the map subgraph if necessary
        try:
            self.subgraph
        except AttributeError:
            subgraph_nodes = set()
            scope_dict = graph.scope_dict(node_to_children=True)
            for node in chain(map_entries, map_exits):
                subgraph_nodes.add(node)
                # add all border arrays
                for e in chain(graph.in_edges(node), graph.out_edges(node)):
                    subgraph_nodes.add(e.src)
                    subgraph_nodes.add(e.dst)
                try:
                    subgraph_nodes |= set(scope_dict[node])
                except KeyError:
                    pass
            self.subgraph = SubgraphView(graph, subgraph_nodes)

        # Nodes from which data flows into one or several maps, but into
        # which no data flows from any map
        in_nodes = set()
        # Nodes into which data flows, but from which no data flows into
        # any map
        out_nodes = set()
        # Nodes that act as intermediate nodes - data flows from a map into
        # them and there is an outgoing path into another map
        intermediate_nodes = set()

        ### NOTE:
        # - in_nodes, out_nodes, intermediate_nodes refer to the configuration
        #   of the final fused map
        # - in_nodes and out_nodes are trivially disjoint
        # - intermediate_nodes and out_nodes are not necessarily disjoint
        # - intermediate_nodes and in_nodes are disjoint by design.
        #   There could be a node that has both incoming edges from a map exit
        #   and from outside, but it is just treated as an intermediate node
        #   and handled automatically.
        for map_entry, map_exit in zip(map_entries, map_exits):
            for edge in graph.in_edges(map_entry):
                in_nodes.add(edge.src)
            for edge in graph.out_edges(map_exit):
                current_node = edge.dst
                if len(graph.out_edges(current_node)) == 0:
                    out_nodes.add(current_node)
                else:
                    for dst_edge in graph.out_edges(current_node):
                        if dst_edge.dst in map_entries:
                            # add to intermediate_nodes
                            intermediate_nodes.add(current_node)
                        else:
                            # add to out_nodes
                            out_nodes.add(current_node)
                    for e in graph.in_edges(current_node):
                        if e.src not in map_exits:
                            raise NotImplementedError(
                                "Nodes between two maps to be fused with "
                                "*incoming* edges from outside the maps are "
                                "not allowed yet.")

        # any intermediate_nodes currently in in_nodes shouldn't be there
        in_nodes -= intermediate_nodes

        if self.debug:
            print("SubgraphFusion::In_nodes", in_nodes)
            print("SubgraphFusion::Out_nodes", out_nodes)
            print("SubgraphFusion::Intermediate_nodes", intermediate_nodes)

        # all maps are assumed to have the same params and range, in order
        global_map = nodes.Map(label="outer_fused",
                               params=maps[0].params,
                               ndrange=maps[0].range)
        global_map_entry = nodes.MapEntry(global_map)
        global_map_exit = nodes.MapExit(global_map)

        schedule = map_entries[0].schedule
        global_map_entry.schedule = schedule
        graph.add_node(global_map_entry)
        graph.add_node(global_map_exit)

        # next up, for any intermediate node, find whether it only appears
        # in the subgraph or also somewhere else / as an input.
        # create new transients for nodes that are in out_nodes and
        # intermediate_nodes simultaneously.
        # also check which dimensions of each transient data element correspond
        # to map axes and write this information into a dict.
        node_info = self.prepare_intermediate_nodes(sdfg, graph, in_nodes,
                                                    out_nodes,
                                                    intermediate_nodes,
                                                    map_entries, map_exits,
                                                    do_not_override)

        (subgraph_contains_data, transients_created,
         invariant_dimensions) = node_info
        if self.debug:
            print("SubgraphFusion:: {Intermediate_node: subgraph_contains_data}"
                  " dict")
            print(subgraph_contains_data)

        inconnectors_dict = {}
        # Dict for saving incoming nodes and their assigned connectors
        # Format: {access_node: (edge, in_conn, out_conn)}

        for map_entry, map_exit in zip(map_entries, map_exits):
            # handle inputs
            # TODO: dynamic map range -- this is fairly unrealistic in such a
            # setting
            for edge in graph.in_edges(map_entry):
                src = edge.src
                mmt = graph.memlet_tree(edge)
                out_edges = [child.edge for child in mmt.root().children]

                if src in in_nodes:
                    in_conn = None
                    out_conn = None
                    if src in inconnectors_dict:
                        # no need to augment the subset of the outer edge;
                        # this is done at the end in one pass.
                        in_conn = inconnectors_dict[src][1]
                        out_conn = inconnectors_dict[src][2]
                        graph.remove_edge(edge)
                    else:
                        next_conn = global_map_entry.next_connector()
                        in_conn = 'IN_' + next_conn
                        out_conn = 'OUT_' + next_conn
                        global_map_entry.add_in_connector(in_conn)
                        global_map_entry.add_out_connector(out_conn)

                        inconnectors_dict[src] = (edge, in_conn, out_conn)

                        # reroute in edge via global_map_entry
                        self.redirect_edge(graph, edge,
                                           new_dst=global_map_entry,
                                           new_dst_conn=in_conn)

                    # map out edges to the new map
                    for out_edge in out_edges:
                        self.redirect_edge(graph, out_edge,
                                           new_src=global_map_entry,
                                           new_src_conn=out_conn)

                else:
                    # connect directly
                    for out_edge in out_edges:
                        mm = dcpy(out_edge.data)
                        self.redirect_edge(graph, out_edge, new_src=src,
                                           new_data=mm)

                    graph.remove_edge(edge)

            for edge in graph.out_edges(map_entry):
                # special case: for nodes that have no data connections
                if not edge.src_conn:
                    self.redirect_edge(graph, edge, new_src=global_map_entry)

            ######################################

            for edge in graph.in_edges(map_exit):
                if not edge.dst_conn:
                    # no destination connector, path ends here.
                    self.redirect_edge(graph, edge, new_dst=global_map_exit)
                    continue
                # find corresponding out_edges for the current edge;
                # cannot use mmt anymore
                out_edges = [
                    oedge for oedge in graph.out_edges(map_exit)
                    if oedge.src_conn[3:] == edge.dst_conn[2:]
                ]

                # Tuple to store the in/out connector port that might be created
                port_created = None

                for out_edge in out_edges:
                    dst = out_edge.dst

                    if dst in intermediate_nodes & out_nodes:
                        # create a connection through the global map from
                        # dst to the dst_transient that was created
                        dst_transient = transients_created[dst]
                        next_conn = global_map_exit.next_connector()
                        in_conn = 'IN_' + next_conn
                        out_conn = 'OUT_' + next_conn
                        global_map_exit.add_in_connector(in_conn)
                        global_map_exit.add_out_connector(out_conn)

                        inner_memlet = dcpy(edge.data)
                        inner_memlet.other_subset = dcpy(edge.data.subset)

                        e_inner = graph.add_edge(dst, None, global_map_exit,
                                                 in_conn, inner_memlet)
                        mm_outer = propagate_memlet(graph, inner_memlet,
                                                    global_map_entry,
                                                    union_inner_edges=False)

                        e_outer = graph.add_edge(global_map_exit, out_conn,
                                                 dst_transient, None, mm_outer)

                        # remove the edge from dst to dst_transient that was
                        # created in intermediate preparation.
                        removed = False
                        for e in graph.out_edges(dst):
                            if e.dst == dst_transient:
                                graph.remove_edge(e)
                                removed = True
                                break

                        if self.debug:
                            assert removed

                    # handle separately: intermediate_nodes and pure out nodes
                    # case 1: intermediate_nodes: can just redirect the edge
                    if dst in intermediate_nodes:
                        self.redirect_edge(graph, out_edge,
                                           new_src=edge.src,
                                           new_src_conn=edge.src_conn,
                                           new_data=dcpy(edge.data))

                    # case 2: pure out node: connect to the outer array node
                    if dst in (out_nodes - intermediate_nodes):
                        if edge.dst != global_map_exit:
                            next_conn = global_map_exit.next_connector()
                            in_conn = 'IN_' + next_conn
                            out_conn = 'OUT_' + next_conn
                            global_map_exit.add_in_connector(in_conn)
                            global_map_exit.add_out_connector(out_conn)
                            self.redirect_edge(graph, edge,
                                               new_dst=global_map_exit,
                                               new_dst_conn=in_conn)
                            port_created = (in_conn, out_conn)
                        else:
                            # reuse the connector pair created earlier
                            in_conn, out_conn = port_created

                        # connect the global map exit to the pure out node
                        graph.add_edge(global_map_exit, out_conn, dst, None,
                                       dcpy(out_edge.data))
                        graph.remove_edge(out_edge)

                # remove the edge if it has not been used by any pure out node
                if not port_created:
                    graph.remove_edge(edge)

            # maps are now ready to be discarded
            graph.remove_node(map_entry)
            graph.remove_node(map_exit)

        # end main loop.
        # create a mapping from data arrays to offsets for later
        # memlet adjustments
        min_offsets = dict()

        # do one pass to augment all transient arrays
        data_intermediate = set(node.data for node in intermediate_nodes)
        for data_name in data_intermediate:
            if subgraph_contains_data[data_name]:
                all_nodes = [
                    n for n in intermediate_nodes if n.data == data_name
                ]
                in_edges = list(chain(*(graph.in_edges(n) for n in all_nodes)))

                in_edges_iter = iter(in_edges)
                in_edge = next(in_edges_iter)
                target_subset = dcpy(in_edge.data.subset)
                target_subset.pop(invariant_dimensions[data_name])
                ######
                while True:
                    try:  # executed if there are multiple in_edges
                        in_edge = next(in_edges_iter)
                        target_subset_curr = dcpy(in_edge.data.subset)
                        target_subset_curr.pop(invariant_dimensions[data_name])
                        target_subset = subsets.union(target_subset,
                                                      target_subset_curr)
                    except StopIteration:
                        break

                min_offsets_cropped = target_subset.min_element_approx()
                # calculate the new transient array size
                target_subset.offset(min_offsets_cropped, True)

                # re-add invariant dimensions with offset 0 and save to
                # min_offsets
                min_offset = []
                index = 0
                for i in range(len(sdfg.data(data_name).shape)):
                    if i in invariant_dimensions[data_name]:
                        min_offset.append(0)
                    else:
                        min_offset.append(min_offsets_cropped[index])
                        index += 1

                min_offsets[data_name] = min_offset

                # determine the shape of the new array
                new_data_shape = []
                index = 0
                for i, sz in enumerate(sdfg.data(data_name).shape):
                    if i in invariant_dimensions[data_name]:
                        new_data_shape.append(sz)
                    else:
                        new_data_shape.append(target_subset.size()[index])
                        index += 1

                new_data_strides = [
                    data._prod(new_data_shape[i + 1:])
                    for i in range(len(new_data_shape))
                ]

                new_data_totalsize = data._prod(new_data_shape)
                new_data_offset = [0] * len(new_data_shape)

                # augment the transient data descriptor
                transient_to_transform = sdfg.data(data_name)
                transient_to_transform.shape = new_data_shape
                transient_to_transform.strides = new_data_strides
                transient_to_transform.total_size = new_data_totalsize
                transient_to_transform.offset = new_data_offset
                transient_to_transform.lifetime = dtypes.AllocationLifetime.Scope
                transient_to_transform.storage = self.transient_allocation

            else:
                # don't modify the data container - the array is needed
                # outside of the subgraph.
                # hack: set lifetime to State if the allocation has only been
                # Scope so far, to avoid allocation issues
                if sdfg.data(
                        data_name).lifetime == dtypes.AllocationLifetime.Scope:
                    sdfg.data(
                        data_name).lifetime = dtypes.AllocationLifetime.State

        # do one pass to adjust the memlets of in-between transients
        for node in intermediate_nodes:
            # all incoming edges to node
            in_edges = graph.in_edges(node)
            # outgoing edges going to another fused part
            inter_edges = []
            # outgoing edges that exit the global map
            out_edges = []
            for e in graph.out_edges(node):
                if e.dst == global_map_exit:
                    out_edges.append(e)
                else:
                    inter_edges.append(e)

            # offset memlets where necessary
            if subgraph_contains_data[node.data]:
                # get min_offset
                min_offset = min_offsets[node.data]
                # re-add invariant dimensions with offset 0
                for iedge in in_edges:
                    for edge in graph.memlet_tree(iedge):
                        if edge.data.data == node.data:
                            edge.data.subset.offset(min_offset, True)
                        elif edge.data.other_subset:
                            edge.data.other_subset.offset(min_offset, True)
                for cedge in inter_edges:
                    for edge in graph.memlet_tree(cedge):
                        if edge.data.data == node.data:
                            edge.data.subset.offset(min_offset, True)
                        elif edge.data.other_subset:
                            edge.data.other_subset.offset(min_offset, True)

                # if in_edges has several entries:
                # put other_subset into out_edges for correctness
                if len(in_edges) > 1:
                    for oedge in out_edges:
                        oedge.data.other_subset = dcpy(oedge.data.subset)
                        oedge.data.other_subset.offset(min_offset, True)

            # also correct the memlets of the created transient
            if node in transients_created:
                transient_in_edges = graph.in_edges(transients_created[node])
                transient_out_edges = graph.out_edges(transients_created[node])
                for edge in chain(transient_in_edges, transient_out_edges):
                    for e in graph.memlet_tree(edge):
                        if e.data.data == node.data:
                            e.data.data += '_OUT'

        # do one last pass to correct outside memlets adjacent to the global map
        for out_connector in global_map_entry.out_connectors:
            # find the corresponding in_connector ('OUT_x' -> 'IN_x')
            # and the in-connecting edge
            in_connector = 'IN' + out_connector[3:]
            for iedge in graph.in_edges(global_map_entry):
                if iedge.dst_conn == in_connector:
                    in_edge = iedge

            # find all out-connecting edges that belong to the out_connector
            # and count them
            oedge_counter = 0
            for oedge in graph.out_edges(global_map_entry):
                if oedge.src_conn == out_connector:
                    out_edge = oedge
                    oedge_counter += 1

            # do memlet propagation if there are several out edges;
            # otherwise there is no need
            if oedge_counter > 1:
                memlet_out = propagate_memlet(dfg_state=graph,
                                              memlet=out_edge.data,
                                              scope_node=global_map_entry,
                                              union_inner_edges=True)
                # override the number of accesses
                in_edge.data.volume = memlet_out.volume
                in_edge.data.subset = memlet_out.subset

        # create a hook for outside access to global_map
        self._global_map_entry = global_map_entry
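# Sketch of the transient cropping performed in the augmentation pass above,
# on concrete subsets (the subset strings are assumptions for illustration):
# two writes covering rows i and i+1 of a 10x10 array union to a 2x10
# region, and offsetting by the minimum element yields the shrunken shape.
def _example_crop_transient():
    from dace import subsets as _subsets
    a = _subsets.Range.from_string('i, 0:10')
    b = _subsets.Range.from_string('i + 1, 0:10')
    union = _subsets.union(a, b)        # i:i+2, 0:10
    mins = union.min_element_approx()   # [i, 0]
    union.offset(mins, True)            # negative offset -> 0:2, 0:10
    return union.size()                 # [2, 10]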
class MapInterchange(transformation.Transformation): """ Implements the map-interchange transformation. Map-interchange takes two nested maps and interchanges their position. """ _outer_map_entry = nodes.MapEntry(nodes.Map("", [], [])) _inner_map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [ sdutil.node_path_graph(MapInterchange._outer_map_entry, MapInterchange._inner_map_entry) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): # TODO: Assuming that the subsets on the edges between the two map # entries/exits are the union of separate inner subsets, is it possible # that inverting these edges breaks the continuity of union? What about # the opposite? # Check the edges between the entries of the two maps. outer_map_entry = graph.nodes()[candidate[ MapInterchange._outer_map_entry]] inner_map_entry = graph.nodes()[candidate[ MapInterchange._inner_map_entry]] # Check that inner map range is independent of outer range map_deps = set() for s in inner_map_entry.map.range: map_deps |= set(map(str, symlist(s))) if any(dep in outer_map_entry.map.params for dep in map_deps): return False # Check that the destination of all the outgoing edges # from the outer map's entry is the inner map's entry. for e in graph.out_edges(outer_map_entry): if e.dst != inner_map_entry: return False # Check that the source of all the incoming edges # to the inner map's entry is the outer map's entry. for e in graph.in_edges(inner_map_entry): if e.src != outer_map_entry: return False # Check that dynamic input range memlets are independent of # first map range if e.dst_conn and not e.dst_conn.startswith('IN_'): memlet_deps = set() for s in e.data.subset: memlet_deps |= set(map(str, symlist(s))) if any(dep in outer_map_entry.map.params for dep in memlet_deps): return False # Check the edges between the exits of the two maps. inner_map_exit = graph.exit_node(inner_map_entry) outer_map_exit = graph.exit_node(outer_map_entry) # Check that the destination of all the outgoing edges # from the inner map's exit is the outer map's exit. for e in graph.out_edges(inner_map_exit): if e.dst != outer_map_exit: return False # Check that the source of all the incoming edges # to the outer map's exit is the inner map's exit. for e in graph.in_edges(outer_map_exit): if e.src != inner_map_exit: return False return True @staticmethod def match_to_str(graph, candidate): outer_map_entry = graph.nodes()[candidate[ MapInterchange._outer_map_entry]] inner_map_entry = graph.nodes()[candidate[ MapInterchange._inner_map_entry]] return ' -> '.join(entry.map.label + ': ' + str(entry.map.params) for entry in [outer_map_entry, inner_map_entry]) def apply(self, sdfg: SDFG): # Extract the parameters and ranges of the inner/outer maps. 
graph: SDFGState = sdfg.nodes()[self.state_id] outer_map_entry = graph.nodes()[self.subgraph[ MapInterchange._outer_map_entry]] inner_map_entry = graph.nodes()[self.subgraph[ MapInterchange._inner_map_entry]] inner_map_exit = graph.exit_node(inner_map_entry) outer_map_exit = graph.exit_node(outer_map_entry) # Switch connectors outer_map_entry.in_connectors, inner_map_entry.in_connectors = \ inner_map_entry.in_connectors, outer_map_entry.in_connectors outer_map_entry.out_connectors, inner_map_entry.out_connectors = \ inner_map_entry.out_connectors, outer_map_entry.out_connectors outer_map_exit.in_connectors, inner_map_exit.in_connectors = \ inner_map_exit.in_connectors, outer_map_exit.in_connectors outer_map_exit.out_connectors, inner_map_exit.out_connectors = \ inner_map_exit.out_connectors, outer_map_exit.out_connectors # Get edges between the map entries and exits. entry_edges = graph.edges_between(outer_map_entry, inner_map_entry) exit_edges = graph.edges_between(inner_map_exit, outer_map_exit) for e in entry_edges + exit_edges: graph.remove_edge(e) # Change source and destination of edges. sdutil.change_edge_dest(graph, outer_map_entry, inner_map_entry) sdutil.change_edge_src(graph, inner_map_entry, outer_map_entry) sdutil.change_edge_dest(graph, inner_map_exit, outer_map_exit) sdutil.change_edge_src(graph, outer_map_exit, inner_map_exit) # Add edges between the map entries and exits. new_entry_edges = [] new_exit_edges = [] for e in entry_edges: new_entry_edges.append( graph.add_edge(e.dst, e.src_conn, e.src, e.dst_conn, e.data)) for e in exit_edges: new_exit_edges.append( graph.add_edge(e.dst, e.src_conn, e.src, e.dst_conn, e.data)) # Repropagate memlets in modified region for e in new_entry_edges: path = graph.memlet_path(e) index = next(i for i, edge in enumerate(path) if e is edge) e.data.subset = propagate_memlet(graph, path[index + 1].data, outer_map_entry, True).subset for e in new_exit_edges: path = graph.memlet_path(e) index = next(i for i, edge in enumerate(path) if e is edge) e.data.subset = propagate_memlet(graph, path[index - 1].data, outer_map_exit, True).subset @staticmethod def annotates_memlets(): return True
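# Hedged usage sketch: interchanging two perfectly nested maps through the
# pattern-node keywords declared on the class; `outer_entry` and
# `inner_entry` are assumed to be the matched MapEntry nodes of one state.
def _example_interchange(sdfg, outer_entry, inner_entry):
    MapInterchange.apply_to(sdfg,
                            _outer_map_entry=outer_entry,
                            _inner_map_entry=inner_entry)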
class Vectorization(transformation.Transformation): """ Implements the vectorization transformation. Vectorization matches when all the input and output memlets of a tasklet inside a map access the inner-most loop variable in their last dimension. The transformation changes the step of the inner-most loop to be equal to the length of the vector and vectorizes the memlets. """ vector_len = Property(desc="Vector length", dtype=int, default=4) propagate_parent = Property(desc="Propagate vector length through " "parent SDFGs", dtype=bool, default=False) strided_map = Property(desc="Use strided map range (jump by vector length)" " instead of modifying memlets", dtype=bool, default=True) preamble = Property( dtype=bool, default=None, allow_none=True, desc='Force creation or skipping a preamble map without vectors') postamble = Property( dtype=bool, default=None, allow_none=True, desc='Force creation or skipping a postamble map without vectors') _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [ sdutil.node_path_graph(Vectorization._map_entry) ] def can_be_applied(self, graph: SDFGState, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[Vectorization._map_entry]] # Only accept scopes that have one internal tasklet scope = graph.scope_subgraph(map_entry, False, False) if len(scope.nodes()) != 1: return False tasklet = scope.nodes()[0] if not isinstance(tasklet, nodes.Tasklet): return False param = symbolic.pystr_to_symbolic(map_entry.map.params[-1]) found = False # Strided maps cannot be vectorized if map_entry.map.range[-1][2] != 1 and self.strided_map: return False # Check if all edges, adjacent to the tasklet, # use the parameter in their contiguous dimension. for e, conntype in graph.all_edges_and_connectors(tasklet): # Cases that do not matter for vectorization if e.data.data is None: # Empty memlets continue if isinstance(sdfg.arrays[e.data.data], data.Stream): # Streams continue # Vectorization can not be applied in WCR # if e.data.wcr is not None: # return False subset = e.data.subset array = sdfg.arrays[e.data.data] # If already vectorized or a pointer, do not apply if isinstance(conntype, (dtypes.vector, dtypes.pointer)): return False try: for idx, expr in enumerate(subset): if isinstance(expr, tuple): for ex in expr: ex = symbolic.pystr_to_symbolic(ex) symbols = ex.free_symbols if param in symbols: if array.strides[idx] == 1: found = True else: return False else: expr = symbolic.pystr_to_symbolic(expr) symbols = expr.free_symbols if param in symbols: if array.strides[idx] == 1: found = True else: return False except TypeError: # cannot determine truth value of Relational return False return found @staticmethod def match_to_str(graph, candidate): map_entry = candidate[Vectorization._map_entry] return str(map_entry) def apply(self, sdfg: SDFG): graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]] tasklet: nodes.Tasklet = graph.successors(map_entry)[0] param = symbolic.pystr_to_symbolic(map_entry.map.params[-1]) # Create new vector size. 
vector_size = self.vector_len dim_from, dim_to, dim_skip = map_entry.map.range[-1] # Determine whether to create preamble or postamble maps if self.preamble is not None: create_preamble = self.preamble else: create_preamble = not ((dim_from % vector_size == 0) == True or dim_from == 0) if self.postamble is not None: create_postamble = self.postamble else: if isinstance(dim_to, symbolic.SymExpr): create_postamble = (((dim_to.approx + 1) % vector_size == 0) == False) else: create_postamble = (((dim_to + 1) % vector_size == 0) == False) # Determine new range for vectorized map if self.strided_map: new_range = [dim_from, dim_to - vector_size + 1, vector_size] else: new_range = [ dim_from // vector_size, ((dim_to + 1) // vector_size) - 1, dim_skip ] # Create preamble non-vectorized map (replacing the original map) if create_preamble: old_scope = graph.scope_subgraph(map_entry, True, True) new_scope: ScopeSubgraphView = replicate_scope( sdfg, graph, old_scope) new_begin = dim_from + (vector_size - (dim_from % vector_size)) map_entry.map.range[-1] = (dim_from, new_begin - 1, dim_skip) # Replace map_entry with the replicated scope (so that the preamble # will usually come first in topological sort) map_entry = new_scope.entry tasklet = new_scope.nodes()[old_scope.nodes().index(tasklet)] new_range[0] = new_begin # Create postamble non-vectorized map if create_postamble: new_scope: ScopeSubgraphView = replicate_scope( sdfg, graph, graph.scope_subgraph(map_entry, True, True)) dim_to_ex = dim_to + 1 new_scope.entry.map.range[-1] = (dim_to_ex - (dim_to_ex % vector_size), dim_to, dim_skip) # Change the step of the inner-most dimension. map_entry.map.range[-1] = tuple(new_range) # Vectorize connectors adjacent to the tasklet. for edge in graph.all_edges(tasklet): connectors = (tasklet.in_connectors if edge.dst == tasklet else tasklet.out_connectors) conn = edge.dst_conn if edge.dst == tasklet else edge.src_conn if edge.data.data is None: # Empty memlets continue desc = sdfg.arrays[edge.data.data] contigidx = desc.strides.index(1) newlist = [] lastindex = edge.data.subset[contigidx] if isinstance(lastindex, tuple): newlist = [(rb, re, rs) for rb, re, rs in edge.data.subset] symbols = set() for indd in lastindex: symbols.update( symbolic.pystr_to_symbolic(indd).free_symbols) else: newlist = [(rb, rb, 1) for rb in edge.data.subset] symbols = symbolic.pystr_to_symbolic(lastindex).free_symbols oldtype = connectors[conn] if oldtype is None or oldtype.type is None: oldtype = desc.dtype # Vector to scalar WCR edge: change connector and continue lastedge = graph.memlet_path(edge)[-1] if (lastedge.data.subset.num_elements() == 1 and edge.data.wcr is not None): connectors[conn] = dtypes.vector(oldtype, vector_size) continue if str(param) not in map(str, symbols): continue # Vectorize connector, if not already vectorized if isinstance(oldtype, dtypes.vector): continue connectors[conn] = dtypes.vector(oldtype, vector_size) # Modify memlet subset to match vector length if self.strided_map: rb = newlist[contigidx][0] if self.propagate_parent: newlist[contigidx] = (rb / self.vector_len, rb / self.vector_len, 1) else: newlist[contigidx] = (rb, rb + self.vector_len - 1, 1) else: rb = newlist[contigidx][0] if self.propagate_parent: newlist[contigidx] = (rb, rb, 1) else: newlist[contigidx] = (self.vector_len * rb, self.vector_len * rb + self.vector_len - 1, 1) edge.data.subset = subsets.Range(newlist) edge.data.volume = vector_size # Vector length propagation using data descriptors, recursive traversal # outwards if 
self.propagate_parent: for edge in graph.all_edges(tasklet): cursdfg = sdfg curedge = edge while cursdfg is not None: arrname = curedge.data.data dtype = cursdfg.arrays[arrname].dtype # Change type and shape to vector if not isinstance(dtype, dtypes.vector): cursdfg.arrays[arrname].dtype = dtypes.vector( dtype, vector_size) new_shape = list(cursdfg.arrays[arrname].shape) contigidx = cursdfg.arrays[arrname].strides.index(1) new_shape[contigidx] /= vector_size try: new_shape[contigidx] = int(new_shape[contigidx]) except TypeError: pass cursdfg.arrays[arrname].shape = new_shape propagation.propagate_memlets_sdfg(cursdfg) # Find matching edge in parent nsdfg = cursdfg.parent_nsdfg_node if nsdfg is None: break tstate = cursdfg.parent curedge = ([ e for e in tstate.in_edges(nsdfg) if e.dst_conn == arrname ] + [ e for e in tstate.out_edges(nsdfg) if e.src_conn == arrname ])[0] cursdfg = cursdfg.parent_sdfg
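# Sketch of the range arithmetic in `apply` above for vector_len = 4 on an
# inclusive range 0:N-1 (the numbers are illustrative): the strided variant
# keeps the original indices and jumps by the vector length, while the
# non-strided variant renumbers the iterations.
def _example_vector_ranges(N=16, v=4):
    dim_from, dim_to = 0, N - 1                             # inclusive, as in dace
    strided = (dim_from, dim_to - v + 1, v)                 # (0, 12, 4)
    renumbered = (dim_from // v, (dim_to + 1) // v - 1, 1)  # (0, 3, 1)
    return strided, renumbered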
class TrivialMapElimination(transformation.Transformation): """ Implements the Trivial-Map Elimination pattern. Trivial-Map Elimination removes all dimensions containing only one element from a map. If this applies to all ranges the map is removed. Example: Map[i=0:I,j=7] -> Map[i=0:I] Example: Map[i=0 ,j=7] -> nothing """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [sdutil.node_path_graph(TrivialMapElimination._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False): map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]] return any(r[0] == r[1] for r in map_entry.map.range) @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[ TrivialMapElimination._map_entry]] map_exit = graph.exit_node(map_entry) remaining_ranges = [] remaining_params = [] for map_param, ranges in zip(map_entry.map.params, map_entry.map.range.ranges): map_from, map_to, _ = ranges if map_from == map_to: # Replace the map index variable with the value it obtained scope = graph.scope_subgraph(map_entry) scope.replace(map_param, map_from) else: remaining_ranges.append(ranges) remaining_params.append(map_param) map_entry.map.range.ranges = remaining_ranges map_entry.map.params = remaining_params if len(remaining_ranges) == 0: # Redirect map entry's out edges for edge in graph.out_edges(map_entry): path = graph.memlet_path(edge) index = path.index(edge) # Add an edge directly from the previous source connector to the destination graph.add_edge(path[index - 1].src, path[index - 1].src_conn, edge.dst, edge.dst_conn, edge.data) # Redirect map exit's in edges. for edge in graph.in_edges(map_exit): path = graph.memlet_path(edge) index = path.index(edge) # Add an edge directly from the source to the next destination connector if len(path) > index + 1: graph.add_edge(edge.src, edge.src_conn, path[index + 1].dst, path[index + 1].dst_conn, edge.data) # Remove map graph.remove_nodes_from([map_entry, map_exit])
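# Plain-Python sketch of the dimension filtering above: trivial (size-1)
# ranges are dropped and their parameters are substituted by the single
# value they take. Parameter names and bounds are illustrative.
def _example_trivial_elimination():
    params = ['i', 'j']
    ranges = [(0, 'I - 1', 1), (7, 7, 1)]  # inclusive ranges, as in dace
    remaining = [(p, r) for p, r in zip(params, ranges) if r[0] != r[1]]
    substituted = {p: r[0] for p, r in zip(params, ranges) if r[0] == r[1]}
    # remaining == [('i', (0, 'I - 1', 1))], substituted == {'j': 7}
    return remaining, substituted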
class GPUTransformLocalStorage(transformation.Transformation): """Implements the GPUTransformLocalStorage transformation. Similar to GPUTransformMap, but takes multiple maps leading from the same data node into account, creating a local storage for each range. @see: GPUTransformMap """ _arrays_removed = 0 _maps_transformed = 0 fullcopy = Property(desc="Copy whole arrays rather than used subset", dtype=bool, default=False) nested_seq = Property( desc="Makes nested code semantically-equivalent to single-core code," "transforming nested maps and memory into sequential and " "local memory respectively.", dtype=bool, default=True, ) _map_entry = nodes.MapEntry(nodes.Map("", [], [])) import dace.libraries.standard as stdlib # Avoid import loop _reduce = stdlib.Reduce("lambda: None", None) @staticmethod def expressions(): return [ sdutil.node_path_graph(GPUTransformLocalStorage._map_entry), sdutil.node_path_graph(GPUTransformLocalStorage._reduce), ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, permissive=False): if expr_index == 0: map_entry = graph.nodes()[candidate[ GPUTransformLocalStorage._map_entry]] candidate_map = map_entry.map # Disallow GPUTransform on nested maps in permissive mode if not permissive: if graph.entry_node(map_entry) is not None: return False # Map schedules that are disallowed to transform to GPUs if (candidate_map.schedule == dtypes.ScheduleType.MPI or candidate_map.schedule == dtypes.ScheduleType.GPU_Device or candidate_map.schedule == dtypes.ScheduleType.GPU_ThreadBlock or candidate_map.schedule == dtypes.ScheduleType.Sequential): return False # Dynamic map ranges cannot become kernels if sd.has_dynamic_map_inputs(graph, map_entry): return False # Recursively check parent for GPU schedules sdict = graph.scope_dict() current_node = map_entry while current_node is not None: if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device or current_node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock): return False current_node = sdict[current_node] # Ensure that map does not include internal arrays that are # allocated on non-default space subgraph = graph.scope_subgraph(map_entry) for node in subgraph.nodes(): if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage != dtypes.StorageType.Default and node.desc(sdfg).storage != dtypes.StorageType.Register): return False # If one of the outputs is a stream, do not match map_exit = graph.exit_node(map_entry) for edge in graph.out_edges(map_exit): dst = graph.memlet_path(edge)[-1].dst if (isinstance(dst, nodes.AccessNode) and isinstance(sdfg.arrays[dst.data], data.Stream)): return False return True elif expr_index == 1: reduce = graph.nodes()[candidate[GPUTransformLocalStorage._reduce]] # Recursively check parent for GPU schedules sdict = graph.scope_dict() current_node = sdict[reduce] while current_node is not None: if (current_node.map.schedule == dtypes.ScheduleType.GPU_Device or current_node.map.schedule == dtypes.ScheduleType.GPU_ThreadBlock): return False current_node = sdict[current_node] return True @staticmethod def match_to_str(graph, candidate): if GPUTransformLocalStorage._reduce in candidate: return str( graph.nodes()[candidate[GPUTransformLocalStorage._reduce]]) else: map_entry = graph.nodes()[candidate[ GPUTransformLocalStorage._map_entry]] return str(map_entry) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] if self.expr_index == 0: cnode: nodes.MapEntry = graph.nodes()[self.subgraph[ GPUTransformLocalStorage._map_entry]] # Change schedule cnode.schedule = 
dtypes.ScheduleType.GPU_Device exit_node = graph.exit_node(cnode) else: cnode: nodes.LibraryNode = graph.nodes()[self.subgraph[ GPUTransformLocalStorage._reduce]] # Change schedule cnode.schedule = dtypes.ScheduleType.GPU_Default exit_node = cnode if Config.get_bool("debugprint"): GPUTransformLocalStorage._maps_transformed += 1 # If nested graph is designated as sequential, transform schedules and # storage from Default to Sequential/Register if self.nested_seq and self.expr_index == 0: for node in graph.scope_subgraph(cnode).nodes(): if isinstance(node, nodes.AccessNode): arr = node.desc(sdfg) if arr.storage == dtypes.StorageType.Default: arr.storage = dtypes.StorageType.Register elif isinstance(node, nodes.MapEntry): if node.map.schedule == dtypes.ScheduleType.Default: node.map.schedule = dtypes.ScheduleType.Sequential gpu_storage_types = [ dtypes.StorageType.GPU_Global, dtypes.StorageType.GPU_Shared, ] ####################################################### # Add GPU copies of CPU arrays (i.e., not already on GPU) # First, understand which arrays to clone all_out_edges = [] all_out_edges.extend(list(graph.out_edges(exit_node))) in_arrays_to_clone = set() out_arrays_to_clone = set() for e in graph.in_edges(cnode): data_node = sd.find_input_arraynode(graph, e) if data_node.desc(sdfg).storage not in gpu_storage_types: in_arrays_to_clone.add((data_node, e.data)) for e in all_out_edges: data_node = sd.find_output_arraynode(graph, e) if data_node.desc(sdfg).storage not in gpu_storage_types: out_arrays_to_clone.add((data_node, e.data)) if Config.get_bool("debugprint"): GPUTransformLocalStorage._arrays_removed += len( in_arrays_to_clone) + len(out_arrays_to_clone) # Second, create a GPU clone of each array # TODO: Overapproximate union of memlets cloned_arrays = {} in_cloned_arraynodes = {} out_cloned_arraynodes = {} for array_node, memlet in in_arrays_to_clone: array = array_node.desc(sdfg) cloned_name = "gpu_" + array_node.data for i, r in enumerate(memlet.bounding_box_size()): size = symbolic.overapproximate(r) try: if int(size) == 1: suffix = [] for c in str(memlet.subset[i][0]): if c.isalpha() or c.isdigit() or c == "_": suffix.append(c) elif c == "+": suffix.append("p") elif c == "-": suffix.append("m") elif c == "*": suffix.append("t") elif c == "/": suffix.append("d") cloned_name += "_" + "".join(suffix) except: continue if cloned_name in sdfg.arrays.keys(): cloned_array = sdfg.arrays[cloned_name] elif array_node.data in cloned_arrays: cloned_array = cloned_arrays[array_node.data] else: full_shape = [] for r in memlet.bounding_box_size(): size = symbolic.overapproximate(r) try: full_shape.append(int(size)) except: full_shape.append(size) actual_dims = [ idx for idx, r in enumerate(full_shape) if not (isinstance(r, int) and r == 1) ] if len(actual_dims) == 0: # abort actual_dims = [len(full_shape) - 1] if isinstance(array, data.Scalar): sdfg.add_array(name=cloned_name, shape=[1], dtype=array.dtype, transient=True, storage=dtypes.StorageType.GPU_Global) elif isinstance(array, data.Stream): sdfg.add_stream( name=cloned_name, dtype=array.dtype, shape=[full_shape[d] for d in actual_dims], veclen=array.veclen, buffer_size=array.buffer_size, storage=dtypes.StorageType.GPU_Global, transient=True, offset=[array.offset[d] for d in actual_dims]) else: sdfg.add_array( name=cloned_name, shape=[full_shape[d] for d in actual_dims], dtype=array.dtype, transient=True, storage=dtypes.StorageType.GPU_Global, allow_conflicts=array.allow_conflicts, strides=[array.strides[d] for d in actual_dims], 
offset=[array.offset[d] for d in actual_dims], ) cloned_arrays[array_node.data] = cloned_name cloned_node = type(array_node)(cloned_name) in_cloned_arraynodes[array_node.data] = cloned_node for array_node, memlet in out_arrays_to_clone: array = array_node.desc(sdfg) cloned_name = "gpu_" + array_node.data for i, r in enumerate(memlet.bounding_box_size()): size = symbolic.overapproximate(r) try: if int(size) == 1: suffix = [] for c in str(memlet.subset[i][0]): if c.isalpha() or c.isdigit() or c == "_": suffix.append(c) elif c == "+": suffix.append("p") elif c == "-": suffix.append("m") elif c == "*": suffix.append("t") elif c == "/": suffix.append("d") cloned_name += "_" + "".join(suffix) except: continue if cloned_name in sdfg.arrays.keys(): cloned_array = sdfg.arrays[cloned_name] elif array_node.data in cloned_arrays: cloned_array = cloned_arrays[array_node.data] else: full_shape = [] for r in memlet.bounding_box_size(): size = symbolic.overapproximate(r) try: full_shape.append(int(size)) except: full_shape.append(size) actual_dims = [ idx for idx, r in enumerate(full_shape) if not (isinstance(r, int) and r == 1) ] if len(actual_dims) == 0: # abort actual_dims = [len(full_shape) - 1] if isinstance(array, data.Scalar): sdfg.add_array(name=cloned_name, shape=[1], dtype=array.dtype, transient=True, storage=dtypes.StorageType.GPU_Global) elif isinstance(array, data.Stream): sdfg.add_stream( name=cloned_name, dtype=array.dtype, shape=[full_shape[d] for d in actual_dims], veclen=array.veclen, buffer_size=array.buffer_size, storage=dtypes.StorageType.GPU_Global, transient=True, offset=[array.offset[d] for d in actual_dims]) else: sdfg.add_array( name=cloned_name, shape=[full_shape[d] for d in actual_dims], dtype=array.dtype, transient=True, storage=dtypes.StorageType.GPU_Global, allow_conflicts=array.allow_conflicts, strides=[array.strides[d] for d in actual_dims], offset=[array.offset[d] for d in actual_dims], ) cloned_arrays[array_node.data] = cloned_name cloned_node = type(array_node)(cloned_name) cloned_node.setzero = True out_cloned_arraynodes[array_node.data] = cloned_node # Third, connect the cloned arrays to the originals for array_name, node in in_cloned_arraynodes.items(): graph.add_node(node) is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar) for edge in graph.in_edges(cnode): if edge.data.data == array_name: newmemlet = copy.deepcopy(edge.data) newmemlet.data = node.data if is_scalar: newmemlet.subset = sbs.Indices([0]) else: offset = [] lost_dims = [] lost_ranges = [] newsubset = [None] * len(edge.data.subset) for ind, r in enumerate(edge.data.subset): offset.append(r[0]) if isinstance(edge.data.subset[ind], tuple): begin = edge.data.subset[ind][0] - r[0] end = edge.data.subset[ind][1] - r[0] step = edge.data.subset[ind][2] if begin == end: lost_dims.append(ind) lost_ranges.append((begin, end, step)) else: newsubset[ind] = (begin, end, step) else: newsubset[ind] -= r[0] if len(lost_dims) == len(edge.data.subset): lost_dims.pop() newmemlet.subset = type( edge.data.subset)([lost_ranges[-1]]) else: newmemlet.subset = type(edge.data.subset)( [r for r in newsubset if r is not None]) graph.add_edge(node, None, edge.dst, edge.dst_conn, newmemlet) for e in graph.bfs_edges(edge.dst, reverse=False): parent, _, _child, _, memlet = e if parent != edge.dst and not in_scope( graph, parent, edge.dst): break if memlet.data != edge.data.data: continue path = graph.memlet_path(e) if not isinstance(path[-1].dst, nodes.CodeNode): if in_path(path, e, nodes.ExitNode, forward=True): if 
isinstance(parent, nodes.CodeNode): # Output edge break else: continue if is_scalar: memlet.subset = sbs.Indices([0]) else: newsubset = [None] * len(memlet.subset) for ind, r in enumerate(memlet.subset): if ind in lost_dims: continue if isinstance(memlet.subset[ind], tuple): begin = r[0] - offset[ind] end = r[1] - offset[ind] step = r[2] newsubset[ind] = (begin, end, step) else: newsubset[ind] = ( r - offset[ind], r - offset[ind], 1, ) memlet.subset = type(edge.data.subset)( [r for r in newsubset if r is not None]) memlet.data = node.data if self.fullcopy: edge.data.subset = sbs.Range.from_array(node.desc(sdfg)) edge.data.other_subset = newmemlet.subset graph.add_edge(edge.src, edge.src_conn, node, None, edge.data) graph.remove_edge(edge) for array_name, node in out_cloned_arraynodes.items(): graph.add_node(node) is_scalar = isinstance(sdfg.arrays[array_name], data.Scalar) for edge in all_out_edges: if edge.data.data == array_name: newmemlet = copy.deepcopy(edge.data) newmemlet.data = node.data if is_scalar: newmemlet.subset = sbs.Indices([0]) else: offset = [] lost_dims = [] lost_ranges = [] newsubset = [None] * len(edge.data.subset) for ind, r in enumerate(edge.data.subset): offset.append(r[0]) if isinstance(edge.data.subset[ind], tuple): begin = edge.data.subset[ind][0] - r[0] end = edge.data.subset[ind][1] - r[0] step = edge.data.subset[ind][2] if begin == end: lost_dims.append(ind) lost_ranges.append((begin, end, step)) else: newsubset[ind] = (begin, end, step) else: newsubset[ind] -= r[0] if len(lost_dims) == len(edge.data.subset): lost_dims.pop() newmemlet.subset = type( edge.data.subset)([lost_ranges[-1]]) else: newmemlet.subset = type(edge.data.subset)( [r for r in newsubset if r is not None]) graph.add_edge(edge.src, edge.src_conn, node, None, newmemlet) end_node = graph.entry_node(edge.src) for e in graph.bfs_edges(edge.src, reverse=True): parent, _, _child, _, memlet = e if parent == end_node: break if memlet.data != edge.data.data: continue path = graph.memlet_path(e) if not isinstance(path[0].dst, nodes.CodeNode): if in_path(path, e, nodes.EntryNode, forward=False): if isinstance(parent, nodes.CodeNode): # Output edge break else: continue if is_scalar: memlet.subset = sbs.Indices([0]) else: newsubset = [None] * len(memlet.subset) for ind, r in enumerate(memlet.subset): if ind in lost_dims: continue if isinstance(memlet.subset[ind], tuple): begin = r[0] - offset[ind] end = r[1] - offset[ind] step = r[2] newsubset[ind] = (begin, end, step) else: newsubset[ind] = ( r - offset[ind], r - offset[ind], 1, ) memlet.subset = type(edge.data.subset)( [r for r in newsubset if r is not None]) memlet.data = node.data edge.data.wcr = None if self.fullcopy: edge.data.subset = sbs.Range.from_array(node.desc(sdfg)) edge.data.other_subset = newmemlet.subset graph.add_edge(node, None, edge.dst, edge.dst_conn, edge.data) graph.remove_edge(edge) # Fourth, replace memlet arrays as necessary if self.expr_index == 0: scope_subgraph = graph.scope_subgraph(cnode) for edge in scope_subgraph.edges(): if edge.data.data is not None and edge.data.data in cloned_arrays: edge.data.data = cloned_arrays[edge.data.data]
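# A minimal usage sketch for GPUTransformLocalStorage (illustrative, and
# assuming the class is exported from dace.transformation.dataflow like the
# other standard dataflow transformations): the map below becomes a GPU
# kernel, and "gpu_"-prefixed clones of A and B are inserted around it.

import dace
from dace.transformation.dataflow import GPUTransformLocalStorage

N = dace.symbol('N')


@dace.program
def saxpy(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = 2.0 * A[i] + B[i]


sdfg = saxpy.to_sdfg()
sdfg.apply_transformations(GPUTransformLocalStorage)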
def apply(self, sdfg: dace.SDFG): # Extract the map and its entry and exit nodes. graph = sdfg.node(self.state_id) map_entry = self.map_entry(sdfg) map_exit = graph.exit_node(map_entry) current_map = map_entry.map # Create new maps new_maps = [ nodes.Map(current_map.label + '_' + str(param), [param], subsets.Range([param_range]), schedule=dtypes.ScheduleType.Sequential) for param, param_range in zip(current_map.params[1:], current_map.range[1:]) ] current_map.params = [current_map.params[0]] current_map.range = subsets.Range([current_map.range[0]]) # Create new map entries and exits entries = [nodes.MapEntry(new_map) for new_map in new_maps] exits = [nodes.MapExit(new_map) for new_map in new_maps] # Create edges, abiding by the following rules: # 1. If there are no edges coming from the outside, use empty memlets # 2. Edges with IN_* connectors replicate along the maps # 3. Edges for dynamic map ranges replicate until reaching range(s) for edge in graph.out_edges(map_entry): graph.remove_edge(edge) graph.add_memlet_path(map_entry, *entries, edge.dst, src_conn=edge.src_conn, memlet=edge.data, dst_conn=edge.dst_conn) # Modify dynamic map ranges dynamic_edges = dace.sdfg.dynamic_map_inputs(graph, map_entry) for edge in dynamic_edges: # Remove old edge and connector graph.remove_edge(edge) edge.dst.remove_in_connector(edge.dst_conn) # Propagate to each range it belongs to path = [] for mapnode in [map_entry] + entries: path.append(mapnode) if any(edge.dst_conn in map(str, symbolic.symlist(r)) for r in mapnode.map.range): graph.add_memlet_path(edge.src, *path, memlet=edge.data, src_conn=edge.src_conn, dst_conn=edge.dst_conn) # Create new map exits for edge in graph.in_edges(map_exit): graph.remove_edge(edge) graph.add_memlet_path(edge.src, *exits[::-1], map_exit, memlet=edge.data, src_conn=edge.src_conn, dst_conn=edge.dst_conn) from dace.sdfg.scope import ScopeTree scope = None queue: List[ScopeTree] = graph.scope_leaves() while len(queue) > 0: tnode = queue.pop() if tnode.entry == entries[-1]: scope = tnode break elif tnode.parent is not None: queue.append(tnode.parent) else: raise ValueError('Cannot find scope in state') consolidate_edges(sdfg, scope) return [map_entry] + entries
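# The apply() above corresponds to the MapExpansion transformation: an
# N-dimensional map becomes a chain of N one-dimensional maps, where the
# outermost map keeps its original schedule and the inner ones become
# Sequential. A hypothetical sketch, assuming MapExpansion is exported from
# dace.transformation.dataflow:

import dace
from dace.transformation.dataflow import MapExpansion

N, M = dace.symbol('N'), dace.symbol('M')


@dace.program
def transpose(A: dace.float64[N, M], B: dace.float64[M, N]):
    for i, j in dace.map[0:N, 0:M]:
        B[j, i] = A[i, j]


sdfg = transpose.to_sdfg()
sdfg.apply_transformations(MapExpansion)  # Map[i,j] -> Map[i] -> Map[j]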
class MapCollapse(pattern_matching.Transformation): """ Implements the Map Collapse pattern. Map-collapse takes two nested maps with M and N dimensions respectively, and collapses them to a single M+N dimensional map. """ _outer_map_entry = nodes.MapEntry(nodes.Map("", [], [])) _inner_map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [ sdutil.node_path_graph( MapCollapse._outer_map_entry, MapCollapse._inner_map_entry, ) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): # Check the edges between the entries of the two maps. outer_map_entry = graph.nodes()[candidate[ MapCollapse._outer_map_entry]] inner_map_entry = graph.nodes()[candidate[ MapCollapse._inner_map_entry]] # Check that inner map range is independent of outer range map_deps = set() for s in inner_map_entry.map.range: map_deps |= set(map(str, symlist(s))) if any(dep in outer_map_entry.map.params for dep in map_deps): return False # Check that the destination of all the outgoing edges # from the outer map's entry is the inner map's entry. for _src, _, dest, _, _ in graph.out_edges(outer_map_entry): if dest != inner_map_entry: return False # Check that the source of all the incoming edges # to the inner map's entry is the outer map's entry. for src, _, _, dst_conn, memlet in graph.in_edges(inner_map_entry): if src != outer_map_entry: return False # Check that dynamic input range memlets are independent of # first map range if dst_conn is not None and not dst_conn.startswith('IN_'): memlet_deps = set() for s in memlet.subset: memlet_deps |= set(map(str, symlist(s))) if any(dep in outer_map_entry.map.params for dep in memlet_deps): return False # Check the edges between the exits of the two maps. inner_map_exit = graph.exit_node(inner_map_entry) outer_map_exit = graph.exit_node(outer_map_entry) # Check that the destination of all the outgoing edges # from the inner map's exit is the outer map's exit. for _src, _, dest, _, _ in graph.out_edges(inner_map_exit): if dest != outer_map_exit: return False # Check that the source of all the incoming edges # to the outer map's exit is the inner map's exit. for src, _, _dest, _, _ in graph.in_edges(outer_map_exit): if src != inner_map_exit: return False return True @staticmethod def match_to_str(graph, candidate): outer_map_entry = graph.nodes()[candidate[ MapCollapse._outer_map_entry]] inner_map_entry = graph.nodes()[candidate[ MapCollapse._inner_map_entry]] return ' -> '.join(entry.map.label + ': ' + str(entry.map.params) for entry in [outer_map_entry, inner_map_entry]) def apply(self, sdfg) -> Tuple[nodes.MapEntry, nodes.MapExit]: """ Collapses two maps into one. :param sdfg: The SDFG to apply the transformation to. :return: A 2-tuple of the new map entry and exit nodes. """ # Extract the parameters and ranges of the inner/outer maps. graph = sdfg.nodes()[self.state_id] outer_map_entry = graph.nodes()[self.subgraph[ MapCollapse._outer_map_entry]] inner_map_entry = graph.nodes()[self.subgraph[ MapCollapse._inner_map_entry]] inner_map_exit = graph.exit_node(inner_map_entry) outer_map_exit = graph.exit_node(outer_map_entry) return sdutil.merge_maps(graph, outer_map_entry, outer_map_exit, inner_map_entry, inner_map_exit)
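# MapCollapse is the inverse of map expansion: two perfectly nested maps
# whose ranges are independent are merged into one multi-dimensional map.
# A minimal sketch, assuming MapCollapse is exported from
# dace.transformation.dataflow and the frontend emits the two nested maps
# as written:

import dace
from dace.transformation.dataflow import MapCollapse

N, M = dace.symbol('N'), dace.symbol('M')


@dace.program
def scale(A: dace.float64[N, M], B: dace.float64[N, M]):
    for i in dace.map[0:N]:
        for j in dace.map[0:M]:
            B[i, j] = 2.0 * A[i, j]


sdfg = scale.to_sdfg()
sdfg.apply_transformations(MapCollapse)  # Map[i] -> Map[j] becomes Map[i, j]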
class MapUnroll(transformation.Transformation): """ Unrolls a map with constant ranges in the top-level scope of an SDFG by replicating its subgraph for each iteration. If there are local data containers only used in this map, they will also be replicated, as will nested SDFGs found within. This transformation can be useful for forming weakly connected components that will be inferred as processing elements in an FPGA kernel. """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [sdutil.node_path_graph(MapUnroll._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[MapUnroll._map_entry]] # Must be top-level map if graph.scope_dict()[map_entry] is not None: return False # All map ranges must be constant try: for begin, end, step in map_entry.map.range: symbolic.evaluate(begin, sdfg.constants) symbolic.evaluate(end, sdfg.constants) symbolic.evaluate(step, sdfg.constants) except TypeError: return False return True @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[MapUnroll._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): from dace.transformation.dataflow import TrivialMapElimination state = sdfg.nodes()[self.state_id] map_entry = state.nodes()[self.subgraph[MapUnroll._map_entry]] map_exit = state.exit_node(map_entry) # Collect all nodes in this weakly connected component subgraph = sdutil.weakly_connected_component(state, map_entry) # Save nested SDFGs to JSON, then deserialize them for every copy we # need to make nested_sdfgs = {} for node in subgraph: if isinstance(node, nodes.NestedSDFG): nested_sdfgs[node.sdfg] = node.sdfg.to_json() # Check for local memories that need to be replicated local_memories = [ name for name in sdutil.local_transients( sdfg, subgraph, entry_node=map_entry, include_nested=True) if not isinstance(sdfg.arrays[name], dt.Stream) and not isinstance(sdfg.arrays[name], dt.View) ] params = map_entry.map.params ranges = map_entry.map.range.ranges constant_ranges = [] for r in ranges: begin = symbolic.evaluate(r[0], sdfg.constants) end = symbolic.evaluate(r[1], sdfg.constants) step = symbolic.evaluate(r[2], sdfg.constants) end += step # Make non-inclusive constant_ranges.append(range(begin, end, step)) index_tuples = itertools.product(*constant_ranges) for t in index_tuples: suffix = "_" + "_".join(map(str, t)) node_to_unrolled = {} # Copy all nodes for node in subgraph: if isinstance(node, nodes.NestedSDFG): # Avoid deep-copying the nested SDFG nsdfg = node.sdfg # Don't copy the nested SDFG, as we will do this separately node.sdfg = None unrolled_node = copy.deepcopy(node) node.sdfg = nsdfg # Deserialize into a new SDFG specific to this copy nsdfg_json = nested_sdfgs[nsdfg] name = nsdfg_json["attributes"]["name"] nsdfg_json["attributes"]["name"] += suffix unrolled_nsdfg = SDFG.from_json(nsdfg_json) nsdfg_json["attributes"]["name"] = name # Reinstate # Set all the references unrolled_nsdfg.parent = state unrolled_nsdfg.parent_sdfg = sdfg unrolled_nsdfg.update_sdfg_list([]) unrolled_node.sdfg = unrolled_nsdfg unrolled_nsdfg.parent_nsdfg_node = unrolled_node else: unrolled_node = copy.deepcopy(node) if node == map_entry: # Fix the map bounds to only this iteration unrolled_node.map.range = [(i, i, 1) for i in t] if (isinstance(node, nodes.AccessNode) and node.data in local_memories): # If this is a local memory only used in this subgraph, # we need to replicate it for each 
new subgraph
                    unrolled_name = node.data + suffix
                    if unrolled_name not in sdfg.arrays:
                        unrolled_desc = copy.deepcopy(
                            sdfg.arrays[node.data])
                        sdfg.add_datadesc(unrolled_name, unrolled_desc)
                    unrolled_node.data = unrolled_name
                state.add_node(unrolled_node)
                node_to_unrolled[node] = unrolled_node  # Remember mapping
            # Copy all edges
            for src, src_conn, dst, dst_conn, memlet in subgraph.edges():
                src = node_to_unrolled[src]
                dst = node_to_unrolled[dst]
                memlet = copy.deepcopy(memlet)
                if memlet.data in local_memories:
                    memlet.data = memlet.data + suffix
                state.add_edge(src, src_conn, dst, dst_conn, memlet)
            # Eliminate the now-trivial map
            TrivialMapElimination.apply_to(
                sdfg,
                verify=False,
                annotate=False,
                save=False,
                _map_entry=node_to_unrolled[map_entry])

        # Now we can delete the original subgraph. This implicitly also
        # removes the memlets between its nodes.
        state.remove_nodes_from(subgraph)

        # If we added new nested SDFGs, reset the internal list
        if len(nested_sdfgs) > 0:
            sdfg.reset_sdfg_list()

        # Remove the local memories that were replicated
        for mem in local_memories:
            sdfg.remove_data(mem)
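# MapUnroll usage sketch: the constant-range top-level map below is unrolled
# into four independent copies of its body, e.g., to form separate processing
# elements on FPGAs. Illustrative only, assuming MapUnroll is exported from
# dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import MapUnroll


@dace.program
def rows(A: dace.float64[4, 100], B: dace.float64[4, 100]):
    for i in dace.map[0:4]:
        B[i, :] = A[i, :] + 1.0


sdfg = rows.to_sdfg()
sdfg.apply_transformations(MapUnroll)  # body replicated for i = 0..3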
class Vectorization(pattern_matching.Transformation): """ Implements the vectorization transformation. Vectorization matches when all the input and output memlets of a tasklet inside a map access the inner-most loop variable in their last dimension. The transformation changes the step of the inner-most loop to be equal to the length of the vector and vectorizes the memlets. """ vector_len = Property(desc="Vector length", dtype=int, default=4) propagate_parent = Property(desc="Propagate vector length through " "parent SDFGs", dtype=bool, default=False) strided_map = Property(desc="Use strided map range (jump by vector length)" " instead of modifying memlets", dtype=bool, default=False) _map_entry = nodes.MapEntry(nodes.Map("", [], [])) _tasklet = nodes.Tasklet('_') _map_exit = nodes.MapExit(nodes.Map("", [], [])) @staticmethod def expressions(): return [ sdutil.node_path_graph(Vectorization._map_entry, Vectorization._tasklet, Vectorization._map_exit) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[Vectorization._map_entry]] tasklet = graph.nodes()[candidate[Vectorization._tasklet]] param = symbolic.pystr_to_symbolic(map_entry.map.params[-1]) found = False # Check if all edges, adjacent to the tasklet, # use the parameter in their last dimension. for _src, _, _dest, _, memlet in graph.all_edges(tasklet): # Cases that do not matter for vectorization if memlet.data is None: # Empty memlets continue if isinstance(sdfg.arrays[memlet.data], data.Stream): # Streams continue # Vectorization can not be applied in WCR if memlet.wcr is not None: return False try: subset = memlet.subset veclen = memlet.veclen except AttributeError: return False if subset is None: return False try: if veclen > symbolic.pystr_to_symbolic('1'): return False for idx, expr in enumerate(subset): if isinstance(expr, tuple): for ex in expr: ex = symbolic.pystr_to_symbolic(ex) symbols = ex.free_symbols if param in symbols: if idx == subset.dims() - 1: found = True else: return False else: expr = symbolic.pystr_to_symbolic(expr) symbols = expr.free_symbols if param in symbols: if idx == subset.dims() - 1: found = True else: return False except TypeError: # cannot determine truth value of Relational return False return found @staticmethod def match_to_str(graph, candidate): map_entry = candidate[Vectorization._map_entry] tasklet = candidate[Vectorization._tasklet] map_exit = candidate[Vectorization._map_exit] return ' -> '.join( str(node) for node in [map_entry, tasklet, map_exit]) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[Vectorization._map_entry]] tasklet = graph.nodes()[self.subgraph[Vectorization._tasklet]] map_exit = graph.nodes()[self.subgraph[Vectorization._map_exit]] param = symbolic.pystr_to_symbolic(map_entry.map.params[-1]) # Create new vector size. vector_size = self.vector_len # Change the step of the inner-most dimension. dim_from, dim_to, dim_step = map_entry.map.range[-1] if self.strided_map: map_entry.map.range[-1] = (dim_from, dim_to, vector_size) else: map_entry.map.range[-1] = (dim_from, (dim_to + 1) / vector_size - 1, dim_step) # TODO: Postamble and/or preamble non-vectorized map # Vectorize memlets adjacent to the tasklet. 
processed_edges = set() for edge in graph.all_edges(tasklet): _src, _, _dest, _, memlet = edge if memlet.data is None: # Empty memlets continue lastindex = memlet.subset[-1] if isinstance(lastindex, tuple): symbols = set() for indd in lastindex: symbols.update( symbolic.pystr_to_symbolic(indd).free_symbols) else: symbols = symbolic.pystr_to_symbolic( memlet.subset[-1]).free_symbols if param not in symbols: continue try: # propagate vector length inside this SDFG for e in graph.memlet_tree(edge): e.data.veclen = vector_size if not self.strided_map and e not in processed_edges: e.data.subset.replace({param: vector_size * param}) processed_edges.add(e) # propagate to the parent (TODO: handle multiple level of nestings) if self.propagate_parent and sdfg.parent is not None: source_edge = graph.memlet_path(edge)[0] sink_edge = graph.memlet_path(edge)[-1] # Find parent Nested SDFG node parent_node = next(n for n in sdfg.parent.nodes() if isinstance(n, nodes.NestedSDFG) and n.sdfg.name == sdfg.name) # continue in propagating the vector length following the # path that arrives to source_edge or starts from sink_edge for pe in sdfg.parent.all_edges(parent_node): if str(pe.dst_conn) == str(source_edge.src) or str( pe.src_conn) == str(sink_edge.dst): for ppe in sdfg.parent.memlet_tree(pe): ppe.data.veclen = vector_size if (not self.strided_map and ppe not in processed_edges): ppe.data.subset.replace( {param: vector_size * param}) processed_edges.add(ppe) except AttributeError: raise return
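# Vectorization usage sketch: the innermost map is strided by the vector
# length and the memlets adjacent to the tasklet are tagged with it, so that
# code generation can emit vector loads/stores. The vector length is
# illustrative; assumes Vectorization is exported from
# dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import Vectorization

N = dace.symbol('N')


@dace.program
def axpy(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = 2.0 * A[i] + B[i]


sdfg = axpy.to_sdfg()
sdfg.apply_transformations(Vectorization, options={'vector_len': 4})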
def _stripmine(self, sdfg, graph, candidate): # Retrieve map entry and exit nodes. map_entry = graph.nodes()[candidate[StripMining._map_entry]] map_exit = graph.exit_node(map_entry) # Retrieve transformation properties. dim_idx = self.dim_idx target_dim = map_entry.map.params[dim_idx] if self.tiling_type == 'ceilrange': new_dim, new_map, td_rng = self._create_ceil_range( sdfg, graph, map_entry) elif self.tiling_type == 'number_of_tiles': new_dim, new_map, td_rng = self._create_from_tile_numbers( sdfg, graph, map_entry) else: new_dim, new_map, td_rng = self._create_strided_range( sdfg, graph, map_entry) new_map_entry = nodes.MapEntry(new_map) new_map_exit = nodes.MapExit(new_map) td_to_new_approx = td_rng[1] if isinstance(td_to_new_approx, dace.symbolic.SymExpr): td_to_new_approx = td_to_new_approx.approx # Special case: If range is 1 and no prefix was specified, skip range if td_rng[0] == td_to_new_approx and target_dim == new_dim: map_entry.map.range = subsets.Range( [r for i, r in enumerate(map_entry.map.range) if i != dim_idx]) map_entry.map.params = [ p for i, p in enumerate(map_entry.map.params) if i != dim_idx ] if len(map_entry.map.params) == 0: raise ValueError('Strip-mining all dimensions of the map with ' 'empty tiles is disallowed') else: map_entry.map.range[dim_idx] = td_rng # Make internal map's schedule to "not parallel" new_map.schedule = map_entry.map.schedule map_entry.map.schedule = dtypes.ScheduleType.Sequential # Redirect edges new_map_entry.in_connectors = dcpy(map_entry.in_connectors) sdutil.change_edge_dest(graph, map_entry, new_map_entry) new_map_exit.out_connectors = dcpy(map_exit.out_connectors) sdutil.change_edge_src(graph, map_exit, new_map_exit) # Create new entry edges new_in_edges = dict() entry_in_conn = {} entry_out_conn = {} for _src, src_conn, _dst, _, memlet in graph.out_edges(map_entry): if (src_conn is not None and src_conn[:4] == 'OUT_' and not isinstance( sdfg.arrays[memlet.data], dace.data.Scalar)): new_subset = calc_set_image( map_entry.map.params, map_entry.map.range, memlet.subset, ) conn = src_conn[4:] key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) if key in new_in_edges.keys(): old_subset = new_in_edges[key].subset new_in_edges[key].subset = calc_set_union( old_subset, new_subset) else: entry_in_conn['IN_' + conn] = None entry_out_conn['OUT_' + conn] = None new_memlet = dcpy(memlet) new_memlet.subset = new_subset if memlet.dynamic: new_memlet.num_accesses = memlet.num_accesses else: new_memlet.num_accesses = new_memlet.num_elements() new_in_edges[key] = new_memlet else: if src_conn is not None and src_conn[:4] == 'OUT_': conn = src_conn[4:] in_conn = 'IN_' + conn out_conn = 'OUT_' + conn else: in_conn = src_conn out_conn = src_conn if in_conn: entry_in_conn[in_conn] = None if out_conn: entry_out_conn[out_conn] = None new_in_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet) new_map_entry.out_connectors = entry_out_conn map_entry.in_connectors = entry_in_conn for (_, in_conn, out_conn), memlet in new_in_edges.items(): graph.add_edge(new_map_entry, out_conn, map_entry, in_conn, memlet) # Create new exit edges new_out_edges = dict() exit_in_conn = {} exit_out_conn = {} for _src, _, _dst, dst_conn, memlet in graph.in_edges(map_exit): if (dst_conn is not None and dst_conn[:3] == 'IN_' and not isinstance( sdfg.arrays[memlet.data], dace.data.Scalar)): new_subset = calc_set_image( map_entry.map.params, map_entry.map.range, memlet.subset, ) conn = dst_conn[3:] key = (memlet.data, 'IN_' + conn, 'OUT_' + conn) if key in new_out_edges.keys(): 
old_subset = new_out_edges[key].subset
                    new_out_edges[key].subset = calc_set_union(
                        old_subset, new_subset)
                else:
                    exit_in_conn['IN_' + conn] = None
                    exit_out_conn['OUT_' + conn] = None
                    new_memlet = dcpy(memlet)
                    new_memlet.subset = new_subset
                    if memlet.dynamic:
                        new_memlet.num_accesses = memlet.num_accesses
                    else:
                        new_memlet.num_accesses = new_memlet.num_elements()
                    new_out_edges[key] = new_memlet
            else:
                if dst_conn is not None and dst_conn[:3] == 'IN_':
                    conn = dst_conn[3:]
                    in_conn = 'IN_' + conn
                    out_conn = 'OUT_' + conn
                else:
                    in_conn = dst_conn
                    out_conn = dst_conn
                if in_conn:
                    exit_in_conn[in_conn] = None
                if out_conn:
                    exit_out_conn[out_conn] = None
                # These are exit edges and must accumulate in new_out_edges;
                # new_in_edges was already consumed when the entry edges were
                # created above, so adding to it would silently drop them.
                new_out_edges[(memlet.data, in_conn, out_conn)] = dcpy(memlet)
        new_map_exit.in_connectors = exit_in_conn
        map_exit.out_connectors = exit_out_conn
        for (_, in_conn, out_conn), memlet in new_out_edges.items():
            graph.add_edge(map_exit, out_conn, new_map_exit, in_conn, memlet)

        # Skew if necessary
        if self.skew:
            xfh.offset_map(sdfg, graph, map_entry, dim_idx, td_rng[0])

        # Return strip-mined dimension.
        return target_dim, new_dim, new_map
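# StripMining usage sketch: one map dimension is split into an outer loop
# over tiles and an inner loop within a tile (with tile_size=64, Map[i=0:N]
# becomes an outer tile map and an inner map over at most 64 elements).
# The tile size is illustrative; assumes StripMining is exported from
# dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import StripMining

N = dace.symbol('N')


@dace.program
def twice(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = A[i] + A[i]


sdfg = twice.to_sdfg()
sdfg.apply_transformations(StripMining, options={'tile_size': 64})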
class TrivialMapElimination(transformation.Transformation): """ Implements the Trivial-Map Elimination pattern. Trivial-Map Elimination takes a map with a range containing one element and removes the map. Example: Map[i=0] -> nothing """ _map_entry = nodes.MapEntry(nodes.Map("", [], [])) @staticmethod def expressions(): return [sdutil.node_path_graph(TrivialMapElimination._map_entry)] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]] map_from, map_to, map_step = map_entry.map.range[0] return len(map_entry.map.range) == 1 and map_to == map_from @staticmethod def match_to_str(graph, candidate): map_entry = graph.nodes()[candidate[TrivialMapElimination._map_entry]] return map_entry.map.label + ': ' + str(map_entry.map.params) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] map_entry = graph.nodes()[self.subgraph[ TrivialMapElimination._map_entry]] map_exit = graph.exit_node(map_entry) map_param = map_entry.map.params[0] map_from, map_to, _ = map_entry.map.range[0] assert map_from == map_to # Replace the map index variable with the value it obtained scope = graph.scope_subgraph(map_entry) scope.replace(map_param, map_from) # Redirect map entry's out edges. for edge in graph.out_edges(map_entry): path = graph.memlet_path(edge) ind = path.index(edge) # Add an edge directly from the previous source connector to the # destination graph.add_edge(path[ind - 1].src, path[ind - 1].src_conn, edge.dst, edge.dst_conn, edge.data) # Redirect map exit's in edges. for edge in graph.in_edges(map_exit): path = graph.memlet_path(edge) ind = path.index(edge) # Add an edge directly from the source to the next destination # connector graph.add_edge(edge.src, edge.src_conn, path[ind + 1].dst, path[ind + 1].dst_conn, edge.data) # Clean-up graph.remove_nodes_from([map_entry, map_exit])
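# TrivialMapElimination usage sketch: a map whose single range contains one
# element is deleted, and its parameter is replaced by that value throughout
# the scope. Illustrative, assuming TrivialMapElimination is exported from
# dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import TrivialMapElimination


@dace.program
def fifth(A: dace.float64[10], B: dace.float64[10]):
    for i in dace.map[5:6]:
        B[i] = A[i]


sdfg = fifth.to_sdfg()
sdfg.apply_transformations(TrivialMapElimination)  # Map[i=5] removed, i -> 5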
class GPUTransformMap(transformation.Transformation): """ Implements the GPUTransformMap transformation. Converts a single map to a GPU-scheduled map and creates GPU arrays outside it, generating CPU<->GPU memory copies automatically. """ fullcopy = Property(desc="Copy whole arrays rather than used subset", dtype=bool, default=False) toplevel_trans = Property(desc="Make all GPU transients top-level", dtype=bool, default=False) register_trans = Property( desc="Make all transients inside GPU maps registers", dtype=bool, default=False) gpu_id = SymbolicProperty(default=None, allow_none=True, desc="Selects which gpu the map should run on") sequential_innermaps = Property(desc="Make all internal maps Sequential", dtype=bool, default=False) _map_entry = nodes.MapEntry(nodes.Map("", [], [])) import dace.libraries.standard as stdlib # Avoid import loop _reduce = stdlib.Reduce('lambda: None', None) @staticmethod def expressions(): return [ sdutil.node_path_graph(GPUTransformMap._map_entry), sdutil.node_path_graph(GPUTransformMap._reduce) ] @staticmethod def can_be_applied(graph, candidate, expr_index, sdfg, strict=False): if expr_index == 0: map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]] candidate_map = map_entry.map # Map schedules that are disallowed to transform to GPUs if (candidate_map.schedule in [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES): return False if sd.is_devicelevel_gpu(sdfg, graph, map_entry): return False # Dynamic map ranges cannot become kernels if sd.has_dynamic_map_inputs(graph, map_entry): return False # Ensure that map does not include internal arrays that are # allocated on non-default space subgraph = graph.scope_subgraph(map_entry) for node in subgraph.nodes(): if (isinstance(node, nodes.AccessNode) and node.desc(sdfg).storage != dtypes.StorageType.Default and node.desc(sdfg).storage != dtypes.StorageType.Register): return False # If one of the outputs is a stream, do not match map_exit = graph.exit_node(map_entry) for edge in graph.out_edges(map_exit): dst = graph.memlet_path(edge)[-1].dst if (isinstance(dst, nodes.AccessNode) and isinstance(sdfg.arrays[dst.data], data.Stream)): return False return True elif expr_index == 1: reduce = graph.nodes()[candidate[GPUTransformMap._reduce]] # Disallow GPU transformation if already in device-level code if sd.is_devicelevel_gpu(sdfg, graph, reduce): return False return True @staticmethod def match_to_str(graph, candidate): if GPUTransformMap._reduce in candidate: return str(graph.nodes()[candidate[GPUTransformMap._reduce]]) else: return str(graph.nodes()[candidate[GPUTransformMap._map_entry]]) def apply(self, sdfg): graph = sdfg.nodes()[self.state_id] if self.expr_index == 0: map_entry = graph.nodes()[self.subgraph[GPUTransformMap._map_entry]] nsdfg_node = helpers.nest_state_subgraph( sdfg, graph, graph.scope_subgraph(map_entry), full_data=self.fullcopy) else: cnode = graph.nodes()[self.subgraph[GPUTransformMap._reduce]] nsdfg_node = helpers.nest_state_subgraph(sdfg, graph, SubgraphView( graph, [cnode]), full_data=self.fullcopy) # Avoiding import loops from dace.transformation.interstate import GPUTransformSDFG transformation = GPUTransformSDFG(0, 0, {}, 0) transformation.register_trans = self.register_trans transformation.sequential_innermaps = self.sequential_innermaps transformation.toplevel_trans = self.toplevel_trans transformation.gpu_id = self.gpu_id transformation.apply(nsdfg_node.sdfg) # Inline back as necessary sdfg.apply_strict_transformations()
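# GPUTransformMap usage sketch: the matched map scope is nested into its own
# SDFG and GPUTransformSDFG is applied to it, turning the map into a GPU
# kernel with automatic host<->device copies. Illustrative, assuming
# GPUTransformMap is exported from dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import GPUTransformMap

N = dace.symbol('N')


@dace.program
def square(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = A[i] * A[i]


sdfg = square.to_sdfg()
sdfg.apply_transformations(GPUTransformMap)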
class BufferTiling(transformation.Transformation):
    """ Implements the buffer tiling transformation.

        BufferTiling tiles a buffer that is in between two maps, where the
        preceding map writes to the buffer and the succeeding map reads from
        it. It introduces additional computation in exchange for a reduced
        memory footprint. Commonly used to make use of shared memory on GPUs.
    """

    _map1_exit = nodes.MapExit(nodes.Map('', [], []))
    _array = nodes.AccessNode('')
    _map2_entry = nodes.MapEntry(nodes.Map('', [], []))

    tile_sizes = ShapeProperty(dtype=tuple,
                               default=(128, 128, 128),
                               desc="Tile size per dimension")

    # Returns a list of graphs that represent the pattern
    @staticmethod
    def expressions():
        return [
            sdutil.node_path_graph(
                BufferTiling._map1_exit,
                BufferTiling._array,
                BufferTiling._map2_entry,
            )
        ]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]

        for buf in graph.all_nodes_between(map1_exit, map2_entry):
            # Check that buffers are AccessNodes.
            if not isinstance(buf, nodes.AccessNode):
                return False

            # Check that buffers are transient.
            if not sdfg.arrays[buf.data].transient:
                return False

            # Check that buffers have exactly 1 input and 1 output edge.
            if graph.in_degree(buf) != 1:
                return False
            if graph.out_degree(buf) != 1:
                return False

            # Check that buffers are next to the maps.
            if graph.in_edges(buf)[0].src != map1_exit:
                return False
            if graph.out_edges(buf)[0].dst != map2_entry:
                return False

            # Check that the data consumed is provided.
            provided = graph.in_edges(buf)[0].data.subset
            consumed = graph.out_edges(buf)[0].data.subset
            if not provided.covers(consumed):
                return False

            # Check that the buffer's data occurs only once in this state
            # (compare data names, not the access node itself).
            num_occurrences = len([
                n for n in graph.nodes()
                if isinstance(n, nodes.AccessNode) and n.data == buf.data
            ])
            if num_occurrences > 1:
                return False
        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map1_exit = graph.nodes()[candidate[BufferTiling._map1_exit]]
        map2_entry = graph.nodes()[candidate[BufferTiling._map2_entry]]
        return " -> ".join(entry.map.label + ": " + str(entry.map.params)
                           for entry in [map1_exit, map2_entry])

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        map1_exit = graph.nodes()[self.subgraph[self._map1_exit]]
        map1_entry = graph.entry_node(map1_exit)
        map2_entry = graph.nodes()[self.subgraph[self._map2_entry]]
        buffers = graph.all_nodes_between(map1_exit, map2_entry)
        # Situation:
        # -> map1_entry -> ... -> map1_exit -> buffers -> map2_entry -> ...
lower_extents = tuple(b - a for a, b in zip( map1_entry.range.min_element(), map2_entry.range.min_element())) upper_extents = tuple(a - b for a, b in zip( map1_entry.range.max_element(), map2_entry.range.max_element())) # Tile the first map with overlap MapTilingWithOverlap.apply_to(sdfg, map_entry=map1_entry, options={ 'tile_sizes': self.tile_sizes, 'lower_overlap': lower_extents, 'upper_overlap': upper_extents }) tile_map1_exit = graph.out_edges(map1_exit)[0].dst tile_map1_entry = graph.entry_node(tile_map1_exit) tile_map1_entry.label = 'BufferTiling' # Tile the second map MapTiling.apply_to(sdfg, map_entry=map2_entry, options={ 'tile_sizes': self.tile_sizes, 'tile_trivial': True }) tile_map2_entry = graph.in_edges(map2_entry)[0].src # Fuse maps some_buffer = next( iter(buffers)) # some dummy to pass to MapFusion.apply_to() MapFusion.apply_to(sdfg, first_map_exit=tile_map1_exit, array=some_buffer, second_map_entry=tile_map2_entry) # Optimize the simple cases map1_entry.range.ranges = [ (r[0], r[0], r[2]) if l_ext == 0 and u_ext == 0 and ts == 1 else r for r, l_ext, u_ext, ts in zip(map1_entry.range.ranges, lower_extents, upper_extents, self.tile_sizes) ] map2_entry.range.ranges = [ (r[0], r[0], r[2]) if ts == 1 else r for r, ts in zip(map2_entry.range.ranges, self.tile_sizes) ] if any(ts == 1 for ts in self.tile_sizes): if any(r[0] == r[1] for r in map1_entry.map.range): TrivialMapElimination.apply_to(sdfg, _map_entry=map1_entry) if any(r[0] == r[1] for r in map2_entry.map.range): TrivialMapElimination.apply_to(sdfg, _map_entry=map2_entry)
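# BufferTiling usage sketch: a transient buffer written by one map and read
# by the next is tiled so that only one tile needs to be live at a time, at
# the cost of recomputing the overlap between tiles. The tile size and
# program are illustrative; assumes BufferTiling is exported from
# dace.transformation.dataflow and that the producer/consumer maps match the
# pattern after SDFG simplification.

import dace
from dace.transformation.dataflow import BufferTiling

N = dace.symbol('N')


@dace.program
def prodcons(A: dace.float64[N], C: dace.float64[N]):
    tmp = dace.define_local([N], dace.float64)
    for i in dace.map[0:N]:
        tmp[i] = 2.0 * A[i]
    for j in dace.map[0:N]:
        C[j] = tmp[j] + 1.0


sdfg = prodcons.to_sdfg()
sdfg.apply_transformations(BufferTiling, options={'tile_sizes': (32,)})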
class MPITransformMap(transformation.Transformation):
    """ Implements the MPI parallelization pattern.

        Takes a map and makes it an MPI-scheduled map, introducing transients
        that keep locally accessed data.

        Original SDFG
        =============
        ```
        Input1 -                                            Output1
                \\                                          /
        Input2 --- MapEntry -- Arbitrary R -- MapExit -- Output2
                /                                          \\
        InputN -                                            OutputN
        ```

        Nothing in R may access other inputs/outputs that are not defined in R
        itself and do not go through MapEntry/MapExit.
        The map must be one-dimensional for now.
        The range of the map must be a Range object.

        Output:
        =======
        * Add transients for the accessed parts
        * The schedule property of the map is set to MPI
        * The range of the map is changed to
          var = startexpr + p * chunksize .. startexpr + (p + 1) * chunksize
          where p is the current rank and P is the total number of ranks,
          and chunksize is defined as (endexpr - startexpr) / P, adding the
          remaining K iterations to the first K ranks
        * For each input InputI, create a new transient transInputI, which
          has an attribute that specifies that it needs to be filled with
          (possibly) remote data
        * Collect all accesses to InputI within R, assume their convex hull is
          InputI[rs ... re]
        * The transInputI transient will contain InputI[rs ... re]
        * Change all accesses to InputI within R to accesses to transInputI
    """

    _map_entry = nodes.MapEntry(nodes.Map("", [], []))

    @staticmethod
    def annotates_memlets():
        return True

    @staticmethod
    def expressions():
        return [sdutil.node_path_graph(MPITransformMap._map_entry)]

    @staticmethod
    def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
        map_entry = graph.nodes()[candidate[MPITransformMap._map_entry]]

        # Check if the map is one-dimensional
        if map_entry.map.range.dims() != 1:
            return False

        # We cannot transform a map which is already of schedule type MPI
        if map_entry.map.schedule == dtypes.ScheduleType.MPI:
            return False

        # We cannot transform a map which is already inside an MPI map, or
        # inside another device schedule
        schedule_whitelist = [
            dtypes.ScheduleType.Default, dtypes.ScheduleType.Sequential
        ]
        sdict = graph.scope_dict()
        parent = sdict[map_entry]
        while parent is not None:
            if parent.map.schedule not in schedule_whitelist:
                return False
            parent = sdict[parent]

        # Dynamic map ranges not supported (will allocate dynamic memory)
        if has_dynamic_map_inputs(graph, map_entry):
            return False

        # MPI schedules currently do not support WCR
        map_exit = graph.exit_node(map_entry)
        if any(e.data.wcr for e in graph.out_edges(map_exit)):
            return False

        return True

    @staticmethod
    def match_to_str(graph, candidate):
        map_entry = graph.nodes()[candidate[MPITransformMap._map_entry]]
        return map_entry.map.label

    def apply(self, sdfg):
        graph = sdfg.nodes()[self.state_id]
        map_entry = graph.nodes()[self.subgraph[MPITransformMap._map_entry]]

        # Avoiding import loops
        from dace.transformation.dataflow.strip_mining import StripMining
        from dace.transformation.dataflow.local_storage import LocalStorage

        rangeexpr = str(map_entry.map.range.num_elements())

        stripmine_subgraph = {
            StripMining._map_entry: self.subgraph[MPITransformMap._map_entry]
        }
        sdfg_id = sdfg.sdfg_id
        stripmine = StripMining(sdfg_id, self.state_id, stripmine_subgraph,
                                self.expr_index)
        stripmine.dim_idx = -1
        stripmine.new_dim_prefix = "mpi"
        stripmine.tile_size = "(" + rangeexpr + "/__dace_comm_size)"
        stripmine.divides_evenly = True
        stripmine.apply(sdfg)

        # Find all in-edges that lead to the matched map entry
        edges = [
            e for e in graph.in_edges(map_entry)
            if isinstance(e.src, nodes.EntryNode)
        ]
        outer_map = edges[0].src

        # Add MPI schedule attribute to the outer map
        outer_map.map._schedule = dtypes.ScheduleType.MPI

        # Now create a transient for each array
        for e in edges:
            in_local_storage_subgraph = {
                LocalStorage.node_a: graph.node_id(outer_map),
                LocalStorage.node_b: self.subgraph[MPITransformMap._map_entry]
            }
            sdfg_id = sdfg.sdfg_id
            in_local_storage = LocalStorage(sdfg_id, self.state_id,
                                            in_local_storage_subgraph,
                                            self.expr_index)
            in_local_storage.array = e.data.data
            in_local_storage.apply(sdfg)

        # Transform OutLocalStorage for each output of the MPI map
        in_map_exit = graph.exit_node(map_entry)
        out_map_exit = graph.exit_node(outer_map)

        for e in graph.out_edges(out_map_exit):
            name = e.data.data
            outlocalstorage_subgraph = {
                LocalStorage.node_a: graph.node_id(in_map_exit),
                LocalStorage.node_b: graph.node_id(out_map_exit)
            }
            sdfg_id = sdfg.sdfg_id
            outlocalstorage = LocalStorage(sdfg_id, self.state_id,
                                           outlocalstorage_subgraph,
                                           self.expr_index)
            outlocalstorage.array = name
            outlocalstorage.apply(sdfg)
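# MPITransformMap usage sketch: the one-dimensional map is strip-mined by the
# number of ranks (__dace_comm_size) and the resulting outer map is scheduled
# as MPI, with local transients for the data each rank touches. Illustrative,
# assuming MPITransformMap is exported from dace.transformation.dataflow.

import dace
from dace.transformation.dataflow import MPITransformMap

N = dace.symbol('N')


@dace.program
def halve(A: dace.float64[N], B: dace.float64[N]):
    for i in dace.map[0:N]:
        B[i] = 0.5 * A[i]


sdfg = halve.to_sdfg()
sdfg.apply_transformations(MPITransformMap)  # outer map schedule becomes MPI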