def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
    """Check whether the matched map can be converted to an MPI-scheduled map.

    Accepts only one-dimensional maps that are not already MPI-scheduled,
    are not nested inside a device/MPI scope, have no dynamic map inputs,
    and produce no write-conflict-resolution (WCR) outputs.
    """
    map_entry = graph.nodes()[candidate[MPITransformMap._map_entry]]

    # Only one-dimensional maps are supported
    if map_entry.map.range.dims() != 1:
        return False

    # Already an MPI map — nothing to transform
    if map_entry.map.schedule == dtypes.ScheduleType.MPI:
        return False

    # Reject maps nested inside an MPI map or any other device schedule
    allowed_schedules = (dtypes.ScheduleType.Default,
                         dtypes.ScheduleType.Sequential)
    sdict = graph.scope_dict()
    outer = sdict[map_entry]
    while outer is not None:
        if outer.map.schedule not in allowed_schedules:
            return False
        outer = sdict[outer]

    # Dynamic map ranges not supported (would allocate dynamic memory)
    if has_dynamic_map_inputs(graph, map_entry):
        return False

    # MPI schedules currently do not support WCR on map outputs
    map_exit = graph.exit_node(map_entry)
    return not any(e.data.wcr for e in graph.out_edges(map_exit))
def can_be_applied(graph: SDFGState, candidate, expr_index, sdfg, strict=False):
    """Check whether the matched map can be distributed over multiple GPUs.

    Requires at least two configured GPUs, a default-scheduled map (with all
    enclosing scopes also default), no dynamic map inputs, no library nodes
    inside the scope, no custom WCR reductions on the outputs, and access-node
    neighbors whose storage type is supported.
    """
    map_entry = graph.nodes()[candidate[GPUMultiTransformMap._map_entry]]

    # At least two GPUs must be available
    if Config.get("compiler", "cuda", "max_number_gpus") < 2:
        return False

    # Dynamic map ranges not supported
    if has_dynamic_map_inputs(graph, map_entry):
        return False

    # The enclosing scopes (and the map itself, via its parents) must all
    # use the default schedule
    sdict = graph.scope_dict()
    outer = sdict[map_entry]
    while outer is not None:
        if outer.map.schedule not in (dtypes.ScheduleType.Default,):
            return False
        outer = sdict[outer]

    # Library nodes inside the scope are not supported
    if any(isinstance(n, nodes.LibraryNode)
           for n in graph.scope_subgraph(map_entry).nodes()):
        return False

    # Custom reductions cannot use an accumulate transient: the reduction
    # would have to be split between the ingoing and outgoing memlets of the
    # transient, and skipping the transient only works for small data volumes.
    map_exit = graph.exit_node(map_entry)
    for edge in graph.out_edges(map_exit):
        wcr = edge.data.wcr
        if wcr is not None and (operations.detect_reduction_type(wcr)
                                == dtypes.ReductionType.Custom):
            return False

    supported_storage = [
        dtypes.StorageType.Default,
        dtypes.StorageType.CPU_Pinned,
        dtypes.StorageType.CPU_Heap,
        dtypes.StorageType.GPU_Global,
    ]
    # Every direct input/output must be an access node in supported storage
    neighbors = list(graph.predecessors(map_entry)) + \
        list(graph.successors(map_exit))
    for neighbor in neighbors:
        if not isinstance(neighbor, nodes.AccessNode):
            return False
        if neighbor.desc(graph).storage not in supported_storage:
            return False

    return True
def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
    """Check whether the matched map (or reduction) can be moved to the GPU.

    For the map pattern (``expr_index == 0``): rejects nested maps in
    non-permissive mode, disallowed schedules, dynamic map inputs, enclosing
    GPU scopes, internal arrays on non-default storage, and stream outputs.
    For the reduction pattern (``expr_index == 1``): only rejects nodes that
    are already inside a GPU-scheduled scope.
    """
    gpu_scoped = (dtypes.ScheduleType.GPU_Device,
                  dtypes.ScheduleType.GPU_ThreadBlock)

    if expr_index == 0:
        map_entry = self.map_entry
        schedule = map_entry.map.schedule

        # Nested maps are only transformed in permissive mode
        if not permissive and graph.entry_node(map_entry) is not None:
            return False

        # Schedules that may not be transformed to GPU kernels
        if schedule in (dtypes.ScheduleType.MPI,
                        dtypes.ScheduleType.GPU_Device,
                        dtypes.ScheduleType.GPU_ThreadBlock,
                        dtypes.ScheduleType.Sequential):
            return False

        # Dynamic map ranges cannot become kernels
        if sd.has_dynamic_map_inputs(graph, map_entry):
            return False

        # Walk up the scope tree: no ancestor may already be GPU-scheduled
        sdict = graph.scope_dict()
        scope_node = map_entry
        while scope_node is not None:
            if scope_node.map.schedule in gpu_scoped:
                return False
            scope_node = sdict[scope_node]

        # Internal arrays must be allocated on default or register storage
        for inner in graph.scope_subgraph(map_entry).nodes():
            if not isinstance(inner, nodes.AccessNode):
                continue
            storage = inner.desc(sdfg).storage
            if storage not in (dtypes.StorageType.Default,
                               dtypes.StorageType.Register):
                return False

        # If one of the outputs is a stream, do not match
        map_exit = graph.exit_node(map_entry)
        for edge in graph.out_edges(map_exit):
            sink = graph.memlet_path(edge)[-1].dst
            if (isinstance(sink, nodes.AccessNode)
                    and isinstance(sdfg.arrays[sink.data], data.Stream)):
                return False

        return True

    elif expr_index == 1:
        # Reduction node: only the enclosing scopes need checking
        sdict = graph.scope_dict()
        scope_node = sdict[self.reduce]
        while scope_node is not None:
            if scope_node.map.schedule in gpu_scoped:
                return False
            scope_node = sdict[scope_node]

        return True
def can_be_applied(graph, candidate, expr_index, sdfg, strict=False):
    """Check whether the matched map (or reduction) can become a GPU kernel.

    Rejects MPI/GPU-scheduled candidates, candidates already inside
    device-level code, dynamic map inputs, internal arrays on non-default
    storage, and stream outputs.
    """
    # Schedules that may not be transformed to GPU kernels
    forbidden_schedules = [dtypes.ScheduleType.MPI] + dtypes.GPU_SCHEDULES

    if expr_index == 0:
        map_entry = graph.nodes()[candidate[GPUTransformMap._map_entry]]

        if map_entry.map.schedule in forbidden_schedules:
            return False
        # Already inside device-level code
        if sd.is_devicelevel(sdfg, graph, map_entry):
            return False
        # Dynamic map ranges cannot become kernels
        if sd.has_dynamic_map_inputs(graph, map_entry):
            return False

        # Internal arrays must be allocated on default or register storage
        for inner in graph.scope_subgraph(map_entry).nodes():
            if not isinstance(inner, nodes.AccessNode):
                continue
            storage = inner.desc(sdfg).storage
            if (storage != dtypes.StorageType.Default
                    and storage != dtypes.StorageType.Register):
                return False

        # If one of the outputs is a stream, do not match
        map_exit = graph.exit_nodes(map_entry)[0]
        for edge in graph.out_edges(map_exit):
            sink = graph.memlet_path(edge)[-1].dst
            if (isinstance(sink, nodes.AccessNode)
                    and isinstance(sdfg.arrays[sink.data], data.Stream)):
                return False

        return True

    elif expr_index == 1:
        reduce_node = graph.nodes()[candidate[GPUTransformMap._reduce]]

        if reduce_node.schedule in forbidden_schedules:
            return False
        if sd.is_devicelevel(sdfg, graph, reduce_node):
            return False

        return True
def can_be_applied(graph: dace.SDFGState, candidate: Dict[Any, int],
                   expr_index: int, sdfg: dace.SDFG, strict=False):
    """Check whether a one-dimensional map can be nested into the stencil.

    The map must be single-parameter, have no dynamic inputs, feed only the
    stencil node, and its parameter must appear in exactly one (consistent)
    dimension of every 3D memlet subset touching the stencil. The stencil's
    shape must be 1 along that dimension.
    """
    map_entry: nodes.MapEntry = graph.node(candidate[NestK._map_entry])
    stencil: Stencil = graph.node(candidate[NestK._stencil])

    if len(map_entry.map.params) != 1:
        return False
    if sd.has_dynamic_map_inputs(graph, map_entry):
        return False

    pname = map_entry.map.params[0]  # Usually "k"
    # Dimension index in which `pname` appears; None until found
    dim_index = None

    # The map must feed only the stencil node
    for edge in graph.out_edges(map_entry):
        if edge.dst != stencil:
            return False

    # Scan every memlet touching the stencil for occurrences of `pname`
    for edge in graph.all_edges(stencil):
        if edge.data.data is None:  # Empty memlet
            continue
        # TODO: Use bitmap to verify lower-dimensional arrays
        if len(edge.data.subset) == 3:
            for i, rng in enumerate(edge.data.subset.ndrange()):
                for r in rng:
                    if pname in map(str, r.free_symbols):
                        if dim_index is not None and dim_index != i:
                            # k dimension must match in all memlets
                            return False
                        if str(r) != pname:
                            # Warn (but do not reject) when the subset uses a
                            # nontrivial symbolic expression of `pname`
                            if symbolic.issymbolic(
                                    r - symbolic.symbol(pname),
                                    sdfg.constants):
                                warnings.warn('k expression is nontrivial')
                        dim_index = i

    # No nesting dimension found
    if dim_index is None:
        return False

    # Ensure the stencil shape is 1 for the found dimension
    if stencil.shape[dim_index] != 1:
        return False

    return True
def can_be_applied(self, graph, expr_index, sdfg, permissive=False):
    """Check whether the map's contents can be fissioned into components.

    Handles two patterns: a map with an inline subgraph (``expr_index == 0``)
    and a map containing a single nested SDFG (``expr_index == 1``). The
    subgraph(s) must contain multiple computational components separated by
    transient "border" arrays that are used nowhere else.
    """
    map_node = self.map_entry
    nsdfg_node = None

    # If the map is dynamic-ranged, the resulting border arrays would be
    # dynamically sized
    if sd.has_dynamic_map_inputs(graph, map_node):
        return False

    if expr_index == 0:  # Map with subgraph
        subgraphs = [
            graph.scope_subgraph(map_node,
                                 include_entry=False,
                                 include_exit=False)
        ]
    else:  # Map with nested SDFG
        nsdfg_node = self.nested_sdfg
        # Make sure there are no other internal nodes in the map
        if len(set(e.dst for e in graph.out_edges(map_node))) > 1:
            return False
        subgraphs = list(nsdfg_node.sdfg.nodes())

    # Test subgraphs
    border_arrays = set()
    total_components = []
    for sg in subgraphs:
        components = self._components(sg)
        snodes = sg.nodes()
        # Test that the subgraphs have more than one computational component
        if expr_index == 0 and len(snodes) > 0 and len(components) <= 1:
            return False
        # Test that the components are connected by transients that are not
        # used anywhere else
        border_arrays |= self._border_arrays(
            nsdfg_node.sdfg if expr_index == 1 else sdfg,
            sg if expr_index == 1 else graph, sg)
        total_components.append(components)

        # In nested SDFGs and subgraphs, ensure none of the border
        # values are non-transients
        for array in border_arrays:
            if expr_index == 0:
                ndesc = sdfg.arrays[array]
            else:
                ndesc = nsdfg_node.sdfg.arrays[array]
            if ndesc.transient is False:
                return False

        # In subgraphs, make sure transients are not used/allocated
        # in other scopes or states
        if expr_index == 0:
            # Find all nodes not in subgraph
            not_subgraph = set(
                n.data for n in graph.nodes()
                if n not in snodes and isinstance(n, nodes.AccessNode))
            not_subgraph.update(
                set(n.data for s in sdfg.nodes() if s != graph
                    for n in s.nodes() if isinstance(n, nodes.AccessNode)))

            for _, component_out in components:
                for e in sg.out_edges(component_out):
                    if isinstance(e.dst, nodes.AccessNode):
                        if e.dst.data in not_subgraph:
                            return False

    # Fail if there are arrays inside the map that are not a direct
    # output of a computational component
    # TODO(later): Support this case? Ambiguous array sizes and memlets
    external_arrays = (
        border_arrays -
        self._internal_border_arrays(total_components, subgraphs))
    if len(external_arrays) > 0:
        return False

    return True
# Collect per-map training features (free symbols and parameters) from a set
# of SDFG files, tracking the maximum counts seen across all maps.
max_params = 0
# NOTE(review): `max_free_symbols` is read below but not initialized in this
# chunk — presumably defined earlier in the file; confirm before running.
for file in tqdm(paths):
    try:
        sdfg = dace.SDFG.from_file(file)
    except Exception:  # was a bare `except:` — would swallow KeyboardInterrupt
        print("Not Valid SDFG at: " + str(file))
        continue

    opt = Optimizer(sdfg)
    # Map entries matched by the Vectorization transformation pattern
    vectorization_map_entry = [
        i.query_node(sdfg.sdfg_list[i.sdfg_id], i._map_entry)
        for i in opt.get_pattern_matches(patterns=[Vectorization])
    ]

    for node, state in sdfg.all_nodes_recursive():
        if isinstance(node, MapEntry):
            dic_training = {}
            # Skip maps with dynamic inputs
            if has_dynamic_map_inputs(state, node):
                continue
            # NOTE(review): assumes the map has at least one out edge and that
            # its first consumer is the tasklet of interest — confirm.
            tasklet = state.out_edges(node)[0].dst
            if not isinstance(tasklet, Tasklet):
                continue

            # Gather the free symbols of the tasklet's memlets and the map
            free_symbols = set()
            for memlet in state.in_edges(tasklet) + state.out_edges(tasklet):
                free_symbols.update(memlet.data.free_symbols)
            free_symbols.update(node.free_symbols)

            dic_training["Free_symbols"] = free_symbols
            dic_training["Params"] = node.params
            max_free_symbols = max(max_free_symbols, len(free_symbols))
            max_params = max(max_params, len(node.params))