def optimize_for_cpu(sdfg: dace.SDFG, m: int, n: int, k: int): """ Optimize the matrix multiplication example for multi-core CPUs. """ # Ensure integers are 32-bit by default dace.Config.set('compiler', 'default_data_types', value='C') # Fuse the map and reduce nodes sdfg.apply_transformations(MapReduceFusion) # Find multiplication map entry = find_map_by_param(sdfg, 'k') # Create a tiling strategy divides_evenly = (m % 32 == 0) and (n % 32 == 0) and (k % 256 == 0) xfutil.tile(sdfg, entry, divides_evenly, False, k=256, i=32, j=32) xfutil.tile(sdfg, entry, divides_evenly, divides_evenly, j=16, i=4) # Reorder internal map to "k,i,j" xfutil.permute_map(entry, [2, 0, 1]) # Add local storage for B in j tile: we apply InLocalStorage with a # parameter "array" named B, between the two maps of j and i regtile_j = find_map_by_param(sdfg, 'tile1_j') regtile_i = find_map_by_param(sdfg, 'tile1_i') InLocalStorage.apply_to(sdfg, dict(array='B'), node_a=regtile_j, node_b=regtile_i) if divides_evenly: # Add local storage for C exit_inner = find_mapexit_by_param(sdfg, 'k') exit_rti = find_mapexit_by_param(sdfg, 'tile1_i') AccumulateTransient.apply_to(sdfg, dict(array='C', identity=0), map_exit=exit_inner, outer_map_exit=exit_rti) # Vectorize microkernel map postamble = n % 4 != 0 entry_inner, inner_state = find_map_and_state_by_param(sdfg, 'k') Vectorization.apply_to(inner_state.parent, dict(vector_len=4, preamble=False, postamble=postamble), _map_entry=entry_inner) # Mark outer tile map as sequential to remove atomics find_map_by_param(sdfg, 'tile_k').map.schedule = dace.ScheduleType.Sequential # Collapse maps for more parallelism find_map_by_param(sdfg, 'o0').map.collapse = 2 tile_i = find_map_by_param(sdfg, 'tile_i') tile_j = find_map_by_param(sdfg, 'tile_j') MapCollapse.apply_to(sdfg, _outer_map_entry=tile_i, _inner_map_entry=tile_j) tile_ij = find_map_by_param(sdfg, 'tile_i') # Find newly created map tile_ij.map.schedule = dace.ScheduleType.CPU_Multicore tile_ij.map.collapse = 
2
def is_sdfg_equal(sdfg1: dace.SDFG, sdfg2: dace.SDFG):
    """Return True iff both SDFGs contain exactly one state each, those
    states are isomorphic (per ``edge_match``/``node_match``), and every
    array descriptor matches in exact type, dtype, transience and shape."""
    states1, states2 = sdfg1.states(), sdfg2.states()
    if len(states1) != 1 or len(states2) != 1:
        return False
    first, second = states1[0], states2[0]

    # SDFGState.nx does not contain any node info in the networkx node attrs
    # (but does for edges), so we add it here manually.
    for st in (first, second):
        nx.set_node_attributes(st.nx, {n: n for n in st.nx.nodes}, "node")

    if not nx.is_isomorphic(first.nx, second.nx, edge_match=edge_match, node_match=node_match):
        return False

    # Compare each data descriptor attribute-by-attribute. The mutual
    # isinstance check enforces *exact* type equality (not a subclass match).
    for name in sdfg1.arrays:
        desc1 = sdfg1.arrays[name]
        desc2 = sdfg2.arrays[name]
        same = (isinstance(desc1, type(desc2)) and isinstance(desc2, type(desc1))
                and desc1.dtype == desc2.dtype
                and desc1.transient == desc2.transient
                and desc1.shape == desc2.shape)
        if not same:
            return False
    return True
def op_repo_replacement(sdfg: SDFG, state: SDFGState, **kwargs):
    """Insert an ONNX operator node into ``state`` and wire it up.

    Keyword arguments are split by name into node attributes (matching the
    schema's attribute names), inputs and outputs (matching the schema's
    parameter names); each input/output is connected with a full-array memlet.

    :param sdfg: SDFG used to create memlets for the connected arrays.
    :param state: State that receives the new ONNX node and access nodes.
    :return: An empty list (frontend replacements return no values).
    """
    # Schema attributes become attributes of the ONNX node itself.
    attrs = {
        name: value
        for name, value in kwargs.items() if name in dace_schema.attributes
    }
    onnx_node = cls(name=cls_name, **attrs)
    state.add_node(onnx_node)

    input_names = {p.name for p in dace_schema.inputs}
    output_names = {p.name for p in dace_schema.outputs}
    inputs = {
        name: arr_name
        for name, arr_name in kwargs.items() if name in input_names
    }
    outputs = {
        name: arr_name
        for name, arr_name in kwargs.items() if name in output_names
    }

    for inp, arr_name in inputs.items():
        read = state.add_read(arr_name)
        state.add_edge(read, None, onnx_node, inp,
                       sdfg.make_array_memlet(arr_name))

    for outp, arr_name in outputs.items():
        # Bug fix: output access nodes must be created with add_write, not
        # add_read — the node *writes* these arrays.
        write = state.add_write(arr_name)
        state.add_edge(onnx_node, outp, write, None,
                       sdfg.make_array_memlet(arr_name))
    return []
def expand_reduce(sdfg: dace.SDFG,
                  graph: dace.SDFGState,
                  subgraph: Union[SubgraphView, List[SubgraphView]] = None,
                  **kwargs):
    """Expand every Reduce library node in the given subgraph(s).

    :param sdfg: SDFG containing the reductions.
    :param graph: State in which the reductions reside.
    :param subgraph: A subgraph view (or list of views) to search; defaults
                     to the whole state.
    :param kwargs: Properties forwarded onto the ReduceExpansion transformation.
    """
    subgraph = graph if not subgraph else subgraph
    if not isinstance(subgraph, list):
        subgraph = [subgraph]

    for sg in subgraph:
        reduce_nodes = []
        for node in sg.nodes():
            if isinstance(node, stdlib.Reduce):
                # Probe applicability before committing to the expansion.
                rexp = ReduceExpansion(sdfg, sdfg.sdfg_id, sdfg.node_id(graph),
                                       {ReduceExpansion.reduce: graph.node_id(node)}, 0)
                if not rexp.can_be_applied(graph, 0, sdfg):
                    # Message fix: add the missing space between the two halves.
                    print(f"WARNING: Cannot expand reduce node {node}: "
                          "can_be_applied() failed.")
                    continue
                reduce_nodes.append(node)

        trafo_reduce = ReduceExpansion(sdfg, sdfg.sdfg_id, sdfg.node_id(graph), {}, 0)
        # Idiom fix: do not shadow the builtin `property`.
        for prop, val in kwargs.items():
            setattr(trafo_reduce, prop, val)

        for reduce_node in reduce_nodes:
            trafo_reduce.expand(sdfg, graph, reduce_node)
            if isinstance(sg, SubgraphView):
                # Keep the subgraph view consistent with the rewritten state.
                sg.nodes().remove(reduce_node)
                sg.nodes().append(trafo_reduce._reduce)
                sg.nodes().append(trafo_reduce._outer_entry)
def mkc(sdfg: dace.SDFG,
        state_before,
        src_name,
        dst_name,
        src_storage=None,
        dst_storage=None,
        src_shape=None,
        dst_shape=None,
        copy_expr=None,
        src_loc=None,
        dst_loc=None):
    """ Helper MaKe_Copy that creates and appends states performing exactly
    one copy. If a provided arrayname already exists it will use the old
    array, and ignore all newly passed values.

    :param state_before: Predecessor state, or None to create a start state.
    :param src_loc/dst_loc: Optional (memorytype, bank) pairs for FPGA arrays.
    :return: Tuple of (new state, numpy mirror of src or None, numpy mirror
             of dst or None).
    """
    if copy_expr is None:
        copy_expr = src_name
    # Bug fix: compare to None by identity, not equality.
    if state_before is None:
        state = sdfg.add_state(is_start_state=True)
    else:
        state = sdfg.add_state_after(state_before)

    def mkarray(name, shape, storage, loc):
        # Reuse an existing descriptor if the array was already declared.
        if name in sdfg.arrays:
            return sdfg.arrays[name]
        # FPGA-storage arrays are created as transients.
        is_transient = storage in _FPGA_STORAGE_TYPES
        arr = sdfg.add_array(name, shape, dace.int32, storage, transient=is_transient)
        if loc is not None:
            arr[1].location["memorytype"] = loc[0]
            arr[1].location["bank"] = loc[1]
        return arr

    mkarray(src_name, src_shape, src_storage, src_loc)
    mkarray(dst_name, dst_shape, dst_storage, dst_loc)

    aAcc = state.add_access(src_name)
    bAcc = state.add_access(dst_name)
    state.add_edge(aAcc, None, bAcc, None, mem.Memlet(copy_expr))

    # Best-effort numpy mirrors; shapes may be symbolic, in which case numpy
    # cannot allocate and we return None for that side.
    a_np_arr, b_np_arr = None, None
    if src_shape is not None:
        # Bug fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        try:
            a_np_arr = np.zeros(src_shape, dtype=np.int32)
        except Exception:
            pass
    if dst_shape is not None:
        try:
            b_np_arr = np.zeros(dst_shape, dtype=np.int32)
        except Exception:
            pass
    return (state, a_np_arr, b_np_arr)
def _post_expand_trafos(sdfg: dace.SDFG):
    """Post-expansion cleanup: repeatedly inline nested SDFGs and fuse
    states until a fixed point, simplify, then fully collapse all maps."""
    progress = True
    while progress:
        # Short-circuits exactly like the original: state fusion is only
        # attempted when inlining made no progress this round.
        progress = bool(inline_sdfgs(sdfg) or fuse_states(sdfg))
    sdfg.simplify()
    # Collapse every map entry over all of its dimensions.
    for graph_node, _ in sdfg.all_nodes_recursive():
        if isinstance(graph_node, dace.nodes.MapEntry):
            graph_node.collapse = len(graph_node.range)
def assert_sdfg_equal(sdfg1: dace.SDFG, sdfg2: dace.SDFG):
    """Assert that two single-state SDFGs are structurally equal: isomorphic
    state graphs (with matching edges/nodes) and matching array descriptors.

    Raises AssertionError on any mismatch.
    """
    from gtc.dace.nodes import (
        HorizontalExecutionLibraryNode,
        OIRLibraryNode,
        VerticalLoopLibraryNode,
    )

    def edge_match(edge1, edge2):
        # networkx passes the multi-edge attribute dicts; compare the first
        # (and presumably only) parallel edge of each — TODO confirm there is
        # never more than one parallel edge here.
        edge1 = next(iter(edge1.values()))
        edge2 = next(iter(edge2.values()))
        try:
            if edge1["src_conn"] is not None:
                assert edge2["src_conn"] is not None
                assert edge1["src_conn"] == edge2["src_conn"]
            else:
                assert edge2["src_conn"] is None
            assert edge1["data"] == edge2["data"]
            assert edge1["data"].data == edge2["data"].data
        except AssertionError:
            return False
        return True

    def node_match(n1, n2):
        n1 = n1["node"]
        n2 = n2["node"]
        try:
            # NOTE(review): the TypeError raised for unexpected node types is
            # deliberately NOT caught below — it propagates and fails loudly.
            if not isinstance(
                n1, (dace.nodes.AccessNode, VerticalLoopLibraryNode, HorizontalExecutionLibraryNode)
            ):
                raise TypeError
            if isinstance(n1, dace.nodes.AccessNode):
                assert isinstance(n2, dace.nodes.AccessNode)
                assert n1.access == n2.access
                assert n1.data == n2.data
            elif isinstance(n1, OIRLibraryNode):
                assert n1 == n2
        except AssertionError:
            return False
        return True

    assert len(sdfg1.states()) == 1
    assert len(sdfg2.states()) == 1
    state1 = sdfg1.states()[0]
    state2 = sdfg2.states()[0]

    # SDFGState.nx does not contain any node info in the networkx node attrs (but does for edges),
    # so we add it here manually.
    nx.set_node_attributes(state1.nx, {n: n for n in state1.nx.nodes}, "node")
    nx.set_node_attributes(state2.nx, {n: n for n in state2.nx.nodes}, "node")
    assert nx.is_isomorphic(state1.nx, state2.nx, edge_match=edge_match, node_match=node_match)

    # Array descriptors: exact type (mutual isinstance), dtype, transience, shape.
    for name in sdfg1.arrays.keys():
        assert isinstance(sdfg1.arrays[name], type(sdfg2.arrays[name]))
        assert isinstance(sdfg2.arrays[name], type(sdfg1.arrays[name]))
        assert sdfg1.arrays[name].dtype == sdfg2.arrays[name].dtype
        assert sdfg1.arrays[name].transient == sdfg2.arrays[name].transient
        assert sdfg1.arrays[name].shape == sdfg2.arrays[name].shape
def generate_host_function_body(self, sdfg: dace.SDFG, state: dace.SDFGState, kernel_name: str, predecessors: list,
                                parameters: list, rtl_tasklet_names: list, kernel_stream: CodeIOStream,
                                instrumentation_stream: CodeIOStream):
    ''' Generate the host-specific code for spawning and synchronizing the given kernel.
        :param sdfg:
        :param state:
        :param predecessors: list containing all the name of kernels that must be finished before starting this one
        :param parameters: list containing the kernel parameters (of all kernels in this state)
        :param rtl_tasklet_names
        :param kernel_stream: Device-specific code stream
        :param instrumentation_stream: Code for profiling kernel execution time.
    '''
    # Build the host-side argument list; array parameters with HBM interfaces
    # expand into one pointer argument per (bank, interface) pair.
    kernel_args = []
    for _, name, p, interface_ids in parameters:
        if isinstance(p, dt.Array):
            for bank, _ in fpga.iterate_hbm_interface_ids(p, interface_ids):
                kernel_args.append(p.as_arg(False, name=fpga.fpga_ptr(name, p, sdfg, bank)))
        else:
            kernel_args.append(p.as_arg(False, name=name))

    kernel_function_name = kernel_name
    # NOTE(review): kernel_file_name is computed but never used below.
    kernel_file_name = "{}.xclbin".format(kernel_name)

    # Check if this kernel depends from other kernels
    needs_synch = len(predecessors) > 0

    if needs_synch:
        # Build a vector containing all the events associated with the kernels from which this one depends
        kernel_deps_name = f"deps_{kernel_name}"
        kernel_stream.write(f"std::vector<cl::Event> {kernel_deps_name};")
        for pred in predecessors:
            # concatenate events from predecessor kernel
            kernel_stream.write(f"{kernel_deps_name}.push_back({pred}_event);")

    # Launch HLS kernel, passing synchronization events (if any)
    kernel_stream.write(
        f"""\
auto {kernel_name}_kernel = program.MakeKernel({kernel_function_name}, "{kernel_function_name}", {", ".join(kernel_args)});
cl::Event {kernel_name}_event = {kernel_name}_kernel.ExecuteTaskFork({f'{kernel_deps_name}.begin(), {kernel_deps_name}.end()' if needs_synch else ''});
all_events.push_back({kernel_name}_event);""", sdfg, sdfg.node_id(state))
    if state.instrument == dtypes.InstrumentationType.FPGA:
        self.instrument_opencl_kernel(kernel_name, sdfg.node_id(state), sdfg.sdfg_id, instrumentation_stream)

    # Join RTL tasklets
    for name in rtl_tasklet_names:
        kernel_stream.write(f"kernel_{name}.wait();\n", sdfg, sdfg.node_id(state))
def optimize_for_gpu(sdfg: dace.SDFG, m: int, n: int, k: int):
    """ Optimize the matrix multiplication example for GPUs.

    Applies map/reduce fusion, GPU transformation, a block/thread tiling
    hierarchy, shared-memory and register local storage, and double buffering.

    :param sdfg: The matrix-multiplication SDFG, transformed in-place.
    :param m: Rows of A / C.
    :param n: Columns of B / C.
    :param k: Shared (contraction) dimension of A and B.
    """
    # Ensure integers are 32-bit by default
    dace.Config.set('compiler', 'default_data_types', value='C')

    # Fuse the map and reduce nodes
    sdfg.apply_transformations(MapReduceFusion)

    # Apply GPU transformation
    sdfg.apply_gpu_transformations()

    # Find multiplication map
    entry = find_map_by_param(sdfg, 'k')

    # Create a tiling strategy
    divides_evenly = (m % 64 == 0) and (n % 64 == 0) and (k % 8 == 0)
    xfutil.tile(sdfg, entry, divides_evenly, True, i=64, j=64, k=8)
    xfutil.tile(sdfg, entry, divides_evenly, True, i=8, j=4)

    # Create kernel schedule by collapsing and reordering maps
    gtile_i = find_map_by_param(sdfg, 'tile_i')
    gtile_j = find_map_by_param(sdfg, 'tile_j')
    btile_i = find_map_by_param(sdfg, 'tile1_i')
    btile_j = find_map_by_param(sdfg, 'tile1_j')
    MapCollapse.apply_to(sdfg, outer_map_entry=gtile_i, inner_map_entry=gtile_j, permissive=True)
    MapCollapse.apply_to(sdfg, outer_map_entry=btile_i, inner_map_entry=btile_j, permissive=True)
    btile = find_map_by_param(sdfg, 'tile1_i')
    btile.map.schedule = dace.ScheduleType.GPU_ThreadBlock

    # Add local storage (shared memory) for A and B on GPU
    ktile = find_map_by_param(sdfg, 'tile_k')
    smem_a = InLocalStorage.apply_to(sdfg, dict(array='A'), node_a=ktile, node_b=btile)
    smem_b = InLocalStorage.apply_to(sdfg, dict(array='B'), node_a=ktile, node_b=btile)
    sdfg.arrays[smem_a.data].storage = dace.StorageType.GPU_Shared
    sdfg.arrays[smem_b.data].storage = dace.StorageType.GPU_Shared

    # Add local storage (registers) for A and B
    ttile = find_map_by_param(sdfg, 'k')
    warptile, ttile = xfutil.extract_map_dims(sdfg, ttile, [2])
    InLocalStorage.apply_to(sdfg, dict(array='trans_gpu_A'), node_a=warptile, node_b=ttile)
    InLocalStorage.apply_to(sdfg, dict(array='trans_gpu_B'), node_a=warptile, node_b=ttile)

    # Add local storage (registers) for C
    state = next(s for s in sdfg.nodes() if warptile in s.nodes())
    warptile_exit = state.exit_node(warptile)
    btile_exit = state.exit_node(btile)
    AccumulateTransient.apply_to(sdfg, map_exit=warptile_exit, outer_map_exit=btile_exit)
    # Set C tile to zero on allocation
    c_access = next(n for n in state.data_nodes() if n.data == 'trans_gpu_C')
    c_access.setzero = True

    # Unroll microkernel maps
    ttile.map.unroll = True

    # Apply double-buffering on shared memory
    DoubleBuffering.apply_to(sdfg, map_entry=ktile, transient=smem_a)
def op_repo_replacement(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, **kwargs):
    """Insert an ONNX operator node into ``state`` and wire it up.

    Keyword arguments are consumed in three passes — schema attributes, then
    (possibly variadic) inputs, then (possibly variadic) outputs; any
    leftover keyword is rejected. Each input/output is connected with a
    full-array memlet and a matching connector.

    :param pv: The program visitor (unused here, part of the replacement API).
    :param sdfg: SDFG used to create memlets for the connected arrays.
    :param state: State that receives the new ONNX node and access nodes.
    :raises TypeError: If unknown keyword arguments remain after matching.
    :return: An empty list (frontend replacements return no values).
    """
    attrs = {
        name: value
        for name, value in kwargs.items() if name in dace_schema.attributes
    }
    # remove used attrs
    kwargs = {k: v for k, v in kwargs.items() if k not in attrs}

    onnx_node = cls(name=cls_name, **attrs)
    state.add_node(onnx_node)

    input_names = dace_schema.non_variadic_inputs()
    variadic_inputs = dace_schema.variadic_inputs()
    output_names = dace_schema.non_variadic_outputs()
    variadic_outputs = dace_schema.variadic_outputs()

    inputs = {
        name: arr_name
        for name, arr_name in kwargs.items()
        if (name in input_names or
            # variadic params
            ("__" in name and parse_variadic_param(name)[0] in variadic_inputs))
    }
    kwargs = {k: v for k, v in kwargs.items() if k not in inputs}

    outputs = {
        name: arr_name
        for name, arr_name in kwargs.items()
        if (name in output_names or
            # variadic params
            ("__" in name and parse_variadic_param(name)[0] in variadic_outputs))
    }
    kwargs = {k: v for k, v in kwargs.items() if k not in outputs}

    if len(kwargs) > 0:
        raise TypeError(f"Unknown arguments {', '.join(kwargs)}")

    for inp, arr_name in inputs.items():
        read = state.add_read(arr_name)
        state.add_edge(read, None, onnx_node, inp, sdfg.make_array_memlet(arr_name))
        onnx_node.add_in_connector(inp)

    for outp, arr_name in outputs.items():
        # Bug fix: output access nodes must be created with add_write, not
        # add_read — the node *writes* these arrays.
        write = state.add_write(arr_name)
        state.add_edge(onnx_node, outp, write, None, sdfg.make_array_memlet(arr_name))
        onnx_node.add_out_connector(outp)
    return []
def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream,
                  codegen: 'DaCeCodeGenerator'):
    """At the start of code generation for the top-level SDFG, register a
    DataSerializer in the state struct and initialize it to the build
    folder's ``data`` directory."""
    # Only act on the top-level SDFG, not on nested ones.
    if sdfg.parent is not None:
        return
    self.codegen = codegen
    # Normalize to forward slashes so the generated C++ string is valid on Windows.
    data_dir = os.path.join(sdfg.build_folder, 'data')
    path = os.path.abspath(data_dir).replace('\\', '/')
    codegen.statestruct.append('dace::DataSerializer *serializer;')
    sdfg.append_init_code(f'__state->serializer = new dace::DataSerializer("{path}");\n')
def on_sdfg_begin(self, sdfg: SDFG, local_stream: CodeIOStream, global_stream: CodeIOStream,
                  codegen: 'DaCeCodeGenerator'):
    """At the start of code generation for the top-level SDFG, register a
    DataSerializer (with an empty folder, set later at runtime) and emit the
    setter method that controls the serializer's input."""
    # Only act on the top-level SDFG, not on nested ones.
    if sdfg.parent is not None:
        return
    self.codegen = codegen
    codegen.statestruct.append('dace::DataSerializer *serializer;')
    sdfg.append_init_code(f'__state->serializer = new dace::DataSerializer("");\n')

    # Add method that controls serializer input
    global_stream.write(self._generate_report_setter(sdfg))
def apply(self, state: SDFGState, sdfg: SDFG):
    # Remove candidate output connectors of a nested SDFG (candidates are
    # presumably outputs detected as removable by self._candidates — TODO
    # confirm against the transformation's can_be_applied).
    nsdfg = self.nsdfg
    candidates, candidate_nodes = self._candidates(nsdfg)
    # Outer level: detach each candidate's memlet path and drop the outer
    # data container it wrote to.
    for outer_edge in state.out_edges(nsdfg):
        if outer_edge.src_conn in candidates:
            state.remove_memlet_path(outer_edge)
            sdfg.remove_data(outer_edge.data.data, validate=False)
    # Inner level: disconnect the writes feeding each candidate access node.
    for nstate, node in candidate_nodes:
        for ie in nstate.in_edges(node):
            nstate.remove_memlet_path(ie)
    # Finally remove the candidate containers from the nested SDFG itself.
    for cand in candidates:
        nsdfg.sdfg.remove_data(cand, validate=False)
def _specialize_transient_strides(sdfg: dace.SDFG, layout_map):
    """Replace the stride symbols of all transient arrays according to
    ``layout_map``, propagate the replacement into nested-SDFG symbol
    mappings, and drop the now-unused symbols from the SDFG."""
    transients = [arr for arr in sdfg.arrays.values() if arr.transient]
    repldict = replace_strides(transients, layout_map)
    sdfg.replace_dict(repldict)
    # Nested SDFGs carry their own symbol mappings; rewrite those too.
    for state in sdfg.nodes():
        for node in state.nodes():
            if not isinstance(node, dace.nodes.NestedSDFG):
                continue
            for sym, replacement in repldict.items():
                if sym in node.symbol_mapping:
                    node.symbol_mapping[sym] = replacement
    # The replaced stride symbols are no longer referenced — remove them.
    for sym in repldict:
        if sym in sdfg.symbols:
            sdfg.remove_symbol(sym)
def validate_oir_sdfg(sdfg: dace.SDFG):
    """Validate that ``sdfg`` is a well-formed OIR-level SDFG: only states,
    access nodes and vertical-loop library nodes, and only Array data whose
    dtype maps to a valid gt4py DataType.

    :raises ValueError: If either condition is violated.
    """
    from gtc.dace.nodes import VerticalLoopLibraryNode

    sdfg.validate()

    allowed_nodes = (dace.SDFGState, dace.nodes.AccessNode, VerticalLoopLibraryNode)
    node_types_ok = all(isinstance(n, allowed_nodes) for n, _ in sdfg.all_nodes_recursive())

    def _array_ok(array):
        return (isinstance(array, dace.data.Array)
                and typestr_to_data_type(dace_dtype_to_typestr(array.dtype)) != DataType.INVALID)

    arrays_ok = all(_array_ok(array) for array in sdfg.arrays.values())

    if not (node_types_ok and arrays_ok):
        raise ValueError("Not a valid OIR-level SDFG")
def apply_pass(self, sdfg: SDFG, _) -> Optional[Set[SDFGState]]:
    """
    Removes unreachable states throughout an SDFG.

    :param sdfg: The SDFG to modify.
    :param pipeline_results: If in the context of a ``Pipeline``, a dictionary that is populated with prior Pass
                             results as ``{Pass subclass name: returned object from pass}``. If not run in a
                             pipeline, an empty dictionary is expected.
    :return: A set of the removed states, or None if nothing was changed.
    """
    # Find dead states, then delete them from the SDFG in one call.
    dead_states = self.find_dead_states(sdfg, set_unconditional_edges=True)
    sdfg.remove_nodes_from(dead_states)
    return dead_states if dead_states else None
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """Generate code for a map scope by fully unrolling it: one brace-scoped
    code block is emitted per point of the map's iteration domain, with the
    map parameters defined as compile-time constants."""
    entry_node: nd.MapEntry = scope.source_nodes()[0]
    # Materialize every map dimension into an explicit list of index values.
    index_list = []
    for begin, end, stride in entry_node.map.range:
        l = []
        while begin <= end:
            l.append(begin)
            begin += stride
        index_list.append(l)

    # Snapshot the constants so the per-iteration constants added below can
    # be rolled back once unrolling is done.
    sdfgconsts = sdfg.constants_prop
    sdfg.constants_prop = copy.deepcopy(sdfg.constants_prop)

    mapsymboltypes = entry_node.new_symbols(sdfg, scope, [entry_node.map.params])
    # Cartesian product over all dimensions = one full unrolled iteration each.
    for indices in product(*index_list):
        callsite_stream.write('{')
        nsdfg_unroll_info = None
        for param, index in zip(entry_node.map.params, indices):
            # Prepare nested SDFGs for this unrolled iteration (only once per
            # iteration; subsequent params reuse the prepared state).
            if nsdfg_unroll_info is None:
                nsdfg_unroll_info = self.nsdfg_prepare_unroll(scope, str(param), str(index))
            else:
                self.nsdfg_prepare_unroll(scope, str(param), str(index))
            callsite_stream.write(
                f"constexpr {mapsymboltypes[param]} {param} = "
                f"{dace.codegen.targets.common.sym2cpp(index)};\n", sdfg)
            # Also register the parameter value as an SDFG constant so inner
            # code generation can fold it.
            sdfg.add_constant(param, int(index))
        callsite_stream.write('{')
        self._dispatcher.dispatch_subgraph(
            sdfg,
            scope,
            state_id,
            function_stream,
            callsite_stream,
            skip_entry_node=True,
            skip_exit_node=True,
        )
        callsite_stream.write('}')
        callsite_stream.write('}')
        self.nsdfg_after_unroll(nsdfg_unroll_info)

    # Restore the original constants.
    sdfg.constants_prop = sdfgconsts
def program_for_node(program, sdfg: SDFG, state: SDFGState, node: onnx_op.ONNXOp) -> SDFG:
    """ Expand a function to a dace program.

    The dtypes for the arguments will be extracted by matching the parameter
    names to edges.

    :param program: The python function to expand.
    :param sdfg: Parent SDFG of the node being expanded.
    :param state: State containing the node.
    :param node: The ONNX op whose schema drives the parameter matching.
    :raises ValueError: If an argument is both an input and an output, or if
                        a parameter matches neither inputs nor outputs.
    :return: The expanded (and possibly GPU-transformed) SDFG.
    """
    input_names = node.schema.non_variadic_inputs()
    variadic_input_names = node.schema.variadic_inputs()
    output_names = node.schema.non_variadic_outputs()
    variadic_output_names = node.schema.variadic_outputs()

    # Bug fix: the original error path called `next(input_names.intersection(...))`,
    # which fails (lists have no .intersection, and `next` requires an iterator).
    overlap = set(input_names).intersection(output_names)
    if overlap:
        # this is currently the case for only one onnx op
        raise ValueError("program_for_node cannot be applied on nodes of this type;"
                         " '{}' is both an input and an output".format(next(iter(overlap))))

    params = inspect.signature(program).parameters

    # Annotate each parameter with the matching edge's data descriptor.
    annotations = {}
    for name, param in params.items():
        if name in input_names or ("__" in name and parse_variadic_param(name)[0] in variadic_input_names):
            annotations[name] = in_desc_with_name(node, state, sdfg, name)
        elif name in output_names or ("__" in name and parse_variadic_param(name)[0] in variadic_output_names):
            annotations[name] = out_desc_with_name(node, state, sdfg, name)
        else:
            raise ValueError("'{}' was not found as an input or output for {}".format(name, node.schema.name))

    program.__annotations__ = annotations

    result = DaceProgram(program, (), {}, False, dace.DeviceType.CPU)
    result.name = node.label + "_expansion"

    sdfg = result.to_sdfg()

    if node.schedule in [dtypes.ScheduleType.GPU_Default] + dtypes.GPU_SCHEDULES:
        sdfg.apply_gpu_transformations()

    return sdfg
def get_out_memlet_costs(sdfg: dace.SDFG, state_id: int, node: nodes.Node, dfg: StateGraphView):
    """Estimate the byte cost of a code node's outgoing memlets to access
    nodes, for PAPI-based instrumentation. Only scalar (0-dim) memlets add to
    the estimate; WCR memlets are counted at triple cost."""
    scope_dict = sdfg.node(state_id).scope_dict()

    out_costs = 0
    for edge in dfg.out_edges(node):
        # NOTE(review): `v` is unpacked but never used below.
        _, uconn, v, _, memlet = edge
        dst_node = dfg.memlet_path(edge)[-1].dst
        if (isinstance(node, nodes.CodeNode) and isinstance(dst_node, nodes.AccessNode)):
            # If the memlet is pointing into an array in an inner scope,
            # it will be handled by the inner scope.
            if (scope_dict[node] != scope_dict[dst_node] and scope_contains_scope(scope_dict, node, dst_node)):
                continue
            if not uconn:
                # This would normally raise a syntax error
                return 0
            if memlet.subset.data_dims() == 0:
                if memlet.wcr is not None:
                    # write_and_resolve
                    # We have to assume that every reduction costs 3
                    # accesses of the same size (read old, read new, write)
                    out_costs += 3 * PAPIUtils.get_memlet_byte_size(sdfg, memlet)
                else:
                    # This standard operation is already counted
                    out_costs += PAPIUtils.get_memlet_byte_size(sdfg, memlet)
    return out_costs
def _dml_disambiguate_direction_dependent_views(sdfg: dace.SDFG):
    """ Consider the following subgraph:
        (A) -- y --> (n) -- x --> (C)

        In dace, if n is a View node and A and C are access nodes, and y and x
        both have data set to A.data and C.data respectively, the semantics of
        the graph depend on the order in which it is executed, i.e. reversing
        the subgraph doesn't perform as expected anymore.
        To disambiguate this case, we set y.data to the View's data.
    """
    for n, state in sdfg.all_nodes_recursive():
        # Only exact View descriptors (not subclasses) are ambiguous here.
        if isinstance(n, nd.AccessNode) and type(n.desc(sdfg)) is dt.View:
            in_edges = state.in_edges(n)
            out_edges = state.out_edges(n)

            # Only the single-producer / single-consumer pattern is handled.
            if len(in_edges) == 1 and len(out_edges) == 1:
                A = in_edges[0].src
                y = in_edges[0].data
                C = out_edges[0].dst
                x = out_edges[0].data
                if (isinstance(A, nd.AccessNode) and isinstance(C, nd.AccessNode) and y.data == A.data
                        and x.data == C.data):
                    # flip the memlet: point it at the view's data instead of
                    # the source array so the direction is unambiguous.
                    y.subset, y.other_subset = y.other_subset, y.subset
                    y.data = n.data
                    y.try_initialize(sdfg, state, in_edges[0])
def find_library_nodes(
        sdfg: dace.SDFG,
        lib_type: dace.sdfg.nodes.LibraryNode) -> "List[dace.sdfg.nodes.LibraryNode]":
    """ Finds all library nodes of the given type in the SDFG (recursively),
        returning them as a list. """
    return [
        n for n, _ in sdfg.all_nodes_recursive() if isinstance(n, lib_type)
    ]
def apply(cls, gtir, sdfg: dace.SDFG):
    """Generate LLVM-formatted C++ source for the SDFG: the dace-generated
    "Frame" code object plus a rendered GridTools interface wrapper."""
    self = cls()
    code_objects = sdfg.generate_code()
    # Pick the "Frame" code object out of the generated objects.
    computations = code_objects[[co.title for co in code_objects].index("Frame")].clean_code
    lines = computations.split("\n")
    computations = "\n".join(lines[0:2] + lines[3:])  # remove import of not generated file
    computations = codegen.format_source("cpp", computations, style="LLVM")
    # Render the GridTools-facing interface from the class template.
    interface = cls.template.definition.render(
        name=sdfg.name,
        dace_args=self.generate_dace_args(gtir, sdfg),
        functor_args=self.generate_functor_args(sdfg),
        tmp_allocs=self.generate_tmp_allocs(sdfg),
    )
    generated_code = f"""#include <gridtools/sid/sid_shift_origin.hpp>
#include <gridtools/sid/allocator.hpp>
#include <gridtools/stencil/cartesian.hpp>
namespace gt = gridtools;
{computations}
{interface}
"""
    formatted_code = codegen.format_source("cpp", generated_code, style="LLVM")
    return formatted_code
def _count_views(sdfg: dace.SDFG) -> int:
    """Count all access nodes (recursively) whose data descriptor is a View."""
    return sum(
        1 for n, _ in sdfg.all_nodes_recursive()
        if isinstance(n, nodes.AccessNode) and isinstance(sdfg.arrays[n.data], data.View))
def pre_evaluate(self, cutout: dace.SDFG, measurements: int, **kwargs) -> Dict:
    """Prepare evaluation kwargs for a cutout: instrument its start state,
    locate the single top-level map entry, and build the argument dict
    (including a key function that serializes a tuning point)."""
    cutout.start_state.instrument = self.instrument

    # The first map entry with no parent map is the one to tune.
    map_entry = next(
        (node for node in cutout.start_state.nodes()
         if isinstance(node, dace.nodes.MapEntry)
         and xfh.get_parent_map(cutout.start_state, node) is None),
        None)
    assert map_entry is not None

    def _point_key(point):
        # Serialize a tuning point as dot-separated values ("None" if absent).
        return "None" if point is None else ".".join(str(p) for p in point)

    return {
        "space_kwargs": {
            "map_entry": map_entry
        },
        "cutout": cutout.to_json(),
        "map_entry_id": cutout.start_state.node_id(map_entry),
        "measurements": measurements,
        "key": _point_key,
    }
def apply(self, sdfg: dace.SDFG) -> None:
    """Fuse two adjacent HorizontalExecution library nodes into one.

    Builds a merged node from the two OIR bodies, rewires every edge that
    touched either node (or an access node strictly between them) to the
    merged node, then removes the originals and any access nodes that became
    dead in the process.
    """
    state = sdfg.node(self.state_id)
    left = self.left(sdfg)
    right = self.right(sdfg)
    # Merge source locations
    dinfo = self._merge_source_locations(left, right)
    # merge oir nodes
    res = HorizontalExecutionLibraryNode(
        oir_node=oir.HorizontalExecution(
            body=left.as_oir().body + right.as_oir().body,
            declarations=left.as_oir().declarations + right.as_oir().declarations,
        ),
        iteration_space=left.iteration_space,
        debuginfo=dinfo,
    )
    state.add_node(res)

    # Access nodes strictly between left and right must be rewired too.
    intermediate_accesses = set(
        n for path in nx.all_simple_paths(state.nx, left, right) for n in path[1:-1])

    # rewire edges and connectors to left and delete right
    for edge in state.edges_between(left, right):
        state.remove_edge_and_connectors(edge)
    for acc in intermediate_accesses:
        for edge in state.in_edges(acc):
            if edge.src is not left:
                rewire_edge(state, edge, dst=res)
            else:
                state.remove_edge_and_connectors(edge)
        for edge in state.out_edges(acc):
            if edge.dst is not right:
                rewire_edge(state, edge, src=res)
            else:
                state.remove_edge_and_connectors(edge)
    for edge in state.in_edges(left):
        rewire_edge(state, edge, dst=res)
    for edge in state.out_edges(right):
        rewire_edge(state, edge, src=res)
    for edge in state.out_edges(left):
        rewire_edge(state, edge, src=res)
    for edge in state.in_edges(right):
        rewire_edge(state, edge, dst=res)
    state.remove_node(left)
    state.remove_node(right)

    # Clean up intermediate access nodes that lost their producers/consumers.
    for acc in intermediate_accesses:
        if not state.in_edges(acc):
            if not state.out_edges(acc):
                state.remove_node(acc)
            else:
                assert (len(state.edges_between(acc, res)) == 1
                        and len(state.out_edges(acc)) == 1), "Previously written array now read-only."
                state.remove_node(acc)
                res.remove_in_connector("IN_" + acc.label)
        # Bug fix: the original tested the bound method `state.out_edges`
        # (always truthy), so this branch never executed; it must *call* it.
        elif not state.out_edges(acc):
            acc.access = dace.AccessType.WriteOnly
def apply(self, sdfg: SDFG) -> Union[Any, None]:
    """Prune unused connectors of a nested SDFG node: input connectors whose
    data is never read inside, and output connectors whose data is never
    written inside. Detached memlet paths and orphaned data are removed."""
    state = sdfg.node(self.state_id)
    nsdfg = self.nsdfg(sdfg)

    read_set, write_set = nsdfg.sdfg.read_and_write_sets()
    prune_in = nsdfg.in_connectors.keys() - read_set
    prune_out = nsdfg.out_connectors.keys() - write_set

    # Detect which nodes are used, so we can delete unused nodes after the
    # connectors have been pruned
    all_data_used = read_set | write_set

    # Add WCR outputs to "do not prune" input list
    for e in state.out_edges(nsdfg):
        if e.data.wcr is not None and e.src_conn in prune_in:
            # Keep the matching input connector when the WCR target is also
            # fed from outside (its previous value participates in the WCR).
            if (state.in_degree(
                    next(iter(state.in_edges_by_connector(nsdfg, e.src_conn))).src) > 0):
                prune_in.remove(e.src_conn)

    for conn in prune_in:
        for e in state.in_edges_by_connector(nsdfg, conn):
            state.remove_memlet_path(e, remove_orphans=True)
        if conn in nsdfg.sdfg.arrays and conn not in all_data_used:
            # If the data is now unused, we can purge it from the SDFG
            nsdfg.sdfg.remove_data(conn)

    for conn in prune_out:
        for e in state.out_edges_by_connector(nsdfg, conn):
            state.remove_memlet_path(e, remove_orphans=True)
        if conn in nsdfg.sdfg.arrays and conn not in all_data_used:
            # If the data is now unused, we can purge it from the SDFG
            nsdfg.sdfg.remove_data(conn)
def was_vectorized(sdfg: dace.SDFG) -> bool:
    """ Tests whether a binary contains 128-bit CUDA memory operations by
    disassembling the compiled SDFG with cuobjdump and searching its SASS
    output for the ``.128`` operand-width suffix. """
    compiled: compiled_sdfg.CompiledSDFG = sdfg.compile()
    sass = subprocess.check_output(['cuobjdump', '-sass', compiled.filename],
                                   stderr=subprocess.STDOUT)
    # Release the compiled library handle before returning.
    del compiled
    return sass.find(b'.128') != -1
def load_sdfg_from_json(json):
    """Deserialize an SDFG from its JSON representation.

    Returns a dict with keys 'error' (None on success, otherwise a dict with
    'message' and 'details') and 'sdfg' (None on failure).
    """
    # A payload that already carries an error cannot be a valid SDFG.
    if 'error' in json:
        err = json['error']
        details = err['message'] if 'message' in err else ''
        return {
            'error': {
                'message': 'Invalid SDFG provided',
                'details': details,
            },
            'sdfg': None,
        }

    try:
        parsed = SDFG.from_json(json)
    except Exception as e:
        print(traceback.format_exc(), file=sys.stderr)
        sys.stderr.flush()
        return {
            'error': {
                'message': 'Failed to parse the provided SDFG',
                'details': get_exception_message(e),
            },
            'sdfg': None,
        }
    return {
        'error': None,
        'sdfg': parsed,
    }
def _setup_gpu_runtime(self, sdfg: SDFG, global_stream: CodeIOStream):
    """Emit the GPU runtime header include (CUDA or HIP) exactly once per
    code generation run, based on the configured backend."""
    # Idempotent: only the first call does any work.
    if self.gpu_runtime_init:
        return
    self.gpu_runtime_init = True
    self.backend = config.Config.get('compiler', 'cuda', 'backend')

    backend_headers = {
        'cuda': 'cuda_runtime.h',
        'hip': 'hip/hip_runtime.h',
    }
    if self.backend not in backend_headers:
        raise NameError('GPU backend "%s" not recognized' % self.backend)
    header_name = backend_headers[self.backend]

    global_stream.write('#include <%s>' % header_name)

    # For other file headers
    sdfg.append_global_code('\n#include <%s>' % header_name, None)
def _constants_from_unvisited_state(
        self, sdfg: SDFG, state: SDFGState, arrays: Set[str],
        existing_constants: Dict[SDFGState, Dict[str, Any]]) -> Dict[str, Any]:
    """
    Collects constants from an unvisited state, traversing backwards until
    reaching states that do have collected constants.

    :param sdfg: SDFG whose inter-state edges carry the assignments.
    :param state: The unvisited state to start the backwards traversal from.
    :param arrays: Names of data containers; assignments depending on these
                   are not considered constant.
    :param existing_constants: Already-collected constants per state; the
                               traversal stops at states present here.
    :return: The propagated constant assignments for ``state``.
    """
    result: Dict[str, Any] = {}

    # Reverse DFS from `state`; the condition stops expansion at states whose
    # constants were already collected.
    for parent, node in sdutil.dfs_conditional(sdfg,
                                               sources=[state],
                                               reverse=True,
                                               condition=lambda p, c: c not in existing_constants,
                                               yield_parent=True):
        # Skip first node
        if parent is None:
            continue
        # Get connecting edge (reversed)
        edge = sdfg.edges_between(node, parent)[0]
        # If node already has propagated constants, update dictionary and stop traversal
        self._propagate(result, self._data_independent_assignments(edge.data, arrays), True)
        if node in existing_constants:
            self._propagate(result, existing_constants[node], True)

    return result