def define_stream(dtype, vector_length, buffer_size, var_name, array_size,
                  function_stream, kernel_stream):
    """Emits the declaration of a FIFO stream (or an array of streams).

    :param dtype: Data type of the stream elements.
    :param vector_length: Vector length of the stream type.
    :param buffer_size: Depth of the FIFO.
    :param var_name: Name of the variable to declare.
    :param array_size: Symbolic number of streams (``1`` for a single one).
    :param function_stream: Stream for global (function-level) code (unused).
    :param kernel_stream: Stream the declaration is written to.
    """
    # Build the FIFO type once so both branches stay consistent
    # (mirrors the other define_stream variants in this file).
    ctype = "dace::FIFO<{}, {}, {}>".format(dtype.ctype, vector_length,
                                            buffer_size)
    if cpp.sym2cpp(array_size) == "1":
        # Single stream: pass the name to the constructor directly
        kernel_stream.write("{} {}(\"{}\");".format(ctype, var_name, var_name))
    else:
        # Array of streams: declare it, then name each element
        kernel_stream.write("{} {}[{}];\n".format(ctype, var_name,
                                                  cpp.sym2cpp(array_size)))
        kernel_stream.write("dace::SetNames({}, \"{}\", {});".format(
            var_name, var_name, cpp.sym2cpp(array_size)))
def define_stream(dtype, buffer_size, var_name, array_size, function_stream,
                  kernel_stream):
    """Emits the declaration of a FIFO stream (or an array of streams).

    :param dtype: Data type of the stream elements (vector type).
    :param buffer_size: Depth of the FIFO.
    :param var_name: Name of the variable to declare.
    :param array_size: Symbolic number of streams (``1`` for a single one).
    :param function_stream: Stream for global code (unused here).
    :param kernel_stream: Stream the declaration is written to.
    :return: The C++ type of the created variable.
    """
    fifo_type = "dace::FIFO<{}, {}, {}>".format(dtype.base_type.ctype,
                                                dtype.veclen, buffer_size)
    size_str = cpp.sym2cpp(array_size)
    if size_str != "1":
        # Array of streams: declare it, then name each element
        kernel_stream.write("{} {}[{}];\n".format(fifo_type, var_name,
                                                  size_str))
        kernel_stream.write("dace::SetNames({}, \"{}\", {});".format(
            var_name, var_name, size_str))
    else:
        # Single stream: pass the name to the constructor directly
        kernel_stream.write("{} {}(\"{}\");".format(fifo_type, var_name,
                                                    var_name))
    # Return value is used for adding to defined_vars in fpga.py
    return fifo_type
def copy_memory(self, sdfg, dfg, state_id, src_node, dst_node, edge,
                function_stream, callsite_stream):
    """Generates code for the data movement described by ``edge``, with
    special handling for "materialized" arrays (descriptors that carry a
    ``materialize_func`` callback instead of backing memory).

    :param sdfg: The SDFG being generated.
    :param dfg: The state (subgraph) containing the edge.
    :param state_id: Index of the state within the SDFG.
    :param src_node: Source node of the copy.
    :param dst_node: Destination node of the copy.
    :param edge: Edge carrying the memlet to copy.
    :param function_stream: Global-scope code stream (materialization
                            functions are emitted here).
    :param callsite_stream: Code stream at the point of the copy.
    """
    memlet = edge.data
    # --- Read side: the source is a materialized access node ---
    if (isinstance(src_node, nodes.AccessNode)
            and (src_node.desc(sdfg).materialize_func is not None)):
        # Emit the user-provided materialization function at global scope
        function_stream.write(src_node.desc(sdfg).materialize_func)
        # Local name the data is read into: prefer the destination connector
        if edge.dst_conn is not None:
            arrayname = str(edge.dst_conn)
        else:
            arrayname = str(dst_node.desc)
        if isinstance(dst_node, nodes.Tasklet) or \
           (dst_node.desc(sdfg).storage == dtypes.StorageType.Register):
            # Destination is tasklet input / register: define a memlet view
            callsite_stream.write(
                self.memlet_definition(sdfg, memlet, arrayname,
                                       direction="in"), sdfg, state_id,
                [src_node, dst_node])
        else:
            # Otherwise, invoke the materialization callback over the
            # half-open range [min, min + num_elements) of the subset
            callsite_stream.write("__dace_materialize(\"" + \
                sym2cpp(src_node) + "\", " + \
                sym2cpp(memlet.subset.min_element()[0]) + ", " + \
                sym2cpp(memlet.subset.min_element()[0] +
                        memlet.subset.num_elements()) +
                ", " + sym2cpp(dst_node.data) + ");\n",
                sdfg, state_id, [src_node, dst_node])
    # --- Write side: the destination is a materialized access node ---
    if (isinstance(dst_node, nodes.AccessNode)
            and (dst_node.desc(sdfg).materialize_func is not None)):
        # This case is pretty complicated due to how the rest of the
        # codegen works: This is not the place to actually copy code. In
        # the place where data is ready to be written there will be a call
        # __foo.write(foo) where foo is the local_name of the memlet that
        # "causes" the write. But this function is actually called when
        # we should set up everything for this call to work.
        # The above mentioned code is generated by process_out_memlets
        function_stream.write(dst_node.desc(sdfg).materialize_func)
        if isinstance(src_node, nodes.Tasklet) or \
           (src_node.desc(sdfg).storage == dtypes.StorageType.Register):
            # Source is tasklet output / register: define an "out" view that
            # the later __foo.write(foo) call will target
            callsite_stream.write(
                self.memlet_definition(sdfg, memlet, edge.src_conn,
                                       direction="out"), sdfg, state_id,
                [src_node, dst_node])
        else:
            # Otherwise, serialize the written range through the callback
            callsite_stream.write("__dace_serialize(\"" + \
                sym2cpp(dst_node) + "\", " + \
                sym2cpp(memlet.subset.min_element()[0]) + ", " + \
                sym2cpp(memlet.subset.min_element()[0] +
                        memlet.subset.num_elements()) +
                ", " + sym2cpp(src_node.data) + ");\n",
                sdfg, state_id, [src_node, dst_node])
def replace_properties(node: Any, symrepl: Dict[symbolic.symbol,
                                                symbolic.SymbolicType],
                       name: str, new_name: str):
    """Replaces occurrences of ``name`` with ``new_name`` in all properties
    of ``node``, dispatching on the property class (symbolic expressions,
    data names, ranges/shapes, code blocks, and nested-SDFG symbol maps).

    :param node: The graph node whose properties are rewritten in place.
    :param symrepl: Symbol replacement mapping (old symbol -> new value).
    :param name: Name being replaced.
    :param new_name: Replacement name/expression.
    """
    for propclass, propval in node.properties():
        if propval is None:
            continue
        pname = propclass.attr_name
        if isinstance(propclass, properties.SymbolicProperty):
            # Direct symbolic substitution
            setattr(node, pname, propval.subs(symrepl))
        elif isinstance(propclass, properties.DataProperty):
            # Data names are plain strings; replace on exact match only
            if propval == name:
                setattr(node, pname, new_name)
        elif isinstance(propclass,
                        (properties.RangeProperty, properties.ShapeProperty)):
            setattr(node, pname, _replsym(list(propval), symrepl))
        elif isinstance(propclass, properties.CodeProperty):
            # Don't replace variables that appear as an input or an output
            # connector, as this should shadow the outer declaration.
            if hasattr(node, 'in_connectors'):
                if name in node.in_connectors:
                    continue
            if hasattr(node, 'out_connectors'):
                if name in node.out_connectors:
                    continue
            if isinstance(propval.code, str):
                if str(name) != str(new_name):
                    lang = propval.language
                    newcode = propval.code
                    # Skip if the name does not occur as a whole word
                    if not re.findall(r'[^\w]%s[^\w]' % name, newcode):
                        continue
                    if lang is dtypes.Language.CPP:  # Replace in C++ code
                        # Avoid import loop
                        from dace.codegen.targets.cpp import sym2cpp

                        # Use local variables and shadowing to replace
                        replacement = 'auto %s = %s;\n' % (name,
                                                           sym2cpp(new_name))
                        propval.code = replacement + newcode
                    else:
                        warnings.warn('Replacement of %s with %s was not made '
                                      'for string tasklet code of language %s' %
                                      (name, new_name, lang))
            elif propval.code is not None:
                # Python AST code: rewrite identifier occurrences in place
                for stmt in propval.code:
                    ASTFindReplace({
                        name: symbolic.symstr(new_name)
                    }).visit(stmt)
        elif (isinstance(propclass, properties.DictProperty)
              and pname == 'symbol_mapping'):
            # Symbol mappings for nested SDFGs
            for symname, sym_mapping in propval.items():
                try:
                    propval[symname] = symbolic.pystr_to_symbolic(
                        str(sym_mapping)).subs(symrepl)
                except AttributeError:  # If the symbolified value has no subs
                    pass
def on_node_end(self, sdfg: SDFG, state: SDFGState, node: nodes.AccessNode, outer_stream: CodeIOStream, inner_stream: CodeIOStream, global_stream: CodeIOStream): from dace.codegen.dispatcher import DefinedType # Avoid import loop if is_devicelevel_gpu(sdfg, state, node) or is_devicelevel_fpga( sdfg, state, node): # Only run on host code return desc = node.desc(sdfg) # Obtain a pointer for arrays and scalars ptrname = cpp.ptr(node.data, desc, sdfg, self.codegen) defined_type, _ = self.codegen.dispatcher.defined_vars.get(ptrname) if defined_type == DefinedType.Scalar: ptrname = '&' + ptrname # Create UUID state_id = sdfg.node_id(state) node_id = state.node_id(node) uuid = f'{sdfg.sdfg_id}_{state_id}_{node_id}' # Get optional pre/postamble for instrumenting device data preamble, postamble = '', '' if desc.storage == dtypes.StorageType.GPU_Global: self._setup_gpu_runtime(sdfg, global_stream) preamble, postamble, ptrname = self._generate_copy_to_host( node, desc, ptrname) # Encode runtime shape and strides shape = ', '.join(cpp.sym2cpp(s) for s in desc.shape) strides = ', '.join(cpp.sym2cpp(s) for s in desc.strides) # Write code inner_stream.write(preamble, sdfg, state_id, node_id) inner_stream.write( f'__state->serializer->save({ptrname}, {cpp.sym2cpp(desc.total_size - desc.start_offset)}, ' f'"{node.data}", "{uuid}", {shape}, {strides});\n', sdfg, state_id, node_id) inner_stream.write(postamble, sdfg, state_id, node_id)
def define_local_array(dtype, vector_length, var_name, array_size, storage,
                       shape, function_stream, kernel_stream, sdfg, state_id,
                       node):
    """Emits the declaration of a local (on-chip) array, plus HLS
    partitioning pragmas depending on the storage type.

    :param dtype: Element data type.
    :param vector_length: Vector length of the array elements.
    :param var_name: Name of the variable to declare.
    :param array_size: Symbolic total size of the array.
    :param storage: FPGA storage type of the array.
    :param shape: Shape of the array (used for block partitioning).
    :param function_stream: Global-scope code stream (unused).
    :param kernel_stream: Stream the declaration is written to.
    """
    declaration = "dace::vec<{}, {}> {}[{}];\n".format(
        dtype.ctype, vector_length, var_name, cpp.sym2cpp(array_size))
    kernel_stream.write(declaration)
    if storage == dace.dtypes.StorageType.FPGA_Registers:
        # Fully partition register-mapped arrays
        kernel_stream.write("#pragma HLS ARRAY_PARTITION variable={} "
                            "complete\n".format(var_name))
    elif len(shape) > 1:
        # Block-partition multidimensional local arrays along the
        # second-to-last dimension
        kernel_stream.write("#pragma HLS ARRAY_PARTITION variable={} "
                            "block factor={}\n".format(var_name, shape[-2]))
def define_stream(dtype, buffer_size, var_name, array_size, function_stream,
                  kernel_stream):
    """ Defines a stream
        :return: a tuple containing the type of the created variable, and
                 boolean indicating whether this is a global variable or not
    """
    ctype = "dace::FIFO<{}, {}, {}>".format(dtype.base_type.ctype,
                                            dtype.veclen, buffer_size)
    is_scalar_stream = cpp.sym2cpp(array_size) == "1"
    if is_scalar_stream:
        # Single stream: name it via the constructor
        kernel_stream.write("{} {}(\"{}\");".format(ctype, var_name,
                                                    var_name))
    else:
        # Array of streams: declare, then assign a name to every element
        count = cpp.sym2cpp(array_size)
        kernel_stream.write("{} {}[{}];\n".format(ctype, var_name, count))
        kernel_stream.write("dace::SetNames({}, \"{}\", {});".format(
            var_name, var_name, count))
    # In Xilinx, streams are defined as local variables
    # Return value is used for adding to defined_vars in fpga.py
    return ctype, False
def _cases_from_branches(
    edges: List[Edge[InterstateEdge]],
    cblocks: Dict[Edge[InterstateEdge], GeneralBlock],
) -> Tuple[str, Dict[str, GeneralBlock]]:
    """
    If the input list of edges correspond to a switch/case scope (with all
    conditions being "x == y" for a unique symbolic x and integers y),
    returns the switch/case scope parameters.

    :param edges: List of inter-state edges.
    :return: Tuple of (case variable C++ expression, mapping from case to
             control flow block). If not a valid switch/case scope, returns
             None.
    """
    cond = edges[0].data.condition_sympy()
    if not isinstance(cond, sp.Basic):
        return None
    # Wildcards: `a` matches the switch variable, `b` only integer literals
    a = sp.Wild('a')
    b = sp.Wild('b', properties=[lambda k: k.is_Integer])
    m = cond.match(sp.Eq(a, b))
    if m:
        # Obtain original code for variable
        call_or_compare = edges[0].data.condition.code[0].value
        if isinstance(call_or_compare, ast.Call):
            # Condition expressed as e.g. Eq(x, 1) — variable is first arg
            astvar = call_or_compare.args[0]
        else:  # Binary comparison (x == 1)
            astvar = call_or_compare.left
    else:
        # Try integer == symbol
        m = cond.match(sp.Eq(b, a))
        if m:
            call_or_compare = edges[0].data.condition.code[0].value
            if isinstance(call_or_compare, ast.Call):
                astvar = call_or_compare.args[1]
            else:  # Binary comparison
                astvar = call_or_compare.right
        else:
            # Not an equality with an integer literal at all
            return None

    # Get C++ expression from AST
    switchvar = cppunparse.pyexpr2cpp(astvar)

    # Check that all edges match criteria: every condition must compare the
    # same variable (m[a]) against some integer (in either operand order)
    result = {}
    for e in edges:
        ematch = e.data.condition_sympy().match(sp.Eq(m[a], b))
        if not ematch:
            ematch = e.data.condition_sympy().match(sp.Eq(b, m[a]))
            if not ematch:
                return None
        # Create mapping to codeblocks
        result[cpp.sym2cpp(ematch[b])] = cblocks[e]

    return switchvar, result
def make_kernel_argument(data: dt.Data,
                         var_name: str,
                         subset_info: Union[int, subsets.Subset],
                         sdfg: SDFG,
                         is_output: bool,
                         with_vectorization: bool,
                         interface_id: Union[int, List[int]] = None):
    """Returns the C++ signature fragment for one FPGA kernel argument.

    :param data: Data descriptor of the argument.
    :param var_name: Argument name.
    :param subset_info: Memory-bank subset information for arrays.
    :param sdfg: The SDFG being generated.
    :param is_output: Whether the argument is written by the kernel.
    :param with_vectorization: Use the vector type instead of the base type.
    :param interface_id: Optional AXI interface identifier(s).
    :return: Argument declaration string (type and name).
    """
    if isinstance(data, dt.Array):
        # Arrays are passed as (possibly bank-qualified) raw pointers
        ptr_name = fpga.fpga_ptr(var_name, data, sdfg, subset_info, is_output,
                                 None, None, True, interface_id)
        dtype = data.dtype if with_vectorization else data.dtype.base_type
        return "{} *{}".format(dtype.ctype, ptr_name)
    if isinstance(data, dt.Stream):
        # Streams are passed by reference to a FIFO object
        fifo_type = "dace::FIFO<{}, {}, {}>".format(
            data.dtype.base_type.ctype, cpp.sym2cpp(data.dtype.veclen),
            cpp.sym2cpp(data.buffer_size))
        return "{} &{}".format(fifo_type, var_name)
    # Scalars and other descriptors know how to print themselves
    return data.as_arg(with_types=True, name=var_name)
def define_local_array(self, var_name, desc, array_size, function_stream,
                       kernel_stream, sdfg, state_id, node):
    """Emits the declaration of a local (on-chip) array and registers it
    with the dispatcher as a pointer.

    :param var_name: Name of the variable to declare.
    :param desc: Data descriptor of the array.
    :param array_size: Symbolic total size of the array.
    :param kernel_stream: Stream the declaration is written to.
    :raises ValueError: If the storage type is not an FPGA-local one.
    """
    dtype = desc.dtype
    kernel_stream.write("{} {}[{}];\n".format(dtype.ctype, var_name,
                                              cpp.sym2cpp(array_size)))
    storage = desc.storage
    if storage == dace.dtypes.StorageType.FPGA_Registers:
        # Fully partition register-mapped arrays
        kernel_stream.write("#pragma HLS ARRAY_PARTITION variable={} "
                            "complete\n".format(var_name))
    elif storage != dace.dtypes.StorageType.FPGA_Local:
        # FPGA_Local needs no pragma; anything else is unsupported here
        raise ValueError("Unsupported storage type: {}".format(storage.name))
    self._dispatcher.defined_vars.add(var_name, DefinedType.Pointer,
                                      '%s *' % dtype.ctype)
def generate_dummy(sdfg) -> str:
    """ Generates a C program calling this SDFG. Since we do not know the
        purpose/semantics of the program, we allocate the right types and
        guess values for scalars.

        :param sdfg: The SDFG to generate a caller for.
        :return: C source of a main() that allocates arguments, runs the
                 SDFG, and frees the arrays again.
    """
    includes = "#include <stdlib.h>\n"
    includes += "#include \"" + sdfg.name + ".h\"\n\n"
    header = "int main(int argc, char** argv) {\n"
    footer = " return 0;\n}\n"

    al = sdfg.arglist()
    scalar_inits = []
    array_allocs = []
    array_frees = []
    # first find all scalars and set them to 42
    for argname, arg in al.items():
        if isinstance(arg, data.Scalar):
            scalar_inits.append(
                " " + str(arg.as_arg(name=argname, with_types=True)) +
                " = 42;\n")
    # allocate the array args using calloc
    for argname, arg in al.items():
        if isinstance(arg, data.Array):
            dims_mul = cpp.sym2cpp(
                functools.reduce(lambda a, b: a * b, arg.shape, 1))
            basetype = str(arg.dtype)
            array_allocs.append(
                " " + str(arg.as_arg(name=argname, with_types=True)) +
                " = (" + basetype + "*) calloc(" + dims_mul + ", sizeof(" +
                basetype + ")" + ");\n")
            array_frees.append(" free(" + argname + ");\n")

    sdfg_call = ''' __dace_init_{name}({params}); __program_{name}({params}); __dace_exit_{name}({params});\n\n'''.format(
        name=sdfg.name,
        params=sdfg.signature(with_types=False, for_call=True))

    # Assemble: init scalars, allocate arrays, run, free, return
    return (includes + header + "".join(scalar_inits) +
            "".join(array_allocs) + sdfg_call + "".join(array_frees) + footer)
def memlet_definition(self, sdfg, memlet, local_name, direction="in"):
    """Emits a local definition (`auto __name = <view>;`) for a memlet, and
    for scalar accesses also a typed local variable.

    :param sdfg: The SDFG being generated.
    :param memlet: The memlet to define a view for.
    :param local_name: Local variable name for the memlet data.
    :param direction: "in" to initialize from the view, "out" to leave
                      the local variable uninitialized.
    :return: C++ code string with the definition(s).
    """
    # Streams are aliased by reference rather than viewed.
    # NOTE(review): this tests the type of `memlet.data` itself — if
    # `memlet.data` is the data *name* (a string), this branch can never
    # trigger; confirm against the Memlet class used in this version.
    if isinstance(memlet.data, data.Stream):
        return 'auto& %s = %s;\n' % (local_name, memlet.data)
    result = ('auto __%s = ' % local_name +
              self.memlet_view_ctor(sdfg, memlet, direction) + ';\n')

    # Allocate variable type
    memlet_type = ' dace::vec<%s, %s>' % (
        sdfg.arrays[memlet.data].dtype.ctype, sym2cpp(memlet.veclen))
    # Scalar (zero-dimensional) access with a known access count: also
    # materialize a plain local variable
    if memlet.subset.data_dims() == 0 and memlet.num_accesses >= 0:
        result += memlet_type + ' ' + local_name
        if direction == "in":
            result += ' = __%s;\n' % local_name
        else:
            result += ';\n'
    return result
def _generate_copy_to_device(self, node: nodes.AccessNode, desc: dt.Array,
                             ptr: str) -> Tuple[str, str, str]:
    """ Copies restored data to device and returns (preamble, postamble,
        name of new host pointer).

        The preamble allocates a flat host staging buffer; the caller is
        expected to fill it, after which the postamble copies it to the
        device synchronously and frees it.

        :param node: The access node whose data is restored.
        :param desc: Array descriptor of the data.
        :param ptr: Device pointer expression to copy into.
    """
    new_ptr = f'__dinstr_{node.data}'
    # 1-D host descriptor covering the allocated extent
    new_desc = dt.Array(desc.dtype, [desc.total_size - desc.start_offset])
    csize = cpp.sym2cpp(desc.total_size - desc.start_offset)

    # Emit synchronous memcpy
    preamble = f'''
    {{
    {new_desc.as_arg(name=new_ptr)} = new {desc.dtype.ctype}[{csize}];
    '''

    postamble = f'''
    {self.backend}Memcpy({ptr}, {new_ptr}, sizeof({desc.dtype.ctype}) * ({csize}), {self.backend}MemcpyHostToDevice);
    delete[] {new_ptr};
    }}
    '''

    return preamble, postamble, new_ptr
def generate_dummy(sdfg: SDFG, frame: framecode.DaCeCodeGenerator) -> str: """ Generates a C program calling this SDFG. Since we do not know the purpose/semantics of the program, we allocate the right types and and guess values for scalars. """ al = frame.arglist init_params = sdfg.signature(with_types=False, for_call=True, with_arrays=False, arglist=frame.arglist_scalars_only) params = sdfg.signature(with_types=False, for_call=True, arglist=frame.arglist) if len(params) > 0: params = ', ' + params allocations = '' deallocations = '' # first find all scalars and set them to 42 for argname, arg in al.items(): if isinstance(arg, data.Scalar): allocations += (" " + str(arg.as_arg(name=argname, with_types=True)) + " = 42;\n") # allocate the array args using calloc for argname, arg in al.items(): if isinstance(arg, data.Array): dims_mul = cpp.sym2cpp( functools.reduce(lambda a, b: a * b, arg.shape, 1)) basetype = str(arg.dtype) allocations += (" " + str(arg.as_arg(name=argname, with_types=True)) + " = (" + basetype + "*) calloc(" + dims_mul + ", sizeof(" + basetype + ")" + ");\n") deallocations += " free(" + argname + ");\n" return f'''#include <cstdlib>
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
                   state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """Generates code for a map scope targeting ARM SVE: outer map
    dimensions become plain C++ for-loops, and the innermost dimension
    becomes a predicated SVE do-while loop.

    :param sdfg: The SDFG being generated.
    :param scope: The map scope subgraph.
    :param state_id: Index of the state within the SDFG.
    :param function_stream: Global-scope code stream.
    :param callsite_stream: Code stream at the call site.
    """
    entry_node = scope.source_nodes()[0]
    # NOTE(review): picks an arbitrary array dtype in the SDFG as the loop
    # counter type — presumably assumes a single dtype; confirm.
    loop_type = list(set([sdfg.arrays[a].dtype for a in sdfg.arrays]))[0]
    ltype_size = loop_type.bytes

    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    # Counter type matches the element size so SVE predicates line up
    self.counter_type = {
        1: dace.int8,
        2: dace.int16,
        4: dace.int32,
        8: long_type
    }[ltype_size]

    callsite_stream.write('{')

    # Define all input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node):
        if e.data.data != e.dst_conn:
            callsite_stream.write(
                self.cpu_codegen.memlet_definition(
                    sdfg, e.data, False, e.dst_conn,
                    e.dst.in_connectors[e.dst_conn]), sdfg, state_id,
                entry_node)

    # We only create an SVE do-while in the innermost loop
    for param, rng in zip(entry_node.map.params, entry_node.map.range):
        begin, end, stride = (sym2cpp(r) for r in rng)

        self.dispatcher.defined_vars.enter_scope(sdfg)

        # Check whether we are in the innermost loop
        if param != entry_node.map.params[-1]:
            # Default C++ for-loop
            callsite_stream.write(
                f'for(auto {param} = {begin}; {param} <= {end}; {param} += {stride}) {{'
            )
        else:
            # Generate the SVE loop header
            # The name of our loop predicate is always __pg_{param}
            self.dispatcher.defined_vars.add('__pg_' + param,
                                             DefinedType.Scalar, 'svbool_t')

            # Declare our counting variable (e.g. i) and precompute the
            # loop predicate for our range
            callsite_stream.write(
                f'''{self.counter_type} {param} = {begin};
svbool_t __pg_{param} = svwhilele_b{ltype_size * 8}({param}, ({self.counter_type}) {end});
do {{''', sdfg, state_id, entry_node)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Close the loops from above (in reverse)
    for param, rng in zip(reversed(entry_node.map.params),
                          reversed(entry_node.map.range)):
        # The innermost loop is SVE and needs a special while-footer,
        # otherwise we just add the closing bracket
        if param != entry_node.map.params[-1]:
            # Close the default C++ for-loop
            callsite_stream.write('}')
        else:
            # Generate the SVE loop footer
            _, end, stride = (sym2cpp(r) for r in rng)

            # Increase the counting variable (according to the number of
            # processed elements)
            # Then recompute the loop predicate and test for it
            callsite_stream.write(
                f'''{param} += svcntp_b{ltype_size * 8}(__pg_{param}, __pg_{param}) * {stride};
__pg_{param} = svwhilele_b{ltype_size * 8}({param}, ({self.counter_type}) {end});
}} while(svptest_any(svptrue_b{ltype_size * 8}(), __pg_{param}));''', sdfg,
                state_id, entry_node)

        # Pops the per-dimension scope entered in the header loop
        self.dispatcher.defined_vars.exit_scope(sdfg)

    callsite_stream.write('}')
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView,
                   state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """Generates a predicated ARM SVE while-loop for a one-dimensional map
    scope.

    :param sdfg: The SDFG being generated.
    :param scope: The map scope subgraph.
    :param state_id: Index of the state within the SDFG.
    :param function_stream: Global-scope code stream.
    :param callsite_stream: Code stream at the call site.
    :raises util.NotSupportedError: If the map has more than one dimension.
    """
    entry_node = scope.source_nodes()[0]
    current_map = entry_node.map
    self.current_map = current_map

    if len(current_map.params) > 1:
        raise util.NotSupportedError('SVE map must be one dimensional')

    loop_types = list(
        set([util.get_base_type(sdfg.arrays[a].dtype) for a in sdfg.arrays]))

    # Edge case if no arrays are used
    loop_type = loop_types[0] if len(loop_types) > 0 else dace.int64

    ltype_size = loop_type.bytes

    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    # Counter type matches the element size so SVE predicates line up
    self.counter_type = {
        1: dace.int8,
        2: dace.int16,
        4: dace.int32,
        8: long_type
    }[ltype_size]

    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(scope)

    # Define all dynamic input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node):
        if e.data.data != e.dst_conn:
            callsite_stream.write(
                self.cpu_codegen.memlet_definition(
                    sdfg, e.data, False, e.dst_conn,
                    e.dst.in_connectors[e.dst_conn]), sdfg, state_id,
                entry_node)

    param = current_map.params[0]
    rng = current_map.range[0]
    begin, end, stride = (sym2cpp(r) for r in rng)

    # Generate the SVE loop header
    # The name of our loop predicate is always __pg_{param}
    self.dispatcher.defined_vars.add('__pg_' + param, DefinedType.Scalar,
                                     'svbool_t')

    # Declare our counting variable (e.g. i) and precompute the loop
    # predicate for our range
    callsite_stream.write(f'{self.counter_type} {param} = {begin};')

    end_param = f'__{param}_to'
    callsite_stream.write(f'{self.counter_type} {end_param} = {end};')

    callsite_stream.write(
        f'svbool_t __pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});'
    )

    # Test for the predicate
    callsite_stream.write(
        f'while(svptest_any(svptrue_b{ltype_size * 8}(), __pg_{param})) {{')

    # Allocate scope related memory
    for node, _ in scope.all_nodes_recursive():
        if isinstance(node, nodes.Tasklet):
            # Create empty shared registers for outputs into other tasklets
            for edge in state_dfg.out_edges(node):
                if isinstance(edge.dst, dace.nodes.Tasklet):
                    self.generate_out_register(sdfg, state_dfg, edge,
                                               callsite_stream, True)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Increase the counting variable (according to the number of processed
    # elements)
    size_letter = {1: 'b', 2: 'h', 4: 'w', 8: 'd'}[ltype_size]
    callsite_stream.write(f'{param} += svcnt{size_letter}() * {stride};')

    # Then recompute the loop predicate
    callsite_stream.write(
        f'__pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});')

    callsite_stream.write('}')
    self.dispatcher.defined_vars.exit_scope(scope)
    callsite_stream.write('}')
def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
    """Expands a Reduce library node into a C++ tasklet using OpenMP
    reduction clauses: outer parallel loops over the non-reduced (output)
    dimensions, and an `omp parallel for reduction(...)` loop nest over the
    reduced axes.

    :param node: The Reduce node to expand.
    :param state: The state containing the node.
    :param sdfg: The SDFG containing the state.
    :return: A CPP tasklet implementing the reduction.
    :raises ValueError: If the WCR is not an OpenMP-supported reduction.
    """
    node.validate(sdfg, state)
    inedge: graph.MultiConnectorEdge = state.in_edges(node)[0]
    outedge: graph.MultiConnectorEdge = state.out_edges(node)[0]
    input_dims = len(inedge.data.subset)
    output_dims = len(outedge.data.subset)
    input_data = sdfg.arrays[inedge.data.data]
    output_data = sdfg.arrays[outedge.data.data]

    # Get reduction type for OpenMP
    redtype = detect_reduction_type(node.wcr, openmp=True)
    if redtype not in ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP:
        raise ValueError('Reduction type not supported for "%s"' % node.wcr)
    omptype, expr = ExpandReduceOpenMP._REDUCTION_TYPE_TO_OPENMP[redtype]

    # Standardize axes
    axes = node.axes if node.axes else [i for i in range(input_dims)]

    # Partial reduction: some dimensions survive in the output
    outer_loops = len(axes) != input_dims

    # Create OpenMP clause
    if outer_loops:
        code = '#pragma omp parallel for collapse({cdim})\n'.format(
            cdim=output_dims)
    else:
        code = ''

    from dace.codegen.targets.cpp import sym2cpp

    # Output loops
    out_offset = []
    if outer_loops:
        for i, sz in enumerate(outedge.data.subset.size()):
            code += 'for (int _o{i} = 0; _o{i} < {sz}; ++_o{i}) {{\n'.format(
                i=i, sz=sym2cpp(sz))
            out_offset.append('_o%d * %s' % (i, sym2cpp(
                output_data.strides[i])))
    else:
        out_offset.append('0')

    outexpr = '_out[%s]' % ' + '.join(out_offset)

    # Write identity value first
    if node.identity is not None:
        code += '%s = %s;\n' % (outexpr, node.identity)

    # Reduction OpenMP clause
    code += '#pragma omp parallel for collapse({cdim}) ' \
        'reduction({rtype}: {oexpr})\n'.format(cdim=len(axes),
                                               rtype=omptype,
                                               oexpr=outexpr)

    # Reduction loops
    for i, axis in enumerate(sorted(axes)):
        sz = sym2cpp(inedge.data.subset.size()[axis])
        code += 'for (int _i{i} = 0; _i{i} < {sz}; ++_i{i}) {{\n'.format(
            i=i, sz=sz)

    # Prepare input offset expression: reduced axes use the inner loop
    # counters (_iN), surviving axes the outer ones (_oN)
    in_offset = []
    ictr, octr = 0, 0
    for i in range(input_dims):
        if i in axes:
            result = '_i%d' % ictr
            ictr += 1
        else:
            result = '_o%d' % octr
            octr += 1
        in_offset.append('%s * %s' % (result, sym2cpp(input_data.strides[i])))
    in_offset = ' + '.join(in_offset)

    # Reduction expression
    code += expr.format(i='_in[%s]' % in_offset, o=outexpr)
    code += '\n'

    # Closing braces
    code += '}\n' * len(axes)
    if outer_loops:
        code += '}\n' * output_dims

    # Make tasklet
    tnode = dace.nodes.Tasklet('reduce',
                               {'_in': dace.pointer(input_data.dtype)},
                               {'_out': dace.pointer(output_data.dtype)},
                               code,
                               language=dace.Language.CPP)

    # Rename outer connectors and add to node
    inedge._dst_conn = '_in'
    outedge._src_conn = '_out'
    node.add_in_connector('_in')
    node.add_out_connector('_out')

    return tnode
def generate_module(self, sdfg, state, kernel_name, name, subgraph,
                    parameters, module_stream, entry_stream, host_stream,
                    instrumentation_stream):
    """Generates a module that will run as a dataflow function in the FPGA
    kernel.

    :param sdfg: The SDFG being generated.
    :param state: The state containing the module subgraph.
    :param kernel_name: Name of the enclosing kernel.
    :param name: Base name of this module.
    :param subgraph: Subgraph of the state implemented by this module.
    :param parameters: List of (is_output, name, descriptor, interface_ids).
    :param module_stream: Stream for module function definitions.
    :param entry_stream: Stream for the top-level kernel body (call sites).
    :param host_stream: Stream for host-side launch code.
    :param instrumentation_stream: Stream for instrumentation code.
    """
    state_id = sdfg.node_id(state)
    dfg = sdfg.nodes()[state_id]

    # Build both the call-site argument list and the module's parameter list
    kernel_args_call = []
    kernel_args_module = []
    for is_output, pname, p, interface_ids in parameters:
        if isinstance(p, dt.Array):
            for bank, interface_id in fpga.iterate_multibank_interface_ids(
                    p, interface_ids):
                arr_name = fpga.fpga_ptr(pname,
                                         p,
                                         sdfg,
                                         bank,
                                         is_output,
                                         is_array_interface=True)
                # Add interface ID to called module, but not to the module
                # arguments
                argname = fpga.fpga_ptr(pname,
                                        p,
                                        sdfg,
                                        bank,
                                        is_output,
                                        is_array_interface=True,
                                        interface_id=interface_id)

                kernel_args_call.append(argname)
                dtype = p.dtype
                # Inputs are const-qualified pointers
                kernel_args_module.append("{} {}*{}".format(
                    dtype.ctype, "const " if not is_output else "", arr_name))
        else:
            if isinstance(p, dt.Stream):
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                if p.is_stream_array():
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> {}[{}]".format(
                            p.dtype.base_type.ctype, cpp.sym2cpp(p.veclen),
                            cpp.sym2cpp(p.buffer_size), pname,
                            p.size_string()))
                else:
                    kernel_args_module.append(
                        "dace::FIFO<{}, {}, {}> &{}".format(
                            p.dtype.base_type.ctype, cpp.sym2cpp(p.veclen),
                            cpp.sym2cpp(p.buffer_size), pname))
            else:
                kernel_args_call.append(
                    p.as_arg(with_types=False, name=pname))
                kernel_args_module.append(
                    p.as_arg(with_types=True, name=pname))

    # Check if we are generating an RTL module, in which case only the
    # accesses to the streams should be handled
    rtl_tasklet = None
    for n in subgraph.nodes():
        if (isinstance(n, dace.nodes.Tasklet)
                and n.language == dace.dtypes.Language.SystemVerilog):
            rtl_tasklet = n
            break
    if rtl_tasklet:
        # RTL path: emit only commented-out declarations; the RTL codegen
        # produces the actual implementation files
        entry_stream.write(
            f'// [RTL] HLSLIB_DATAFLOW_FUNCTION({name}, {", ".join(kernel_args_call)});'
        )
        module_stream.write(
            f'// [RTL] void {name}({", ".join(kernel_args_module)});\n\n')

        # _1 in names are due to vitis
        # Record stream connections between the RTL IP and the kernel
        for node in subgraph.source_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.out_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(edge.dst, sdfg.sdfg_id,
                                                    sdfg.node_id(state),
                                                    state.node_id(edge.dst))
                    self._stream_connections[
                        node.data][1] = '{}_top_1.s_axis_{}'.format(
                            rtl_name, edge.dst_conn)

        for node in subgraph.sink_nodes():
            if isinstance(sdfg.arrays[node.data], dt.Stream):
                if node.data not in self._stream_connections:
                    self._stream_connections[node.data] = [None, None]
                for edge in state.in_edges(node):
                    rtl_name = "{}_{}_{}_{}".format(edge.src, sdfg.sdfg_id,
                                                    sdfg.node_id(state),
                                                    state.node_id(edge.src))
                    self._stream_connections[
                        node.data][0] = '{}_top_1.m_axis_{}'.format(
                            rtl_name, edge.src_conn)

        # Make the dispatcher trigger generation of the RTL module, but
        # ignore the generated code, as the RTL codegen will generate the
        # appropriate files.
        ignore_stream = CodeIOStream()
        self._dispatcher.dispatch_subgraph(sdfg,
                                           subgraph,
                                           state_id,
                                           ignore_stream,
                                           ignore_stream,
                                           skip_entry_node=False)

        # Launch the kernel from the host code
        rtl_name = self.rtl_tasklet_name(rtl_tasklet, state, sdfg)
        host_stream.write(
            f"  auto kernel_{rtl_name} = program.MakeKernel(\"{rtl_name}_top\"{', '.join([''] + [name for _, name, p, _ in parameters if not isinstance(p, dt.Stream)])}).ExecuteTaskFork();",
            sdfg, state_id, rtl_tasklet)
        if state.instrument == dtypes.InstrumentationType.FPGA:
            self.instrument_opencl_kernel(rtl_name, state_id, sdfg.sdfg_id,
                                          instrumentation_stream)

        return

    # create a unique module name to prevent name clashes
    module_function_name = f"module_{name}_{sdfg.sdfg_id}"

    # Unrolling processing elements: if there first scope of the subgraph
    # is an unrolled map, generate a processing element for each iteration
    scope_children = subgraph.scope_children()
    top_scopes = [
        n for n in scope_children[None]
        if isinstance(n, dace.sdfg.nodes.EntryNode)
    ]
    unrolled_loops = 0
    if len(top_scopes) == 1:
        scope = top_scopes[0]
        if scope.unroll:
            self._unrolled_pes.add(scope.map)
            # NOTE(review): `+=` with a *string* on a list extends it
            # character-by-character; looks like it should append the
            # parameter names instead — confirm against callers.
            kernel_args_call += ", ".join(scope.map.params)
            kernel_args_module += ["int " + p for p in scope.params]
            for p, r in zip(scope.map.params, scope.map.range):
                if len(r) > 3:
                    raise cgx.CodegenError("Strided unroll not supported")
                entry_stream.write(
                    "for (size_t {param} = {begin}; {param} < {end}; "
                    "{param} += {increment}) {{\n#pragma HLS UNROLL".format(
                        param=p, begin=r[0], end=r[1] + 1, increment=r[2]))
                unrolled_loops += 1

    # Generate caller code in top-level function
    entry_stream.write(
        "HLSLIB_DATAFLOW_FUNCTION({}, {});".format(
            module_function_name, ", ".join(kernel_args_call)), sdfg,
        state_id)

    for _ in range(unrolled_loops):
        entry_stream.write("}")

    # ----------------------------------------------------------------------
    # Generate kernel code
    # ----------------------------------------------------------------------
    self._dispatcher.defined_vars.enter_scope(subgraph)

    module_body_stream = CodeIOStream()

    module_body_stream.write(
        "void {}({}) {{".format(module_function_name,
                                ", ".join(kernel_args_module)), sdfg,
        state_id)

    # Register the array interface as a naked pointer for use inside the
    # FPGA kernel
    interfaces_added = set()
    for is_output, argname, arg, interface_id in parameters:
        for bank, _ in fpga.iterate_multibank_interface_ids(
                arg, interface_id):
            if (not (isinstance(arg, dt.Array)
                     and arg.storage == dace.dtypes.StorageType.FPGA_Global)):
                continue
            ctype = dtypes.pointer(arg.dtype).ctype
            ptr_name = fpga.fpga_ptr(argname,
                                     arg,
                                     sdfg,
                                     bank,
                                     is_output,
                                     None,
                                     is_array_interface=True)
            if not is_output:
                ctype = f"const {ctype}"
            self._dispatcher.defined_vars.add(ptr_name, DefinedType.Pointer,
                                              ctype)
            if argname in interfaces_added:
                continue
            interfaces_added.add(argname)
            self._dispatcher.defined_vars.add(argname,
                                              DefinedType.ArrayInterface,
                                              ctype,
                                              allow_shadowing=True)
    module_body_stream.write("\n")

    # Allocate local transients
    data_to_allocate = (set(subgraph.top_level_transients()) -
                        set(sdfg.shared_transients()) -
                        set([p[1] for p in parameters]))
    allocated = set()
    for node in subgraph.nodes():
        if not isinstance(node, dace.sdfg.nodes.AccessNode):
            continue
        if node.data not in data_to_allocate or node.data in allocated:
            continue
        allocated.add(node.data)
        self._dispatcher.dispatch_allocate(sdfg, state, state_id, node,
                                           node.desc(sdfg), module_stream,
                                           module_body_stream)

    self._dispatcher.dispatch_subgraph(sdfg,
                                       subgraph,
                                       state_id,
                                       module_stream,
                                       module_body_stream,
                                       skip_entry_node=False)

    module_stream.write(module_body_stream.getvalue(), sdfg, state_id)
    module_stream.write("}\n\n")

    self._dispatcher.defined_vars.exit_scope(subgraph)
def generate_boundary_conditions(node, shape, field_accesses, field_to_desc,
                                 iterator_mapping):
    """Generates the boundary-handling code for a stencil node: for every
    access of every input field, builds the condition under which the
    access is out of bounds and the value to substitute there.

    :param node: The stencil library node.
    :param shape: Iteration-space shape.
    :param field_accesses: Mapping field name -> {offset indices: memlet name}.
    :param field_to_desc: Mapping field name -> data descriptor.
    :param iterator_mapping: Mapping field name -> tuple of 0/1 flags
                             selecting which iterators the field uses.
    :return: Tuple of (boundary code string, set of out-of-bounds
             conditions under which the output must not be written).
    """
    boundary_code = ""
    # Conditions where the output should not be written
    oob_cond = set()
    # Loop over each input
    for field_name in node.in_connectors:
        accesses = field_accesses[field_name]
        dtype = field_to_desc[field_name].dtype
        veclen = dtype.veclen
        iterators = iterator_mapping[field_name]
        num_dims = sum(iterators, 0)
        # Loop over each access to this data
        for indices, memlet_name in accesses.items():
            if len(indices) != num_dims:
                raise ValueError(f"Access {field_name}[{indices}] inconsistent "
                                 f"with iterator mapping {iterators}.")
            cond = set()
            cond_global = set()
            # Loop over each index of this access
            for i, offset in enumerate(indices):
                # The innermost dimension of a vectorized field is unrolled,
                # so boundary checks include the unroll index
                if i == len(indices) - 1 and dtype.veclen > 1:
                    unroll_boundary = f"*{dtype.veclen} + i_unroll"
                    # NOTE(review): unroll_write is assigned but not used in
                    # this function — presumably consumed by related codegen;
                    # confirm.
                    unroll_write = f"*{dtype.veclen} - i_unroll"
                else:
                    unroll_boundary = ""
                    unroll_write = ""
                if offset < 0:
                    # Access reaches before the start of this dimension
                    offset_str = sym2cpp(-offset)
                    term = f"_i{i}{unroll_boundary} < {offset_str}"
                    if i != len(indices) - 1:
                        offset_str = sym2cpp(-offset)
                        cond_global.add(f"_i{i} < {offset_str}")
                    elif offset <= -veclen:
                        # Innermost dimension counts in vectors
                        offset_str = sym2cpp(-offset // veclen)
                        cond_global.add(f"_i{i} < {offset_str}")
                elif offset > 0:
                    # Access reaches past the end of this dimension
                    offset_str = sym2cpp(shape[i] - offset)
                    term = f"_i{i}{unroll_boundary} >= {offset_str}"
                    if i != len(indices) - 1:
                        cond_global.add(f"_i{i} >= {offset_str}")
                    elif offset >= veclen:
                        offset_str = sym2cpp((shape[i] - offset) // veclen)
                        cond_global.add(f"_i{i} >= {offset_str}")
                else:
                    # Centered access: never out of bounds
                    continue
                cond.add(term)
            if len(cond) == 0:
                # No boundary handling needed: plain read
                boundary_code += "{} = _{}\n".format(memlet_name, memlet_name)
            else:
                if field_name in node.boundary_conditions:
                    bc = node.boundary_conditions[field_name]
                else:
                    bc = {"btype": "shrink"}
                btype = bc["btype"]
                if btype == "copy":
                    # NOTE(review): `center` is not defined in this function —
                    # presumably a module-level/outer name for the zero-offset
                    # access; confirm.
                    center_memlet = accesses[center]
                    boundary_val = "_{}".format(center_memlet)
                elif btype == "constant":
                    boundary_val = bc["value"]
                elif btype == "shrink":
                    # We don't need to do anything here, it's up to the
                    # user to not use the junk output
                    if np.issubdtype(dtype.type, np.floating):
                        boundary_val = np.finfo(dtype.type).min
                    else:
                        # If not a float, assume it's some kind of integer
                        boundary_val = np.iinfo(dtype.type).min
                    # Add this to the output condition
                    oob_cond |= cond_global
                else:
                    raise ValueError(
                        f"Unsupported boundary condition type: {btype}")
                boundary_code += ("{} = {} if {} else _{}\n".format(
                    memlet_name, boundary_val, " or ".join(sorted(cond)),
                    memlet_name))
    return boundary_code, oob_cond
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet],
                function_stream: CodeIOStream,
                callsite_stream: CodeIOStream):
    """Emits C++ code that copies data along ``edge`` into ``dst_node``.

    A Tasklet source becomes a plain copy of the shared register; an array
    source feeding a vector connector becomes an SVE load (``svld1``, or
    ``svld1_gather_index`` for strided access); any other copy is delegated
    to the default CPU code generator.

    :raises NotImplementedError: When not inside an SVE scope, when the
        source descriptor is a Stream, or when the connector dtype has no
        SVE mapping.
    :raises util.NotSupportedError: When the source is neither a Tasklet
        nor an AccessNode.
    """
    # We should always be in an SVE scope
    scope = util.get_sve_scope(sdfg, dfg, dst_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    in_conn = dst_node.in_connectors[edge.dst_conn]

    if isinstance(src_node, dace.nodes.Tasklet):
        # Copy from tasklet is just copying the shared register
        # Use defined_vars to get the C++ type of the shared register
        callsite_stream.write(
            f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};'
        )
        return

    if not isinstance(src_node, dace.nodes.AccessNode):
        raise util.NotSupportedError(
            'Copy neither from Tasklet nor AccessNode')

    src_desc = src_node.desc(sdfg)
    if isinstance(src_desc, dace.data.Stream):
        # A copy from a stream will trigger a vector pop
        raise NotImplementedError()

        # FIXME: Issue when we can pop different amounts of data!
        # If we limit to the smallest amount, certain data will be lost (never processed)
        # NOTE(review): everything below the raise is unreachable; the string
        # literal preserves a draft implementation of the stream pop.
        """
        # SVE register where the stream will be popped to
        self.create_empty_definition(in_conn, edge, callsite_stream, output=True)

        var_name = edge.dst_conn

        callsite_stream.write(
            f'{util.TYPE_TO_SVE[in_conn.type]} {var_name};')

        callsite_stream.write('{')
        callsite_stream.write('// Stream pop')

        # Pop into local buffer
        # 256 // in_conn.vtype.bytes
        n_vec = f'{util.REGISTER_BYTE_SIZE} / {in_conn.vtype.bytes}'
        callsite_stream.write(f'{in_conn.vtype.ctype} __tmp[{n_vec}];')
        callsite_stream.write(
            f'size_t __cnt = {edge.data.data}.pop_try(__tmp, {n_vec});')

        # Limit the loop predicate
        loop_pred = util.get_loop_predicate(sdfg, dfg, dst_node)
        callsite_stream.write(
            f'{loop_pred} = svand_z({loop_pred}, {loop_pred}, svwhilelt_b{in_conn.vtype.bytes * 8}(0ll, __cnt));')

        # Transfer to register
        callsite_stream.write(f'{var_name} = svld1({loop_pred}, __tmp);')
        callsite_stream.write('}')
        """
        return

    if isinstance(in_conn, dtypes.vector):
        # Copy from vector, so we can use svld
        if in_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(
                f'Data type {in_conn.type} not supported')

        self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector,
                                         in_conn.ctype)

        # Determine the stride of the load and use a gather if applicable
        stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

        # First part of the declaration is `type name`
        load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type],
                                  edge.dst_conn)

        # 64-bit integer types get an explicit pointer cast for the intrinsics
        ptr_cast = ''
        if in_conn.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif in_conn.type == np.uint64:
            ptr_cast = '(uint64_t*) '

        # Regular load and gather share the first arguments
        load_args = '{}, {}'.format(
            util.get_loop_predicate(sdfg, dfg, dst_node),
            ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer))

        if stride == 1:
            # Contiguous data: plain predicated vector load
            callsite_stream.write('{} = svld1({});'.format(
                load_lhs, load_args))
        else:
            # Strided data: gather load with a computed index vector
            callsite_stream.write(
                '{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                    load_lhs, load_args,
                    util.get_base_type(in_conn).bytes * 8,
                    sym2cpp(stride)))
    else:
        # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
        self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node,
                                     edge, function_stream, callsite_stream)
def generate_read(self, sdfg: SDFG, state: SDFGState, map: nodes.Map,
                  edge: graph.MultiConnectorEdge[mm.Memlet],
                  code: CodeIOStream):
    """Emits the code that reads the data of an ingoing edge into a Tasklet.

    Handles Code->Code edges (shared registers), pointer references,
    vector loads (contiguous or gather) and scalar reads from arrays,
    as well as reads backed by Scalar data descriptors.
    """
    if edge.dst_conn is None:
        return

    path_src = state.memlet_path(edge)[0].src
    conn_type = edge.dst.in_connectors[edge.dst_conn]
    conn_name = edge.dst_conn

    def read_from_register(reg_type, bad_edge_msg):
        # Shared handling for values living in a register (either shared
        # with another Tasklet or backing a Scalar descriptor): direct copy
        # for matching kinds, broadcast for scalar -> vector.
        if util.is_vector(reg_type) and util.is_vector(conn_type):
            # Directly read from shared vector register
            code.write(f'{util.TYPE_TO_SVE[conn_type.type]} {conn_name} = {edge.data.data};')
        elif util.is_scalar(reg_type) and util.is_scalar(conn_type):
            # Directly read from shared scalar register
            code.write(f'{conn_type} {conn_name} = {edge.data.data};')
        elif util.is_scalar(reg_type) and util.is_vector(conn_type):
            # Scalar broadcast from shared scalar register
            code.write(
                f'{util.TYPE_TO_SVE[conn_type.type]} {conn_name} = '
                f'svdup_{util.TYPE_TO_SVE_SUFFIX[conn_type.type]}({edge.data.data});'
            )
        else:
            raise util.NotSupportedError(bad_edge_msg)

    if isinstance(path_src, nodes.Tasklet):
        # Code->Code edge: value is in a register shared between Tasklets
        read_from_register(edge.src.out_connectors[edge.src_conn],
                           'Unsupported Code->Code edge')
    elif isinstance(path_src, nodes.AccessNode):
        # Read from AccessNode
        data_desc = path_src.desc(sdfg)
        if isinstance(data_desc, data.Array):
            # Copy from array
            if util.is_pointer(conn_type):
                # Pointer reference
                code.write(
                    f'{conn_type} {conn_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};')
            elif util.is_vector(conn_type):
                # Vector load; becomes a gather for non-unit strides
                load_stride = edge.data.get_stride(sdfg, map)

                # Declaration prefix: `type name`
                decl_lhs = f'{util.TYPE_TO_SVE[conn_type.type]} {conn_name}'

                # long long issue casting
                cast_prefix = ''
                if conn_type.type == np.int64:
                    cast_prefix = '(int64_t*) '
                elif conn_type.type == np.uint64:
                    cast_prefix = '(uint64_t*) '

                # Regular load and gather share the first arguments
                shared_args = (
                    f'{util.get_loop_predicate(sdfg, state, edge.dst)}, '
                    f'{cast_prefix}{cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame)}')

                if load_stride == 1:
                    code.write(f'{decl_lhs} = svld1({shared_args});')
                else:
                    index_bits = util.get_base_type(conn_type).bytes * 8
                    code.write(
                        f'{decl_lhs} = svld1_gather_index({shared_args}, '
                        f'svindex_s{index_bits}(0, {sym2cpp(load_stride)}));')
            else:
                # Scalar read from array
                code.write(
                    f'{conn_type} {conn_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};')
        elif isinstance(data_desc, data.Scalar):
            # Refer to the shared variable backing the Scalar
            read_from_register(data_desc.dtype,
                               'Unsupported Scalar->Code edge')
        # Other descriptor types fall through without emitting anything
    else:
        raise util.NotSupportedError(
            'Only copy from Tasklets and AccessNodes is supported')
def memlet_view_ctor(self, sdfg, memlet, direction):
    """Builds the C++ constructor expression for an immaterial array view.

    Produces a ``dace::ArrayViewImmaterial{In,Out}[Skip]<...>("name", args)``
    expression string for the given memlet. The ``Skip`` variant is selected
    whenever a non-unit stride or a non-contiguous index dimension occurs.

    :param sdfg: The SDFG holding the memlet's data descriptor.
    :param memlet: The memlet to build a view for (Indices or Range subset).
    :param direction: ``"in"`` selects the In view; anything else the Out view.
    :return: The constructor expression as a string.
    :raises ValueError: If the memlet refers to a Scalar descriptor.
    :raises NotImplementedError: For subviews of more than one dimension.
    :raises RuntimeError: For unsupported subset types.
    """
    useskip = False
    memlet_params = []

    memlet_name = memlet.data
    if isinstance(sdfg.arrays[memlet.data], data.Scalar):
        raise ValueError("This should never have happened")

    if isinstance(memlet.subset, subsets.Indices):
        # Compute address
        memlet_params.append(cpp_array_expr(sdfg, memlet, False))
        dims = 0
    elif isinstance(memlet.subset, subsets.Range):
        dims = len(memlet.subset.ranges)
        #memlet_params.append("")

        # Dimensions to remove from view (due to having one value)
        indexdims = []
        nonIndexDims = []

        # NOTE: `re` here is the range-end symbol, shadowing the regex module
        for dim, (rb, re, rs) in enumerate(memlet.subset.ranges):
            if rs != 1:
                # Non-unit step forces the Skip view variant
                useskip = True
            try:
                if (re - rb) == 0:
                    indexdims.append(dim)
                else:
                    nonIndexDims.append(dim)
            except TypeError:  # cannot determine truth value of Relational
                nonIndexDims.append(dim)

        if len(nonIndexDims) > 1 and len(indexdims) > 0:
            raise NotImplementedError('subviews of more than one dimension ' +
                                      'not implemented')
        elif len(
                nonIndexDims) == 1 and len(indexdims) > 0:  # One dimension
            indexdim = nonIndexDims[0]

            # NOTE(review): memlet_params appears to still be empty here (the
            # initial append above is commented out), so the `+=` on
            # memlet_params[-1] below would raise IndexError -- confirm
            # whether this branch is reachable / intended.
            # Contiguous dimension
            if indexdim == dims - 1:
                memlet_params[-1] += ' + %s' % cpp_array_expr(
                    sdfg, memlet, False)
                memlet_params.append(
                    '0, %s' % (sym2cpp(memlet.subset.ranges[-1][1] -
                                       memlet.subset.ranges[-1][0])))
            else:
                # Non-contiguous dimension
                useskip = True
                memlet_params[-1] += ' + %s' % cpp_array_expr(
                    sdfg, memlet, False)
                memlet_range = memlet.subset.ranges[indexdim]
                memlet_stride = sdfg.arrays[memlet.data].strides[indexdim]
                memlet_stride = sym2cpp(memlet_stride)
                memlet_params.append(
                    '0, %s, %s' % (sym2cpp(memlet_range[1] -
                                           memlet_range[0]),
                                   sym2cpp(memlet_stride)))

            # Subtract index dimensions from array dimensions
            dims -= len(indexdims)

        elif len(indexdims) == 0:
            # Full view: emit offset and size (plus step when skipping)
            # for every dimension
            for (rb, re, rs), s in zip(memlet.subset.ranges,
                                       sdfg.arrays[memlet.data].shape):
                if useskip:
                    memlet_params.append(
                        '%s, %s, %s' %
                        (cppunparse.pyexpr2cpp(symbolic.symstr(rb)),
                         cppunparse.pyexpr2cpp(symbolic.symstr(s)),
                         cppunparse.pyexpr2cpp(symbolic.symstr(rs))))
                else:
                    memlet_params.append(
                        '%s, %s' %
                        (cppunparse.pyexpr2cpp(symbolic.symstr(rb)),
                         cppunparse.pyexpr2cpp(symbolic.symstr(s))))
        elif len(nonIndexDims) == 0:  # Scalar view
            if len(memlet_params) > 0:
                # Compute address
                memlet_params[-1] += ' + ' + cpp_array_expr(
                    sdfg, memlet, False)
            else:
                memlet_params.append(cpp_array_expr(sdfg, memlet, False))
            dims = 0
    else:
        raise RuntimeError('Memlet type "%s" not implemented' %
                           memlet.subset)

    if dims == 0:
        # Zero-dimensional (scalar) view: no per-dimension template argument
        return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t> ("%s", %s)' % (
            'In' if direction == "in" else "Out", 'Skip' if useskip else '',
            sdfg.arrays[memlet.data].dtype.ctype,
            symbolic.symstr(memlet.veclen), memlet.data,
            ', '.join(memlet_params))
    else:
        return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t, %s> ("%s", %s)' % (
            'In' if direction == "in" else "Out", 'Skip' if useskip else '',
            sdfg.arrays[memlet.data].dtype.ctype,
            symbolic.symstr(memlet.veclen), ', '.join([
                str(s) for s in memlet.subset.bounding_box_size()
            ]), memlet.data, ', '.join(memlet_params))