def get_load_stride(self, sdfg: SDFG, state: SDFGState, node: nodes.Node, memlet: dace.Memlet) -> symbolic.SymExpr:
    """
    Determines the stride of a load/store as a symbolic expression.

    The stride is derived from the memlet subset, the strides of the accessed
    array and the step of the last dimension of the enclosing SVE map.

    :param sdfg: SDFG whose ``arrays`` contain ``memlet.data``.
    :param state: State handed to ``util.get_sve_scope`` to locate the scope.
    :param node: Node whose enclosing SVE scope is looked up.
    :param memlet: Memlet describing the accessed subset.
    :return: Simplified difference between the flattened offset at the next
             loop value and the flattened offset at the current loop value.
    :raises NotImplementedError: If ``node`` is not inside an SVE scope.
    :raises util.NotSupportedError: If the SVE loop parameter does not occur
                                    in the flattened offset.
    """
    sve_scope = util.get_sve_scope(sdfg, state, node)
    if sve_scope is None:
        raise NotImplementedError('Not in an SVE scope')

    # The last map parameter (and its range) is treated as the SVE loop
    loop_map = sve_scope.map
    loop_param = loop_map.params[-1]
    loop_range = loop_map.range[-1]
    loop_symbol = dace.symbolic.symbol(loop_param)

    strides = sdfg.arrays[memlet.data].strides

    # Flatten the subset into a single 1D offset using the array strides
    current_offset = memlet.subset.at([0] * len(strides), strides)
    if not current_offset.has(loop_symbol):
        raise util.NotSupportedError("SVE param does not occur in subset")

    # Offset after advancing the loop parameter by one (possibly strided) step
    next_offset = current_offset.subs(loop_symbol, loop_symbol + loop_range[2])

    # The distance between two consecutive offsets is the access stride
    return (next_offset - current_offset).simplify()
def _BoolOp(self, t):
    """
    Unparses a boolean operation (``and`` / ``or``) over its operands.

    If ``util.only_scalars_involed`` reports that only scalars take part, the
    parent implementation is used. Otherwise every operand must be inferred as
    a vector and the operation is emitted as nested ``<op>_z(<pred>, a, ...)``
    calls, where ``<op>`` comes from ``util.BOOL_OP_TO_SVE``.

    :param t: The ``BoolOp`` AST node to unparse.
    :raises util.NotSupportedError: If any operand is not inferred as a vector.
    """
    if util.only_scalars_involed(self.get_defined_symbols(), *t.values):
        return super()._BoolOp(t)

    # Bool ops are nested SVE instructions, so we must make sure they all act on vectors
    for operand_type in self.infer(*t.values):
        if not isinstance(operand_type, dtypes.vector):
            raise util.NotSupportedError('Non-vectorizable boolean operation')

    # There can be many operands, e.g. `x or y or z`. Every operand except the
    # last one opens one more level of nesting. The last operand is identified
    # by position (not by comparing nodes), so an operand that compares equal
    # to the last one can never end the nesting early.
    last_index = len(t.values) - 1
    for index, operand in enumerate(t.values):
        if index == last_index:
            # The last entry doesn't need more nesting
            self.dispatch(operand)
            break

        # Binary nesting
        self.write('{}_z({}, '.format(util.BOOL_OP_TO_SVE[t.op.__class__], self.pred_name))
        self.dispatch(operand)
        self.write(', ')

    # Close all opened brackets (one per operand except the last)
    self.write(')' * last_index)
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                callsite_stream: CodeIOStream) -> None:
    """
    Emits a write-conflict-resolution (WCR) copy for ``edge``.

    The source is reduced with the SVE reduction instruction registered for
    the detected reduction type (using an all-true predicate), and the result
    is placed inside the expression returned by
    ``cpu_codegen.write_and_resolve_expr``.

    :param sdfg: SDFG used to resolve the source descriptor.
    :param dfg: State used for the write-conflict check.
    :param state_id: Unused by this implementation.
    :param src_node: Source node; its ``label`` is the reduced operand.
    :param dst_node: Unused by this implementation.
    :param edge: Edge carrying the WCR memlet.
    :param function_stream: Unused by this implementation.
    :param callsite_stream: Stream receiving the generated statement.
    :raises util.NotSupportedError: If the reduction has no SVE counterpart.
    :raises NotImplementedError: For the conflict-free pointer/vector case
                                 (horizontal non-atomic reduction).
    """
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    nc = not is_write_conflicted(dfg, edge)
    desc = edge.src.desc(sdfg)

    if nc and isinstance(desc.dtype, (dtypes.pointer, dtypes.vector)):
        # Horizontal non-atomic reduction
        raise NotImplementedError('Horizontal non-atomic reduction is not implemented')

    # WCR on vectors works in two steps:
    # 1. Reduce the SVE register using SVE instructions into a scalar
    # 2. WCR the scalar to memory using DaCe functionality
    # '@' is passed as a placeholder for the value to be written; the SVE
    # reduction call is spliced in at its (first) position.
    wcr = self.cpu_codegen.write_and_resolve_expr(sdfg, edge.data, not nc, None, '@', dtype=desc.dtype)
    before, _, after = wcr.partition('@')
    callsite_stream.write(before + util.REDUCTION_TYPE_TO_SVE[reduction_type] +
                          f'(svptrue_{util.TYPE_TO_SVE_SUFFIX[desc.dtype]}(), ' + src_node.label + after + ');')
def _Call(self, t):
    """
    Unparses a function call.

    Calls whose inferred result is not a vector are emitted as
    ``<cpp module prefix><function>(args...)`` using
    ``dtypes._ALLOWED_MODULES``. Calls with a vector result are emitted as
    ``<sve name>_x(<pred>, args...)``, where the name is resolved either from
    the SVE-internal functions (plain names) or from
    ``util.MATH_FUNCTION_TO_SVE`` (attribute access).

    :param t: The ``Call`` AST node to unparse.
    :raises util.NotSupportedError: If the result type cannot be inferred.
    :raises NotImplementedError: If the module or function is not available,
                                 or the callee is neither a name nor an
                                 attribute.
    """
    res_type = self.infer(t)[0]
    if not res_type:
        raise util.NotSupportedError('Unsupported call')

    if not isinstance(res_type, dtypes.vector):
        # Call does not involve any vectors (to our knowledge)
        # Replace default modules (e.g., math) with dace::math::
        attr_name = astutils.rname(t)
        module_name, separator, func_name = attr_name.rpartition('.')
        if not separator:
            # Without a module qualifier there is nothing to map; slicing by
            # rfind('.') == -1 would cut off the last character of the name.
            raise NotImplementedError(f'Function {attr_name} is not implemented')
        if module_name not in dtypes._ALLOWED_MODULES:
            raise NotImplementedError(f'Module {module_name} is not implemented')

        self.write(dtypes._ALLOWED_MODULES[module_name] + func_name)
        self.write('(')
        for index, arg in enumerate(t.args):
            if index > 0:
                self.write(", ")
            self.dispatch(arg)
        self.write(')')
        return

    if isinstance(t.func, ast.Name):
        # Could be an internal operation (provided by the preprocessor)
        if not util.is_sve_internal(t.func.id):
            raise NotImplementedError(f'Function {t.func.id} is not implemented')
        name = util.internal_to_external(t.func.id)[0]
    elif isinstance(t.func, ast.Attribute):
        # Some module function (xxx.xxx), make sure it is available
        name = util.MATH_FUNCTION_TO_SVE.get(astutils.rname(t.func))
        if name is None:
            raise NotImplementedError(f'Function {astutils.rname(t.func)} is not implemented')
    else:
        # Any other callee would otherwise be emitted as `None_x(...)`
        raise NotImplementedError('Only calls to named or module functions are implemented')

    # Vectorized function: every argument is cast/broadcast to the result type
    self.write('{}_x({}, '.format(name, self.pred_name))
    for index, arg in enumerate(t.args):
        if index > 0:
            self.write(", ")
        self.dispatch_expect(arg, res_type)
    self.write(')')
def vectorize_connector(sdfg: dace.SDFG, dfg: dace.SDFGState, node: dace.nodes.Node, par: str, conn: str,
                        is_input: bool):
    """
    Turns the connector ``conn`` of ``node`` into a vector connector if the
    memlets attached to it depend on the loop parameter ``par``.

    :param sdfg: SDFG used to look up the data descriptors of the memlets.
    :param dfg: State passed to ``get_connector_edges``.
    :param node: Node owning the connector.
    :param par: Name of the loop parameter.
    :param conn: Name of the connector to vectorize.
    :param is_input: Whether ``conn`` is an input (True) or output connector.
    :raises util.NotSupportedError: If ``par`` occurs in more than one
                                    dimension of a memlet subset.
    """
    connectors = node.in_connectors if is_input else node.out_connectors

    for edge in get_connector_edges(dfg, node, conn, is_input):
        memlet = edge.data
        if memlet.data is None:
            # Empty memlets
            return

        desc = sdfg.arrays[memlet.data]
        if isinstance(desc, data.Stream):
            # Streams are treated differently in SVE, instead of pointers they become vectors of unknown size
            connectors[conn] = dace.dtypes.vector(connectors[conn].base_type, -1)
            return

        if isinstance(connectors[conn], (dace.dtypes.vector, dace.dtypes.pointer)):
            # No need for vectorization
            return

        subset = memlet.subset
        par_symbol = symbolic.symbol(par)

        # Find the (single) subset dimension that depends on the loop parameter
        sve_dim = None
        for dim, rng in enumerate(subset):
            if not any(par_symbol in symbolic.pystr_to_symbolic(expr).free_symbols for expr in rng):
                continue
            if sve_dim is not None and sve_dim != dim:
                raise util.NotSupportedError(
                    'Non-vectorizable memlet (loop param occurs in more than one dimension)')
            sve_dim = dim

        if sve_dim is None and memlet.wcr is None:
            # Should stay scalar
            return

        if sve_dim is not None:
            # Widen the dependent dimension to span util.SVE_LEN from its start
            start, _, step = subset[sve_dim]
            memlet.subset[sve_dim] = (start, start + util.SVE_LEN, step)

        # NOTE(review): done per edge, so any later edge hits the
        # "already a vector" early return above - confirm this is intended.
        connectors[conn] = dace.dtypes.vector(connectors[conn].type or desc.dtype, util.SVE_LEN)
def generate_out_register(self,
                          sdfg: SDFG,
                          state: SDFGState,
                          edge: graph.MultiConnectorEdge[mm.Memlet],
                          code: CodeIOStream,
                          use_data_name: bool = False) -> bool:
    """
    Responsible for generating temporary out registers in a Tasklet, given an outgoing edge.

    :param sdfg: SDFG used to resolve the destination descriptor.
    :param state: State used to follow the memlet path of ``edge``.
    :param edge: Outgoing edge of the Tasklet.
    :param code: Stream receiving the register declaration.
    :param use_data_name: If True, the register is named after the memlet data
                          instead of the source connector.
    :return: ``True`` if a writeback of this register is needed, ``False``
             otherwise (no connector, stream destination, or WCR edge).
    :raises util.NotSupportedError: If the connector is neither a vector nor
                                    a scalar.
    """
    if edge.src_conn is None:
        # Nothing is produced on this edge, so nothing has to be written back.
        # Return an explicit bool to honour the declared return type.
        return False

    dst_node = state.memlet_path(edge)[-1].dst
    src_type = edge.src.out_connectors[edge.src_conn]
    src_name = edge.data.data if use_data_name else edge.src_conn

    if isinstance(dst_node, nodes.AccessNode) and isinstance(dst_node.desc(sdfg), data.Stream):
        # Streams don't need writeback and are treated differently
        self.stream_associations[edge.src_conn] = (edge.data.data, src_type.base_type)
        return False

    if edge.data.wcr is not None:
        # WCR is addressed within the unparser to capture conditionals
        self.wcr_associations[edge.src_conn] = (dst_node, edge, src_type.base_type)
        return False

    # Create temporary registers
    if util.is_vector(src_type):
        ctype = util.TYPE_TO_SVE[src_type.type]
    elif util.is_scalar(src_type):
        ctype = src_type.ctype
    else:
        raise util.NotSupportedError('Unsupported Code->Code edge (pointer)')

    self.dispatcher.defined_vars.add(src_name, DefinedType.Scalar, ctype)
    code.write(f'{ctype} {src_name};')
    return True
def create_empty_definition(self,
                            conn: dace.typeclass,
                            edge: gr.MultiConnectorEdge[mm.Memlet],
                            callsite_stream: CodeIOStream,
                            output: bool = False,
                            is_code_code: bool = False):
    """
    Writes a variable definition ``type name;`` for a vector connector and
    registers the variable with the dispatcher.

    :param conn: Type of the connector; only vector types are handled.
    :param edge: Edge that supplies the variable name and the WCR information.
    :param callsite_stream: Stream receiving the definition.
    :param output: If True the name is ``edge.dst_conn``, else ``edge.src_conn``.
    :param is_code_code: If True the memlet data name is used instead (edges
                         between Tasklets share a temporary register).
    :raises util.NotSupportedError: If the vector's element type has no SVE type.
    :raises NotImplementedError: If ``conn`` is not a vector.
    """
    if is_code_code:
        # For edges between Tasklets (Code->Code), we use the data as name because these registers are temporary and shared
        var_name = edge.data.data
    else:
        var_name = edge.dst_conn if output else edge.src_conn

    if not isinstance(conn, dtypes.vector):
        raise NotImplementedError(f'Output into scalar or pointer is not supported ({var_name})')

    # Creates an SVE register
    var_type = conn.type
    if var_type not in util.TYPE_TO_SVE:
        raise util.NotSupportedError('Data type not supported')

    # In case of a WCR, we must initialize it with the identity value.
    # This is to prevent cases in a conditional WCR, where we don't write and it is filled with garbage.
    # Currently, the initial is 0, because product reduction is not supported in SVE.
    init_str = ' = {}(0)'.format(util.instr('dup', type=conn.type)) if edge.data.wcr else ''

    var_ctype = util.TYPE_TO_SVE[var_type]
    callsite_stream.write('{} {}{};'.format(var_ctype, var_name, init_str))

    self.dispatcher.defined_vars.add(var_name, var_type, var_ctype)
def _Assign(self, t):
    """
    Unparses an assignment statement.

    Assignments to stream or WCR outputs are delegated to ``push_to_stream``
    and ``resolve_conflict``. Scalar-to-scalar assignments are delegated to
    the parent implementation. Any other assignment is emitted as a vector
    assignment; inside an ``if`` block (``if_depth > 0``) an already declared
    target is updated through ``svsel`` with the current predicate.

    :param t: The ``Assign`` AST node to unparse.
    :raises util.NotSupportedError: For multiple targets, or when a new
                                    variable would be a pointer.
    :raises NotImplementedError: If the type of either side cannot be inferred.
    """
    if len(t.targets) > 1:
        raise util.NotSupportedError('Tuple output not supported')

    target = t.targets[0]
    if isinstance(target, ast.Name) and target.id in self.stream_associations:
        # Assigning to a stream variable is equivalent to a push
        self.push_to_stream(t, target)
        return
    elif isinstance(target, ast.Name) and target.id in self.wcr_associations:
        # Assigning to a WCR output
        self.resolve_conflict(t, target)
        return

    lhs_type, rhs_type = self.infer(target, t.value)

    if rhs_type is None:
        raise NotImplementedError(f'Can not infer RHS of assignment ({astunparse.unparse(t.value)})')

    # Set when the target is declared by this very statement
    is_new_variable = False

    if lhs_type is None:
        # The LHS could involve a variable name that was not declared (which is why inference fails)
        if not isinstance(target, ast.Name) or target.id in self.get_defined_symbols():
            # Either we don't assign to a name, or the variable name has
            # already been declared (but infer still fails, i.e. something went wrong!)
            raise NotImplementedError('Can not infer LHS of assignment')

        # Declare it as `type name`
        lhs_type = rhs_type
        if isinstance(rhs_type, dtypes.vector):
            # SVE register is possible (declare it as svXXX_t)
            self.fill(util.TYPE_TO_SVE[rhs_type.type])
            self.write(' ')
            # Define the new symbol as vector
            self.defined_symbols.update({target.id: rhs_type})
        elif isinstance(rhs_type, dtypes.pointer):
            raise util.NotSupportedError('Defining pointers in Tasklet code not supported')

        # Otherwise, the fallback will grab the case of a scalar,
        # because the RHS is scalar, and the LHS is the same
        is_new_variable = True

    # LHS and RHS types are now both well defined
    lhs_vec = isinstance(lhs_type, dtypes.vector)
    rhs_vec = isinstance(rhs_type, dtypes.vector)

    # TODO: This is only bad if we assign to a variable from an outer scope
    # (the string below is a disabled check kept for reference; it has no effect)
    """
    if self.if_depth > 0 and not lhs_vec:
        raise util.NotSupportedError(
            'Assignments in an if block must be to a vector or stream (otherwise not vectorizable)')
    """

    if not lhs_vec and not rhs_vec:
        # Simple scalar-scalar assign handled by fallback
        super()._Assign(t)
        if isinstance(target, ast.Name):
            # Remember the scalar so later inference can resolve it
            self.defined_symbols.update({target.id: rhs_type})
        return

    if not is_new_variable:
        # Indentation fix (a new variable already got its line started by the
        # declaration emitted above)
        self.fill()

    # Some vector assignment
    self.dispatch(target)
    self.write(' = ')

    # Note, that if this variable is declared in the same line, we
    # don't need to select at all (there is nothing to select from,
    # because it just got declared)
    if self.if_depth > 0 and not is_new_variable:
        # If we are in an If block, we assign based on the predicate
        # In case of "a = b", we do:
        # a = select(if_pred, b, a)
        self.write(f'svsel({self.pred_name}, ')

    # The RHS is cast/broadcast to the LHS type where necessary
    self.dispatch_expect(t.value, lhs_type)

    if self.if_depth > 0 and not is_new_variable:
        # Close the select
        self.write(', ')
        self.dispatch(target)
        self.write(')')

    self.write(';')
def vector_reduction_expr(self, edge, dtype, rhs):
    """
    Writes the code resolving a write-conflict (WCR) for a vector expression.

    Two strategies are used:

    * If the write is conflicted, or the source connector is neither a pointer
      nor a vector, ``rhs`` is reduced with an SVE reduction instruction and
      the result is placed inside the expression returned by
      ``cpu_codegen.write_and_resolve_expr``. Nothing is written if the
      destination access node is stored in an SVE register.
    * Otherwise a load / reduce / store sequence is written, using gather and
      scatter variants when the memlet stride differs from 1.

    :param edge: Outgoing edge carrying the WCR memlet.
    :param dtype: Base type used to build the expected vector type of ``rhs``.
    :param rhs: AST of the expression that is reduced.
    :raises util.NotSupportedError: If the reduction has no SVE counterpart.
    """
    # Check whether it is a known reduction that is possible in SVE
    reduction_type = detect_reduction_type(edge.data.wcr)
    if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
        raise util.NotSupportedError('Unsupported reduction in SVE')

    nc = not is_write_conflicted(self.dfg, edge)

    if nc and isinstance(edge.src.out_connectors[edge.src_conn], (dtypes.pointer, dtypes.vector)):
        ######################
        # Horizontal non-atomic reduction
        stride = edge.data.get_stride(self.sdfg, self.map)

        # long long fix
        src_type = edge.src.out_connectors[edge.src_conn]
        if src_type.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif src_type.type == np.uint64:
            ptr_cast = '(uint64_t*) '
        else:
            ptr_cast = ''

        store_args = '{}, {}'.format(
            self.pred_name,
            ptr_cast + cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
        )

        # Same instruction with the trailing character swapped for the `_x` form
        red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'

        if stride == 1:
            self.write(f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), ')
        else:
            # Gather and scatter take the same index vector as extra argument
            store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
            self.write(
                f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), '
            )

        self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
        self.write('));')
        return

    # WCR on vectors works in two steps:
    # 1. Reduce the SVE register using SVE instructions into a scalar
    # 2. WCR the scalar to memory using DaCe functionality
    dst_node = self.dfg.memlet_path(edge)[-1].dst
    if (isinstance(dst_node, nodes.AccessNode)
            and dst_node.desc(self.sdfg).storage == dtypes.StorageType.SVE_Register):
        return

    # '@' marks where the reduced value goes inside the generated expression
    wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg, edge.data, not nc, None, '@', dtype=dtype)
    marker = wcr.find('@')

    self.fill(wcr[:marker])
    self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type] + '(' + self.pred_name + ', ')
    self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
    self.write(')' + wcr[marker + 1:] + ';')
def dispatch_expect(self, tree: ast.AST, expect: dtypes.typeclass):
    """
    This function is an extension to the dispatch() call and allows to pass
    the type that is expected when unparsing the tree and will take care of
    any casting that might be required. It is mainly used in SVE instructions
    in cases where an argument must be of some type (otherwise the compiler
    complains).

    :param tree: Expression to unparse.
    :param expect: Type the emitted expression must have.
    :raises util.NotSupportedError: If the type of ``tree`` cannot be inferred
                                    or is incompatible with ``expect``.
    :raises NotImplementedError: If a vector would have to be cast to a vector
                                 of a different element type.
    """
    inf = self.infer(tree)[0]

    # Sanity check
    if not inf:
        raise util.NotSupportedError(f'Could not infer the expression type of `{astunparse.unparse(tree)}`')

    if isinstance(inf, dtypes.vector):
        # Unparsing a vector
        if not isinstance(expect, dtypes.vector):
            # A pointer or scalar is expected (incompatible)
            raise util.NotSupportedError('Given a vector, expected a scalar or pointer')
        if inf.vtype.type != expect.vtype.type:
            # TODO: Cast vectors (but only if same bitwidth)
            raise NotImplementedError('Vector-vector casting not implemented')
        # No cast required
        self.dispatch(tree)
        return

    if isinstance(inf, dtypes.pointer):
        # Unparsing a pointer
        if not isinstance(expect, dtypes.pointer):
            # Expecting anything else
            raise util.NotSupportedError('Given a pointer, expected a scalar or vector')
        if inf.base_type.type != expect.base_type.type:
            raise util.NotSupportedError('Inconsistent pointer types')
        # No cast required, except for the `long long` fix. The fixed-width C
        # type names are `int64_t` / `uint64_t` (no underscore after `int`).
        if expect.base_type.type == np.int64:
            self.write('(int64_t*) ')
        if expect.base_type.type == np.uint64:
            self.write('(uint64_t*) ')
        self.dispatch(tree)
        return

    # Unparsing a scalar
    if isinstance(expect, dtypes.vector):
        # Expecting a vector: duplicate the scalar
        # (`np.bool` is deliberately not referenced: the alias does not exist
        # in every NumPy release, and `np.bool_` / `bool` cover both meanings)
        if expect.type in (np.bool_, bool):
            # Special case for duplicating boolean into predicate
            suffix = f'b{self.pred_bits}'
            self.dispatch_expect(tree, expect.base_type)
            self.write(f' ? svptrue_{suffix}() : svpfalse_b()')
        else:
            self.write(f'svdup_{util.TYPE_TO_SVE_SUFFIX[expect.type]}(')
            self.dispatch_expect(tree, expect.base_type)
            self.write(')')
    elif isinstance(expect, dtypes.pointer):
        # Expecting a pointer
        raise util.NotSupportedError('Given a scalar, expected a pointer')
    else:
        # Expecting a scalar: cast if needed
        cast_ctype = None
        if inf.type != expect.type:
            cast_ctype = expect.ctype

        # Special casting for `long long`
        if expect.type == np.int64:
            cast_ctype = 'int64_t'
        elif expect.type == np.uint64:
            cast_ctype = 'uint64_t'

        if cast_ctype:
            self.write(f'({cast_ctype}) ')
        self.dispatch(tree)
def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView, state_id: int, src_node: nodes.Node,
               dst_node: nodes.Node, edge: graph.MultiConnectorEdge, function_stream: CodeIOStream,
               callsite_stream: CodeIOStream):
    """
    Writes the value of an output connector of ``src_node`` to its destination.

    Without WCR the value is either assigned to the shared register of a
    destination Tasklet or stored to memory with ``svst1`` (or
    ``svst1_scatter_index`` if the stride is not 1). With WCR the vector is
    first reduced with an SVE instruction and the result is handed to
    ``cpu_codegen.write_and_resolve_expr``.

    :param sdfg: SDFG containing the nodes.
    :param dfg: State (view) used to locate the SVE scope and loop predicate.
    :param state_id: Unused by this implementation.
    :param src_node: Node producing the value.
    :param dst_node: Node receiving the value.
    :param edge: Edge between both nodes.
    :param function_stream: Unused by this implementation.
    :param callsite_stream: Stream receiving the generated code.
    :raises NotImplementedError: If not in an SVE scope, the data type has no
                                 SVE type, or a non-vector is written back.
    :raises util.NotSupportedError: If the reduction has no SVE counterpart or
                                    the WCR memlet uses the SVE loop parameter.
    """
    scope = util.get_sve_scope(sdfg, dfg, src_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    out_conn = src_node.out_connectors[edge.src_conn]
    if out_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(f'Data type {out_conn.type} not supported')

    if edge.data.wcr is not None:
        # TODO: Check what are we WCR'ing in?

        # Since we have WCR, we must determine a suitable SVE reduce instruction
        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        # If the memlet contains the innermost SVE param, we have a problem, because
        # SVE doesn't support WCR stores. This would require unrolling the loop.
        if scope.params[-1] in edge.data.free_symbols:
            raise util.NotSupportedError('SVE loop param used in WCR memlet')

        # WCR on vectors works in two steps:
        # 1. Reduce the SVE register using SVE instructions into a scalar
        # 2. WCR the scalar to memory using DaCe functionality
        sve_reduction = '{}({}, {})'.format(util.REDUCTION_TYPE_TO_SVE[reduction_type],
                                            util.get_loop_predicate(sdfg, dfg, src_node), edge.src_conn)

        if out_conn.type == np.int64:
            ptr_cast = '(long long*) '
        elif out_conn.type == np.uint64:
            ptr_cast = '(unsigned long long*) '
        else:
            ptr_cast = ''

        wcr_expr = self.cpu_codegen.write_and_resolve_expr(sdfg,
                                                           edge.data,
                                                           edge.data.wcr_nonatomic,
                                                           None,
                                                           ptr_cast + sve_reduction,
                                                           dtype=out_conn.vtype)
        callsite_stream.write(wcr_expr + ';')
        return

    # No WCR required
    if isinstance(dst_node, dace.nodes.Tasklet):
        # Writeback into a tasklet is just writing into the shared register
        callsite_stream.write(f'{edge.data.data} = {edge.src_conn};')
        return

    if not isinstance(out_conn, dtypes.vector):
        raise NotImplementedError('Writeback into non-vector')

    # If no WCR, we can directly store the vector (SVE register) in memory
    # Determine the stride of the store and use a scatter store if applicable
    stride = self.get_load_stride(sdfg, dfg, src_node, edge.data)

    if out_conn.type == np.int64:
        ptr_cast = '(int64_t*) '
    elif out_conn.type == np.uint64:
        ptr_cast = '(uint64_t*) '
    else:
        ptr_cast = ''

    store_args = '{}, {}'.format(
        util.get_loop_predicate(sdfg, dfg, src_node),
        ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer),
    )

    if stride == 1:
        callsite_stream.write(f'svst1({store_args}, {edge.src_conn});')
    else:
        callsite_stream.write(
            f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(out_conn).bytes * 8}(0, {sym2cpp(stride)}), {edge.src_conn});'
        )
def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                edge: gr.MultiConnectorEdge[mm.Memlet], function_stream: CodeIOStream,
                callsite_stream: CodeIOStream):
    """
    Generates the code that brings data into an input connector of ``dst_node``.

    * From a Tasklet: the shared register is copied into a new variable.
    * From an access node into a vector connector: an ``svld1`` load (or
      ``svld1_gather_index`` if the stride is not 1) is written.
    * Any other connector type is delegated to the CPU code generator.

    :param sdfg: SDFG containing the nodes.
    :param dfg: State used to locate the SVE scope and loop predicate.
    :param state_id: Forwarded to the CPU code generator.
    :param src_node: Node the data comes from.
    :param dst_node: Node owning the input connector.
    :param edge: Edge between both nodes.
    :param function_stream: Forwarded to the CPU code generator.
    :param callsite_stream: Stream receiving the generated code.
    :raises NotImplementedError: If not in an SVE scope, the source is a
                                 stream, or the data type has no SVE type.
    :raises util.NotSupportedError: If the source is neither a Tasklet nor an
                                    AccessNode.
    """
    # We should always be in an SVE scope
    scope = util.get_sve_scope(sdfg, dfg, dst_node)
    if scope is None:
        raise NotImplementedError('Not in an SVE scope')

    in_conn = dst_node.in_connectors[edge.dst_conn]

    if isinstance(src_node, dace.nodes.Tasklet):
        # Copy from tasklet is just copying the shared register
        # Use defined_vars to get the C++ type of the shared register
        callsite_stream.write(
            f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};')
        return

    if not isinstance(src_node, dace.nodes.AccessNode):
        raise util.NotSupportedError('Copy neither from Tasklet nor AccessNode')

    src_desc = src_node.desc(sdfg)
    if isinstance(src_desc, dace.data.Stream):
        # A copy from a stream will trigger a vector pop
        # FIXME: Issue when we can pop different amounts of data!
        # If we limit to the smallest amount, certain data will be lost (never processed)
        #
        # Disabled sketch of the intended implementation, kept for reference:
        #   # SVE register where the stream will be popped to
        #   self.create_empty_definition(in_conn, edge, callsite_stream, output=True)
        #   var_name = edge.dst_conn
        #   callsite_stream.write(f'{util.TYPE_TO_SVE[in_conn.type]} {var_name};')
        #   callsite_stream.write('{')
        #   callsite_stream.write('// Stream pop')
        #   # Pop into local buffer
        #   # 256 // in_conn.vtype.bytes
        #   n_vec = f'{util.REGISTER_BYTE_SIZE} / {in_conn.vtype.bytes}'
        #   callsite_stream.write(f'{in_conn.vtype.ctype} __tmp[{n_vec}];')
        #   callsite_stream.write(f'size_t __cnt = {edge.data.data}.pop_try(__tmp, {n_vec});')
        #   # Limit the loop predicate
        #   loop_pred = util.get_loop_predicate(sdfg, dfg, dst_node)
        #   callsite_stream.write(
        #       f'{loop_pred} = svand_z({loop_pred}, {loop_pred}, svwhilelt_b{in_conn.vtype.bytes * 8}(0ll, __cnt));')
        #   # Transfer to register
        #   callsite_stream.write(f'{var_name} = svld1({loop_pred}, __tmp);')
        #   callsite_stream.write('}')
        raise NotImplementedError('Copying from a stream (vector pop) is not implemented')

    if not isinstance(in_conn, dtypes.vector):
        # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
        self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node, dst_node, edge, function_stream,
                                     callsite_stream)
        return

    # Copy into a vector connector, so we can use svld
    if in_conn.type not in util.TYPE_TO_SVE:
        raise NotImplementedError(f'Data type {in_conn.type} not supported')

    self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector, in_conn.ctype)

    # Determine the stride of the load and use a gather if applicable
    stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

    # First part of the declaration is `type name`
    load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type], edge.dst_conn)

    # `long long` fix: cast the pointer to the fixed-width type
    ptr_cast = ''
    if in_conn.type == np.int64:
        ptr_cast = '(int64_t*) '
    elif in_conn.type == np.uint64:
        ptr_cast = '(uint64_t*) '

    # Regular load and gather share the first arguments
    load_args = '{}, {}'.format(util.get_loop_predicate(sdfg, dfg, dst_node),
                                ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer))

    if stride == 1:
        callsite_stream.write('{} = svld1({});'.format(load_lhs, load_args))
    else:
        callsite_stream.write('{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
            load_lhs, load_args,
            util.get_base_type(in_conn).bytes * 8, sym2cpp(stride)))
def generate_scope(self, sdfg: dace.SDFG, scope: ScopeSubgraphView, state_id: int, function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
    """
    Generates the predicated ``while`` loop of a (one-dimensional) SVE map
    together with the code of everything inside the scope.

    The width of the loop counter and of the predicate is taken from the base
    type of one of the SDFG's arrays (``dace.int64`` if there are none).

    :param sdfg: SDFG containing the scope.
    :param scope: Scope subgraph; its first source node is the map entry.
    :param state_id: Index of the state containing the scope.
    :param function_stream: Stream passed on to the subgraph dispatch.
    :param callsite_stream: Stream receiving the loop code.
    :raises util.NotSupportedError: If the map has more than one parameter.
    """
    entry_node = scope.source_nodes()[0]
    current_map = entry_node.map
    self.current_map = current_map

    if len(current_map.params) > 1:
        raise util.NotSupportedError('SVE map must be one dimensional')

    loop_types = list({util.get_base_type(sdfg.arrays[a].dtype) for a in sdfg.arrays})

    # Edge case if no arrays are used
    loop_type = loop_types[0] if loop_types else dace.int64
    ltype_size = loop_type.bytes

    # 64-bit counters are spelled `int64_t` in the generated code
    long_type = copy.copy(dace.int64)
    long_type.ctype = 'int64_t'

    counter_types = {1: dace.int8, 2: dace.int16, 4: dace.int32, 8: long_type}
    self.counter_type = counter_types[ltype_size]

    callsite_stream.write('{')
    self.dispatcher.defined_vars.enter_scope(scope)

    # Define all dynamic input connectors of the map entry
    state_dfg = sdfg.node(state_id)
    for e in dace.sdfg.dynamic_map_inputs(state_dfg, entry_node):
        if e.data.data == e.dst_conn:
            continue
        callsite_stream.write(
            self.cpu_codegen.memlet_definition(sdfg, e.data, False, e.dst_conn, e.dst.in_connectors[e.dst_conn]),
            sdfg, state_id, entry_node)

    param = current_map.params[0]
    begin, end, stride = (sym2cpp(r) for r in current_map.range[0])

    # Generate the SVE loop header
    # The name of our loop predicate is always __pg_{param}
    self.dispatcher.defined_vars.add('__pg_' + param, DefinedType.Scalar, 'svbool_t')

    # Declare our counting variable (e.g. i) and precompute the loop predicate for our range
    callsite_stream.write(f'{self.counter_type} {param} = {begin};')

    end_param = f'__{param}_to'
    callsite_stream.write(f'{self.counter_type} {end_param} = {end};')

    callsite_stream.write(f'svbool_t __pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});')

    # Test for the predicate
    callsite_stream.write(f'while(svptest_any(svptrue_b{ltype_size * 8}(), __pg_{param})) {{')

    # Allocate scope related memory:
    # create empty shared registers for outputs of tasklets into other tasklets
    tasklets = (node for node, _ in scope.all_nodes_recursive() if isinstance(node, nodes.Tasklet))
    for node in tasklets:
        for edge in state_dfg.out_edges(node):
            if isinstance(edge.dst, dace.nodes.Tasklet):
                self.generate_out_register(sdfg, state_dfg, edge, callsite_stream, True)

    # Dispatch the subgraph generation
    self.dispatcher.dispatch_subgraph(sdfg,
                                      scope,
                                      state_id,
                                      function_stream,
                                      callsite_stream,
                                      skip_entry_node=True,
                                      skip_exit_node=True)

    # Increase the counting variable (according to the number of processed elements)
    size_letter = {1: 'b', 2: 'h', 4: 'w', 8: 'd'}[ltype_size]
    callsite_stream.write(f'{param} += svcnt{size_letter}() * {stride};')

    # Then recompute the loop predicate
    callsite_stream.write(f'__pg_{param} = svwhilele_b{ltype_size * 8}({param}, {end_param});')

    callsite_stream.write('}')

    self.dispatcher.defined_vars.exit_scope(scope)
    callsite_stream.write('}')
def generate_writeback(self, sdfg: SDFG, state: SDFGState, map: nodes.Map,
                       edge: graph.MultiConnectorEdge[mm.Memlet], code: CodeIOStream):
    """
    Responsible for generating code for a writeback in a Tasklet, given the
    outgoing edge. This is mainly taking the temporary register and writing it
    back.

    Edges without a source connector are ignored. Destinations that are access
    nodes of a descriptor other than ``data.Array`` / ``data.Scalar`` produce
    no code.

    :param sdfg: SDFG used to resolve descriptors and pointer expressions.
    :param state: State used to follow the memlet path of ``edge``.
    :param map: Map passed to ``get_stride`` for vector stores.
    :param edge: Outgoing edge of the Tasklet.
    :param code: Stream receiving the generated code.
    :raises util.NotSupportedError: For unsupported combinations of source and
                                    destination types, or destinations that
                                    are neither Tasklets nor AccessNodes.
    """
    if edge.src_conn is None:
        return

    dst_node = state.memlet_path(edge)[-1].dst
    src_type = edge.src.out_connectors[edge.src_conn]
    src_name = edge.src_conn

    if isinstance(dst_node, nodes.Tasklet):
        ##################
        # Code->Code edges
        dst_type = edge.dst.in_connectors[edge.dst_conn]
        both_vectors = util.is_vector(src_type) and util.is_vector(dst_type)
        both_scalars = util.is_scalar(src_type) and util.is_scalar(dst_type)

        if both_vectors or both_scalars:
            # Simply write back to shared register
            code.write(f'{edge.data.data} = {src_name};')
            return
        if util.is_scalar(src_type) and util.is_vector(dst_type):
            # Scalar broadcast to shared vector register
            code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({src_name});')
            return
        raise util.NotSupportedError('Unsupported Code->Code edge')

    if not isinstance(dst_node, nodes.AccessNode):
        raise util.NotSupportedError('Only writeback to Tasklets and AccessNodes is supported')

    ##################
    # Write to AccessNode
    desc = dst_node.desc(sdfg)

    if isinstance(desc, data.Array):
        ##################
        # Write into Array
        if util.is_pointer(src_type):
            raise util.NotSupportedError('Unsupported writeback')

        if not util.is_vector(src_type):
            # Scalar write into array
            code.write(f'{cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)} = {src_name};')
            return

        # Scatter vector store into array
        stride = edge.data.get_stride(sdfg, map)

        # long long fix
        if src_type.type == np.int64:
            ptr_cast = '(int64_t*) '
        elif src_type.type == np.uint64:
            ptr_cast = '(uint64_t*) '
        else:
            ptr_cast = ''

        store_args = '{}, {}'.format(
            util.get_loop_predicate(sdfg, state, edge.src),
            ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame),
        )

        if stride == 1:
            code.write(f'svst1({store_args}, {src_name});')
        else:
            code.write(
                f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)}), {src_name});'
            )

    elif isinstance(desc, data.Scalar):
        ##################
        # Write into Scalar
        if util.is_pointer(src_type):
            raise util.NotSupportedError('Unsupported writeback')

        dst_is_vector = util.is_vector(desc.dtype)
        if util.is_vector(src_type):
            if not dst_is_vector:
                raise util.NotSupportedError('Unsupported writeback')
            # Vector write into vector Scalar access node
            code.write(f'{edge.data.data} = {src_name};')
        elif dst_is_vector:
            # Broadcast into scalar AccessNode
            code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[src_type]}({src_name});')
        else:
            # Scalar write into scalar AccessNode
            code.write(f'{edge.data.data} = {src_name};')
def generate_read(self, sdfg: SDFG, state: SDFGState, map: nodes.Map, edge: graph.MultiConnectorEdge[mm.Memlet],
                  code: CodeIOStream):
    """
    Responsible for generating code for reads into a Tasklet, given the
    ingoing edge.

    Edges without a destination connector are ignored. Sources that are access
    nodes of a descriptor other than ``data.Array`` / ``data.Scalar`` produce
    no code.

    :param sdfg: SDFG used to resolve descriptors and pointer expressions.
    :param state: State used to follow the memlet path of ``edge``.
    :param map: Map passed to ``get_stride`` for vector loads.
    :param edge: Ingoing edge of the Tasklet.
    :param code: Stream receiving the generated code.
    :raises util.NotSupportedError: For unsupported combinations of source and
                                    destination types, or sources that are
                                    neither Tasklets nor AccessNodes.
    """
    if edge.dst_conn is None:
        return

    src_node = state.memlet_path(edge)[0].src
    dst_type = edge.dst.in_connectors[edge.dst_conn]
    dst_name = edge.dst_conn

    def read_shared_register(src_type, unsupported_message):
        # Reads from a register/variable named after the memlet data; shared by
        # the Tasklet and the Scalar access node case.
        if util.is_vector(src_type) and util.is_vector(dst_type):
            # Directly read from shared vector register
            code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_scalar(dst_type):
            # Directly read from shared scalar register
            code.write(f'{dst_type} {dst_name} = {edge.data.data};')
        elif util.is_scalar(src_type) and util.is_vector(dst_type):
            # Scalar broadcast from shared scalar register
            code.write(
                f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});'
            )
        else:
            raise util.NotSupportedError(unsupported_message)

    if isinstance(src_node, nodes.Tasklet):
        ##################
        # Code->Code edges
        read_shared_register(edge.src.out_connectors[edge.src_conn], 'Unsupported Code->Code edge')
        return

    if not isinstance(src_node, nodes.AccessNode):
        raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported')

    ##################
    # Read from AccessNode
    desc = src_node.desc(sdfg)

    if isinstance(desc, data.Array):
        # Copy from array
        if util.is_pointer(dst_type):
            # Pointer reference
            code.write(
                f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};')
        elif util.is_vector(dst_type):
            # Vector load
            stride = edge.data.get_stride(sdfg, map)

            # First part of the declaration is `type name`
            load_lhs = '{} {}'.format(util.TYPE_TO_SVE[dst_type.type], dst_name)

            # long long issue casting
            if dst_type.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif dst_type.type == np.uint64:
                ptr_cast = '(uint64_t*) '
            else:
                ptr_cast = ''

            # Regular load and gather share the first arguments
            load_args = '{}, {}'.format(
                util.get_loop_predicate(sdfg, state, edge.dst),
                ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame))

            if stride == 1:
                code.write('{} = svld1({});'.format(load_lhs, load_args))
            else:
                code.write('{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                    load_lhs, load_args,
                    util.get_base_type(dst_type).bytes * 8, sym2cpp(stride)))
        else:
            # Scalar read from array
            code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};')

    elif isinstance(desc, data.Scalar):
        # Refer to shared variable
        read_shared_register(desc.dtype, 'Unsupported Scalar->Code edge')