def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta, cdtype, func) -> Dict[str, Any]:
    """
    Assemble the option dictionary used for GEMM code generation
    (with column-major order).

    :param node: The matrix-multiplication node. If it carries a truthy
                 ``transA`` / ``transB`` attribute, the shape and strides of
                 the respective operand are reversed.
    :param state: The state containing ``node``.
    :param sdfg: The SDFG containing ``state``.
    :param adesc: Descriptor of the first operand (only ``dtype`` is read).
    :param bdesc: Descriptor of the second operand (only ``dtype`` is read).
    :param cdesc: Descriptor of the output (only ``dtype`` is read).
    :param alpha: Stored under the ``'alpha'`` key as-is.
    :param beta: Stored under the ``'beta'`` key as-is.
    :param cdtype: Stored under the ``'dtype'`` key as-is.
    :param func: Stored under the ``'func'`` key as-is.
    :return: The option dictionary. ``'BATCH'`` is ``None`` if no batched
             options were returned by ``_get_batchmm_opts``.
    """
    # Avoid import loops
    from dace.codegen.targets.common import sym2cpp
    from dace.libraries.blas.blas_helpers import get_gemm_opts

    ((_, _, a_shape, a_strides),
     (_, _, b_shape, b_strides),
     (_, _, c_shape, c_strides)) = _get_matmul_operands(node, state, sdfg)

    if getattr(node, 'transA', False):
        a_shape, a_strides = list(reversed(a_shape)), list(reversed(a_strides))
    if getattr(node, 'transB', False):
        b_shape, b_strides = list(reversed(b_shape)), list(reversed(b_strides))

    opt = get_gemm_opts(a_strides, b_strides, c_strides)
    batch_opt = _get_batchmm_opts(a_shape, a_strides, b_shape, b_strides, c_shape, c_strides)

    opt.update({
        'x': '_a',
        'y': '_b',
        'xdtype': adesc.dtype,
        'ydtype': bdesc.dtype,
        'cdtype': cdesc.dtype,
        'M': sym2cpp(a_shape[-2]),
        'N': sym2cpp(b_shape[-1]),
        'K': sym2cpp(a_shape[-1]),
        'lda': sym2cpp(opt['lda']),
        'ldb': sym2cpp(opt['ldb']),
        'ldc': sym2cpp(opt['ldc']),
    })

    if opt['swap']:
        # Exchange everything that belongs to the two input operands
        if batch_opt:
            batch_opt['sa'], batch_opt['sb'] = batch_opt['sb'], batch_opt['sa']
        for first, second in (('lda', 'ldb'), ('x', 'y'), ('xdtype', 'ydtype'), ('ta', 'tb'), ('M', 'N')):
            opt[first], opt[second] = opt[second], opt[first]

    opt.update({'alpha': alpha, 'beta': beta, 'dtype': cdtype, 'func': func})

    if batch_opt:
        opt.update({
            'stride_a': sym2cpp(batch_opt['sa']),
            'stride_b': sym2cpp(batch_opt['sb']),
            'stride_c': sym2cpp(batch_opt['sc']),
            'BATCH': sym2cpp(batch_opt['b']),
        })
    else:
        opt['BATCH'] = None

    return opt
def loop_bound_str(self):
    """
    Return a C++ expression string for the loop bound of this object.

    The bound is the product of ``(step + end - begin) // step`` over all
    ``(begin, end, step)`` triples in ``self.range``. ``self.init_size`` and
    ``self.drain_size`` are appended as extra summands when they are
    non-zero and the matching ``*_overlap`` flag is falsy.

    :return: The bound as a C++ expression string.
    """
    from dace.codegen.targets.common import sym2cpp

    iterations = 1
    for begin, end, step in self.range:
        iterations *= (step + end - begin) // step

    # Add init and drain phases when relevant
    extra_terms = []
    if self.init_size != 0 and not self.init_overlap:
        extra_terms.append(" + " + sym2cpp(self.init_size))
    if self.drain_size != 0 and not self.drain_overlap:
        extra_terms.append(" + " + sym2cpp(self.drain_size))

    return sym2cpp(iterations) + "".join(extra_terms)
def cpp_offset_expr(d: data.Data, subset_in: subsets.Subset, offset=None, packed_veclen=1, indices=None):
    """ Creates a C++ expression that can be added to a pointer in order
        to offset it to the beginning of the given subset and offset.

        :param d: The data structure to use for sizes/strides.
        :param subset_in: The subset to offset by.
        :param offset: An additional list of offsets or a Subset object
        :param packed_veclen: If packed types are targeted, specifies the
                              vector length that the final offset should be
                              divided by.
        :param indices: A tuple of indices to use for expression. If falsy,
                        zeros (one per stride of ``d``) are used.
        :return: A string in C++ syntax with the correct offset
    """
    if offset is None:
        # Only the array's own offset applies
        shifted = subset_in.offset_new(d.offset, False)
    else:
        # Offset according to parameters, then offset according to array
        shifted = subset_in.offset_new(offset, False)
        shifted.offset(d.offset, False)

    # Obtain start range from offsetted subset
    start_indices = indices if indices else [0] * len(d.strides)
    start = shifted.at(start_indices, d.strides)

    if packed_veclen > 1:
        start /= packed_veclen

    return sym2cpp(start)
def on_map_entry(self, sdfg, state, node, outer_stream, inner_stream):
    """
    Emit PAPI instrumentation code at the entry of a map scope.

    Nothing is written unless the map's ``instrument`` property is
    ``PAPI_Counters``. The supersection start is always written to
    ``outer_stream``; the section start (outer) and the counter start
    (inner) follow only if ``self.should_instrument_entry(node)`` holds.

    :param sdfg: The SDFG containing the map.
    :param state: The state containing the map.
    :param node: The map entry node.
    :param outer_stream: Stream receiving code placed outside the map.
    :param inner_stream: Stream receiving code placed inside the map.
    """
    scope = state.scope_subgraph(node)
    state_id = sdfg.node_id(state)
    if node.map.instrument != dace.InstrumentationType.PAPI_Counters:
        return

    unified_id = _unified_id(scope.node_id(node), state_id)

    #########################################################
    # Outer part
    input_size: str = PAPIUtils.get_memory_input_size(node, sdfg, state_id)

    # Emit supersection if possible
    supersection = self.perf_get_supersection_start_string(node, scope, unified_id)
    outer_stream.write(supersection)

    if not self.should_instrument_entry(node):
        return

    moved = PAPIUtils.accumulate_byte_movement(node, node, scope, sdfg, state_id)
    moved_cpp = sym2cpp(sp.simplify(moved))
    outer_stream.write(self.perf_section_start_string(unified_id, moved_cpp, input_size))

    #########################################################
    # Inner part
    innermost_param = node.map.params[-1]
    start_code = self.perf_counter_start_measurement_string(unified_id, innermost_param)
    inner_stream.write(start_code, sdfg, state_id, node)
def generate_rtl_parameters(self, constants):
    """
    Construct the parameter section of a module header.

    :param constants: Mapping from parameter name to value. Values are
                      converted with ``sym2cpp``.
    :return: An empty string if there are no constants; otherwise a
             ``#( ... )`` block with one ``parameter`` entry per line,
             where every entry after the first is prefixed by a comma.
    """
    if len(constants) == 0:
        return ""

    entries = []
    for position, name in enumerate(constants):
        separator = "," if position > 0 else ""
        entries.append("{} parameter {} = {}".format(separator, name, sym2cpp(constants[name])))

    return "#(\n{}\n)".format(" " + "\n".join(entries))
def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta, cdtype, func) -> Dict[str, Any]:
    """
    Assemble the option dictionary used for GEMM code generation
    (with column-major order).

    :param node: The matrix-multiplication node.
    :param state: The state containing ``node``.
    :param sdfg: The SDFG containing ``state``.
    :param adesc: Descriptor of the first operand (not read here).
    :param bdesc: Descriptor of the second operand (not read here).
    :param cdesc: Descriptor of the output; its ``shape`` and ``strides``
                  are used.
    :param alpha: Stored under the ``'alpha'`` key as-is.
    :param beta: Stored under the ``'beta'`` key as-is.
    :param cdtype: Stored under the ``'dtype'`` key as-is.
    :param func: Stored under the ``'func'`` key as-is.
    :return: The option dictionary. ``'BATCH'`` is ``None`` if no batched
             options were returned by ``get_batchmm_opts``.
    """
    # Avoid import loops
    from dace.codegen.targets.common import sym2cpp

    (_, _, a_shape, a_strides), (_, _, b_shape, b_strides) = _get_matmul_inputs(node, state, sdfg)

    opt = get_gemm_opts(a_strides, b_strides, cdesc.strides)
    batch_opt = get_batchmm_opts(a_shape, a_strides, b_shape, b_strides, cdesc.shape, cdesc.strides)

    opt.update({
        'x': '_a',
        'y': '_b',
        'M': sym2cpp(a_shape[-2]),
        'N': sym2cpp(b_shape[-1]),
        'K': sym2cpp(a_shape[-1]),
        'lda': sym2cpp(opt['lda']),
        'ldb': sym2cpp(opt['ldb']),
        'ldc': sym2cpp(opt['ldc']),
    })

    if opt['swap']:
        # Exchange everything that belongs to the two input operands
        if batch_opt:
            batch_opt['sa'], batch_opt['sb'] = batch_opt['sb'], batch_opt['sa']
        for first, second in (('lda', 'ldb'), ('x', 'y'), ('ta', 'tb'), ('M', 'N')):
            opt[first], opt[second] = opt[second], opt[first]

    opt.update({'alpha': alpha, 'beta': beta, 'dtype': cdtype, 'func': func})

    if batch_opt:
        opt.update({
            'stride_a': sym2cpp(batch_opt['sa']),
            'stride_b': sym2cpp(batch_opt['sb']),
            'stride_c': sym2cpp(batch_opt['sc']),
            'BATCH': sym2cpp(batch_opt['b']),
        })
    else:
        opt['BATCH'] = None

    return opt
def get_memory_input_size(node, sdfg, state_id) -> str:
    """
    Compute the number of bytes entering ``node`` through its in-edges.

    For every incoming edge, the memlet's ``num_accesses`` is multiplied by
    the byte size of the element type of the accessed array; the products
    are summed up.

    :param node: The node whose incoming edges are inspected.
    :param sdfg: The SDFG containing the node.
    :param state_id: Index of the containing state in ``sdfg.nodes()``.
    :return: The accumulated size as a C++ expression string.
    """

    def _edge_bytes(memlet):
        # Accumulate over range size and get the amount of data accessed.
        # It might be better to just take the source object size
        accesses = memlet.num_accesses
        element_bytes = sdfg.arrays[memlet.data].dtype.bytes
        return element_bytes * accesses

    containing_state = sdfg.nodes()[state_id]
    total = sum(_edge_bytes(edge.data) for edge in containing_state.in_edges(node))
    return sym2cpp(total)
def visit_Subscript(self, node):
    """
    Replace a subscript of a known memlet or constant by a flat C++ index
    expression.

    Subscripts whose target is neither in ``self.memlets`` nor in
    ``self.constants`` are handed to ``generic_visit`` unchanged.

    :param node: The ``ast.Subscript`` node being visited.
    :return: An ``ast.Name`` node carrying ``target[index]`` as its id, or
             the result of ``generic_visit``.
    """
    name = rname(node)
    if not (name in self.memlets or name in self.constants):
        return self.generic_visit(node)

    index_expr = self._subscript_expr(node.slice, name)

    # The replacement is deliberately a Name node and not a Subscript:
    # a Subscript would be descended into again by the visitor, which
    # would erroneously modify the freshly generated expression.
    replacement = ast.Name(id="%s[%s]" % (name, sym2cpp(index_expr)))
    return ast.copy_location(replacement, node)
def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
    """
    Write a ``constexpr`` definition for every constant of the SDFG.

    Array constants are emitted as a flat, brace-initialized array whose
    elements are listed in C order; all other constants are emitted as a
    single value converted with ``sym2cpp``.

    :param sdfg: The SDFG whose ``constants_prop`` is traversed.
    :param callsite_stream: Stream that receives the generated code.
    """
    # Write constants
    for cstname, (csttype, cstval) in sdfg.constants_prop.items():
        if not isinstance(csttype, data.Array):
            scalar_decl = "constexpr %s %s = %s;\n" % (csttype.dtype.ctype, cstname, sym2cpp(cstval))
            callsite_stream.write(scalar_decl, sdfg)
            continue

        header = "constexpr " + csttype.dtype.ctype + \
            " " + cstname + "[" + str(cstval.size) + "] = {"
        elements = ", ".join(str(element) for element in np.nditer(cstval, order='C'))
        callsite_stream.write(header + elements + "};\n", sdfg)
def get_tasklet_byte_accesses(tasklet: nodes.CodeNode, dfg: StateGraphView, sdfg: dace.SDFG, state_id: int) -> str:
    """ Get the amount of bytes processed by `tasklet`. The formula is
        sum(inedges * size) + sum(outedges * size)

        :param tasklet: The code node whose edges are inspected.
        :param dfg: Graph view used to look up the node's edges.
        :param sdfg: The SDFG containing the node.
        :param state_id: Identifier of the containing state.
        :return: The parenthesized byte count as a C++ expression string.
    """
    byte_costs = [PAPIUtils.get_memlet_byte_size(sdfg, in_edge.data) for in_edge in dfg.in_edges(tasklet)]

    # The cost of all outgoing memlets is computed in one call
    byte_costs.append(PAPIUtils.get_out_memlet_costs(sdfg, state_id, tasklet, dfg))

    return "(" + sym2cpp(sum(byte_costs)) + ")"
def on_map_entry(self, sdfg, state, node, outer_stream, inner_stream):
    """
    Emit PAPI instrumentation code at the entry of a map scope.

    Nothing is written unless the map's ``instrument`` property is
    ``PAPI_Counters``. The supersection start is always written to
    ``outer_stream``; the section start (outer) and the counter start
    (inner) follow only if ``self.should_instrument_entry(node)`` holds.

    :param sdfg: The SDFG containing the map.
    :param state: The state containing the map.
    :param node: The map entry node.
    :param outer_stream: Stream receiving code placed outside the map.
    :param inner_stream: Stream receiving code placed inside the map.
    """
    scope = state.scope_subgraph(node)
    state_id = sdfg.node_id(state)
    if node.map.instrument != dace.InstrumentationType.PAPI_Counters:
        return

    unified_id = _unified_id(scope.node_id(node), state_id)

    #########################################################
    # Outer part
    input_size: str = PAPIUtils.get_memory_input_size(node, sdfg, state_id)

    # Emit supersection if possible
    supersection = self.perf_get_supersection_start_string(node, scope, unified_id)
    outer_stream.write(supersection)

    if not self.should_instrument_entry(node):
        return

    moved = PAPIUtils.accumulate_byte_movement(node, node, scope, sdfg, state_id)
    moved_cpp = sym2cpp(sp.simplify(moved))
    outer_stream.write(self.perf_section_start_string(unified_id, moved_cpp, input_size))

    #########################################################
    # Inner part
    if not node.map.flatten:
        iteration_var = node.map.params[-1]
    else:
        # Performance counters for flattened maps include the calculations
        # made to obtain the different axis indices
        iteration_var = "__DACEMAP_%d_%d_iter" % (state_id, state.node_id(node))

    start_code = self.perf_counter_start_measurement_string(unified_id, iteration_var)
    inner_stream.write(start_code, sdfg, state_id, node)
def ndslice_cpp(slice, dims, rowmajor=True):
    """
    Build a flat C++ index expression from a multi-dimensional index.

    :param slice: Sequence of per-dimension indices. Tuples (ranges) are
                  not supported.
    :param dims: Sizes of the indexed array's dimensions.
    :param rowmajor: Unused by this implementation.
    :return: ``"0"`` for an empty index; otherwise the index terms, each
             but the last multiplied by the sizes of all following
             dimensions and joined with ``+``.
    :raises SyntaxError: If any element of ``slice`` is a tuple.
    """
    if len(slice) == 0:  # Scalar
        return "0"

    last_position = len(slice) - 1
    pieces = []
    for position, index in enumerate(slice):
        if isinstance(index, tuple):
            raise SyntaxError("CPU backend does not yet support ranges as inputs/outputs")

        pieces.append(sym2cpp(index))

        if position < last_position:
            # We use the shape as-is since this function is intended for
            # constant arrays only
            trailing_dims = "*".join(str(dim) for dim in dims[position + 1:])
            # Multiply by leading dimensions
            pieces.append("*%s + " % trailing_dims)

    return "".join(pieces)
def on_copy_begin(self, sdfg, state, src_node, dst_node, edge, local_stream, global_stream, copy_shape, src_strides,
                  dst_strides):
    """
    Emit PAPI instrumentation code in front of a memory copy.

    The decision whether the copy is instrumented is stored in
    ``self.perf_should_instrument``. A copy is instrumented only if neither
    endpoint is already inside an instrumented scope, both endpoints use
    one of the CPU storage types listed below, and the state's
    ``instrument`` property is ``PAPI_Counters``.

    :param sdfg: The SDFG containing the copy.
    :param state: The state containing the copy.
    :param src_node: Source access node.
    :param dst_node: Destination access node.
    :param edge: The edge describing the copy.
    :param local_stream: Stream that receives the generated code.
    :param global_stream: Unused here.
    :param copy_shape: Shape of the copied region.
    :param src_strides: Unused here.
    :param dst_strides: Strides of the destination.
    """
    if not self._papi_used:
        return

    state_id = sdfg.node_id(state)
    memlet = edge.data

    # For perfcounters, we have to make sure that:
    # 1) No other measurements are done for the containing scope (no map
    #    operation containing this copy is instrumented)
    src_covered = PAPIInstrumentation.has_surrounding_perfcounters(src_node, state)
    dst_covered = PAPIInstrumentation.has_surrounding_perfcounters(dst_node, state)

    src_storage = src_node.desc(sdfg).storage
    dst_storage = dst_node.desc(sdfg).storage

    cpu_storages = (
        dtypes.StorageType.CPU_Heap,
        dtypes.StorageType.CPU_ThreadLocal,
        dtypes.StorageType.CPU_Pinned,
        dtypes.StorageType.Register,
    )
    cpu_only = (src_storage in cpu_storages) and (dst_storage in cpu_storages)

    self.perf_should_instrument = (not src_covered and not dst_covered and cpu_only
                                   and state.instrument == dace.InstrumentationType.PAPI_Counters)
    if self.perf_should_instrument is False:
        return

    copy_id = self.get_unique_number()
    element_ctype = dst_node.desc(sdfg).dtype.ctype

    element_count = (" * ".join(sym2cpp(copy_shape)) + " / " + "/".join(sym2cpp(dst_strides)))
    copy_size = "sizeof(%s) * (%s)" % (element_ctype, element_count)
    node_id = _unified_id(state.node_id(dst_node), state_id)
    endpoints = [src_node, dst_node]

    # Mark a section start (this is not really a section in itself (it
    # would be a section with 1 entry))
    section_start = self.perf_section_start_string(node_id, copy_size, copy_size)
    local_stream.write(section_start, sdfg, state_id, endpoints)

    counter_start = ''' dace::perf::{pcs} __perf_cpy_{nodeid}_{unique_id}; auto& __vs_cpy_{nodeid}_{unique_id} = __perf_store.getNewValueSet( __perf_cpy_{nodeid}_{unique_id}, {nodeid}, PAPI_thread_id(), {size}, dace::perf::ValueSetType::Copy); __perf_cpy_{nodeid}_{unique_id}.enterCritical();'''.format(
        pcs=self.perf_counter_string(),
        nodeid=node_id,
        unique_id=copy_id,
        size=copy_size,
    )
    local_stream.write(counter_start, sdfg, state_id, [src_node, dst_node])
def visit_Assign(self, node):
    """
    Rewrite an assignment to a known memlet into C++ source text.

    Only the last assignment target is considered. If its name is not a key
    of ``self.memlets``, the node is handed to ``generic_visit``.

    :param node: The ``ast.Assign`` node being visited.
    :return: The result of ``generic_visit``, the modified ``node`` (for
             non-subscript targets with write-conflict resolution), or the
             result of ``self._replace_assignment`` with an ``ast.Name``
             node whose id holds the generated C++ statement.
    """
    target = rname(node.targets[-1])
    if target not in self.memlets:
        return self.generic_visit(node)

    memlet, nc, wcr, dtype = self.memlets[target]
    # The right-hand side is visited first so it is already transformed
    value = self.visit(node.value)

    if not isinstance(node.targets[-1], ast.Subscript):
        # Dynamic accesses or streams -> every access counts
        try:
            if memlet and memlet.data and (memlet.dynamic or isinstance(
                    self.sdfg.arrays[memlet.data], data.Stream)):
                if wcr is not None:
                    # Write-conflict resolution: only the value is replaced,
                    # the assignment node itself is kept
                    newnode = ast.Name(
                        id=self.codegen.write_and_resolve_expr(
                            self.sdfg,
                            memlet,
                            nc,
                            target,
                            cppunparse.cppunparse(value, expr_semicolon=False),
                            dtype=dtype))
                    node.value = ast.copy_location(newnode, node.value)
                    return node
                elif isinstance(self.sdfg.arrays[memlet.data], data.Stream):
                    # Assigning to a stream becomes a push
                    newnode = ast.Name(id="%s.push(%s);" % (
                        memlet.data,
                        cppunparse.cppunparse(value, expr_semicolon=False),
                    ))
                else:
                    # Only var_type is used below; ctypedef is ignored
                    var_type, ctypedef = self.codegen._dispatcher.defined_vars.get(
                        memlet.data)
                    if var_type == DefinedType.Scalar:
                        newnode = ast.Name(id="%s = %s;" % (
                            memlet.data,
                            cppunparse.cppunparse(value, expr_semicolon=False),
                        ))
                    else:
                        newnode = ast.Name(id="%s = %s;" % (
                            cpp_array_expr(self.sdfg, memlet),
                            cppunparse.cppunparse(value, expr_semicolon=False),
                        ))

                return self._replace_assignment(newnode, node)
        except TypeError:  # cannot determine truth value of Relational
            pass

        # Not a dynamic/stream access (or the check above raised TypeError)
        return self.generic_visit(node)

    # Subscript target: build the index expression for the memlet
    subscript = self._subscript_expr(node.targets[-1].slice, target)

    if wcr is not None:
        newnode = ast.Name(id=self.codegen.write_and_resolve_expr(
            self.sdfg,
            memlet,
            nc,
            target,
            cppunparse.cppunparse(value, expr_semicolon=False),
            indices=sym2cpp(subscript),
            dtype=dtype) + ';')
    else:
        newnode = ast.Name(
            id="%s[%s] = %s;" %
            (target, sym2cpp(subscript),
             cppunparse.cppunparse(value, expr_semicolon=False)))

    return self._replace_assignment(newnode, node)
def copy_expr(
    dispatcher,
    sdfg,
    dataname,
    memlet,
    offset=None,
    relative_offset=True,
    packed_types=False,
):
    """
    Create the C++ expression that addresses the data accessed by a copy.

    :param dispatcher: Object whose ``defined_vars`` maps data names to
                       their defined type.
    :param sdfg: The SDFG whose ``arrays`` contains ``dataname``.
    :param dataname: Name of the data container.
    :param memlet: The memlet of the copy (``subset`` and ``veclen`` are
                   read).
    :param offset: Optional offset; a ``Subset`` or a list of indices.
    :param relative_offset: If True, ``offset`` is applied relative to the
                            memlet's subset; otherwise ``offset`` itself is
                            used as the subset.
    :param packed_types: If True, the memlet's vector length is passed on
                         to ``cpp_offset_expr`` instead of being applied
                         through a ``dace::vec`` pointer cast.
    :return: The C++ expression as a string.
    :raises TypeError: If a non-zero offset is requested for a scalar,
                       stream or stream view.
    :raises NotImplementedError: For any other defined type.
    """
    datadesc = sdfg.arrays[dataname]

    if relative_offset:
        base_subset, extra_offset = memlet.subset, offset
    else:
        extra_offset = None
        if offset is None:
            base_subset = None
        elif isinstance(offset, subsets.Subset):
            base_subset = offset
        else:
            base_subset = subsets.Indices(offset)

    if base_subset is None:
        offset_cppstr = "0"
    else:
        offset_cppstr = cpp_offset_expr(datadesc, base_subset, extra_offset, memlet.veclen if packed_types else 1)

    pointer_cast = ""
    if memlet.veclen != 1 and not packed_types:
        offset_cppstr = "(%s) / %s" % (offset_cppstr, sym2cpp(memlet.veclen))
        pointer_cast = "(dace::vec<%s, %s> *)" % (
            datadesc.dtype.ctype,
            sym2cpp(memlet.veclen),
        )

    def_type = dispatcher.defined_vars.get(dataname)

    add_offset = offset_cppstr != "0"
    offset_suffix = " + {}".format(offset_cppstr) if add_offset else ""

    if def_type == DefinedType.Pointer:
        return "{}{}{}".format(pointer_cast, dataname, offset_suffix)

    if def_type == DefinedType.ArrayView:
        return "{}{}.ptr(){}".format(pointer_cast, dataname, offset_suffix)

    if def_type == DefinedType.StreamArray:
        return "{}[{}]".format(dataname, offset_cppstr)

    if def_type in [DefinedType.Scalar, DefinedType.Stream, DefinedType.StreamView]:
        if add_offset:
            raise TypeError("Tried to offset address of scalar {}: {}".format(dataname, offset_cppstr))
        if def_type == DefinedType.Scalar:
            return "{}&{}".format(pointer_cast, dataname)
        return dataname

    raise NotImplementedError("copy_expr not implemented "
                              "for connector type: {}".format(def_type))
def on_node_end(self, sdfg, state, node, outer_stream, inner_stream, global_stream):
    """
    Emit PAPI instrumentation code after a node.

    Code nodes and reduce nodes are handled; everything is written to
    ``inner_stream`` and only if the node's ``instrument`` property is
    ``PAPI_Counters``.

    :param sdfg: The SDFG containing the node.
    :param state: The state containing the node.
    :param node: The node that has just been generated.
    :param outer_stream: Unused here.
    :param inner_stream: Stream that receives the generated code.
    :param global_stream: Unused here.
    """
    if not self._papi_used:
        return

    state_id = sdfg.node_id(state)
    unified_id = _unified_id(state.node_id(node), state_id)

    if isinstance(node, nodes.CodeNode):
        if node.instrument != dace.InstrumentationType.PAPI_Counters:
            return

        if not PAPIInstrumentation.has_surrounding_perfcounters(node, state):
            inner_stream.write(
                "__perf_%s.leaveCritical(__perf_vs_%s);" % (node.label, node.label),
                sdfg,
                state_id,
                node,
            )

        # Add bytes moved
        accessed_bytes = PAPIUtils.get_tasklet_byte_accesses(node, state, sdfg, state_id)
        inner_stream.write("__perf_store.addBytesMoved(%s);" % accessed_bytes, sdfg, state_id, node)
        return

    if not isinstance(node, nodes.Reduce):
        return

    #############################################################
    # Instrumentation: Post-Reduce (pre-braces)
    byte_moved_measurement = "__perf_store.addBytesMoved(%s);\n"

    # For reductions, we assume Read-Modify-Write for all operations
    # Every reduction statement costs sizeof(input) + sizeof(output).
    # This is wrong with some custom reductions or extending operations
    # (e.g., i32 * i32 => i64)
    # It also is wrong for write-avoiding min/max (min/max that only
    # overwrite the reduced variable when it needs to be changed)
    if node.instrument != dace.InstrumentationType.PAPI_Counters:
        return

    input_memlet = state.in_edges(node)[0].data
    output_memlet = state.out_edges(node)[0].data
    num_reduced_inputs = input_memlet.subset.num_elements()

    moved_expr = "%s * (sizeof(%s) + sizeof(%s))" % (
        sym2cpp(num_reduced_inputs),
        sdfg.arrays[output_memlet.data].dtype.ctype,
        sdfg.arrays[input_memlet.data].dtype.ctype,
    )
    inner_stream.write(byte_moved_measurement % moved_expr, sdfg, state_id, node)

    if not self.has_surrounding_perfcounters(node, state):
        inner_stream.write(
            self.perf_counter_end_measurement_string(unified_id),
            sdfg,
            state_id,
            node,
        )
def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream, global_stream):
    """
    Emit PAPI instrumentation code in front of a node.

    A node is instrumented only if its ``instrument`` property is
    ``PAPI_Counters`` and it is not already inside an instrumented scope.
    Tasklets get a counter object, a value set and an ``enterCritical()``
    call on ``inner_stream``. Reduce nodes get a (super)section start on
    ``outer_stream`` and a counter start on ``inner_stream``.

    :param sdfg: The SDFG containing the node.
    :param state: The state containing the node.
    :param node: The node about to be generated.
    :param outer_stream: Stream receiving code placed outside the node.
    :param inner_stream: Stream receiving code placed inside the node.
    :param global_stream: Unused here.
    """
    if not self._papi_used:
        return

    state_id = sdfg.node_id(state)
    unified_id = _unified_id(state.node_id(node), state_id)

    perf_should_instrument = (node.instrument == dace.InstrumentationType.PAPI_Counters
                              and not PAPIInstrumentation.has_surrounding_perfcounters(node, state))
    if not perf_should_instrument:
        return

    if isinstance(node, nodes.Tasklet):
        inner_stream.write(
            "dace::perf::%s __perf_%s;\n" % (self.perf_counter_string(), node.label),
            sdfg,
            state_id,
            node,
        )
        inner_stream.write(
            'auto& __perf_vs_%s = __perf_store.getNewValueSet(__perf_%s, '
            ' %d, PAPI_thread_id(), 0);\n' % (node.label, node.label, unified_id),
            sdfg,
            state_id,
            node,
        )

        inner_stream.write("__perf_%s.enterCritical();\n" % node.label, sdfg, state_id, node)
    elif isinstance(node, nodes.Reduce):
        # unified_id computed above is reused; it was previously recomputed
        # here from the same arguments.
        input_size: str = PAPIUtils.get_memory_input_size(node, sdfg, state_id)

        # For measuring the memory bandwidth, we analyze the amount of data
        # moved.
        result = outer_stream
        perf_expected_data_movement_sympy = 1

        input_memlet = state.in_edges(node)[0].data
        output_memlet = state.out_edges(node)[0].data

        # If axes were not defined, use all input dimensions
        input_dims = input_memlet.subset.dims()
        output_dims = output_memlet.subset.data_dims()
        axes = node.axes
        if axes is None:
            axes = tuple(range(input_dims))

        isize = input_memlet.subset.size()
        # The output extents must come from the output memlet. They were
        # previously read from the input memlet, so the product below mixed
        # input extents into the output term.
        osize = output_memlet.subset.size()

        # Expected movement: all output elements times the reduced extents
        for axis in range(output_dims):
            perf_expected_data_movement_sympy *= osize[axis]
        for axis in axes:
            perf_expected_data_movement_sympy *= isize[axis]

        if not dace.sdfg.is_parallel(state, node):
            # We put a start marker, but only if we are in a serial state
            result.write(
                self.perf_supersection_start_string(unified_id),
                sdfg,
                state_id,
                node,
            )

        result.write(
            self.perf_section_start_string(
                unified_id,
                sym2cpp(sp.simplify(perf_expected_data_movement_sympy)) + (" * (sizeof(%s) + sizeof(%s))" % (
                    sdfg.arrays[output_memlet.data].dtype.ctype,
                    sdfg.arrays[input_memlet.data].dtype.ctype,
                )),
                input_size,
            ),
            sdfg,
            state_id,
            node,
        )

        #############################################################
        # Internal part
        result = inner_stream
        result.write(
            self.perf_counter_start_measurement_string(unified_id, '__o%d' % (output_dims - 1)),
            sdfg,
            state_id,
            node,
        )