Example #1
0
def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta,
                           cdtype, func) -> Dict[str, Any]:
    """ Build the option map used for GEMM code generation (column-major).

        The map returned by ``get_gemm_opts`` is extended with operand names,
        operand data types, stringified sizes and leading dimensions, the
        scaling factors, and (when batched options exist) batch strides.
        :param node: The matrix-multiplication node; its optional ``transA``
                     and ``transB`` attributes are honored.
        :param state: The state containing ``node``.
        :param sdfg: The SDFG containing ``state``.
        :param adesc: Descriptor of the first input (only ``dtype`` is read).
        :param bdesc: Descriptor of the second input (only ``dtype`` is read).
        :param cdesc: Descriptor of the output (only ``dtype`` is read).
        :param alpha: Stored as-is under ``'alpha'``.
        :param beta: Stored as-is under ``'beta'``.
        :param cdtype: Stored as-is under ``'dtype'``.
        :param func: Stored as-is under ``'func'``.
        :return: The populated option dictionary.
    """
    # Imported locally to avoid import loops
    from dace.codegen.targets.common import sym2cpp
    from dace.libraries.blas.blas_helpers import get_gemm_opts

    a_operand, b_operand, c_operand = _get_matmul_operands(node, state, sdfg)
    _, _, ashape, astride = a_operand
    _, _, bshape, bstride = b_operand
    _, _, cshape, cstride = c_operand

    # A transposed input is described by its reversed shape and strides
    if getattr(node, 'transA', False):
        ashape = [*reversed(ashape)]
        astride = [*reversed(astride)]
    if getattr(node, 'transB', False):
        bshape = [*reversed(bshape)]
        bstride = [*reversed(bstride)]

    opt = get_gemm_opts(astride, bstride, cstride)
    bopt = _get_batchmm_opts(ashape, astride, bshape, bstride, cshape, cstride)

    opt.update({
        'x': '_a',
        'y': '_b',
        'xdtype': adesc.dtype,
        'ydtype': bdesc.dtype,
        'cdtype': cdesc.dtype,
        'M': sym2cpp(ashape[-2]),
        'N': sym2cpp(bshape[-1]),
        'K': sym2cpp(ashape[-1]),
    })
    for leading_dim in ('lda', 'ldb', 'ldc'):
        opt[leading_dim] = sym2cpp(opt[leading_dim])

    if opt['swap']:
        # Exchange the roles of the two inputs in every paired option
        if bopt:
            bopt['sa'], bopt['sb'] = bopt['sb'], bopt['sa']
        paired_keys = (('lda', 'ldb'), ('x', 'y'), ('xdtype', 'ydtype'),
                       ('ta', 'tb'), ('M', 'N'))
        for first, second in paired_keys:
            opt[first], opt[second] = opt[second], opt[first]

    opt['alpha'] = alpha
    opt['beta'] = beta
    opt['dtype'] = cdtype
    opt['func'] = func

    if not bopt:
        opt['BATCH'] = None
        return opt

    batch_keys = (('stride_a', 'sa'), ('stride_b', 'sb'), ('stride_c', 'sc'),
                  ('BATCH', 'b'))
    for opt_key, batch_key in batch_keys:
        opt[opt_key] = sym2cpp(bopt[batch_key])

    return opt
Example #2
0
 def loop_bound_str(self):
     """ Returns the loop bound as a C++ expression string: the product of
         the iteration counts over ``self.range``, plus the init and drain
         sizes when they are nonzero and not overlapped. """
     from dace.codegen.targets.common import sym2cpp

     # Multiply the number of iterations of every (begin, end, step) range
     iterations = 1
     for begin, end, step in self.range:
         iterations *= (step + end - begin) // step

     # Add init and drain phases when relevant
     extra_terms = []
     if self.init_size != 0 and not self.init_overlap:
         extra_terms.append(" + " + sym2cpp(self.init_size))
     if self.drain_size != 0 and not self.drain_overlap:
         extra_terms.append(" + " + sym2cpp(self.drain_size))

     return sym2cpp(iterations) + "".join(extra_terms)
Example #3
0
def cpp_offset_expr(d: data.Data,
                    subset_in: subsets.Subset,
                    offset=None,
                    packed_veclen=1,
                    indices=None):
    """ Builds a C++ expression that, added to a pointer, moves it to the
        start of the given subset (after applying the requested offsets).
        :param d: The data descriptor providing ``offset`` and ``strides``.
        :param subset_in: The subset to offset by (not modified; a new
                          offsetted subset is created from it).
        :param offset: An additional list of offsets or a Subset object,
                       applied before the descriptor's own offset.
        :param packed_veclen: If packed types are targeted, the vector
                              length the final offset is divided by
                              (only applied when greater than 1).
        :param indices: Indices to evaluate the subset at; defaults to
                        all zeros, one per stride.
        :return: A string in C++ syntax with the resulting offset.
    """
    # Apply the caller's offset (if any) first, then the array's own offset
    if offset is None:
        shifted = subset_in.offset_new(d.offset, False)
    else:
        shifted = subset_in.offset_new(offset, False)
        shifted.offset(d.offset, False)

    # Evaluate the offsetted subset at the requested (or zero) indices
    start_indices = indices or ([0] * len(d.strides))
    linear_index = shifted.at(start_indices, d.strides)

    if packed_veclen > 1:
        linear_index /= packed_veclen

    return sym2cpp(linear_index)
Example #4
0
    def on_map_entry(self, sdfg, state, node, outer_stream, inner_stream):
        """ Emits PAPI instrumentation code at the entry of a map scope.

            Nothing is emitted unless the map is instrumented with
            ``PAPI_Counters``. The supersection start is written to
            ``outer_stream``; if ``should_instrument_entry`` accepts the
            node, the section start follows on ``outer_stream`` and the
            counter start measurement is written to ``inner_stream``.
        """
        dfg = state.scope_subgraph(node)
        state_id = sdfg.node_id(state)
        if node.map.instrument != dace.InstrumentationType.PAPI_Counters:
            return

        unified_id = _unified_id(dfg.node_id(node), state_id)
        input_size: str = PAPIUtils.get_memory_input_size(node, sdfg, state_id)

        #########################################################
        # Outer part

        # Emit supersection if possible
        supersection_start = self.perf_get_supersection_start_string(node, dfg, unified_id)
        outer_stream.write(supersection_start)

        if not self.should_instrument_entry(node):
            return

        moved_bytes = PAPIUtils.accumulate_byte_movement(node, node, dfg, sdfg, state_id)
        moved_bytes_cpp = sym2cpp(sp.simplify(moved_bytes))
        outer_stream.write(self.perf_section_start_string(unified_id, moved_bytes_cpp, input_size))

        #########################################################
        # Inner part

        # The measurement is keyed on the last map parameter
        last_param = node.map.params[-1]
        measurement_start = self.perf_counter_start_measurement_string(unified_id, last_param)
        inner_stream.write(measurement_start, sdfg, state_id, node)
Example #5
0
 def generate_rtl_parameters(self, constants):
     """ Constructs the parameter section of a module header.

         :param constants: Mapping of parameter name to value.
         :return: An empty string if there are no constants; otherwise a
                  ``#( ... )`` block with one ``parameter name = value``
                  entry per line, comma-prefixed from the second entry on.
     """
     if len(constants) == 0:
         return str()

     declarations = []
     for position, name in enumerate(constants):
         separator = "," if position > 0 else ""
         declarations.append("{} parameter {} = {}".format(
             separator, name, sym2cpp(constants[name])))

     return "#(\n{}\n)".format(" " + "\n".join(declarations))
Example #6
0
def _get_codegen_gemm_opts(node, state, sdfg, adesc, bdesc, cdesc, alpha, beta,
                           cdtype, func) -> Dict[str, Any]:
    """ Build the option map used for GEMM code generation (column-major).

        Starts from ``get_gemm_opts`` and adds operand names, stringified
        sizes and leading dimensions, the scaling factors, and (when batched
        options exist) the batch strides and batch count.
        :param node: The matrix-multiplication node.
        :param state: The state containing ``node``.
        :param sdfg: The SDFG containing ``state``.
        :param adesc: Descriptor of the first input (not read here).
        :param bdesc: Descriptor of the second input (not read here).
        :param cdesc: Descriptor of the output; ``shape`` and ``strides``
                      are read.
        :param alpha: Stored as-is under ``'alpha'``.
        :param beta: Stored as-is under ``'beta'``.
        :param cdtype: Stored as-is under ``'dtype'``.
        :param func: Stored as-is under ``'func'``.
        :return: The populated option dictionary.
    """
    # Imported locally to avoid import loops
    from dace.codegen.targets.common import sym2cpp

    a_input, b_input = _get_matmul_inputs(node, state, sdfg)
    _, _, ashape, astride = a_input
    _, _, bshape, bstride = b_input

    opt = get_gemm_opts(astride, bstride, cdesc.strides)
    bopt = get_batchmm_opts(ashape, astride, bshape, bstride, cdesc.shape,
                            cdesc.strides)

    opt.update({
        'x': '_a',
        'y': '_b',
        'M': sym2cpp(ashape[-2]),
        'N': sym2cpp(bshape[-1]),
        'K': sym2cpp(ashape[-1]),
    })
    for leading_dim in ('lda', 'ldb', 'ldc'):
        opt[leading_dim] = sym2cpp(opt[leading_dim])

    if opt['swap']:
        # Exchange the roles of the two inputs in every paired option
        if bopt:
            bopt['sa'], bopt['sb'] = bopt['sb'], bopt['sa']
        for first, second in (('lda', 'ldb'), ('x', 'y'), ('ta', 'tb'),
                              ('M', 'N')):
            opt[first], opt[second] = opt[second], opt[first]

    opt['alpha'] = alpha
    opt['beta'] = beta
    opt['dtype'] = cdtype
    opt['func'] = func

    if not bopt:
        opt['BATCH'] = None
        return opt

    for opt_key, batch_key in (('stride_a', 'sa'), ('stride_b', 'sb'),
                               ('stride_c', 'sc'), ('BATCH', 'b')):
        opt[opt_key] = sym2cpp(bopt[batch_key])

    return opt
Example #7
0
File: papi.py Project: mfkiwl/dace
    def get_memory_input_size(node, sdfg, state_id) -> str:
        """ Returns a C++ expression for the amount of input data of ``node``.

            For every incoming edge of ``node`` in the state with index
            ``state_id``, the element size in bytes of the accessed array is
            multiplied by the memlet's number of accesses; the sum over all
            edges is converted to a string with ``sym2cpp``.
        """
        curr_state = sdfg.nodes()[state_id]

        # It might be better to just take the source object size
        total_bytes = sum(
            sdfg.arrays[in_edge.data.data].dtype.bytes *
            in_edge.data.num_accesses
            for in_edge in curr_state.in_edges(node))

        return sym2cpp(total_bytes)
Example #8
0
    def visit_Subscript(self, node):
        """ Replaces a subscript on a known memlet or constant with a single
            name node whose identifier is the text ``target[index]``.
            Subscripts on any other name are visited generically. """
        target = rname(node)
        is_tracked = target in self.memlets or target in self.constants
        if not is_tracked:
            return self.generic_visit(node)

        index_cpp = sym2cpp(self._subscript_expr(node.slice, target))

        # The replacement is deliberately an ast.Name and not an
        # ast.Subscript: otherwise the visitor would recurse into the new
        # expression and modify it erroneously.
        replacement = ast.Name(id="%s[%s]" % (target, index_cpp))

        return ast.copy_location(replacement, node)
Example #9
0
 def generate_constants(self, sdfg: SDFG, callsite_stream: CodeIOStream):
     """ Writes one ``constexpr`` definition per entry of
         ``sdfg.constants_prop`` to ``callsite_stream``.

         Array constants are emitted as brace-initialized C arrays with
         their elements in C order; every other constant is emitted as a
         single value converted with ``sym2cpp``.
     """
     # Write constants
     for cstname, (csttype, cstval) in sdfg.constants_prop.items():
         if not isinstance(csttype, data.Array):
             declaration = "constexpr %s %s = %s;\n" % (
                 csttype.dtype.ctype, cstname, sym2cpp(cstval))
             callsite_stream.write(declaration, sdfg)
             continue

         header = ("constexpr " + csttype.dtype.ctype + " " + cstname + "[" +
                   str(cstval.size) + "] = {")
         # Each element is stringified while the iterator points at it
         elements = ", ".join(
             str(element) for element in np.nditer(cstval, order='C'))
         callsite_stream.write(header + elements + "};\n", sdfg)
Example #10
0
    def get_tasklet_byte_accesses(tasklet: nodes.CodeNode, dfg: StateGraphView, sdfg: dace.SDFG, state_id: int) -> str:
        """ Returns a parenthesized C++ expression for the number of bytes
            processed by `tasklet`: the byte size of every incoming memlet
            plus the cost reported by `PAPIUtils.get_out_memlet_costs`. """
        # Input side: one term per incoming edge
        byte_terms = [PAPIUtils.get_memlet_byte_size(sdfg, in_edge.data) for in_edge in dfg.in_edges(tasklet)]

        # Output side: a single aggregated term
        byte_terms.append(PAPIUtils.get_out_memlet_costs(sdfg, state_id, tasklet, dfg))

        return "(" + sym2cpp(sum(byte_terms)) + ")"
Example #11
0
    def on_map_entry(self, sdfg, state, node, outer_stream, inner_stream):
        """ Emits PAPI instrumentation code at the entry of a map scope.

            Nothing is emitted unless the map is instrumented with
            ``PAPI_Counters``. The supersection start is written to
            ``outer_stream``; if ``should_instrument_entry`` accepts the
            node, the section start follows on ``outer_stream`` and the
            counter start measurement is written to ``inner_stream``.
        """
        dfg = state.scope_subgraph(node)
        state_id = sdfg.node_id(state)
        if node.map.instrument != dace.InstrumentationType.PAPI_Counters:
            return

        unified_id = _unified_id(dfg.node_id(node), state_id)
        input_size: str = PAPIUtils.get_memory_input_size(node, sdfg, state_id)

        #########################################################
        # Outer part

        # Emit supersection if possible
        supersection_start = self.perf_get_supersection_start_string(
            node, dfg, unified_id)
        outer_stream.write(supersection_start)

        if not self.should_instrument_entry(node):
            return

        moved_bytes = PAPIUtils.accumulate_byte_movement(
            node, node, dfg, sdfg, state_id)
        moved_bytes_cpp = sym2cpp(sp.simplify(moved_bytes))
        outer_stream.write(
            self.perf_section_start_string(unified_id, moved_bytes_cpp,
                                           input_size))

        #########################################################
        # Inner part

        # Performance counters for flattened maps include the calculations
        # made to obtain the different axis indices, so those maps are keyed
        # on the flattened iteration variable instead of the last parameter.
        map_name = ("__DACEMAP_%d_%d_iter" % (state_id, state.node_id(node))
                    if node.map.flatten else node.map.params[-1])

        measurement_start = self.perf_counter_start_measurement_string(
            unified_id, map_name)
        inner_stream.write(measurement_start, sdfg, state_id, node)
Example #12
0
File: cpp.py Project: orausch/dace
    def ndslice_cpp(slice, dims, rowmajor=True):
        """ Builds a C++ index expression from a multi-dimensional index.

            :param slice: Sequence of per-dimension indices; an empty
                          sequence denotes a scalar. Tuples (ranges) are
                          rejected.
            :param dims: Array dimensions, used as-is for the multipliers.
            :param rowmajor: Accepted but not read by this function.
            :return: ``"0"`` for a scalar; otherwise each index followed by
                     ``*<product of the remaining dims> + `` (all but the
                     last), concatenated.
            :raise SyntaxError: If any index is a tuple.
        """
        if len(slice) == 0:  # Scalar
            return "0"

        last_position = len(slice) - 1
        terms = []
        for position, index in enumerate(slice):
            if isinstance(index, tuple):
                raise SyntaxError(
                    "CPU backend does not yet support ranges as inputs/outputs"
                )

            term = sym2cpp(index)
            if position < last_position:
                # We use the shape as-is since this function is intended for
                # constant arrays only. Multiply by leading dimensions.
                trailing_dims = "*".join(
                    str(dim) for dim in dims[position + 1:])
                term += "*%s + " % trailing_dims
            terms.append(term)

        return "".join(terms)
Example #13
0
File: papi.py Project: mfkiwl/dace
    def on_copy_begin(self, sdfg, state, src_node, dst_node, edge,
                      local_stream, global_stream, copy_shape, src_strides,
                      dst_strides):
        if not self._papi_used:
            return

        state_id = sdfg.node_id(state)
        memlet = edge.data

        # For perfcounters, we have to make sure that:
        # 1) No other measurements are done for the containing scope (no map
        # operation containing this copy is instrumented)
        src_instrumented = PAPIInstrumentation.has_surrounding_perfcounters(
            src_node, state)
        dst_instrumented = PAPIInstrumentation.has_surrounding_perfcounters(
            dst_node, state)
        src_storage = src_node.desc(sdfg).storage
        dst_storage = dst_node.desc(sdfg).storage

        cpu_storage_types = [
            dtypes.StorageType.CPU_Heap,
            dtypes.StorageType.CPU_ThreadLocal,
            dtypes.StorageType.CPU_Pinned,
            dtypes.StorageType.Register,
        ]

        perf_cpu_only = (src_storage
                         in cpu_storage_types) and (dst_storage
                                                    in cpu_storage_types)

        self.perf_should_instrument = (
            not src_instrumented and not dst_instrumented and perf_cpu_only
            and state.instrument == dace.InstrumentationType.PAPI_Counters)

        if self.perf_should_instrument is False:
            return

        unique_cpy_id = self.get_unique_number()

        dst_nodedesc = dst_node.desc(sdfg)
        ctype = dst_nodedesc.dtype.ctype

        fac3 = (" * ".join(sym2cpp(copy_shape)) + " / " +
                "/".join(sym2cpp(dst_strides)))
        copy_size = "sizeof(%s) * (%s)" % (ctype, fac3)
        node_id = _unified_id(state.node_id(dst_node), state_id)
        # Mark a section start (this is not really a section in itself (it
        # would be a section with 1 entry))
        local_stream.write(
            self.perf_section_start_string(node_id, copy_size, copy_size),
            sdfg,
            state_id,
            [src_node, dst_node],
        )
        local_stream.write(
            '''
dace::perf::{pcs} __perf_cpy_{nodeid}_{unique_id};
auto& __vs_cpy_{nodeid}_{unique_id} = __perf_store.getNewValueSet(
    __perf_cpy_{nodeid}_{unique_id}, {nodeid}, PAPI_thread_id(), {size}, 
    dace::perf::ValueSetType::Copy);
__perf_cpy_{nodeid}_{unique_id}.enterCritical();'''.format(
                pcs=self.perf_counter_string(),
                nodeid=node_id,
                unique_id=unique_cpy_id,
                size=copy_size,
            ),
            sdfg,
            state_id,
            [src_node, dst_node],
        )
Example #14
0
    def visit_Assign(self, node):
        """ Rewrites an assignment whose last target is a known memlet into
            a name node carrying the equivalent C++ write statement.

            Assignments to names that are not in ``self.memlets`` are visited
            generically. For non-subscript targets, only dynamic memlets and
            streams are rewritten; for subscript targets, the subscript is
            converted to an index expression and an indexed write (or a
            write-conflict-resolution expression) is emitted.
            :param node: The ``ast.Assign`` node being visited.
            :return: The node to use in place of ``node``.
        """
        # Only the last assignment target is inspected
        target = rname(node.targets[-1])
        if target not in self.memlets:
            return self.generic_visit(node)

        memlet, nc, wcr, dtype = self.memlets[target]
        value = self.visit(node.value)

        if not isinstance(node.targets[-1], ast.Subscript):
            # Dynamic accesses or streams -> every access counts
            # NOTE(review): the handler below catches TypeError from any call
            # inside this block, not only from evaluating the condition.
            try:
                if memlet and memlet.data and (memlet.dynamic or isinstance(
                        self.sdfg.arrays[memlet.data], data.Stream)):
                    if wcr is not None:
                        # Write-conflict resolution: only the right-hand side
                        # is replaced and the assignment node itself is kept
                        newnode = ast.Name(
                            id=self.codegen.write_and_resolve_expr(
                                self.sdfg,
                                memlet,
                                nc,
                                target,
                                cppunparse.cppunparse(value,
                                                      expr_semicolon=False),
                                dtype=dtype))
                        node.value = ast.copy_location(newnode, node.value)
                        return node
                    elif isinstance(self.sdfg.arrays[memlet.data],
                                    data.Stream):
                        # Stream without WCR: the value is pushed
                        newnode = ast.Name(id="%s.push(%s);" % (
                            memlet.data,
                            cppunparse.cppunparse(value, expr_semicolon=False),
                        ))
                    else:
                        # Dynamic non-stream access: scalars are assigned by
                        # name, anything else through its array expression
                        var_type, ctypedef = self.codegen._dispatcher.defined_vars.get(
                            memlet.data)
                        if var_type == DefinedType.Scalar:
                            newnode = ast.Name(id="%s = %s;" % (
                                memlet.data,
                                cppunparse.cppunparse(value,
                                                      expr_semicolon=False),
                            ))
                        else:
                            newnode = ast.Name(id="%s = %s;" % (
                                cpp_array_expr(self.sdfg, memlet),
                                cppunparse.cppunparse(value,
                                                      expr_semicolon=False),
                            ))

                    return self._replace_assignment(newnode, node)
            except TypeError:  # cannot determine truth value of Relational
                pass

            # Not rewritten (or the TypeError above was swallowed): fall back
            # to the generic visitor
            return self.generic_visit(node)

        # Subscript target: convert the slice into an index expression
        subscript = self._subscript_expr(node.targets[-1].slice, target)

        if wcr is not None:
            newnode = ast.Name(id=self.codegen.write_and_resolve_expr(
                self.sdfg,
                memlet,
                nc,
                target,
                cppunparse.cppunparse(value, expr_semicolon=False),
                indices=sym2cpp(subscript),
                dtype=dtype) + ';')
        else:
            newnode = ast.Name(
                id="%s[%s] = %s;" %
                (target, sym2cpp(subscript),
                 cppunparse.cppunparse(value, expr_semicolon=False)))

        return self._replace_assignment(newnode, node)
Example #15
0
File: cpp.py Project: orausch/dace
def copy_expr(
    dispatcher,
    sdfg,
    dataname,
    memlet,
    offset=None,
    relative_offset=True,
    packed_types=False,
):
    """ Builds the C++ expression that addresses ``dataname`` for a copy.

        :param dispatcher: Provides ``defined_vars`` to look up how
                           ``dataname`` is defined.
        :param sdfg: The SDFG whose ``arrays`` contain ``dataname``.
        :param dataname: Name of the data container.
        :param memlet: The memlet being copied (``subset`` and ``veclen``
                       are read).
        :param offset: With ``relative_offset``, an extra offset applied on
                       top of the memlet subset; otherwise the absolute
                       position (a Subset, or a list of indices).
        :param relative_offset: Whether ``offset`` is relative to the memlet
                                subset.
        :param packed_types: If True, the offset is computed in units of
                             ``memlet.veclen`` and no vector cast is added.
        :return: The C++ expression as a string.
        :raise TypeError: If a nonzero offset is requested for a scalar,
                          stream, or stream view.
        :raise NotImplementedError: For any other definition type.
    """
    datadesc = sdfg.arrays[dataname]

    # Decide which subset is offsetted, and by which additional offset
    if relative_offset:
        subset, extra_offset = memlet.subset, offset
    elif offset is None:
        subset, extra_offset = None, None
    elif isinstance(offset, subsets.Subset):
        subset, extra_offset = offset, None
    else:
        subset, extra_offset = subsets.Indices(offset), None

    if subset is None:
        offset_cppstr = "0"
    else:
        offset_cppstr = cpp_offset_expr(datadesc, subset, extra_offset,
                                        memlet.veclen if packed_types else 1)

    # Unpacked vector accesses divide the offset and cast the pointer
    cast = ""
    if memlet.veclen != 1 and not packed_types:
        veclen_cpp = sym2cpp(memlet.veclen)
        offset_cppstr = "(%s) / %s" % (offset_cppstr, veclen_cpp)
        cast = "(dace::vec<%s, %s> *)" % (datadesc.dtype.ctype, veclen_cpp)

    def_type = dispatcher.defined_vars.get(dataname)

    add_offset = offset_cppstr != "0"
    offset_suffix = " + {}".format(offset_cppstr) if add_offset else ""

    if def_type == DefinedType.Pointer:
        return "{}{}{}".format(cast, dataname, offset_suffix)

    if def_type == DefinedType.ArrayView:
        return "{}{}.ptr(){}".format(cast, dataname, offset_suffix)

    if def_type == DefinedType.StreamArray:
        return "{}[{}]".format(dataname, offset_cppstr)

    if def_type in [
            DefinedType.Scalar, DefinedType.Stream, DefinedType.StreamView
    ]:
        if add_offset:
            raise TypeError("Tried to offset address of scalar {}: {}".format(
                dataname, offset_cppstr))

        if def_type == DefinedType.Scalar:
            return "{}&{}".format(cast, dataname)
        return dataname

    raise NotImplementedError("copy_expr not implemented "
                              "for connector type: {}".format(def_type))
Example #16
0
    def on_node_end(self, sdfg, state, node, outer_stream, inner_stream,
                    global_stream):
        """ Emits PAPI instrumentation code after a node.

            Does nothing if PAPI is not in use. For instrumented code nodes,
            the critical section is left (unless a surrounding scope already
            measures) and the bytes moved are recorded. For instrumented
            reductions, the bytes moved are recorded and the measurement is
            ended (unless a surrounding scope already measures). All output
            goes to ``inner_stream``.
        """
        if not self._papi_used:
            return

        state_id = sdfg.node_id(state)
        node_id = state.node_id(node)
        unified_id = _unified_id(node_id, state_id)

        if isinstance(node, nodes.CodeNode):
            if node.instrument == dace.InstrumentationType.PAPI_Counters:
                surrounded = PAPIInstrumentation.has_surrounding_perfcounters(
                    node, state)
                if not surrounded:
                    leave_critical = (
                        "__perf_%s.leaveCritical(__perf_vs_%s);" %
                        (node.label, node.label))
                    inner_stream.write(leave_critical, sdfg, state_id, node)

                # Add bytes moved
                tasklet_bytes = PAPIUtils.get_tasklet_byte_accesses(
                    node, state, sdfg, state_id)
                inner_stream.write(
                    "__perf_store.addBytesMoved(%s);" % tasklet_bytes, sdfg,
                    state_id, node)
        elif isinstance(node, nodes.Reduce):
            #############################################################
            # Instrumentation: Post-Reduce (pre-braces)
            byte_moved_measurement = "__perf_store.addBytesMoved(%s);\n"

            # For reductions, we assume Read-Modify-Write for all operations
            # Every reduction statement costs sizeof(input) + sizeof(output).
            # This is wrong with some custom reductions or extending operations
            # (e.g., i32 * i32 => i64)
            # It also is wrong for write-avoiding min/max (min/max that only
            # overwrite the reduced variable when it needs to be changed)

            if node.instrument == dace.InstrumentationType.PAPI_Counters:
                input_memlet = state.in_edges(node)[0].data
                output_memlet = state.out_edges(node)[0].data
                num_reduced_inputs = input_memlet.subset.num_elements()

                reduced_bytes = ("%s * (sizeof(%s) + sizeof(%s))" %
                                 (sym2cpp(num_reduced_inputs),
                                  sdfg.arrays[output_memlet.data].dtype.ctype,
                                  sdfg.arrays[input_memlet.data].dtype.ctype))
                inner_stream.write(byte_moved_measurement % reduced_bytes,
                                   sdfg, state_id, node)

                if not self.has_surrounding_perfcounters(node, state):
                    end_measurement = self.perf_counter_end_measurement_string(
                        unified_id)
                    inner_stream.write(end_measurement, sdfg, state_id, node)
Example #17
0
    def on_node_begin(self, sdfg, state, node, outer_stream, inner_stream,
                      global_stream):
        """ Emits PAPI instrumentation code before a node.

            Does nothing if PAPI is not in use, if the node is not
            instrumented with ``PAPI_Counters``, or if a surrounding scope
            already carries performance counters.

            For tasklets, a counter object and its value set are declared and
            the critical section is entered (all on ``inner_stream``). For
            reductions, a section start carrying the expected data movement
            is written to ``outer_stream`` (preceded by a supersection start
            when the node is not in a parallel context), and the measurement
            start is written to ``inner_stream``.
        """
        if not self._papi_used:
            return

        state_id = sdfg.node_id(state)
        unified_id = _unified_id(state.node_id(node), state_id)

        perf_should_instrument = (
            node.instrument == dace.InstrumentationType.PAPI_Counters and
            not PAPIInstrumentation.has_surrounding_perfcounters(node, state))
        if not perf_should_instrument:
            return

        if isinstance(node, nodes.Tasklet):
            inner_stream.write(
                "dace::perf::%s __perf_%s;\n" %
                (self.perf_counter_string(), node.label),
                sdfg,
                state_id,
                node,
            )
            inner_stream.write(
                'auto& __perf_vs_%s = __perf_store.getNewValueSet(__perf_%s, '
                '    %d, PAPI_thread_id(), 0);\n' %
                (node.label, node.label, unified_id),
                sdfg,
                state_id,
                node,
            )

            inner_stream.write("__perf_%s.enterCritical();\n" % node.label,
                               sdfg, state_id, node)
        elif isinstance(node, nodes.Reduce):
            input_size: str = PAPIUtils.get_memory_input_size(
                node, sdfg, state_id)

            # For measuring the memory bandwidth, we analyze the amount of data
            # moved.
            result = outer_stream
            perf_expected_data_movement_sympy = 1

            input_memlet = state.in_edges(node)[0].data
            output_memlet = state.out_edges(node)[0].data
            # If axes were not defined, use all input dimensions
            input_dims = input_memlet.subset.dims()
            output_dims = output_memlet.subset.data_dims()
            axes = node.axes
            if axes is None:
                axes = tuple(range(input_dims))

            # Expected movement = (output extents) * (reduced input extents).
            # The output extents must come from the output memlet: indexing
            # the input sizes here is only correct when the reduced axes
            # happen to be the trailing ones.
            isize = input_memlet.subset.size()
            osize = output_memlet.subset.size()
            for axis in range(output_dims):
                perf_expected_data_movement_sympy *= osize[axis]
            for axis in axes:
                perf_expected_data_movement_sympy *= isize[axis]

            if not dace.sdfg.is_parallel(state, node):
                # We put a start marker, but only if we are in a serial state
                result.write(
                    self.perf_supersection_start_string(unified_id),
                    sdfg,
                    state_id,
                    node,
                )

            result.write(
                self.perf_section_start_string(
                    unified_id,
                    sym2cpp(sp.simplify(perf_expected_data_movement_sympy)) +
                    (" * (sizeof(%s) + sizeof(%s))" % (
                        sdfg.arrays[output_memlet.data].dtype.ctype,
                        sdfg.arrays[input_memlet.data].dtype.ctype,
                    )),
                    input_size,
                ),
                sdfg,
                state_id,
                node,
            )

            #############################################################
            # Internal part
            result = inner_stream
            result.write(
                self.perf_counter_start_measurement_string(
                    unified_id, '__o%d' % (output_dims - 1)),
                sdfg,
                state_id,
                node,
            )