Example #1
0
    def write_and_resolve_expr(self,
                               sdfg,
                               memlet,
                               nc,
                               outname,
                               inname,
                               indices=None,
                               dtype=None):
        """
        Emits a conflict resolution call from a memlet.
        """
        redtype = operations.detect_reduction_type(memlet.wcr)
        if isinstance(indices, str):
            ptr = '%s + %s' % (cpp.cpp_ptr_expr(sdfg, memlet), indices)
        else:
            ptr = cpp.cpp_ptr_expr(sdfg, memlet, indices=indices)

        if isinstance(dtype, dtypes.pointer):
            dtype = dtype.base_type

        # Special call for detected reduction types
        if redtype != dtypes.ReductionType.Custom:
            credtype = "dace::ReductionType::" + str(
                redtype)[str(redtype).find(".") + 1:]
            if isinstance(dtype, dtypes.vector):
                return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                        f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                        f'{ptr}, {inname})')
            return (
                f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
                f'{ptr}, {inname})')

        # General reduction
        raise NotImplementedError('General reductions not yet implemented')
Example #2
0
    def write_and_resolve_expr(self,
                               sdfg,
                               memlet,
                               nc,
                               outname,
                               inname,
                               indices=None,
                               dtype=None):
        """
        Emits a conflict resolution call from a memlet.
        """
        redtype = operations.detect_reduction_type(memlet.wcr, openmp=True)
        defined_type, _ = self._dispatcher.defined_vars.get(memlet.data)
        if isinstance(indices, str):
            ptr = '%s + %s' % (cpp.cpp_ptr_expr(
                sdfg, memlet, defined_type, is_write=True), indices)
        else:
            ptr = cpp.cpp_ptr_expr(sdfg,
                                   memlet,
                                   defined_type,
                                   indices=indices,
                                   is_write=True)

        if isinstance(dtype, dtypes.pointer):
            dtype = dtype.base_type

        # Special call for detected reduction types
        if redtype != dtypes.ReductionType.Custom:
            if redtype == dace.dtypes.ReductionType.Sub:
                # write this as an addition
                credtype = "dace::ReductionType::Sum"
                is_sub = True
            else:
                credtype = "dace::ReductionType::" + str(
                    redtype)[str(redtype).find(".") + 1:]
                is_sub = False
            if isinstance(dtype, dtypes.vector):
                return (f'dace::xilinx_wcr_fixed_vec<{credtype}, '
                        f'{dtype.vtype.ctype}, {dtype.veclen}>::reduce('
                        f'{ptr}, {"-" if is_sub else ""}{inname})')
            return (
                f'dace::xilinx_wcr_fixed<{credtype}, {dtype.ctype}>::reduce('
                f'{ptr}, {"-" if is_sub else ""}{inname})')

        # General reduction
        raise NotImplementedError('General reductions not yet implemented')
Example #3
0
    def vector_reduction_expr(self, edge, dtype, rhs):
        # Check whether it is a known reduction that is possible in SVE
        reduction_type = detect_reduction_type(edge.data.wcr)
        if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
            raise util.NotSupportedError('Unsupported reduction in SVE')

        nc = not is_write_conflicted(self.dfg, edge)
        if not nc or not isinstance(edge.src.out_connectors[edge.src_conn],
                                    (dtypes.pointer, dtypes.vector)):
            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality
            dst_node = self.dfg.memlet_path(edge)[-1].dst
            if (isinstance(dst_node, nodes.AccessNode) and dst_node.desc(
                    self.sdfg).storage == dtypes.StorageType.SVE_Register):
                return

            wcr = self.cpu_codegen.write_and_resolve_expr(self.sdfg,
                                                          edge.data,
                                                          not nc,
                                                          None,
                                                          '@',
                                                          dtype=dtype)
            self.fill(wcr[:wcr.find('@')])
            self.write(util.REDUCTION_TYPE_TO_SVE[reduction_type])
            self.write('(')
            self.write(self.pred_name)
            self.write(', ')
            self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
            self.write(')')
            self.write(wcr[wcr.find('@') + 1:])
            self.write(';')
        else:
            ######################
            # Horizontal non-atomic reduction

            stride = edge.data.get_stride(self.sdfg, self.map)

            # long long fix
            ptr_cast = ''
            src_type = edge.src.out_connectors[edge.src_conn]

            if src_type.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif src_type.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            store_args = '{}, {}'.format(
                self.pred_name,
                ptr_cast +
                cpp_ptr_expr(self.sdfg, edge.data, DefinedType.Pointer),
            )

            red_type = util.REDUCTION_TYPE_TO_SVE[reduction_type][:-1] + '_x'
            if stride == 1:
                self.write(
                    f'svst1({store_args}, {red_type}({self.pred_name}, svld1({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
            else:
                store_args = f'{store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)})'
                self.write(
                    f'svst1_scatter_index({store_args}, {red_type}({self.pred_name}, svld1_gather_index({store_args}), '
                )
                self.dispatch_expect(rhs, dtypes.vector(dtype, -1))
                self.write('));')
Example #4
0
    def write_back(self, sdfg: SDFG, dfg: state.StateSubgraphView,
                   state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                   edge: graph.MultiConnectorEdge,
                   function_stream: CodeIOStream,
                   callsite_stream: CodeIOStream):
        scope = util.get_sve_scope(sdfg, dfg, src_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        out_conn = src_node.out_connectors[edge.src_conn]
        if out_conn.type not in util.TYPE_TO_SVE:
            raise NotImplementedError(
                f'Data type {out_conn.type} not supported')

        if edge.data.wcr is None:
            # No WCR required

            if isinstance(dst_node, dace.nodes.Tasklet):
                # Writeback into a tasklet is just writing into the shared register
                callsite_stream.write(f'{edge.data.data} = {edge.src_conn};')
                return

            if isinstance(out_conn, dtypes.vector):
                # If no WCR, we can directly store the vector (SVE register) in memory
                # Determine the stride of the store and use a scatter load if applicable

                stride = self.get_load_stride(sdfg, dfg, src_node, edge.data)

                ptr_cast = ''
                if out_conn.type == np.int64:
                    ptr_cast = '(int64_t*) '
                elif out_conn.type == np.uint64:
                    ptr_cast = '(uint64_t*) '

                store_args = '{}, {}'.format(
                    util.get_loop_predicate(sdfg, dfg, src_node),
                    ptr_cast +
                    cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer),
                )

                if stride == 1:
                    callsite_stream.write(
                        f'svst1({store_args}, {edge.src_conn});')
                else:
                    callsite_stream.write(
                        f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(out_conn).bytes * 8}(0, {sym2cpp(stride)}), {edge.src_conn});'
                    )
            else:
                raise NotImplementedError('Writeback into non-vector')
        else:
            # TODO: Check what are we WCR'ing in?

            # Since we have WCR, we must determine a suitable SVE reduce instruction
            # Check whether it is a known reduction that is possible in SVE
            reduction_type = detect_reduction_type(edge.data.wcr)
            if reduction_type not in util.REDUCTION_TYPE_TO_SVE:
                raise util.NotSupportedError('Unsupported reduction in SVE')

            # If the memlet contains the innermost SVE param, we have a problem, because
            # SVE doesn't support WCR stores. This would require unrolling the loop.
            if scope.params[-1] in edge.data.free_symbols:
                raise util.NotSupportedError(
                    'SVE loop param used in WCR memlet')

            # WCR on vectors works in two steps:
            # 1. Reduce the SVE register using SVE instructions into a scalar
            # 2. WCR the scalar to memory using DaCe functionality

            sve_reduction = '{}({}, {})'.format(
                util.REDUCTION_TYPE_TO_SVE[reduction_type],
                util.get_loop_predicate(sdfg, dfg, src_node), edge.src_conn)

            ptr_cast = ''
            if out_conn.type == np.int64:
                ptr_cast = '(long long*) '
            elif out_conn.type == np.uint64:
                ptr_cast = '(unsigned long long*) '

            wcr_expr = self.cpu_codegen.write_and_resolve_expr(
                sdfg,
                edge.data,
                edge.data.wcr_nonatomic,
                None,
                ptr_cast + sve_reduction,
                dtype=out_conn.vtype)

            callsite_stream.write(wcr_expr + ';')
Example #5
0
    def copy_memory(self, sdfg: SDFG, dfg: SDFGState, state_id: int,
                    src_node: nodes.Node, dst_node: nodes.Node,
                    edge: gr.MultiConnectorEdge[mm.Memlet],
                    function_stream: CodeIOStream,
                    callsite_stream: CodeIOStream):
        # We should always be in an SVE scope
        scope = util.get_sve_scope(sdfg, dfg, dst_node)
        if scope is None:
            raise NotImplementedError('Not in an SVE scope')

        in_conn = dst_node.in_connectors[edge.dst_conn]

        if isinstance(src_node, dace.nodes.Tasklet):
            # Copy from tasklet is just copying the shared register
            # Use defined_vars to get the C++ type of the shared register
            callsite_stream.write(
                f'{self.dispatcher.defined_vars.get(edge.data.data)[1]} {edge.dst_conn} = {edge.data.data};'
            )
            return

        if not isinstance(src_node, dace.nodes.AccessNode):
            raise util.NotSupportedError(
                'Copy neither from Tasklet nor AccessNode')

        src_desc = src_node.desc(sdfg)

        if isinstance(src_desc, dace.data.Stream):
            # A copy from a stream will trigger a vector pop
            raise NotImplementedError()

            # FIXME: Issue when we can pop different amounts of data!
            # If we limit to the smallest amount, certain data will be lost (never processed)
            """
            # SVE register where the stream will be popped to
            self.create_empty_definition(in_conn, edge, callsite_stream, output=True)

            var_name = edge.dst_conn

            callsite_stream.write(
                f'{util.TYPE_TO_SVE[in_conn.type]} {var_name};')

            callsite_stream.write('{')
            callsite_stream.write('// Stream pop')

            # Pop into local buffer
            # 256 // in_conn.vtype.bytes
            n_vec = f'{util.REGISTER_BYTE_SIZE} / {in_conn.vtype.bytes}'
            callsite_stream.write(f'{in_conn.vtype.ctype} __tmp[{n_vec}];')
            callsite_stream.write(
                f'size_t __cnt = {edge.data.data}.pop_try(__tmp, {n_vec});')

            # Limit the loop predicate
            loop_pred = util.get_loop_predicate(sdfg, dfg, dst_node)
            callsite_stream.write(
                f'{loop_pred} = svand_z({loop_pred}, {loop_pred}, svwhilelt_b{in_conn.vtype.bytes * 8}(0ll, __cnt));')

            # Transfer to register
            callsite_stream.write(f'{var_name} = svld1({loop_pred}, __tmp);')

            callsite_stream.write('}')
            """
            return

        if isinstance(in_conn, dtypes.vector):
            # Copy from vector, so we can use svld

            if in_conn.type not in util.TYPE_TO_SVE:
                raise NotImplementedError(
                    f'Data type {in_conn.type} not supported')

            self.dispatcher.defined_vars.add(edge.dst_conn, dtypes.vector,
                                             in_conn.ctype)

            # Determine the stride of the load and use a gather if applicable
            stride = self.get_load_stride(sdfg, dfg, dst_node, edge.data)

            # First part of the declaration is `type name`
            load_lhs = '{} {}'.format(util.TYPE_TO_SVE[in_conn.type],
                                      edge.dst_conn)

            ptr_cast = ''
            if in_conn.type == np.int64:
                ptr_cast = '(int64_t*) '
            elif in_conn.type == np.uint64:
                ptr_cast = '(uint64_t*) '

            # Regular load and gather share the first arguments
            load_args = '{}, {}'.format(
                util.get_loop_predicate(sdfg, dfg, dst_node), ptr_cast +
                cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer))

            if stride == 1:
                callsite_stream.write('{} = svld1({});'.format(
                    load_lhs, load_args))
            else:
                callsite_stream.write(
                    '{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                        load_lhs, load_args,
                        util.get_base_type(in_conn).bytes * 8, sym2cpp(stride)))
        else:
            # Any other copy (e.g. pointer or scalar) is handled by the default CPU codegen
            self.cpu_codegen.copy_memory(sdfg, dfg, state_id, src_node,
                                         dst_node, edge, function_stream,
                                         callsite_stream)
Example #6
0
    def generate_writeback(self, sdfg: SDFG, state: SDFGState, map: nodes.Map,
                           edge: graph.MultiConnectorEdge[mm.Memlet], code: CodeIOStream):
        """
            Responsible for generating code for a writeback in a Tasklet, given the outgoing edge.
            This is mainly taking the temporary register and writing it back.
        """
        if edge.src_conn is None:
            return

        dst_node = state.memlet_path(edge)[-1].dst

        src_type = edge.src.out_connectors[edge.src_conn]
        src_name = edge.src_conn

        if isinstance(dst_node, nodes.Tasklet):
            ##################
            # Code->Code edges
            dst_type = edge.dst.in_connectors[edge.dst_conn]

            if (util.is_vector(src_type) and util.is_vector(dst_type)) or (util.is_scalar(src_type)
                                                                           and util.is_scalar(dst_type)):
                # Simply write back to shared register
                code.write(f'{edge.data.data} = {src_name};')
            elif util.is_scalar(src_type) and util.is_vector(dst_type):
                # Scalar broadcast to shared vector register
                code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({src_name});')
            else:
                raise util.NotSupportedError('Unsupported Code->Code edge')
        elif isinstance(dst_node, nodes.AccessNode):
            ##################
            # Write to AccessNode
            desc = dst_node.desc(sdfg)
            if isinstance(desc, data.Array):
                ##################
                # Write into Array
                if util.is_pointer(src_type):
                    raise util.NotSupportedError('Unsupported writeback')
                elif util.is_vector(src_type):
                    ##################
                    # Scatter vector store into array

                    stride = edge.data.get_stride(sdfg, map)

                    # long long fix
                    ptr_cast = ''
                    if src_type.type == np.int64:
                        ptr_cast = '(int64_t*) '
                    elif src_type.type == np.uint64:
                        ptr_cast = '(uint64_t*) '

                    store_args = '{}, {}'.format(
                        util.get_loop_predicate(sdfg, state, edge.src),
                        ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame),
                    )

                    if stride == 1:
                        code.write(f'svst1({store_args}, {src_name});')
                    else:
                        code.write(
                            f'svst1_scatter_index({store_args}, svindex_s{util.get_base_type(src_type).bytes * 8}(0, {sym2cpp(stride)}), {src_name});'
                        )
                else:
                    ##################
                    # Scalar write into array
                    code.write(f'{cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)} = {src_name};')
            elif isinstance(desc, data.Scalar):
                ##################
                # Write into Scalar
                if util.is_pointer(src_type):
                    raise util.NotSupportedError('Unsupported writeback')
                elif util.is_vector(src_type):
                    if util.is_vector(desc.dtype):
                        ##################
                        # Vector write into vector Scalar access node
                        code.write(f'{edge.data.data} = {src_name};')
                    else:
                        raise util.NotSupportedError('Unsupported writeback')
                else:
                    if util.is_vector(desc.dtype):
                        ##################
                        # Broadcast into scalar AccessNode
                        code.write(f'{edge.data.data} = svdup_{util.TYPE_TO_SVE_SUFFIX[src_type]}({src_name});')
                    else:
                        ##################
                        # Scalar write into scalar AccessNode
                        code.write(f'{edge.data.data} = {src_name};')

        else:
            raise util.NotSupportedError('Only writeback to Tasklets and AccessNodes is supported')
Example #7
0
    def generate_read(self, sdfg: SDFG, state: SDFGState, map: nodes.Map, edge: graph.MultiConnectorEdge[mm.Memlet],
                      code: CodeIOStream):
        """
            Responsible for generating code for reads into a Tasklet, given the ingoing edge.
        """
        if edge.dst_conn is None:
            return
        src_node = state.memlet_path(edge)[0].src
        dst_type = edge.dst.in_connectors[edge.dst_conn]
        dst_name = edge.dst_conn
        if isinstance(src_node, nodes.Tasklet):
            ##################
            # Code->Code edges
            src_type = edge.src.out_connectors[edge.src_conn]
            if util.is_vector(src_type) and util.is_vector(dst_type):
                # Directly read from shared vector register
                code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};')
            elif util.is_scalar(src_type) and util.is_scalar(dst_type):
                # Directly read from shared scalar register
                code.write(f'{dst_type} {dst_name} = {edge.data.data};')
            elif util.is_scalar(src_type) and util.is_vector(dst_type):
                # Scalar broadcast from shared scalar register
                code.write(
                    f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});'
                )
            else:
                raise util.NotSupportedError('Unsupported Code->Code edge')
        elif isinstance(src_node, nodes.AccessNode):
            ##################
            # Read from AccessNode
            desc = src_node.desc(sdfg)
            if isinstance(desc, data.Array):
                # Copy from array
                if util.is_pointer(dst_type):
                    ##################
                    # Pointer reference
                    code.write(
                        f'{dst_type} {dst_name} = {cpp.cpp_ptr_expr(sdfg, edge.data, None, codegen=self.frame)};')
                elif util.is_vector(dst_type):
                    ##################
                    # Vector load

                    stride = edge.data.get_stride(sdfg, map)

                    # First part of the declaration is `type name`
                    load_lhs = '{} {}'.format(util.TYPE_TO_SVE[dst_type.type], dst_name)

                    # long long issue casting
                    ptr_cast = ''
                    if dst_type.type == np.int64:
                        ptr_cast = '(int64_t*) '
                    elif dst_type.type == np.uint64:
                        ptr_cast = '(uint64_t*) '

                    # Regular load and gather share the first arguments
                    load_args = '{}, {}'.format(
                        util.get_loop_predicate(sdfg, state, edge.dst),
                        ptr_cast + cpp.cpp_ptr_expr(sdfg, edge.data, DefinedType.Pointer, codegen=self.frame))

                    if stride == 1:
                        code.write('{} = svld1({});'.format(load_lhs, load_args))
                    else:
                        code.write('{} = svld1_gather_index({}, svindex_s{}(0, {}));'.format(
                            load_lhs, load_args,
                            util.get_base_type(dst_type).bytes * 8, sym2cpp(stride)))
                else:
                    ##################
                    # Scalar read from array
                    code.write(f'{dst_type} {dst_name} = {cpp.cpp_array_expr(sdfg, edge.data, codegen=self.frame)};')
            elif isinstance(desc, data.Scalar):
                # Refer to shared variable
                src_type = desc.dtype
                if util.is_vector(src_type) and util.is_vector(dst_type):
                    # Directly read from shared vector register
                    code.write(f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = {edge.data.data};')
                elif util.is_scalar(src_type) and util.is_scalar(dst_type):
                    # Directly read from shared scalar register
                    code.write(f'{dst_type} {dst_name} = {edge.data.data};')
                elif util.is_scalar(src_type) and util.is_vector(dst_type):
                    # Scalar broadcast from shared scalar register
                    code.write(
                        f'{util.TYPE_TO_SVE[dst_type.type]} {dst_name} = svdup_{util.TYPE_TO_SVE_SUFFIX[dst_type.type]}({edge.data.data});'
                    )
                else:
                    raise util.NotSupportedError('Unsupported Scalar->Code edge')
        else:
            raise util.NotSupportedError('Only copy from Tasklets and AccessNodes is supported')