Example #1
0
 def replace_param(param):
     param = symbolic.symstr(param)
     for p, pval in param_to_edge.items():
         # TODO: This special replacement condition will be removed
         #       when the code generator is modified to make consistent
         #       scalar/array decisions.
         if (isinstance(nsdfg.arrays[pval.data.data], data.Scalar)
                 or (nsdfg.arrays[pval.data.data].shape[0] == 1
                     and len(nsdfg.arrays[pval.data.data].shape) == 1)):
             param = param.replace(p, pval.data.data)
         else:
             param = param.replace(p, cpp_array_expr(nsdfg, pval.data))
     return param
Example #2
0
 def replace_param(param):
     param = symbolic.symstr(param)
     for p, pval in param_to_edge.items():
         # TODO: Correct w.r.t. connector type
         param = param.replace(p, cpp_array_expr(nsdfg, pval.data))
     return param
Example #3
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        localcode = CodeIOStream()

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        output_memlet = output_edge.data

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype
        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Try to obtain the number of threads in the block, or use the default
        # configuration
        block_threads = devicelevel_block_size(sdfg, state, node)
        if block_threads is not None:
            block_threads = functools.reduce(lambda a, b: a * b, block_threads,
                                             1)

        # Checks
        if block_threads is None:
            raise ValueError('Block-wide GPU reduction must occur within'
                             ' a GPU kernel')
        if issymbolic(block_threads, sdfg.constants):
            raise ValueError('Block size has to be constant for block-wide '
                             'reduction (got %s)' % str(block_threads))
        if (node.axes is not None and len(node.axes) < input_dims):
            raise ValueError(
                'Only full reduction is supported for block-wide reduce,'
                ' please use the pure expansion')
        if (input_data.storage != dtypes.StorageType.Register
                or output_data.storage != dtypes.StorageType.Register):
            raise ValueError(
                'Block-wise reduction only supports GPU register inputs '
                'and outputs')
        if redtype in ExpandReduceCUDABlock._SPECIAL_RTYPES:
            raise ValueError('%s block reduction not supported' % redtype)

        credtype = 'dace::ReductionType::' + str(
            redtype)[str(redtype).find('.') + 1:]
        if redtype == dtypes.ReductionType.Custom:
            redop = '__reduce_%s()' % idstr
        else:
            redop = 'dace::_wcr_fixed<%s, %s>()' % (credtype, output_type)

        # Allocate shared memory for block reduce
        localcode.write("""
        typedef cub::BlockReduce<{type}, {numthreads}> BlockReduce_{id};
        __shared__ typename BlockReduce_{id}::TempStorage temp_storage_{id};
            """.format(id=idstr,
                       type=output_data.dtype.ctype,
                       numthreads=block_threads))

        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = cpp_array_expr(sdfg, output_memlet)
        localcode.write("""
            {output} = BlockReduce_{id}(temp_storage_{id}).Reduce({input}, {redop});
            """.format(id=idstr,
                       redop=redop,
                       input=input_memlet.data,
                       output=output))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Example #4
0
    def expansion(node: 'Reduce', state: SDFGState, sdfg: SDFG):
        node.validate(sdfg, state)
        input_edge: graph.MultiConnectorEdge = state.in_edges(node)[0]
        output_edge: graph.MultiConnectorEdge = state.out_edges(node)[0]
        input_dims = len(input_edge.data.subset)
        output_dims = len(output_edge.data.subset)
        input_data = sdfg.arrays[input_edge.data.data]
        output_data = sdfg.arrays[output_edge.data.data]

        # Setup all locations in which code will be written
        cuda_globalcode = CodeIOStream()
        cuda_initcode = CodeIOStream()
        cuda_exitcode = CodeIOStream()
        host_globalcode = CodeIOStream()
        host_localcode = CodeIOStream()
        output_memlet = output_edge.data

        # Try to autodetect reduction type
        redtype = detect_reduction_type(node.wcr)

        node_id = state.node_id(node)
        state_id = sdfg.node_id(state)
        idstr = '{sdfg}_{state}_{node}'.format(sdfg=sdfg.name,
                                               state=state_id,
                                               node=node_id)

        if node.out_connectors:
            dtype = next(node.out_connectors.values())
        else:
            dtype = sdfg.arrays[output_memlet.data].dtype

        output_type = dtype.ctype

        if node.identity is None:
            raise ValueError('For device reduce nodes, initial value must be '
                             'specified')

        # Create a functor or use an existing one for reduction
        if redtype == dtypes.ReductionType.Custom:
            body, [arg1, arg2] = unparse_cr_split(sdfg, node.wcr)
            cuda_globalcode.write(
                """
        struct __reduce_{id} {{
            template <typename T>
            DACE_HDFI T operator()(const T &{arg1}, const T &{arg2}) const {{
                {contents}
            }}
        }};""".format(id=idstr, arg1=arg1, arg2=arg2, contents=body), sdfg,
                state_id, node_id)
            reduce_op = ', __reduce_' + idstr + '(), ' + symstr(node.identity)
        elif redtype in ExpandReduceCUDADevice._SPECIAL_RTYPES:
            reduce_op = ''
        else:
            credtype = 'dace::ReductionType::' + str(
                redtype)[str(redtype).find('.') + 1:]
            reduce_op = ((', dace::_wcr_fixed<%s, %s>()' %
                          (credtype, output_type)) + ', ' +
                         symstr(node.identity))

        # Obtain some SDFG-related information
        input_memlet = input_edge.data
        reduce_shape = input_memlet.subset.bounding_box_size()
        num_items = ' * '.join(symstr(s) for s in reduce_shape)
        input = (input_memlet.data + ' + ' +
                 cpp_array_expr(sdfg, input_memlet, with_brackets=False))
        output = (output_memlet.data + ' + ' +
                  cpp_array_expr(sdfg, output_memlet, with_brackets=False))

        input_dims = input_memlet.subset.dims()
        output_dims = output_memlet.subset.data_dims()

        reduce_all_axes = (node.axes is None or len(node.axes) == input_dims)
        if reduce_all_axes:
            reduce_last_axes = False
        else:
            reduce_last_axes = sorted(node.axes) == list(
                range(input_dims - len(node.axes), input_dims))

        if (not reduce_all_axes) and (not reduce_last_axes):
            raise NotImplementedError(
                'Multiple axis reductions not supported on GPUs. Please use '
                'the pure expansion or make reduce axes the last in the array.'
            )

        # Verify that data is on the GPU
        if input_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Input of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')
        if output_data.storage not in [
                dtypes.StorageType.GPU_Global, dtypes.StorageType.CPU_Pinned
        ]:
            raise ValueError('Output of GPU reduction must either reside '
                             ' in global GPU memory or pinned CPU memory')

        # Determine reduction type
        kname = (ExpandReduceCUDADevice._SPECIAL_RTYPES[redtype] if redtype
                 in ExpandReduceCUDADevice._SPECIAL_RTYPES else 'Reduce')

        # Create temp memory for this GPU
        cuda_globalcode.write(
            """
            void *__cub_storage_{sdfg}_{state}_{node} = NULL;
            size_t __cub_ssize_{sdfg}_{state}_{node} = 0;
        """.format(sdfg=sdfg.name, state=state_id, node=node_id), sdfg,
            state_id, node)

        if reduce_all_axes:
            reduce_type = 'DeviceReduce'
            reduce_range = num_items
            reduce_range_def = 'size_t num_items'
            reduce_range_use = 'num_items'
            reduce_range_call = num_items
        elif reduce_last_axes:
            num_reduce_axes = len(node.axes)
            not_reduce_axes = reduce_shape[:-num_reduce_axes]
            reduce_axes = reduce_shape[-num_reduce_axes:]

            num_segments = ' * '.join([symstr(s) for s in not_reduce_axes])
            segment_size = ' * '.join([symstr(s) for s in reduce_axes])

            reduce_type = 'DeviceSegmentedReduce'
            iterator = 'dace::stridedIterator({size})'.format(
                size=segment_size)
            reduce_range = '{num}, {it}, {it} + 1'.format(num=num_segments,
                                                          it=iterator)
            reduce_range_def = 'size_t num_segments, size_t segment_size'
            iterator_use = 'dace::stridedIterator(segment_size)'
            reduce_range_use = 'num_segments, {it}, {it} + 1'.format(
                it=iterator_use)
            reduce_range_call = '%s, %s' % (num_segments, segment_size)

        # Call CUB to get the storage size, allocate and free it
        cuda_initcode.write(
            """
            cub::{reduce_type}::{kname}(nullptr, __cub_ssize_{sdfg}_{state}_{node},
                                        ({intype}*)nullptr, ({outtype}*)nullptr, {reduce_range}{redop});
            cudaMalloc(&__cub_storage_{sdfg}_{state}_{node}, __cub_ssize_{sdfg}_{state}_{node});
""".format(sdfg=sdfg.name,
           state=state_id,
           node=node_id,
           reduce_type=reduce_type,
           reduce_range=reduce_range,
           redop=reduce_op,
           intype=input_data.dtype.ctype,
           outtype=output_data.dtype.ctype,
           kname=kname), sdfg, state_id, node)

        cuda_exitcode.write(
            'cudaFree(__cub_storage_{sdfg}_{state}_{node});'.format(
                sdfg=sdfg.name, state=state_id, node=node_id), sdfg, state_id,
            node)

        # Write reduction function definition
        cuda_globalcode.write("""
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream)
{{
cub::{reduce_type}::{kname}(__cub_storage_{id}, __cub_ssize_{id},
                            input, output, {reduce_range_use}{redop}, stream);
}}
        """.format(id=idstr,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype,
                   reduce_type=reduce_type,
                   reduce_range_def=reduce_range_def,
                   reduce_range_use=reduce_range_use,
                   kname=kname,
                   redop=reduce_op))

        # Write reduction function definition in caller file
        host_globalcode.write(
            """
DACE_EXPORTED void __dace_reduce_{id}({intype} *input, {outtype} *output, {reduce_range_def}, cudaStream_t stream);
        """.format(id=idstr,
                   reduce_range_def=reduce_range_def,
                   intype=input_data.dtype.ctype,
                   outtype=output_data.dtype.ctype), sdfg, state_id, node)

        # Call reduction function where necessary
        host_localcode.write(
            '__dace_reduce_{id}({input}, {output}, {reduce_range_call}, __dace_current_stream);'
            .format(id=idstr,
                    input=input,
                    output=output,
                    reduce_range_call=reduce_range_call))

        # Make tasklet
        tnode = dace.nodes.Tasklet('reduce',
                                   {'_in': dace.pointer(input_data.dtype)},
                                   {'_out': dace.pointer(output_data.dtype)},
                                   host_localcode.getvalue(),
                                   language=dace.Language.CPP)

        # Add the rest of the code
        sdfg.append_global_code(host_globalcode.getvalue())
        sdfg.append_global_code(cuda_globalcode.getvalue(), 'cuda')
        sdfg.append_init_code(cuda_initcode.getvalue(), 'cuda')
        sdfg.append_exit_code(cuda_exitcode.getvalue(), 'cuda')

        # Rename outer connectors and add to node
        input_edge._dst_conn = '_in'
        output_edge._src_conn = '_out'
        node.add_in_connector('_in')
        node.add_out_connector('_out')

        return tnode
Example #5
0
    def copy_memory(self, sdfg: dace.SDFG, dfg: StateSubgraphView,
                    state_id: int, src_node: nodes.Node, dst_node: nodes.Node,
                    edge: MultiConnectorEdge, function_stream: CodeIOStream,
                    callsite_stream: CodeIOStream):
        # Obtain source and destination information, handle access<->tasklet
        # If copying from tensor core fragments to/from tasklets, we only need
        # to emit a reference, as the fragment contains the memory.
        src_desc = (src_node.desc(sdfg)
                    if isinstance(src_node, nodes.AccessNode) else None)
        # Tasklet -> Array
        if not src_desc:
            local_name = dfg.memlet_path(edge)[0].src_conn
            callsite_stream.write(
                'auto& %s = %s;' % (local_name, dst_node.data), sdfg, state_id,
                [src_node, dst_node])
            return

        dst_desc = (dst_node.desc(sdfg)
                    if isinstance(dst_node, nodes.AccessNode) else None)
        # Array -> Tasklet
        if not dst_desc:
            local_name = dfg.memlet_path(edge)[-1].dst_conn
            callsite_stream.write(
                'auto& %s = %s;' % (local_name, src_node.data), sdfg, state_id,
                [src_node, dst_node])
            return

        nontc_desc = (dst_desc
                      if 'TensorCore' in src_desc.storage.name else src_desc)
        nontc_node = (dst_node
                      if 'TensorCore' in src_desc.storage.name else src_node)

        # Majority is detected by the strides of the data
        row_major = True if nontc_desc.strides[-1] == 1 else False
        #####################################################################

        # Set non-tensor-core C++ expression based on memlet
        if edge.data.data == nontc_node.data:
            other_expr = cpp_array_expr(sdfg, edge.data)
        elif edge.data.other_subset is not None:
            offset_cppstr = cpp_offset_expr(nontc_desc, edge.data.other_subset)
            other_expr = '%s[%s]' % (nontc_node.data, offset_cppstr)
        else:
            other_expr = '%s[0]' % nontc_node.data
        #####################################################################

        # Emit copy code
        if 'TensorCore' in dst_desc.storage.name:
            # GPU memory to Tensor Cores
            callsite_stream.write(
                'wmma::load_matrix_sync({tc}, &{other}, '
                '{stride});'.format(
                    tc=dst_node.data,
                    other=other_expr,
                    stride=src_desc.strides[0 if row_major else 1]), sdfg,
                state_id, [src_node, dst_node])
        else:
            # Tensor Cores to GPU memory
            callsite_stream.write(
                'wmma::store_matrix_sync(&{other}, {tc}, '
                '{stride}, wmma::mem_{maj}_major);'.format(
                    tc=src_node.data,
                    other=other_expr,
                    maj='row' if row_major else 'col',
                    stride=dst_desc.strides[0 if row_major else 1]), sdfg,
                state_id, [src_node, dst_node])
Example #6
0
    def memlet_view_ctor(self, sdfg, memlet, direction):
        useskip = False
        memlet_params = []

        memlet_name = memlet.data
        if isinstance(sdfg.arrays[memlet.data], data.Scalar):
            raise ValueError("This should never have happened")

        if isinstance(memlet.subset, subsets.Indices):
            # Compute address
            memlet_params.append(cpp_array_expr(sdfg, memlet, False))
            dims = 0

        elif isinstance(memlet.subset, subsets.Range):
            dims = len(memlet.subset.ranges)
            #memlet_params.append("")

            # Dimensions to remove from view (due to having one value)
            indexdims = []
            nonIndexDims = []

            for dim, (rb, re, rs) in enumerate(memlet.subset.ranges):
                if rs != 1:
                    useskip = True
                try:
                    if (re - rb) == 0:
                        indexdims.append(dim)
                    else:
                        nonIndexDims.append(dim)
                except TypeError:  # cannot determine truth value of Relational
                    nonIndexDims.append(dim)

            if len(nonIndexDims) > 1 and len(indexdims) > 0:
                raise NotImplementedError(
                    'subviews of more than one dimension ' + 'not implemented')
            elif len(
                    nonIndexDims) == 1 and len(indexdims) > 0:  # One dimension
                indexdim = nonIndexDims[0]

                # Contiguous dimension
                if indexdim == dims - 1:
                    memlet_params[-1] += ' + %s' % cpp_array_expr(
                        sdfg, memlet, False)
                    memlet_params.append(
                        '0, %s' % (sym2cpp(memlet.subset.ranges[-1][1] -
                                           memlet.subset.ranges[-1][0])))
                else:  # Non-contiguous dimension
                    useskip = True
                    memlet_params[-1] += ' + %s' % cpp_array_expr(
                        sdfg, memlet, False)
                    memlet_range = memlet.subset.ranges[indexdim]

                    memlet_stride = sdfg.arrays[memlet.data].strides[indexdim]
                    memlet_stride = sym2cpp(memlet_stride)

                    memlet_params.append(
                        '0, %s, %s' %
                        (sym2cpp(memlet_range[1] - memlet_range[0]),
                         sym2cpp(memlet_stride)))

                # Subtract index dimensions from array dimensions
                dims -= len(indexdims)

            elif len(indexdims) == 0:
                for (rb, re, rs), s in zip(memlet.subset.ranges,
                                           sdfg.arrays[memlet.data].shape):
                    if useskip:
                        memlet_params.append(
                            '%s, %s, %s' %
                            (cppunparse.pyexpr2cpp(symbolic.symstr(rb)),
                             cppunparse.pyexpr2cpp(symbolic.symstr(s)),
                             cppunparse.pyexpr2cpp(symbolic.symstr(rs))))
                    else:
                        memlet_params.append(
                            '%s, %s' %
                            (cppunparse.pyexpr2cpp(symbolic.symstr(rb)),
                             cppunparse.pyexpr2cpp(symbolic.symstr(s))))
            elif len(nonIndexDims) == 0:  # Scalar view
                if len(memlet_params) > 0:
                    # Compute address
                    memlet_params[-1] += ' + ' + cpp_array_expr(
                        sdfg, memlet, False)
                else:
                    memlet_params.append(cpp_array_expr(sdfg, memlet, False))
                dims = 0

        else:
            raise RuntimeError('Memlet type "%s" not implemented' %
                               memlet.subset)

        if dims == 0:
            return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t> ("%s", %s)' % (
                'In' if direction == "in" else "Out", 'Skip'
                if useskip else '', sdfg.arrays[memlet.data].dtype.ctype,
                symbolic.symstr(
                    memlet.veclen), memlet.data, ', '.join(memlet_params))
        else:
            return 'dace::ArrayViewImmaterial%s%s<%s, %s, int32_t, %s> ("%s", %s)' % (
                'In' if direction == "in" else "Out", 'Skip'
                if useskip else '', sdfg.arrays[memlet.data].dtype.ctype,
                symbolic.symstr(memlet.veclen), ', '.join([
                    str(s) for s in memlet.subset.bounding_box_size()
                ]), memlet.data, ', '.join(memlet_params))