Example #1
0
def _make_chunk_wise_kernel(func, argnames, extras):
    """
    Make a kernel that does a stride loop over the input chunks.

    Each block is responsible for a chunk in each iteration.
    Several iteration may be needed to handling a large number of chunks.

    The user function *func* will have all threads in the block for its
    computation.

    The resulting kernel can be used with any 1D grid size and 1D block size.
    """

    # Build kernel source
    argnames = list(map(_mangle_user, argnames))
    extras = list(map(_mangle_user, extras))
    source = """
def chunk_wise_kernel(nrows, chunks, {args}):
{body}
"""

    args = ', '.join(argnames)
    body = []

    body.append('blkid = cuda.blockIdx.x')
    body.append('nblkid = cuda.gridDim.x')
    body.append('tid = cuda.threadIdx.x')
    body.append('ntid = cuda.blockDim.x')

    # Stride loop over the block
    body.append('for curblk in range(blkid, chunks.size, nblkid):')
    indent = ' ' * 4

    body.append(indent + 'start = chunks[curblk]')
    body.append(
        indent +
        'stop = chunks[curblk + 1] if curblk + 1 < chunks.size else nrows')

    slicedargs = {}
    for a in argnames:
        if a not in extras:
            slicedargs[a] = "{}[start:stop]".format(a)
        else:
            slicedargs[a] = str(a)
    body.append("{}inner({})".format(
        indent, ', '.join(slicedargs[k] for k in argnames)))

    indented = ['{}{}'.format(' ' * 4, ln) for ln in body]
    # Finalize source
    concrete = source.format(args=args, body='\n'.join(indented))
    # Get bytecode
    glbs = {'inner': cuda.jit(device=True)(func), 'cuda': cuda}
    exec_(concrete, glbs)
    # Compile as CUDA kernel
    kernel = cuda.jit(glbs['chunk_wise_kernel'])
    return kernel
Example #2
0
def _make_row_wise_kernel(func, argnames, extras):
    """
    Make a kernel that does a stride loop over the input rows.

    Each thread is responsible for a row in each iteration.
    Several iteration may be needed to handling a large number of rows.

    The resulting kernel can be used with any 1D grid size and 1D block size.
    """
    # Build kernel source
    argnames = list(map(_mangle_user, argnames))
    extras = list(map(_mangle_user, extras))
    source = """
def row_wise_kernel({args}):
{body}
"""

    args = ", ".join(argnames)
    body = []

    body.append("tid = cuda.grid(1)")
    body.append("ntid = cuda.gridsize(1)")

    for a in argnames:
        if a not in extras:
            start = "tid"
            stop = ""
            stride = "ntid"
            srcidx = "{a} = {a}[{start}:{stop}:{stride}]"
            body.append(
                srcidx.format(a=a, start=start, stop=stop, stride=stride)
            )

    body.append("inner({})".format(args))

    indented = ["{}{}".format(" " * 4, ln) for ln in body]
    # Finalize source
    concrete = source.format(args=args, body="\n".join(indented))
    # Get bytecode
    glbs = {"inner": cuda.jit(device=True)(func), "cuda": cuda}
    exec_(concrete, glbs)
    # Compile as CUDA kernel
    kernel = cuda.jit(glbs["row_wise_kernel"])
    return kernel