Example 1
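These snippets are fragments of larger modules (cuDF/cuML utilities and numba's test suite) and omit their imports. A hedged reconstruction of what they rely on is sketched below; exact module paths vary across numba and RAPIDS versions, so treat this as an assumption rather than the projects' actual import blocks.

import math

import numba
from numba import cuda
from numba import numpy_support as nps          # newer numba: numba.np.numpy_support
from numba.cuda.cudadrv.driver import driver    # module-level Driver() instance

import rmm                                      # RAPIDS memory manager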
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    :param col_major: an `np.ndarray` or a `DeviceNDArrayBase` subclass.
        If already on the device, its stream will be used to perform the
        transpose (and to copy `row_major` to the device if necessary).

    To be replaced by CUDA ml-prim in upcoming version
    """

    cols = [df._cols[k] for k in df._cols]
    ncols = len(cols)
    nrows = len(df)
    dtype = cols[0].dtype

    col_major = df.as_gpu_matrix(order='F')
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order='C')

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    bpg = (nrows + tpb - 1) // tpb  # enough blocks for one thread per row

    @cuda.jit
    def kernel(_col_major, _row_major):
        # one thread per row; each thread walks across the columns of its row
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < _col_major.shape[1]:
            col_idx = _col_offset
            _row_major[tid, col_idx] = _col_major[tid, col_idx]
            _col_offset += 1

    kernel[bpg, tpb](col_major, row_major)

    return row_major
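A hedged usage sketch for row_matrix; it assumes a CUDA-capable GPU and an older cuDF release in which DataFrame still exposes the `_cols` mapping and `as_gpu_matrix` used above.

import numpy as np
import cudf

df = cudf.DataFrame()
df['x'] = np.arange(4, dtype=np.float32)
df['y'] = np.arange(4, dtype=np.float32) * 2

mat = row_matrix(df)           # (4, 2) row-major device array
print(mat.copy_to_host())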
Example 2
def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).
    """

    # prefer `a`'s stream if it has one, otherwise fall back to the default stream
    stream = getattr(a, 'stream', 0)

    if b is None:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    # numba element type, needed to declare the shared-memory tile
    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axis
    tile_width = int(math.pow(2, math.log(tpb, 2)/2))
    tile_height = int(tpb / tile_width)

    # pad the tile with one extra column to avoid shared-memory bank conflicts
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # the write coordinates are swapped relative to the read, so both the
        # global read and the global write below stay coalesced; the shared
        # tile mediates the actual transposition
        x = by + tx
        y = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]


    # one block per tile, plus one for remainders
    blocks = int(b.shape[0]/tile_height + 1), int(b.shape[1]/tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads, stream](a, b)

    return b
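A minimal usage sketch for transpose, assuming a CUDA-capable GPU and the imports sketched above; the array contents and shape are illustrative.

import numpy as np

a_host = np.arange(12, dtype=np.float32).reshape(3, 4)
a_dev = cuda.to_device(a_host)     # copy the input to the device
b_dev = transpose(a_dev)           # allocates and returns the (4, 3) transpose
assert np.array_equal(b_dev.copy_to_host(), a_host.T)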
Example 3
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to copy
        `b` to the device if necessary).

    Adapted from numba:
    https://github.com/numba/numba/blob/master/numba/cuda/kernels/transpose.py

    To be replaced by CUDA ml-prim in upcoming version
    """

    cols = [df._cols[k] for k in df._cols]
    ncol = len(cols)
    nrow = len(df)
    dtype = cols[0].dtype

    a = df.as_gpu_matrix(order='F')
    b = rmm.device_array((nrow, ncol), dtype=dtype, order='C')
    # numba element type for the shared-memory tile, chosen from the column dtype
    dev_dtype = numba.float32 if dtype == 'float32' else numba.float64

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        # unlike the transpose kernel above, read and write address the same
        # logical element here; only the storage order changes (F -> C)
        y = by + tx
        x = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int((b.shape[1]) / tile_height +
                 1), int((b.shape[0]) / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads](a, b)

    return b
Example 4
    def setUp(self):
        self.assertTrue(driver.get_device_count())
        device = driver.get_device()
        # pick the PTX module that matches the device's compute capability
        ccmajor, _ = device.compute_capability
        if ccmajor >= 2:
            self.ptx = ptx2
        else:
            self.ptx = ptx1
        self.context = device.get_or_create_context()
Example 5
def zeros(size, dtype, order="F"):
    """
    Return a device array of zeros, filled on the device.
    """
    out = cuda.device_array(size, dtype=dtype, order=order)
    if isinstance(size, tuple):
        # 2D case: launch enough blocks for one thread per row
        tpb = driver.get_device().MAX_THREADS_PER_BLOCK
        nrows = size[0]
        bpg = (nrows + tpb - 1) // tpb

        # gpu_zeros_2d is a module-level @cuda.jit kernel defined elsewhere
        gpu_zeros_2d[bpg, tpb](out)

    elif size > 0:
        # 1D case: gpu_zeros_1d is likewise defined elsewhere in the module
        gpu_zeros_1d.forall(size)(out)

    return out
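The two zero-filling kernels are referenced but not shown in this snippet. A minimal sketch of what they might look like, inferred from the launch sites above, is given here; the bodies are an assumption, not the project's actual code.

from numba import cuda


@cuda.jit
def gpu_zeros_1d(out):
    # one thread per element
    i = cuda.grid(1)
    if i < out.shape[0]:
        out[i] = 0


@cuda.jit
def gpu_zeros_2d(out):
    # one thread per row; each thread zeroes every column of its row
    tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if tid < out.shape[0]:
        for j in range(out.shape[1]):
            out[tid, j] = 0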
Example 6
def gpu_major_converter(original, nrows, ncols, dtype, to_order='C'):
    """Copy a device matrix into a new device array with `to_order` memory layout."""
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order=to_order)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    tile_shape = (tile_height, tile_width + 1)

    # blocks and threads for the shared memory/tiled algorithm
    # see http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/ # noqa
    blocks = int((row_major.shape[1]) / tile_height + 1), \
        int((row_major.shape[0]) / tile_width + 1)

    threads = tile_height, tile_width

    # blocks per gpu for the general kernel
    bpg = (nrows + tpb - 1) // tpb

    # numba element type for the shared-memory tile
    if dtype == 'float32':
        dev_dtype = numba.float32
    else:
        dev_dtype = numba.float64

    @cuda.jit
    def general_kernel(input, output):
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < input.shape[1]:
            col_idx = _col_offset
            output[tid, col_idx] = input[tid, col_idx]
            _col_offset += 1

    @cuda.jit
    def shared_kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # fall back to the general kernel when the tiled grid would exceed
    # CUDA's grid-dimension limits (2**31-1 blocks in x, 65535 in y)
    if blocks[0] > 2147483647 or blocks[1] > 65535:
        general_kernel[bpg, tpb](original, row_major)

    else:
        shared_kernel[blocks, threads](original, row_major)

    return row_major
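A hedged usage sketch for gpu_major_converter, assuming a CUDA-capable GPU, rmm installed, and the imports sketched at the top; shape and dtype are illustrative.

import numpy as np

a_host = np.asfortranarray(np.arange(12, dtype=np.float32).reshape(4, 3))
a_dev = cuda.to_device(a_host)     # F-ordered input on the device

out = gpu_major_converter(a_dev, nrows=4, ncols=3,
                          dtype=np.dtype('float32'), to_order='C')
assert np.array_equal(out.copy_to_host(), a_host)   # same values, C-ordered storage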
Example 7
def row_matrix(df):
    """Compute the C (row major) version gpu matrix of df

    :param col_major: an `np.ndarray` or a `DeviceNDArrayBase` subclass.
        If already on the device, its stream will be used to perform the
        transpose (and to copy `row_major` to the device if necessary).

    """

    cols = [df._cols[k] for k in df._cols]
    ncols = len(cols)
    nrows = len(df)
    dtype = cols[0].dtype

    col_major = df.as_gpu_matrix(order='F')
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order='C')

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)

    tile_shape = (tile_height, tile_width + 1)

    # blocks and threads for the shared memory/tiled algorithm
    # see http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/ # noqa
    blocks = int((row_major.shape[1]) / tile_height + 1), int(
        (row_major.shape[0]) / tile_width + 1)  # noqa
    threads = tile_height, tile_width

    # blocks per gpu for the general kernel
    bpg = (nrows + tpb - 1) // tpb

    # numba element type for the shared-memory tile
    if dtype == 'float32':
        dev_dtype = numba.float32
    else:
        dev_dtype = numba.float64

    @cuda.jit
    def general_kernel(_col_major, _row_major):
        # one thread per row; each thread walks across the columns of its row
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < _col_major.shape[1]:
            col_idx = _col_offset
            _row_major[tid, col_idx] = _col_major[tid, col_idx]
            _col_offset += 1

    @cuda.jit
    def shared_kernel(input, output):

        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # fall back to the general kernel when the tiled grid would exceed
    # CUDA's grid-dimension limits (2**31-1 blocks in x, 65535 in y)
    if blocks[0] > 2147483647 or blocks[1] > 65535:
        general_kernel[bpg, tpb](col_major, row_major)

    else:
        shared_kernel[blocks, threads](col_major, row_major)

    return row_major
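All of the tiled kernels above size their blocks the same way. As a quick check of that arithmetic: with the common MAX_THREADS_PER_BLOCK value of 1024 the tile works out to 32 x 32, plus one pad column to avoid shared-memory bank conflicts.

import math

tpb = 1024                                            # typical MAX_THREADS_PER_BLOCK
tile_width = int(math.pow(2, math.log(tpb, 2) / 2))   # 2 ** (10 / 2) = 32
tile_height = int(tpb / tile_width)                   # 1024 / 32 = 32
tile_shape = (tile_height, tile_width + 1)            # (32, 33), padded against bank conflicts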