import math

import numba
import rmm  # older RAPIDS releases shipped this as `from librmm_cffi import librmm as rmm`
from numba import cuda
from numba import numpy_support as nps  # `from numba.np import numpy_support` on numba >= 0.49
from numba.cuda.cudadrv.driver import driver


def row_matrix(df):
    """Compute the C (row-major) GPU matrix version of df.

    :param df: a cudf DataFrame whose columns share a single dtype. The data
        are copied to the device as a column-major matrix and rewritten into
        a freshly allocated row-major matrix.

    To be replaced by CUDA ml-prim in an upcoming version.
    """
    cols = [df._cols[k] for k in df._cols]
    ncols = len(cols)
    nrows = len(df)
    dtype = cols[0].dtype

    col_major = df.as_gpu_matrix(order='F')
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order='C')

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    bpg = (nrows + tpb - 1) // tpb

    @cuda.jit
    def kernel(_col_major, _row_major):
        # one thread per row; each thread walks its row column by column
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < _col_major.shape[1]:
            col_idx = _col_offset
            _row_major[tid, col_idx] = _col_major[tid, col_idx]
            _col_offset += 1

    kernel[bpg, tpb](col_major, row_major)

    return row_major
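# A minimal usage sketch for row_matrix above, assuming an old-style cudf
# DataFrame (one that still exposes `_cols` and `as_gpu_matrix`). Column
# names and values are illustrative only.
def _demo_row_matrix():
    import cudf
    import numpy as np

    a = np.arange(4, dtype=np.float32)
    b = a * 10
    df = cudf.DataFrame()
    df['a'] = a
    df['b'] = b

    rm = row_matrix(df)  # C-ordered device array of shape (4, 2)
    assert np.array_equal(rm.copy_to_host(), np.stack([a, b], axis=1))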
def transpose(a, b=None):
    """Compute the transpose of 'a' and store it into 'b', if given,
    and return it. If 'b' is not given, allocate a new array
    and return that.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param a: an `np.ndarray` or a `DeviceNDArrayBase` subclass. If already on
        the device its stream will be used to perform the transpose (and to
        copy `b` to the device if necessary).
    """

    # prefer `a`'s stream if it has one
    stream = getattr(a, 'stream', 0)

    if b is None:
        cols, rows = a.shape
        strides = a.dtype.itemsize * cols, a.dtype.itemsize
        b = cuda.cudadrv.devicearray.DeviceNDArray(
            (rows, cols),
            strides,
            dtype=a.dtype,
            stream=stream)

    dt = nps.from_dtype(a.dtype)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    # we need to factor available threads into x and y axes
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)
    # pad the tile by one column to avoid shared-memory bank conflicts
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dt)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        x = by + tx
        y = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int(b.shape[0] / tile_height + 1), int(b.shape[1] / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads, stream](a, b)

    return b
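# A short usage sketch for transpose: round-trip through the host and check
# against NumPy. The shape is arbitrary.
def _demo_transpose():
    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    d_a = cuda.to_device(a)
    d_b = transpose(d_a)  # allocates the (4, 3) output on `d_a`'s stream
    assert np.array_equal(d_b.copy_to_host(), a.T)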
def row_matrix(df):
    """Compute the C (row-major) GPU matrix version of df.

    This implements the algorithm documented in
    http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/

    :param df: a cudf DataFrame whose columns share a single dtype. The data
        are copied to the device as a column-major matrix and rewritten into
        a freshly allocated row-major matrix.

    Adapted from numba:
    https://github.com/numba/numba/blob/master/numba/cuda/kernels/transpose.py

    To be replaced by CUDA ml-prim in an upcoming version.
    """
    cols = [df._cols[k] for k in df._cols]
    ncol = len(cols)
    nrow = len(df)
    dtype = cols[0].dtype

    a = df.as_gpu_matrix(order='F')
    b = rmm.device_array((nrow, ncol), dtype=dtype, order='C')
    # pick the device-side dtype to match the data; hardcoding float32 here
    # would silently truncate float64 inputs in the shared-memory tile
    dev_dtype = numba.float32 if dtype == 'float32' else numba.float64

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK
    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)
    # pad the tile by one column to avoid shared-memory bank conflicts
    tile_shape = (tile_height, tile_width + 1)

    @cuda.jit
    def kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)

        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty

        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # one block per tile, plus one for remainders
    blocks = int(b.shape[1] / tile_height + 1), int(b.shape[0] / tile_width + 1)
    # one thread per tile element
    threads = tile_height, tile_width
    kernel[blocks, threads](a, b)

    return b
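# Worked example of the tile-size arithmetic used above, assuming the common
# MAX_THREADS_PER_BLOCK value of 1024.
tpb_example = 1024
tile_w = int(math.pow(2, math.log(tpb_example, 2) / 2))  # 2**(10/2) == 32
tile_h = tpb_example // tile_w                           # 1024 // 32 == 32
# tile_shape becomes (32, 33): the extra column staggers the tile rows across
# shared-memory banks, so the transposed read in the kernel avoids bank
# conflicts within a warp.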
def setUp(self):
    self.assertTrue(driver.get_device_count())
    device = driver.get_device()
    ccmajor, _ = device.compute_capability
    if ccmajor >= 2:
        self.ptx = ptx2
    else:
        self.ptx = ptx1
    self.context = device.get_or_create_context()
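# The same compute-capability gate as in setUp, sketched outside the test
# harness via the public numba.cuda API. `ptx1` and `ptx2` stand in for
# precompiled PTX blobs defined elsewhere.
def _select_ptx(ptx1, ptx2):
    ccmajor, _ccminor = cuda.get_current_device().compute_capability
    return ptx2 if ccmajor >= 2 else ptx1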
def zeros(size, dtype, order="F"):
    """
    Return a device array of zeros, filled on the device.
    """
    out = cuda.device_array(size, dtype=dtype, order=order)
    if isinstance(size, tuple):
        # 2D case: launch one thread per row
        tpb = driver.get_device().MAX_THREADS_PER_BLOCK
        nrows = size[0]
        bpg = (nrows + tpb - 1) // tpb
        gpu_zeros_2d[bpg, tpb](out)
    elif size > 0:
        # 1D case: let numba size the launch with forall
        gpu_zeros_1d.forall(size)(out)
    return out
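# Usage sketch for zeros: the tuple argument takes the 2D kernel path, the
# scalar argument the 1D `forall` path. `gpu_zeros_1d` and `gpu_zeros_2d`
# are assumed to be kernels defined alongside this helper.
def _demo_zeros():
    import numpy as np

    z2 = zeros((3, 2), dtype=np.float32)  # 2D, column-major by default
    z1 = zeros(5, dtype=np.float64)       # 1D
    assert not z2.copy_to_host().any()
    assert not z1.copy_to_host().any()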
def gpu_major_converter(original, nrows, ncols, dtype, to_order='C'):
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order=to_order)

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)
    # pad the tile by one column to avoid shared-memory bank conflicts
    tile_shape = (tile_height, tile_width + 1)

    # blocks and threads for the shared memory/tiled algorithm
    # see http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/  # noqa
    blocks = int(row_major.shape[1] / tile_height + 1), \
        int(row_major.shape[0] / tile_width + 1)
    threads = tile_height, tile_width

    # blocks per grid for the general kernel
    bpg = (nrows + tpb - 1) // tpb

    if dtype == 'float32':
        dev_dtype = numba.float32
    else:
        dev_dtype = numba.float64

    @cuda.jit
    def general_kernel(input, output):
        # one thread per row; each thread walks its row column by column
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < input.shape[1]:
            col_idx = _col_offset
            output[tid, col_idx] = input[tid, col_idx]
            _col_offset += 1

    @cuda.jit
    def shared_kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # fall back to the general kernel when the grid would exceed CUDA's
    # grid-dimension limits: 2**31 - 1 blocks in x, 65535 in y
    if blocks[0] > 2147483647 or blocks[1] > 65535:
        general_kernel[bpg, tpb](original, row_major)
    else:
        shared_kernel[blocks, threads](original, row_major)

    return row_major
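# Usage sketch for gpu_major_converter: copy a column-major device array
# into a freshly allocated row-major one. The input values are arbitrary.
def _demo_major_converter():
    import numpy as np

    host = np.asfortranarray(np.arange(6, dtype=np.float32).reshape(2, 3))
    d_f = cuda.to_device(host)  # F-ordered on the device
    d_c = gpu_major_converter(d_f, nrows=2, ncols=3,
                              dtype=host.dtype, to_order='C')
    # same logical matrix, now C-contiguous
    assert np.array_equal(d_c.copy_to_host(), host)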
def row_matrix(df):
    """Compute the C (row-major) GPU matrix version of df.

    :param df: a cudf DataFrame whose columns share a single dtype. The data
        are copied to the device as a column-major matrix and rewritten into
        a freshly allocated row-major matrix.
    """
    cols = [df._cols[k] for k in df._cols]
    ncols = len(cols)
    nrows = len(df)
    dtype = cols[0].dtype

    col_major = df.as_gpu_matrix(order='F')
    row_major = rmm.device_array((nrows, ncols), dtype=dtype, order='C')

    tpb = driver.get_device().MAX_THREADS_PER_BLOCK

    tile_width = int(math.pow(2, math.log(tpb, 2) / 2))
    tile_height = int(tpb / tile_width)
    # pad the tile by one column to avoid shared-memory bank conflicts
    tile_shape = (tile_height, tile_width + 1)

    # blocks and threads for the shared memory/tiled algorithm
    # see http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/  # noqa
    blocks = int(row_major.shape[1] / tile_height + 1), \
        int(row_major.shape[0] / tile_width + 1)
    threads = tile_height, tile_width

    # blocks per grid for the general kernel
    bpg = (nrows + tpb - 1) // tpb

    if dtype == 'float32':
        dev_dtype = numba.float32
    else:
        dev_dtype = numba.float64

    @cuda.jit
    def general_kernel(_col_major, _row_major):
        # one thread per row; each thread walks its row column by column
        tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
        if tid >= nrows:
            return
        _col_offset = 0
        while _col_offset < _col_major.shape[1]:
            col_idx = _col_offset
            _row_major[tid, col_idx] = _col_major[tid, col_idx]
            _col_offset += 1

    @cuda.jit
    def shared_kernel(input, output):
        tile = cuda.shared.array(shape=tile_shape, dtype=dev_dtype)
        tx = cuda.threadIdx.x
        ty = cuda.threadIdx.y
        bx = cuda.blockIdx.x * cuda.blockDim.x
        by = cuda.blockIdx.y * cuda.blockDim.y
        y = by + tx
        x = bx + ty
        if by + ty < input.shape[0] and bx + tx < input.shape[1]:
            tile[ty, tx] = input[by + ty, bx + tx]
        cuda.syncthreads()
        if y < output.shape[0] and x < output.shape[1]:
            output[y, x] = tile[tx, ty]

    # fall back to the general kernel when the grid would exceed CUDA's
    # grid-dimension limits: 2**31 - 1 blocks in x, 65535 in y
    if blocks[0] > 2147483647 or blocks[1] > 65535:
        general_kernel[bpg, tpb](col_major, row_major)
    else:
        shared_kernel[blocks, threads](col_major, row_major)

    return row_major
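# Usage sketch for the hybrid row_matrix directly above: the call shape
# matches the naive version, but dispatch between the tiled and general
# kernels happens internally. As before, this assumes an old-style cudf
# DataFrame with `_cols` and `as_gpu_matrix`.
def _demo_row_matrix_hybrid():
    import cudf
    import numpy as np

    df = cudf.DataFrame()
    df['x'] = np.linspace(0, 1, 8, dtype=np.float64)
    df['y'] = np.linspace(1, 2, 8, dtype=np.float64)

    rm = row_matrix(df)
    assert rm.shape == (8, 2)
    assert rm.copy_to_host().flags['C_CONTIGUOUS']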