def _maybe_transpose(self, d_ary, d_out):
    """Transpose device arrays into row-major format if needed, as cuFFT
    can't handle column-major data.

    Returns ``(used_in, used_out, transpose_out_flag)`` where the flag tells
    the caller whether the result must be transposed back afterwards.
    """
    # A 2-D array stored column-major (Fortran order) needs transposing.
    in_needs_t = len(d_ary.shape) == 2 and d_ary.is_f_contiguous()
    out_needs_t = len(d_out.shape) == 2 and d_out.is_f_contiguous()

    used_in = d_ary
    if in_needs_t:
        # Allocate a row-major scratch array and copy the input into it.
        item = d_ary.dtype.itemsize
        used_in = DeviceNDArray(
            shape=(d_ary.shape[1], d_ary.shape[0]),
            strides=(item, item * d_ary.shape[1]),
            dtype=d_ary.dtype)
        transpose(d_ary, used_in)

    used_out = d_out
    if out_needs_t:
        # Row-major destination sharing d_out's logical shape; the caller
        # transposes the result back using the returned flag.
        item = d_out.dtype.itemsize
        used_out = DeviceNDArray(
            shape=d_out.shape,
            strides=(item * d_out.shape[1], item),
            dtype=d_out.dtype)

    return used_in, used_out, out_needs_t
def test_pyarrow_memalloc(c, dtype):
    """Device memory allocated via pyarrow is readable through a numba view."""
    ctx, nb_ctx = context_choices[c]
    size = 10
    arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx)

    # Wrap the CudaBuffer with a numba device array and read it back.
    darr = DeviceNDArray(arr.shape, arr.strides, arr.dtype,
                         gpu_data=cbuf.to_numba())
    np.testing.assert_equal(darr.copy_to_host(), arr)
def test_numba_context(c, dtype):
    """A CudaBuffer created under an active numba context shares that
    context, and writes made through the numba view are visible to pyarrow."""
    ctx, nb_ctx = context_choices[c]
    size = 10
    with nb_cuda.gpus[0]:
        arr, cbuf = make_random_buffer(size, target='device',
                                       dtype=dtype, ctx=ctx)
        # Both libraries must be operating on the same CUDA context.
        assert cbuf.context.handle == nb_ctx.handle.value

        view = DeviceNDArray(arr.shape, arr.strides, arr.dtype,
                             gpu_data=cbuf.to_numba())
        np.testing.assert_equal(view.copy_to_host(), arr)

        # Mutate through the numba view, then read back through pyarrow.
        view[0] = 99
        cbuf.context.synchronize()
        roundtrip = np.frombuffer(cbuf.copy_to_host(), dtype=dtype)
        assert roundtrip[0] == 99
def test_numba_memalloc(c, dtype):
    """Memory allocated by the numba driver can be wrapped by a CudaBuffer."""
    ctx, nb_ctx = context_choices[c]
    dtype = np.dtype(dtype)
    # Allocate memory using numba context
    # Warning: this will not be reflected in pyarrow context manager
    # (e.g bytes_allocated does not change)
    size = 10
    mem = nb_ctx.memalloc(size * dtype.itemsize)
    darr = DeviceNDArray((size,), (dtype.itemsize,), dtype, gpu_data=mem)

    # Fill the two halves with distinct values and verify each.
    darr[:5] = 99
    darr[5:] = 88
    host = darr.copy_to_host()
    np.testing.assert_equal(host[:5], 99)
    np.testing.assert_equal(host[5:], 88)

    # Wrap the numba-owned memory with a CudaBuffer and compare contents.
    cbuf = cuda.CudaBuffer.from_numba(mem)
    arr2 = np.frombuffer(cbuf.copy_to_host(), dtype=dtype)
    np.testing.assert_equal(arr2, darr.copy_to_host())
def numba_cuda_DeviceNDArray(cbuf):
    """Return numba DeviceNDArray view of a pyarrow.cuda.CudaBuffer. """
    import numpy as np
    from numba.cuda.cudadrv.devicearray import DeviceNDArray

    # Expose the raw buffer as a flat, contiguous array of bytes.
    byte = np.dtype('uint8')
    return DeviceNDArray((cbuf.size,), (byte.itemsize,), byte,
                         gpu_data=cbuf.to_numba())
def test_numba_memalloc(c, dtype):
    """Round-trip numba-allocated device memory through a CudaBuffer."""
    ctx, nb_ctx = context_choices[c]
    dtype = np.dtype(dtype)
    # Allocate memory using numba context
    # Warning: this will not be reflected in pyarrow context manager
    # (e.g bytes_allocated does not change)
    nelem = 10
    raw = nb_ctx.memalloc(nelem * dtype.itemsize)
    device_arr = DeviceNDArray((nelem,), (dtype.itemsize,), dtype,
                               gpu_data=raw)
    # Write recognizable values into each half of the array.
    device_arr[:5] = 99
    device_arr[5:] = 88
    np.testing.assert_equal(device_arr.copy_to_host()[:5], 99)
    np.testing.assert_equal(device_arr.copy_to_host()[5:], 88)
    # wrap numba allocated memory with CudaBuffer
    cbuf = cuda.CudaBuffer.from_numba(raw)
    np.testing.assert_equal(np.frombuffer(cbuf.copy_to_host(), dtype=dtype),
                            device_arr.copy_to_host())
def gpu_view_as(arr, dtype, shape=None, strides=None):
    """Return a device-array view of ``arr`` reinterpreted as ``dtype``.

    When the target dtype already matches, the original shape/strides are
    reused; otherwise the view is flattened to ``arr.size // dtype.itemsize``
    contiguous elements. Explicit ``shape``/``strides`` override either case.
    """
    dtype = np.dtype(dtype)
    same_dtype = arr.dtype == dtype
    if strides is None:
        strides = arr.strides if same_dtype else dtype.itemsize
    if shape is None:
        shape = arr.shape if same_dtype else arr.size // dtype.itemsize
    # Re-wrap the same underlying GPU allocation — no data is copied.
    return DeviceNDArray(shape=shape, strides=strides, dtype=dtype,
                         gpu_data=arr.gpu_data)
def get_column(schema):
    """Extract the float32 column described by ``schema`` from the global
    ``data_region`` and copy its valid portion to the host."""
    offset = schema['data_buffer']['offset']
    raw_size = schema['data_buffer']['length']
    size = schema['length']

    # Only 32-bit floating point columns are handled here.
    assert schema['dtype']['bitwidth'] == 32
    assert schema['dtype']['name'] == 'FloatingPoint'

    raw = data_region[offset:offset + raw_size]
    assert raw.size == raw_size

    float32 = np.dtype(np.float32)
    view = DeviceNDArray(shape=(raw_size // float32.itemsize,),
                         strides=(float32.itemsize,),
                         dtype=float32,
                         gpu_data=raw.gpu_data)
    # The buffer may be padded; only the first `size` entries are valid.
    return view[:size].copy_to_host()
def test_pyarrow_jit(c, dtype):
    """A numba.cuda kernel can operate on memory held by a CudaBuffer."""
    ctx, nb_ctx = context_choices[c]

    @nb_cuda.jit
    def increment_by_one(an_array):
        pos = nb_cuda.grid(1)
        if pos < an_array.size:
            an_array[pos] += 1

    # applying numba.cuda kernel to memory hold by CudaBuffer
    size = 10
    arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx)
    threadsperblock = 32
    # Ceiling division so every element gets a thread.
    blockspergrid = (arr.size + threadsperblock - 1) // threadsperblock
    darr = DeviceNDArray(arr.shape, arr.strides, arr.dtype,
                         gpu_data=cbuf.to_numba())
    increment_by_one[blockspergrid, threadsperblock](darr)
    cbuf.context.synchronize()
    result = np.frombuffer(cbuf.copy_to_host(), dtype=arr.dtype)
    np.testing.assert_equal(result, arr + 1)