def test_slice_as_arg(self): global cufoo cufoo = cuda.jit("void(int32[:], int32[:])", device=True)(foo) cucopy = cuda.jit("void(int32[:,:], int32[:,:])")(copy) inp = np.arange(100, dtype=np.int32).reshape(10, 10) out = np.zeros_like(inp) cucopy[1, 10](inp, out)
def test_kernel(self): def foo(arr, val): i = cuda.grid(1) if i < arr.size: arr[i] = float32(i) / val fastver = cuda.jit("void(float32[:], float32)", fastmath=True)(foo) precver = cuda.jit("void(float32[:], float32)")(foo) self.assertIn('div.full.ftz.f32', fastver.ptx) self.assertNotIn('div.full.ftz.f32', precver.ptx)
def test_exception(self): unsafe_foo = cuda.jit(foo) safe_foo = cuda.jit(debug=True)(foo) if not config.ENABLE_CUDASIM: # Simulator throws exceptions regardless of debug # setting unsafe_foo[1, 2](numpy.array([0, 1])) with self.assertRaises(IndexError) as cm: safe_foo[1, 2](numpy.array([0, 1])) self.assertIn("tuple index out of range", str(cm.exception))
def test_const_record_align(self): A = np.zeros(2, dtype=np.float64) B = np.zeros(2, dtype=np.float64) C = np.zeros(2, dtype=np.float64) D = np.zeros(2, dtype=np.float64) E = np.zeros(2, dtype=np.float64) jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E) if not ENABLE_CUDASIM: self.assertIn( 'ld.const.v4.u8', jcuconst.ptx, 'load the first three bytes as a vector') self.assertIn( 'ld.const.u32', jcuconst.ptx, 'load the uint32 natively') self.assertIn( 'ld.const.u8', jcuconst.ptx, 'load the last byte by itself') jcuconst[2, 1](A, B, C, D, E) np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a']) np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b']) np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x']) np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y']) np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z'])
def test_fill_threadidx(self): compiled = cuda.jit("void(int32[:])")(fill_threadidx) N = 10 ary = np.ones(N, dtype=np.int32) exp = np.arange(N, dtype=np.int32) compiled[1, N](ary) self.assertTrue(np.all(ary == exp))
def test_simple_grid1d(self): compiled = cuda.jit("void(int32[::1])")(simple_grid1d) ntid, nctaid = 3, 7 nelem = ntid * nctaid ary = np.empty(nelem, dtype=np.int32) compiled[nctaid, ntid](ary) self.assertTrue(np.all(ary == np.arange(nelem)))
def test_device(self): # fastmath option is ignored for device function @cuda.jit("float32(float32, float32)", device=True) def foo(a, b): return a / b def bar(arr, val): i = cuda.grid(1) if i < arr.size: arr[i] = foo(i, val) fastver = cuda.jit("void(float32[:], float32)", fastmath=True)(bar) precver = cuda.jit("void(float32[:], float32)")(bar) self.assertIn('div.full.ftz.f32', fastver.ptx) self.assertNotIn('div.full.ftz.f32', precver.ptx)
def test_useless_sync(self): compiled = cuda.jit("void(int32[::1])")(useless_sync) nelem = 10 ary = np.empty(nelem, dtype=np.int32) exp = np.arange(nelem, dtype=np.int32) compiled[1, nelem](ary) self.assertTrue(np.all(ary == exp))
def test_const_array(self): jcuconst = cuda.jit('void(float64[:])')(cuconst) print(jcuconst.ptx) self.assertTrue('.const' in jcuconst.ptx) A = numpy.empty_like(CONST1D) jcuconst[2, 5](A) self.assertTrue(numpy.all(A == CONST1D))
def test_local_array(self): jculocal = cuda.jit('void(int32[:], int32[:])')(culocal) self.assertTrue('.local' in jculocal.ptx) A = numpy.arange(100, dtype='int32') B = numpy.zeros_like(A) jculocal(A, B) self.assertTrue(numpy.all(A == B))
def test_atomic_add_double_global_3(self): ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8) orig = ary.copy() cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3) cuda_func[1, (4, 8)](ary) np.testing.assert_equal(ary, orig + 1)
def check_atomic_max(self, dtype, lo, hi): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) res = np.zeros(1, dtype=vals.dtype) cuda_func = cuda.jit(atomic_max) cuda_func[32, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold)
def test_boolean(self): func = cuda.jit('void(float64[:], bool_)')(boolean_func) A = np.array([0], dtype='float64') func(A, True) self.assertTrue(A[0] == 123) func(A, False) self.assertTrue(A[0] == 321)
def test_printfloat(self): jprintfloat = cuda.jit('void()', debug=False)(printfloat) with captured_cuda_stdout() as stdout: jprintfloat() # CUDA and the simulator use different formats for float formatting self.assertIn(stdout.getvalue(), ["0 23 34.750000 321\n", "0 23 34.75 321\n"])
def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3) cuda_atomic_add3[1, (4, 8)](ary) self.assertTrue(np.all(ary == orig + 1))
def test_const_record(self): A = np.zeros(2, dtype=float) B = np.zeros(2, dtype=int) jcuconst = cuda.jit(cuconstRec).specialize(A, B) if not ENABLE_CUDASIM: if not any(c in jcuconst.ptx for c in [ # a vector load: the compiler fuses the load # of the x and y fields into a single instruction! 'ld.const.v2.u64', # for some reason Win64 / Py3 / CUDA 9.1 decides # to do two u32 loads, and shifts and ors the # values to get the float `x` field, then uses # another ld.const.u32 to load the int `y` as # a 32-bit value! 'ld.const.u32', ]): raise AssertionError( "the compiler should realise it doesn't " \ "need to interpret the bytes as float!") jcuconst[2, 1](A, B) np.testing.assert_allclose(A, CONST_RECORD['x']) np.testing.assert_allclose(B, CONST_RECORD['y'])
def test_syncthreads_count(self): compiled = cuda.jit("void(int32[:], int32[:])")(use_syncthreads_count) ary_in = np.ones(72, dtype=np.int32) ary_out = np.zeros(72, dtype=np.int32) ary_in[31] = 0 ary_in[42] = 0 compiled[1, 72](ary_in, ary_out) self.assertTrue(np.all(ary_out == 70))
def test_cuhello(self): jcuhello = cuda.jit('void()', debug=False)(cuhello) with captured_cuda_stdout() as stdout: jcuhello[2, 3]() # The output of GPU threads is intermingled, just sanity check it out = stdout.getvalue() expected = ''.join('%d 999\n' % i for i in range(6)) self.assertEqual(sorted(out), sorted(expected))
def test_print_array(self): """ Eyeballing required """ jcuprintary = cuda.jit('void(float32[:])')(cuprintary) A = np.arange(10, dtype=np.float32) jcuprintary[2, 5](A) cuda.synchronize()
def test_atomic_max_nan_location(self): vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64) gold = vals.copy().reshape(1) res = np.zeros(1, np.float64) + np.nan cuda_func = cuda.jit('void(float64[:], float64[:,:])')(atomic_max) cuda_func[1, 1](res, vals) np.testing.assert_equal(res, gold)
def test_atomic_max_double_shared(self): vals = np.random.randint(0, 32, size=32).astype(np.float64) res = np.zeros(1, np.float64) cuda_func = cuda.jit('void(float64[:], float64[:])')(atomic_max_double_shared) cuda_func[1, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold)
def test_atomic_max_nan_val(self): res = np.random.randint(0, 128, size=1).astype(np.float64) gold = res.copy() vals = np.zeros((1, 1), np.float64) + np.nan cuda_func = cuda.jit('void(float64[:], float64[:,:])')(atomic_max) cuda_func[1, 1](res, vals) np.testing.assert_equal(res, gold)
def check_atomic_min(self, dtype, lo, hi): vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype) res = np.array([65535], dtype=vals.dtype) cuda_func = cuda.jit(atomic_min) cuda_func[32, 32](res, vals) gold = np.min(vals) np.testing.assert_equal(res, gold)
def unary_template(self, func, npfunc, npdtype, npmtype, start, stop): nelem = 50 A = np.linspace(start, stop, nelem).astype(npdtype) B = np.empty_like(A) arytype = npmtype[::1] cfunc = cuda.jit((arytype, arytype))(func) cfunc[1, nelem](A, B) self.assertTrue(np.allclose(npfunc(A), B))
def test_string(self): cufunc = cuda.jit("void()", debug=False)(printstring) with captured_cuda_stdout() as stdout: cufunc[1, 3]() out = stdout.getvalue() lines = sorted(out.splitlines(True)) expected = ["%d hop! 999\n" % i for i in range(3)] self.assertEqual(lines, expected)
def test_local_array_complex(self): sig = 'void(complex128[:], complex128[:])' jculocalcomplex = cuda.jit(sig)(culocalcomplex) self.assertTrue('.local' in jculocalcomplex.ptx) A = (numpy.arange(100, dtype='complex128') - 1) / 2j B = numpy.zeros_like(A) jculocalcomplex(A, B) self.assertTrue(numpy.all(A == B))
def test_threadfence_codegen(self): # Does not test runtime behavior, just the code generation. compiled = cuda.jit("void(int32[:])")(use_threadfence) ary = np.zeros(10, dtype=np.int32) compiled[1, 1](ary) self.assertEqual(123 + 321, ary[0]) if not ENABLE_CUDASIM: self.assertIn("membar.gl;", compiled.ptx)
def test_indirect_add2f(self): compiled = cuda.jit("void(float32[:])")(indirect_add2f) nelem = 10 ary = np.arange(nelem, dtype=np.float32) exp = ary + ary compiled[1, nelem](ary) self.assertTrue(np.all(ary == exp), (ary, exp))
def test_atomic_max_double_normalizedindex(self): vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64) res = np.zeros(1, np.float64) cuda_func = cuda.jit('void(float64[:], float64[:,:])')( atomic_max_double_normalizedindex) cuda_func[32, 32](res, vals) gold = np.max(vals) np.testing.assert_equal(res, gold)
def compile(self, sig, locals={}, **targetoptions): assert self._compiled is None assert not locals options = self.targetoptions.copy() options.update(targetoptions) kernel = jit(sig, **options)(self.py_func) self._compiled = kernel if hasattr(kernel, "_npm_context_"): self._npm_context_ = kernel._npm_context_
def test_ffs_i4_1s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) compiled(ary, 0xFFFFFFFF) self.assertEquals(ary[0], 0)
def _(kernel): kernel = cuda.jit(kernel) kernel[self.block_dim, self.grid_dim](*args)
def test_ffs_i4_0s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) compiled(ary, 0x0) self.assertEquals(ary[0], 32, "CUDA semantics")
import numpy as np import math from timeit import default_timer as timer from numba import cuda from numba import * def mult(a, b): return a * b mult_gpu = cuda.jit(restype=float32, argtypes=[float32, float32], device=True)(mult) @cuda.jit(argtypes=[float32[:, :], float32[:, :], float32[:, :, :]]) def mult_kernel(a, b, c): Ni = c.shape[0] Nj = c.shape[1] Nk = c.shape[2] startX, startY, startZ = cuda.grid(3) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y gridZ = cuda.gridDim.z * cuda.blockDim.z for i in range(startX, Ni, gridX): for j in range(startY, Nj, gridY): c[i, j] = 0 for k in range(startZ, Nk, gridZ): c[i, j] = c[i, j] + mult_gpu(a[i, k], b[j, k])
def test_ffs_i8(self): compiled = cuda.jit("void(int32[:], int64)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) compiled(ary, 0x000000000010000) self.assertEquals(ary[0], 16)
def test_unconfigured_untyped_cudakernel(self): kernfunc = cuda.jit(noop) self._test_unconfigured(kernfunc)
def c_contigous(): compiled = cuda.jit("void(int32[:,:,::1])")(fill3d_threadidx) ary = np.zeros((X, Y, Z), dtype=np.int32) compiled[1, (X, Y, Z)](ary) return ary
import math import numpy as np import numba from numba import cuda from cuda_friendly_vincenty import vincenty wrap = cuda.jit('float32(float32, float32, float32, float32)', device=True) vincenty = wrap(vincenty) @cuda.jit('int32(int32)', device=True) def node_to_level(node): return math.floor(math.log(np.float32(node + 1)) / math.log(np.float32(2))) @cuda.jit('int32(int32, int32)', device=True) def node_range_start(node, n): level = node_to_level(node) step = n / (2**level) pos = node - 2**level + 1 return math.floor(pos * step) @cuda.jit('int32(int32, int32)', device=True) def node_range_end(node, n): level = node_to_level(node) step = n / (2**level) pos = node - 2**level + 1 return math.floor((pos + 1) * step)
def test_fma_f8(self): compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma) ary = np.zeros(1, dtype=np.float64) compiled(ary, 2., 3., 4.) np.testing.assert_allclose(ary[0], 2 * 3 + 4)
def _compile_kernel(self, fnobj, sig): return cuda.jit(sig)(fnobj)
import MRT_LB_local as d2q5 from numba import cuda, float64, float32 import numpy as np import matplotlib.pyplot as plt getg = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.getg) getfl = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.getfl) #getn = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.getn) calc_T = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_T) calc_copiafl = cuda.jit('void (f8[::1],f8[::1])', device=True)(d2q5.calc_copiafl) calc_Hk = cuda.jit('void (f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_Hk) calc_fl = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_fl) calc_g2n = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_g2n) calc_alfe = cuda.jit('void (f8[::1], f8[::1], f8[:])', device=True)(d2q5.calc_alfe) calc_taut = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_taut) calc_relax = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_relax) calc_Ssurce = cuda.jit('void (f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_Ssurce) n_eq_loc = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.n_eq_loc) calc_colision = cuda.jit('void (f8[::1], f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_colision) n2g_loc = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.n2g_loc) setfl = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.setfl) #setT = cuda.jit('void (f8[:,:], f8[:1], i4, i4)', device=True)(d2q5.setT) setg = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.setg) set_prueba = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.set_prueba) @cuda.jit('void(f8[:,:,:],f8[:,:,:])') def propagacion(d_g, copia_g): nx, ny, ns = d_g.shape
def get_cfunc(self, pyfunc, argspec): return cuda.jit()(pyfunc)
def test_clz_i4(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) compiled(ary, 0x00100000) self.assertEquals(ary[0], 11)
def test_global_constant_tuple(self): udt = cuda.jit((float32[:, :], ))(udt_global_constant_tuple) udt[1, 1](self.getarg2())
def _compile_core(self, sig): cudevfn = cuda.jit(sig, device=True, inline=True)(self.pyfunc) return cudevfn, cudevfn.overloads[sig.args].signature.return_type
def test_const_align(self): jcuconstAlign = cuda.jit('void(float64[:])')(cuconstAlign) A = np.full(3, fill_value=np.nan, dtype=float) jcuconstAlign[1, 3](A) self.assertTrue(np.all(A == (CONST3BYTES + CONST1D[:3])))
def test_brev_u8(self): compiled = cuda.jit("void(uint64[:], uint64)")(simple_brev) ary = np.zeros(1, dtype=np.uint64) compiled(ary, 0x000030F0000030F0) self.assertEquals(ary[0], 0x0F0C00000F0C0000)
steps = 1000 def mandel(x, y, max_iters, zoom): c = complex(x, y) / zoom + complex(-0.743643887037158704752191506114774, 0.131825904205311970493132056385139) z = 0.0j for i in range(max_iters): z = z * z + c if (z.real * z.real + z.imag * z.imag) >= 4: return i return max_iters mandel_gpu = cuda.jit(device=True)(mandel) @cuda.jit def mandel_kernel(min_x, max_x, min_y, max_y, image, iters, steps): height = image.shape[0] width = image.shape[1] pixel_size_x = (max_x - min_x) / width pixel_size_y = (max_y - min_y) / height startX, startY = cuda.grid(2) gridX = cuda.gridDim.x * cuda.blockDim.x gridY = cuda.gridDim.y * cuda.blockDim.y #start = complex(0,0)
def test_popc_u8(self): compiled = cuda.jit("void(int32[:], uint64)")(simple_popc) ary = np.zeros(1, dtype=np.int32) compiled(ary, 0xF00000000000) self.assertEquals(ary[0], 4)
def test_unconfigured_typed_cudakernel(self): kernfunc = cuda.jit("void(int32)")(noop) self._test_unconfigured(kernfunc)
def test_const_empty(self): jcuconstEmpty = cuda.jit('void(float64[:])')(cuconstEmpty) A = np.full(1, fill_value=-1, dtype=int) jcuconstEmpty[1, 1](A) self.assertTrue(np.all(A == 0))
"""Adaptor for composed CUDA math functions This is necessary because creating a device function will not recursively compile nested functions. See: https://stackoverflow.com/questions/52807489/programmatic-nested-numba-cuda-function-calls """ from . import math from numba import cuda r = cuda.jit(math.r, device=True) r2 = cuda.jit(lambda gn0, gn1: r(gn0, gn1) ** 2, device=True)
def test_simple_gridsize1d(self): compiled = cuda.jit("void(int32[::1])")(simple_gridsize1d) ntid, nctaid = 3, 7 ary = np.zeros(1, dtype=np.int32) compiled[nctaid, ntid](ary) self.assertEqual(ary[0], nctaid * ntid)
def f_contigous(): compiled = cuda.jit("void(int32[::1,:,:])")(fill3d_threadidx) ary = np.asfortranarray(np.zeros((X, Y, Z), dtype=np.int32)) compiled[1, (X, Y, Z)](ary) return ary
''' # Importing external libs import numpy as np from numba import cuda, uint8, float32 # Importing app lib import corrcoef as my # Note the hardcoded 4 value. # Four is the number of item per pixel. # They represent a pixel in the form of (r, g, b, alpha) PIXEL_DATA_LENGTH = 4 # pre compile the standard corrcoef method to be use as cuda call cuda_corrcoef = cuda.jit(restype=float32, argtypes=[float32[:], float32[:], float32], device=True)(my.corrcoef) # pylint: disable=bad-whitespace @cuda.jit(argtypes=[float32[:,:,:], float32[:,:,:], float32[:,:]]) # pylint: disable=bad-whitespace def correlate_frames_kernel(cuda_img01_frames, cuda_img02_frames, cuda_coefficients): '''To calculate the correlation between each frame''' coord_x = cuda.blockIdx.x coord_y = cuda.blockIdx.y coef = 0 cuda_coefficients[coord_x][coord_y] = cuda_corrcoef( cuda_img01_frames[coord_x][coord_y], cuda_img02_frames[coord_x][coord_y], coef, ) @cuda.jit(argtypes=[uint8[:,:,:], float32[:,:,:], uint8]) # pylint: disable=bad-whitespace def flatten_frame_kernel(cuda_img, cuda_flatten_frame, frame_size):
def test_global_constants(self): udt = cuda.jit((float32[:], ))(udt_global_constants) udt[1, 1](self.getarg())
dVARdt[ii, jj, :] = 0. exchange_BC(VAR) exchange_BC(VAR1) exchange_BC(VAR2) stream = cuda.stream() VARd = cuda.to_device(VAR, stream) dVARdtd = cuda.to_device(dVARdt, stream) VAR1d = cuda.to_device(VAR1, stream) VAR2d = cuda.to_device(VAR2, stream) kernel_trivial = cuda.jit(cuda_kernel_decorator(kernel_trivial))\ (kernel_trivial) kernel_improve = cuda.jit(cuda_kernel_decorator(kernel_improve))\ (kernel_improve) kernel_shared = cuda.jit(cuda_kernel_decorator(kernel_shared))\ (kernel_shared) dVARdt[ii, jj, :] = 0. dVARdtd = cuda.to_device(dVARdt, stream) print('trivial') tpb = (1, 1, nz) bpg = (math.ceil((nx + 2 * nb) / tpb[0]), math.ceil( (ny + 2 * nb) / tpb[1]), math.ceil((nz) / tpb[2])) print(tpb) print(bpg) kernel_trivial[bpg, tpb](dVARdtd, VARd)
def test_simple_warpsize(self): compiled = cuda.jit("void(int32[:])")(simple_warpsize) ary = np.zeros(1, dtype=np.int32) compiled(ary) self.assertEquals(ary[0], 32, "CUDA semantics")
def test_const_record_empty(self): jcuconstRecEmpty = cuda.jit('void(int64[:])')(cuconstRecEmpty) A = np.full(1, fill_value=-1, dtype=np.int64) jcuconstRecEmpty[1, 1](A) self.assertTrue(np.all(A == 0))
def _get_globals(self, sig): corefn = cuda.jit(sig, device=True)(self.pyfunc) glbls = self.py_func.__globals__.copy() glbls.update({'__cuda__': cuda, '__core__': corefn}) return glbls