Example #1
    def test_slice_as_arg(self):
        global cufoo
        cufoo = cuda.jit("void(int32[:], int32[:])", device=True)(foo)
        cucopy = cuda.jit("void(int32[:,:], int32[:,:])")(copy)

        inp = np.arange(100, dtype=np.int32).reshape(10, 10)
        out = np.zeros_like(inp)

        cucopy[1, 10](inp, out)
Example #2
    def test_kernel(self):

        def foo(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] = float32(i) / val

        fastver = cuda.jit("void(float32[:], float32)", fastmath=True)(foo)
        precver = cuda.jit("void(float32[:], float32)")(foo)

        self.assertIn('div.full.ftz.f32', fastver.ptx)
        self.assertNotIn('div.full.ftz.f32', precver.ptx)
Example #3
    def test_exception(self):
        unsafe_foo = cuda.jit(foo)
        safe_foo = cuda.jit(debug=True)(foo)

        if not config.ENABLE_CUDASIM:
            # Simulator throws exceptions regardless of debug
            # setting
            unsafe_foo[1, 2](numpy.array([0, 1]))

        with self.assertRaises(IndexError) as cm:
            safe_foo[1, 2](numpy.array([0, 1]))
        self.assertIn("tuple index out of range", str(cm.exception))
Example #4
    def test_const_record_align(self):
        A = np.zeros(2, dtype=np.float64)
        B = np.zeros(2, dtype=np.float64)
        C = np.zeros(2, dtype=np.float64)
        D = np.zeros(2, dtype=np.float64)
        E = np.zeros(2, dtype=np.float64)
        jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E)

        if not ENABLE_CUDASIM:
            self.assertIn(
                'ld.const.v4.u8',
                jcuconst.ptx,
                'load the first three bytes as a vector')

            self.assertIn(
                'ld.const.u32',
                jcuconst.ptx,
                'load the uint32 natively')

            self.assertIn(
                'ld.const.u8',
                jcuconst.ptx,
                'load the last byte by itself')

        jcuconst[2, 1](A, B, C, D, E)
        np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a'])
        np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b'])
        np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x'])
        np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y'])
        np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z'])
Example #5
 def test_fill_threadidx(self):
     compiled = cuda.jit("void(int32[:])")(fill_threadidx)
     N = 10
     ary = np.ones(N, dtype=np.int32)
     exp = np.arange(N, dtype=np.int32)
     compiled[1, N](ary)
     self.assertTrue(np.all(ary == exp))
Example #6
 def test_simple_grid1d(self):
     compiled = cuda.jit("void(int32[::1])")(simple_grid1d)
     ntid, nctaid = 3, 7
     nelem = ntid * nctaid
     ary = np.empty(nelem, dtype=np.int32)
     compiled[nctaid, ntid](ary)
     self.assertTrue(np.all(ary == np.arange(nelem)))
Example #7
    def test_device(self):
        # fastmath option is ignored for device function
        @cuda.jit("float32(float32, float32)", device=True)
        def foo(a, b):
            return a / b

        def bar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] = foo(i, val)

        fastver = cuda.jit("void(float32[:], float32)", fastmath=True)(bar)
        precver = cuda.jit("void(float32[:], float32)")(bar)

        self.assertIn('div.full.ftz.f32', fastver.ptx)
        self.assertNotIn('div.full.ftz.f32', precver.ptx)
Example #8
 def test_useless_sync(self):
     compiled = cuda.jit("void(int32[::1])")(useless_sync)
     nelem = 10
     ary = np.empty(nelem, dtype=np.int32)
     exp = np.arange(nelem, dtype=np.int32)
     compiled[1, nelem](ary)
     self.assertTrue(np.all(ary == exp))
Example #9
 def test_const_array(self):
     jcuconst = cuda.jit('void(float64[:])')(cuconst)
     print(jcuconst.ptx)
     self.assertTrue('.const' in jcuconst.ptx)
     A = numpy.empty_like(CONST1D)
     jcuconst[2, 5](A)
     self.assertTrue(numpy.all(A == CONST1D))
Example #10
 def test_local_array(self):
     jculocal = cuda.jit('void(int32[:], int32[:])')(culocal)
     self.assertTrue('.local' in jculocal.ptx)
     A = numpy.arange(100, dtype='int32')
     B = numpy.zeros_like(A)
     jculocal(A, B)
     self.assertTrue(numpy.all(A == B))
Example #11
    def test_atomic_add_double_global_3(self):
        ary = np.random.randint(0, 32, size=32).astype(np.float64).reshape(4, 8)
        orig = ary.copy()
        cuda_func = cuda.jit('void(float64[:,:])')(atomic_add_double_global_3)
        cuda_func[1, (4, 8)](ary)

        np.testing.assert_equal(ary, orig + 1)
Example #12
 def check_atomic_max(self, dtype, lo, hi):
     vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
     res = np.zeros(1, dtype=vals.dtype)
     cuda_func = cuda.jit(atomic_max)
     cuda_func[32, 32](res, vals)
     gold = np.max(vals)
     np.testing.assert_equal(res, gold)
Example #13
 def test_boolean(self):
     func = cuda.jit('void(float64[:], bool_)')(boolean_func)
     A = np.array([0], dtype='float64')
     func(A, True)
     self.assertTrue(A[0] == 123)
     func(A, False)
     self.assertTrue(A[0] == 321)
Example #14
 def test_printfloat(self):
     jprintfloat = cuda.jit('void()', debug=False)(printfloat)
     with captured_cuda_stdout() as stdout:
         jprintfloat()
     # CUDA and the simulator format floats differently
     self.assertIn(stdout.getvalue(), ["0 23 34.750000 321\n",
                                       "0 23 34.75 321\n"])
Example #15
    def test_atomic_add3(self):
        ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
        orig = ary.copy()
        cuda_atomic_add3 = cuda.jit('void(uint32[:,:])')(atomic_add3)
        cuda_atomic_add3[1, (4, 8)](ary)

        self.assertTrue(np.all(ary == orig + 1))
Example #16
    def test_const_record(self):
        A = np.zeros(2, dtype=float)
        B = np.zeros(2, dtype=int)
        jcuconst = cuda.jit(cuconstRec).specialize(A, B)

        if not ENABLE_CUDASIM:
            if not any(c in jcuconst.ptx for c in [
                # a vector load: the compiler fuses the load
                # of the x and y fields into a single instruction!
                'ld.const.v2.u64',

                # for some reason Win64 / Py3 / CUDA 9.1 decides
                # to do two u32 loads, and shifts and ors the
                # values to get the float `x` field, then uses
                # another ld.const.u32 to load the int `y` as
                # a 32-bit value!
                'ld.const.u32',
            ]):
                raise AssertionError(
                    "the compiler should realise it doesn't " \
                    "need to interpret the bytes as float!")

        jcuconst[2, 1](A, B)
        np.testing.assert_allclose(A, CONST_RECORD['x'])
        np.testing.assert_allclose(B, CONST_RECORD['y'])
Example #17
 def test_syncthreads_count(self):
     compiled = cuda.jit("void(int32[:], int32[:])")(use_syncthreads_count)
     ary_in = np.ones(72, dtype=np.int32)
     ary_out = np.zeros(72, dtype=np.int32)
     ary_in[31] = 0
     ary_in[42] = 0
     compiled[1, 72](ary_in, ary_out)
     self.assertTrue(np.all(ary_out == 70))
Example #18
 def test_cuhello(self):
     jcuhello = cuda.jit('void()', debug=False)(cuhello)
     with captured_cuda_stdout() as stdout:
         jcuhello[2, 3]()
     # The output of GPU threads is intermingled, just sanity check it
     out = stdout.getvalue()
     expected = ''.join('%d 999\n' % i for i in range(6))
     self.assertEqual(sorted(out), sorted(expected))
Example #19
 def test_print_array(self):
     """
     Eyeballing required
     """
     jcuprintary = cuda.jit('void(float32[:])')(cuprintary)
     A = np.arange(10, dtype=np.float32)
     jcuprintary[2, 5](A)
     cuda.synchronize()
Example #20
    def test_atomic_max_nan_location(self):
        vals = np.random.randint(0, 128, size=(1,1)).astype(np.float64)
        gold = vals.copy().reshape(1)
        res = np.zeros(1, np.float64) + np.nan
        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(atomic_max)
        cuda_func[1, 1](res, vals)

        np.testing.assert_equal(res, gold)
Example #21
    def test_atomic_max_double_shared(self):
        vals = np.random.randint(0, 32, size=32).astype(np.float64)
        res = np.zeros(1, np.float64)
        cuda_func = cuda.jit('void(float64[:], float64[:])')(atomic_max_double_shared)
        cuda_func[1, 32](res, vals)

        gold = np.max(vals)
        np.testing.assert_equal(res, gold)
Example #22
    def test_atomic_max_nan_val(self):
        res = np.random.randint(0, 128, size=1).astype(np.float64)
        gold = res.copy()
        vals = np.zeros((1, 1), np.float64) + np.nan
        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(atomic_max)
        cuda_func[1, 1](res, vals)

        np.testing.assert_equal(res, gold)
Example #23
    def check_atomic_min(self, dtype, lo, hi):
        vals = np.random.randint(lo, hi, size=(32, 32)).astype(dtype)
        res = np.array([65535], dtype=vals.dtype)
        cuda_func = cuda.jit(atomic_min)
        cuda_func[32, 32](res, vals)

        gold = np.min(vals)
        np.testing.assert_equal(res, gold)
Example #24
 def unary_template(self, func, npfunc, npdtype, npmtype, start, stop):
     nelem = 50
     A = np.linspace(start, stop, nelem).astype(npdtype)
     B = np.empty_like(A)
     arytype = npmtype[::1]
     cfunc = cuda.jit((arytype, arytype))(func)
     cfunc[1, nelem](A, B)
     self.assertTrue(np.allclose(npfunc(A), B))
Example #25
 def test_string(self):
     cufunc = cuda.jit("void()", debug=False)(printstring)
     with captured_cuda_stdout() as stdout:
         cufunc[1, 3]()
     out = stdout.getvalue()
     lines = sorted(out.splitlines(True))
     expected = ["%d hop! 999\n" % i for i in range(3)]
     self.assertEqual(lines, expected)
Example #26
 def test_local_array_complex(self):
     sig = 'void(complex128[:], complex128[:])'
     jculocalcomplex = cuda.jit(sig)(culocalcomplex)
     self.assertTrue('.local' in jculocalcomplex.ptx)
     A = (numpy.arange(100, dtype='complex128') - 1) / 2j
     B = numpy.zeros_like(A)
     jculocalcomplex(A, B)
     self.assertTrue(numpy.all(A == B))
Example #27
 def test_threadfence_codegen(self):
     # Does not test runtime behavior, just the code generation.
     compiled = cuda.jit("void(int32[:])")(use_threadfence)
     ary = np.zeros(10, dtype=np.int32)
     compiled[1, 1](ary)
     self.assertEqual(123 + 321, ary[0])
     if not ENABLE_CUDASIM:
         self.assertIn("membar.gl;", compiled.ptx)
Example #28
    def test_indirect_add2f(self):
        compiled = cuda.jit("void(float32[:])")(indirect_add2f)

        nelem = 10
        ary = np.arange(nelem, dtype=np.float32)
        exp = ary + ary
        compiled[1, nelem](ary)

        self.assertTrue(np.all(ary == exp), (ary, exp))
Example #29
    def test_atomic_max_double_normalizedindex(self):
        vals = np.random.randint(0, 65535, size=(32, 32)).astype(np.float64)
        res = np.zeros(1, np.float64)
        cuda_func = cuda.jit('void(float64[:], float64[:,:])')(
            atomic_max_double_normalizedindex)
        cuda_func[32, 32](res, vals)

        gold = np.max(vals)
        np.testing.assert_equal(res, gold)
Example #30
 def compile(self, sig, locals={}, **targetoptions):
     assert self._compiled is None
     assert not locals
     options = self.targetoptions.copy()
     options.update(targetoptions)
     kernel = jit(sig, **options)(self.py_func)
     self._compiled = kernel
     if hasattr(kernel, "_npm_context_"):
         self._npm_context_ = kernel._npm_context_
Example #31
 def test_ffs_i4_1s(self):
     compiled = cuda.jit("void(int32[:], int32)")(simple_ffs)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary, 0xFFFFFFFF)
     self.assertEqual(ary[0], 0)
Example #32
 def _(kernel):
     kernel = cuda.jit(kernel)
     kernel[self.block_dim, self.grid_dim](*args)
Example #33
 def test_ffs_i4_0s(self):
     compiled = cuda.jit("void(int32[:], int32)")(simple_ffs)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary, 0x0)
     self.assertEqual(ary[0], 32, "CUDA semantics")
Example #34
import numpy as np
import math
from timeit import default_timer as timer
from numba import cuda
from numba import *


def mult(a, b):
    return a * b


mult_gpu = cuda.jit(restype=float32, argtypes=[float32, float32],
                    device=True)(mult)


@cuda.jit(argtypes=[float32[:, :], float32[:, :], float32[:, :]])
def mult_kernel(a, b, c):
    Ni = c.shape[0]
    Nj = c.shape[1]
    Nk = a.shape[1]

    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y

    for i in range(startX, Ni, gridX):
        for j in range(startY, Nj, gridY):
            c[i, j] = 0
            for k in range(Nk):
                c[i, j] = c[i, j] + mult_gpu(a[i, k], b[j, k])
Example #35
 def test_ffs_i8(self):
     compiled = cuda.jit("void(int32[:], int64)")(simple_ffs)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary, 0x000000000010000)
     self.assertEqual(ary[0], 16)
Example #36
 def test_unconfigured_untyped_cudakernel(self):
     kernfunc = cuda.jit(noop)
     self._test_unconfigured(kernfunc)
Example #37
 def c_contigous():
     compiled = cuda.jit("void(int32[:,:,::1])")(fill3d_threadidx)
     ary = np.zeros((X, Y, Z), dtype=np.int32)
     compiled[1, (X, Y, Z)](ary)
     return ary
Example #38
import math

import numpy as np
import numba
from numba import cuda
from cuda_friendly_vincenty import vincenty

wrap = cuda.jit('float32(float32, float32, float32, float32)', device=True)
vincenty = wrap(vincenty)


@cuda.jit('int32(int32)', device=True)
def node_to_level(node):
    return math.floor(math.log(np.float32(node + 1)) / math.log(np.float32(2)))


@cuda.jit('int32(int32, int32)', device=True)
def node_range_start(node, n):
    level = node_to_level(node)
    step = n / (2**level)
    pos = node - 2**level + 1
    return math.floor(pos * step)


@cuda.jit('int32(int32, int32)', device=True)
def node_range_end(node, n):
    level = node_to_level(node)
    step = n / (2**level)
    pos = node - 2**level + 1
    return math.floor((pos + 1) * step)
Example #39
 def test_fma_f8(self):
     compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma)
     ary = np.zeros(1, dtype=np.float64)
     compiled(ary, 2., 3., 4.)
     np.testing.assert_allclose(ary[0], 2 * 3 + 4)
Example #40
 def _compile_kernel(self, fnobj, sig):
     return cuda.jit(sig)(fnobj)
Example #41
import MRT_LB_local as d2q5
from numba import cuda, float64, float32
import numpy as np
import matplotlib.pyplot as plt

getg = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.getg)
getfl = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.getfl)
#getn = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.getn)
calc_T = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_T)
calc_copiafl = cuda.jit('void (f8[::1],f8[::1])', device=True)(d2q5.calc_copiafl)
calc_Hk = cuda.jit('void (f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_Hk)
calc_fl = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_fl)
calc_g2n = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_g2n)

calc_alfe = cuda.jit('void (f8[::1], f8[::1], f8[:])', device=True)(d2q5.calc_alfe)
calc_taut = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_taut)
calc_relax = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.calc_relax)
calc_Ssurce = cuda.jit('void (f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_Ssurce)
n_eq_loc = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.n_eq_loc)
calc_colision = cuda.jit('void (f8[::1], f8[::1], f8[::1], f8[::1])', device=True)(d2q5.calc_colision)
n2g_loc = cuda.jit('void (f8[::1], f8[::1])', device=True)(d2q5.n2g_loc)

setfl = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.setfl)
#setT = cuda.jit('void (f8[:,:], f8[:1], i4, i4)', device=True)(d2q5.setT)
setg = cuda.jit('void (f8[:,:,:], f8[::1], i8, i8)', device=True)(d2q5.setg)
set_prueba = cuda.jit('void (f8[:,:], f8[::1], i8, i8)', device=True)(d2q5.set_prueba)


@cuda.jit('void(f8[:,:,:],f8[:,:,:])')
def propagacion(d_g, copia_g):
    nx, ny, ns = d_g.shape
Example #42
 def get_cfunc(self, pyfunc, argspec):
     return cuda.jit()(pyfunc)
Example #43
 def test_clz_i4(self):
     compiled = cuda.jit("void(int32[:], int32)")(simple_clz)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary, 0x00100000)
     self.assertEqual(ary[0], 11)
Example #44
 def test_global_constant_tuple(self):
     udt = cuda.jit((float32[:, :], ))(udt_global_constant_tuple)
     udt[1, 1](self.getarg2())
Example #45
 def _compile_core(self, sig):
     cudevfn = cuda.jit(sig, device=True, inline=True)(self.pyfunc)
     return cudevfn, cudevfn.overloads[sig.args].signature.return_type
Example #46
 def test_const_align(self):
     jcuconstAlign = cuda.jit('void(float64[:])')(cuconstAlign)
     A = np.full(3, fill_value=np.nan, dtype=float)
     jcuconstAlign[1, 3](A)
     self.assertTrue(np.all(A == (CONST3BYTES + CONST1D[:3])))
Example #47
 def test_brev_u8(self):
     compiled = cuda.jit("void(uint64[:], uint64)")(simple_brev)
     ary = np.zeros(1, dtype=np.uint64)
     compiled(ary, 0x000030F0000030F0)
     self.assertEqual(ary[0], 0x0F0C00000F0C0000)
Example #48
steps = 1000


def mandel(x, y, max_iters, zoom):

    c = complex(x, y) / zoom + complex(-0.743643887037158704752191506114774,
                                       0.131825904205311970493132056385139)
    z = 0.0j
    for i in range(max_iters):
        z = z * z + c
        if (z.real * z.real + z.imag * z.imag) >= 4:
            return i
    return max_iters


mandel_gpu = cuda.jit(device=True)(mandel)


@cuda.jit
def mandel_kernel(min_x, max_x, min_y, max_y, image, iters, steps):
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height

    startX, startY = cuda.grid(2)
    gridX = cuda.gridDim.x * cuda.blockDim.x
    gridY = cuda.gridDim.y * cuda.blockDim.y

    #start = complex(0,0)
Example #49
 def test_popc_u8(self):
     compiled = cuda.jit("void(int32[:], uint64)")(simple_popc)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary, 0xF00000000000)
     self.assertEqual(ary[0], 4)
Example #50
 def test_unconfigured_typed_cudakernel(self):
     kernfunc = cuda.jit("void(int32)")(noop)
     self._test_unconfigured(kernfunc)
Example #51
 def test_const_empty(self):
     jcuconstEmpty = cuda.jit('void(float64[:])')(cuconstEmpty)
     A = np.full(1, fill_value=-1, dtype=int)
     jcuconstEmpty[1, 1](A)
     self.assertTrue(np.all(A == 0))
Example #52
"""Adaptor for composed CUDA math functions
    
This is necessary because creating a device function will not recursively compile nested functions.

See: https://stackoverflow.com/questions/52807489/programmatic-nested-numba-cuda-function-calls
"""
from . import math
from numba import cuda 

r = cuda.jit(math.r, device=True)
r2 = cuda.jit(lambda gn0, gn1: r(gn0, gn1) ** 2, device=True)
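Because `r` and `r2` live at module scope, a kernel defined in the same module can call `r2` directly and Numba compiles the whole chain of device functions on first launch. Below is a minimal usage sketch, assuming `math.r` takes two float scalars and returns a float; the kernel name `r2_kernel`, the array arguments, and the launch configuration are hypothetical additions for illustration, not part of the original adaptor module:

# Hypothetical usage sketch for the adaptors above.
import numpy as np

@cuda.jit
def r2_kernel(gn0, gn1, out):
    # Apply the composed device function r2 elementwise.
    i = cuda.grid(1)
    if i < out.size:
        out[i] = r2(gn0[i], gn1[i])

# Illustrative launch: one block of 32 threads over 32-element arrays.
gn0 = np.arange(32, dtype=np.float64)
gn1 = np.ones(32, dtype=np.float64)
out = np.zeros(32, dtype=np.float64)
r2_kernel[1, 32](gn0, gn1, out)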
Example #53
 def test_simple_gridsize1d(self):
     compiled = cuda.jit("void(int32[::1])")(simple_gridsize1d)
     ntid, nctaid = 3, 7
     ary = np.zeros(1, dtype=np.int32)
     compiled[nctaid, ntid](ary)
     self.assertEqual(ary[0], nctaid * ntid)
Example #54
 def f_contigous():
     compiled = cuda.jit("void(int32[::1,:,:])")(fill3d_threadidx)
     ary = np.asfortranarray(np.zeros((X, Y, Z), dtype=np.int32))
     compiled[1, (X, Y, Z)](ary)
     return ary
Example #55
# Importing external libs
import numpy as np
from numba import cuda, uint8, float32

# Importing app lib
import corrcoef as my

# Note the hardcoded value of 4.
# Four is the number of items per pixel,
# which represent a pixel in the form (r, g, b, alpha).
PIXEL_DATA_LENGTH = 4

# Pre-compile the standard corrcoef function for use as a CUDA device call
cuda_corrcoef = cuda.jit(restype=float32, argtypes=[float32[:], float32[:], float32], device=True)(my.corrcoef) # pylint: disable=bad-whitespace

@cuda.jit(argtypes=[float32[:,:,:], float32[:,:,:], float32[:,:]]) # pylint: disable=bad-whitespace
def correlate_frames_kernel(cuda_img01_frames, cuda_img02_frames, cuda_coefficients):
  '''To calculate the correlation between each frame'''
  coord_x = cuda.blockIdx.x
  coord_y = cuda.blockIdx.y
  coef = 0
  cuda_coefficients[coord_x][coord_y] = cuda_corrcoef(
    cuda_img01_frames[coord_x][coord_y],
    cuda_img02_frames[coord_x][coord_y],
    coef,
  )

@cuda.jit(argtypes=[uint8[:,:,:], float32[:,:,:], uint8]) # pylint: disable=bad-whitespace
def flatten_frame_kernel(cuda_img, cuda_flatten_frame, frame_size):
Example #56
 def test_global_constants(self):
     udt = cuda.jit((float32[:], ))(udt_global_constants)
     udt[1, 1](self.getarg())
Example #57
    dVARdt[ii, jj, :] = 0.

    exchange_BC(VAR)
    exchange_BC(VAR1)
    exchange_BC(VAR2)

    stream = cuda.stream()

    VARd = cuda.to_device(VAR, stream)
    dVARdtd = cuda.to_device(dVARdt, stream)
    VAR1d = cuda.to_device(VAR1, stream)
    VAR2d = cuda.to_device(VAR2, stream)


    kernel_trivial = cuda.jit(cuda_kernel_decorator(kernel_trivial))\
                                (kernel_trivial)
    kernel_improve = cuda.jit(cuda_kernel_decorator(kernel_improve))\
                                (kernel_improve)
    kernel_shared = cuda.jit(cuda_kernel_decorator(kernel_shared))\
                                (kernel_shared)

    dVARdt[ii, jj, :] = 0.
    dVARdtd = cuda.to_device(dVARdt, stream)

    print('trivial')
    tpb = (1, 1, nz)
    bpg = (math.ceil((nx + 2 * nb) / tpb[0]), math.ceil(
        (ny + 2 * nb) / tpb[1]), math.ceil((nz) / tpb[2]))
    print(tpb)
    print(bpg)
    kernel_trivial[bpg, tpb](dVARdtd, VARd)
Example #58
 def test_simple_warpsize(self):
     compiled = cuda.jit("void(int32[:])")(simple_warpsize)
     ary = np.zeros(1, dtype=np.int32)
     compiled(ary)
     self.assertEqual(ary[0], 32, "CUDA semantics")
Example #59
 def test_const_record_empty(self):
     jcuconstRecEmpty = cuda.jit('void(int64[:])')(cuconstRecEmpty)
     A = np.full(1, fill_value=-1, dtype=np.int64)
     jcuconstRecEmpty[1, 1](A)
     self.assertTrue(np.all(A == 0))
Example #60
 def _get_globals(self, sig):
     corefn = cuda.jit(sig, device=True)(self.pyfunc)
     glbls = self.py_func.__globals__.copy()
     glbls.update({'__cuda__': cuda,
                   '__core__': corefn})
     return glbls