from numbapro import cuda, vectorize, float32, void
import numpy
import time

@cuda.jit(void(float32, float32[:], float32[:], float32[:]))
def saxpy(a, x, y, out):
    """CUDA kernel: store ``a * x[i] + y[i]`` into ``out[i]``, one element per thread.

    ``a`` is a float32 scalar; ``x``, ``y`` and ``out`` are 1-D float32 arrays.
    Threads whose global index is at or beyond ``out.size`` write nothing.
    """
    # cuda.grid(1) is short for
    # cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)
    if idx >= out.size:
        return
    out[idx] = a * x[idx] + y[idx]

@vectorize([float32(float32, float32, float32)], target='gpu')
def vec_saxpy(a, x, y):
    """Scalar body of a float32 ufunc computing ``a * x + y`` (GPU target)."""
    scaled = a * x
    return scaled + y

# Problem size: 16 * 1024 * 1024 float32 elements per array.
n = 16 * 1024 ** 2

# Inputs: scalar multiplier plus two identical 0..n-1 ramps; `out` receives
# the kernel result and is left uninitialised until then.
a = numpy.float32(2.0)
x = numpy.arange(n, dtype='float32')
y = x.copy()
out = numpy.empty_like(x)

start_time = time.time()

# Launch configuration: 1024 threads per block and enough blocks to cover
# all n elements (integer ceiling division).
size_block = 1024
size_grid = (n + size_block - 1) // size_block
saxpy[size_grid, size_block](a, x, y, out)
Example #2
        # val = np.linalg.norm(a[I[-k:]]) #index backwards to get k largest

        # I = sorter.argselect(a[:, 0], k=k, reverse=True)
        I = sorter.argselect(k, a[:, 0])

        val = np.linalg.norm(a[:k])  #index to get k largest

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x

@cuda.jit(void(float_type[:, :], int32))
def norm_random_nums(C, d):
    """CUDA kernel: scale the first ``d`` entries of each column of ``C`` to unit L2 norm.

    Each thread takes the column selected by its global 1-D index, sums the
    squares of that column's first ``d`` entries, and divides those entries
    by the square root of the sum, writing through the column view of ``C``.

    Threads whose index is not a valid column index exit without touching
    ``C``.

    NOTE(review): presumably ``d == C.shape[0]`` -- confirm against the caller.
    NOTE(review): a column whose first ``d`` entries are all zero is divided
    by zero; that behavior is left unchanged here.
    """
    i = cuda.grid(1)
    if i >= C.shape[1]:
        # The grid may contain more threads than columns; surplus threads
        # must bail out before indexing C.
        return

    c = C[:, i]
    # Named sq_sum rather than `sum` to avoid shadowing the builtin.
    sq_sum = float_type(0.0)
    for j in range(d):
        cj = c[j]
        sq_sum += cj * cj
    val = math.sqrt(sq_sum)
    for j in range(d):
        c[j] /= val

Example #3
from numbapro import cuda, float32, void
import numpy
import time

@cuda.jit(void(float32[:], float32[:], float32[:]))
def sumarrays(a, b, c):
    """CUDA kernel: store ``a[i] + b[i]`` into ``c[i]``, one element per thread.

    All three arguments are 1-D float32 arrays. Threads whose global index
    is at or beyond ``c.size`` write nothing.
    """
    # cuda.grid(1) is short for
    # cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    idx = cuda.grid(1)
    if idx >= c.size:
        return
    c[idx] = a[idx] + b[idx]

n = 16*1024*1024
a = numpy.arange(n, dtype='float32')
b = a*2

start_time = time.time()

da = cuda.to_device(a)
db = cuda.to_device(b)
dc = cuda.device_array_like(a)

size_block = 1024
size_grid = int((n-1)/size_block + 1)
sumarrays[size_grid, size_block](da, db, dc)

c = dc.copy_to_host()

print "Time elapsed: ", time.time() - start_time, "s"
Example #4
        # val = np.linalg.norm(a[I[-k:]]) #index backwards to get k largest

        # I = sorter.argselect(a[:, 0], k=k, reverse=True)
        I = sorter.argselect(k, a[:, 0])

        val = np.linalg.norm(a[:k]) #index to get k largest

        if val > opt_v:
            opt_v = val
            opt_x = np.zeros((p, 1), dtype=float_dtype)
            opt_x[I] = a[:k] / val

    return opt_x

@cuda.jit(void(float_type[:, :], int32))
def norm_random_nums(C, d):
    """CUDA kernel: scale the first ``d`` entries of each column of ``C`` to unit L2 norm.

    Each thread takes the column selected by its global 1-D index, sums the
    squares of that column's first ``d`` entries, and divides those entries
    by the square root of the sum, writing through the column view of ``C``.

    Threads whose index is not a valid column index exit without touching
    ``C``.

    NOTE(review): presumably ``d == C.shape[0]`` -- confirm against the caller.
    NOTE(review): a column whose first ``d`` entries are all zero is divided
    by zero; that behavior is left unchanged here.
    """
    i = cuda.grid(1)
    if i >= C.shape[1]:
        # The grid may contain more threads than columns; surplus threads
        # must bail out before indexing C.
        return

    c = C[:, i]
    # Named sq_sum rather than `sum` to avoid shadowing the builtin.
    sq_sum = float_type(0.0)
    for j in range(d):
        cj = c[j]
        sq_sum += cj * cj
    val = math.sqrt(sq_sum)
    for j in range(d):
        c[j] /= val