from numbapro import cuda, vectorize, float32, void
import numpy
import time

@cuda.jit(void(float32, float32[:], float32[:], float32[:]))
def saxpy(a, x, y, out):
    i = cuda.grid(1)  # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < out.size:
        out[i] = a*x[i] + y[i]

@vectorize([float32(float32, float32, float32)], target='gpu')
def vec_saxpy(a, x, y):
    return a*x + y

n = 16*1024*1024
a = numpy.float32(2.0)
x = numpy.arange(n, dtype='float32')
y = numpy.arange(n, dtype='float32')
out = numpy.empty_like(x)

start_time = time.time()
size_block = 1024
size_grid = int((n-1)/size_block + 1)
saxpy[size_grid, size_block](a, x, y, out)
print "Time elapsed: ", time.time() - start_time, "s"
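The @vectorize ufunc defined above is never invoked in the listing. As a minimal sketch of how it could be timed against the explicit kernel, assuming NumbaPro's gpu-target ufuncs accept host NumPy arrays (handling device transfers themselves) and broadcast the scalar coefficient the way ordinary NumPy ufuncs do:

# Sketch (assumption: the gpu-target ufunc accepts host arrays and
# broadcasts the scalar a over the array arguments x and y):
start_time = time.time()
out_vec = vec_saxpy(a, x, y)
print "Time elapsed (vectorize): ", time.time() - start_time, "s"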
# Excerpt: selection step inside the enclosing search function (context elided).
# val = np.linalg.norm(a[I[-k:]])   # index backwards to get k largest
# I = sorter.argselect(a[:, 0], k=k, reverse=True)
I = sorter.argselect(k, a[:, 0])
val = np.linalg.norm(a[:k])  # index to get the k largest
if val > opt_v:
    opt_v = val
    opt_x = np.zeros((p, 1), dtype=float_dtype)
    opt_x[I] = a[:k] / val
return opt_x

@cuda.jit(void(float_type[:, :], int32))
def norm_random_nums(C, d):
    # One thread per column: rescale column i of C to unit L2 norm.
    i = cuda.grid(1)
    if i >= C.shape[1]:
        return
    c = C[:, i]
    sum = float_type(0.0)
    for j in range(d):
        cj = c[j]
        sum += cj * cj
    val = math.sqrt(sum)
    for j in range(d):
        c[j] /= val
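The kernel assigns one thread per column of C, so the launch needs enough blocks to cover all columns. A minimal launch sketch, reusing the excerpt's imports (numpy as np, cuda) and assuming float_type is float32; the sizes d and m are hypothetical:

# Hypothetical problem sizes; float32 assumed for float_type.
d, m = 64, 4096
C = np.random.randn(d, m).astype(np.float32)
dC = cuda.to_device(C)
size_block = 256
size_grid = (m + size_block - 1) // size_block
norm_random_nums[size_grid, size_block](dC, d)
C = dC.copy_to_host()   # each column now has unit L2 norm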
from numbapro import cuda, float32, void
import numpy
import time

@cuda.jit(void(float32[:], float32[:], float32[:]))
def sumarrays(a, b, c):
    i = cuda.grid(1)  # Short for cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    if i < c.size:
        c[i] = a[i] + b[i]

n = 16*1024*1024
a = numpy.arange(n, dtype='float32')
b = a*2

start_time = time.time()
da = cuda.to_device(a)           # copy inputs to the GPU explicitly
db = cuda.to_device(b)
dc = cuda.device_array_like(a)   # allocate the output on the GPU
size_block = 1024
size_grid = int((n-1)/size_block + 1)
sumarrays[size_grid, size_block](da, db, dc)
c = dc.copy_to_host()            # copy the result back (also synchronizes)
print "Time elapsed: ", time.time() - start_time, "s"
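Since sumarrays just adds the inputs elementwise, the device result can be checked against the equivalent NumPy expression on the host. A small sanity check, assuming the arrays above are still in scope:

# Verify the GPU result against the NumPy elementwise sum.
assert numpy.allclose(c, a + b)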