def test_copy(self):
    """Check that strided GPU slices copy to the host like NumPy slices.

    Random arrays of rank 2, 3 (plain and axis-permuted) and 4 are sliced
    with several ``start:stop:step`` triples on the device; each result is
    compared with the same slice taken from the host copy.
    """
    from pycuda.curandom import rand as curand

    slicings = [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]

    a_gpu = curand((3, 3))
    for start, stop, step in slicings:
        assert np.allclose(a_gpu[start:stop:step].get(),
                           a_gpu.get()[start:stop:step])

    a_gpu = curand((3, 1))
    for start, stop, step in slicings:
        assert np.allclose(a_gpu[start:stop:step].get(),
                           a_gpu.get()[start:stop:step])

    a_gpu = curand((3, 3, 3))
    for start, stop, step in slicings:
        assert np.allclose(
            a_gpu[start:stop:step, start:stop:step].get(),
            a_gpu.get()[start:stop:step, start:stop:step])

    a_gpu = curand((3, 3, 3)).transpose((1, 2, 0))
    for start, stop, step in slicings:
        assert np.allclose(
            a_gpu[start:stop:step, :, start:stop:step].get(),
            a_gpu.get()[start:stop:step, :, start:stop:step])

    # 4-d should work as long as only 2 axes are discontiguous
    a_gpu = curand((3, 3, 3, 3))
    for start, stop, step in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]:
        assert np.allclose(
            a_gpu[start:stop:step, :, start:stop:step].get(),
            a_gpu.get()[start:stop:step, :, start:stop:step])
def test_dot(self):
    """Compare ``gpuarray.dot`` with ``numpy.dot`` on 200000-element vectors.

    The relative difference between both results must stay below 1e-4.
    """
    from pycuda.curandom import rand as curand

    length = 200000

    x_dev = curand((length,))
    x_host = x_dev.get()

    y_dev = curand((length,))
    y_host = y_dev.get()

    expected = numpy.dot(x_host, y_host)
    actual = gpuarray.dot(x_dev, y_dev).get()

    relative_error = abs(actual - expected) / abs(expected)
    assert relative_error < 1e-4
def test_insert_columns(self):
    """Run 20 randomized checks of ``insert_columns``.

    Each round draws a dtype, a target matrix and a narrower block with the
    same number of rows, calls ``insert_columns(block, target, offset)`` and
    asserts that the target's columns ``offset:offset + width`` equal the
    block.
    """
    rounds = 20
    for _ in range(rounds):
        dtype = random.choice((np.float32, np.float64))
        n_rows = np.random.randint(100, 1000)
        n_cols = np.random.randint(100, 1000)
        width = np.random.randint(1, n_cols)
        offset = np.random.randint(0, n_cols - width)

        target = curand((n_rows, n_cols), dtype)
        block = curand((n_rows, width), dtype)
        insert_columns(block, target, offset)

        written = target.get()[:, offset:offset + width]
        self.assertTrue(np.all(written == block.get()))
def main():
    """Benchmark the PyCUDA dot-product reduction kernel and print a table.

    For float32 and float64 outputs and vector lengths 2**15 .. 2**26, the
    kernel from ``get_dot_kernel`` is run 3 times as warm-up and 10 times
    under CUDA-event timing.  One table row per configuration reports the
    output type, the input size, the mean time per call and the resulting
    memory bandwidth.
    """
    from pytools import Table
    from pycuda.curandom import rand as curand
    from pycuda.reduction import get_dot_kernel

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            a_gpu = curand((sz,))
            b_gpu = curand((sz,))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            # One-element list so the nested timer can accumulate into it.
            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    # Block until the kernel finished so the interval is valid.
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for _ in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10
            for _ in range(cnt):
                krnl(a_gpu, b_gpu, kernel_wrapper=wrap_with_timer)

            # Two input vectors are read on every timed call.
            bytes_moved = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((str(dtype_out),
                         a_gpu.nbytes / (1 << 20),
                         elapsed[0] / cnt,
                         bytes_moved / secs / 1e9))

    print(tbl)
def test_dot(self):
    """Compare ``gpuarray.dot`` with ``np.dot`` for a range of vector lengths.

    The lengths straddle several powers of two; for each one the relative
    difference between the device and host result must stay below 1e-4.
    """
    from pycuda.curandom import rand as curand

    lengths = [2, 3, 4, 5, 6, 7, 31, 32, 33, 127, 128, 129,
               255, 256, 257, 16384 - 993, 20000]

    for length in lengths:
        x_dev = curand((length,))
        x_host = x_dev.get()
        y_dev = curand((length,))
        y_host = y_dev.get()

        expected = np.dot(x_host, y_host)
        actual = gpuarray.dot(x_dev, y_dev).get()

        relative_error = abs(actual - expected) / abs(expected)
        assert relative_error < 1e-4
def test_elwise_kernel(self):
    """Check a custom ``ElementwiseKernel`` against GPUArray arithmetic.

    The kernel computes ``a*x + b*y`` element by element; its output must
    agree with ``5*x + 6*y`` evaluated through GPUArray operators to within
    a norm of 1e-5.
    """
    from pycuda.curandom import rand as curand
    from pycuda.elementwise import ElementwiseKernel

    x_dev = curand((50,))
    y_dev = curand((50,))

    lin_comb = ElementwiseKernel(
        "float a, float *x, float b, float *y, float *z",
        "z[i] = a*x[i] + b*y[i]",
        "linear_combination")

    z_dev = gpuarray.empty_like(x_dev)
    lin_comb(5, x_dev, 6, y_dev, z_dev)

    difference = (z_dev - (5 * x_dev + 6 * y_dev)).get()
    assert la.norm(difference) < 1e-5
def test_subset_minmax(self):
    """Check ``gpuarray.subset_min`` against NumPy on an index subset.

    The subset keeps index 0 and every later index that is not a multiple
    of ``gran``.  For each dtype the minimum over that subset computed on
    the device must equal the NumPy minimum of the same elements.
    """
    from pycuda.curandom import rand as curand

    l_a = 200000
    gran = 5
    l_m = l_a - l_a // gran + 1

    dtypes = [np.float32, np.int32]
    if has_double_support():
        dtypes.insert(0, np.float64)

    for dtype in dtypes:
        a_gpu = curand((l_a,), dtype)
        a = a_gpu.get()

        # Host buffer obtained from a zero-filled device array.
        subset = gpuarray.zeros(l_m, dtype=np.int32).get()
        candidate = 0
        for slot in range(len(subset)):
            subset[slot] = candidate
            candidate += 1
            if candidate % gran == 0:
                # Skip the next multiple of gran.
                candidate += 1

        subset_gpu = gpuarray.to_gpu(subset)
        selected = a[subset]

        expected = np.min(selected)
        actual = gpuarray.subset_min(subset_gpu, a_gpu).get()
        assert actual == expected
def test_transpose(self):
    """Check that ``GPUArray.T`` matches NumPy's ``.T`` on a 3-d array."""
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand

    dev = curand((10, 20, 30))
    host = dev.get()

    # Disabled in the original with the remark "not contiguous":
    # assert np.allclose(a_gpu.transpose((1,2,0)).get(), a.transpose((1,2,0)))
    from_device = dev.T.get()
    assert np.allclose(from_device, host.T)
def test_sum(self):
    """Compare ``gpuarray.sum`` with ``np.sum`` on 200000 random values.

    The relative difference must stay below 1e-4.
    """
    from pycuda.curandom import rand as curand

    dev = curand((200000,))
    host = dev.get()

    expected = np.sum(host)
    actual = gpuarray.sum(dev).get()

    relative_error = abs(actual - expected) / abs(expected)
    assert relative_error < 1e-4
def test_if_positive(self):
    """Check ``gpuarray.maximum`` / ``minimum`` against their NumPy twins.

    Two 20-element random arrays are combined element by element; the
    device results must match NumPy exactly.  The device maximum and the
    host maximum are also printed.
    """
    from pycuda.curandom import rand as curand

    length = 20
    x_dev = curand((length,))
    y_dev = curand((length,))
    x_host = x_dev.get()
    y_host = y_dev.get()

    import pycuda.gpuarray as gpuarray
    larger_dev = gpuarray.maximum(x_dev, y_dev)
    smaller_dev = gpuarray.minimum(x_dev, y_dev)

    print(larger_dev)
    print(np.maximum(x_host, y_host))

    max_error = la.norm(larger_dev.get() - np.maximum(x_host, y_host))
    assert max_error == 0
    min_error = la.norm(smaller_dev.get() - np.minimum(x_host, y_host))
    assert min_error == 0
def test_view_and_strides(self):
    """Check that ``view()`` of a sliced GPUArray keeps shape, strides, data.

    A ``[:3, :5]`` slice of a 5x10 float32 array is viewed; the view must
    report the slice's shape and strides and copy back the same values as
    the host-side slice.
    """
    from pycuda.curandom import rand as curand

    full = curand((5, 10), dtype=np.float32)
    window = full[:3, :5]
    viewed = window.view()

    assert viewed.shape == window.shape
    assert viewed.strides == window.strides

    from_view = viewed.get()
    from_host = full.get()[:3, :5]
    assert np.array_equal(from_view, from_host)
def test_sum(self):
    """Compare ``gpuarray.sum`` with ``numpy.sum`` on 200000 random values.

    The relative difference between the device and the host sum must stay
    below 1e-4.
    """
    from pycuda.curandom import rand as curand

    a_gpu = curand((200000,))
    a = a_gpu.get()

    sum_a = numpy.sum(a)
    sum_a_gpu = gpuarray.sum(a_gpu).get()

    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
def test_complex_bits(self):
    """Check ``real``, ``imag`` and ``conj`` of complex GPUArrays.

    For each supported complex dtype a random complex vector is built on
    the device; its real part, imaginary part and conjugate must match the
    NumPy results computed from the host copy exactly.
    """
    from pycuda.curandom import rand as curand
    from pytools import match_precision

    complex_types = [np.complex64]
    if has_double_support():
        complex_types.append(np.complex128)

    n = 20
    for tp in complex_types:
        dtype = np.dtype(tp)
        real_dtype = match_precision(np.dtype(np.float64), dtype)

        re_part = curand((n,), real_dtype).astype(dtype)
        im_part = curand((n,), real_dtype).astype(dtype)
        z = re_part + 1j * im_part

        assert la.norm(z.get().real - z.real.get()) == 0
        assert la.norm(z.get().imag - z.imag.get()) == 0
        assert la.norm(z.get().conj() - z.conj().get()) == 0
def test_astype(self):
    """Check ``GPUArray.astype`` between float32 and float64.

    Returns without testing when the device lacks double support.
    Widening must reproduce the host conversion exactly; narrowing must
    agree to a relative norm below 1e-7.
    """
    from pycuda.curandom import rand as curand

    if not has_double_support():
        return

    # float32 -> float64
    single_dev = curand((2000, ), dtype=np.float32)
    on_host = single_dev.get().astype(np.float64)
    on_device = single_dev.astype(np.float64).get()

    assert on_device.dtype == np.float64
    assert la.norm(on_host - on_device) == 0, (on_host, on_device)

    # float64 -> float32
    double_dev = curand((2000, ), dtype=np.float64)
    on_host = double_dev.get().astype(np.float32)
    on_device = double_dev.astype(np.float32).get()

    assert on_device.dtype == np.float32
    assert la.norm(on_host - on_device) / la.norm(on_host) < 1e-7
def test_newaxis(self):
    """Check that indexing with ``np.newaxis`` matches NumPy's shape/strides."""
    import pycuda.gpuarray as gpuarray
    from pycuda.curandom import rand as curand

    dev = curand((10, 20, 30))
    host = dev.get()

    index = (slice(None), np.newaxis)
    dev_expanded = dev[index]
    host_expanded = host[index]

    assert dev_expanded.shape == host_expanded.shape
    assert dev_expanded.strides == host_expanded.strides
def test_view_and_strides(self):
    """Check shape/strides of a view of a sliced GPUArray.

    The view must report the slice's shape and strides.  The final data
    comparison is wrapped in ``pytest.raises(AssertionError)``, so the test
    passes only when the view's contents differ from the host-side slice.
    """
    from pycuda.curandom import rand as curand

    full = curand((5, 10), dtype=np.float32)
    window = full[:3, :5]
    viewed = window.view()

    assert viewed.shape == window.shape
    assert viewed.strides == window.strides

    with pytest.raises(AssertionError):
        elementwise_equal = viewed.get() == full.get()[:3, :5]
        assert elementwise_equal.all()
def test_astype(self):
    """Check device-side dtype conversion in both float directions.

    Nothing is tested when ``has_double_support()`` is false.  float32 to
    float64 must match the host conversion exactly; float64 to float32
    must match to a relative norm below 1e-7.
    """
    from pycuda.curandom import rand as curand

    if not has_double_support():
        return

    count = 2000

    narrow_dev = curand((count,), dtype=np.float32)
    host_widened = narrow_dev.get().astype(np.float64)
    dev_widened = narrow_dev.astype(np.float64).get()
    assert dev_widened.dtype == np.float64
    assert la.norm(host_widened - dev_widened) == 0, (host_widened, dev_widened)

    wide_dev = curand((count,), dtype=np.float64)
    host_narrowed = wide_dev.get().astype(np.float32)
    dev_narrowed = wide_dev.astype(np.float32).get()
    assert dev_narrowed.dtype == np.float32
    assert la.norm(host_narrowed - dev_narrowed) / la.norm(host_narrowed) < 1e-7
def test_view_and_strides(self):
    """Check shape/strides of a view taken from a sliced GPUArray.

    Shape and strides of the view must equal those of the slice.  The data
    comparison sits inside ``pytest.raises(AssertionError)``: the test
    passes only when that comparison fails.
    """
    from pycuda.curandom import rand as curand

    source = curand((5, 10), dtype=np.float32)
    corner = source[:3, :5]
    corner_view = corner.view()

    assert corner_view.shape == corner.shape
    assert corner_view.strides == corner.strides

    import pytest
    with pytest.raises(AssertionError):
        same = corner_view.get() == source.get()[:3, :5]
        assert same.all()
def test_random(self):
    """Check that ``curand`` samples lie in the half-open interval [0, 1).

    float64 is included only when the device reports double support.
    """
    from pycuda.curandom import rand as curand

    dtypes = [np.float32]
    if has_double_support():
        dtypes.append(np.float64)

    for dtype in dtypes:
        samples = curand((10, 100), dtype=dtype).get()
        assert (0 <= samples).all()
        assert (samples < 1).all()
def test_complex_bits(self):
    """Check ``real``/``imag``/``conj`` values and contiguity on the device.

    For each supported complex dtype the three operations must match NumPy
    exactly.  Afterwards, for both "C" and "F" order, a zero-filled real
    array and a reshaped complex array must keep the matching contiguity
    flag through ``real``, ``imag`` and ``conj()``.
    """
    from pycuda.curandom import rand as curand
    from pytools import match_precision

    complex_types = [np.complex64]
    if has_double_support():
        complex_types.append(np.complex128)

    # Evaluated lazily, one at a time, in the same order as the checks.
    accessors = (
        lambda arr: arr,
        lambda arr: arr.real,
        lambda arr: arr.imag,
        lambda arr: arr.conj(),
    )
    flag_for_order = {"C": "c_contiguous", "F": "f_contiguous"}

    n = 20
    for tp in complex_types:
        dtype = np.dtype(tp)
        real_dtype = match_precision(np.dtype(np.float64), dtype)

        re_part = curand((n, ), real_dtype).astype(dtype)
        im_part = curand((n, ), real_dtype).astype(dtype)
        z = re_part + 1j * im_part

        assert la.norm(z.get().real - z.real.get()) == 0
        assert la.norm(z.get().imag - z.imag.get()) == 0
        assert la.norm(z.get().conj() - z.conj().get()) == 0

        # verify contiguity is preserved
        for order in ["C", "F"]:
            # test both zero and non-zero value code paths
            z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
            z2 = z.reshape(z.shape, order=order)
            flag_name = flag_for_order[order]
            for zdata in [z_real, z2]:
                for access in accessors:
                    assert getattr(access(zdata).flags, flag_name)
def test_complex_bits(self):
    """Check ``real``/``imag``/``conj`` values and contiguity on the device.

    For each supported complex dtype the three operations must match NumPy
    exactly.  Afterwards, for both "C" and "F" order, a zero-filled real
    array and a reshaped complex array must keep the matching contiguity
    flag through ``real``, ``imag`` and ``conj()``.
    """
    from pycuda.curandom import rand as curand
    from pytools import match_precision

    if has_double_support():
        dtypes = [np.complex64, np.complex128]
    else:
        dtypes = [np.complex64]

    n = 20
    for tp in dtypes:
        dtype = np.dtype(tp)
        real_dtype = match_precision(np.dtype(np.float64), dtype)

        z = (curand((n,), real_dtype).astype(dtype)
             + 1j * curand((n,), real_dtype).astype(dtype))

        assert la.norm(z.get().real - z.real.get()) == 0
        assert la.norm(z.get().imag - z.imag.get()) == 0
        assert la.norm(z.get().conj() - z.conj().get()) == 0

        # verify contiguity is preserved
        for order in ["C", "F"]:
            # test both zero and non-zero value code paths
            z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
            z2 = z.reshape(z.shape, order=order)
            for zdata in [z_real, z2]:
                # The flags are already booleans; assert them directly.
                if order == "C":
                    assert zdata.flags.c_contiguous
                    assert zdata.real.flags.c_contiguous
                    assert zdata.imag.flags.c_contiguous
                    assert zdata.conj().flags.c_contiguous
                elif order == "F":
                    assert zdata.flags.f_contiguous
                    assert zdata.real.flags.f_contiguous
                    assert zdata.imag.flags.f_contiguous
                    assert zdata.conj().flags.f_contiguous
def test_reduce_out(self):
    """Check that ``ReductionKernel`` writes into a caller-supplied ``out``.

    A max-reduction is run on each of the 10 rows of a 10x200 float32
    array, each time writing into one element of a preallocated result
    array.  The collected results must equal NumPy's row-wise maximum.
    """
    from pycuda.curandom import rand as curand
    from pycuda.reduction import ReductionKernel

    a_gpu = curand((10, 200), dtype=np.float32)
    a = a_gpu.get()

    red = ReductionKernel(np.float32, neutral=0,
                          reduce_expr="max(a,b)",
                          arguments="float *in")

    max_gpu = gpuarray.empty(10, dtype=np.float32)
    for i in range(10):
        red(a_gpu[i], out=max_gpu[i])

    # np.alltrue was removed in NumPy 2.0; np.all is the supported spelling.
    assert np.all(a.max(axis=1) == max_gpu.get())
def test_minimum_maximum_scalar(self):
    """Check ``gpuarray.maximum`` / ``minimum`` with a scalar operand.

    The scalar 0 is passed as second argument to ``maximum`` and as first
    argument to ``minimum``; both results must match NumPy exactly.
    """
    from pycuda.curandom import rand as curand

    length = 20
    dev = curand((length,))
    host = dev.get()

    import pycuda.gpuarray as gpuarray
    clipped_low = gpuarray.maximum(dev, 0)
    clipped_high = gpuarray.minimum(0, dev)

    low_error = la.norm(clipped_low.get() - np.maximum(host, 0))
    assert low_error == 0
    high_error = la.norm(clipped_high.get() - np.minimum(0, host))
    assert high_error == 0
def test_extract_columns(self):
    """Run 20 randomized checks of ``extract_columns``.

    Each round draws a dtype, a matrix and a non-empty column range
    ``a:b``, calls ``extract_columns(X, a, b)`` and asserts that the result
    equals the same column slice of the host copy.
    """
    for _ in range(20):
        dtype = random.choice((np.float32, np.float64))
        N = np.random.randint(100, 1000)
        M = np.random.randint(100, 1000)
        # Keep a <= M - 2: with a == M - 1 the next draw would be
        # randint(M, M), which raises ValueError because the range is empty.
        a = np.random.randint(0, M - 1)
        b = np.random.randint(a + 1, M)
        m = b - a
        assert m > 0

        X = curand((N, M), dtype)
        Y = extract_columns(X, a, b)

        self.assertTrue(np.all(X.get()[:, a:b] == Y.get()))
def test_minimum_maximum_scalar(self):
    """Check element-wise max/min of a GPUArray against the scalar 0.

    ``gpuarray.maximum(array, 0)`` and ``gpuarray.minimum(0, array)`` must
    reproduce the corresponding NumPy calls exactly.
    """
    from pycuda.curandom import rand as curand

    count = 20
    values_dev = curand((count, ))
    values_host = values_dev.get()

    import pycuda.gpuarray as gpuarray
    at_least_zero = gpuarray.maximum(values_dev, 0)
    at_most_zero = gpuarray.minimum(0, values_dev)

    expected_max = np.maximum(values_host, 0)
    assert la.norm(at_least_zero.get() - expected_max) == 0

    expected_min = np.minimum(0, values_host)
    assert la.norm(at_most_zero.get() - expected_min) == 0
def test_copy(self):
    """Check that strided GPU slices copy to the host like NumPy slices.

    Arrays of rank 2, 3 (plain and axis-permuted) and 4 are indexed on the
    device and on the host copy with the same index; both results must be
    close.
    """
    from pycuda.curandom import rand as curand

    def matches_host(dev, index):
        # Slice on the device first, then slice the full host copy.
        return np.allclose(dev[index].get(), dev.get()[index])

    all_steps = [(0, 3, 1), (1, 2, 1), (0, 3, 2), (0, 3, 3)]
    everything = slice(None)

    for shape in [(3, 3), (3, 1)]:
        dev = curand(shape)
        for lo, hi, stride in all_steps:
            assert matches_host(dev, slice(lo, hi, stride))

    dev = curand((3, 3, 3))
    for lo, hi, stride in all_steps:
        part = slice(lo, hi, stride)
        assert matches_host(dev, (part, part))

    dev = curand((3, 3, 3)).transpose((1, 2, 0))
    for lo, hi, stride in all_steps:
        part = slice(lo, hi, stride)
        assert matches_host(dev, (part, everything, part))

    # 4-d should work as long as only 2 axes are discontiguous
    dev = curand((3, 3, 3, 3))
    for lo, hi, stride in [(0, 3, 1), (1, 2, 1), (0, 3, 3)]:
        part = slice(lo, hi, stride)
        assert matches_host(dev, (part, everything, part))
def test_slice(self):
    """Check 200 random contiguous slices of a 1-d GPUArray against NumPy."""
    from pycuda.curandom import rand as curand
    from random import randrange

    length = 20000
    dev = curand((length, ))
    host = dev.get()

    for _ in range(200):
        first = randrange(length)
        last = randrange(first, length)

        dev_part = dev[first:last]
        host_part = host[first:last]

        assert la.norm(dev_part.get() - host_part) == 0
def test_slice(self):
    """Compare random ``start:end`` slices taken on the device and the host.

    200 random ranges of a 20000-element array are sliced on both sides;
    the norm of the difference must be exactly zero each time.
    """
    from pycuda.curandom import rand as curand
    from random import randrange

    size = 20000
    on_device = curand((size,))
    on_host = on_device.get()

    attempts = 200
    for _ in range(attempts):
        begin = randrange(size)
        finish = randrange(begin, size)
        window = slice(begin, finish)

        mismatch = la.norm(on_device[window].get() - on_host[window])
        assert mismatch == 0
def test_2d_slice_c(self):
    """Check 200 random row-range slices of a 1000x300 GPUArray against NumPy."""
    from pycuda.curandom import rand as curand
    from random import randrange

    n_rows = 1000
    n_cols = 300
    dev = curand((n_rows, n_cols))
    host = dev.get()

    for _ in range(200):
        first = randrange(n_rows)
        last = randrange(first, n_rows)

        dev_rows = dev[first:last]
        host_rows = host[first:last]

        assert la.norm(dev_rows.get() - host_rows) == 0
def test_minmax(self):
    """Compare ``gpuarray.min`` / ``max`` with NumPy for several dtypes.

    float64 is included only when the device reports double support.  On
    failure the assertion message carries both values, the dtype and the
    name of the reduction.
    """
    from pycuda.curandom import rand as curand

    dtypes = [np.float32, np.int32]
    if has_double_support():
        dtypes.insert(0, np.float64)

    for what in ["min", "max"]:
        for dtype in dtypes:
            dev = curand((200000, ), dtype)
            host = dev.get()

            host_result = getattr(np, what)(host)
            dev_result = getattr(gpuarray, what)(dev).get()

            assert dev_result == host_result, (dev_result, host_result, dtype, what)
def test_minmax(self):
    """Check device-side ``min`` and ``max`` reductions against NumPy.

    Each reduction is looked up by name on both ``np`` and ``gpuarray`` and
    applied to a 200000-element random array of every tested dtype; the
    results must be equal.
    """
    from pycuda.curandom import rand as curand

    tested_dtypes = [np.float32, np.int32]
    if has_double_support():
        tested_dtypes = [np.float64] + tested_dtypes

    for what in ["min", "max"]:
        for dtype in tested_dtypes:
            sample_dev = curand((200000,), dtype)
            sample_host = sample_dev.get()

            reference = getattr(np, what)(sample_host)
            computed = getattr(gpuarray, what)(sample_dev).get()

            assert computed == reference, (computed, reference, dtype, what)
def test_2d_slice_c(self):
    """Compare random leading-axis slices of a 2-d GPUArray with NumPy.

    200 random ``start:end`` row ranges of a 1000x300 array are taken on
    the device and on the host copy; the difference norm must be zero.
    """
    from pycuda.curandom import rand as curand
    from random import randrange

    height = 1000
    width = 300
    on_device = curand((height, width))
    on_host = on_device.get()

    for _ in range(200):
        begin = randrange(height)
        finish = randrange(begin, height)
        rows = slice(begin, finish)

        mismatch = la.norm(on_device[rows].get() - on_host[rows])
        assert mismatch == 0
def test_2d_slice_f(self):
    """Check random column-range slices of an "F"-ordered GPUArray.

    The buffer of a 1000x300 random array is re-wrapped as a 300x1000
    float32 array with ``order="F"``; 200 random ``[:, start:end]`` slices
    must copy back the same values as the host-side slices.
    """
    from pycuda.curandom import rand as curand
    import pycuda.gpuarray as gpuarray
    from random import randrange

    n = 1000
    m = 300
    backing = curand((n, m))
    dev_f = gpuarray.GPUArray((m, n), np.float32,
                              gpudata=backing.gpudata,
                              order="F")
    host_f = dev_f.get()

    for _ in range(200):
        first = randrange(n)
        last = randrange(first, n)

        dev_cols = dev_f[:, first:last]
        host_cols = host_f[:, first:last]

        assert la.norm(dev_cols.get() - host_cols) == 0
def test_2d_slice_f(self):
    """Compare trailing-axis slices of a column-major GPUArray with NumPy.

    A 300x1000 ``order="F"`` array is created on top of the device buffer
    of a 1000x300 random array.  For 200 random column ranges the device
    slice must equal the slice of the host copy.
    """
    from pycuda.curandom import rand as curand
    import pycuda.gpuarray as gpuarray
    from random import randrange

    long_side = 1000
    short_side = 300
    source = curand((long_side, short_side))
    fortran_dev = gpuarray.GPUArray(
        (short_side, long_side), np.float32,
        gpudata=source.gpudata, order="F")
    fortran_host = fortran_dev.get()

    for _ in range(200):
        begin = randrange(long_side)
        finish = randrange(begin, long_side)
        columns = (slice(None), slice(begin, finish))

        mismatch = la.norm(fortran_dev[columns].get() - fortran_host[columns])
        assert mismatch == 0
def main():
    """Benchmark the PyCUDA dot-product reduction kernel and print a table.

    For float32 and float64 outputs and vector lengths 2**15 .. 2**26, the
    kernel from ``get_dot_kernel`` is run 3 times as warm-up and 10 times
    under CUDA-event timing.  One table row per configuration reports the
    output type, the input size, the mean time per call and the resulting
    memory bandwidth.
    """
    from pytools import Table
    from pycuda.curandom import rand as curand
    from pycuda.reduction import get_dot_kernel

    tbl = Table()
    tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]"))

    for dtype_out in [numpy.float32, numpy.float64]:
        for ex in range(15, 27):
            sz = 1 << ex
            print(sz)

            a_gpu = curand((sz, ))
            b_gpu = curand((sz, ))
            assert sz == a_gpu.shape[0]
            assert len(a_gpu.shape) == 1

            krnl = get_dot_kernel(dtype_out, a_gpu.dtype)

            # One-element list so the nested timer can accumulate into it.
            elapsed = [0]

            def wrap_with_timer(f):
                def result(*args, **kwargs):
                    start = cuda.Event()
                    stop = cuda.Event()
                    start.record()
                    f(*args, **kwargs)
                    stop.record()
                    # Block until the kernel finished so the interval is valid.
                    stop.synchronize()
                    elapsed[0] += stop.time_since(start)

                return result

            # warm-up
            for _ in range(3):
                krnl(a_gpu, b_gpu)

            cnt = 10
            for _ in range(cnt):
                krnl(a_gpu, b_gpu, kernel_wrapper=wrap_with_timer)

            # Two input vectors are read on every timed call.
            bytes_moved = a_gpu.nbytes * 2 * cnt
            secs = elapsed[0] * 1e-3

            tbl.add_row((
                str(dtype_out),
                a_gpu.nbytes / (1 << 20),
                elapsed[0] / cnt,
                bytes_moved / secs / 1e9,
            ))

    print(tbl)
from pycuda.curandom import rand as curand  # import CUDA random number module

a_gpu = curand((50,))  # create a 1-d array with random numbers
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel  # import ElementwiseKernel module

# specify the detail of the element-wise operation
lin_comb = ElementwiseKernel(
    " float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]")

c_gpu = gpuarray.empty_like(a_gpu)  # create a GPU array of the same size
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)  # run the ElementwiseKernel function

# The kernel result must agree with the same expression in GPUArray arithmetic.
assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5

# print() with one argument behaves the same on Python 2 and Python 3.
print(a_gpu)
print(b_gpu)
print(c_gpu)
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy

from pycuda.curandom import rand as curand

a_gpu = curand((50,))
b_gpu = curand((50,))

from pycuda.elementwise import ElementwiseKernel

# The preamble defines a device helper that the element-wise expression calls.
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = my_f(a*x[i], b*y[i])",
    "linear_combination",
    preamble=""" __device__ float my_f(float x, float y) { return x + y; } """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

# print() with one argument behaves the same on Python 2 and Python 3.
print(c_gpu)

#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
from pycuda.curandom import rand as curand
import numpy

n = 1024

# Naive product of two n x n row-major matrices: one thread per output
# element, with x selecting the row and y the column.
matMultKernel = """
__global__ void mat_mult(float *a, float *b, float *c)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Accumulate locally so c is updated once, not on every k.
    float acc = 0.0f;
    for (int k = 0; k < %(ENE)s; k++)
        acc += a[k + x * %(ENE)s] * b[y + k * %(ENE)s];
    c[y + x * %(ENE)s] += acc;
}
"""

a_gpu = curand((n, n))
b_gpu = curand((n, n))
c_gpu = gpuarray.zeros((n, n), dtype=numpy.float32)

matMultKernel = matMultKernel % {
    "ENE": n,
}

mod = SourceModule(matMultKernel)
mat_mult = mod.get_function("mat_mult")

# The kernel has no bounds check, so the grid must tile the matrix exactly;
# n is a multiple of the 32 x 32 block.
mat_mult(a_gpu, b_gpu, c_gpu, block=(32, 32, 1), grid=(n // 32, n // 32, 1))

print(a_gpu.get())
print("-" * 80)
print(b_gpu.get())
# Reverse a random vector on the GPU with an element-wise kernel and print
# the input, the output and the vector length.
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

n = 1000000

# NOTE(review): the kernel body reads `n`, while the declared scalar argument
# is `c`.  Presumably `n` resolves to the element count that PyCUDA passes to
# generated element-wise kernels, leaving `c` unused — confirm.
reverseKernel = ElementwiseKernel("float *a, float *b, int c", "b[i] = a[n-1-i]", "reverse")

# `(n)` is a plain int here, not a 1-tuple.
a_gpu = curand((n))
b_gpu = gpuarray.empty_like(a_gpu)

reverseKernel(a_gpu, b_gpu, n)

print(a_gpu)
print("-" * 80)
print(b_gpu)
print("-" * 80)
print(n)
def SSA(update_matrix, initial_conditions, function_rates, t_max, **kwargs):  # noqa
    """Build, compile and launch the ``ssa_simple`` CUDA kernel.

    The rate expressions in ``function_rates`` are turned into C statements
    by plain substring replacement, spliced into the module-level kernel
    template ``_kernel_str`` and compiled with ``SourceModule``.

    Args:
        update_matrix: array uploaded to the device as float32.
        initial_conditions: array tiled once per iteration and uploaded as
            float32.
        function_rates: rate expressions as strings (``str.replace`` is
            called on each); a deep copy is modified, the argument is not.
        t_max: end time, passed to the kernel as float32 (start time is 0).
        **kwargs: reads ``variables`` (names replaced by state-array
            accesses), ``parameters`` (mapping of name to value),
            ``iterations`` (used as thread-block width and leading array
            dimension) and the optional ``print_cuda`` flag.

    Returns:
        The device array of shape
        ``(iterations, _num_steps, len(variables) + 1)`` passed to the
        kernel as its output buffer.
    """
    # Fix the maximum number of steps available at each repetition. Should be function of the
    # amount of memory available on the device and the number of iterations (= threads) requested.
    _num_steps = 20
    _num_reacs = len(kwargs["variables"])
    start_time, end_time = np.float32(0), np.float32(t_max)
    function_rates_wo_param = deepcopy(function_rates)
    for fr_id, f_rate in enumerate(function_rates_wo_param):
        # NOTE(review): substring replacement — a parameter or variable name
        # that is contained in another name would also be replaced there.
        for par, val in kwargs["parameters"].items():
            f_rate = f_rate.replace(par, str(val))
        for sp_id, spec in enumerate(kwargs["variables"]):
            f_rate = f_rate.replace(
                spec, "_time_and_states[th_id * (@num__reacs@ + 1) * @num__rep@"
                " + rep * (@num__reacs@ + 1) + 1 + {}]".format(sp_id))
        function_rates_wo_param[fr_id] = f_rate
    # One "_rates_arr[i] = <expression>;" statement per rate, one per line.
    unroll_func_rate = "\n".join(
        (f_rate.join(("_rates_arr[{}] = ".format(fr_id), ";"))
         for fr_id, f_rate in enumerate(function_rates_wo_param)))
    # The rate statements are inserted first because they still contain the
    # @num__rep@ / @num__reacs@ placeholders substituted below.
    kernel_ready = _kernel_str \
        .replace("@unroll__func__rate@", unroll_func_rate) \
        .replace("@num__iter@", str(kwargs["iterations"])) \
        .replace("@num__rep@", str(_num_steps)) \
        .replace("@num__reacs@", str(_num_reacs))
    if kwargs.get("print_cuda"):
        # Dump the generated source with line numbers starting at 2.
        print("\n".join(
            " ".join((str(line_no + 2), line))
            for line_no, line in enumerate(kernel_ready.split("\n"))))
    upd_mat_dev = gpuarray.to_gpu(update_matrix.astype(np.float32))
    # The vector of initial conditions has to be repeated for each thread, since in the future,
    # when we will split in chunks, each chunk will restart from a different initial condition.
    init_cond_dev = gpuarray.to_gpu(
        np.tile(initial_conditions.astype(np.float32), (kwargs["iterations"], 1)))
    # Each thread should produce its own array of random numbers or at least have access to a
    # private set of random numbers: we need two numbers for each repetition, one to select the
    # reaction and one to select the time.
    # Note that pycuda.curandom.rand is a toy-random generator, and all the threads share the array.
    # https://documen.tician.de/pycuda/array.html?highlight=random#module-pycuda.curandom
    rand_arr_dev = curand((_num_steps, 2, kwargs["iterations"]))
    # There seems to be no need to manually copy back to host gpuarrays, see example/demo.py.
    time_states_dev = gpuarray.GPUArray(
        (kwargs["iterations"], _num_steps, _num_reacs + 1), dtype=np.float32)
    mod = SourceModule(kernel_ready)
    func = mod.get_function("ssa_simple")
    # A single thread block with one thread per iteration.
    func(upd_mat_dev, init_cond_dev, start_time, end_time, time_states_dev, rand_arr_dev,
         block=(kwargs["iterations"], 1, 1))
    return time_states_dev
import pycuda.gpuarray as gpuarray import pycuda.autoinit import numpy from pycuda.curandom import rand as curand from pycuda.elementwise import ElementwiseKernel import numpy.linalg as la input_vector_a = curand((50, )) input_vector_b = curand((50, )) mult_coefficient_a = 2 mult_coefficient_b = 5 linear_combination = ElementwiseKernel( "float a, float *x, float b, float *y, float *c", "c[i] = a*x[i] + b*y[i]", "linear_combination") linear_combination_result = gpuarray.empty_like(input_vector_a) linear_combination(mult_coefficient_a, input_vector_a,\ mult_coefficient_b, input_vector_b,\ linear_combination_result) print("INPUT VECTOR A =") print(input_vector_a) print("INPUT VECTOR B = ") print(input_vector_b) print("RESULTING VECTOR C = ") print linear_combination_result print(
def cuda_mutate(sols, prob_mut, mut_range, min_param, max_param):
    """Mutate the gene part of every solution on the GPU.

    Genes before ``constants.META_INFO_COUNT + <number of nodes>`` are left
    as they are.  Each remaining gene receives a random offset drawn from
    ``[-mut_range, mut_range)`` multiplied by the mask that
    ``form_mutation_mask`` fills in from a second random array and
    ``prob_mut``.  The cost, misc and age genes of every mutant are reset
    (cost genes to -1, the others to 0).

    Args:
        sols: 2-d sequence or array of solutions; all rows must have the
            same length.
        prob_mut: mutation probability handed to ``form_mutation_mask``.
        mut_range: half-width of the interval of mutation offsets.
        min_param: accepted for interface compatibility; not used here.
        max_param: accepted for interface compatibility; not used here.

    Returns:
        The mutated solutions as a list of lists of floats.
    """
    # ALL SOLUTIONS MUST BE OF SAME LENGTH
    # Convert once so that 2-d indexing also works for plain nested lists.
    sols_arr = numpy.asarray(sols)
    num_sols = len(sols_arr)

    num_nodes = (netParams.nodeConfig['I']
                 + netParams.nodeConfig['H']
                 + netParams.nodeConfig['O'])

    # Do not mutate the architecture part of a solution.
    mutate_from = constants.META_INFO_COUNT + num_nodes
    m_range = 2 * mut_range

    old_sols = numpy.array(sols_arr[:, mutate_from:], numpy.float32)
    mutants = sols_arr.astype(numpy.float32)
    sol_len = len(old_sols[0])

    # Copy to the GPU.
    sols_gpu = gpuarray.to_gpu(old_sols)

    # Mutation offsets: [0, 1) samples scaled and shifted into
    # [-mut_range, mut_range).
    Mvals_gpu = (curand((num_sols, sol_len), numpy.float32) * m_range) - mut_range

    # Per-gene random numbers and the mask derived from them.
    Mprob_gpu = curand((num_sols, sol_len), numpy.float32)
    MutMask_gpu = gpuarray.zeros_like(Mprob_gpu).astype(numpy.float32)
    form_mutation_mask(Mprob_gpu, MutMask_gpu, prob_mut)

    # Apply the masked offsets.
    MutSols_gpu = sols_gpu + (MutMask_gpu * Mvals_gpu)

    mutants[:, mutate_from:] = MutSols_gpu.get()
    mutants[:, constants.COST_GENE] = -1.0
    mutants[:, constants.COST2_GENE] = -1.0
    mutants[:, constants.MISC_GENE] = 0.0
    mutants[:, constants.AGE_GENE] = 0.0

    if debug:
        print("sols", sols)
        print("mut_mask", MutMask_gpu.view())
        print("mut_sols", mutants)

    return mutants.tolist()
@author: bhaumik """ import pycuda.gpuarray as gpuarray import pycuda.driver as drv from pycuda.elementwise import ElementwiseKernel import pycuda.autoinit from pycuda.curandom import rand as curand # Kernel function add = ElementwiseKernel("float *d_a, float *d_b, float *d_c", "d_c[i] = d_a[i] + d_b[i]", "add") # create a couple of random matrices with a given shape shape = 1000000 d_a = curand(shape) d_b = curand(shape) d_c = gpuarray.empty_like(d_a) start = drv.Event() end = drv.Event() start.record() # Calling kernel add(d_a, d_b, d_c) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 print("Addition of %d element of GPU" % shape) print("%fs" % (secs)) # check the result if d_c == (d_a + d_b): print("The sum computed on GPU is correct")
def test_struct_reduce(self):
    """Check a ``ReductionKernel`` whose result is a registered struct dtype.

    The CUDA preamble defines ``minmax_collector`` (two floats) and
    ``agg_mmc``, which merges two collectors with ``fminf`` / ``fmaxf``.
    Reducing a 20000-element float32 array must yield the same minimum and
    maximum as NumPy.
    """
    # The CUDA source is assembled from adjacent string literals.
    preamble = (
        " struct minmax_collector { "
        "float cur_min; float cur_max; "
        "__device__ minmax_collector() { } "
        "__device__ minmax_collector(float cmin, float cmax) "
        ": cur_min(cmin), cur_max(cmax) { } "
        "__device__ minmax_collector(minmax_collector const &src) "
        ": cur_min(src.cur_min), cur_max(src.cur_max) { } "
        "__device__ minmax_collector(minmax_collector const volatile &src) "
        ": cur_min(src.cur_min), cur_max(src.cur_max) { } "
        "__device__ minmax_collector volatile &operator=( "
        "minmax_collector const &src) volatile "
        "{ cur_min = src.cur_min; cur_max = src.cur_max; return *this; } "
        "}; "
        "__device__ minmax_collector agg_mmc(minmax_collector a, minmax_collector b) "
        "{ return minmax_collector( "
        "fminf(a.cur_min, b.cur_min), fmaxf(a.cur_max, b.cur_max)); "
        "} "
    )

    fields = [("cur_min", np.float32), ("cur_max", np.float32)]
    mmc_dtype = np.dtype(fields)

    from pycuda.curandom import rand as curand
    samples_dev = curand((20000, ), dtype=np.float32)
    samples = samples_dev.get()

    # Map the NumPy record dtype to the C struct name used by the kernel.
    from pycuda.tools import register_dtype
    register_dtype(mmc_dtype, "minmax_collector")

    from pycuda.reduction import ReductionKernel
    reducer = ReductionKernel(
        mmc_dtype,
        # FIXME: needs infinity literal in real use, ok here
        neutral="minmax_collector(10000, -10000)",
        reduce_expr="agg_mmc(a, b)",
        map_expr="minmax_collector(x[i], x[i])",
        arguments="float *x",
        preamble=preamble,
    )

    collected = reducer(samples_dev).get()

    assert collected["cur_min"] == np.min(samples)
    assert collected["cur_max"] == np.max(samples)
from pycuda.reduction import ReductionKernel  # import ReductionKernel module

# specify the detail of the reduction operation:
# multiply element-wise (map), then add everything up (reduce)
dot = ReductionKernel(
    dtype_out=numpy.float32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]*y[i]",
    arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

x = curand((1000*1000), dtype=numpy.float32)
y = curand((1000*1000), dtype=numpy.float32)

x_dot_y = dot(x, y).get()
# Host reference for comparison with the device result.
x_dot_y_cpu = numpy.dot(x.get(), y.get())

# print() with one argument behaves the same on Python 2 and Python 3.
print(x)
print(y)
print(x_dot_y)
print(x_dot_y_cpu)
TILE_DIM = 32

# Transpose of an n x m row-major matrix `a` into the m x n matrix `b`.
# x walks the columns of `a`, y walks its rows.  The original index
# arithmetic only produced a transpose for n == m.
transpuestaKernel = """
__global__ void transpuesta(float *a, float *b)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < %(EME)s && y < %(ENE)s)
        b[y + x * %(ENE)s] = a[x + y * %(EME)s];
}
"""

transpuestaKernel = transpuestaKernel % {"ENE": n, "EME": m}

a_gpu = curand((n * m))
b_gpu = gpuarray.empty_like(a_gpu)

mod = SourceModule(transpuestaKernel)
func = mod.get_function("transpuesta")

# Round the grid up so partial tiles are covered; the kernel's bounds
# check discards the surplus threads.
func(a_gpu, b_gpu,
     block=(TILE_DIM, TILE_DIM, 1),
     grid=((m + TILE_DIM - 1) // TILE_DIM, (n + TILE_DIM - 1) // TILE_DIM, 1))

a_gpu = a_gpu.reshape((n, m))
b_gpu = b_gpu.reshape((m, n))

print(a_gpu)
print("-" * 80)
# print(torch.cuda.device_count()) # # print(torch.cuda.get_device_name(0)) # import torch.cuda # if torch.cuda.is_available(): # print('PyTorch found cuda') # else: # print('PyTorch could not find cuda') # # import pycuda # from pycuda import compiler # import pycuda.driver as drv # # drv.init() # print("%d device(s) found." % drv.Device.count()) # # for ordinal in range(drv.Device.count()): # dev = drv.Device(ordinal) # print(ordinal, dev.name()) from pycuda import gpuarray from pycuda.curandom import rand as curand # -- initialize the device import pycuda.autoinit height = 100 width = 200 X = curand((height, width), np.float32) X.flags.c_contiguous print(type(X))
# Dot product of two random float32 vectors computed with a ReductionKernel.
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel
from pycuda.curandom import rand as curand
import numpy

n = 1000000

a = curand(n, dtype=numpy.float32)
b = curand(n, dtype=numpy.float32)

# map_expr multiplies the inputs element-wise; reduce_expr adds the products,
# starting from the neutral element "0".
dotKernel = ReductionKernel(numpy.float32, neutral="0", reduce_expr="a+b", map_expr="x[i]*y[i]", arguments="float *x, float*y")

# .get() copies the reduction result back to the host.
doot = dotKernel(a, b).get()

print(doot)
import pycuda.driver as drv

s = time()
dC = cumath.log(dA)
# Kernel launches return before the GPU has finished; wait for the device
# so the host timer covers the computation and not only the launch.
drv.Context.synchronize()
e = time()
print('gpu elapsed time: %f \n' % (e-s))

###################
# 3) elementwise kernel
# performs array operations much faster than gpu_array
print('\n elementwise kernel\n')
print('---------------------\n')

from pycuda.curandom import rand as curand
a_gpu = curand((1000,))
b_gpu = curand((1000,))

from pycuda.elementwise import ElementwiseKernel
lin_comb = ElementwiseKernel(
    "float a, float *x, float b, float *y, float *z",
    "z[i] = a*x[i] + b*y[i]",
    "linear_combination")

c_gpu = gpuarray.empty_like(a_gpu)

# NOTE(review): the first call may also include one-time kernel
# compilation — confirm before comparing timings.
s = time()
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
drv.Context.synchronize()
e = time()
print('elementwise kernel elapsed time: %f \n' % (e-s))
# Element wise add operation from __future__ import absolute_import import pycuda.driver as cuda import pycuda.gpuarray as gpuarray import pycuda.autoinit import torch import numpy from pycuda.curandom import rand as curand # Vector size N = 10000 a_gpu = curand((N, )) b_gpu = 1 - a_gpu c_cpu = torch.cuda.FloatTensor(N) from pycuda.elementwise import ElementwiseKernel func_kernel = ElementwiseKernel("float *a, float *b, float *c", "c[i] = a[i] + b[i]", "add") c_gpu = gpuarray.empty_like(a_gpu) func_kernel(a_gpu, b_gpu, c_gpu) # Copy result to host #cuda.memcpy_dtoh(c_cpu, c_gpu)
# Compile the CUDA source and look up the kernels used by the simulation.
mod = SourceModule(source)
get_energy = mod.get_function("energy")
polKroku = mod.get_function("polKroku")  # presumably "half step" (Polish) — confirm
fupdate = mod.get_function("fupdate")
leapfrog = mod.get_function("leapfrog")
repopulate = mod.get_function("repopulate")
#sila = mod.get_function("sila")

# Initialize data
t = 0
particles = []
velocities = []
energy = []
celllist = {}

# random velocities
# Drawn on the GPU, copied to the host and stored as float32.
px = curand((stale.particleNumber, )).get().astype(np.float32)
py = curand((stale.particleNumber, )).get().astype(np.float32)
# velocity distribution around 0, not 0.5
px = px - 0.5
py = py - 0.5

# Here we have energy, not velocity ([XXX] needs correction)
v = np.zeros((stale.particleNumber, )).astype(np.float32)
# One zero-initialised float32 entry per particle in each array.
rx = np.zeros((stale.particleNumber, )).astype(np.float32)
ry = np.zeros((stale.particleNumber, )).astype(np.float32)
fx = np.zeros((stale.particleNumber, )).astype(np.float32)
fy = np.zeros((stale.particleNumber, )).astype(np.float32)

# Initializing a list of neighbors (structure)
# It reduces complexity from O(N^2) to O(N)
# Every slot starts at -1; the array has stale.rn columns per particle.
nl = (-1) * np.ones((stale.particleNumber, stale.rn)).astype(np.float32)
def test_struct_reduce(self):
    """Reduce a float32 array to a (min, max) struct and compare with NumPy.

    ``minmax_collector`` and its combiner ``agg_mmc`` are defined in the
    CUDA preamble; the matching NumPy record dtype is registered under the
    struct's name before the ``ReductionKernel`` is built.
    """
    # Struct definition and combiner, split into adjacent string literals.
    struct_source = (
        " struct minmax_collector { float cur_min; float cur_max; "
        "__device__ minmax_collector() { } "
        "__device__ minmax_collector(float cmin, float cmax) "
        ": cur_min(cmin), cur_max(cmax) { } "
        "__device__ minmax_collector(minmax_collector const &src) "
        ": cur_min(src.cur_min), cur_max(src.cur_max) { } "
        "__device__ minmax_collector(minmax_collector const volatile &src) "
        ": cur_min(src.cur_min), cur_max(src.cur_max) { } "
        "__device__ minmax_collector volatile &operator=( "
        "minmax_collector const &src) volatile "
        "{ cur_min = src.cur_min; cur_max = src.cur_max; return *this; } }; "
    )
    combiner_source = (
        "__device__ minmax_collector agg_mmc(minmax_collector a, minmax_collector b) "
        "{ return minmax_collector( "
        "fminf(a.cur_min, b.cur_min), fmaxf(a.cur_max, b.cur_max)); } "
    )
    preamble = struct_source + combiner_source

    mmc_dtype = np.dtype([("cur_min", np.float32), ("cur_max", np.float32)])

    from pycuda.curandom import rand as curand
    values_dev = curand((20000,), dtype=np.float32)
    values = values_dev.get()

    from pycuda.tools import register_dtype
    register_dtype(mmc_dtype, "minmax_collector")

    from pycuda.reduction import ReductionKernel
    kernel_options = dict(
        # FIXME: needs infinity literal in real use, ok here
        neutral="minmax_collector(10000, -10000)",
        reduce_expr="agg_mmc(a, b)",
        map_expr="minmax_collector(x[i], x[i])",
        arguments="float *x",
        preamble=preamble,
    )
    red = ReductionKernel(mmc_dtype, **kernel_options)

    extremes = red(values_dev).get()

    assert extremes["cur_min"] == np.min(values)
    assert extremes["cur_max"] == np.max(values)
from pycuda.reduction import ReductionKernel
import numpy

# The kernel strings are compiled as C: they must use the ASCII asterisk.
# The original contained U+2217 ("∗") in its place, which is not a valid
# multiplication or pointer token.
dot = ReductionKernel(dtype_out=numpy.float32,
                      neutral="0",
                      reduce_expr="a+b",
                      map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

from pycuda.curandom import rand as curand

x = curand((1000 * 1000), dtype=numpy.float32)
y = curand((1000 * 1000), dtype=numpy.float32)

x_dot_y = dot(x, y).get()
# Host reference for comparison with the device result.
x_dot_y_cpu = numpy.dot(x.get(), y.get())

# print() with one argument behaves the same on Python 2 and Python 3.
print(x_dot_y)
print(x_dot_y_cpu)
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy

from pycuda.curandom import rand as curand

a_gpu = curand((50, ))
b_gpu = curand((50, ))

from pycuda.elementwise import ElementwiseKernel

# my_f is defined in the preamble and called from the element-wise expression.
lin_comb = ElementwiseKernel("float a, float *x, float b, float *y, float *z",
                             "z[i] = my_f(a*x[i], b*y[i])",
                             "linear_combination",
                             preamble=""" __device__ float my_f(float x, float y) { return x + y; } """)

c_gpu = gpuarray.empty_like(a_gpu)
lin_comb(5, a_gpu, 6, b_gpu, c_gpu)

# print() with one argument behaves the same on Python 2 and Python 3.
print(c_gpu)

#print (5*a_gpu+6*b_gpu)
#import numpy.linalg as la
#assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import numpy as np
from pycuda.compiler import SourceModule
from pycuda.elementwise import ElementwiseKernel
from pycuda.curandom import rand as curand

# Element-wise sum of two arrays written into a third.
add = ElementwiseKernel("float *a, float *b, float *c",
                        "c[i] = a[i] + b[i]",
                        "add")

shape = 128, 1024
a_gpu = curand(shape)
b_gpu = curand(shape)
c_gpu = gpuarray.empty_like(a_gpu)

add(a_gpu, b_gpu, c_gpu)

# Largest absolute deviation from the sum recomputed on the host.
# print() with one argument behaves the same on Python 2 and Python 3.
print(np.max(np.abs(c_gpu.get() - a_gpu.get() - b_gpu.get())))
n = 1024 m = 512 l = 128 matMultKernel = """ __global__ void mat_mult(float *a, float *b, float *c) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; for(int k = 0; k < %(EME)s; k++) c[y + x * %(ELE)s] += a[k + x * %(EME)s] * b[y + k * %(ELE)s]; } """ a_gpu = curand((n,m)) b_gpu = curand((m,l)) c_gpu = gpuarray.zeros((n,l), dtype=numpy.float32) matMultKernel = matMultKernel % { "EME" : m, "ELE" : l } mod = SourceModule(matMultKernel) mat_mult = mod.get_function("mat_mult") mat_mult( a_gpu, b_gpu, c_gpu,