// Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}); unsigned int b = (idx%(${B}*${C}))/${C}; unsigned int c = (idx%(${B}*${C}))%${C}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b,c)] = 100; } } """ ) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = SourceModule( func_mod_template.substitute( max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C ) ) func = func_mod.get_function("func") x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, 0, :] = 100 print "Success status: ", np.allclose(x_np, x_gpu.get())
blockIdx.x*${max_threads_per_block}+threadIdx.x; // Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}); unsigned int b = (idx%(${B}*${C}))/${C}; unsigned int c = (idx%(${B}*${C}))%${C}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b,c)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs( pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, 0, :] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get())
import pycuda.gpuarray as gpuarray from pycuda.compiler import SourceModule import numpy as np from scikits.cuda.misc import get_dev_attrs, select_block_grid_sizes # Device selected by PyCUDA: dev = pycuda.autoinit.device # Allocate input and output arrays: a = np.asarray(np.random.rand(1000, 1000), np.float32) b = np.empty_like(a) # Determine device constraints and block/grid sizes: max_threads_per_block, max_block_dim, max_grid_dim = \ get_dev_attrs(dev) block_dim, grid_dim = select_block_grid_sizes(dev, a.shape) # Perform element-wise operation on input matrix: func_mod_template = Template(""" __global__ void func(${float} *a, ${float} *b, unsigned int N) { unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+ blockIdx.x*${max_threads_per_block}+threadIdx.x; if (idx < N) b[idx] = 5*a[idx]; } """) func_mod = SourceModule(func_mod_template.substitute(float='float', max_threads_per_block=str(max_threads_per_block),