def gamma_exp_kernel(self, N, dtype):
    """Build (and cache) a CUDA kernel that fills a gamma-exponential matrix.

    The generated ``func`` kernel writes, for every pair of indices
    ``(a, b)`` of an ``N x N`` matrix ``x``::

        x[a, b] = exp(-|(y[a] - y[b]) / l[0]| ** l[1])

    i.e. the gamma-exponential covariance with length-scale ``l[0]`` and
    exponent ``l[1]``.

    Parameters
    ----------
    N : int
        Side length of the square output matrix.  It is substituted into
        the kernel source as the template parameter ``M``, so the compiled
        kernel is only valid for this exact size.
    dtype : str
        C scalar type name (e.g. ``'float'`` or ``'double'``) substituted
        into the kernel source.

    Returns
    -------
    pycuda.driver.Function
        The compiled ``func`` kernel.

    Notes
    -----
    Bug fix: the cache previously keyed only on ``dtype``.  Because ``N``
    is baked into the compiled source (as ``M``), a later call with a
    different ``N`` returned a kernel sized for the *old* matrix.  The
    cache now also keys on ``N`` (``getattr`` tolerates the attribute not
    having been initialized yet).
    """
    if (self._gamma_exp_kernel is None
            or dtype != self._gamma_exp_kernel_dtype
            or N != getattr(self, '_gamma_exp_kernel_N', None)):
        func_mod_template = Template("""
        // Macro for converting subscripts to linear index:
        #define INDEX(a, b) a*${M}+b

        __global__ void func(${dtype} *x, ${dtype} *y, ${dtype} *l,
                             unsigned int N) {
            // Obtain the linear index corresponding to the current thread:
            unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                               blockIdx.x*${max_threads_per_block}+threadIdx.x;

            // Convert the linear index to subscripts:
            unsigned int a = idx/${M};
            unsigned int b = idx%${M};

            ${dtype} Pvalue = 0.0;
            ${dtype} darg = 0.0;
            ${dtype} abs_arg = 0.0;

            // Use the subscripts to access the array:
            if (idx < N) {
                darg = (y[a] - y[b]) / l[0];
                abs_arg = fabs(darg);
                Pvalue = exp(-1.0*pow(abs_arg, l[1]));
                x[INDEX(a,b)] = Pvalue;
            }
        }
        """)
        # Query device limits so the 2-D grid linearization in the kernel
        # matches what select_block_grid_sizes would produce.
        max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(  # NOQA
            self.device)
        max_blocks_per_grid = max(max_grid_dim)
        func_mod = SourceModule(
            func_mod_template.substitute(
                max_threads_per_block=max_threads_per_block,
                max_blocks_per_grid=max_blocks_per_grid,
                M=N,
                dtype=dtype))
        self._gamma_exp_kernel = func_mod.get_function('func')
        self._gamma_exp_kernel_dtype = dtype
        self._gamma_exp_kernel_N = N
    return self._gamma_exp_kernel
def pairwised(self, N, dtype):
    """Build (and cache) a CUDA kernel computing pairwise absolute distances.

    The generated ``func`` kernel writes, for every pair of indices
    ``(a, b)`` of an ``N x N`` matrix ``x``::

        x[a, b] = |y[a] - y[b]|

    Parameters
    ----------
    N : int
        Side length of the square output matrix.  It is substituted into
        the kernel source as the template parameter ``M``, so the compiled
        kernel is only valid for this exact size.
    dtype : str
        C scalar type name (e.g. ``'float'`` or ``'double'``) substituted
        into the kernel source.

    Returns
    -------
    pycuda.driver.Function
        The compiled ``func`` kernel.

    Notes
    -----
    Bug fix: the cache previously keyed only on ``dtype``.  Because ``N``
    is baked into the compiled source (as ``M``), a later call with a
    different ``N`` returned a kernel sized for the *old* matrix.  The
    cache now also keys on ``N`` (``getattr`` tolerates the attribute not
    having been initialized yet).
    """
    if (self._pairwised is None
            or dtype != self._pairwised_dtype
            or N != getattr(self, '_pairwised_N', None)):
        func_mod_template = Template("""
        // Macro for converting subscripts to linear index:
        #define INDEX(a, b) a*${M}+b

        __global__ void func(${dtype} *x, ${dtype} *y, unsigned int N) {
            // Obtain the linear index corresponding to the current thread:
            unsigned int idx = blockIdx.y*${max_threads_per_block}*${max_blocks_per_grid}+
                               blockIdx.x*${max_threads_per_block}+threadIdx.x;

            // Convert the linear index to subscripts:
            unsigned int a = idx/${M};
            unsigned int b = idx%${M};

            ${dtype} Pvalue = 0.0;

            // Use the subscripts to access the array:
            if (idx < N) {
                Pvalue = y[a] - y[b];
                if (Pvalue<0) {
                    Pvalue *= -1.0;
                }
                x[INDEX(a,b)] = Pvalue;
            }
        }
        """)
        # Query device limits so the 2-D grid linearization in the kernel
        # matches what select_block_grid_sizes would produce.
        max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(  # NOQA
            self.device)
        max_blocks_per_grid = max(max_grid_dim)
        func_mod = SourceModule(
            func_mod_template.substitute(
                max_threads_per_block=max_threads_per_block,
                max_blocks_per_grid=max_blocks_per_grid,
                M=N,
                dtype=dtype))
        self._pairwised = func_mod.get_function('func')
        self._pairwised_dtype = dtype
        self._pairwised_N = N
    return self._pairwised
// Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}*${D}); unsigned int b = (idx%(${B}*${C}*${D}))/(${C}*${D}); unsigned int c = ((idx%(${B}*${C}*${D}))%(${C}*${D}))/${D}; unsigned int d = ((idx%(${B}*${C}*${D}))%(${C}*${D}))%${D}; // Use the subscripts to access the array: if (idx < N) { if (c == 0) x[INDEX(a,b,c,d)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs( pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C, D=D)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, :, 0, :] = 100 print 'Success status: ', np.allclose(x_np, x_gpu.get())
blockIdx.x*${max_threads_per_block}+threadIdx.x; // Convert the linear index to subscripts: unsigned int a = idx/(${B}*${C}); unsigned int b = (idx%(${B}*${C}))/${C}; unsigned int c = (idx%(${B}*${C}))%${C}; // Use the subscripts to access the array: if (idx < N) { if (b == 0) x[INDEX(a,b,c)] = 100; } } """) max_threads_per_block, max_block_dim, max_grid_dim = misc.get_dev_attrs(pycuda.autoinit.device) block_dim, grid_dim = misc.select_block_grid_sizes(pycuda.autoinit.device, x.shape) max_blocks_per_grid = max(max_grid_dim) func_mod = \ SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block, max_blocks_per_grid=max_blocks_per_grid, A=A, B=B, C=C)) func = func_mod.get_function('func') x_gpu = gpuarray.to_gpu(x) func(x_gpu.gpudata, np.uint32(x_gpu.size), block=block_dim, grid=grid_dim) x_np = x.copy() x_np[:, 0, :] = 100