# @copyright: https://gitee.com/weili_yzzcq/C-and-C-plus-plus/CUDA_CPlusPlus/
# @copyright: https://github.com/2694048168/C-and-C-plus-plus/CUDA_CPlusPlus/
# @function: PyCUDA advanced kernels -- the scan (inclusive prefix-sum) kernel.

import numpy
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel

n = 10

# CUDA events bracket the GPU work so we can time it.
start = drv.Event()
end = drv.Event()
start.record()

# BUG FIX: the scan kernel's dtype must match the input array's dtype.
# The original built the kernel for numpy.uint32 but fed it int32 data.
kernel = InclusiveScanKernel(numpy.int32, "a+b")

h_a = numpy.random.randint(1, 10, n).astype(numpy.int32)
d_a = gpuarray.to_gpu(h_a)
kernel(d_a)  # inclusive prefix sum, computed in place on the device

end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3  # time_till() returns milliseconds

# Verify against the CPU reference before reporting.
assert (d_a.get() == numpy.cumsum(h_a, axis=0)).all()
print("The input data:")
print(h_a)
print("The computed cumulative sum using Scan:")
print(d_a.get())
# FIX: 'secs' was computed but never reported in the original.
print("GPU scan time: %.6f s" % secs)
# Demo: inclusive prefix-sum (scan) on the GPU, checked against NumPy on the CPU.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- sets up the CUDA context
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel

host_vals = np.array([1, 2, 3, 4], dtype=np.int32)
device_vals = gpuarray.to_gpu(host_vals)

# The binary expression "a+b" turns the scan into a running sum.
prefix_sum = InclusiveScanKernel(np.int32, "a+b")

print(prefix_sum(device_vals).get())  # GPU result
print(np.cumsum(host_vals))           # CPU reference
# Demo: running-maximum scan on the GPU; the last element is the global max.
import numpy as np
import pycuda.autoinit  # noqa: F401 -- sets up the CUDA context
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel

host_vals = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
device_vals = gpuarray.to_gpu(host_vals)

# A ternary scan operator yields a running maximum instead of a running sum.
running_max = InclusiveScanKernel(np.int32, "a > b ? a : b")

print(running_max(device_vals).get()[-1])  # last scan element == overall max
print(np.max(host_vals))                   # CPU reference
def get_cumsum_kernel(dtype):
    """Build an inclusive-scan kernel computing a cumulative sum for *dtype*.

    NOTE(review): relies on ``InclusiveScanKernel`` and the
    ``include_complex`` preamble being defined elsewhere in this module --
    confirm both are in scope at the call site.
    """
    scan_expr = "a+b"
    return InclusiveScanKernel(dtype, scan_expr, preamble=include_complex)
# Inclusive scan over a large, non-power-of-two array, verified on the host.
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda  # noqa: F401
import pycuda.autoinit  # noqa: F401 -- sets up the CUDA context
import numpy as np
from pycuda.scan import InclusiveScanKernel

scan_sum = InclusiveScanKernel(np.int32, "a+b")

# Deliberately awkward length (not a power of two) to exercise the kernel.
length = 2 ** 20 - 2 ** 18 + 5
host_data = np.random.randint(0, 10, length).astype(np.int32)
dev_data = gpuarray.to_gpu(host_data)

scan_sum(dev_data)  # the scan runs in place on the device buffer

# Host-side check against NumPy's cumulative sum.
assert (dev_data.get() == np.cumsum(host_data, axis=0)).all()
"""
Created on Mon May 17 12:54:56 2021

@author: aishw
"""
import numpy as np
import pycuda.autoinit  # noqa: F401 -- sets up the CUDA context
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel
# BUG FIX: ReductionKernel lives in pycuda.reduction, not pycuda.scan;
# the original `from pycuda.scan import ReductionKernel` raises ImportError.
from pycuda.reduction import ReductionKernel  # noqa: F401

# --- Cumulative summation -------------------------------------------------
seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
# InclusiveScanKernel needs a dtype and a binary scan expression.
sum_gpu_ker = InclusiveScanKernel(np.int32, "a+b")
sum_cpu = sum_gpu_ker(seq_gpu).get()
print(f'Sum using GPU: {sum_cpu}')
print(f'Sum using np: {np.cumsum(seq)}')

# --- Find max in a list ---------------------------------------------------
seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
# Running-max scan: the final element of the scan is the overall maximum.
max_gpu_ker = InclusiveScanKernel(np.int32, "a > b ? a : b")
print(f'MAX using GPU: {max_gpu_ker(seq_gpu).get()[-1]}')
print(f'Max using np: {np.max(seq)}')

# A ReductionKernel acts like an ElementwiseKernel followed by a parallel
# scan kernel. Example -- dot product of two vectors (typos in the original
# commented snippet fixed: ',' after neutral="0", and 'ker', not 'ket'):
# dot_product_ker = ReductionKernel(np.float32, neutral="0",
#                                   reduce_expr="a+b",
#                                   map_expr="vec1[i]*vec2[i]",
#                                   arguments="float *vec1, float *vec2")