Example no. 1
# @copyright: https://gitee.com/weili_yzzcq/C-and-C-plus-plus/CUDA_CPlusPlus/
# @copyright: https://github.com/2694048168/C-and-C-plus-plus/CUDA_CPlusPlus/
# @function: PyCUDA advanced kernel functions: the scan kernel.

import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import numpy
from pycuda.scan import InclusiveScanKernel
import pycuda.autoinit

n = 10
start = drv.Event()
end = drv.Event()
start.record()

# Use int32 so the kernel's dtype matches the input array created below.
kernel = InclusiveScanKernel(numpy.int32, "a+b")

h_a = numpy.random.randint(1, 10, n).astype(numpy.int32)
d_a = gpuarray.to_gpu(h_a)
kernel(d_a)

end.record()
end.synchronize()
secs = start.time_till(end) * 1e-3
print("GPU scan time: %fs" % secs)

assert (d_a.get() == numpy.cumsum(h_a, axis=0)).all()
print("The input data:")
print(h_a)

print("The computed cumulative sum using Scan:")
print(d_a.get())
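PyCUDA also ships an exclusive variant of the scan. Below is a minimal sketch using pycuda.scan.ExclusiveScanKernel, which additionally takes a neutral element; the sample data and variable names are illustrative:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.scan import ExclusiveScanKernel

# The exclusive scan needs a neutral element ("0" for addition) and, at
# each position, writes the combination of all *preceding* elements.
seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
excl_sum = ExclusiveScanKernel(np.int32, "a+b", "0")
print(excl_sum(seq_gpu).get())  # expected: [0 1 3 6]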
Example no. 2

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel

seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
sum_gpu = InclusiveScanKernel(np.int32, "a+b")
print(sum_gpu(seq_gpu).get())
print(np.cumsum(seq))
Example no. 3

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel

seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu = InclusiveScanKernel(np.int32, "a > b ? a : b")
print(max_gpu(seq_gpu).get()[-1])
print(np.max(seq))
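Since the scan expression only has to be an associative binary operation, the inclusive scan above computes a running maximum; its last element is therefore the maximum of the whole array, matching np.max.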
Example no. 4
from pycuda.scan import InclusiveScanKernel

def get_cumsum_kernel(dtype):
    # "include_complex" is a preamble string defined elsewhere in the project.
    return InclusiveScanKernel(dtype, "a+b", preamble=include_complex)
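A minimal usage sketch for this factory, substituting an empty string for the missing include_complex preamble (a hypothetical stand-in) so the snippet can run on its own:

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray

include_complex = ""  # hypothetical stand-in for the project's real preamble

knl = get_cumsum_kernel(np.float32)  # factory defined above
data = gpuarray.to_gpu(np.ones(8, dtype=np.float32))
print(knl(data).get())  # expected: [1. 2. 3. 4. 5. 6. 7. 8.]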
Example no. 5
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
from pycuda.scan import InclusiveScanKernel

knl = InclusiveScanKernel(np.int32, "a+b")

# Deliberately pick an awkward, non-power-of-two size to exercise the scan.
n = 2**20 - 2**18 + 5
host_data = np.random.randint(0, 10, n).astype(np.int32)
dev_data = gpuarray.to_gpu(host_data)

knl(dev_data)
assert (dev_data.get() == np.cumsum(host_data, axis=0)).all()
Example no. 6

"""
Created on Mon May 17 12:54:56 2021

@author: aishw
"""

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel
from pycuda.reduction import ReductionKernel

# Cumulative Summation
seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
# InclusiveScanKernel takes a dtype and a binary scan expression (a C snippet)
sum_gpu_ker = InclusiveScanKernel(np.int32, "a+b")
cumsum_gpu = sum_gpu_ker(seq_gpu).get()
print(f'Cumulative sum using GPU: {cumsum_gpu}')
print(f'Cumulative sum using np: {np.cumsum(seq)}')

# Find max in a list
seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu_ker = InclusiveScanKernel(np.int32, "a > b ? a : b")
print(f'MAX using GPU: {max_gpu_ker(seq_gpu).get()[-1]}')
print(f'Max using np: {np.max(seq)}')

# ReductionKernel acts like an ElementwiseKernel followed by a parallel scan kernel.
# Dot product of two vectors:
dot_product_ker = ReductionKernel(np.float32, neutral="0",
                                  reduce_expr="a+b", map_expr="vec1[i]*vec2[i]",
                                  arguments="float *vec1, float *vec2")
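A minimal usage sketch for the kernel above (the sample vectors are illustrative; the dot product of [1, 2, 3] and [4, 5, 6] is 32):

vec1 = gpuarray.to_gpu(np.array([1, 2, 3], dtype=np.float32))
vec2 = gpuarray.to_gpu(np.array([4, 5, 6], dtype=np.float32))
print(f'Dot product using GPU: {dot_product_ker(vec1, vec2).get()}')
print(f'Dot product using np: {np.dot([1, 2, 3], [4, 5, 6])}')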