def cumsum(x_gpu):
    """
    Cumulative sum.

    Return the cumulative sum of the elements in the specified array.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array. It is left unmodified.

    Returns
    -------
    c_gpu : pycuda.gpuarray.GPUArray
        Newly allocated output array containing cumulative sum of `x_gpu`.

    Notes
    -----
    Higher dimensional arrays are implicitly flattened row-wise by this
    function.

    One inclusive-scan kernel is built per dtype and memoized in
    `cumsum.cache`, so only the first call for a given dtype pays the
    kernel construction cost.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import misc
    >>> x_gpu = gpuarray.to_gpu(np.random.rand(5).astype(np.float32))
    >>> c_gpu = misc.cumsum(x_gpu)
    >>> np.allclose(c_gpu.get(), np.cumsum(x_gpu.get()))
    True

    """

    dtype = x_gpu.dtype
    try:
        func = cumsum.cache[dtype]
    except KeyError:
        # The preamble pulls in PyCUDA's complex-number header; presumably
        # this is what lets the 'a+b' scan expression compile for complex
        # dtypes as well as real ones.
        func = cumsum.cache[dtype] = scan.InclusiveScanKernel(
            dtype, 'a+b', preamble='#include <pycuda-complex.hpp>')

    # When called without an explicit output array the scan kernel writes its
    # result into the array it is given.  Scanning a copy keeps the caller's
    # input intact, so the returned array is distinct from `x_gpu`.
    return func(x_gpu.copy())
a_gpu, np.int32(size), block=(BLOCK_S, 1, 1), grid=((size - 1) // BLOCK_S + 1, 1, 1)) time_inefficient.append(time.time() - start) start = time.time() scan_efficient(b_gpu2, a_gpu2, np.int32(size), block=(BLOCK_S, 1, 1), grid=((size - 1) // BLOCK_S + 1, 1, 1)) time_efficient.append(time.time() - start) knl_gpu = gpuarray.to_gpu(a) scan_knl = scan.InclusiveScanKernel(np.float32, "a + b", "0") start = time.time() scan_knl(knl_gpu) time_scan_knl.append(time.time() - start) ''' print a print "\n" print b_gpu.get() print "\n" print scan_cpu(a, size) ''' MAKE_PLOT = True if MAKE_PLOT: import matplotlib as mpl mpl.use('agg')