def init_cuda(): global COMPILED_MODULE cuda.init() context = tools.make_default_context() device = context.get_device() atexit.register(context.pop) COMPILED_MODULE = buffers_cu.percentile_buffer()
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_fft('test_fft_float32_to_complex64_1d')) s.addTest(test_fft('test_fft_float32_to_complex64_2d')) s.addTest(test_fft('test_batch_fft_float32_to_complex64_1d')) s.addTest(test_fft('test_batch_fft_float32_to_complex64_2d')) s.addTest(test_fft('test_ifft_complex64_to_float32_1d')) s.addTest(test_fft('test_ifft_complex64_to_float32_2d')) s.addTest(test_fft('test_batch_ifft_complex64_to_float32_1d')) s.addTest(test_fft('test_batch_ifft_complex64_to_float32_2d')) s.addTest(test_fft('test_multiple_streams')) s.addTest(test_fft('test_work_area')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_fft('test_fft_float64_to_complex128_1d')) s.addTest(test_fft('test_fft_float64_to_complex128_2d')) s.addTest(test_fft('test_batch_fft_float64_to_complex128_1d')) s.addTest(test_fft('test_batch_fft_float64_to_complex128_2d')) s.addTest(test_fft('test_ifft_complex128_to_float64_1d')) s.addTest(test_fft('test_ifft_complex128_to_float64_2d')) s.addTest(test_fft('test_batch_ifft_complex128_to_float64_1d')) s.addTest(test_fft('test_batch_ifft_complex128_to_float64_2d')) return s
def gpuErrorEvaluate(actual, expected): context = make_default_context() device = context.get_device() p=gpuarray.to_gpu(numpy.array(actual))- gpuarray.to_gpu(numpy.array(expected)) res= 1.0 - gpuarray.dot(p,p) context.pop() return res
def init_cuda(self): # pycuda alloc drv.init() global context from pycuda.tools import make_default_context context = make_default_context() mod = SourceModule(r""" #include <stdio.h> #include <math.h> #define PI 3.14159265 __global__ void detect(int data[][2], int* rad, int* range, unsigned char *frame, int *pcol) { for(int r = 0; r < rad[0]; r++) { const int thetaIdx = threadIdx.x; const int theta = thetaIdx + range[0]; int x = rad[0] + int(r * cos(theta * PI/180)) - 1; int y = rad[0] - int(r * sin(theta * PI/180)) - 1; if (data[thetaIdx][0] == 0) data[thetaIdx][1] = r; if (*(frame + y * *pcol + x) != 0) data[thetaIdx][0] = 1; } } """) self.path = mod.get_function("detect") print("PLANNER: pycuda alloc end") # pycuda alloc end time.sleep(2)
def init_context(self, device_id=None): if device_id is None: context = make_default_context() self._context = context else: context = cuda.Device(device_id).make_context() self._context = context
def init_context(self, device_id=None): if device_id is None: context = make_default_context() self._context = context else: self._context = driver.Device(device_id).make_context() self._context.push()
def setUpClass(cls): np.random.seed(0) cls.ctx = make_default_context() cls.handle = cublasxt.cublasXtCreate() cls.nbDevices = 1 cls.deviceId = np.array([0], np.int32) cublasxt.cublasXtDeviceSelect(cls.handle, cls.nbDevices, cls.deviceId)
def init(): # MAGIC MAGIC import pycuda.driver as cuda cuda.init() from pycuda.tools import make_default_context context = make_default_context() device = context.get_device() import atexit atexit.register(context.detach)
def __init__(self, coord_format=CoordFormat.nx, scalar=False, threadsperblock=64, types={}): super().__init__(coord_format, scalar, threadsperblock, types) self._context = make_default_context() self._device = self._context.get_device() mod = SourceModule(self.source) self.update_v = mod.get_function("update_v") self.update_p = mod.get_function("update_p") self.rebound = mod.get_function("rebound") self.update = mod.get_function("update")
def examine_subsets_cuda(task, A, N, K, threads_per_block): # Unpack the task tuple subset_start, subset_end = task # Create CUDA context cuda.init() ctx = make_default_context() KFAC = math.factorial(K) # Keep track of the stride stride = subset_end - subset_start # Copy A to the GPU device_A = cuda.to_device(A.astype(np.int16)) # Create results array results = np.zeros(stride, dtype=np.int32) # Copy results array device_results = cuda.to_device(results) # Number of CUDA blocks cuda_blocks = ((stride + threads_per_block - 1) / threads_per_block) # Compile CUDA kernel mod = SourceModule( open( "/Users/rsearles/Documents/Repositories/cisc849-16s/project/src/Spike_Neural_Nets/subsets.cu", "r").read()) kernel = mod.get_function("examine_subsets") # Run kernel kernel(np.int32(N), np.int32(K), np.int64(subset_start), np.int64(subset_end), np.int32(KFAC), device_A, device_results, block=(cuda_blocks, 1, 1), grid=(threads_per_block, 1)) # Copy results back results = cuda.from_device_like(device_results, results) # Free GPU memory device_results.free() device_A.free() # Pop CUDA context (driver yells otherwise) ctx.pop() # Return the counts counts = Counter(results) return counts
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_misc('test_maxabs_float32')) s.addTest(test_misc('test_maxabs_complex64')) s.addTest(test_misc('test_cumsum_float32')) s.addTest(test_misc('test_cumsum_complex64')) s.addTest(test_misc('test_diff_float32')) s.addTest(test_misc('test_diff_complex64')) s.addTest(test_misc('test_get_by_index_float32')) s.addTest(test_misc('test_set_by_index_dest_float32')) s.addTest(test_misc('test_set_by_index_src_float32')) s.addTest(test_misc('test_binaryop_2d_int32')) s.addTest(test_misc('test_binaryop_2d_float32')) s.addTest(test_misc('test_binaryop_2d_complex64')) s.addTest(test_misc('test_binaryop_matvec_int32')) s.addTest(test_misc('test_binaryop_matvec_float32')) s.addTest(test_misc('test_binaryop_matvec_complex64')) s.addTest(test_misc('test_sum_float32')) s.addTest(test_misc('test_sum_complex64')) s.addTest(test_misc('test_mean_float32')) s.addTest(test_misc('test_mean_complex64')) s.addTest(test_misc('test_var_float32')) s.addTest(test_misc('test_var_complex64')) s.addTest(test_misc('test_std_float32')) s.addTest(test_misc('test_std_complex64')) s.addTest(test_misc('test_minmax_float32')) s.addTest(test_misc('test_argminmax_float32')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_misc('test_maxabs_float64')) s.addTest(test_misc('test_maxabs_complex128')) s.addTest(test_misc('test_cumsum_float64')) s.addTest(test_misc('test_cumsum_complex128')) s.addTest(test_misc('test_diff_float64')) s.addTest(test_misc('test_diff_complex128')) s.addTest(test_misc('test_get_by_index_float64')) s.addTest(test_misc('test_set_by_index_dest_float64')) s.addTest(test_misc('test_set_by_index_src_float64')) s.addTest(test_misc('test_sum_float64')) s.addTest(test_misc('test_sum_complex128')) s.addTest(test_misc('test_mean_float64')) s.addTest(test_misc('test_mean_complex128')) s.addTest(test_misc('test_binaryop_2d_float64')) s.addTest(test_misc('test_binaryop_2d_complex128')) s.addTest(test_misc('test_binaryop_matvec_float64')) s.addTest(test_misc('test_binaryop_matvec_complex128')) s.addTest(test_misc('test_var_float64')) s.addTest(test_misc('test_var_complex128')) s.addTest(test_misc('test_std_float64')) s.addTest(test_misc('test_std_complex128')) s.addTest(test_misc('test_minmax_float64')) s.addTest(test_misc('test_argminmax_float64')) return s
def init(): # MAGIC MAGIC from pycuda import driver driver.init() from pycuda.tools import make_default_context context = make_default_context() device = context.get_device() import atexit atexit.register(context.detach) return context
def init(device=None): """Initializes CUDA global state. Chainer maintains CUDA context, CUBLAS context, random number generator and device memory pool for each GPU device and for each process (the main process or a process forked by :mod:`multiprocessing`) as global states. When called for the first time on the process, this function initializes these global states. .. warning:: This function also initializes PyCUDA and scikits.cuda. Since these packages do not support forking after initialization, do not call this function before forking the process. This function also registers :func:`shutdown` to :mod:`atexit` slot. It also initializes random number generator. User can set fixed seed with ``CHAINER_SEED`` environment variable. Args: device (``int`` or :class:`~pycuda.driver.Device` or ``None``): Device ID to initialize on. """ global _contexts, _cublas_handles, _generators, _pid, _pools if not available: global _import_error raise RuntimeError( 'CUDA environment is not correctly set up. ' + 'The original import error said: ' + str(_import_error)) pid = os.getpid() if _pid == pid: # already initialized return drv.init() if device is None: # use default device context = cutools.make_default_context() device = Context.get_device() else: device = Device(device) context = device.make_context() _contexts = {device: context} _generators = {} _pools = {} _cublas_handles = {} cumisc.init(mem_alloc) seed(os.environ.get('CHAINER_SEED')) _pid = pid # mark as initialized atexit.register(shutdown)
def setup_pyfft(device_id=0): global CONTEXTS if device_id not in CONTEXTS: print('initializing GPU device', device_id) cuda.init() if device_id is None: CONTEXTS[0] = make_default_context() else: dev = cuda.Device(device_id) CONTEXTS[device_id] = dev.make_context() CONTEXTS[device_id].pop() return CONTEXTS[device_id]
def suite(): """test suite """ context = make_default_context() device = context.get_device() context.pop() testsuite = TestSuite() testsuite.addTest(test_magma('test_magma_geev_novecs')) testsuite.addTest(test_magma('test_symmetric_eig_float32')) testsuite.addTest(test_magma('test_symmetric_eig_float64')) return testsuite
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_cublasxt('test_cublasXtSgemm')) s.addTest(test_cublasxt('test_cublasXtCgemm')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_cublasxt('test_cublasXtDgemm')) s.addTest(test_cublasxt('test_cublasXtZgemm')) return s
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_special('test_sici_float32')) s.addTest(test_special('test_exp1_complex64')) s.addTest(test_special('test_expi_complex64')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_special('test_sici_float64')) s.addTest(test_special('test_exp1_complex128')) s.addTest(test_special('test_expi_complex128')) return s
def prep(image, psf): datadim1 = image.shape[0] datadim2 = image.shape[1] if datadim1 != datadim2: ddim = max(datadim1, datadim2) s = numpy.binary_repr(ddim - 1) s = s[:-1] + '0' # Guarantee that padding is used else: ddim = datadim1 s = numpy.binary_repr(ddim - 1) if s.find('0') > 0: size = 2**len(s) boxd = numpy.zeros((size, size)) r = size - datadim1 r1 = r2 = r / 2 if r % 2 == 1: r1 = r // 2 + 1 c = size - datadim2 c1 = c2 = c // 2 if c % 2 == 1: c1 = c // 2 + 1 boxdslice = (slice(r1, datadim1 + r1), slice(c1, datadim2 + c1)) boxd[boxdslice] = image else: boxd = image boxp = boxd * 0. if boxd.shape[0] == psf.shape[0]: boxp = psf.copy() else: r = boxp.shape[0] - psf.shape[0] r1 = r // 2 + 1 c = boxp.shape[1] - psf.shape[1] c1 = c // 2 + 1 boxpslice = (slice(r1, psf.shape[0] + r1), slice(c1, psf.shape[1] + c1)) boxp[boxpslice] = psf.copy() from pyfft.cuda import Plan import pycuda.driver as cuda from pycuda.tools import make_default_context import pycuda.gpuarray as gpuarray cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan(boxp.shape, stream=stream) gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64)) plan.execute(gdata) return gdata, boxd.shape, boxdslice, plan, stream
def filter(self): import pycuda.gpuarray as gpuarray import skcuda.fft as cu_fft import skcuda.linalg as linalg import pycuda.driver as cuda from pycuda.tools import make_default_context cuda.init() context = make_default_context() device = context.get_device() signal = self.series[0] window = self.series[1] linalg.init() nfft = determine_size(len(signal) + len(window) - 1) # Move data to GPU sig_zero_pad = np.zeros(nfft, dtype=self.precision['float']) win_zero_pad = np.zeros(nfft, dtype=self.precision['float']) sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=self.precision['float']) win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=self.precision['float']) sig_zero_pad[0:len(signal)] = signal win_zero_pad[0:len(window)] = window sig_gpu.set(sig_zero_pad) win_gpu.set(win_zero_pad) # Plan forwards sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex']) win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex']) sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, self.precision['float'], self.precision['complex']) win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, self.precision['float'], self.precision['complex']) cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward) cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward) # Convolve out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True) linalg.scale(2.0, out_fft) # Plan inverse out_gpu = gpuarray.zeros_like(out_fft) plan_inverse = cu_fft.Plan(out_fft.shape, self.precision['complex'], self.precision['complex']) cu_fft.ifft(out_fft, out_gpu, plan_inverse, True) out_np = np.zeros(len(out_gpu), self.precision['complex']) out_gpu.get(out_np) context.pop() return out_np
def prep(image,psf): datadim1 = image.shape[0] datadim2 = image.shape[1] if datadim1!=datadim2: ddim = max(datadim1,datadim2) s = numpy.binary_repr(ddim-1) s = s[:-1]+'0' # Guarantee that padding is used else: ddim = datadim1 s = numpy.binary_repr(ddim-1) if s.find('0')>0: size = 2**len(s) boxd = numpy.zeros((size,size)) r = size-datadim1 r1 = r2 = r/2 if r%2==1: r1 = r/2+1 c = size-datadim2 c1 = c2 = c/2 if c%2==1: c1 = c/2+1 boxdslice = (slice(r1,datadim1+r1),slice(c1,datadim2+c1)) boxd[boxdslice] = image else: boxd = image boxp = boxd*0. if boxd.shape[0]==psf.shape[0]: boxp = psf.copy() else: r = boxp.shape[0]-psf.shape[0] r1 = r/2+1 c = boxp.shape[1]-psf.shape[1] c1 = c/2+1 boxpslice = (slice(r1,psf.shape[0]+r1),slice(c1,psf.shape[1]+c1)) boxp[boxpslice] = psf.copy() from pyfft.cuda import Plan import pycuda.driver as cuda from pycuda.tools import make_default_context import pycuda.gpuarray as gpuarray cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan(boxp.shape,stream=stream) gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64)) plan.execute(gdata) return gdata,boxd.shape,boxdslice,plan,stream
def _setup(self): ctx = None try: import pycuda.driver as pd import pycuda.tools as pt pd.init() ctx = pt.make_default_context() except: pass try: super()._setup() finally: if ctx is not None: ctx.detach()
def cuda_init(shape): try: import pyfft, pycuda except ImportError: out( 2, 'No CUDA bindings found (pyfft). Using regular convolution modules' ) return None, None out(2, 'CUDA bindings found!') from pycuda.tools import make_default_context # import pycuda.gpuarray as gpuarray import pycuda.driver as cuda cuda.init() #@UndefinedVariable context = make_default_context() return context, get_pyfft_plan(shape)
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_integrate('test_trapz_float32')) s.addTest(test_integrate('test_trapz_complex64')) s.addTest(test_integrate('test_trapz2d_float32')) s.addTest(test_integrate('test_trapz2d_complex64')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_integrate('test_trapz_float64')) s.addTest(test_integrate('test_trapz_complex128')) s.addTest(test_integrate('test_trapz2d_float64')) s.addTest(test_integrate('test_trapz2d_complex128')) return s
def Add_gpu(x, y): cuda.init() ctx = make_default_context() device = ctx.get_device() a = numpy.random.randn(4, 4) a = a.astype(numpy.float32) a_gpu = cuda.mem_alloc(a.nbytes) cuda.memcpy_htod(a_gpu, a) ''' x_gpu = cuda.mem_alloc(sys.getsizeof(x)) y_gpu = cuda.mem_alloc(sys.getsizeof(y)) z_gpu = cuda.mem_alloc(sys.getsizeof(x)) cuda.memcpy_htod(x_gpu, x) cuda.memcpy_htod(y_gpu, y) mod = SourceModule(""" __global__ void add(int *x_addr, int *y_addr, int* z_addr) { // naive computation sum = *x_addr + *y_addr; *z_addr = sum; } """) func = mod.get_function("add") func(x_gpu, y_gpu, z_gpu, block=(4,4,1)) ''' mod = SourceModule(""" __global__ void doublify(float *a) { int idx = threadIdx.x + threadIdx.y*4; a[idx] *= 2; } """) func = mod.get_function("doublify") func(a_gpu, block=(4, 4, 1)) #cuda.memcpy_dtoh(x, z_gpu) a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) ctx.pop() return 2 * (x + y)
def is_memory_enough(a): try: rest, total = driver.mem_get_info() except driver.LogicError: # child thread cannot use context from the main thread... # the following does not work yet from pycuda import tools import skcuda driver.init() context = tools.make_default_context() # try to make as new context, but cannot deactivate the old context stack device = context.get_device() skcuda.misc.init_context(device) rest, total = driver.mem_get_info() if (sys.getsizeof(a) * 2) < rest: return True
def count_triangles_cublas(adjacency_list): driver.init() context = tools.make_default_context() h = cublas.cublasCreate() n = len(adjacency_list) A = np.zeros([n,n], dtype=np.float64) for row_idx, neighbor_list in adjacency_list: A[row_idx, neighbor_list] = 1.0 a_gpu = gpuarray.to_gpu(A) b_gpu = gpuarray.empty(A.shape, A.dtype) c_gpu = gpuarray.empty(A.shape, A.dtype) one = np.float64(1.0) zero = np.float64(0.0) cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, a_gpu.gpudata, n, zero, b_gpu.gpudata, n) cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, b_gpu.gpudata, n, zero, c_gpu.gpudata, n) trace = linalg.trace(c_gpu, h) cublas.cublasDestroy(h) context.detach() return int(trace/6)
def __init__(self, devnum=None): # type: (int) -> None env = os.environ.get("SAS_OPENCL", "").lower() if devnum is None and env.startswith("cuda:"): devnum = int(env[5:]) # Set the global context to the particular device number if one is # given, otherwise use the default context. Perhaps this will be set # by an environment variable within autoinit. if devnum is not None: self.context = cuda.Device(devnum).make_context() else: self.context = make_default_context() ## Byte boundary for data alignment. #self.data_boundary = max(d.min_data_type_align_size # for d in self.context.devices) # Cache for compiled programs, and for items in context. self.compiled = {}
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_integrate('test_trapz_float32')) s.addTest(test_integrate('test_trapz_complex64')) s.addTest(test_integrate('test_simps_float32')) s.addTest(test_integrate('test_simps_complex64')) s.addTest(test_integrate('test_trapz2d_float32')) s.addTest(test_integrate('test_trapz2d_complex64')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_integrate('test_trapz_float64')) s.addTest(test_integrate('test_trapz_complex128')) s.addTest(test_integrate('test_simps_float64')) s.addTest(test_integrate('test_simps_complex128')) s.addTest(test_integrate('test_trapz2d_float64')) s.addTest(test_integrate('test_trapz2d_complex128')) return s
def init(device=-1): global CONTEXT if CONTEXT is not None: return # MAGIC MAGIC from pycuda import driver driver.init() if device == -1: from pycuda.tools import make_default_context CONTEXT = make_default_context() device = CONTEXT.get_device() else: device = driver.Device(device % driver.Device.count()) CONTEXT = device.make_context() print 'Starting up using device: %s:%s' % (device.name(), device.pci_bus_id()) import atexit atexit.register(CONTEXT.detach) return CONTEXT
def init(device=-1): global CONTEXT if CONTEXT is not None: return # MAGIC MAGIC from pycuda import driver driver.init() if device == -1: from pycuda.tools import make_default_context CONTEXT = make_default_context() device = CONTEXT.get_device() else: device = driver.Device(device % driver.Device.count()) CONTEXT = device.make_context() #print 'Starting up using device: %s:%s' % (device.name(), device.pci_bus_id()) import atexit atexit.register(CONTEXT.detach) return CONTEXT
def solve(self,a,b,max): #context and kernel initialisation util.log.info("Initialising CUDA device") self.ctx = ctools.make_default_context() self.ctx.push() self.kernels=SourceModule(self.r_kernels) #Memory d_a=cuda.mem_alloc(a.astype(self.type).nbytes) d_b=cuda.mem_alloc(b.astype(self.type).nbytes) cuda.memcpy_htod(d_a,a.astype(self.type)) cuda.memcpy_htod(d_b,b.astype(self.type)) h_b=np.empty_like(b.astype(self.type)) self.go=time() #Go solve go=self.kernels.get_function("solve") go(d_a,d_b,block=(1,1,1),grid=(1,1)) cuda.memcpy_dtoh(h_b,d_b) self.done=time() self.ctx.pop() self.ctx.detach() return h_b
def image_cuda(grids): """ Run 2d FFT to image each plane of grid array """ from pyfft.cuda import Plan from pycuda.tools import make_default_context import pycuda.gpuarray as gpuarray import pycuda.driver as cuda nints, npixx, npixy = grids.shape cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((npixx, npixy), stream=stream) grid_gpu = gpuarray.to_gpu(grids) for i in range(0, nints): plan.execute(grid_gpu[i], inverse=True) grids = grid_gpu.get() context.pop() return recenter(grids.real, (npixx//2, npixy//2))
def simpleFourierTest2D(N=2048): """ Using PyFFT to call CUDA. :return: """ from pyfft.cuda import Plan import pycuda.driver as cuda from pycuda.tools import make_default_context import pycuda.gpuarray as gpuarray import time cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex64, stream=stream) x = np.ones((N, N), dtype=np.complex64) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) res = x_gpu.get() plan.execute(x_gpu, inverse=True) result = x_gpu.get() context.pop() error = np.abs(np.sum(np.abs(x) - np.abs(result)) / x.size) #print 'Error:', error #Single precision print 'Array size %i x %i' % (N, N) print 'Single Precisions' x = np.random.random((N, N)) x = x.astype(np.complex64) start = time.time() cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex64, stream=stream, fast_math=True) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) result = x_gpu.get() context.pop() end = time.time() cudatime = end - start #numpy start = time.time() xf = np.fft.fft2(x) end = time.time() numpytime = end - start print 'Same to 1e-2?' print np.testing.assert_allclose(xf, result, rtol=1e-2) print 'Numpy time', numpytime print 'CUDA time', cudatime #Double precision print '\n\nDouble Precision' x = np.random.random((N, N)) x = x.astype(np.complex128) start = time.time() cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex128, stream=stream, fast_math=True) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) result = x_gpu.get() context.pop() end = time.time() cudatime = end - start #numpy start = time.time() xf = np.fft.fft2(x) end = time.time() numpytime = end - start print 'Same to 1e-7?' print np.testing.assert_allclose(xf, result, rtol=1e-7) print 'Numpy time', numpytime print 'CUDA time', cudatime
def setUpClass(cls): cls.ctx = make_default_context()
def ssfgpuStd(u0, dt, dz, nz, alpha, betap, gamma, maxiter=4, tol=1e-5, phiNLOut=False): ''' Very simple implementation of the symmetrized split-step fourier algo. Solve the NLS equation with the SPM nonlinear terme only. * error: third in step size * u0 : Input field * dt: Time increment * dz: Space increment * nz: Number of space propagation step * alpha: Loss/Gain parameter (array) * betap: Beta array beta[2] = GVD, beta[3] = TOD, etc... * gamma: Nonlinear parameter * maxiter: Maximal number of iteration per step (4) * tol: Error for each step (1e-5) * phiNLOut: If True return the nonlinear phase shift (True) --- GPU Version (float precision) --- ''' nt = len(u0) e_ini = pow(abs(u0), 2).sum() w = wspace(dt * nt, nt) phiNL = 0.0 # Make sure u0 is in single precision u0 = u0.astype(complex64) alpha = alpha.astype(complex64) u1 = u0 uv = empty_like(u0) # Construction of the linear operator halfstep = -alpha / 2.0 if len(betap) != nt: for ii in arange(len(betap)): halfstep = halfstep - 1.0j * betap[ii] * pow(w, ii) / factorial(ii) halfstep = exp(halfstep * dz / 2.0).astype(complex64) # CUDA Kitchen sink cuda.init() context = make_default_context() fftPlan = Plan((1, nt), dtype=numpy.complex64) # Allocate memory to the device gpu_halfstep = gpuarray.to_gpu(halfstep) gpu_u0 = gpuarray.to_gpu(u0) gpu_u1 = gpuarray.to_gpu(u1) gpu_uhalf = gpuarray.empty_like(gpu_u0) gpu_uv = gpuarray.empty_like(gpu_u0) gpu_ufft = gpuarray.empty_like(gpu_u0) fftPlan.execute(gpu_u0, gpu_ufft) # GPU Kernel corresponding to the linear operator halfStepKernel = ElementwiseKernel( "pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf", "uhalf[i] = u[i] * halfstep[i]", "halfstep_linear", preamble="#include <pycuda-complex.hpp>", ) # GPU Kernel corresponding to the nonlinear operator nlKernel = ElementwiseKernel( "pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz", """ float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2); float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2); float realArg = -gamma*(u1_int + u0_int)*dz; float euler1 = cos(realArg); float euler2 = sin(realArg); uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2; uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2; """, "halfstep_nonlinear", preamble="#include <pycuda-complex.hpp>", ) # GPU reduction kernel computing the error between two complex array computeError = ReductionKernel( numpy.float32, neutral="0", reduce_expr="a+b", map_expr="pow(abs(a[i] - b[i]),2)", arguments="pycuda::complex<float> *a, pycuda::complex<float> *b", name="error_reduction", preamble="#include <pycuda-complex.hpp>", ) # Perfom a deep copy of a complex gpuarray complexDeepCopy = ElementwiseKernel( "pycuda::complex<float> *u1, pycuda::complex<float> *u2", "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im", "gpuarray_deepcopy", preamble="#include <pycuda-complex.hpp>", ) # Main Loop for iz in arange(nz): # First application of the linear operator halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf) fftPlan.execute(gpu_uhalf, inverse=True) for ii in arange(maxiter): # Application de l'operateur nonlineaire en approx. l'integral de N(z)dz # avec la methode du trapeze nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma), float(dz / 2.0)) fftPlan.execute(gpu_uv) # Second application of the linear operator halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft) fftPlan.execute(gpu_ufft, gpu_uv, inverse=True) error = computeError(gpu_u1, gpu_uv).get() / e_ini if (error < tol): complexDeepCopy(gpu_u1, gpu_uv) break else: complexDeepCopy(gpu_u1, gpu_uv) if (ii >= maxiter - 1): context.pop() raise Exception, "Failed to converge" complexDeepCopy(gpu_u0, gpu_u1) u1 = gpu_u1.get() context.pop() if phiNLOut: return [u1, phiNL] else: return u1
def suite(): context = make_default_context() device = context.get_device() context.pop() s = TestSuite() s.addTest(test_cublas('test_cublasIsamax')) s.addTest(test_cublas('test_cublasIcamax')) s.addTest(test_cublas('test_cublasIsamin')) s.addTest(test_cublas('test_cublasIcamin')) s.addTest(test_cublas('test_cublasSasum')) s.addTest(test_cublas('test_cublasScasum')) s.addTest(test_cublas('test_cublasSaxpy')) s.addTest(test_cublas('test_cublasCaxpy')) s.addTest(test_cublas('test_cublasScopy')) s.addTest(test_cublas('test_cublasCcopy')) s.addTest(test_cublas('test_cublasSdot')) s.addTest(test_cublas('test_cublasCdotu')) s.addTest(test_cublas('test_cublasCdotc')) s.addTest(test_cublas('test_cublasSrnm2')) s.addTest(test_cublas('test_cublasScrnm2')) s.addTest(test_cublas('test_cublasSscal')) s.addTest(test_cublas('test_cublasCscal')) s.addTest(test_cublas('test_cublasSrot')) s.addTest(test_cublas('test_cublasSswap')) s.addTest(test_cublas('test_cublasCswap')) s.addTest(test_cublas('test_cublasSgemv')) s.addTest(test_cublas('test_cublasCgemv')) s.addTest(test_cublas('test_cublasSgeam')) s.addTest(test_cublas('test_cublasCgeam')) s.addTest(test_cublas('test_cublasSgemmBatched')) s.addTest(test_cublas('test_cublasCgemmBatched')) s.addTest(test_cublas('test_cublasStrsmBatched')) s.addTest(test_cublas('test_cublasSgetrfBatched')) if misc.get_compute_capability(device) >= 1.3: s.addTest(test_cublas('test_cublasIdamax')) s.addTest(test_cublas('test_cublasIzamax')) s.addTest(test_cublas('test_cublasIdamin')) s.addTest(test_cublas('test_cublasIzamin')) s.addTest(test_cublas('test_cublasDasum')) s.addTest(test_cublas('test_cublasDzasum')) s.addTest(test_cublas('test_cublasDaxpy')) s.addTest(test_cublas('test_cublasZaxpy')) s.addTest(test_cublas('test_cublasDcopy')) s.addTest(test_cublas('test_cublasZcopy')) s.addTest(test_cublas('test_cublasDdot')) s.addTest(test_cublas('test_cublasZdotu')) s.addTest(test_cublas('test_cublasZdotc')) s.addTest(test_cublas('test_cublasDrnm2')) s.addTest(test_cublas('test_cublasDzrnm2')) s.addTest(test_cublas('test_cublasDscal')) s.addTest(test_cublas('test_cublasZscal')) s.addTest(test_cublas('test_cublasZdscal')) s.addTest(test_cublas('test_cublasDswap')) s.addTest(test_cublas('test_cublasZswap')) s.addTest(test_cublas('test_cublasDgemv')) s.addTest(test_cublas('test_cublasZgemv')) s.addTest(test_cublas('test_cublasDgeam')) s.addTest(test_cublas('test_cublasZgeam')) s.addTest(test_cublas('test_cublasDgemmBatched')) s.addTest(test_cublas('test_cublasZgemmBatched')) s.addTest(test_cublas('test_cublasDtrsmBatched')) s.addTest(test_cublas('test_cublasDgetrfBatched')) return s
def ssfgpuStd(u0, dt, dz, nz, alpha, betap, gamma, maxiter = 4, tol = 1e-5, phiNLOut = False): ''' Very simple implementation of the symmetrized split-step fourier algo. Solve the NLS equation with the SPM nonlinear terme only. * error: third in step size * u0 : Input field * dt: Time increment * dz: Space increment * nz: Number of space propagation step * alpha: Loss/Gain parameter (array) * betap: Beta array beta[2] = GVD, beta[3] = TOD, etc... * gamma: Nonlinear parameter * maxiter: Maximal number of iteration per step (4) * tol: Error for each step (1e-5) * phiNLOut: If True return the nonlinear phase shift (True) --- GPU Version (float precision) --- ''' nt = len(u0) e_ini = pow(abs(u0),2).sum() w = wspace(dt*nt,nt) phiNL = 0.0 # Make sure u0 is in single precision u0=u0.astype(complex64) alpha=alpha.astype(complex64) u1 = u0 uv = empty_like(u0) # Construction of the linear operator halfstep = -alpha/2.0 if len(betap) != nt: for ii in arange(len(betap)): halfstep = halfstep - 1.0j*betap[ii]*pow(w,ii)/factorial(ii) halfstep = exp(halfstep*dz/2.0).astype(complex64) # CUDA Kitchen sink cuda.init() context = make_default_context() fftPlan = Plan((1, nt), dtype=numpy.complex64) # Allocate memory to the device gpu_halfstep = gpuarray.to_gpu(halfstep) gpu_u0 = gpuarray.to_gpu(u0) gpu_u1 = gpuarray.to_gpu(u1) gpu_uhalf = gpuarray.empty_like(gpu_u0) gpu_uv = gpuarray.empty_like(gpu_u0) gpu_ufft = gpuarray.empty_like(gpu_u0) fftPlan.execute(gpu_u0, gpu_ufft) # GPU Kernel corresponding to the linear operator halfStepKernel = ElementwiseKernel("pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf", "uhalf[i] = u[i] * halfstep[i]", "halfstep_linear", preamble="#include <pycuda-complex.hpp>",) # GPU Kernel corresponding to the nonlinear operator nlKernel = ElementwiseKernel("pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz", """ float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2); float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2); float realArg = -gamma*(u1_int + u0_int)*dz; float euler1 = cos(realArg); float euler2 = sin(realArg); uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2; uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2; """, "halfstep_nonlinear", preamble="#include <pycuda-complex.hpp>",) # GPU reduction kernel computing the error between two complex array computeError = ReductionKernel(numpy.float32, neutral="0", reduce_expr="a+b", map_expr="pow(abs(a[i] - b[i]),2)", arguments="pycuda::complex<float> *a, pycuda::complex<float> *b", name="error_reduction", preamble="#include <pycuda-complex.hpp>",) # Perfom a deep copy of a complex gpuarray complexDeepCopy = ElementwiseKernel("pycuda::complex<float> *u1, pycuda::complex<float> *u2", "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im", "gpuarray_deepcopy", preamble="#include <pycuda-complex.hpp>",) # Main Loop for iz in arange(nz): # First application of the linear operator halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf) fftPlan.execute(gpu_uhalf, inverse=True) for ii in arange(maxiter): # Application de l'operateur nonlineaire en approx. l'integral de N(z)dz # avec la methode du trapeze nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma), float(dz/2.0)) fftPlan.execute(gpu_uv) # Second application of the linear operator halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft) fftPlan.execute(gpu_ufft, gpu_uv, inverse=True) error = computeError(gpu_u1, gpu_uv).get() / e_ini if (error < tol): complexDeepCopy(gpu_u1, gpu_uv) break else: complexDeepCopy(gpu_u1, gpu_uv) if (ii >= maxiter-1): context.pop() raise Exception, "Failed to converge" complexDeepCopy(gpu_u0, gpu_u1) u1 = gpu_u1.get() context.pop() if phiNLOut: return [u1, phiNL] else: return u1
def setUpClass(cls): cls.ctx = make_default_context() cls.cublas_handle = cublas.cublasCreate()
def split_step_GPU(A0, z_array, # Array for solution points t_op = 0, w_op = 0, nlin = 0, # Constant operators dt = 1, # sampling time t_nl_op = None, # Additional operator f(A, dt, z) apod = True, # Boundary conditition varying_operator = False, # Do operators vary in x dynamic_predictor = True, plot_hook = None, n_plots = 3, # not used anymore tollerance = 0.04, ): import pycuda.autoinit from pycuda.tools import make_default_context, dtype_to_ctype import pycuda.gpuarray as gpuarray from pycuda import cumath from pyfft.cuda import Plan from pycuda.compiler import SourceModule from pycuda.driver import Context from pycuda.elementwise import get_axpbyz_kernel, get_axpbz_kernel, get_binary_op_kernel, get_elwise_kernel,ElementwiseKernel ## Initialization n_points = A0.shape[0] # w = fftfreq(npoints, dx) * 2 * pi A_t = A0[:] +0.j #A_t.dtype = complex64 A_w = fft(A_t) * dt # Apodization (AK boundary conditions) # TODO making it smooth apod_array = ones(n_points, dtype = complex64) apod_array[0:n_points/50] = 0 apod_array[-n_points/50:-1] = 0 z0 = z_array[0] zf = z_array[-1] delta_z = 1.*(z_array[1]-z_array[0])/4 done_once = False #plan = c_uint() #dll.cufftPlan1d(byref(plan), n_points, 0x29, 1) #fft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, -1) #ifft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, 1) ## GPU modules ##### if pycuda.autoinit.context: context = pycuda.autoinit.context else: context = make_default_context() block = (16,1,1) grid = (n_points/block[0], 1) ## Init GPU kernels #### ## fft, scale dx is included in the definition here plan = Plan(n_points,wait_for_finish = True, scale = dt) fft_g = lambda ain, aout: plan.execute(ain, aout,) ifft_g = lambda x, y: plan.execute(x, y, inverse = True) ## Multiplication prod = ElementwiseKernel( "pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z", """ z[i] = x[i] * y[i]; """, "product", preamble = "") #prod = lambda x,y,z: prod(x,y,z, block, grid) ## Non-linearity nonLinear = ElementwiseKernel( "pycuda::complex<float> *x, pycuda::complex<float> nlin, pycuda::complex<float> *y, pycuda::complex<float> *z", """ pycuda::complex<float> I_UNIT(0.,1.); float I = pycuda::abs(y[i]); z[i] = x[i] * pycuda::exp(I_UNIT * I * nlin); """, "nonLinear", preamble = "") ## Evaluate the solution with current values at delta_z step ## separated so that can be re-used for error prediction ## contains some lazy eveluation just to be CUDA-implementation ready ## and reducing the number of array creation def f(A_t, A_w, dz = delta_z): if f.delta_z != dz: f.w_exp = cumath.exp(-1j * dz/2. * w_op) f.t_exp = cumath.exp(-1j * dz * t_op) f.delta_z = dz ## Dispersion (I pass) f.A_t = A_t f.A_w = A_w #print A_w.get()[n_points/2], prod(A_w, f.w_exp, A_w) #A_w = f.w_exp*A_w #print A_w.get()[n_points/2], ifft_g(f.A_w, f.A_t) ## Scale factor included in fft_g ## Constant potential term prod(f.A_t, f.t_exp, f.A_t) ## Nonlinear operator as intensity dependency if nlin != 0: f.A_t = f.A_t * cumath.exp(-1j * delta_z * nlin * f.A_t * f.A_t.conj()) ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z) if t_nl_op != None: f.A_t = f.A_t * cumath.exp(-1j * delta_z * t_nl_op(f.A_t, dt, z0+delta_z/2) ) ## Apodization if apod: prod(f.A_t, apod_array, f.A_t) fft_g(f.A_t, f.A_w) ## Scale factor included in fft_g ## Dispersion (II pass) prod(f.A_w, f.w_exp, f.A_w) ifft_g(f.A_w, f.A_t) ## Scale factor included in fft_g return f.A_t, f.A_w ## Init the f function f.delta_z = 0 # The rest will be evaluated lazily ## Convert to GPU arrays f.A_t = gpuarray.to_gpu(ones(n_points, complex64)) f.A_w = gpuarray.to_gpu(ones(n_points, complex64)) A_t = gpuarray.to_gpu(A_t.astype(complex64)) A_w = gpuarray.to_gpu(A_w.astype(complex64)) apod_array = gpuarray.to_gpu(apod_array.astype(complex64)) if hasattr(w_op,'__len__'): w_op = gpuarray.to_gpu(w_op.astype(complex64)) else: ## Use array even if it's a single values, othewise error when updating dz w_op = gpuarray.to_gpu(w_op*ones(n_points).astype(complex64)) if hasattr(t_op,'__len__'): t_op = gpuarray.to_gpu(t_op.astype(complex64)) else: t_op = gpuarray.to_gpu(t_op*ones(n_points).astype(complex64)) error = tollerance print "Ready for integration" ## Init loop variables sol_i = 0 sols = [A0] iters = 0 ## Integration loop while z0 <= zf: ## Cycle check if z0 >= z_array[sol_i]: #print "dz = %.2e error=%.2f z = %.2e"%(delta_z,error,z0) sols.append(A_t.get()) sol_i +=1 try: ## Force to have steps smaller than the distance between 2 solutions while z0 + delta_z >= z_array[sol_i + 1]: delta_z /= 2. except: pass ## Dynamical correction while dynamic_predictor: A_coarse = f(gpuarray.to_gpu(A_t.get()), gpuarray.to_gpu(A_w.get()), dz=2*delta_z)[0].get() A_fine = f(*f(gpuarray.to_gpu(A_t.get()), gpuarray.to_gpu(A_w.get()), delta_z), dz=delta_z)[0].get() delta = A_fine-A_coarse error = sqrt( trapz(delta*delta.conj())/ \ trapz(A_fine*A_fine.conj())) #print "Error : ",error, " dz :", delta_z if error < 2 * tollerance: done_once = True break ## Error is less then the tollerance, proceed delta_z = delta_z / 2. # update step A_t, A_w = f(A_t, A_w, delta_z) z0 += delta_z iters += 1 # Dynamic step (additional correction for faster convergence) if (dynamic_predictor or not (done_once or dynamic_predictor) ): if error > tollerance: delta_z = delta_z / 1.23 if error < 0.5/tollerance: delta_z = delta_z * 1.23 # Show the state of the loop every 200 loops (approx every few secs) if iters %200 == 0: print "Iter %8d (to end %8d) %4.1f %%"%(iters, (z_array[-1]-z0)/delta_z, 100.*iters/(iters+(z_array[-1]-z0)/delta_z)) ## Integration is over print "Total iterations: ", iters ## Return array with solutions (and their ftt) return sols
def __init__(self, cfg, scene, data_writer, receiver_files, output_fn): import pycuda.driver as cuda from pycuda.tools import make_default_context from pycuda.compiler import SourceModule from pycuda.driver import Function from pyfft.cuda import Plan cuda.init() context = make_default_context() stream = cuda.Stream() plan_set = {} # will be filled as needed in spatderp3_gpu(~), prefill with 128, 256 and 512 plan_set[str(128)] = Plan(128, dtype=np.float64, context=context, stream=stream, fast_math=False) plan_set[str(256)] = Plan(256, dtype=np.float64, context=context, stream=stream, fast_math=False) plan_set[str(512)] = Plan(512, dtype=np.float64, context=context, stream=stream, fast_math=False) g_bufl = ( {} ) # m/d(r/i) -> windowed matrix/derfact real/imag buffers. m(1/2/3)->p(#) buffers. spatderp3 will expand them if needed g_bufl["mr"] = cuda.mem_alloc(8 * 128 * 128) g_bufl["mi"] = cuda.mem_alloc(8 * 128 * 128) g_bufl["m1"] = cuda.mem_alloc(8 * 128 * 128) g_bufl["m2"] = cuda.mem_alloc(8 * 128 * 128) g_bufl["m3"] = cuda.mem_alloc(8 * 128 * 128) g_bufl["m_size"] = 8 * 128 * 128 g_bufl["dr"] = cuda.mem_alloc(8 * 128) # dr also used by A (window matrix) g_bufl["di"] = cuda.mem_alloc(8 * 128) g_bufl["d_size"] = 8 * 128 kernelcode = SourceModule( """ #include <stdio.h> __global__ void derifact_multiplication(double *matr, double *mati, double *vecr, double *veci, int fftlen, int fftnum) { int index_x = blockIdx.x*blockDim.x + threadIdx.x; int index_y = blockIdx.y*blockDim.y + threadIdx.y; int matindex = index_y*fftlen+index_x; //mat should be a contiguous array //printf("Block(x,y): (%d,%d). Thread(x,y): (%d,%d)\\n",blockIdx.x,blockIdx.y,threadIdx.x,threadIdx.y); // if N1%16>0, we're starting too many threads. // There is probably a better way to do this, but just eating the surplus should work. if (matindex < fftlen*fftnum) { double matreal = matr[matindex]; double matimag = mati[matindex]; double vecreal = vecr[index_x]; double vecimag = veci[index_x]; matr[matindex] = matreal*vecreal - matimag*vecimag; mati[matindex] = matreal*vecimag + matimag*vecreal; } } __global__ void pressure_window_multiplication(double *mr, double *mi, double *A, double *p1, double *p2, double *p3, int winlen, int Ns1, int Ns2, int Ns3, int fftlen, int fftnum, double R21, double R00, double R31, double R10) //passing a few by value seems to be more efficient than building an array first in pycuda { int index_x = blockIdx.x*blockDim.x + threadIdx.x; int index_y = blockIdx.y*blockDim.y + threadIdx.y; int matindex = index_y*fftlen+index_x; double G = 1; if (index_x < winlen) { G = A[index_x]; } else if (index_x > winlen+Ns2-1 && index_x < winlen*2+Ns2) { G = A[index_x-Ns2]; } if (index_y < fftnum) { //eat the surplus mi[matindex] = 0; if (index_x < winlen) { mr[matindex] = G*(R21*p1[Ns1*index_y+index_x-winlen+Ns1] + R00*p2[Ns2*index_y+winlen-1-index_x]); } else if (index_x < winlen + Ns2) { mr[matindex] = p2[Ns2*index_y+index_x-winlen]; } else if (index_x < winlen*2+Ns2) { mr[matindex] = G*(R31*p3[Ns3*index_y+index_x-winlen-Ns2] + R10*p2[Ns2*index_y+2*Ns2+winlen-1-index_x]); } else { mr[matindex] = 0; //zero padding } //if(mr[matindex]==0 && matindex < 50) printf("zero at:%d\\n",matindex%fftlen); } } __global__ void velocity_window_multiplication(double *mr, double *mi, double *A, double *p1, double *p2, double *p3, int winlen, int Ns1, int Ns2, int Ns3, int fftlen, int fftnum, double R21, double R00, double R31, double R10) //passing a few by value seems to be more efficient than building an array first in pycuda { int index_x = blockIdx.x*blockDim.x + threadIdx.x; int index_y = blockIdx.y*blockDim.y + threadIdx.y; int matindex = index_y*fftlen+index_x; double G = 1; if (index_x < winlen) { G = A[index_x]; } else if (index_x > winlen+Ns2-1 && index_x < winlen*2+Ns2) { G = A[index_x-Ns2]; } if (index_y < fftnum) { //eat the surplus mi[matindex] = 0; if (index_x < winlen) { mr[matindex] = G*(R21*p1[Ns1*index_y+index_x-winlen+Ns1-1] + R00*p2[Ns2*index_y+winlen-index_x]); } else if (index_x < winlen + Ns2) { mr[matindex] = p2[Ns2*index_y+index_x-winlen]; } else if (index_x < winlen*2+Ns2) { mr[matindex] = G*(R31*p3[Ns3*index_y+index_x-winlen-Ns2+1] + R10*p2[Ns2*index_y+2*Ns2+winlen-2-index_x]); } else { mr[matindex] = 0; //zero padding } //if(mr[matindex]==0 && matindex < 50) printf("zero at:%d\\n",matindex%fftlen); } } """ ) mulfunc = {} mulfunc["pres_window"] = kernelcode.get_function("pressure_window_multiplication") mulfunc["velo_window"] = kernelcode.get_function("velocity_window_multiplication") mulfunc["derifact"] = kernelcode.get_function("derifact_multiplication") # Loop over time steps for frame in range(int(cfg.TRK)): output_fn({"status": "running", "message": "Calculation frame:%d" % (frame + 1), "frame": frame + 1}) # Keep a reference to current matrix contents for domain in scene.domains: domain.push_values() # Loop over subframes for sub_frame in range(6): # Loop over calculation directions and measures for domain in scene.domains: calc_domain_gpu(domain, context, stream, plan_set, g_bufl, mulfunc) # Update acoustic values for domain in scene.domains: if not domain.is_rigid(): update_domain(domain, sub_frame) # Sum the pressure components for domain in scene.domains: domain.field_dict["p0"] = domain.field_dict["px0"] + domain.field_dict["pz0"] # Apply pml matrices to boundary domains scene.apply_pml_matrices() for rf, r in zip(receiver_files, scene.receivers): rf.write(struct.pack("f", r.calc())) rf.flush() if frame % cfg.save_nth_frame == 0: data_writer.write_to_file(frame) context.pop()
def setUpClass(cls): cls.ctx = make_default_context() integrate.init()
def main(): ## Default input parameter specification r=1.0 nz=100 G=1.8962 print("Starting\n") start_time = time.time() print("Creating Initial Profile\n") ##simulation parameter n_points = (1024,1024) Xmax= (5.0,5.0) # grid and window dx = [2.*Xmax[i]/n_points[i] for i in [0,1]] dz=0.003 beta = 500 #print "Enter step size [Ldf]\n" #scanf("%lf",&dz #print "Enter number of steps \n" #scanf("%lf",&nz #nz=(int)nz; #print "Enter number of critical powers\n" #scanf("%lf",&beta gamma=G*beta x=linspace(-Xmax[0],Xmax[0],n_points[0]) y=linspace(-Xmax[1],Xmax[1],n_points[1]) kx=fftfreq(n_points[0],dx[0]) ky=fftfreq(n_points[1],dx[1]) X,Y = meshgrid(x,y) Kx, Ky = meshgrid(kx,ky) keepMax=zeros(N_Z) II_out= zeros(n_points) U_m = zeros(n_points, dtype = complex64) IM_out= zeros(n_points) IF_out= zeros(n_points) ufft= zeros(n_points, dtype = complex64) ufft_pc= zeros(n_points, dtype = complex64) ##for (j=0;j<nx;j++) { ## x[j]=(double)(-nx/2+j+1)*dx; ## kx[j]=(j < nx/2 ) ? ## (pi*(double)(j))*(1./dx/((double)(nx))): ## (pi*(double)(j-nx))*(1./dx/((double)(nx)) ##fx_s=1/dx, dfx= fx_s/N-> d omega= 2pi dfx ## ????? kx[j]=kx[j]*kx[j]; u = exp( -(X**2 + Y**2)/r ) + 0.j u = u.astype(complex64) u_m= zeros(n_points) u_f= zeros(n_points) II_out = (u.real**2 + u.imag**2) ## ??? steps=2.*nz print "Step size %g [Ldf]\n"%dz print "Number of critical powers %g\n"%beta print "Number of steps %d\n"%N_Z ## cuFFT planning and preparation # cuda.init() dev = pycuda.autoinit.device context = make_default_context() nonlinearMod = SourceModule(""" #include <pycuda-complex.hpp> __global__ void nonlinear(pycuda::complex<float> *u_mat, float beta, float dz, pycuda::complex<float> *keepMax, int step) { const int y = blockDim.y * blockIdx.y + threadIdx.y; const int x = blockDim.x * blockIdx.x + threadIdx.x; float I; pycuda::complex<float> I_UNIT(0.,1.); int i = x* %(n)d + y; I=pycuda::abs(u_mat[i]); u_mat[i]= u_mat[i]* pycuda::exp(I_UNIT*I*beta*dz); if ((x==i/2) && (y==i/2)) keepMax[step]=pycuda::exp(I_UNIT*I*beta*dz); } __global__ void prod(pycuda::complex<float> *X, pycuda::complex<float> *Y, pycuda::complex<float> *Z) { const int y = blockDim.y * blockIdx.y + threadIdx.y; const int x = blockDim.x * blockIdx.x + threadIdx.x; int i = x*%(n)d + y; Z[i]=X[i] * Y[i]; }"""%{'n' : n_points[0]}) print "Device %d: \"%s\" with Compute %d.%d capability\n"%(dev.pci_bus_id, dev.name(), dev.compute_capability()[0], dev.compute_capability()[1]) print "Creating FFT Plans\n" plan = Plan(n_points, wait_for_finish = True, scale = dx[0]*dx[1]) block = (16,16,1) grid = (n_points[0]/block[0], n_points[1]/block[1]) ## Threads per block fft_g = lambda x, y: plan.execute(x, y) ifft_g = lambda x, y: plan.execute(x, y, inverse = True) g_mult = nonlinearMod.get_function('prod') runNonLinear = nonlinearMod.get_function("nonlinear") print "Allocating memory on device\n" u_gpu = gpuarray.to_gpu(u.astype(complex64)) U_gpu = gpuarray.to_gpu(zeros(n_points, complex64)) print "Allocating kx, ky & keepMax\n" cukx = gpuarray.to_gpu(kx) cuky = gpuarray.to_gpu(ky) cukeepMax = gpuarray.to_gpu(ones(nz, complex64)) ## preparing the data to transfer to the device IM_out = u.real #fileout("A",0.,h_in,nx, ny) print "Starting %i FFT pairs\n"%steps start = time.time() op_diff = exp(5e2j*(Kx**2+Ky**2) *dz/2.) op_diff = gpuarray.to_gpu(op_diff.astype(complex64)) zero_j = array([0],dtype = complex64) one_j = array([1],dtype = complex64) idxdy = array([1./(dx[0]*dx[1])], dtype = complex64) dxdy = array([(dx[0]*dx[1])], dtype = complex64) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() #print abs(U_gpu.get()) #pl.imshow(abs(U_gpu.get())) #pl.figure() for l in xrange(nz): ## FFT into the spatial frequency domain fft_g(u_gpu, U_gpu) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() ## inverse FFT into space domain ifft_g(U_gpu, u_gpu) ## Nonlinear step in space domain runNonLinear(u_gpu, float32(gamma), float32(dz), cukeepMax, int32(l), block = block, grid = grid) context.synchronize() ## cast to double fft_g(u_gpu, U_gpu) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() ## inverse FFT into space domain ifft_g(U_gpu, u_gpu)
from pycuda.tools import PageLockedMemoryPool import numpy as np import time import ctypes import pdb from queue import Queue from threading import Thread from pycuda.tools import make_default_context import matplotlib.pyplot as plt import threading # Initialize CUDA cuda.init() global ctx ctx = make_default_context() # will initialize the first device it finds dev = ctx.get_device() def _finish_up(): global ctx ctx.pop() ctx = None from pycuda.tools import clear_context_caches clear_context_caches() import atexit atexit.register(_finish_up)
from __future__ import absolute_import import pycuda.driver as cuda # Initialize CUDA cuda.init() from pycuda.tools import make_default_context global context context = make_default_context() device = context.get_device() def _finish_up(): global context context.pop() context = None from pycuda.tools import clear_context_caches clear_context_caches() import atexit atexit.register(_finish_up)
from __future__ import absolute_import import pycuda.driver as cuda import pycuda.gl as cudagl cuda.init() assert cuda.Device.count() >= 1 from pycuda.tools import make_default_context context = make_default_context(lambda dev: cudagl.make_context(dev)) device = context.get_device() import atexit atexit.register(context.pop)