def evolve_linear(z, deltax):
    """ Input type IN must be numpy or 21cmfast """
    fgrowth = pb.fgrowth(z, COSMO['omega_M_0'])  # normalized to 1 at z=0
    #primordial_fgrowth = pb.fgrowth(INITIAL_REDSHIFT, COSMO['omega_M_0'])  # normalized to 1 at z=0
    updated = deltax * fgrowth
    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)
    if False:  # velocity information may not be useful for the linear field; this branch is disabled
        plan = Plan(HII_shape, dtype=np.complex64)
        deltak_d = gpuarray.to_gpu(deltax).astype(np.complex64)
        vbox_d = gpuarray.zeros_like(deltak_d)
        plan.execute(deltak_d)
        dDdt = np.float32(dDdtoverD(z))  # growth-rate factor, as in evolve_zeldovich below
        for num, mode in enumerate(['x', 'y', 'z']):
            velocity_kernel(deltak_d, vbox_d, dDdt, DIM, np.int32(num),
                            block=block_size, grid=grid_size)
            np.save(
                parent_folder +
                "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                    mode, HII_DIM, BOX_LEN), vbox_d.real.get())
    return
def gpu_fft(data, inverse=False):
    global plan, ctx, stream  # cuda
    if not plan:
        print 'building plan', data.shape
        plan = Plan(data.shape, stream=stream, wait_for_finish=True)
    result = gpuarray.zeros_like(data)
    plan.execute(data, data_out=result, inverse=inverse)
    return result
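# A minimal usage sketch for gpu_fft above. The helper relies on module-level
# globals (plan, ctx, stream); the initialization shown here is an assumption,
# following the context/stream setup used by the other snippets in this collection.
import numpy as np
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.tools import make_default_context

cuda.init()
ctx = make_default_context()        # global context used by gpu_fft
stream = cuda.Stream()              # global stream handed to the Plan
plan = None                         # gpu_fft builds the plan lazily on first call

x_d = gpuarray.to_gpu(np.random.rand(64, 64).astype(np.complex64))
xk_d = gpu_fft(x_d)                 # forward FFT; the plan is built here
x2_d = gpu_fft(xk_d, inverse=True)  # inverse FFT; the cached plan is reused
ctx.pop()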
def prep(image, psf):
    datadim1 = image.shape[0]
    datadim2 = image.shape[1]
    if datadim1 != datadim2:
        ddim = max(datadim1, datadim2)
        s = numpy.binary_repr(ddim - 1)
        s = s[:-1] + '0'  # Guarantee that padding is used
    else:
        ddim = datadim1
        s = numpy.binary_repr(ddim - 1)
    if s.find('0') > 0:
        size = 2 ** len(s)
        boxd = numpy.zeros((size, size))
        r = size - datadim1
        r1 = r2 = r // 2
        if r % 2 == 1:
            r1 = r // 2 + 1
        c = size - datadim2
        c1 = c2 = c // 2
        if c % 2 == 1:
            c1 = c // 2 + 1
        boxdslice = (slice(r1, datadim1 + r1), slice(c1, datadim2 + c1))
        boxd[boxdslice] = image
    else:
        boxd = image
    boxp = boxd * 0.
    if boxd.shape[0] == psf.shape[0]:
        boxp = psf.copy()
    else:
        r = boxp.shape[0] - psf.shape[0]
        r1 = r // 2 + 1
        c = boxp.shape[1] - psf.shape[1]
        c1 = c // 2 + 1
        boxpslice = (slice(r1, psf.shape[0] + r1), slice(c1, psf.shape[1] + c1))
        boxp[boxpslice] = psf.copy()
    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()
    plan = Plan(boxp.shape, stream=stream)
    gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64))
    plan.execute(gdata)
    return gdata, boxd.shape, boxdslice, plan, stream
def run(self):
    drv.init()
    a0 = numpy.zeros((p,), dtype=numpy.complex64)
    self.dev = drv.Device(self.number)
    self.ctx = self.dev.make_context()
    # To verify whether all the memory is freed before the next allocation
    # (this does not happen in multithreading)
    print drv.mem_get_info()
    self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
    self.gpu_b = garray.zeros_like(self.gpu_a)
    self.gpu_a = garray.to_gpu(self.input_cpu)
    plan = Plan(a0.shape, context=self.ctx)
    plan.execute(self.gpu_a, self.gpu_b, batch=p / m)
    self.temp = self.gpu_b.get()
    print self.output_cpu._closed
    self.output_cpu.put(self.temp)
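# Batched 1D FFTs, as used by run() above: a Plan built for a single length-n
# transform can process many contiguous transforms in one call via `batch`.
# Minimal sketch with made-up sizes (assumes a CUDA device is available).
import numpy as np
import pycuda.autoinit  # creates a default context
import pycuda.gpuarray as gpuarray
from pyfft.cuda import Plan

rows, n = 8, 1024
x = np.random.rand(rows, n).astype(np.complex64)
x_d = gpuarray.to_gpu(x.ravel())            # contiguous rows, flattened
out_d = gpuarray.zeros_like(x_d)
plan = Plan((n,), dtype=np.complex64)
plan.execute(x_d, out_d, batch=rows)        # 8 independent length-1024 FFTs
diff = np.abs(out_d.get().reshape(rows, n) - np.fft.fft(x, axis=-1)).max()
print diff  # small (single precision)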
def _get_plan(itype, otype, inlen):
    try:
        theplan = _plans[(itype, otype, inlen)]
    except KeyError:
        theplan = Plan(inlen, dtype=itype, normalize=False, fast_math=True)
        _plans.update({(itype, otype, inlen): theplan})
    return theplan
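# How the cache above might be wired up (sketch): `_plans` is a module-level
# dict that is assumed here, since it is not shown in the snippet.
import numpy as np
import pycuda.autoinit  # creates a default CUDA context for the Plans
_plans = {}

p1 = _get_plan(np.complex64, np.complex64, 4096)
p2 = _get_plan(np.complex64, np.complex64, 4096)
assert p1 is p2  # the second lookup hits the cache instead of rebuilding the plan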
def conv(delta_d, filt_d, shape, fil):
    smoothI = np.zeros(shape, dtype=np.complex64)
    smoothed_d = gpuarray.to_gpu(smoothI)
    plan = Plan(shape, dtype=np.complex64)
    plan.execute(delta_d)
    if fil == 'rspace':
        plan.execute(filt_d)
    smoothed_d = delta_d * filt_d.conj()
    plan.execute(smoothed_d, inverse=True)
    return smoothed_d.real
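# For reference, a CPU sketch of the convolution-theorem step that conv()
# performs on the GPU. conv_cpu is a hypothetical helper, not original code;
# filt_k is assumed to already be in k-space (the fil != 'rspace' case).
import numpy as np

def conv_cpu(delta, filt_k):
    delta_k = np.fft.fftn(delta)
    return np.fft.ifftn(delta_k * np.conj(filt_k)).real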
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array """
    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)
    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()

    return recenter(grids.real, (npixx//2, npixy//2))
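# Usage sketch for image_cuda above. `recenter` is external to the snippet; a
# plausible np.roll-based stand-in is defined here purely so the example runs
# (an assumption, not the original implementation).
import numpy as np

def recenter(arr, center):
    y0, x0 = center
    return np.roll(np.roll(arr, y0, axis=-2), x0, axis=-1)

grids = np.random.rand(4, 128, 128).astype(np.complex64)  # nints x npixx x npixy
images = image_cuda(grids)
print images.shape  # (4, 128, 128)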
def calcCorrShiftGPU(H, G):
    log.info("Using: " + dev.name())
    res = []
    # make sure we get arrays of items
    if type(G) != type([]):
        G = [G]
    if type(H) != type([]):
        H = [H]
    for H_t in H:
        # Setup plan, has to be power of 2
        fftPlan = Plan(len(H_t.dataTrans), wait_for_finish=True)
        # H is long and G is short
        log.debug("Starting H_ fft on GPU...")
        # push to GPU
        H_gpu = gpuarray.to_gpu(H_t.dataTrans)
        # do forwards FFT in place
        fftPlan.execute(H_gpu)
        for G_t in G:
            # pad G with zeros to size of H
            log.debug("Starting G_ fft on GPU...")
            G_gpu = gpuarray.to_gpu(G_t.getTransPadded(len(H_t.dataTrans)))
            # do forwards FFT in place
            fftPlan.execute(G_gpu)
            F_gpu = H_gpu * G_gpu.conj()
            fftPlan.execute(F_gpu, inverse=True)
            f_host = F_gpu.get()
            maxVal = f_host.max()
            k = np.where(f_host == maxVal)
            res.append({"H": H_t, "G": G_t, "shift": k[0][0], "corr": maxVal.real})
            #maxVal = gpuarray.max(F_gpu.real)
            #print maxVal
    return res
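# CPU reference for the shift estimate used above: the peak of
# ifft(FFT(h) * conj(FFT(g))) sits at the circular shift between the signals.
# Numpy-only sketch under that assumption; not part of the original class code.
import numpy as np

def corr_shift_cpu(h, g):
    g_pad = np.zeros_like(h)
    g_pad[:len(g)] = g                  # pad g with zeros to the size of h
    f = np.fft.ifft(np.fft.fft(h) * np.conj(np.fft.fft(g_pad)))
    k = int(np.argmax(f.real))
    return k, f.real[k]

h = np.roll(np.hanning(1024), 100)      # a signal shifted by 100 samples
shift, corr = corr_shift_cpu(h, np.hanning(1024))
print shift  # 100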
def main(): ## Default input parameter specification r=1.0 nz=100 G=1.8962 print("Starting\n") start_time = time.time() print("Creating Initial Profile\n") ##simulation parameter n_points = (1024,1024) Xmax= (5.0,5.0) # grid and window dx = [2.*Xmax[i]/n_points[i] for i in [0,1]] dz=0.003 beta = 500 #print "Enter step size [Ldf]\n" #scanf("%lf",&dz #print "Enter number of steps \n" #scanf("%lf",&nz #nz=(int)nz; #print "Enter number of critical powers\n" #scanf("%lf",&beta gamma=G*beta x=linspace(-Xmax[0],Xmax[0],n_points[0]) y=linspace(-Xmax[1],Xmax[1],n_points[1]) kx=fftfreq(n_points[0],dx[0]) ky=fftfreq(n_points[1],dx[1]) X,Y = meshgrid(x,y) Kx, Ky = meshgrid(kx,ky) keepMax=zeros(N_Z) II_out= zeros(n_points) U_m = zeros(n_points, dtype = complex64) IM_out= zeros(n_points) IF_out= zeros(n_points) ufft= zeros(n_points, dtype = complex64) ufft_pc= zeros(n_points, dtype = complex64) ##for (j=0;j<nx;j++) { ## x[j]=(double)(-nx/2+j+1)*dx; ## kx[j]=(j < nx/2 ) ? ## (pi*(double)(j))*(1./dx/((double)(nx))): ## (pi*(double)(j-nx))*(1./dx/((double)(nx)) ##fx_s=1/dx, dfx= fx_s/N-> d omega= 2pi dfx ## ????? kx[j]=kx[j]*kx[j]; u = exp( -(X**2 + Y**2)/r ) + 0.j u = u.astype(complex64) u_m= zeros(n_points) u_f= zeros(n_points) II_out = (u.real**2 + u.imag**2) ## ??? steps=2.*nz print "Step size %g [Ldf]\n"%dz print "Number of critical powers %g\n"%beta print "Number of steps %d\n"%N_Z ## cuFFT planning and preparation # cuda.init() dev = pycuda.autoinit.device context = make_default_context() nonlinearMod = SourceModule(""" #include <pycuda-complex.hpp> __global__ void nonlinear(pycuda::complex<float> *u_mat, float beta, float dz, pycuda::complex<float> *keepMax, int step) { const int y = blockDim.y * blockIdx.y + threadIdx.y; const int x = blockDim.x * blockIdx.x + threadIdx.x; float I; pycuda::complex<float> I_UNIT(0.,1.); int i = x* %(n)d + y; I=pycuda::abs(u_mat[i]); u_mat[i]= u_mat[i]* pycuda::exp(I_UNIT*I*beta*dz); if ((x==i/2) && (y==i/2)) keepMax[step]=pycuda::exp(I_UNIT*I*beta*dz); } __global__ void prod(pycuda::complex<float> *X, pycuda::complex<float> *Y, pycuda::complex<float> *Z) { const int y = blockDim.y * blockIdx.y + threadIdx.y; const int x = blockDim.x * blockIdx.x + threadIdx.x; int i = x*%(n)d + y; Z[i]=X[i] * Y[i]; }"""%{'n' : n_points[0]}) print "Device %d: \"%s\" with Compute %d.%d capability\n"%(dev.pci_bus_id, dev.name(), dev.compute_capability()[0], dev.compute_capability()[1]) print "Creating FFT Plans\n" plan = Plan(n_points, wait_for_finish = True, scale = dx[0]*dx[1]) block = (16,16,1) grid = (n_points[0]/block[0], n_points[1]/block[1]) ## Threads per block fft_g = lambda x, y: plan.execute(x, y) ifft_g = lambda x, y: plan.execute(x, y, inverse = True) g_mult = nonlinearMod.get_function('prod') runNonLinear = nonlinearMod.get_function("nonlinear") print "Allocating memory on device\n" u_gpu = gpuarray.to_gpu(u.astype(complex64)) U_gpu = gpuarray.to_gpu(zeros(n_points, complex64)) print "Allocating kx, ky & keepMax\n" cukx = gpuarray.to_gpu(kx) cuky = gpuarray.to_gpu(ky) cukeepMax = gpuarray.to_gpu(ones(nz, complex64)) ## preparing the data to transfer to the device IM_out = u.real #fileout("A",0.,h_in,nx, ny) print "Starting %i FFT pairs\n"%steps start = time.time() op_diff = exp(5e2j*(Kx**2+Ky**2) *dz/2.) 
op_diff = gpuarray.to_gpu(op_diff.astype(complex64)) zero_j = array([0],dtype = complex64) one_j = array([1],dtype = complex64) idxdy = array([1./(dx[0]*dx[1])], dtype = complex64) dxdy = array([(dx[0]*dx[1])], dtype = complex64) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() #print abs(U_gpu.get()) #pl.imshow(abs(U_gpu.get())) #pl.figure() for l in xrange(nz): ## FFT into the spatial frequency domain fft_g(u_gpu, U_gpu) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() ## inverse FFT into space domain ifft_g(U_gpu, u_gpu) ## Nonlinear step in space domain runNonLinear(u_gpu, float32(gamma), float32(dz), cukeepMax, int32(l), block = block, grid = grid) context.synchronize() ## cast to double fft_g(u_gpu, U_gpu) g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid) context.synchronize() ## inverse FFT into space domain ifft_g(U_gpu, u_gpu)
img = Image.open("fox.jpg").convert("L") # Convert image to numpy array arr = np.array(img, dtype=np.float32) pad_arr = pad_power_of_two(arr) pad_arr2 = np.empty_like(pad_arr) # FFT with CUDA cuda.init() ctx = cuda.Device(0).make_context() strm = cuda.Stream() pad_arr_gpu = cuda.to_device(pad_arr) plan = Plan(pad_arr.shape, dtype=np.float32, context=ctx, stream=strm) plan.execute(pad_arr_gpu) cuda.memcpy_dtoh(pad_arr2, pad_arr_gpu) pad_arr3 = np.fft.fftshift(pad_arr2) # -------------------------------------------------------------------------- # Plot # -------------------------------------------------------------------------- plt.ion() fig = plt.figure(figsize=(20, 7)) ax1 = fig.add_subplot(1, 2, 1) ax2 = fig.add_subplot(1, 2, 2) ax1.set_title("Original") ax2.set_title("FFT")
def evolve_zeldovich(z, deltax): """First order Zeldovich approximation. """ if BOX_LEN > DIM: print "perturb_field: WARNING: Resolution is likely too low for accurate evolved density fields" #move_mass(updated_d, deltax_d, vx_d, vy_d, vz_d, np.float32(1./primordial_fgrowth)) kernel_source = open(cmd_folder + "/perturb_field.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'HII_DIM': HII_DIM, 'DIM': DIM, 'PIXEL_FACTOR': PIXEL_FACTOR } main_module = nvcc.SourceModule(kernel_code) move_mass = main_module.get_function("move_mass") velocity_kernel = main_module.get_function("set_velocity") filter_kernel = main_module.get_function("filter") subsample_kernel = main_module.get_function("subsample") fgrowth = np.float32(pb.fgrowth( z, COSMO['omega_M_0'])) #normalized to 1 at z=0 primordial_fgrowth = np.float32( pb.fgrowth(INITIAL_REDSHIFT, COSMO['omega_M_0'])) #normalized to 1 at z=0 vx = np.load( parent_folder + "/Boxes/vxoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN)) vy = np.load( parent_folder + "/Boxes/vyoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN)) vz = np.load( parent_folder + "/Boxes/vzoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN)) vx_d = gpuarray.to_gpu(vx) vy_d = gpuarray.to_gpu(vy) vz_d = gpuarray.to_gpu(vz) vx_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN ) #this is now comoving displacement in units of box size vy_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN) vz_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN) #updated_d = gpuarray.zeros_like(vx_d) start = cuda.Event() updated_d = gpuarray.zeros(HII_shape, dtype=np.float32) delta_d = gpuarray.to_gpu(deltax) start.record() start.synchronize() move_mass(updated_d, delta_d, vx_d, vy_d, vz_d, primordial_fgrowth, block=block_size, grid=grid_size) updated_d /= MASS_FACTOR updated_d -= np.float32( 1.) #renormalize to the new pixel size, and make into delta updated = updated_d.get_async() #import IPython; IPython.embed() np.save( parent_folder + "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format( z, HII_DIM, BOX_LEN), updated) plan = Plan((DIM, DIM, DIM), dtype=np.complex64) delta_d = delta_d.astype(np.complex64) #import IPython; IPython.embed() vbox_d = gpuarray.zeros_like(delta_d) smallvbox_d = gpuarray.zeros(HII_shape, dtype=np.float32) plan.execute(delta_d) #now deltak dDdt_D = np.float32(dDdtoverD(z)) #print dDdt_D #import IPython; IPython.embed() smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM) for num, mode in enumerate(['x', 'y', 'z']): velocity_kernel(delta_d, vbox_d, dDdt_D, DIM, np.int32(num), block=block_size, grid=grid_size) filter_kernel(vbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size) plan.execute(vbox_d, inverse=True) subsample_kernel(vbox_d.real, smallvbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) np.save( parent_folder + "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format( mode, HII_DIM, BOX_LEN), smallvbox_d.get()) return
def split_step_GPU(A0, z_array, # Array for solution points t_op = 0, w_op = 0, nlin = 0, # Constant operators dt = 1, # sampling time t_nl_op = None, # Additional operator f(A, dt, z) apod = True, # Boundary conditition varying_operator = False, # Do operators vary in x dynamic_predictor = True, plot_hook = None, n_plots = 3, # not used anymore tollerance = 0.04, ): import pycuda.autoinit from pycuda.tools import make_default_context, dtype_to_ctype import pycuda.gpuarray as gpuarray from pycuda import cumath from pyfft.cuda import Plan from pycuda.compiler import SourceModule from pycuda.driver import Context from pycuda.elementwise import get_axpbyz_kernel, get_axpbz_kernel, get_binary_op_kernel, get_elwise_kernel,ElementwiseKernel ## Initialization n_points = A0.shape[0] # w = fftfreq(npoints, dx) * 2 * pi A_t = A0[:] +0.j #A_t.dtype = complex64 A_w = fft(A_t) * dt # Apodization (AK boundary conditions) # TODO making it smooth apod_array = ones(n_points, dtype = complex64) apod_array[0:n_points/50] = 0 apod_array[-n_points/50:-1] = 0 z0 = z_array[0] zf = z_array[-1] delta_z = 1.*(z_array[1]-z_array[0])/4 done_once = False #plan = c_uint() #dll.cufftPlan1d(byref(plan), n_points, 0x29, 1) #fft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, -1) #ifft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, 1) ## GPU modules ##### if pycuda.autoinit.context: context = pycuda.autoinit.context else: context = make_default_context() block = (16,1,1) grid = (n_points/block[0], 1) ## Init GPU kernels #### ## fft, scale dx is included in the definition here plan = Plan(n_points,wait_for_finish = True, scale = dt) fft_g = lambda ain, aout: plan.execute(ain, aout,) ifft_g = lambda x, y: plan.execute(x, y, inverse = True) ## Multiplication prod = ElementwiseKernel( "pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z", """ z[i] = x[i] * y[i]; """, "product", preamble = "") #prod = lambda x,y,z: prod(x,y,z, block, grid) ## Non-linearity nonLinear = ElementwiseKernel( "pycuda::complex<float> *x, pycuda::complex<float> nlin, pycuda::complex<float> *y, pycuda::complex<float> *z", """ pycuda::complex<float> I_UNIT(0.,1.); float I = pycuda::abs(y[i]); z[i] = x[i] * pycuda::exp(I_UNIT * I * nlin); """, "nonLinear", preamble = "") ## Evaluate the solution with current values at delta_z step ## separated so that can be re-used for error prediction ## contains some lazy eveluation just to be CUDA-implementation ready ## and reducing the number of array creation def f(A_t, A_w, dz = delta_z): if f.delta_z != dz: f.w_exp = cumath.exp(-1j * dz/2. 
* w_op) f.t_exp = cumath.exp(-1j * dz * t_op) f.delta_z = dz ## Dispersion (I pass) f.A_t = A_t f.A_w = A_w #print A_w.get()[n_points/2], prod(A_w, f.w_exp, A_w) #A_w = f.w_exp*A_w #print A_w.get()[n_points/2], ifft_g(f.A_w, f.A_t) ## Scale factor included in fft_g ## Constant potential term prod(f.A_t, f.t_exp, f.A_t) ## Nonlinear operator as intensity dependency if nlin != 0: f.A_t = f.A_t * cumath.exp(-1j * delta_z * nlin * f.A_t * f.A_t.conj()) ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z) if t_nl_op != None: f.A_t = f.A_t * cumath.exp(-1j * delta_z * t_nl_op(f.A_t, dt, z0+delta_z/2) ) ## Apodization if apod: prod(f.A_t, apod_array, f.A_t) fft_g(f.A_t, f.A_w) ## Scale factor included in fft_g ## Dispersion (II pass) prod(f.A_w, f.w_exp, f.A_w) ifft_g(f.A_w, f.A_t) ## Scale factor included in fft_g return f.A_t, f.A_w ## Init the f function f.delta_z = 0 # The rest will be evaluated lazily ## Convert to GPU arrays f.A_t = gpuarray.to_gpu(ones(n_points, complex64)) f.A_w = gpuarray.to_gpu(ones(n_points, complex64)) A_t = gpuarray.to_gpu(A_t.astype(complex64)) A_w = gpuarray.to_gpu(A_w.astype(complex64)) apod_array = gpuarray.to_gpu(apod_array.astype(complex64)) if hasattr(w_op,'__len__'): w_op = gpuarray.to_gpu(w_op.astype(complex64)) else: ## Use array even if it's a single values, othewise error when updating dz w_op = gpuarray.to_gpu(w_op*ones(n_points).astype(complex64)) if hasattr(t_op,'__len__'): t_op = gpuarray.to_gpu(t_op.astype(complex64)) else: t_op = gpuarray.to_gpu(t_op*ones(n_points).astype(complex64)) error = tollerance print "Ready for integration" ## Init loop variables sol_i = 0 sols = [A0] iters = 0 ## Integration loop while z0 <= zf: ## Cycle check if z0 >= z_array[sol_i]: #print "dz = %.2e error=%.2f z = %.2e"%(delta_z,error,z0) sols.append(A_t.get()) sol_i +=1 try: ## Force to have steps smaller than the distance between 2 solutions while z0 + delta_z >= z_array[sol_i + 1]: delta_z /= 2. except: pass ## Dynamical correction while dynamic_predictor: A_coarse = f(gpuarray.to_gpu(A_t.get()), gpuarray.to_gpu(A_w.get()), dz=2*delta_z)[0].get() A_fine = f(*f(gpuarray.to_gpu(A_t.get()), gpuarray.to_gpu(A_w.get()), delta_z), dz=delta_z)[0].get() delta = A_fine-A_coarse error = sqrt( trapz(delta*delta.conj())/ \ trapz(A_fine*A_fine.conj())) #print "Error : ",error, " dz :", delta_z if error < 2 * tollerance: done_once = True break ## Error is less then the tollerance, proceed delta_z = delta_z / 2. # update step A_t, A_w = f(A_t, A_w, delta_z) z0 += delta_z iters += 1 # Dynamic step (additional correction for faster convergence) if (dynamic_predictor or not (done_once or dynamic_predictor) ): if error > tollerance: delta_z = delta_z / 1.23 if error < 0.5/tollerance: delta_z = delta_z * 1.23 # Show the state of the loop every 200 loops (approx every few secs) if iters %200 == 0: print "Iter %8d (to end %8d) %4.1f %%"%(iters, (z_array[-1]-z0)/delta_z, 100.*iters/(iters+(z_array[-1]-z0)/delta_z)) ## Integration is over print "Total iterations: ", iters ## Return array with solutions (and their ftt) return sols
def simpleFourierTest2D(N=2048): """ Using PyFFT to call CUDA. :return: """ from pyfft.cuda import Plan import pycuda.driver as cuda from pycuda.tools import make_default_context import pycuda.gpuarray as gpuarray import time cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex64, stream=stream) x = np.ones((N, N), dtype=np.complex64) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) res = x_gpu.get() plan.execute(x_gpu, inverse=True) result = x_gpu.get() context.pop() error = np.abs(np.sum(np.abs(x) - np.abs(result)) / x.size) #print 'Error:', error #Single precision print 'Array size %i x %i' % (N, N) print 'Single Precisions' x = np.random.random((N, N)) x = x.astype(np.complex64) start = time.time() cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex64, stream=stream, fast_math=True) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) result = x_gpu.get() context.pop() end = time.time() cudatime = end - start #numpy start = time.time() xf = np.fft.fft2(x) end = time.time() numpytime = end - start print 'Same to 1e-2?' print np.testing.assert_allclose(xf, result, rtol=1e-2) print 'Numpy time', numpytime print 'CUDA time', cudatime #Double precision print '\n\nDouble Precision' x = np.random.random((N, N)) x = x.astype(np.complex128) start = time.time() cuda.init() context = make_default_context() stream = cuda.Stream() plan = Plan((N, N), dtype=np.complex128, stream=stream, fast_math=True) x_gpu = gpuarray.to_gpu(x) plan.execute(x_gpu) result = x_gpu.get() context.pop() end = time.time() cudatime = end - start #numpy start = time.time() xf = np.fft.fft2(x) end = time.time() numpytime = end - start print 'Same to 1e-7?' print np.testing.assert_allclose(xf, result, rtol=1e-7) print 'Numpy time', numpytime print 'CUDA time', cudatime
def conv_bubbles(I, param_dict, Z, scale=None, fil=1, update=0, LE=False, visualize=False): """uses fft convolution""" zeta = 40. Lfactor = 0.620350491 # Z = param_dict['z'] DELTA_R_FACTOR = 1.05 print "Using filter_type {}".format(fil) if scale is None: scale = param_dict['BoxeSize']/param_dict['HIIdim'] dk = 2*np.pi/I.shape[0]*scale#param_dict['BoxSize'] #delta k in inverse Mpc RMAX = np.float32(30) #in Mpc RMIN = np.float32(1.) mm = mmin(Z, Tvir=1.e4) smin = sig0(m2R(mm)) #smin = pb.sigma_r(m2R(mm), Z, **cosmo)[0] deltac = Deltac(Z) fgrowth = np.float32(deltac/1.686) #fgrowth = 1./pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True) fc_mean_ps = pb.collapse_fraction(np.sqrt(smin), deltac).astype(np.float32) #mean collapse fraction of universe print fc_mean_ps """find bubbbles for deltax box I""" kernel_source = open("find_bubbles.cu").read() kernel_code = kernel_source % { 'DELTAC': deltac, 'RMIN': RMIN, 'SMIN': smin, 'ZETA': zeta, 'DELTAK': dk } main_module = nvcc.SourceModule(kernel_code) fcoll_kernel = main_module.get_function("fcoll_kernel") update_kernel = main_module.get_function("update_kernel") update_sphere_kernel = main_module.get_function("update_sphere_kernel") final_kernel = main_module.get_function("final_kernel") HII_filter = main_module.get_function("HII_filter") # Get contiguous image + shape. height, width, depth = I.shape HII_TOT_NUM_PIXELS = height*width*depth # Get block/grid size make sure divisible (currrently only power of 2 so ok) block_size = (8,8,8) grid_size = (width/(block_size[0]), height/(block_size[0]), depth/(block_size[0])) # Initialize variables. #ionized = np.zeros([height,width,depth]) #ionized = np.float32(ionized) width = np.int32(width) I = np.float32(I.copy()) if not LE: I *= fgrowth #linearly extrapolate the non-linear density to present #filt = np.ones_like(I) # Transfer labels asynchronously. 
ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32) delta_d = gpuarray.to_gpu_async(I) # I_cu = cu.np_to_array(I, order='C') # cu.bind_array_to_texref(I_cu, image_texture) fftplan = Plan(I.shape, dtype=np.complex64) R = RMAX; cnt = 0 if visualize is not None: fig = plt.figure() ax0 = fig.add_subplot(131) ax0.set_title('Density') mydelta0 = plt.imshow(I.real[width/2]) plt.colorbar() ax1 = fig.add_subplot(132) fig.suptitle(" Smoothed Density and Ionization") ax1.set_title('smoothed Density') mydelta = plt.imshow(delta_d.get().real[width/2]) plt.colorbar() ax2 = fig.add_subplot(133) ax2.set_title('Ionization') myion = plt.imshow(np.ones_like(I)[width/2]) plt.colorbar() if visualize == 'draw': plt.pause(.01) plt.draw() else: plt.savefig('tmp/{0:03d}.png'.format(cnt)) #plt.colorbar() final_step = False final_denom = -1 if RMIN < Lfactor*scale: temparg = 2*(smin - sig0(Lfactor*scale) ) if temparg < 0: raise(Exception) else: final_denom = np.sqrt(temparg).astype(np.float32) while not final_step: print 'R={} Mpc'.format(R) if (R/DELTA_R_FACTOR) <= (Lfactor*scale) or ((R/DELTA_R_FACTOR) <= RMIN): #stop if reach either rmin or cell size final_step = True R = np.float32(R) Rpix = np.float32(R/scale) S0 = np.float32(sig0(R)) #S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0]) denom = np.sqrt(2*(smin - S0)).astype(np.float32) print 'denom', denom start = cu.Event() step1 = cu.Event() step2 = cu.Event() step3 = cu.Event() step4 = cu.Event() end = cu.Event() start.record() #smoothed_d = conv(delta_d.astype(np.complex64), I.shape, fil=fil) delta_d = gpuarray.to_gpu_async(I).astype(np.complex64) fcoll_d = gpuarray.zeros(I.shape, dtype=np.float32) start.synchronize() if R > 1 or True: # smoothing fftplan.execute(delta_d) step1.record(); step1.synchronize() HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size) step2.record(); step2.synchronize() #import IPython; IPython.embed() fftplan.execute(delta_d, inverse=True) if not final_step: fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size) step3.record(); step3.synchronize() if not LE: #fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS) fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS) fcoll_d *= fc_mean_ps/fcollmean# #normalize since we used non-linear density step4.record(); step4.synchronize() if update == 0: update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size) elif update == 1: update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size) #import IPython; IPython.embed() else: if (RMIN > Lfactor*scale) or (final_denom < 0): final_denom = denom print 'final denom', final_denom fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size) step3.record(); step3.synchronize() if not LE: fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS) #fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS) fcoll_d *= fc_mean_ps/fcollmean step4.record(); step4.synchronize() if update == 0: update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size) elif update == 1: update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size) final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size) end.record() end.synchronize() if visualize is not None: mydelta.set_data(delta_d[width/2].real.get()) myion.set_data(ionized_d[width/2].get()) ax1.set_title('R = %f'%(R)) if visualize == 'draw': 
plt.pause(.01) plt.draw() else: plt.savefig('tmp/{0:03d}.png'.format(cnt)) R = R/DELTA_R_FACTOR cnt +=1 ionized = ionized_d.get() return ionized
def run(xHfile=None, IO_DIR=None): if IO_DIR is None: IO_DIR = parent_folder if not os.path.exists(IO_DIR + "/Outfiles"): os.makedirs(IO_DIR + "/Outfiles") if xHfile is None: xHfile = find_files(IO_DIR + "/Boxes/", pattern="xH*{0:06.2f}_{1:i}_{2:.0f}*".format( Z, HII_DIM, BOX_LEN))[0] if xHfile.endswith('.npy'): xH = np.load(xHfile) p_dict = boxio.parse_filename(os.path.splitext(xHfile)[0]) else: b = boxio.readbox(xHfile) xH = b.box_data p_dict = b.param_dict Z = p_dict['z'] #growth_factor = pb.fgrowth(Z, COSMO['omega_M_0'], unnormed=True) #overwrite global variables HII_DIM = p_dict['dim'] BOX_LEN = np.float32(p_dict['BoxSize']) DELTA_K = np.float32(2 * np.pi / BOX_LEN) VOLUME = (BOX_LEN * BOX_LEN * BOX_LEN) HII_TOT_NUM_PIXELS = HII_DIM**3 try: deltax = np.load( IO_DIR + "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc.npy". format(Z, HII_DIM, BOX_LEN)) except: #deltax = boxio.readbox(IO_DIR+"/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(Z, HII_DIM, BOX_LEN)).box_data deltax = boxio.readbox( IO_DIR + "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc".format( Z, HII_DIM * 2, BOX_LEN)).box_data[:HII_DIM, :HII_DIM, :HII_DIM] kernel_source = open(cmd_folder + "/delta_T.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'VOLUME': VOLUME, 'NUM_BINS': NUM_BINS } main_module = nvcc.SourceModule(kernel_code) pbox_kernel = main_module.get_function("pbox_kernel") #pixel_deltax_d = gpuarray.to_gpu(deltax) #pixel_xH_d = gpuarray.to_gpu(xH) _const_factor = np.float32( 27 * (COSMO['omega_b_0'] * COSMO['h'] * COSMO['h'] / 0.023) * np.sqrt( (0.15 / COSMO['omega_M_0'] / COSMO['h'] / COSMO['h']) * (1 + Z) / 10.0)) delta_T = np.float32(_const_factor * xH * (1.0 + deltax)) #in mK ave = np.mean(delta_T) np.save( IO_DIR + "/Boxes/delta_T_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc.npy" .format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, int(BOX_LEN)), delta_T) deldel_T = (delta_T / ave - 1) * VOLUME / HII_TOT_NUM_PIXELS if DIMENSIONAL_T_POWER_SPEC: deldel_T *= ave plan = Plan(HII_shape, dtype=np.complex64) deldel_T_d = gpuarray.to_gpu(deldel_T.astype(np.complex64)) plan.execute(deldel_T_d) K = np.float32( np.logspace(np.log10(DELTA_K), np.log10(DELTA_K * np.sqrt(3.) * DIM), NUM_BINS)) K_d = gpuarray.to_gpu(K) k_ave_d = gpuarray.zeros_like(K_d) in_bin_ct_d = gpuarray.zeros_like(K_d) ps_d = gpuarray.zeros_like(K_d) pbox_kernel(deldel_T_d, DIM, ps_d, k_ave_d, in_bin_ct_d, K_d, block=block_size, grid=HII_grid_size) ps = ps_d.get() in_bin_ct = in_bin_ct_d.get() k_ave = k_ave_d.get() k_ave = np.where(in_bin_ct > 0, k_ave / in_bin_ct, 0.) ps_ave = np.where(in_bin_ct > 0, ps / in_bin_ct, 0.) #ps_fname = "/ps_nov_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc".format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, np.int32(BOX_LEN)) #np.savez(IO_DIR+ps_fname, k_ave=k_ave, ps_ave=ps_ave) return K, k_ave, ps_ave
def conv_bubbles(deltax, param_dict, Z=None, scale=None, fil=1, update=0, LE=False, visualize=0, quiet=False): """ Excursion-set formalism, or Fast Fourier Radiative-Transform. Calculates ionization fields from density field provided. For each box pixel, it cycles through various bubble radii , until it finds the largest radius such that the enclosed collapsed mass fraction (obtained by summing masses from the halo list file of halos whose centers are within the bubble, or by taking the mean collapsed mass from conditional press-schechter) is larger than 1/ZETA. Parameters ---------- deltax : numpy.float32 array Real space density box, must have dimensions powers of 2. param_dict: python dictionary dictionary of parameters created by boxio.parse_filename Z: float32 Required if input density is the present day linear density, program would extrapolate to Z. fil: int32 type of filter for smoothing : 0: rtophat; 1: ktophat, 2: Gaussian update: int32 Method to update the ionization field 0: center pixel, 1: sphere painting visualize: bool if True, draw slice of density field and created ionization field quiet: bool Returns ---------- ion_field: numpy array float32 """ if not quiet: print "Using filter_type {}".format(fil) if scale is None: scale = param_dict['BoxSize']/param_dict['HIIdim'] if Z is None: Z = param_dict['Z'] sigmamin, deltac = pb.sig_del(ION_Tvir_MIN, Z, **COSMO) fgrowth = np.float32(deltac/1.686) smin = sigmamin**2 fc_mean_ps = pb.collapse_fraction(sigmamin, deltac).astype(np.float32) #mean collapse fraction of universe """find bubbbles for deltax box I""" kernel_source = open(cmd_folder+"/find_bubbles.cu").read() kernel_code = kernel_source % { 'DELTAC': deltac, 'RMIN': R_BUBBLE_MIN, 'ZETA': ZETA, 'DELTAK': DELTA_K } main_module = nvcc.SourceModule(kernel_code) fcoll_kernel = main_module.get_function("fcoll_kernel") update_kernel = main_module.get_function("update_kernel") update_sphere_kernel = main_module.get_function("update_sphere_kernel") final_kernel = main_module.get_function("final_kernel") HII_filter = main_module.get_function("HII_filter") # Get contiguous image + shape. height, width, depth = deltax.shape HII_TOT_NUM_PIXELS = height*width*depth # Initialize variables. width = np.int32(width) deltax = np.float32(deltax.copy()) if not LE: deltax *= fgrowth #linearly extrapolate the non-linear density to present # Transfer asynchronously. 
ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32) delta_d = gpuarray.to_gpu_async(deltax) fftplan = Plan(deltax.shape, dtype=np.complex64) R = R_BUBBLE_MAX; cnt = 0 if visualize > 0: fig = plt.figure() ax1 = fig.add_subplot(121) fig.suptitle(" Smoothed Density and Ionization") ax1.set_title('Density') mydelta = plt.imshow(delta_d.get().real[width/2]) plt.colorbar() ax2 = fig.add_subplot(122) ax2.set_title('Ionization') myion = plt.imshow(np.ones_like(deltax)[width/2]) plt.colorbar() if visualize == 1: print "HERE" plt.pause(.01) plt.draw() elif visualize == 2: plt.savefig('tmp/{0:03d}.png'.format(cnt)) #plt.colorbar() final_step = False final_denom = -1 if R_BUBBLE_MIN < L_FACTOR*scale: temparg = 2*(smin - sig0(L_FACTOR*scale) ) if temparg < 0: raise(Exception) else: final_denom = np.sqrt(temparg).astype(np.float32) while not final_step: if (R/DELTA_R_FACTOR) <= (L_FACTOR*scale) or ((R/DELTA_R_FACTOR) <= R_BUBBLE_MIN): #stop if reach either rmin or cell size final_step = True R = np.float32(R) S0 = np.float32(sig0(R)) #S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0]) denom = np.sqrt(2*(smin - S0)).astype(np.float32) if not quiet: print 'R={} Mpc'.format(R) print 'denom', denom start = cu.Event() step1 = cu.Event() step2 = cu.Event() step3 = cu.Event() step4 = cu.Event() end = cu.Event() start.record() delta_d = gpuarray.to_gpu_async(deltax.astype(np.complex64)) fcoll_d = gpuarray.zeros(deltax.shape, dtype=np.float32) start.synchronize() fftplan.execute(delta_d) step1.record(); step1.synchronize() HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size) step2.record(); step2.synchronize() #import IPython; IPython.embed() fftplan.execute(delta_d, inverse=True) step2.synchronize() # if not the final step, get ionized regions, if final step paint partial ionizations if not final_step: fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size) step3.record(); step3.synchronize() if not LE: fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS) fcoll_d *= fc_mean_ps/fcollmean# #normalize since we used non-linear density step4.record(); step4.synchronize() if update == 0: update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size) else: update_sphere_kernel(ionized_d, fcoll_d, width, R, block=block_size, grid=grid_size) else: if final_denom < 0: final_denom = denom # print 'final denom', final_denom fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size) step3.record(); step3.synchronize() if not LE: fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS) fcoll_d *= fc_mean_ps/fcollmean step4.record(); step4.synchronize() final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size) end.record() end.synchronize() if visualize > 0: mydelta.set_data(delta_d.real.get()[width/2]) myion.set_data(ionized_d.get()[width/2]) ax1.set_title('R = %f'%(R)) if visualize == 1: plt.pause(.01) plt.draw() elif visualize == 2: plt.savefig('tmp/{0:03d}.png'.format(cnt)) R = R/DELTA_R_FACTOR cnt +=1 ionized = ionized_d.get() return ionized
def init(): """outputs the high resolution k-box, and the smoothed r box""" N = np.int32(DIM) #prepare for stitching #HII_DIM = np.int32(HII_DIM) f_pixel_factor = DIM/HII_DIM; scale = np.float32(BOX_LEN)/DIM HII_scale = np.float32(BOX_LEN)/HII_DIM shape = (N,N,N) MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0) kernel_source = open(cmd_folder+"/initialize.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'VOLUME': VOLUME, 'DIM': DIM } main_module = nvcc.SourceModule(kernel_code) init_kernel = main_module.get_function("init_kernel") HII_filter = main_module.get_function("HII_filter") adj_complex_conj = main_module.get_function("adj_complex_conj") subsample_kernel = main_module.get_function("subsample") velocity_kernel = main_module.get_function("set_velocity") pspec_texture = main_module.get_texref("pspec") interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array interp_cu = cuda.matrix_to_array(interpPspec, order='F') cuda.bind_array_to_texref(interp_cu, pspec_texture) largebox_d = gpuarray.zeros(shape, dtype=np.float32) init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size) #import IPython; IPython.embed() largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32) init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size) largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag #adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size) largebox = largebox_d.get() #np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox) #save real space box before smoothing plan = Plan(shape, dtype=np.complex64) plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box largebox_d /= scale**3 np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async()) #save real space box after smoothing and subsampling # host largebox is still in k space, no need to reload from disk largebox_d = gpuarray.to_gpu(largebox) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size); plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box largebox_d /= scale**3 smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32) subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async()) # reload the k-space box for velocity boxes largebox_d = gpuarray.to_gpu(largebox) #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64) smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32) for num, mode in enumerate(['x', 'y', 'z']): velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size) HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size) plan.execute(largevbox_d, inverse=True) largevbox_d /= scale**3 #import IPython; IPython.embed() subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size) np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get()) 
return
sy0 = ny/2 - sny/2
sy1 = ny/2 + sny/2

# initial wavefunction
sigma0 = 50 * dx
k0 = 20
x = np.arange(nx) * dx
y = np.arange(ny) * dy
psi.real[:] = gaussian(x, sigma=sigma0, x0=(sx0+200)*dx)[:,np.newaxis]
psi[:] *= np.exp(1j * k0 * x)[:,np.newaxis]

# cuda init
cuda.init()
ctx = cuda.Device(0).make_context()
strm = cuda.Stream()
plan = Plan((nx, ny), dtype=np.complex64, context=ctx, stream=strm)
psi_gpu = gpuarray.to_gpu(psi)
lcx_gpu = cuda.mem_alloc(nx * np.nbytes['complex64'])
lcy_gpu = cuda.mem_alloc(ny * np.nbytes['complex64'])

# potential
vx0, vwidth = nx/2, 70
vmax = (k0 ** 2) / 2

# plot
'''
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
plt.ion()
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
def get_pyfft_plan(shape):
    try:
        from pyfft.cuda import Plan
    except ImportError:
        return None
    return Plan(shape, normalize=True, wait_for_finish=True)
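# Usage sketch for get_pyfft_plan above: fall back to numpy when pyfft is not
# installed. The fallback branch is an assumption; a CUDA context (here via
# pycuda.autoinit) is needed before the plan is built.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

x = np.random.rand(256, 256).astype(np.complex64)
plan = get_pyfft_plan(x.shape)
if plan is None:
    xk = np.fft.fftn(x)        # CPU fallback
else:
    x_d = gpuarray.to_gpu(x)
    plan.execute(x_d)          # forward FFT in place on the GPU
    xk = x_d.get()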
import numpy
from pyfft.cuda import Plan
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

# w,h,k are the array dimensions in a power of 2
# im1, im2 are the input 3d arrays of dtype complex64
w = h = k = 512
im1 = numpy.random.rand(w, h, k).astype(numpy.complex64)
im2 = numpy.random.rand(w, h, k).astype(numpy.complex64)

%time plan = Plan((w, h, k), normalize=True)

# forward transform on device
%time im1_gpu = gpuarray.to_gpu(im1)
%time plan.execute(im1_gpu)
%time im1_ft = im1_gpu.get()
del im1_gpu

%time im2_gpu = gpuarray.to_gpu(im2)
%time plan.execute(im2_gpu)
%time im2_ft = im2_gpu.get()
del im2_gpu

# do multiplication on host - can be done on device.
%time conv = im1_ft * im2_ft

# inverse transform on device
%time conv_gpu = gpuarray.to_gpu(conv)
# del conv
%time plan.execute(conv_gpu, inverse=True)
%time corr_gpu = conv_gpu.get()

# Reference calculation on CPU:
%time im1_ft = numpy.fft.fftn(im1)
%time im2_ft = numpy.fft.fftn(im2)
%time conv = im1_ft * im2_ft
del im1
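# Note (sketch, not part of the original session): multiplying the two forward
# transforms as above yields the circular *convolution* of im1 and im2; for a
# cross-correlation the second factor would be conjugated. A tiny CPU check of
# the convolution theorem being relied on:
import numpy as np

a = np.random.rand(8)
b = np.random.rand(8)
direct = np.array([sum(a[m] * b[(n - m) % 8] for m in range(8)) for n in range(8)])
via_fft = np.fft.ifft(np.fft.fft(a) * np.fft.fft(b)).real
assert np.allclose(direct, via_fft)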
def init_stitch(N): """outputs the high resolution k-box, and the smoothed r box Input ----------- N: int32 size of box to load onto the GPU, should be related to DIM by powers of 2 """ if N is None: N = np.int32(HII_DIM) #prepare for stitching META_GRID_SIZE = DIM/N M = np.int32(HII_DIM/META_GRID_SIZE) #HII_DIM = np.int32(HII_DIM) f_pixel_factor = DIM/HII_DIM; scale = np.float32(BOX_LEN/DIM) print 'scale', scale HII_scale = np.float32(BOX_LEN/HII_DIM) shape = (DIM,DIM,N) stitch_grid_size = (DIM/(block_size[0]), DIM/(block_size[0]), N/(block_size[0])) HII_stitch_grid_size = (HII_DIM/(block_size[0]), HII_DIM/(block_size[0]), M/(block_size[0])) #ratio of large box to small size kernel_source = open(cmd_folder+"/initialize_stitch.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'DIM': DIM, 'VOLUME': VOLUME, 'META_BLOCKDIM': N } main_module = nvcc.SourceModule(kernel_code) init_stitch = main_module.get_function("init_kernel") HII_filter = main_module.get_function("HII_filter") subsample_kernel = main_module.get_function("subsample") velocity_kernel = main_module.get_function("set_velocity") pspec_texture = main_module.get_texref("pspec") MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0) plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64) plan1d = Plan((np.int64(DIM)), dtype=np.complex64) print "init pspec" interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array interp_cu = cuda.matrix_to_array(interpPspec, order='F') cuda.bind_array_to_texref(interp_cu, pspec_texture) #hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64') hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64) #hbox_small = np.zeros(HII_shape, dtype=np.float32) #hbox_large = n smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) # Set up pinned memory for transfer #largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize()) largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32) largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64) largebox_d = gpuarray.zeros(shape, dtype=np.float32) largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32) print "init boxes" for meta_z in xrange(META_GRID_SIZE): # MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3) init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy() #if want to get velocity need to use this if True: print "saving kbox" np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) print "Executing FFT on device" #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print hbox_large.dtype print "Finished FFT on device" np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) if True: print "loading kbox" hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() 
#cuda.memcpy_htod_async(largebox_d, largebox_pin) largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size); hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async() #import IPython; IPython.embed() print "Executing FFT on host" #hbox_large = hifft(hbox_large).astype(np.complex64).real #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print "Finished FFT on host" #import IPython; IPython.embed() # for meta_x in xrange(META_GRID_SIZE): # for meta_y in xrange(META_GRID_SIZE): # for meta_z in xrange(META_GRID_SIZE): # largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N]) # HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size); # hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get() #plan = Plan(shape, dtype=np.complex64) #plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) # This saves a large resolution deltax print "downsampling" smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) #largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) largebox_d /= scale**3 # subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async() np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small) #import IPython; IPython.embed() # To get velocities: reload the k-space box hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32) hvbox_small = np.zeros(HII_shape, dtype=np.float32) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64) smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32) for num, mode in enumerate(['x', 'y', 'z']): for meta_z in xrange(META_GRID_SIZE): largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size) HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size) print hvbox_large.shape, largevbox_d.shape hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async() hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real for meta_z in xrange(META_GRID_SIZE): largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async() np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get()) return
t1 = t()
ifft2(a)
print 'NUMPY took', t()-t1

from pyfft.cuda import Plan
import pycuda.driver as cuda
from pycuda.tools import make_default_context
import pycuda.gpuarray as gpuarray

cuda.init()
context = make_default_context()
stream = cuda.Stream()
plan = Plan((128,128), stream=stream)

t1 = t()
gpu_data = gpuarray.to_gpu(a)
print 'togpu took', t()-t1

plan.execute(gpu_data)
result = gpu_data.get()
gpu_data = gpuarray.to_gpu(a)
plan.execute(gpu_data)
result = gpu_data.get()
# np.conj(result)
print 'CUDA took', (t()-t1)/2, result.shape
t2 = t()
def ssfgpuFull(u0, dt, dz, nz, alpha, betap, gamma, context, maxiter=4, tol=1e-5, phiNLOut=False): ''' Very simple implementation of the symmetrized split-step fourier algo. Solve the NLS equation with the SPM nonlinear terme only. * error: third in step size * u0 : Input field * dt: Time increment * dz: Space increment * nz: Number of space propagation step * alpha: Loss/Gain parameter (array) * betap: Beta array beta[2] = GVD, beta[3] = TOD, etc... * gamma: Nonlinear parameter * maxiter: Maximal number of iteration per step (4) * tol: Error for each step (1e-5) * phiNLOut: If True return the nonlinear phase shift (True) --- GPU Version (float precision) --- ''' nt = len(u0) e_ini = pow(abs(u0), 2).sum() w = wspace(dt * nt, nt) phiNL = 0.0 # Make sure u0 is in single precision u0 = u0.astype(complex64) alpha = alpha.astype(complex64) u1 = u0 uArch = zeros([nz, nt], float32) uv = empty_like(u0) # Construction of the linear operator halfstep = -alpha / 2.0 if len(betap) != nt: for ii in arange(len(betap)): halfstep = halfstep - 1.0j * betap[ii] * pow(w, ii) / factorial(ii) halfstep = exp(halfstep * dz / 2.0).astype(complex64) # CUDA Kitchen sink fftPlan = Plan((1, nt), dtype=numpy.complex64) # Allocate memory to the device gpu_halfstep = gpuarray.to_gpu(halfstep) gpu_u0 = gpuarray.to_gpu(u0) gpu_u1 = gpuarray.to_gpu(u1) gpu_uhalf = gpuarray.empty_like(gpu_u0) gpu_uv = gpuarray.empty_like(gpu_u0) gpu_ufft = gpuarray.empty_like(gpu_u0) fftPlan.execute(gpu_u0, gpu_ufft) # GPU Kernel corresponding to the linear operator halfStepKernel = ElementwiseKernel( "pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf", "uhalf[i] = u[i] * halfstep[i]", "halfstep_linear", preamble="#include <pycuda-complex.hpp>", ) # GPU Kernel corresponding to the nonlinear operator nlKernel = ElementwiseKernel( "pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz", """ float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2); float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2); float realArg = -gamma*(u1_int + u0_int)*dz; float euler1 = cos(realArg); float euler2 = sin(realArg); uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2; uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2; """, "halfstep_nonlinear", preamble="#include <pycuda-complex.hpp>", ) # GPU reduction kernel computing the error between two complex array computeError = ReductionKernel( numpy.float32, neutral="0", reduce_expr="a+b", map_expr="pow(abs(a[i] - b[i]),2)", arguments="pycuda::complex<float> *a, pycuda::complex<float> *b", name="error_reduction", preamble="#include <pycuda-complex.hpp>", ) # Perfom a deep copy of a complex gpuarray complexDeepCopy = ElementwiseKernel( "pycuda::complex<float> *u1, pycuda::complex<float> *u2", "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im", "gpuarray_deepcopy", preamble="#include <pycuda-complex.hpp>", ) # Main Loop for iz in arange(nz): # First application of the linear operator halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf) fftPlan.execute(gpu_uhalf, inverse=True) for ii in arange(maxiter): # Application de l'operateur nonlineaire en approx. 
l'integral de N(z)dz # avec la methode du trapeze nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma), float(dz / 2.0)) fftPlan.execute(gpu_uv) # Second application of the linear operator halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft) fftPlan.execute(gpu_ufft, gpu_uv, inverse=True) error = computeError(gpu_u1, gpu_uv).get() / e_ini if (error < tol): complexDeepCopy(gpu_u1, gpu_uv) break else: complexDeepCopy(gpu_u1, gpu_uv) if (ii >= maxiter - 1): raise Exception, "Failed to converge" complexDeepCopy(gpu_u0, gpu_u1) uArch[iz] = pow(abs(gpu_u1.get()), 2) u1 = gpu_u1.get() if phiNLOut: return [u1, uArch, phiNL] else: return [u1, uArch]
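# For reference, a single (non-iterated) symmetrized split step on the CPU with
# the same sign conventions as ssfgpuFull above; a numpy sketch only -- the GPU
# code additionally iterates the nonlinear step with a trapezoid approximation.
import numpy as np

def split_step_cpu(u, dz, w, beta2, gamma, alpha=0.0):
    H = np.exp((-alpha / 2.0 - 1j * beta2 * w**2 / 2.0) * dz / 2.0)  # half-step linear operator
    u = np.fft.ifft(H * np.fft.fft(u))                 # linear half step
    u = u * np.exp(-1j * gamma * np.abs(u)**2 * dz)    # full nonlinear step
    return np.fft.ifft(H * np.fft.fft(u))              # linear half step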
from pyfft.cuda import Plan
import numpy
from pycuda.tools import make_default_context
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda

# initialize context
cuda.init()
context = make_default_context()
stream = cuda.Stream()

# create plan
plan = Plan((16, 16), stream=stream)

# prepare data
data = numpy.ones((16, 16), dtype=numpy.complex64)
gpu_data = gpuarray.to_gpu(data)
print gpu_data

# forward transform
plan.execute(gpu_data)
result = gpu_data.get()
print result

# inverse transform
plan.execute(gpu_data, inverse=True)
result = gpu_data.get()
error = numpy.abs(numpy.sum(numpy.abs(data) - numpy.abs(result)) / data.size)
print error < 1e-6

context.pop()