Example #1
def evolve_linear(z, deltax):
    """
	Input type IN must be numpy or 21cmfast
	"""

    fgrowth = pb.fgrowth(z, COSMO['omega_M_0'])  #normalized to 1 at z=0
    #primordial_fgrowth = pb.fgrowth(INITIAL_REDSHIFT, cosmo['omega_M_0']) #normalized to 1 at z=0

    updated = deltax * fgrowth

    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)

    if False:  #velocity information may not be useful for linear field
        plan = Plan(HII_shape, dtype=np.complex64)
        deltak_d = gpuarray.to_gpu(updated.astype(np.complex64))  #was undefined deltax_d; assume the evolved box
        vbox_d = gpuarray.zeros_like(deltak_d)
        plan.execute(deltak_d)
        dDdt_D = np.float32(dDdtoverD(z))  #matching evolve_zeldovich; the original shadowed its own name
        for num, mode in enumerate(['x', 'y', 'z']):
            velocity_kernel(deltak_d,
                            vbox_d,
                            dDdt_D,
                            DIM,
                            np.int32(num),
                            block=block_size,
                            grid=grid_size)
            np.save(
                parent_folder +
                "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                    mode, HII_DIM, BOX_LEN), vbox_d.get())  #was undefined smallvbox_d

    return
Example #2
def gpu_fft(data, inverse=False):
    global plan, ctx, stream  ##cuda
    if not plan:
        print 'building plan', data.shape
        plan = Plan(data.shape, stream=stream, wait_for_finish=True)

    result = gpuarray.zeros_like(data)

    plan.execute(data, data_out=result, inverse=inverse)

    return result
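A minimal usage sketch (hypothetical; it assumes gpu_fft lives in a module whose globals are initialized as below before the first call):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

plan, ctx, stream = None, pycuda.autoinit.context, cuda.Stream()  ##cuda globals used by gpu_fft
data = gpuarray.to_gpu(np.ones((256, 256), dtype=np.complex64))   # power-of-2 shape
spectrum = gpu_fft(data)                      # first call builds and caches the plan
recovered = gpu_fft(spectrum, inverse=True)   # later calls reuse the cached plan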
Example #3
def prep(image, psf):
    datadim1 = image.shape[0]
    datadim2 = image.shape[1]
    if datadim1 != datadim2:
        ddim = max(datadim1, datadim2)
        s = numpy.binary_repr(ddim - 1)
        s = s[:-1] + '0'  # Guarantee that padding is used
    else:
        ddim = datadim1
        s = numpy.binary_repr(ddim - 1)
    if s.find('0') > 0:
        size = 2**len(s)
        boxd = numpy.zeros((size, size))
        r = size - datadim1
        r1 = r2 = r // 2
        if r % 2 == 1:
            r1 = r // 2 + 1
        c = size - datadim2
        c1 = c2 = c // 2
        if c % 2 == 1:
            c1 = c // 2 + 1
        boxdslice = (slice(r1, datadim1 + r1), slice(c1, datadim2 + c1))
        boxd[boxdslice] = image
    else:
        boxd = image
        boxdslice = (slice(0, datadim1), slice(0, datadim2))  #identity crop; otherwise undefined at return

    boxp = boxd * 0.
    if boxd.shape[0] == psf.shape[0]:
        boxp = psf.copy()
    else:
        r = boxp.shape[0] - psf.shape[0]
        r1 = r // 2 + 1
        c = boxp.shape[1] - psf.shape[1]
        c1 = c // 2 + 1
        boxpslice = (slice(r1,
                           psf.shape[0] + r1), slice(c1, psf.shape[1] + c1))
        boxp[boxpslice] = psf.copy()

    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan(boxp.shape, stream=stream)
    gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64))
    plan.execute(gdata)
    return gdata, boxd.shape, boxdslice, plan, stream
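A hedged sketch of how prep's return values can drive an FFT convolution: prep already holds FFT(psf) on the GPU, so the caller transforms the (identically padded) image, multiplies in k-space, inverts, and crops with the returned slice. padded_image below is assumed to be padded by the caller exactly as prep pads boxd.

import numpy
import pycuda.gpuarray as gpuarray

gdata, shape, boxdslice, plan, stream = prep(image, psf)
gimg = gpuarray.to_gpu(padded_image.astype(numpy.complex64))
plan.execute(gimg)                   # image -> k-space, same plan as the psf
gconv = gimg * gdata                 # pointwise multiply = convolution
plan.execute(gconv, inverse=True)    # back to real space
conv = gconv.get().real[boxdslice]   # crop the padding away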
Example #5
    def run(self):
        drv.init()
        a0=numpy.zeros((p,),dtype=numpy.complex64)
        self.dev = drv.Device(self.number)
        self.ctx = self.dev.make_context()
        # Verify that all the memory is freed before the next allocation (this does not happen in multithreading)
        print drv.mem_get_info()
        self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
        self.gpu_b = garray.zeros_like(self.gpu_a)
        self.gpu_a = garray.to_gpu(self.input_cpu)
        plan = Plan(a0.shape,context=self.ctx)
        plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
        self.temp = self.gpu_b.get()
        print self.output_cpu._closed
        self.output_cpu.put(self.temp)
Example #6
def _get_plan(itype, otype, inlen):
    try:
        theplan = _plans[(itype, otype, inlen)]
    except KeyError:
        theplan = Plan(inlen, dtype=itype, normalize=False, fast_math=True)
        _plans.update({(itype, otype, inlen): theplan})

    return theplan
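_get_plan relies on a module-level cache that the snippet does not show; a hypothetical sketch of the assumed setup and call pattern:

import numpy as np
_plans = {}   # assumed module-level cache keyed by (itype, otype, inlen)

p1 = _get_plan(np.complex64, np.complex64, 4096)  # compiles a Plan and caches it
p2 = _get_plan(np.complex64, np.complex64, 4096)  # cache hit: p2 is p1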
Example #7
def conv(delta_d, filt_d, shape, fil):
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(delta_d)  #forward FFT of the density box, in place
	if fil == 'rspace':
		plan.execute(filt_d)  #a real-space filter must be transformed too
	smoothed_d = delta_d * filt_d.conj()  #multiply in k-space
	plan.execute(smoothed_d, inverse=True)  #back to real space
	return smoothed_d.real
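A hypothetical call, smoothing a density cube with a filter already tabulated in k-space (any fil other than 'rspace' skips the extra forward transform of the filter):

import numpy as np
import pycuda.gpuarray as gpuarray

delta_d = gpuarray.to_gpu(delta.astype(np.complex64))  # density box, power-of-2 shape
filt_d = gpuarray.to_gpu(filt_k.astype(np.complex64))  # filter, same shape, already in k-space
smoothed = conv(delta_d, filt_d, delta.shape, 'kspace')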
Example #8
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array
    """

    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)

    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()
    return recenter(grids.real, (npixx//2, npixy//2))
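Hypothetical usage, imaging a small stack of gridded visibility planes (recenter is the project's own helper, assumed imported):

import numpy as np
grids = np.zeros((8, 512, 512), dtype=np.complex64)  # nints x npixx x npixy, powers of 2
images = image_cuda(grids)                           # one recentered real image per plane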
Example #9
def calcCorrShiftGPU(H, G):
	log.info("Using: "+dev.name())
	res = []
	# make sure we get lists of items
	if not isinstance(G, list):
		G = [G]
	if not isinstance(H, list):
		H = [H]
	for H_t in H:
		# set up the plan; the transform length has to be a power of 2
		fftPlan = Plan(len(H_t.dataTrans), wait_for_finish=True)
		
		# H is long and G is short
		log.debug("Starting H_ fft on GPU...")
		
		# push to GPU
		H_gpu = gpuarray.to_gpu(H_t.dataTrans)
		
		# do forwards FFT in place
		fftPlan.execute(H_gpu)
		
		for G_t in G:
			# pad G with zeros to size of H
			log.debug("Starting G_ fft on GPU...")
			G_gpu = gpuarray.to_gpu(G_t.getTransPadded(len(H_t.dataTrans)))
			
			# do forwards FFT in place
			fftPlan.execute(G_gpu)
			
			F_gpu = H_gpu * G_gpu.conj()
			
			fftPlan.execute(F_gpu, inverse=True)
			
			f_host = F_gpu.get()
			
			maxVal = f_host.max()
			k=np.where(f_host==maxVal)
			
			res.append({"H":H_t, "G":G_t, "shift":k[0][0], "corr":maxVal.real})
			#maxVal = gpuarray.max(F_gpu.real)
			#print maxVal
	return res
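For reference, the GPU loop above is just the correlation theorem; a CPU sketch of the same shift estimate with NumPy, for 1-D arrays h and g_padded of equal power-of-2 length:

import numpy as np

def corr_shift(h, g_padded):
    # cross-correlation via IFFT(FFT(h) * conj(FFT(g))); the peak index is the lag
    f = np.fft.ifft(np.fft.fft(h) * np.conj(np.fft.fft(g_padded))).real
    return int(np.argmax(f)), f.max()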
Example #10
def main():
    ## Default input parameter specification
    r=1.0
    nz=100
    G=1.8962

    print("Starting\n")
    start_time = time.time()

    print("Creating Initial Profile\n")
    ##simulation parameter
    n_points = (1024,1024)
    Xmax= (5.0,5.0) #  grid and window
    dx = [2.*Xmax[i]/n_points[i] for i in [0,1]]

    dz=0.003
    beta = 500

    #print "Enter step size [Ldf]\n" 
	#scanf("%lf",&dz 
    #print "Enter number of steps \n" 
	#scanf("%lf",&nz 
	#nz=(int)nz;
	#print "Enter number of critical powers\n" 
	#scanf("%lf",&beta 
    gamma=G*beta

    x=linspace(-Xmax[0],Xmax[0],n_points[0])
    y=linspace(-Xmax[1],Xmax[1],n_points[1])
    
    kx=fftfreq(n_points[0],dx[0])
    ky=fftfreq(n_points[1],dx[1])
    
    X,Y = meshgrid(x,y)
    Kx, Ky = meshgrid(kx,ky)
    
    keepMax = zeros(nz)  #was undefined N_Z

    II_out= zeros(n_points)
    U_m = zeros(n_points, dtype = complex64)
    IM_out= zeros(n_points)
    IF_out= zeros(n_points)
    ufft= zeros(n_points, dtype = complex64)
    ufft_pc= zeros(n_points, dtype = complex64)

	##for (j=0;j<nx;j++) {
	##	x[j]=(double)(-nx/2+j+1)*dx;
	##	kx[j]=(j < nx/2 ) ?
    ##             (pi*(double)(j))*(1./dx/((double)(nx))):
    ##             (pi*(double)(j-nx))*(1./dx/((double)(nx))    ##fx_s=1/dx, dfx= fx_s/N-> d omega= 2pi dfx		
	## ?????	kx[j]=kx[j]*kx[j];
	
    u = exp( -(X**2 + Y**2)/r ) + 0.j
    u = u.astype(complex64)
    
    u_m= zeros(n_points)
    u_f= zeros(n_points)
    
    II_out = (u.real**2 + u.imag**2)  ## ???
	
    steps=2.*nz

    print "Step size %g [Ldf]\n"%dz
    print "Number of critical powers %g\n"%beta
    print "Number of steps %d\n"%N_Z


    ## cuFFT planning and preparation
    # cuda.init()
    dev = pycuda.autoinit.device
    context = make_default_context()
    
    nonlinearMod = SourceModule("""
 #include <pycuda-complex.hpp>

__global__ void nonlinear(pycuda::complex<float> *u_mat, float beta, float dz, pycuda::complex<float> *keepMax, int step)
{
		const int y = blockDim.y * blockIdx.y + threadIdx.y;
		const int x = blockDim.x * blockIdx.x + threadIdx.x;
		float I;
        pycuda::complex<float> I_UNIT(0.,1.);
        int i = x* %(n)d + y;
        
		I=pycuda::abs(u_mat[i]);

		u_mat[i]= u_mat[i]* pycuda::exp(I_UNIT*I*beta*dz);
		
		if ((x==i/2) && (y==i/2))
			keepMax[step]=pycuda::exp(I_UNIT*I*beta*dz);
	
}
    
__global__ void prod(pycuda::complex<float> *X,
                     pycuda::complex<float> *Y, pycuda::complex<float> *Z)
{
		const int y = blockDim.y * blockIdx.y + threadIdx.y;
		const int x = blockDim.x * blockIdx.x + threadIdx.x;
        int i = x*%(n)d + y;
		Z[i]=X[i] * Y[i];
	
    }"""%{'n' : n_points[0]})

    
    print "Device %d: \"%s\" with Compute %d.%d capability\n"%(dev.pci_bus_id,
                                                               dev.name(),
                                                               dev.compute_capability()[0],
                                                               dev.compute_capability()[1])
    print "Creating FFT Plans\n"
	
    
    plan = Plan(n_points, wait_for_finish = True, scale = dx[0]*dx[1])
    block = (16,16,1)   ## threads per block
    grid = (n_points[0]/block[0],
            n_points[1]/block[1])   ## blocks per grid
    fft_g  = lambda x, y: plan.execute(x, y)
    ifft_g = lambda x, y: plan.execute(x, y, inverse = True)
    
    g_mult = nonlinearMod.get_function('prod')
    runNonLinear = nonlinearMod.get_function("nonlinear")
    
    print "Allocating memory on device\n"
    u_gpu = gpuarray.to_gpu(u.astype(complex64))
    U_gpu = gpuarray.to_gpu(zeros(n_points, complex64))


    print "Allocating kx, ky & keepMax\n" 
    cukx = gpuarray.to_gpu(kx)
    cuky = gpuarray.to_gpu(ky)
    cukeepMax = gpuarray.to_gpu(ones(nz, complex64))
 
	## preparing the data to transfer to the device 
    
    IM_out = u.real
    #fileout("A",0.,h_in,nx, ny) 
    
    print "Starting %i FFT pairs\n"%steps
    start = time.time()

    op_diff = exp(5e2j*(Kx**2+Ky**2) *dz/2.)
    op_diff = gpuarray.to_gpu(op_diff.astype(complex64))
    
    zero_j = array([0],dtype = complex64)
    one_j = array([1],dtype = complex64)
    idxdy = array([1./(dx[0]*dx[1])], dtype = complex64)
    dxdy = array([(dx[0]*dx[1])], dtype = complex64)
    
    g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
    context.synchronize()
    #print abs(U_gpu.get())
    #pl.imshow(abs(U_gpu.get()))
    #pl.figure()
    
    for l in xrange(nz):
		## FFT into the spatial frequency domain
        fft_g(u_gpu, U_gpu)
        g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
        context.synchronize()
        
		## inverse FFT into space domain
        ifft_g(U_gpu, u_gpu)
        
    	
		## Nonlinear step in space domain
        runNonLinear(u_gpu, float32(gamma), float32(dz), cukeepMax, int32(l), block = block, grid = grid)
        context.synchronize()
		## FFT back into the spatial frequency domain
        fft_g(u_gpu, U_gpu)
        
        g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
        context.synchronize()
		## inverse FFT into space domain
        ifft_g(U_gpu, u_gpu)
img = Image.open("fox.jpg").convert("L")


# Convert image to numpy array
arr = np.array(img, dtype=np.float32)
pad_arr = pad_power_of_two(arr)  # project helper (defined elsewhere): pads to a power-of-2 shape
pad_arr2 = np.empty_like(pad_arr)


# FFT with CUDA
cuda.init()
ctx = cuda.Device(0).make_context()
strm = cuda.Stream()

pad_arr_gpu = cuda.to_device(pad_arr)
plan = Plan(pad_arr.shape, dtype=np.float32, context=ctx, stream=strm)
plan.execute(pad_arr_gpu)
cuda.memcpy_dtoh(pad_arr2, pad_arr_gpu)
pad_arr3 = np.fft.fftshift(pad_arr2)


# --------------------------------------------------------------------------
# Plot
# --------------------------------------------------------------------------
plt.ion()
fig = plt.figure(figsize=(20, 7))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)
ax1.set_title("Original")
ax2.set_title("FFT")
Example #12
def evolve_zeldovich(z, deltax):
    """First order Zeldovich approximation. """
    if BOX_LEN > DIM:
        print "perturb_field: WARNING: Resolution is likely too low for accurate evolved density fields"
    #move_mass(updated_d, deltax_d, vx_d, vy_d, vz_d, np.float32(1./primordial_fgrowth))
    kernel_source = open(cmd_folder + "/perturb_field.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'HII_DIM': HII_DIM,
        'DIM': DIM,
        'PIXEL_FACTOR': PIXEL_FACTOR
    }
    main_module = nvcc.SourceModule(kernel_code)
    move_mass = main_module.get_function("move_mass")
    velocity_kernel = main_module.get_function("set_velocity")
    filter_kernel = main_module.get_function("filter")
    subsample_kernel = main_module.get_function("subsample")

    fgrowth = np.float32(pb.fgrowth(
        z, COSMO['omega_M_0']))  #normalized to 1 at z=0
    primordial_fgrowth = np.float32(
        pb.fgrowth(INITIAL_REDSHIFT,
                   COSMO['omega_M_0']))  #normalized to 1 at z=0

    vx = np.load(
        parent_folder +
        "/Boxes/vxoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vy = np.load(
        parent_folder +
        "/Boxes/vyoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vz = np.load(
        parent_folder +
        "/Boxes/vzoverddot_{0:d}_{1:.0f}Mpc.npy".format(HII_DIM, BOX_LEN))
    vx_d = gpuarray.to_gpu(vx)
    vy_d = gpuarray.to_gpu(vy)
    vz_d = gpuarray.to_gpu(vz)
    vx_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN
             )  #this is now comoving displacement in units of box size
    vy_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN)
    vz_d *= ((fgrowth - primordial_fgrowth) / BOX_LEN)

    #updated_d = gpuarray.zeros_like(vx_d)
    start = cuda.Event()
    updated_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    delta_d = gpuarray.to_gpu(deltax)
    start.record()
    start.synchronize()

    move_mass(updated_d,
              delta_d,
              vx_d,
              vy_d,
              vz_d,
              primordial_fgrowth,
              block=block_size,
              grid=grid_size)
    updated_d /= MASS_FACTOR
    updated_d -= np.float32(
        1.)  #renormalize to the new pixel size, and make into delta
    updated = updated_d.get_async()
    #import IPython; IPython.embed()
    np.save(
        parent_folder +
        "/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(
            z, HII_DIM, BOX_LEN), updated)

    plan = Plan((DIM, DIM, DIM), dtype=np.complex64)
    delta_d = delta_d.astype(np.complex64)
    #import IPython; IPython.embed()
    vbox_d = gpuarray.zeros_like(delta_d)
    smallvbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
    plan.execute(delta_d)  #now deltak
    dDdt_D = np.float32(dDdtoverD(z))
    #print dDdt_D
    #import IPython; IPython.embed()
    smoothR = np.float32(L_FACTOR * BOX_LEN / HII_DIM)
    for num, mode in enumerate(['x', 'y', 'z']):
        velocity_kernel(delta_d,
                        vbox_d,
                        dDdt_D,
                        DIM,
                        np.int32(num),
                        block=block_size,
                        grid=grid_size)
        filter_kernel(vbox_d,
                      DIM,
                      ZERO,
                      smoothR,
                      block=block_size,
                      grid=grid_size)
        plan.execute(vbox_d, inverse=True)
        subsample_kernel(vbox_d.real,
                         smallvbox_d,
                         DIM,
                         HII_DIM,
                         PIXEL_FACTOR,
                         block=block_size,
                         grid=HII_grid_size)
        np.save(
            parent_folder +
            "/Boxes/updated_v{0}overddot_{1:d}_{2:.0f}Mpc".format(
                mode, HII_DIM, BOX_LEN), smallvbox_d.get())

    return
Example #13
def split_step_GPU(A0, z_array,       # Array for solution points
                t_op = 0, w_op = 0, nlin = 0,  # Constant operators 
                dt = 1,           # sampling time
                t_nl_op = None,   # Additional operator f(A, dt, z)
                apod = True,      # Boundary condition
                varying_operator = False, # Do operators vary in x
                dynamic_predictor = True,
                plot_hook = None, n_plots = 3,  # not used anymore
                tolerance = 0.04, ):
    
    import pycuda.autoinit

    from pycuda.tools import make_default_context, dtype_to_ctype
    import pycuda.gpuarray as gpuarray
    from pycuda import cumath
    from pyfft.cuda import Plan
    from pycuda.compiler import SourceModule
    from pycuda.driver import Context
    from pycuda.elementwise import get_axpbyz_kernel, get_axpbz_kernel, get_binary_op_kernel, get_elwise_kernel,ElementwiseKernel
            
    ## Initialization
    n_points = A0.shape[0]
    # w = fftfreq(npoints, dx) * 2 * pi
    A_t = A0[:] +0.j
    #A_t.dtype = complex64
    A_w = fft(A_t) * dt
    
    
    # Apodization (AK boundary conditions)
    # TODO making it smooth
    apod_array = ones(n_points, dtype = complex64)
    apod_array[0:n_points/50] = 0
    apod_array[-n_points/50:-1] = 0

    z0 = z_array[0]
    zf = z_array[-1]
    
    delta_z = 1.*(z_array[1]-z_array[0])/4
    done_once = False

    #plan = c_uint()
    #dll.cufftPlan1d(byref(plan), n_points, 0x29, 1)
    #fft_g  = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, -1)
    #ifft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, 1)
    
    ## GPU modules #####
    if pycuda.autoinit.context:
        context = pycuda.autoinit.context
    else:
        context =  make_default_context()
    block = (16,1,1)
    grid  = (n_points/block[0], 1)

    ## Init GPU kernels ####
    ## fft, scale dx is included in the definition here
    plan = Plan(n_points,wait_for_finish = True, scale = dt)   
    fft_g  = lambda ain, aout: plan.execute(ain, aout,)
    ifft_g = lambda x, y: plan.execute(x, y, inverse = True)

    ## Multiplication
    prod  = ElementwiseKernel(
            "pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z",
            """
            z[i] = x[i] * y[i];
            """,
            "product",
            preamble = "")
    #prod  = lambda x,y,z: prod(x,y,z, block, grid)
    
    ## Non-linearity
    nonLinear = ElementwiseKernel(
            "pycuda::complex<float> *x, pycuda::complex<float> nlin, pycuda::complex<float> *y, pycuda::complex<float> *z",
            """
            pycuda::complex<float> I_UNIT(0.,1.);
            float I = pycuda::abs(y[i]);
            z[i] = x[i] * pycuda::exp(I_UNIT * I * nlin);
            """,
            "nonLinear",
            preamble = "")
    
    ## Evaluate the solution with current values at the delta_z step
    ## separated so that it can be re-used for error prediction
    ## contains some lazy evaluation just to be CUDA-implementation ready
    ## and to reduce the number of array creations
    
    def f(A_t, A_w, dz = delta_z):
        if f.delta_z != dz:
            f.w_exp = cumath.exp(-1j * dz/2. * w_op)
            f.t_exp = cumath.exp(-1j * dz * t_op)
            f.delta_z = dz
        
        ## Dispersion (I pass)
        f.A_t = A_t
        f.A_w = A_w
        
        #print A_w.get()[n_points/2],     
        prod(A_w, f.w_exp, A_w)
        #A_w = f.w_exp*A_w
        #print A_w.get()[n_points/2],
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        ## Constant potential term
        prod(f.A_t, f.t_exp, f.A_t)

        ## Nonlinear operator as intensity dependency
        if nlin != 0:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * nlin * f.A_t * f.A_t.conj())
        ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z)
        if t_nl_op is not None:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * t_nl_op(f.A_t, dt, z0+delta_z/2) )
        ## Apodization
        if apod:
            prod(f.A_t, apod_array, f.A_t)
            
        fft_g(f.A_t, f.A_w) ## Scale factor included in fft_g
        
        ## Dispersion (II pass)
        prod(f.A_w, f.w_exp, f.A_w)
        
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        return f.A_t, f.A_w

    ## Init the f function
    f.delta_z = 0 # The rest will be evaluated lazily

    ## Convert to GPU arrays
    f.A_t = gpuarray.to_gpu(ones(n_points, complex64))
    f.A_w = gpuarray.to_gpu(ones(n_points, complex64))
    A_t   = gpuarray.to_gpu(A_t.astype(complex64))
    A_w   = gpuarray.to_gpu(A_w.astype(complex64))
    
    
    
    apod_array  = gpuarray.to_gpu(apod_array.astype(complex64))
    if hasattr(w_op,'__len__'):
        w_op = gpuarray.to_gpu(w_op.astype(complex64))
    else:  ## Use an array even if it's a single value, otherwise error when updating dz
        w_op = gpuarray.to_gpu(w_op*ones(n_points).astype(complex64))
    if hasattr(t_op,'__len__'):
        t_op = gpuarray.to_gpu(t_op.astype(complex64))
    else:
        t_op = gpuarray.to_gpu(t_op*ones(n_points).astype(complex64))
    error = tolerance
    
    print "Ready for integration"
    
    ## Init loop variables
    sol_i = 0    
    sols  = [A0]
    iters = 0
    
    ## Integration loop
    while z0 <= zf:
        ## Cycle check
        if z0 >= z_array[sol_i]:
            #print "dz = %.2e error=%.2f z = %.2e"%(delta_z,error,z0)
            sols.append(A_t.get())
            sol_i +=1
            
        try:  ## Force steps to be smaller than the distance between 2 solutions
            while z0 + delta_z >= z_array[sol_i + 1]:
                delta_z /= 2.
        except IndexError:
            pass
    
        ## Dynamical correction
        while dynamic_predictor:
            A_coarse = f(gpuarray.to_gpu(A_t.get()),
                         gpuarray.to_gpu(A_w.get()),
                         dz=2*delta_z)[0].get()
            A_fine = f(*f(gpuarray.to_gpu(A_t.get()),
                          gpuarray.to_gpu(A_w.get()),
                         delta_z), dz=delta_z)[0].get()
            delta = A_fine-A_coarse
            error = sqrt( trapz(delta*delta.conj())/ \
                          trapz(A_fine*A_fine.conj()))
            #print "Error : ",error, " dz :", delta_z
            if error < 2 * tollerance:
                done_once = True
                break  ## Error is less then the tollerance, proceed
            delta_z = delta_z / 2.
        
        # update step        
        A_t, A_w = f(A_t, A_w, delta_z)
        z0 += delta_z
        iters += 1
        
        # Dynamic step (additional correction for faster convergence)
        if (dynamic_predictor or not (done_once or dynamic_predictor) ):
            if error > tollerance:
                delta_z = delta_z / 1.23
            if error < 0.5/tollerance:
                delta_z = delta_z * 1.23
                
        # Show the state of the loop every 200 loops (approx every few secs)
        if iters %200 == 0:
            print "Iter %8d (to end %8d) %4.1f %%"%(iters, 
                            (z_array[-1]-z0)/delta_z,
                            100.*iters/(iters+(z_array[-1]-z0)/delta_z))
    
    ## Integration is over
    print "Total iterations: ", iters
    ## Return array with solutions (and their ftt)
    return sols
Example #14
def simpleFourierTest2D(N=2048):
    """
    Using PyFFT to call CUDA.

    :return:
    """
    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import time

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream)
    x = np.ones((N, N), dtype=np.complex64)

    x_gpu = gpuarray.to_gpu(x)

    plan.execute(x_gpu)
    res = x_gpu.get()
    plan.execute(x_gpu, inverse=True)
    result = x_gpu.get()
    context.pop()

    error = np.abs(np.sum(np.abs(x) - np.abs(result)) / x.size)
    #print 'Error:', error

    #Single precision
    print 'Array size %i x %i' % (N, N)
    print 'Single Precisions'
    x = np.random.random((N, N))
    x = x.astype(np.complex64)

    start = time.time()
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()
    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-2?'
    print np.testing.assert_allclose(xf, result, rtol=1e-2)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime

    #Double precision
    print '\n\nDouble Precision'
    x = np.random.random((N, N))
    x = x.astype(np.complex128)

    start = time.time()

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex128, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()

    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-7?'
    print np.testing.assert_allclose(xf, result, rtol=1e-7)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime
Example #15
def conv_bubbles(I, param_dict, Z, scale=None, fil=1, update=0, LE=False, visualize=False):
	"""uses fft convolution"""
	zeta = 40.
	Lfactor = 0.620350491
	# Z = param_dict['z']
	DELTA_R_FACTOR = 1.05
	print "Using filter_type {}".format(fil)	
	if scale is None:
		scale = param_dict['BoxSize']/param_dict['HIIdim']
	dk = 2*np.pi/(I.shape[0]*scale)  #delta k = 2*pi/BoxSize, in inverse Mpc
	RMAX = np.float32(30) #in Mpc
	RMIN = np.float32(1.)
	mm = mmin(Z, Tvir=1.e4)
	smin = sig0(m2R(mm))
	#smin = pb.sigma_r(m2R(mm), Z, **cosmo)[0]
	deltac = Deltac(Z)
	fgrowth = np.float32(deltac/1.686)
	#fgrowth = 1./pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True)
	fc_mean_ps = pb.collapse_fraction(np.sqrt(smin), deltac).astype(np.float32)  #mean collapse fraction of universe
	print fc_mean_ps
	"""find bubbbles for deltax box I"""
	kernel_source = open("find_bubbles.cu").read()
	kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': RMIN,
        'SMIN': smin, 
        'ZETA': zeta,
        'DELTAK': dk
    }
	main_module = nvcc.SourceModule(kernel_code)
	fcoll_kernel = main_module.get_function("fcoll_kernel")
	update_kernel = main_module.get_function("update_kernel")
	update_sphere_kernel = main_module.get_function("update_sphere_kernel")
	final_kernel = main_module.get_function("final_kernel")
	HII_filter = main_module.get_function("HII_filter")
	# Get contiguous image + shape.
	height, width, depth = I.shape
	HII_TOT_NUM_PIXELS = height*width*depth
	
	
	# Get block/grid size; make sure the dimensions are divisible (currently only powers of 2, so ok)
	block_size =  (8,8,8)
	grid_size =   (width/(block_size[0]),
				height/(block_size[1]),
				depth/(block_size[2]))
	 # Initialize variables.
	#ionized       = np.zeros([height,width,depth]) 
	#ionized       = np.float32(ionized)
	width         = np.int32(width)
	I             = np.float32(I.copy()) 
	if not LE:
		I *= fgrowth #linearly extrapolate the non-linear density to present
	#filt          = np.ones_like(I)


	# Transfer labels asynchronously.
	ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32) 
	delta_d = gpuarray.to_gpu_async(I)
	# I_cu = cu.np_to_array(I, order='C')
	# cu.bind_array_to_texref(I_cu, image_texture)

	fftplan = Plan(I.shape, dtype=np.complex64)
	R = RMAX; cnt = 0

	if visualize:  #default is False; 'is not None' was always true
		fig = plt.figure()
		ax0 = fig.add_subplot(131)
		ax0.set_title('Density')
		mydelta0 = plt.imshow(I.real[width/2])
		plt.colorbar()
		ax1 = fig.add_subplot(132)
		fig.suptitle(" Smoothed Density and Ionization")
		ax1.set_title('smoothed Density')
		mydelta = plt.imshow(delta_d.get().real[width/2])
		plt.colorbar()
		ax2 = fig.add_subplot(133)
		ax2.set_title('Ionization')
		myion = plt.imshow(np.ones_like(I)[width/2])
		plt.colorbar()
		if visualize == 'draw':
			plt.pause(.01)
			plt.draw()
		else:
			plt.savefig('tmp/{0:03d}.png'.format(cnt))

		#plt.colorbar()
	final_step = False
	final_denom = -1
	if RMIN < Lfactor*scale:
		temparg = 2*(smin - sig0(Lfactor*scale) )
		if temparg < 0:
			raise Exception
		else:
			final_denom = np.sqrt(temparg).astype(np.float32)
	while not final_step:
		print 'R={} Mpc'.format(R)
		if (R/DELTA_R_FACTOR) <= (Lfactor*scale) or ((R/DELTA_R_FACTOR) <= RMIN): #stop if reach either rmin or cell size
			final_step = True
		R = np.float32(R)
		Rpix = np.float32(R/scale)

		S0 = np.float32(sig0(R))
		#S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0])
		denom = np.sqrt(2*(smin - S0)).astype(np.float32)
		print 'denom', denom

		start = cu.Event()
		step1 = cu.Event()
		step2 = cu.Event()
		step3 = cu.Event()
		step4 = cu.Event()
		end = cu.Event()

		start.record()
		#smoothed_d = conv(delta_d.astype(np.complex64), I.shape, fil=fil)

		delta_d = gpuarray.to_gpu_async(I).astype(np.complex64)
		fcoll_d = gpuarray.zeros(I.shape, dtype=np.float32)
		start.synchronize()
		if R > 1 or True: # smoothing
			fftplan.execute(delta_d)
			step1.record(); step1.synchronize()
			
			HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size)
			step2.record(); step2.synchronize()
			#import IPython; IPython.embed()
			fftplan.execute(delta_d, inverse=True)

		if not final_step:
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				#fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean  #normalize since we used non-linear density
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			elif update == 1:
				update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size)
			#import IPython; IPython.embed()
		else:
			if (RMIN > Lfactor*scale) or (final_denom < 0): final_denom = denom
			print 'final denom', final_denom
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/np.float32(HII_TOT_NUM_PIXELS)
				#fcollmean = gpuarray.sum((1+delta_d.real)*fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			elif update == 1:
				update_sphere_kernel(ionized_d, fcoll_d, width, Rpix, block=block_size, grid=grid_size)
			final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
		end.record()
		end.synchronize()
		if visualize:
			mydelta.set_data(delta_d.real.get()[width/2])
			myion.set_data(ionized_d.get()[width/2])
			ax1.set_title('R = %f'%(R))
			if visualize == 'draw':
				plt.pause(.01)
				plt.draw()
			else:
				plt.savefig('tmp/{0:03d}.png'.format(cnt))


		R = R/DELTA_R_FACTOR
		cnt +=1 

	ionized = ionized_d.get()
	return ionized
Example #16
def run(xHfile=None, IO_DIR=None):

    if IO_DIR is None:
        IO_DIR = parent_folder
    if not os.path.exists(IO_DIR + "/Outfiles"):
        os.makedirs(IO_DIR + "/Outfiles")
    if xHfile is None:
        xHfile = find_files(IO_DIR + "/Boxes/",
                            pattern="xH*{0:06.2f}_{1:d}_{2:.0f}*".format(
                                Z, HII_DIM, BOX_LEN))[0]
    if xHfile.endswith('.npy'):
        xH = np.load(xHfile)
        p_dict = boxio.parse_filename(os.path.splitext(xHfile)[0])
    else:
        b = boxio.readbox(xHfile)
        xH = b.box_data
        p_dict = b.param_dict
    Z = p_dict['z']
    #growth_factor = pb.fgrowth(Z, COSMO['omega_M_0'], unnormed=True)
    #overwrite global variables
    HII_DIM = p_dict['dim']
    BOX_LEN = np.float32(p_dict['BoxSize'])
    DELTA_K = np.float32(2 * np.pi / BOX_LEN)
    VOLUME = (BOX_LEN * BOX_LEN * BOX_LEN)
    HII_TOT_NUM_PIXELS = HII_DIM**3
    try:
        deltax = np.load(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc.npy".
            format(Z, HII_DIM, BOX_LEN))
    except IOError:
        #deltax = boxio.readbox(IO_DIR+"/Boxes/updated_smoothed_deltax_z{0:.2f}_{1:d}_{2:.0f}Mpc".format(Z, HII_DIM, BOX_LEN)).box_data
        deltax = boxio.readbox(
            IO_DIR +
            "/Boxes/updated_smoothed_deltax_z0{0:.2f}_{1:d}_{2:.0f}Mpc".format(
                Z, HII_DIM *
                2, BOX_LEN)).box_data[:HII_DIM, :HII_DIM, :HII_DIM]

    kernel_source = open(cmd_folder + "/delta_T.cu").read()
    kernel_code = kernel_source % {
        'DELTAK': DELTA_K,
        'VOLUME': VOLUME,
        'NUM_BINS': NUM_BINS
    }
    main_module = nvcc.SourceModule(kernel_code)
    pbox_kernel = main_module.get_function("pbox_kernel")
    #pixel_deltax_d = gpuarray.to_gpu(deltax)
    #pixel_xH_d = gpuarray.to_gpu(xH)

    _const_factor = np.float32(
        27 * (COSMO['omega_b_0'] * COSMO['h'] * COSMO['h'] / 0.023) * np.sqrt(
            (0.15 / COSMO['omega_M_0'] / COSMO['h'] / COSMO['h']) *
            (1 + Z) / 10.0))
    delta_T = np.float32(_const_factor * xH * (1.0 + deltax))  #in mK
    ave = np.mean(delta_T)
    np.save(
        IO_DIR +
        "/Boxes/delta_T_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc.npy"
        .format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN,
                ave, HII_DIM, int(BOX_LEN)), delta_T)

    deldel_T = (delta_T / ave - 1) * VOLUME / HII_TOT_NUM_PIXELS
    if DIMENSIONAL_T_POWER_SPEC:
        deldel_T *= ave

    plan = Plan(HII_shape, dtype=np.complex64)
    deldel_T_d = gpuarray.to_gpu(deldel_T.astype(np.complex64))
    plan.execute(deldel_T_d)
    K = np.float32(
        np.logspace(np.log10(DELTA_K), np.log10(DELTA_K * np.sqrt(3.) * DIM),
                    NUM_BINS))
    K_d = gpuarray.to_gpu(K)
    k_ave_d = gpuarray.zeros_like(K_d)
    in_bin_ct_d = gpuarray.zeros_like(K_d)
    ps_d = gpuarray.zeros_like(K_d)

    pbox_kernel(deldel_T_d,
                DIM,
                ps_d,
                k_ave_d,
                in_bin_ct_d,
                K_d,
                block=block_size,
                grid=HII_grid_size)
    ps = ps_d.get()
    in_bin_ct = in_bin_ct_d.get()
    k_ave = k_ave_d.get()
    k_ave = np.where(in_bin_ct > 0, k_ave / in_bin_ct, 0.)
    ps_ave = np.where(in_bin_ct > 0, ps / in_bin_ct, 0.)
    #ps_fname = "/ps_nov_no_halos_z{0:.2f}_nf{1:f}_useTs{2:d}_zetaX{3:.1e}_TvirminX{4:.1e}_aveTb{5:.2f}_{6:d}_{7:d}Mpc".format(Z, p_dict['nf'], USE_TS_IN_21CM, p_dict['eff'], ION_Tvir_MIN, ave, HII_DIM, np.int32(BOX_LEN))
    #np.savez(IO_DIR+ps_fname, k_ave=k_ave, ps_ave=ps_ave)

    return K, k_ave, ps_ave
Example #17
def conv_bubbles(deltax, param_dict, Z=None, scale=None, fil=1, update=0, LE=False, visualize=0, quiet=False):
	"""
Excursion-set formalism, or Fast Fourier Radiative-Transform. 
Calculates ionization fields from density field provided. 
For each box pixel, it cycles through various bubble radii
  , until it finds the largest radius such that the enclosed collapsed mass fraction 
  (obtained by summing masses from the halo list file of
  halos whose centers are within the bubble, or by taking 
  the mean collapsed mass from conditional press-schechter)
  is larger than 1/ZETA. 

Parameters
----------
deltax : numpy.float32 array
	Real space density box, must have dimensions powers of 2. 
param_dict: python dictionary
	dictionary of parameters created by boxio.parse_filename
Z: float32
	Required if input density is the present day linear density, program would extrapolate to Z. 
fil: int32
	type of filter for smoothing : 0: rtophat; 1: ktophat, 2: Gaussian
update: int32
	Method to update the ionization field 0: center pixel, 1: sphere painting
visualize: bool
	if True, draw slice of density field and created ionization field 
quiet: bool

Returns
----------

ion_field: numpy array float32

"""
	
	if not quiet: 
		print "Using filter_type {}".format(fil)	
	if scale is None:
		scale = param_dict['BoxSize']/param_dict['HIIdim']
	if Z is None:
		Z = param_dict['Z']
	sigmamin, deltac = pb.sig_del(ION_Tvir_MIN, Z, **COSMO)
	fgrowth = np.float32(deltac/1.686)
	smin = sigmamin**2
	fc_mean_ps = pb.collapse_fraction(sigmamin, deltac).astype(np.float32)  #mean collapse fraction of universe

	"""find bubbbles for deltax box I"""
	kernel_source = open(cmd_folder+"/find_bubbles.cu").read()
	kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': R_BUBBLE_MIN,
        'ZETA': ZETA,
        'DELTAK': DELTA_K
    }
	main_module = nvcc.SourceModule(kernel_code)
	fcoll_kernel = main_module.get_function("fcoll_kernel")
	update_kernel = main_module.get_function("update_kernel")
	update_sphere_kernel = main_module.get_function("update_sphere_kernel")
	final_kernel = main_module.get_function("final_kernel")
	HII_filter = main_module.get_function("HII_filter")
	# Get contiguous image + shape.
	height, width, depth = deltax.shape
	HII_TOT_NUM_PIXELS = height*width*depth
	
	
	 # Initialize variables.
	width         = np.int32(width)
	deltax        = np.float32(deltax.copy()) 
	if not LE:
		deltax *= fgrowth #linearly extrapolate the non-linear density to present
	# Transfer asynchronously.
	ionized_d = gpuarray.zeros([height,width,depth], dtype=np.float32)
	delta_d = gpuarray.to_gpu_async(deltax)


	fftplan = Plan(deltax.shape, dtype=np.complex64)
	R = R_BUBBLE_MAX; cnt = 0

	if visualize > 0:
		fig = plt.figure()
		ax1 = fig.add_subplot(121)
		fig.suptitle(" Smoothed Density and Ionization")
		ax1.set_title('Density')
		mydelta = plt.imshow(delta_d.get().real[width/2])
		plt.colorbar()
		ax2 = fig.add_subplot(122)
		ax2.set_title('Ionization')
		myion = plt.imshow(np.ones_like(deltax)[width/2])
		plt.colorbar()
		if visualize == 1:
			plt.pause(.01)
			plt.draw()
		elif visualize == 2:
			plt.savefig('tmp/{0:03d}.png'.format(cnt))

		#plt.colorbar()
	final_step = False
	final_denom = -1
	if R_BUBBLE_MIN < L_FACTOR*scale:
		temparg = 2*(smin - sig0(L_FACTOR*scale) )
		if temparg < 0:
			raise Exception
		else:
			final_denom = np.sqrt(temparg).astype(np.float32)
	while not final_step:
		
		if (R/DELTA_R_FACTOR) <= (L_FACTOR*scale) or ((R/DELTA_R_FACTOR) <= R_BUBBLE_MIN): #stop if reach either rmin or cell size
			final_step = True
		R = np.float32(R)
		S0 = np.float32(sig0(R))
		#S0 = np.float32(pb.sigma_r(R, Z, **cosmo)[0])
		denom = np.sqrt(2*(smin - S0)).astype(np.float32)
		if not quiet:
			print 'R={} Mpc'.format(R)
			print 'denom', denom

		start = cu.Event()
		step1 = cu.Event()
		step2 = cu.Event()
		step3 = cu.Event()
		step4 = cu.Event()
		end = cu.Event()

		start.record()
		delta_d = gpuarray.to_gpu_async(deltax.astype(np.complex64))
		fcoll_d = gpuarray.zeros(deltax.shape, dtype=np.float32)
		start.synchronize()
		fftplan.execute(delta_d)
		step1.record(); step1.synchronize()
		
		HII_filter(delta_d, width, np.int32(fil), R, block=block_size, grid=grid_size)
		step2.record(); step2.synchronize()
		#import IPython; IPython.embed()
		fftplan.execute(delta_d, inverse=True)
		step2.synchronize()
		

		# if not the final step, get ionized regions, if final step paint partial ionizations
		if not final_step:
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean# #normalize since we used non-linear density
				step4.record(); step4.synchronize()
			if update == 0:
				update_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
			else:
				update_sphere_kernel(ionized_d, fcoll_d, width, R, block=block_size, grid=grid_size)
		else:
			if final_denom < 0: final_denom = denom
			# print 'final denom', final_denom
			fcoll_kernel(fcoll_d, delta_d.real, width, denom, block=block_size, grid=grid_size)
			step3.record(); step3.synchronize()
			if not LE:
				fcollmean = gpuarray.sum(fcoll_d).get()/float(HII_TOT_NUM_PIXELS)
				fcoll_d *= fc_mean_ps/fcollmean
				step4.record(); step4.synchronize()
			final_kernel(ionized_d, fcoll_d, width, block=block_size, grid=grid_size)
		end.record()
		end.synchronize()
		if visualize > 0:
			mydelta.set_data(delta_d.real.get()[width/2])
			myion.set_data(ionized_d.get()[width/2])
			ax1.set_title('R = %f'%(R))
			if visualize == 1:
				plt.pause(.01)
				plt.draw()
			elif visualize == 2:
				plt.savefig('tmp/{0:03d}.png'.format(cnt))


		R = R/DELTA_R_FACTOR
		cnt +=1 

	ionized = ionized_d.get()
	return ionized
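The per-radius test that fcoll_kernel and update_kernel implement above is the excursion-set barrier; a CPU sketch of one radius step, assuming the standard conditional Press-Schechter erfc form (delta_sm is the box smoothed on scale R; all names here mirror the function above):

import numpy as np
from scipy.special import erfc

def barrier_step(delta_sm, deltac, smin, S0, zeta):
    fcoll = erfc((deltac - delta_sm) / np.sqrt(2. * (smin - S0)))  # conditional collapse fraction
    return fcoll >= 1. / zeta   # pixels flagged ionized at this radius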
Example #18
def init():
	"""outputs the high resolution k-box, and the smoothed r box"""
	N = np.int32(DIM) #prepare for stitching
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM
	scale = np.float32(BOX_LEN)/DIM
	HII_scale = np.float32(BOX_LEN)/HII_DIM
	shape = (N,N,N)
	
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)

	kernel_source = open(cmd_folder+"/initialize.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'VOLUME': VOLUME,
		'DIM': DIM
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_kernel = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	adj_complex_conj = main_module.get_function("adj_complex_conj")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")

	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d, np.int32(DIM), block=block_size, grid=grid_size)

	#import IPython; IPython.embed()
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	init_kernel(largebox_d_imag, np.int32(DIM), block=block_size, grid=grid_size)

	largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
	largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag

	#adj_complex_conj(largebox_d, DIM, block=block_size, grid=grid_size)
	largebox = largebox_d.get()
	#np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox)

	#save real space box before smoothing
	plan = Plan(shape, dtype=np.complex64)
	plan.execute(largebox_d, inverse=True)  #FFT to real space (box not yet smoothed)
	largebox_d /= scale**3
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc".format(DIM, BOX_LEN), largebox_d.real.get_async())

	#save real space box after smoothing and subsampling
	# host largebox is still in k space, no need to reload from disk
	largebox_d = gpuarray.to_gpu(largebox)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	HII_filter(largebox_d, N, ZERO, smoothR, block=block_size, grid=grid_size)
	plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	largebox_d /= scale**3
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	subsample_kernel(largebox_d.real, smallbox_d, N, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_grid_size) #subsample in real space
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), smallbox_d.get_async())

	# reload the k-space box for velocity boxes
	largebox_d = gpuarray.to_gpu(largebox)
	
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,DIM), dtype=np.complex64)
	smallbox_d = gpuarray.zeros(HII_shape, dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(num), block=block_size, grid=grid_size)
		HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=grid_size)
		plan.execute(largevbox_d, inverse=True)
		largevbox_d /= scale**3
		#import IPython; IPython.embed()
		subsample_kernel(largevbox_d.real, smallbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_grid_size)
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallbox_d.get())

	return
Example #19
sy0 = ny/2 - sny/2
sy1 = ny/2 + sny/2

# initial wavefunction
sigma0 = 50 * dx
k0 = 20
x = np.arange(nx) * dx
y = np.arange(ny) * dy
psi.real[:] = gaussian(x, sigma=sigma0, x0=(sx0+200)*dx)[:,np.newaxis]
psi[:] *= np.exp(1j * k0 * x)[:,np.newaxis]

# cuda init
cuda.init()
ctx = cuda.Device(0).make_context()
strm = cuda.Stream()
plan = Plan((nx, ny), dtype=np.complex64, context=ctx, stream=strm)
psi_gpu = gpuarray.to_gpu(psi)
lcx_gpu = cuda.mem_alloc(nx * np.nbytes['complex64'])
lcy_gpu = cuda.mem_alloc(ny * np.nbytes['complex64'])

# potential
vx0, vwidth = nx/2, 70
vmax = (k0 ** 2) / 2

# plot
'''
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
plt.ion()
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(1,1,1)
Example #20
def get_pyfft_plan(shape):
    try:
        from pyfft.cuda import Plan
    except ImportError:
        return None
    return Plan(shape, normalize=True, wait_for_finish=True)
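Hypothetical usage with a CPU fallback when pyfft or CUDA is unavailable (data stands for any 2-D numpy array; an active CUDA context, e.g. via pycuda.autoinit, is assumed when a plan is returned):

import numpy as np

plan = get_pyfft_plan((512, 512))
if plan is not None:
    import pycuda.gpuarray as gpuarray
    d = gpuarray.to_gpu(data.astype(np.complex64))
    plan.execute(d)
    spectrum = d.get()
else:
    spectrum = np.fft.fft2(data)   # no GPU plan available, fall back to numpy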
Example #21
import numpy
from pyfft.cuda import Plan
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
# w,h,k are the array dimensions in a power of 2
# im1, im2 are the input 3d arrays of dtype complex64
w = h = k = 512
im1 = numpy.random.rand(w, h, k).astype(numpy.complex64)
im2 = numpy.random.rand(w, h, k).astype(numpy.complex64)
%time plan = Plan((w, h, k), normalize=True)
# forward transform on device
%time im1_gpu = gpuarray.to_gpu(im1)
%time plan.execute(im1_gpu)
%time im1_ft = im1_gpu.get()
del im1_gpu
%time im2_gpu = gpuarray.to_gpu(im2)
%time plan.execute(im2_gpu)
%time im2_ft = im2_gpu.get()
del im2_gpu
# do multiplication on host - can be done on device.
%time conv = im1_ft * im2_ft
# inverse transform on device
%time conv_gpu = gpuarray.to_gpu(conv)
# del conv
%time plan.execute(conv_gpu, inverse=True)
%time corr_gpu = conv_gpu.get()
# Reference calculation on CPU:
%time im1_ft = numpy.fft.fftn(im1)
%time im2_ft = numpy.fft.fftn(im2)
%time conv = im1_ft * im2_ft
del im1
Example #23
File: ssf.py Project: cvarin/PyOFTK
def ssfgpuFull(u0, dt, dz, nz, alpha, betap, gamma, context, maxiter = 4, tol = 1e-5, phiNLOut = False):

	'''
	Very simple implementation of the symmetrized split-step Fourier algo.
	Solves the NLS equation with the SPM nonlinear term only.

		* error: third order in step size
		* u0 : Input field
		* dt: Time increment
		* dz: Space increment
		* nz: Number of space propagation steps
		* alpha: Loss/Gain parameter (array)
		* betap: Beta array beta[2] = GVD, beta[3] = TOD, etc...
		* gamma: Nonlinear parameter
		* maxiter: Maximal number of iterations per step (4)
		* tol: Error tolerance for each step (1e-5)
		* phiNLOut: If True return the nonlinear phase shift (True)

		--- GPU Version (float precision) ---
	'''

	nt = len(u0)
	e_ini = pow(abs(u0),2).sum()
	w = wspace(dt*nt,nt)
	phiNL = 0.0

	# Make sure u0 is in single precision
	u0=u0.astype(complex64)
	alpha=alpha.astype(complex64)
	u1 = u0
	uArch = zeros([nz,nt],float32)
	uv = empty_like(u0)

	# Construction of the linear operator
	halfstep = -alpha/2.0	
	if len(betap) != nt:
		for ii in arange(len(betap)):
			halfstep = halfstep - 1.0j*betap[ii]*pow(w,ii)/factorial(ii)
	halfstep = exp(halfstep*dz/2.0).astype(complex64)

	# CUDA Kitchen sink
	fftPlan = Plan((1, nt), dtype=numpy.complex64)

	# Allocate memory to the device
	gpu_halfstep = gpuarray.to_gpu(halfstep)
	gpu_u0 = gpuarray.to_gpu(u0)
	gpu_u1 = gpuarray.to_gpu(u1)
	gpu_uhalf = gpuarray.empty_like(gpu_u0)
	gpu_uv = gpuarray.empty_like(gpu_u0)
	gpu_ufft = gpuarray.empty_like(gpu_u0)
		
	fftPlan.execute(gpu_u0, gpu_ufft)
	
	# GPU Kernel corresponding to the linear operator
	halfStepKernel = ElementwiseKernel("pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf",
		"uhalf[i] = u[i] * halfstep[i]",
		"halfstep_linear",
		preamble="#include <pycuda-complex.hpp>",)
	
	# GPU Kernel corresponding to the nonlinear operator
	nlKernel = ElementwiseKernel("pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz",
		"""
		float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2);
		float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2);
		float realArg = -gamma*(u1_int + u0_int)*dz;
		float euler1 = cos(realArg);
		float euler2 = sin(realArg);
		uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2;
		uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2;
		""",
		"halfstep_nonlinear",
		preamble="#include <pycuda-complex.hpp>",)
	
	# GPU reduction kernel computing the error between two complex array
	computeError = ReductionKernel(numpy.float32, neutral="0",
		reduce_expr="a+b", map_expr="pow(abs(a[i] - b[i]),2)",
		arguments="pycuda::complex<float> *a, pycuda::complex<float> *b",
		name="error_reduction",
		preamble="#include <pycuda-complex.hpp>",)

	# Perform a deep copy of a complex gpuarray
	complexDeepCopy = ElementwiseKernel("pycuda::complex<float> *u1, pycuda::complex<float> *u2",
		"u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im",
		"gpuarray_deepcopy",
		preamble="#include <pycuda-complex.hpp>",)
	
	# Main Loop
	for iz in arange(nz):
		# First application of the linear operator
		halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf)
		fftPlan.execute(gpu_uhalf, inverse=True)
		for ii in arange(maxiter):
			# Apply the nonlinear operator, approximating the integral of N(z)dz
			# with the trapezoid method
			nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma), float(dz/2.0))
			fftPlan.execute(gpu_uv)
			# Second application of the linear operator
			halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft)
			fftPlan.execute(gpu_ufft, gpu_uv, inverse=True)

			error = computeError(gpu_u1, gpu_uv).get() / e_ini

			if (error < tol):
				complexDeepCopy(gpu_u1, gpu_uv)
				break
			else:
				complexDeepCopy(gpu_u1, gpu_uv)
		
		if (ii >= maxiter-1):
			raise Exception, "Failed to converge"

		complexDeepCopy(gpu_u0, gpu_u1)
		uArch[iz] = pow(abs(gpu_u1.get()),2)

	u1 = gpu_u1.get()
	
	if phiNLOut:
		return [u1, uArch, phiNL]
	else:
		return [u1, uArch]
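A hypothetical call pattern for ssfgpuFull (parameter values here are illustrative, not from PyOFTK), assuming an initialized pycuda context:

import numpy
import pycuda.autoinit

nt, dt = 2**12, 0.01
t = (numpy.arange(nt) - nt/2) * dt
u0 = numpy.exp(-t**2 / 2).astype(numpy.complex64)   # Gaussian input pulse
alpha = numpy.zeros(nt, numpy.complex64)            # lossless fiber
betap = numpy.array([0., 0., -1.])                  # beta[2] = -1: anomalous GVD
u1, uArch = ssfgpuFull(u0, dt, dz=0.01, nz=100, alpha=alpha, betap=betap,
                       gamma=1.0, context=pycuda.autoinit.context)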
Example #24
def init_stitch(N):
	"""outputs the high resolution k-box, and the smoothed r box

	Input
	-----------
	N:  int32
		size of box to load onto the GPU, should be related to DIM by powers of 2

	"""
	if N is None:
		N = np.int32(HII_DIM) #prepare for stitching
	META_GRID_SIZE = DIM/N
	M = np.int32(HII_DIM/META_GRID_SIZE)
	#HII_DIM = np.int32(HII_DIM)
	f_pixel_factor = DIM/HII_DIM;
	scale = np.float32(BOX_LEN/DIM)
	print 'scale', scale
	HII_scale = np.float32(BOX_LEN/HII_DIM)
	shape = (DIM,DIM,N)
	stitch_grid_size = (DIM/(block_size[0]),
						DIM/(block_size[0]),
						N/(block_size[0]))
	HII_stitch_grid_size = (HII_DIM/(block_size[0]),
						HII_DIM/(block_size[0]),
						M/(block_size[0]))
	#ratio of large box to small size
	kernel_source = open(cmd_folder+"/initialize_stitch.cu").read()
	kernel_code = kernel_source % {

		'DELTAK': DELTA_K,
		'DIM': DIM, 
		'VOLUME': VOLUME,
		'META_BLOCKDIM': N
	}
	main_module = nvcc.SourceModule(kernel_code)
	init_stitch = main_module.get_function("init_kernel")
	HII_filter = main_module.get_function("HII_filter")
	subsample_kernel = main_module.get_function("subsample")
	velocity_kernel = main_module.get_function("set_velocity")
	pspec_texture = main_module.get_texref("pspec")
	MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0)
	plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64)
	plan1d = Plan(np.int64(DIM), dtype=np.complex64)
	print "init pspec"
	interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array
	interp_cu = cuda.matrix_to_array(interpPspec, order='F')
	cuda.bind_array_to_texref(interp_cu, pspec_texture)
	#hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64')
	hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
	hbox_small = np.zeros(HII_shape, dtype=np.float32)  #needed by the downsampling loop below
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)

	# Set up pinned memory for transfer
	#largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize())
	largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32)
	largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64)

	largebox_d = gpuarray.zeros(shape, dtype=np.float32)
	largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32)
	print "init boxes"
	for meta_z in xrange(META_GRID_SIZE):
		# MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3)
		init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size)
		largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32)
		largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32)
		# Combine into a complex slab without rebinding largebox_d,
		# which must stay float32 for the next init_stitch pass
		largecbox_d = largebox_d + np.complex64(1.j) * largebox_d_imag
		largecbox_d.get(ary=largecbox_pin)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin
	# Save the k-space box; it is reloaded later for the velocity fields
	if True:
		print "saving kbox"
		np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)

	print "Executing FFT on device"
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print hbox_large.dtype
	print "Finished FFT on device"
	np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large)
	
	if True:
		print "loading kbox"
		hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	for meta_z in xrange(META_GRID_SIZE):
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size)
		hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async()
	#import IPython; IPython.embed()
	print "Executing FFT on host"
	#hbox_large = hifft(hbox_large).astype(np.complex64).real
	#hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real
	hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real
	print "Finished FFT on host"
	#import IPython; IPython.embed()

	# for meta_x in xrange(META_GRID_SIZE):
	# 	for meta_y in xrange(META_GRID_SIZE):
	# 		for meta_z in xrange(META_GRID_SIZE):
	# 			largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N])
	# 			HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size);
	# 			hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get()
	#plan = Plan(shape, dtype=np.complex64)
	#plan.execute(largebox_d, inverse=True)  #FFT to real space of smoothed box
	#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
	print "downsampling"
	smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32)
	for meta_z in xrange(META_GRID_SIZE):
		# Upload the real-space slab, normalize, and subsample on the GPU
		largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
		largebox_d /= scale**3
		subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space
		hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async()
	np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small)
	#import IPython; IPython.embed()


	# To get velocities: reload the k-space box
	hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN))
	hvbox_small = np.zeros(HII_shape, dtype=np.float32)
	smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM)
	largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64)
	smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32)
	for num, mode in enumerate(['x', 'y', 'z']):
		# k-space velocity slabs are complex; reallocate each mode because
		# fft_stitch below rebinds hvbox_large to its real part
		hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64)
		for meta_z in xrange(META_GRID_SIZE):
			largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			#largebox_d /=  VOLUME  #divide by VOLUME if using fft (vs ifft)
			velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size)
			HII_filter(largevbox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size)
			hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async()
		hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real
		for meta_z in xrange(META_GRID_SIZE):
			# Separate real buffer so largevbox_d stays complex for the next mode
			realvbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy())
			subsample_kernel(realvbox_d, smallvbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size)
			hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async()
		# Save the full downsampled velocity box, not just the last slab
		np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), hvbox_small)

	return
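
init_stitch above keeps only a DIM x DIM x N slab on the device at a time, stitching a DIM**3 box piecewise so the full box can exceed GPU memory. A minimal sketch of that slab pattern in plain numpy (process_slab is a stand-in for the CUDA kernels; the names and sizes are illustrative):

import numpy as np

def process_in_slabs(box, N, process_slab):
    # Apply process_slab to successive DIM x DIM x N slabs of a DIM**3 box
    DIM = box.shape[0]
    assert DIM % N == 0, "N must divide DIM"
    for meta_z in range(DIM // N):
        slab = box[:, :, meta_z*N:(meta_z+1)*N].copy()        # host->device upload in the GPU code
        box[:, :, meta_z*N:(meta_z+1)*N] = process_slab(slab) # device->host download

box = np.random.standard_normal((8, 8, 8)).astype(np.float32)
process_in_slabs(box, 2, lambda s: 0.5 * s)   # stand-in for filtering/subsampling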
Example #25
0

from time import time as t

import numpy as np
from numpy.fft import ifft2

a = np.random.rand(128, 128).astype(np.complex64)  # test array matching the plan shape below

t1 = t()
ifft2(a)
print 'NUMPY took', t()-t1

from pyfft.cuda import Plan
import pycuda.driver as cuda
from pycuda.tools import make_default_context
import pycuda.gpuarray as gpuarray

cuda.init()
context = make_default_context()
stream = cuda.Stream()

plan = Plan((128,128), stream=stream)

t1 = t()
gpu_data = gpuarray.to_gpu(a)
print 'togpu took', t()-t1
plan.execute(gpu_data)
result = gpu_data.get()

# Average two full upload+FFT+readback runs
gpu_data = gpuarray.to_gpu(a)
plan.execute(gpu_data)
result = gpu_data.get()

print 'CUDA took', (t()-t1)/2, result.shape

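
The wall-clock numbers above fold transfers and host prints into the measurement. To time only the device-side FFT, CUDA events are more reliable; a sketch reusing plan, stream, a, and the gpuarray import from the example above (the event API is standard pycuda):

import pycuda.driver as cuda

start, end = cuda.Event(), cuda.Event()

gpu_data = gpuarray.to_gpu(a)   # upload excluded from the timing
start.record(stream)
plan.execute(gpu_data)
end.record(stream)
end.synchronize()               # wait for the FFT to finish
print 'FFT alone took %.3f ms' % start.time_till(end)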
Example #26
0
File: ssf.py Project: cvarin/PyOFTK
def ssfgpuFull(u0,
               dt,
               dz,
               nz,
               alpha,
               betap,
               gamma,
               context,
               maxiter=4,
               tol=1e-5,
               phiNLOut=False):
    '''
	Very simple implementation of the symmetrized split-step Fourier algorithm.
	Solves the NLS equation with the SPM nonlinear term only.

		* error: third order in step size
		* u0: Input field
		* dt: Time increment
		* dz: Space increment
		* nz: Number of space propagation steps
		* alpha: Loss/Gain parameter (array)
		* betap: Beta array, beta[2] = GVD, beta[3] = TOD, etc.
		* gamma: Nonlinear parameter
		* maxiter: Maximum number of iterations per step (default 4)
		* tol: Error tolerance for each step (default 1e-5)
		* phiNLOut: If True, also return the nonlinear phase shift (default False)

		--- GPU Version (single precision) ---
	'''

    nt = len(u0)
    e_ini = pow(abs(u0), 2).sum()
    w = wspace(dt * nt, nt)
    phiNL = 0.0  # not accumulated in this GPU version

    # Make sure u0 is in single precision
    u0 = u0.astype(complex64)
    alpha = alpha.astype(complex64)
    u1 = u0
    uArch = zeros([nz, nt], float32)
    uv = empty_like(u0)

    # Construction of the linear operator
    halfstep = -alpha / 2.0
    if len(betap) != nt:
        for ii in arange(len(betap)):
            halfstep = halfstep - 1.0j * betap[ii] * pow(w, ii) / factorial(ii)
    halfstep = exp(halfstep * dz / 2.0).astype(complex64)

    # CUDA Kitchen sink
    fftPlan = Plan((1, nt), dtype=numpy.complex64)

    # Allocate memory to the device
    gpu_halfstep = gpuarray.to_gpu(halfstep)
    gpu_u0 = gpuarray.to_gpu(u0)
    gpu_u1 = gpuarray.to_gpu(u1)
    gpu_uhalf = gpuarray.empty_like(gpu_u0)
    gpu_uv = gpuarray.empty_like(gpu_u0)
    gpu_ufft = gpuarray.empty_like(gpu_u0)

    fftPlan.execute(gpu_u0, gpu_ufft)

    # GPU Kernel corresponding to the linear operator
    halfStepKernel = ElementwiseKernel(
        "pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf",
        "uhalf[i] = u[i] * halfstep[i]",
        "halfstep_linear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU Kernel corresponding to the nonlinear operator
    nlKernel = ElementwiseKernel(
        "pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz",
        """
		float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2);
		float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2);
		float realArg = -gamma*(u1_int + u0_int)*dz;
		float euler1 = cos(realArg);
		float euler2 = sin(realArg);
		uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2;
		uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2;
		""",
        "halfstep_nonlinear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU reduction kernel computing the error between two complex array
    computeError = ReductionKernel(
        numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="pow(abs(a[i] - b[i]),2)",
        arguments="pycuda::complex<float> *a, pycuda::complex<float> *b",
        name="error_reduction",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Perfom a deep copy of a complex gpuarray
    complexDeepCopy = ElementwiseKernel(
        "pycuda::complex<float> *u1, pycuda::complex<float> *u2",
        "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im",
        "gpuarray_deepcopy",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Main Loop
    for iz in arange(nz):
        # First application of the linear operator
        halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf)
        fftPlan.execute(gpu_uhalf, inverse=True)
        for ii in arange(maxiter):
            # Apply the nonlinear operator, approximating the integral of
            # N(z)dz with the trapezoidal rule
            nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma),
                     float(dz / 2.0))
            fftPlan.execute(gpu_uv)
            # Second application of the linear operator
            halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft)
            fftPlan.execute(gpu_ufft, gpu_uv, inverse=True)

            error = computeError(gpu_u1, gpu_uv).get() / e_ini

            if (error < tol):
                complexDeepCopy(gpu_u1, gpu_uv)
                break
            else:
                complexDeepCopy(gpu_u1, gpu_uv)

        if error >= tol:
            raise RuntimeError("Failed to converge in %d iterations" % maxiter)

        complexDeepCopy(gpu_u0, gpu_u1)
        uArch[iz] = pow(abs(gpu_u1.get()), 2)

    u1 = gpu_u1.get()

    if phiNLOut:
        return [u1, uArch, phiNL]
    else:
        return [u1, uArch]
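
A hedged usage sketch for ssfgpuFull: propagate a sech test pulse over a few steps. It assumes ssf.py's own imports (wspace, Plan, the pycuda kernels) are in place; the pulse and fiber parameters are purely illustrative:

import numpy
import pycuda.driver as cuda
from pycuda.tools import make_default_context

cuda.init()
context = make_default_context()

nt = 1024
dt = 0.01                                          # illustrative time grid
T = (numpy.arange(nt) - nt/2) * dt
u0 = (1.0/numpy.cosh(T)).astype(numpy.complex64)   # sech input pulse

alpha = numpy.zeros(nt)                            # lossless fiber
betap = numpy.array([0.0, 0.0, -1.0])              # GVD only, illustrative
u1, uArch = ssfgpuFull(u0, dt, dz=0.001, nz=100, alpha=alpha,
                       betap=betap, gamma=1.0, context=context)
context.pop()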
Example #27
0
from pyfft.cuda import Plan
import numpy
from pycuda.tools import make_default_context
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda

# initialize context
cuda.init()
context = make_default_context()
stream = cuda.Stream()

# create plan
plan = Plan((16, 16), stream=stream)

# prepare data
data = numpy.ones((16, 16), dtype=numpy.complex64)
gpu_data = gpuarray.to_gpu(data)
print gpu_data

# forward transform
plan.execute(gpu_data)
result = gpu_data.get()
print result

# inverse transform
plan.execute(gpu_data, inverse=True)
result = gpu_data.get()
error = numpy.abs(numpy.sum(numpy.abs(data) - numpy.abs(result)) / data.size)
print error < 1e-6

context.pop()
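
One robustness note on the pattern above: if anything between make_default_context() and context.pop() raises, the context leaks until the process exits. Wrapping the work in try/finally (same API as the example, just reorganized) guarantees cleanup:

import numpy
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.tools import make_default_context
from pyfft.cuda import Plan

cuda.init()
context = make_default_context()
try:
    plan = Plan((16, 16), stream=cuda.Stream())
    gpu_data = gpuarray.to_gpu(numpy.ones((16, 16), dtype=numpy.complex64))
    plan.execute(gpu_data)                 # forward
    plan.execute(gpu_data, inverse=True)   # and back
    print gpu_data.get()[0, 0]             # ~ (1+0j) up to float32 error
finally:
    context.pop()                          # always release the context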