def test_against_fft_2d_mgpu(self):
    """Compare an in-place 2-D R2C transform from ``Plan.many`` with NumPy.

    The real input is packed into a complex64 buffer of shape
    ``(rank, rowsize // 2 + 1)``: every pair of consecutive reals in a row
    becomes one complex value (even sample -> real part, odd sample ->
    imaginary part) and the last element of each row is zero padding.
    The same device buffer is then used as both input and output of
    ``plan.forward`` and the result is checked against the first
    ``rowsize // 2 + 1`` columns of ``np.fft.fft2``.
    """
    from pyculib.fft.binding import Plan, CUFFT_R2C

    rank = 64
    rowsize = 64
    N = rank * rowsize
    halfZ = rowsize // 2 + 1

    x = np.arange(N, dtype=np.float32).reshape(rank, rowsize)

    # Reinterpreting the contiguous float32 rows as complex64 pairs up
    # (x[2i], x[2i + 1]) without a Python-level loop.  rowsize is even, so
    # every sample has a partner.  This replaces the former per-element
    # ``np.complex(r, imag)`` calls; ``np.complex`` no longer exists in
    # NumPy >= 1.24.
    xh = np.zeros((rank, halfZ), dtype=np.complex64)
    xh[:, :halfZ - 1] = x.view(np.complex64)

    xf = np.fft.fft2(x)

    # NOTE(review): the trailing ``1, 2`` are presumably the batch count and
    # the number of GPUs -- confirm against Plan.many's signature.
    plan = Plan.many([rank, rowsize], CUFFT_R2C, 1, 2)
    d_x_gpu = plan.to_device(xh)
    xf_gpu = np.zeros(shape=(rank, halfZ), dtype=np.complex64)

    # In place: the packed input buffer also receives the spectrum.
    plan.forward(d_x_gpu, d_x_gpu)
    d_x_gpu.copy_to_host(xf_gpu)

    self.assertTrue(np.allclose(xf[:, 0:halfZ], xf_gpu, atol=1e-6))
def test_against_fft_1d_mgpu(self): return True from pyculib.fft.binding import Plan, CUFFT_R2C, CUFFT_C2R N = 32 x = np.arange(N, dtype=np.float32) halfZ = N // 2 + 1 xh = np.arange(halfZ, dtype=np.complex64) for i in range(halfZ - 1): r = x[2 * i] if 2 * i + 1 < N: imag = x[2 * i + 1] else: imag = 0 xh[i] = np.complex(r, imag) xh[halfZ - 1] = 0 print(x) print(xh) xf = np.fft.fft(x) plan = Plan.many([N], CUFFT_R2C, 1, 2) d_x_gpu = plan.to_device(xh) xf_gpu = np.zeros(halfZ, dtype=np.complex64) #d_xf_gpu = plan.to_device(xf_gpu) plan.forward(d_x_gpu, d_x_gpu) d_x_gpu.copy_to_host(xf_gpu) self.assertTrue(np.allclose(xf[0:halfZ], xf_gpu, atol=1e-6))
def test_plan2d(self):
    """Round-trip a 2 x 8 complex64 array through a 2-D C2C plan, in place.

    The array is sent to the device, transformed forward and then inverse
    using the same buffer for input and output, copied back, divided by the
    element count, and its real part is compared with the original values.
    """
    from pyculib.fft.binding import CUFFT_C2C, Plan

    total = 2 ** 4
    host = np.arange(total, dtype=np.complex64).reshape(2, total // 2)
    expected = host.copy()

    device = cuda.to_device(host)
    rows, cols = host.shape
    plan = Plan.two(CUFFT_C2C, rows, cols)

    # Forward then inverse, both in place on the same device buffer.
    plan.forward(device, device)
    plan.inverse(device, device)
    device.copy_to_host(host)

    # The round-trip output is divided by the element count before comparing.
    roundtrip = host / total
    matches = np.allclose(expected, roundtrip.real)
    self.assertTrue(matches)
def test_against_fft_1d(self):
    """Compare a 1-D R2C transform of 128 float32 samples with ``np.fft.fft``.

    Only the first ``N // 2 + 1`` reference coefficients are compared,
    matching the length of the complex64 output buffer given to the plan.
    """
    from pyculib.fft.binding import CUFFT_R2C, Plan

    N = 128
    n_out = N // 2 + 1

    samples = np.arange(N).astype(np.float32)
    reference = np.fft.fft(samples)

    # Separate device buffers for the real input and the complex output.
    d_in = cuda.to_device(samples)
    spectrum = np.zeros(n_out, np.complex64)
    d_out = cuda.to_device(spectrum)

    r2c = Plan.many(samples.shape, CUFFT_R2C)
    r2c.forward(d_in, d_out)
    d_out.copy_to_host(spectrum)

    agrees = np.allclose(reference[0:n_out], spectrum, atol=1e-6)
    self.assertTrue(agrees)
def test_plan1d(self):
    """Round-trip 10 complex64 values through a 1-D C2C plan, in place.

    The values are uploaded with the plan's own ``to_device``, transformed
    forward and inverse on the same buffer, copied back, divided by the
    length, and the real part is compared with the original values.
    """
    from pyculib.fft.binding import CUFFT_C2C, Plan

    length = 10
    host = np.arange(length, dtype=np.complex64)
    expected = host.copy()

    plan = Plan.one(CUFFT_C2C, length)
    device = plan.to_device(host)

    # Forward then inverse, both in place on the same device buffer.
    plan.forward(device, device)
    plan.inverse(device, device)
    device.copy_to_host(host)

    # The round-trip output is divided by the length before comparing.
    roundtrip = host / length
    matches = np.allclose(expected, roundtrip.real)
    self.assertTrue(matches)
def test_against_fft_2d(self):
    """Compare a 2-D R2C transform of a 2 x 128 float32 array with NumPy.

    The reference is ``np.fft.fft2``; only its first ``rowsize // 2 + 1``
    columns are compared, matching the shape of the complex64 output
    buffer given to the plan.
    """
    from pyculib.fft.binding import CUFFT_R2C, Plan

    rank = 2
    rowsize = 128
    n_cols_out = rowsize // 2 + 1

    grid = np.arange(rowsize * rank, dtype=np.float32).reshape(rank, rowsize)
    reference = np.fft.fft2(grid)

    # Separate device buffers for the real input and the complex output.
    d_in = cuda.to_device(grid)
    spectrum = np.zeros(shape=(rank, n_cols_out), dtype=np.complex64)
    d_out = cuda.to_device(spectrum)

    r2c = Plan.many(grid.shape, CUFFT_R2C)
    r2c.forward(d_in, d_out)
    d_out.copy_to_host(spectrum)

    agrees = np.allclose(reference[:, 0:n_cols_out], spectrum, atol=1e-6)
    self.assertTrue(agrees)
def test_against_fft_3d(self):
    """Compare a 3-D R2C transform of a 2 x 2 x 64 float32 array with NumPy.

    The reference is ``np.fft.fftn``; only its first ``rowsize // 2 + 1``
    entries along the last axis are compared, matching the shape of the
    complex64 output buffer given to the plan.
    """
    from pyculib.fft.binding import CUFFT_R2C, Plan

    depth, colsize, rowsize = 2, 2, 64
    halfZ = rowsize // 2 + 1

    volume = np.arange(depth * colsize * rowsize, dtype=np.float32)
    volume = volume.reshape(depth, colsize, rowsize)
    reference = np.fft.fftn(volume)

    # Both buffers are uploaded through the plan's own to_device helper.
    r2c = Plan.many(volume.shape, CUFFT_R2C)
    d_in = r2c.to_device(volume)
    spectrum = np.zeros(shape=(depth, colsize, halfZ), dtype=np.complex64)
    d_out = r2c.to_device(spectrum)

    r2c.forward(d_in, d_out)
    d_out.copy_to_host(spectrum)

    agrees = np.allclose(reference[:, :, 0:halfZ], spectrum, atol=1e-6)
    self.assertTrue(agrees)
def test_against_fft_3d_mgpu(self):
    """Compare an in-place 3-D R2C transform from ``Plan.many`` with NumPy.

    The real input is packed into a complex64 buffer of shape
    ``(depth, colsize, rowsize // 2 + 1)``: along the last axis every pair
    of consecutive reals becomes one complex value (even sample -> real
    part, odd sample -> imaginary part) and the final element is zero
    padding.  The same device buffer is used as both input and output of
    ``plan.forward`` and the result is checked against the first
    ``rowsize // 2 + 1`` last-axis entries of ``np.fft.fftn``.
    """
    from pyculib.fft.binding import Plan, CUFFT_R2C

    depth = 32
    colsize = 32
    rowsize = 32
    N = depth * colsize * rowsize
    halfZ = rowsize // 2 + 1

    x = np.arange(N, dtype=np.float32).reshape(depth, colsize, rowsize)

    # Reinterpreting the contiguous float32 rows as complex64 pairs up
    # (x[2i], x[2i + 1]) without the former triple Python loop.  rowsize is
    # even, so every sample has a partner.  This also removes the calls to
    # ``np.complex``, which no longer exists in NumPy >= 1.24.
    xh = np.zeros((depth, colsize, halfZ), dtype=np.complex64)
    xh[:, :, :halfZ - 1] = x.view(np.complex64)

    xf = np.fft.fftn(x)

    # NOTE(review): the trailing ``1, 2`` are presumably the batch count and
    # the number of GPUs -- confirm against Plan.many's signature.
    plan = Plan.many([depth, colsize, rowsize], CUFFT_R2C, 1, 2)
    d_x_gpu = plan.to_device(xh)
    xf_gpu = np.zeros(shape=(depth, colsize, halfZ), dtype=np.complex64)

    # In place: the packed input buffer also receives the spectrum.
    plan.forward(d_x_gpu, d_x_gpu)
    d_x_gpu.copy_to_host(xf_gpu)

    self.assertTrue(np.allclose(xf[:, :, 0:halfZ], xf_gpu, atol=1e-6))
# f = fft.FFTPlan(img_shape, np.complex64, np.complex64, 1, 0, fft.FFTPlan.MODE_FFTW_PADDING) from pyculib.fft.binding import Plan, CUFFT_C2C from pyculib import blas as cublas n = (128 * 10)**2 data1 = np.arange(n, dtype=np.complex64).reshape(2, n // 2) data = np.arange(n, dtype=np.complex64) orig = data.copy() d_data = cuda.to_device(data) #s0 = cuda.stream() # cuda.select_device(1) # d_data1 = cuda.to_device(data) #s1 = cuda.stream() # fftplan = Plan.one(CUFFT_C2C, *data.shape) # Plan.many() fftplan1 = Plan.many(data.shape, CUFFT_C2C, 1500) b = cublas.Blas() rounds = 10000 start = time.clock() for x in range(rounds): # fft.fft_inplace(img) # cuda.select_device(0) # fftplan1.forward(d_data, d_data) # fftplan1.inverse(d_data, d_data) # cuda.select_device(1) # fftplan1.forward(d_data1, d_data1) #fftplan1.forward(d_data1, d_data1) # fftplan.inverse(d_data, d_data) # d_data = cuda.to_device(data) # cublas.dot(d_data, d_data)
arg3 = clip(max((-z + r1 + r2)*(z + r1 - r2)*(z - r1 + r2)*(z + r1 + r2), 0.),-1,1) if (r1 <= r2 - z) : return math.pi*r1*r1 # planet completely overlaps stellar circle elif (r1 >= r2 + z) : return math.pi*r2*r2 # stellar circle completely overlaps planet else : return r1*r1*math.acos(arg1) + r2*r2*math.acos(arg2) - 0.5*math.sqrt(arg3) # partial overlap #################### # GPU functions ################## if numba.cuda.is_available(): # FFT plan from pyculib.fft.binding import Plan, CUFFT_C2C fftplan17 = Plan.one(CUFFT_C2C, 2**17) fftplan18 = Plan.one(CUFFT_C2C, 2**18) @numba.cuda.jit('float64(float64,float64,float64)', device=True, inline=True) def d_clip(a, b, c): if (a < b) : return b elif (a > c) : return c else : return a @numba.cuda.jit('float64(float64,float64,float64)', device=True, inline=True) def d_area(z, r1, r2): arg1 = d_clip((z*z + r1*r1 - r2*r2)/(2.*z*r1),-1,1) arg2 = d_clip((z*z + r2*r2 - r1*r1)/(2.*z*r2),-1,1) arg3 = d_clip(max((-z + r1 + r2)*(z + r1 - r2)*(z - r1 + r2)*(z + r1 + r2), 0.),-1,1)