def test_2d_real_to_complex_double(self, ctx):
    """Check a float64 2D real-to-complex FFT against ``np.fft.rfft2``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``. The test
        returns early, asserting nothing, when ``has_double(ctx)`` is false.
    """
    if not has_double(ctx):  # TODO: find better way to skip test
        return
    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float64)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # The complex output holds N // 2 + 1 entries along the last axis.
    cl_data_transformed = cla.zeros(
        queue, (M, N // 2 + 1), dtype=np.complex128)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=(1, 0))
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-8, atol=1e-8)
def test_2d_in_4d_out_of_place(self, ctx):
    """Check 2D FFTs over the last two axes of a 4D complex64 array.

    The transform writes into a separate output array and is compared
    with ``np.fft.fft2`` evaluated over the same axes.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    L1 = 4
    L2 = 5
    M = 64
    N = 32
    axes = (-1, -2)  # ok
    # axes = (0, 1)  # ok
    # axes = (0, 2)  # cannot be collapsed

    nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
    nd_data.shape = (L1, L2, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=axes)
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    # The printed reference uses the same axes as the asserted one.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_in_4d_out_of_place(self, ctx):
    """Check 2D FFTs over the last two axes of a 4D complex64 array.

    The transform writes into a separate output array and is compared
    with ``np.fft.fft2`` evaluated over the same axes.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    L1 = 4
    L2 = 5
    M = 64
    N = 32
    axes = (-1, -2)  # ok
    # axes = (0, 1)  # ok
    # axes = (0, 2)  # cannot be collapsed

    nd_data = np.arange(L1 * L2 * M * N, dtype=np.complex64)
    nd_data.shape = (L1, L2, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=axes,
    )
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    # The printed reference uses the same axes as the asserted one.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_real_to_complex_double(self, ctx):
    """Check a float64 2D real-to-complex FFT against ``np.fft.rfft2``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``. The test
        returns early, asserting nothing, when ``has_double(ctx)`` is false.
    """
    if not has_double(ctx):  # TODO: find better way to skip test
        return
    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float64)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # The complex output holds N // 2 + 1 entries along the last axis.
    cl_data_transformed = cla.zeros(
        queue, (M, N // 2 + 1), dtype=np.complex128)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=(1, 0),
    )
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-8, atol=1e-8)
def __init__(self, decomp, context, queue, grid_shape, dtype):
    """Allocate the transform arrays and build the forward/backward plans.

    :arg decomp: stored unchanged as ``self.decomp``.
    :arg context: passed to both ``gpyfft.FFT`` constructors.
    :arg queue: used to allocate ``fx``/``fk``, to build the plans and to
        compute ``sub_k``.
    :arg grid_shape: shape of the position-space array ``fx``.
    :arg dtype: datatype of ``fx``; a dtype of kind ``"f"`` selects a
        real transform (``is_real``).
    """
    from pystella.fourier import (
        get_complex_dtype_with_matching_prec,
        get_real_dtype_with_matching_prec,
    )

    self.decomp = decomp
    self.grid_shape = grid_shape
    self.dtype = np.dtype(dtype)
    self.is_real = self.dtype.kind == "f"
    self.cdtype = get_complex_dtype_with_matching_prec(self.dtype)
    self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

    # fx is the position-space array, fk the (complex) transformed one.
    self.fx = cla.zeros(queue, grid_shape, dtype)
    self.fk = cla.zeros(queue, self.shape(self.is_real), self.cdtype)

    from gpyfft import FFT

    # Both plans share the same options; only the array roles are swapped.
    plan_options = dict(real=self.is_real, scale_forward=1, scale_backward=1)
    self.forward = FFT(
        context, queue, self.fx, out_array=self.fk, **plan_options)
    self.backward = FFT(
        context, queue, self.fk, out_array=self.fx, **plan_options)

    empty_slices = ((), (), ())
    self.sub_k = get_sliced_momenta(
        grid_shape, self.dtype, empty_slices, queue)
def test_2d_out_of_place(self, ctx):
    """Check 2D FFTs over the last two axes of a 3D complex64 array.

    The transform writes into a separate output array and is compared
    with ``np.fft.fft2`` evaluated over the same axes.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    L = 4
    M = 64
    N = 32
    axes = (-1, -2)

    nd_data = np.arange(L * M * N, dtype=np.complex64)
    nd_data.shape = (L, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=axes,
    )
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    # The printed reference uses the same axes as the asserted one.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_real_to_complex(self, ctx):
    """Check a float32 2D real-to-complex FFT against ``np.fft.rfft2``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float32)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # The complex output holds N // 2 + 1 entries along the last axis.
    cl_data_transformed = cla.zeros(
        queue, (M, N // 2 + 1), dtype=np.complex64)

    transform = FFT(
        ctx,
        queue,
        cl_data,
        cl_data_transformed,
        axes=(1, 0),
    )
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_real_to_complex(self, ctx):
    """Check a float32 2D real-to-complex FFT against ``np.fft.rfft2``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    M = 64
    N = 32

    nd_data = np.arange(M * N, dtype=np.float32)
    nd_data.shape = (M, N)
    cl_data = cla.to_device(queue, nd_data)

    # The complex output holds N // 2 + 1 entries along the last axis.
    cl_data_transformed = cla.zeros(
        queue, (M, N // 2 + 1), dtype=np.complex64)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=(1, 0))
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    result = cl_data_transformed.get()
    expected = np.fft.rfft2(nd_data)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_2d_out_of_place(self, ctx):
    """Check 2D FFTs over the last two axes of a 3D complex64 array.

    The transform writes into a separate output array and is compared
    with ``np.fft.fft2`` evaluated over the same axes.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    queue = cl.CommandQueue(ctx)

    L = 4
    M = 64
    N = 32
    axes = (-1, -2)

    nd_data = np.arange(L * M * N, dtype=np.complex64)
    nd_data.shape = (L, M, N)
    cl_data = cla.to_device(queue, nd_data)
    cl_data_transformed = cla.zeros_like(cl_data)

    transform = FFT(ctx, queue, cl_data, cl_data_transformed, axes=axes)
    transform.enqueue()

    # Copy the result back once and print the data itself; printing
    # ``cl_data_transformed.get`` without calling it only shows the method.
    # The printed reference uses the same axes as the asserted one.
    result = cl_data_transformed.get()
    expected = np.fft.fft2(nd_data, axes=axes)
    print(result)
    print(expected)

    assert np.allclose(result, expected, rtol=1e-3, atol=1e-3)
def test_1d_out_of_place(self, ctx):
    """Compare a 1D complex64 FFT written to a second array with NumPy.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    command_queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex64)
    device_input = cla.to_device(command_queue, host_input)
    device_output = cla.zeros_like(device_input)

    # Keep the plan object referenced for the rest of the test.
    plan = FFT(ctx, command_queue, device_input, device_output)
    plan.enqueue()

    computed = device_output.get()
    reference = np.fft.fft(host_input)
    assert np.allclose(computed, reference)
def test_1d_inplace_double(self, ctx):
    """Compare a 1D complex128 FFT with no separate output array to NumPy.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``. The test
        returns early, asserting nothing, when ``has_double(ctx)`` is false.
    """
    if not has_double(ctx):  # TODO: find better way to skip test
        return

    command_queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex128)
    device_data = cla.to_device(command_queue, host_input)

    # Only one array is handed to FFT; the result is read back from it.
    plan = FFT(ctx, command_queue, device_data)
    plan.enqueue()

    computed = device_data.get()
    reference = np.fft.fft(host_input)
    assert np.allclose(computed, reference)
def test_1d_inplace_double(self, ctx):
    """Compare a 1D complex128 FFT with no separate output array to NumPy.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``. The test
        returns early, asserting nothing, when ``has_double(ctx)`` is false.
    """
    if not has_double(ctx):  # TODO: find better way to skip test
        return

    command_queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex128)
    device_data = cla.to_device(command_queue, host_input)

    # Only one array is handed to FFT; the result is read back from it.
    plan = FFT(ctx, command_queue, device_data)
    plan.enqueue()

    computed = device_data.get()
    reference = np.fft.fft(host_input)
    assert np.allclose(computed, reference)
def test_1d_out_of_place(self, ctx):
    """Compare a 1D complex64 FFT written to a second array with NumPy.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    command_queue = cl.CommandQueue(ctx)

    host_input = np.arange(32, dtype=np.complex64)
    device_input = cla.to_device(command_queue, host_input)
    device_output = cla.zeros_like(device_input)

    # Keep the plan object referenced for the rest of the test.
    plan = FFT(ctx, command_queue, device_input, device_output)
    plan.enqueue()

    computed = device_output.get()
    reference = np.fft.fft(host_input)
    assert np.allclose(computed, reference)
def test_1d_real_to_complex(self, ctx):
    """Compare a 1D float32 real-to-complex FFT with ``np.fft.rfft``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    command_queue = cl.CommandQueue(ctx)

    N = 32
    host_input = np.arange(N, dtype=np.float32)
    device_input = cla.to_device(command_queue, host_input)

    # The complex output holds N // 2 + 1 entries.
    output_shape = (N // 2 + 1,)
    device_output = cla.zeros(command_queue, output_shape, dtype=np.complex64)

    plan = FFT(ctx, command_queue, device_input, device_output)
    plan.enqueue()

    computed = device_output.get()
    reference = np.fft.rfft(host_input)
    assert np.allclose(computed, reference)
def test_1d_real_to_complex(self, ctx):
    """Compare a 1D float32 real-to-complex FFT with ``np.fft.rfft``.

    :arg ctx: context passed to ``cl.CommandQueue`` and ``FFT``.
    """
    command_queue = cl.CommandQueue(ctx)

    N = 32
    host_input = np.arange(N, dtype=np.float32)
    device_input = cla.to_device(command_queue, host_input)

    # The complex output holds N // 2 + 1 entries.
    output_shape = (N // 2 + 1,)
    device_output = cla.zeros(command_queue, output_shape, dtype=np.complex64)

    plan = FFT(ctx, command_queue, device_input, device_output)
    plan.enqueue()

    computed = device_output.get()
    reference = np.fft.rfft(host_input)
    assert np.allclose(computed, reference)
def run(double_precision=False):
    """Time gpyfft transforms of a 1024x1024 array and check them with NumPy.

    For every entry of ``axes_list`` the out-of-place section tries all
    combinations of C- and Fortran-ordered input and output arrays, prints
    the timing and the maximum absolute error, and compares the result with
    ``numpy.fft.fftn``. Any exception raised for a combination is printed
    and the loop continues. The in-place section only reports timings.

    :arg double_precision: use ``complex128`` data and tolerances of 1e-8
        when true; otherwise ``complex64`` data and tolerances of 1e-3.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex128 if double_precision else np.complex64
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for testing for correct result

    def time_transform(transform):
        """Return the mean wall time per enqueue in milliseconds."""
        tic = timeit.default_timer()
        for _ in range(n_run):
            events = transform.enqueue()
            # events = transform.enqueue(False)
        # Only the events returned by the final enqueue are waited on.
        for event in events:
            event.wait()
        toc = timeit.default_timer()
        return 1e3 * (toc - tic) / n_run

    def gflops_for(transform, t_ms):
        """Return the GFLOPS figure reported for one timed transform."""
        n_points = np.prod(transform.t_shape)
        return (5e-9 * np.log2(n_points) * n_points
                * transform.batchsize / (1e-3 * t_ms))

    def order_flag(array):
        """Return 'C' for a C-contiguous array and 'F' otherwise."""
        return 'C' if array.flags.c_contiguous else 'F'

    if n_run > 1:
        nd_dataC = np.random.normal(size=(1024, 1024)).astype(dtype)
    else:
        nd_dataC = np.ones((1024, 1024), dtype=dtype)  # set n_run to 1

    nd_dataF = np.asfortranarray(nd_dataC)
    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)
    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(-2, -1), (-1, -2), None]  # batched 2d transforms

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                # Defaults for the summary line when the attempt fails early.
                t_ms, gflops = 0, 0
                try:
                    transform = FFT(context, queue, data, result, axes=axes)
                    # transform.plan.transpose_result = True
                    # not implemented for some transforms
                    # (works e.g. for out of place, (2,1) C C)
                    print(
                        '%-10s %3s %3s' % (
                            axes, order_flag(data), order_flag(result)),
                        end=' ',
                    )

                    t_ms = time_transform(transform)
                    gflops = gflops_for(transform, t_ms)

                    npfft_result = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        npfft_result = np.swapaxes(
                            npfft_result, axes[0], axes[1])

                    # Copy the device result back once for both checks.
                    host_result = result.get()
                    max_error = np.max(abs(host_result - npfft_result))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(host_result, npfft_result,
                                    atol=tolerance, rtol=tolerance)
                except Exception as e:
                    # Best effort: report the failure (GpyFFT_Error,
                    # AssertionError or anything else) and carry on with
                    # the next combination.
                    print(e)
                finally:
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)
    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            # transform.plan.transpose_result = True  # not implemented

            # NOTE: inplace transform fails for n_run > 1
            t_ms = time_transform(transform)
            gflops = gflops_for(transform, t_ms)
            print('%-10s %3s %5.2fms %6.2f Gflops'
                  % (axes, order_flag(data), t_ms, gflops))
def create_workspace(self):
    """
    Allocate the host buffer, device buffers and FFT plans.

    All device arrays are drawn from a memory pool stored in
    ``self._mem_pool``. The accelerator buffers ``y, x1, x0, g1, g0``
    appear to correspond to ``(y_k, x_{k+1}, x_k, g_{k+1}, g_k)`` in the
    iteration scheme below (TODO confirm against the iteration code).

    init
        ...
        x_{k+1} = ref
        ...
        a_k = 0.

    iter 0
        ...
        h_k = None
        y_k = x_{k+1}
        y_k = (a_k + 1) * x_{k+1} - a_k * x_k
        ...
        t = f(x_k)
        ...
        g_k = t - y_k
        x_k = x_{k+1}
        x_{k+1} = y_k + g_k
        ...
        g_{k-2} = g_{k-1}
        g_{k-1} = g_k
        ...
        a_k = 0.

    iter 1
        ...
        h_k = x_{k+1} - x_k
        y_k = x_{k+1}
        ...
        t = f(x_{k+1})
        ...
        g_k = t - y_k
        x_k = x_{k+1}
        x_{k+1} = y_k + g_k
        ...
        g_{k-2} = g_{k-1}
        g_{k-1} = g_k
        ...
        a_k = L(g_{k-1}, g_{k-2})

    iter 2
        ...
        h_k = x_{k+1} - x_k
        y_k = x_{k+1} + a_k * h_k
        ...
        t = f(x_{k+1})
        ...
        g_k = t - y_k
        x_k = x_{k+1}
        x_{k+1} = y_k + g_k
        ...
        g_{k-2} = g_{k-1}
        g_{k-1} = g_k
        ...
        a_k = L(g_{k-1}, g_{k-2})

    iter n (y_k, x_{k+1}, x_k, g_{k+1}, g_k)
        ...
        y_k = (a_k + 1) * x_{k+1} - a_k * x_k
        ...
        t = f(x_{k+1})
        ...
        g_k = g_{k+1}
        g_{k+1} = t - y_k
        ...
        x_k = x_{k+1}
        x_{k+1} = y_k + g_{k+1}
        ...
        a_k = L(g_{k+1}, g_k)

    --> return x_{k+1}
    --> return t, bypass acceleration
    """
    # pre-calculate shapes
    nz, ny, nx = self._out_shape
    real_shape = (nz, ny, nx)
    complex_shape = (nz, ny, nx // 2 + 1)

    # create memory pool
    allocator = cl.tools.ImmediateAllocator(
        self.queue, mem_flags=cl.mem_flags.READ_WRITE)
    self._mem_pool = cl.tools.MemoryPool(allocator)

    def pooled_real():
        """Return a new float32 device array of ``real_shape``."""
        return cl.array.empty(self.queue, real_shape, np.float32,
                              allocator=self._mem_pool)

    def pooled_complex():
        """Return a new complex64 device array of ``complex_shape``."""
        return cl.array.empty(self.queue, complex_shape, np.complex64,
                              allocator=self._mem_pool)

    # TODO wrap this section in ExitStack, callback(destroy_workspace)

    # reference image
    self.h_buf = np.empty(real_shape, dtype=np.float32)
    self.d_ref = pooled_real()

    # otf
    self.d_otf = pooled_complex()

    # deconvolution io buffers
    self.d_dec_bufs = AttrDict()
    self.d_dec_bufs['tmp'] = pooled_real()
    self.d_dec_bufs['fft'] = pooled_complex()

    # deconvolution fft/ifft plans
    self.fft = FFT(self.context, self.queue, self.d_dec_bufs.tmp,
                   out_array=self.d_dec_bufs.fft)
    fft_size = format_byte_size(self.fft.plan.temp_array_size, binary=True)
    logger.debug("fft buffer size: {}".format(fft_size))

    self.ifft = FFT(self.context, self.queue, self.d_dec_bufs.fft,
                    out_array=self.d_dec_bufs.tmp, real=True)
    ifft_size = format_byte_size(self.ifft.plan.temp_array_size, binary=True)
    logger.debug("ifft buffer size: {}".format(ifft_size))

    # accelerator buffers
    self.d_acc_bufs = AttrDict()
    for buffer_name in ('y', 'x1', 'x0', 'g1', 'g0'):
        self.d_acc_bufs[buffer_name] = pooled_real()

    logger.debug("held={}, active={}".format(self._mem_pool.held_blocks,
                                             self._mem_pool.active_blocks))
class gDFT(BaseDFT):
    """
    Fast Fourier transforms computed with :mod:`clfft` by wrapping
    :mod:`gpyfft`.
    See :class:`pystella.fourier.dft.BaseDFT`.

    :arg decomp: A :class:`pystella.DomainDecomposition`.

    :arg context: A :class:`pyopencl.Context`.

    :arg queue: A :class:`pyopencl.CommandQueue`.

    :arg grid_shape: A 3-:class:`tuple` specifying the shape of
        position-space arrays to be transformed.

    :arg dtype: The datatype of position-space arrays to be transformed.
        The complex datatype for momentum-space arrays is chosen to have
        the same precision.

    .. versionchanged:: 2020.1

        Support for complex-to-complex transforms.
    """

    def __init__(self, decomp, context, queue, grid_shape, dtype):
        from pystella.fourier import (
            get_complex_dtype_with_matching_prec,
            get_real_dtype_with_matching_prec,
        )

        self.decomp = decomp
        self.grid_shape = grid_shape
        self.dtype = np.dtype(dtype)
        # A dtype of kind "f" selects a real transform.
        self.is_real = self.dtype.kind == "f"
        self.cdtype = get_complex_dtype_with_matching_prec(self.dtype)
        self.rdtype = get_real_dtype_with_matching_prec(self.dtype)

        # fx is the position-space array, fk the (complex) transformed one.
        self.fx = cla.zeros(queue, grid_shape, dtype)
        self.fk = cla.zeros(queue, self.shape(self.is_real), self.cdtype)

        from gpyfft import FFT

        # Both plans share the same options; only the array roles swap.
        plan_options = dict(
            real=self.is_real, scale_forward=1, scale_backward=1)
        self.forward = FFT(
            context, queue, self.fx, out_array=self.fk, **plan_options)
        self.backward = FFT(
            context, queue, self.fk, out_array=self.fx, **plan_options)

        empty_slices = ((), (), ())
        self.sub_k = get_sliced_momenta(
            grid_shape, self.dtype, empty_slices, queue)

    @property
    def proc_permutation(self):
        """The identity permutation ``(0, 1, ...)`` of the grid axes."""
        return tuple(axis for axis, _ in enumerate(self.grid_shape))

    def shape(self, forward_output=True):
        """Return the array shape on one side of the transform.

        For a real transform with *forward_output* true, the last axis is
        reduced to ``n // 2 + 1``; in every other case ``grid_shape`` is
        returned unchanged.
        """
        if not (forward_output and self.is_real):
            return self.grid_shape

        last = self.grid_shape[-1] // 2 + 1
        return tuple(self.grid_shape[:-1]) + (last,)

    def forward_transform(self, fx, fk, **kwargs):
        """Enqueue the forward plan from *fx* into *fk* and return *fk*."""
        (event,) = self.forward.enqueue_arrays(
            data=fx, result=fk, forward=True)
        # Register the transform's event on both arrays.
        for array in (fx, fk):
            array.add_event(event)
        return fk

    def backward_transform(self, fk, fx, **kwargs):
        """Enqueue the backward plan from *fk* into *fx* and return *fx*."""
        (event,) = self.backward.enqueue_arrays(
            data=fk, result=fx, forward=False)
        # Register the transform's event on both arrays.
        for array in (fx, fk):
            array.add_event(event)
        return fx
def run(double_precision=False):
    """Time batched 2D gpyfft transforms of a 4x1024x1024 array.

    For every entry of ``axes_list`` the out-of-place section tries all
    combinations of C- and Fortran-ordered input and output arrays, prints
    the timing and the maximum absolute error, and compares the result with
    ``numpy.fft.fftn``. ``GpyFFT_Error`` and ``AssertionError`` are printed
    and the loop continues; any other exception propagates. The in-place
    section only reports timings.

    :arg double_precision: use ``complex128`` data and tolerances of 1e-8
        when true; otherwise ``complex64`` data and tolerances of 1e-3.
    """
    context = cl.create_some_context()
    queue = cl.CommandQueue(context)

    dtype = np.complex128 if double_precision else np.complex64
    tolerance = 1e-8 if double_precision else 1e-3

    n_run = 100  # set to 1 for proper testing

    def time_transform(transform):
        """Return the mean wall time per enqueue in milliseconds."""
        tic = timeit.default_timer()
        for _ in range(n_run):
            events = transform.enqueue()
            # events = transform.enqueue(False)
        # Only the events returned by the final enqueue are waited on.
        for event in events:
            event.wait()
        toc = timeit.default_timer()
        return 1e3 * (toc - tic) / n_run

    def gflops_for(transform, t_ms):
        """Return the GFLOPS figure reported for one timed transform."""
        n_points = np.prod(transform.t_shape)
        return (5e-9 * np.log2(n_points) * n_points
                * transform.batchsize / (1e-3 * t_ms))

    def order_flag(array):
        """Return 'C' for a C-contiguous array and 'F' otherwise."""
        return 'C' if array.flags.c_contiguous else 'F'

    if n_run > 1:
        # faster than 1024x1024?
        nd_dataC = np.random.normal(size=(4, 1024, 1024)).astype(dtype)
    else:
        nd_dataC = np.ones((4, 1024, 1024), dtype=dtype)  # set n_run to 1

    nd_dataF = np.asfortranarray(nd_dataC)
    dataC = cla.to_device(queue, nd_dataC)
    dataF = cla.to_device(queue, nd_dataF)
    nd_result = np.zeros_like(nd_dataC, dtype=dtype)
    resultC = cla.to_device(queue, nd_result)
    resultF = cla.to_device(queue, np.asfortranarray(nd_result))

    axes_list = [(1, 2), (2, 1)]  # batched 2d transforms

    print('out of place transforms', dataC.shape, dataC.dtype)
    print('axes in out')
    for axes in axes_list:
        for data in (dataC, dataF):
            for result in (resultC, resultF):
                # Bind the summary values up front so the ``finally`` print
                # cannot raise UnboundLocalError and hide an unexpected
                # exception from the first attempt.
                t_ms, gflops = 0, 0
                try:
                    transform = FFT(context, queue, data, result, axes=axes)
                    # transform.plan.transpose_result = True
                    # not implemented for some transforms
                    # (works e.g. for out of place, (2,1) C C)
                    print(
                        '%-10s %3s %3s' % (
                            axes, order_flag(data), order_flag(result)),
                        end=' ',
                    )

                    t_ms = time_transform(transform)
                    gflops = gflops_for(transform, t_ms)

                    npfft_result = npfftn(nd_dataC, axes=axes)
                    if transform.plan.transpose_result:
                        npfft_result = np.swapaxes(
                            npfft_result, axes[0], axes[1])

                    # Copy the device result back once for both checks.
                    host_result = result.get()
                    max_error = np.max(abs(host_result - npfft_result))
                    print('%8.1e' % max_error, end=' ')
                    assert_allclose(host_result, npfft_result,
                                    atol=tolerance, rtol=tolerance)
                except GpyFFT_Error as e:
                    print(e)
                    t_ms, gflops = 0, 0
                except AssertionError as e:
                    print(e)
                finally:
                    print('%5.2fms %6.2f Gflops' % (t_ms, gflops))

    print('in place transforms', nd_dataC.shape, nd_dataC.dtype)
    for axes in axes_list:
        for nd_data in (nd_dataC, nd_dataF):
            data = cla.to_device(queue, nd_data)
            transform = FFT(context, queue, data, axes=axes)
            # transform.plan.transpose_result = True  # not implemented

            # NOTE: inplace transform fails for n_run > 1
            t_ms = time_transform(transform)
            gflops = gflops_for(transform, t_ms)
            print('%-10s %3s %5.2fms %6.2f Gflops'
                  % (axes, order_flag(data), t_ms, gflops))