def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None, out=None, plan=None):
    """Run a batched 1-D cuFFT transform along ``axis`` of ``a``.

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view or not C-contiguous.
        direction: cuFFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind passed to ``_convert_fft_type``;
            ``'C2C'`` additionally enables in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): For ``'C2C'``, write the result into the
            (possibly copied) input buffer.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.
        out: Optional output array, validated by the plan.
        plan: Optional ``cufft.Plan1d``. Must not be combined with a plan
            that is active as a context manager.

    Returns:
        The transformed array with ``axis`` moved back to its position.

    Raises:
        RuntimeError: If a plan is given both as an argument and through
            the context manager.
        ValueError: If the supplied plan does not match the transform.
    """
    fft_type = _convert_fft_type(a, value_type)

    # The plan operates on the last axis of a C-contiguous buffer.
    needs_swap = axis % a.ndim != a.ndim - 1
    if needs_swap:
        a = a.swapaxes(axis, -1)
    if not (a.base is None and a.flags.c_contiguous):
        a = a.copy()

    length = a.shape[-1]
    if out_size is None:
        out_size = length
    batch = a.size // length

    active_plan = cufft.get_current_plan()
    if active_plan is not None:
        if plan is not None:
            raise RuntimeError('Use the cuFFT plan either as a context manager'
                               ' or as an argument.')
        plan = active_plan

    if plan is not None:
        # A user-provided plan must describe exactly this transform.
        if not isinstance(plan, cufft.Plan1d):
            raise ValueError('expected plan to have type cufft.Plan1d')
        if fft_type != plan.fft_type:
            raise ValueError('CUFFT plan dtype mismatch.')
        if out_size != plan.nx:
            raise ValueError('Target array size does not match the plan.')
        if batch != plan.batch:
            raise ValueError('Batch size does not match the plan.')
        if config.use_multi_gpus != plan._use_multi_gpus:
            raise ValueError('Unclear if multiple GPUs are to be used or not.')
    else:
        devices = config._devices if config.use_multi_gpus else None
        plan = cufft.Plan1d(out_size, fft_type, batch, devices=devices)

    if overwrite_x and value_type == 'C2C':
        out = a
    elif out is None:
        out = plan.get_output_array(a)
    else:
        # verify that out has the expected shape and dtype
        plan.check_output_array(a, out)

    plan.fft(a, out, direction)

    # Real-to-complex output is shorter than the input, so the input length
    # is the one used for normalization there.
    if fft_type in (cufft.CUFFT_R2C, cufft.CUFFT_D2Z):
        sz = length
    else:
        sz = out.shape[-1]

    if norm is not None:
        out /= math.sqrt(sz)
    elif direction == cufft.CUFFT_INVERSE:
        out /= sz

    return out.swapaxes(axis, -1) if needs_swap else out
def test_ifft(self, dtype):
    """Check an inverse C2C ``Plan1d`` transform against ``numpy.fft.ifft``.

    The same plan is executed twice on the same input so that a plan which
    keeps stale internal state between runs is detected.
    """
    _skip_multi_gpu_bug(self.shape, self.gpus)

    a = testing.shaped_random(self.shape, numpy, dtype)

    ndim = len(self.shape)
    if ndim == 1:
        batch, nx = 1, self.shape[0]
    elif ndim == 2:
        batch, nx = self.shape[0], self.shape[1]

    tolerances = {'rtol': 1e-4, 'atol': 1e-7}

    # compute via cuFFT
    cufft_type = _convert_fft_type(a.dtype, 'C2C')
    plan = cufft.Plan1d(nx, cufft_type, batch, devices=config._devices)
    out_cp = numpy.empty_like(a)
    plan.fft(a, out_cp, cufft.CUFFT_INVERSE)
    out_cp /= nx  # normalization

    # numpy.fft.ifft always returns numpy.complex128, so cast the reference
    # down when single precision is being tested
    out_np = numpy.fft.ifft(a)
    if dtype is numpy.complex64:
        out_np = out_np.astype(dtype)

    assert numpy.allclose(out_cp, out_np, **tolerances)

    # compute it again to ensure Plan1d's internal state is reset
    plan.fft(a, out_cp, cufft.CUFFT_INVERSE)
    out_cp /= nx  # normalization

    assert numpy.allclose(out_cp, out_np, **tolerances)
def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None):
    """Run a batched 1-D cuFFT transform along ``axis`` of ``a``.

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view or not C-contiguous.
        direction: cuFFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind passed to ``_convert_fft_type``;
            ``'C2C'`` additionally enables in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): For ``'C2C'``, transform in place in the
            (possibly copied) input buffer.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.

    Returns:
        The transformed array with ``axis`` moved back to its position.
    """
    fft_type = _convert_fft_type(a, value_type)

    if axis % a.ndim != a.ndim - 1:
        a = a.swapaxes(axis, -1)

    # The plan is built for a densely packed, C-ordered batch.  Checking
    # ``a.base`` alone is not enough: an array can own its memory and still
    # be non-C-contiguous (e.g. Fortran order), so test the layout as well.
    if a.base is not None or not a.flags.c_contiguous:
        a = a.copy()

    plan = cufft.Plan1d(a.shape[-1] if out_size is None else out_size,
                        fft_type, a.size // a.shape[-1])
    if overwrite_x and value_type == 'C2C':
        plan.fft(a, a, direction)
        out = a
    else:
        out = plan.get_output_array(a)
        plan.fft(a, out, direction)

    # Real-to-complex output is shorter than the input, so normalize by the
    # input length in that case.
    sz = out.shape[-1]
    if fft_type == cufft.CUFFT_R2C or fft_type == cufft.CUFFT_D2Z:
        sz = a.shape[-1]
    if norm is None:
        if direction == cufft.CUFFT_INVERSE:
            out /= sz
    else:
        out /= cupy.sqrt(sz)

    if axis % a.ndim != a.ndim - 1:
        out = out.swapaxes(axis, -1)

    return out
def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None, out=None, plan=None):
    """Run a batched 1-D cuFFT transform along ``axis`` of ``a``.

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view or not C-contiguous.
        direction: cuFFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind passed to ``_convert_fft_type``;
            ``'C2C'`` additionally enables in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): For ``'C2C'``, write the result into the
            (possibly copied) input buffer.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.
        out: Optional output array, validated by the plan.
        plan: Optional pre-built ``cufft.Plan1d``; a new plan is created
            when omitted.

    Returns:
        The transformed array with ``axis`` moved back to its position.

    Raises:
        ValueError: If the supplied plan does not match the transform.
    """
    fft_type = _convert_fft_type(a, value_type)

    # Bring the transformed axis last and make the buffer C-contiguous.
    axis_moved = axis % a.ndim != a.ndim - 1
    if axis_moved:
        a = a.swapaxes(axis, -1)
    if not (a.base is None and a.flags.c_contiguous):
        a = a.copy()

    in_len = a.shape[-1]
    if out_size is None:
        out_size = in_len
    batch = a.size // in_len

    if plan is not None:
        # check plan validity
        if not isinstance(plan, cufft.Plan1d):
            raise ValueError('expected plan to have type cufft.Plan1d')
        if fft_type != plan.fft_type:
            raise ValueError('CUFFT plan dtype mismatch.')
        if out_size != plan.nx:
            raise ValueError('Target array size does not match the plan.')
        if batch != plan.batch:
            raise ValueError('Batch size does not match the plan.')
    else:
        plan = cufft.Plan1d(out_size, fft_type, batch)

    if overwrite_x and value_type == 'C2C':
        out = a
    elif out is None:
        out = plan.get_output_array(a)
    else:
        # verify that out has the expected shape and dtype
        plan.check_output_array(a, out)

    plan.fft(a, out, direction)

    # Real-to-complex transforms are normalized by the input length.
    if fft_type in (cufft.CUFFT_R2C, cufft.CUFFT_D2Z):
        sz = in_len
    else:
        sz = out.shape[-1]

    if norm is not None:
        out /= sqrt(sz)
    elif direction == cufft.CUFFT_INVERSE:
        out /= sz

    if axis_moved:
        out = out.swapaxes(axis, -1)
    return out
def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None, out=None):
    """Run a batched 1-D cuFFT transform along ``axis`` of ``a``.

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view or not C-contiguous.
        direction: cuFFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind passed to ``_convert_fft_type``;
            ``'C2C'`` additionally enables in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): For ``'C2C'``, write the result into the
            (possibly copied) input buffer.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.
        out: Optional output array, validated by the plan.

    Returns:
        The transformed array with ``axis`` moved back to its position.
    """
    fft_type = _convert_fft_type(a, value_type)

    # The transform always runs over the last axis of a contiguous buffer.
    swapped = axis % a.ndim != a.ndim - 1
    if swapped:
        a = a.swapaxes(axis, -1)
    if not (a.base is None and a.flags.c_contiguous):
        a = a.copy()

    last_len = a.shape[-1]
    if out_size is None:
        out_size = last_len
    plan = cufft.Plan1d(out_size, fft_type, a.size // last_len)

    in_place = overwrite_x and value_type == 'C2C'
    if in_place:
        out = a
    elif out is None:
        out = plan.get_output_array(a)
    else:
        # verify that out has the expected shape and dtype
        plan.check_output_array(a, out)

    plan.fft(a, out, direction)

    # Real-to-complex transforms are normalized by the input length.
    is_real_input = fft_type in (cufft.CUFFT_R2C, cufft.CUFFT_D2Z)
    sz = last_len if is_real_input else out.shape[-1]

    if norm is not None:
        out /= sqrt(sz)
    elif direction == cufft.CUFFT_INVERSE:
        out /= sz

    return out.swapaxes(axis, -1) if swapped else out
def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None, out=None, plan=None):
    """Run a batched 1-D cuFFT transform along ``axis`` of ``a``.

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view, not C-contiguous, or a C2R input that must
            not be overwritten on CUDA 10.1 and above.
        direction: cuFFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind; ``'C2C'`` additionally enables
            in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): Whether the input buffer may be overwritten.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.
        out: Optional output array, validated by the plan.
        plan: Optional ``cufft.Plan1d``. Must not be combined with a plan
            that is active as a context manager.

    Returns:
        The transformed array with ``axis`` moved back to its position.

    Raises:
        ValueError: If the transformed axis is empty or the supplied plan
            does not match the transform.
        RuntimeError: If a plan is given both as an argument and through
            the context manager.
    """
    fft_type = _convert_fft_type(a.dtype, value_type)

    axis_moved = axis % a.ndim != a.ndim - 1
    if axis_moved:
        a = a.swapaxes(axis, -1)

    # A private C-contiguous buffer is needed when ``a`` is a view or is
    # not C-contiguous.  In addition, the input array may be modified by a
    # C2R transform in CUDA 10.1 and above (see #3763 for the discussion),
    # so it is copied unless the caller allowed overwriting.
    if (a.base is not None
            or not a.flags.c_contiguous
            or (value_type == 'C2R' and not overwrite_x
                and 10010 <= cupy.cuda.runtime.runtimeGetVersion())):
        a = a.copy()

    n = a.shape[-1]
    if n < 1:
        raise ValueError(
            'Invalid number of FFT data points (%d) specified.' % n)

    if out_size is None:
        out_size = n
    batch = a.size // n

    active_plan = cufft.get_current_plan()
    if active_plan is not None:
        if plan is not None:
            raise RuntimeError('Use the cuFFT plan either as a context manager'
                               ' or as an argument.')
        plan = active_plan

    if plan is not None:
        # check plan validity
        if not isinstance(plan, cufft.Plan1d):
            raise ValueError('expected plan to have type cufft.Plan1d')
        if fft_type != plan.fft_type:
            raise ValueError('cuFFT plan dtype mismatch.')
        if out_size != plan.nx:
            raise ValueError('Target array size does not match the plan.',
                             out_size, plan.nx)
        if batch != plan.batch:
            raise ValueError('Batch size does not match the plan.')
        if config.use_multi_gpus != plan._use_multi_gpus:
            raise ValueError('Unclear if multiple GPUs are to be used or not.')
    else:
        devices = config._devices if config.use_multi_gpus else None
        plan = cufft.Plan1d(out_size, fft_type, batch, devices=devices)

    if overwrite_x and value_type == 'C2C':
        out = a
    elif out is None:
        out = plan.get_output_array(a)
    else:
        # verify that out has the expected shape and dtype
        plan.check_output_array(a, out)

    # Nothing to execute for an empty batch.
    if batch != 0:
        plan.fft(a, out, direction)

    # Real-to-complex transforms are normalized by the input length.
    if fft_type in (cufft.CUFFT_R2C, cufft.CUFFT_D2Z):
        sz = n
    else:
        sz = out.shape[-1]

    if norm is not None:
        out /= math.sqrt(sz)
    elif direction == cufft.CUFFT_INVERSE:
        out /= sz

    return out.swapaxes(axis, -1) if axis_moved else out
def get_fft_plan(a, shape=None, axes=None, value_type='C2C'):
    """ Generate a CUDA FFT plan for transforming up to three axes.

    Args:
        a (cupy.ndarray): Array to be transformed, assumed to be either C- or
            F- contiguous.
        shape (None or tuple of ints): Shape of the transformed axes of the
            output. If ``shape`` is not given, the lengths of the input along
            the axes specified by ``axes`` are used.
        axes (None or int or tuple of int):  The axes of the array to
            transform. If `None`, it is assumed that all axes are transformed
            (or the last ``len(shape)`` axes when ``shape`` is given).

            Currently, for performing N-D transform these must be a set of up
            to three adjacent axes, and must include either the first or the
            last axis of the array.
        value_type (str): The FFT type to perform. Acceptable values are:

            * 'C2C': complex-to-complex transform (default)
            * 'R2C': real-to-complex transform
            * 'C2R': complex-to-real transform

    Returns:
        a cuFFT plan for either 1D transform (``cupy.cuda.cufft.Plan1d``) or
        N-D transform (``cupy.cuda.cufft.PlanNd``).

    Raises:
        ValueError: If ``a`` is not contiguous, ``shape`` and ``axes`` have
            different lengths, an axis is out of range, more than three axes
            are given, or an R2C/C2R N-D plan is requested for an F-order
            array.
        RuntimeError: If a C2R N-D plan is requested on HIP.
        NotImplementedError: If callbacks are combined with multiple GPUs.

    .. note::
        The returned plan can not only be passed as one of the arguments of
        the functions in ``cupyx.scipy.fftpack``, but also be used as a
        context manager for both ``cupy.fft`` and ``cupyx.scipy.fftpack``
        functions:

        .. code-block:: python

            x = cupy.random.random(16).reshape(4, 4).astype(complex)
            plan = cupyx.scipy.fftpack.get_fft_plan(x)
            with plan:
                y = cupy.fft.fftn(x)
                # alternatively:
                y = cupyx.scipy.fftpack.fftn(x)  # no explicit plan is given!
            # alternatively:
            y = cupyx.scipy.fftpack.fftn(x, plan=plan)  # pass plan explicitly

        In the first case, no cuFFT plan will be generated automatically,
        even if ``cupy.fft.config.enable_nd_planning = True`` is set.

    .. note::
        If this function is called under the context of
        :func:`~cupy.fft.config.set_cufft_callbacks`, the generated plan will
        have callbacks enabled.

    .. warning::
        This API is a deviation from SciPy's, is currently experimental, and
        may be changed in the future version.
    """
    # check input array
    if a.flags.c_contiguous:
        order = 'C'
    elif a.flags.f_contiguous:
        order = 'F'
    else:
        raise ValueError('Input array a must be contiguous')

    if isinstance(shape, int):
        shape = (shape, )
    if isinstance(axes, int):
        axes = (axes, )
    if (shape is not None) and (axes is not None) and len(shape) != len(axes):
        raise ValueError('Shape and axes have different lengths.')

    # check axes
    # n=1: 1d (need axis1D); n>1: Nd
    if axes is None:
        n = a.ndim if shape is None else len(shape)
        axes = tuple(range(-n, 0))
        if n == 1:
            # The implied axis is the last one.  Using 0 here would pick the
            # wrong axis when a length-1 ``shape`` is given for an array
            # with more than one dimension.
            axis1D = axes[0]
    else:  # axes is a tuple
        n = len(axes)
        if n == 1:
            axis1D = axes[0]
            if axis1D >= a.ndim or axis1D < -a.ndim:
                err = 'The chosen axis ({0}) exceeds the number of '\
                      'dimensions of a ({1})'.format(axis1D, a.ndim)
                raise ValueError(err)
        elif n > 3:
            raise ValueError('Only up to three axes is supported')

    # Note that "shape" here refers to the shape along transformed axes, not
    # the shape of the output array, and we need to convert it to the latter.
    # The result is as if "a=_cook_shape(a); return a.shape" is called.
    # Because of this, we need to use (possibly unsorted) axes.
    transformed_shape = shape
    shape = list(a.shape)
    if transformed_shape is not None:
        for s, axis in zip(transformed_shape, axes):
            if s is not None:
                if axis == axes[-1] and value_type == 'C2R':
                    # C2R input holds only the non-redundant half spectrum
                    s = s // 2 + 1
                shape[axis] = s
    shape = tuple(shape)

    # check value_type
    out_dtype = _output_dtype(a.dtype, value_type)
    fft_type = _convert_fft_type(out_dtype, value_type)
    # TODO(leofang): figure out if we really have to skip F-order?
    if n > 1 and value_type != 'C2C' and a.flags.f_contiguous:
        raise ValueError('C2R/R2C PlanNd for F-order arrays is not supported')

    # generate plan
    # (load from cache if it exists, otherwise create one but don't cache it)
    if n > 1:  # ND transform
        if cupy.cuda.runtime.is_hip and value_type == 'C2R':
            raise RuntimeError("hipFFT's C2R PlanNd is buggy and unsupported")
        out_size = _get_fftn_out_size(
            shape, transformed_shape, axes[-1], value_type)
        # _get_cufft_plan_nd interacts with plan cache and callback
        plan = _get_cufft_plan_nd(
            shape, fft_type, axes=axes, order=order, out_size=out_size,
            to_cache=False)
    else:  # 1D transform
        # prepare plan arguments
        if value_type != 'C2R':
            out_size = shape[axis1D]
        else:
            out_size = _get_fftn_out_size(
                shape, transformed_shape, axis1D, value_type)
        batch = prod(shape) // shape[axis1D]
        devices = None if not config.use_multi_gpus else config._devices

        keys = (out_size, fft_type, batch, devices)
        mgr = config.get_current_callback_manager()
        if mgr is not None:
            # to avoid a weird segfault, we generate and cache distinct plans
            # for every possible (load_aux, store_aux) pairs; the plans are
            # still generated from the same external Python module
            load_aux = mgr.cb_load_aux_arr
            store_aux = mgr.cb_store_aux_arr
            keys += (mgr.cb_load, mgr.cb_store,
                     0 if load_aux is None else load_aux.data.ptr,
                     0 if store_aux is None else store_aux.data.ptr)
        cache = get_plan_cache()
        cached_plan = cache.get(keys)
        if cached_plan is not None:
            plan = cached_plan
        elif mgr is None:
            plan = cufft.Plan1d(out_size, fft_type, batch, devices=devices)
        else:  # has callback
            # TODO(leofang): support multi-GPU callback (devices is ignored)
            if devices:
                raise NotImplementedError('multi-GPU cuFFT callbacks are not '
                                          'yet supported')
            # NOTE(review): keys has 8 entries on this path, so keys[:-3]
            # keeps (out_size, fft_type, batch, devices, cb_load) -- confirm
            # that this is the argument tuple mgr.create_plan expects.
            plan = mgr.create_plan(('Plan1d', keys[:-3]))
            mgr.set_callbacks(plan)

    return plan
def _exec_fft(a, direction, value_type, norm, axis, overwrite_x,
              out_size=None, out=None, plan=None):
    """Run a batched 1-D FFT along ``axis`` of ``a``.

    The plan is taken, in order of precedence, from the ``plan`` argument,
    from the plan active as a context manager, from the plan cache, or is
    newly created (and then cached).

    Args:
        a: Input array. It is moved so that ``axis`` is last and copied
            when it is a view, not C-contiguous, a C2R input that must not
            be overwritten on CUDA 10.1 and above, or a non-C2C input on
            HIP.
        direction: FFT direction flag forwarded to ``plan.fft``.
        value_type (str): Transform kind; ``'C2C'`` additionally enables
            in-place output.
        norm: ``None`` scales only the inverse transform by ``1/sz``; any
            other value scales the result by ``1/sqrt(sz)``.
        axis (int): Axis to transform.
        overwrite_x (bool): Whether the input buffer may be overwritten.
        out_size (int or None): Plan length; defaults to the length of the
            transformed axis.
        out: Optional output array, validated by the plan.
        plan: Optional ``cufft.Plan1d``.

    Returns:
        The transformed array with ``axis`` moved back to its position.

    Raises:
        ValueError: If the transformed axis is empty or the supplied plan
            does not match the transform.
        RuntimeError: If a plan is given both as an argument and through
            the context manager.
        NotImplementedError: If callbacks are combined with multiple GPUs.
    """
    fft_type = _convert_fft_type(a.dtype, value_type)

    # the plan transforms the last axis, so move the requested axis there
    if axis % a.ndim != a.ndim - 1:
        a = a.swapaxes(axis, -1)

    if a.base is not None or not a.flags.c_contiguous:
        a = a.copy()
    elif (value_type == 'C2R' and not overwrite_x and
            10010 <= cupy.cuda.runtime.runtimeGetVersion()):
        # The input array may be modified in CUDA 10.1 and above.
        # See #3763 for the discussion.
        a = a.copy()
    elif cupy.cuda.runtime.is_hip and value_type != 'C2C':
        # hipFFT's R2C would overwrite input
        # hipFFT's C2R needs a workaround (see below)
        a = a.copy()

    n = a.shape[-1]
    if n < 1:
        raise ValueError(
            'Invalid number of FFT data points (%d) specified.' % n)

    # Workaround for hipFFT/rocFFT:
    # Both cuFFT and hipFFT/rocFFT have this requirement that 0-th and
    # N/2-th element must be real, but cuFFT internally simply ignores it
    # while hipFFT handles it badly in both Plan1d and PlanNd, so we must
    # do the correction ourselves to ensure the condition is met.
    # (This writes into ``a``; on this path ``a`` was already copied by one
    # of the branches above.)
    if cupy.cuda.runtime.is_hip and value_type == 'C2R':
        a[..., 0] = a[..., 0].real + 0j
        if out_size is None:
            a[..., -1] = a[..., -1].real + 0j
        elif out_size % 2 == 0:
            a[..., out_size // 2] = a[..., out_size // 2].real + 0j

    if out_size is None:
        out_size = n

    batch = a.size // n

    # plan search precedence:
    # 1. plan passed in as an argument
    # 2. plan as context manager
    # 3. cached plan
    # 4. create a new one
    curr_plan = cufft.get_current_plan()
    if curr_plan is not None:
        if plan is None:
            plan = curr_plan
        else:
            raise RuntimeError('Use the cuFFT plan either as a context manager'
                               ' or as an argument.')

    if plan is None:
        devices = None if not config.use_multi_gpus else config._devices
        # TODO(leofang): do we need to add the current stream to keys?
        keys = (out_size, fft_type, batch, devices)
        mgr = config.get_current_callback_manager()
        if mgr is not None:
            # to avoid a weird segfault, we generate and cache distinct plans
            # for every possible (load_aux, store_aux) pairs; the plans are
            # still generated from the same external Python module
            load_aux = mgr.cb_load_aux_arr
            store_aux = mgr.cb_store_aux_arr
            keys += (mgr.cb_load, mgr.cb_store,
                     0 if load_aux is None else load_aux.data.ptr,
                     0 if store_aux is None else store_aux.data.ptr)
        cache = get_plan_cache()
        cached_plan = cache.get(keys)
        if cached_plan is not None:
            plan = cached_plan
        elif mgr is None:
            plan = cufft.Plan1d(out_size, fft_type, batch, devices=devices)
            cache[keys] = plan
        else:  # has callback
            # TODO(leofang): support multi-GPU callback (devices is ignored)
            if devices:
                raise NotImplementedError('multi-GPU cuFFT callbacks are not '
                                          'yet supported')
            # keys has 8 entries on this path; keys[:-5] is
            # (out_size, fft_type, batch)
            plan = mgr.create_plan(('Plan1d', keys[:-5]))
            mgr.set_callbacks(plan)
            cache[keys] = plan
    else:
        # check plan validity
        if not isinstance(plan, cufft.Plan1d):
            raise ValueError('expected plan to have type cufft.Plan1d')
        if fft_type != plan.fft_type:
            raise ValueError('cuFFT plan dtype mismatch.')
        if out_size != plan.nx:
            raise ValueError('Target array size does not match the plan.',
                             out_size, plan.nx)
        if batch != plan.batch:
            raise ValueError('Batch size does not match the plan.')
        if config.use_multi_gpus != (plan.gpus is not None):
            raise ValueError('Unclear if multiple GPUs are to be used or not.')

    if overwrite_x and value_type == 'C2C':
        out = a
    elif out is not None:
        # verify that out has the expected shape and dtype
        plan.check_output_array(a, out)
    else:
        out = plan.get_output_array(a)

    # nothing to execute for an empty batch
    if batch != 0:
        plan.fft(a, out, direction)

    # real-to-complex output is shorter than the input, so the input length
    # is used for normalization in that case
    sz = out.shape[-1]
    if fft_type == cufft.CUFFT_R2C or fft_type == cufft.CUFFT_D2Z:
        sz = n
    if norm is None:
        if direction == cufft.CUFFT_INVERSE:
            out /= sz
    else:
        out /= math.sqrt(sz)

    # undo the initial axis move
    if axis % a.ndim != a.ndim - 1:
        out = out.swapaxes(axis, -1)

    return out
def get_fft_plan(a, shape=None, axes=None, value_type='C2C'):
    """ Generate a CUDA FFT plan for transforming up to three axes.

    Args:
        a (cupy.ndarray): Array to be transformed, assumed to be either C- or
            F- contiguous.
        shape (None or tuple of ints): Shape of the transformed axes of the
            output. If ``shape`` is not given, the lengths of the input along
            the axes specified by ``axes`` are used.
        axes (None or int or tuple of int):  The axes of the array to
            transform. If `None`, it is assumed that all axes are transformed
            (or the last ``len(shape)`` axes when ``shape`` is given).

            Currently, for performing N-D transform these must be a set of up
            to three adjacent axes, and must include either the first or the
            last axis of the array.
        value_type (str): The FFT type to perform. Acceptable values are:

            * 'C2C': complex-to-complex transform (default)
            * 'R2C': real-to-complex transform
            * 'C2R': complex-to-real transform

    Returns:
        a cuFFT plan for either 1D transform (``cupy.cuda.cufft.Plan1d``) or
        N-D transform (``cupy.cuda.cufft.PlanNd``).

    Raises:
        ValueError: If ``a`` is not contiguous, ``shape`` and ``axes`` have
            different lengths, an axis is out of range, more than three axes
            are given, or an R2C/C2R N-D plan is requested for an F-order
            array.

    .. note::
        The returned plan can not only be passed as one of the arguments of
        the functions in ``cupyx.scipy.fftpack``, but also be used as a
        context manager for both ``cupy.fft`` and ``cupyx.scipy.fftpack``
        functions:

        .. code-block:: python

            x = cupy.random.random(16).reshape(4, 4).astype(complex)
            plan = cupyx.scipy.fftpack.get_fft_plan(x)
            with plan:
                y = cupy.fft.fftn(x)
                # alternatively:
                y = cupyx.scipy.fftpack.fftn(x)  # no explicit plan is given!
            # alternatively:
            y = cupyx.scipy.fftpack.fftn(x, plan=plan)  # pass plan explicitly

        In the first case, no cuFFT plan will be generated automatically,
        even if ``cupy.fft.config.enable_nd_planning = True`` is set.

    .. warning::
        This API is a deviation from SciPy's, is currently experimental, and
        may be changed in the future version.
    """
    # check input array
    if a.flags.c_contiguous:
        order = 'C'
    elif a.flags.f_contiguous:
        order = 'F'
    else:
        raise ValueError('Input array a must be contiguous')

    if isinstance(shape, int):
        shape = (shape, )
    if isinstance(axes, int):
        axes = (axes, )
    if (shape is not None) and (axes is not None) and len(shape) != len(axes):
        raise ValueError('Shape and axes have different lengths.')

    # check axes
    # n=1: 1d (need axis1D); n>1: Nd
    if axes is None:
        n = a.ndim if shape is None else len(shape)
        axes = tuple(range(-n, 0))
        if n == 1:
            # The implied axis is the last one.  Using 0 here would pick the
            # wrong axis when a length-1 ``shape`` is given for an array
            # with more than one dimension.
            axis1D = axes[0]
    else:  # axes is a tuple
        n = len(axes)
        if n == 1:
            axis1D = axes[0]
            if axis1D >= a.ndim or axis1D < -a.ndim:
                err = 'The chosen axis ({0}) exceeds the number of '\
                      'dimensions of a ({1})'.format(axis1D, a.ndim)
                raise ValueError(err)
        elif n > 3:
            raise ValueError('Only up to three axes is supported')

    # Note that "shape" here refers to the shape along transformed axes, not
    # the shape of the output array, and we need to convert it to the latter.
    # The result is as if "a=_cook_shape(a); return a.shape" is called.
    # Because of this, we need to use (possibly unsorted) axes.
    transformed_shape = shape
    shape = list(a.shape)
    if transformed_shape is not None:
        for s, axis in zip(transformed_shape, axes):
            if s is not None:
                if axis == axes[-1] and value_type == 'C2R':
                    # C2R input holds only the non-redundant half spectrum
                    s = s // 2 + 1
                shape[axis] = s
    shape = tuple(shape)

    # check value_type
    out_dtype = _output_dtype(a.dtype, value_type)
    fft_type = _convert_fft_type(out_dtype, value_type)
    # TODO(leofang): figure out if we really have to skip F-order?
    if n > 1 and value_type != 'C2C' and a.flags.f_contiguous:
        raise ValueError('C2R/R2C PlanNd for F-order arrays is not supported')

    # generate plan
    if n > 1:  # ND transform
        out_size = _get_fftn_out_size(
            shape, transformed_shape, axes[-1], value_type)
        plan = _get_cufft_plan_nd(
            shape, fft_type, axes=axes, order=order, out_size=out_size)
    else:  # 1D transform
        if value_type != 'C2R':
            out_size = shape[axis1D]
        else:
            out_size = _get_fftn_out_size(
                shape, transformed_shape, axis1D, value_type)
        batch = prod(shape) // shape[axis1D]
        devices = None if not config.use_multi_gpus else config._devices
        plan = cufft.Plan1d(out_size, fft_type, batch, devices=devices)

    return plan
def _fft_convolve(a1, a2, mode):
    """Convolve two 1-D arrays through an FFT round trip.

    Args:
        a1, a2: Input arrays; they are swapped internally so that the
            larger one comes first.
        mode (str): ``'full'``, ``'same'`` or ``'valid'``; selects which
            slice of the full convolution is returned.

    Returns:
        The selected slice of the convolution, cast to
        ``cupy.result_type(a1, a2)``. Results for integer dtypes are
        rounded before the cast.

    Raises:
        ValueError: If ``mode`` is not one of the accepted flags.
    """
    offset = 0
    if a1.size < a2.size:
        a1, a2 = a2, a1
        offset = 1 - a2.size % 2

    # Work out the output window first so that an invalid mode fails before
    # any plan is created or any transform is run.
    n1, n2 = a1.size, a2.size
    if mode == 'full':
        start, end = 0, n1 + n2 - 1
    elif mode == 'same':
        start = (n2 - 1) // 2 + offset
        end = start + n1
    elif mode == 'valid':
        start, end = n2 - 1, n1
    else:
        raise ValueError(
            'acceptable mode flags are `valid`, `same`, or `full`.')

    # if either of them is complex, the dtype after multiplication will also be
    if a1.dtype.kind == 'c' or a2.dtype.kind == 'c':
        fft, ifft = cupy.fft.fft, cupy.fft.ifft
        is_c2c = True
    else:
        fft, ifft = cupy.fft.rfft, cupy.fft.irfft
        is_c2c = False

    # hack to work around NumPy/CuPy FFT dtype incompatibility:
    # CuPy internally converts fp16 to fp32 before doing FFT (whereas Numpy
    # converts both fp16 and fp32 to fp64), so here we do the cast early and
    # explicitly, and make sure a correct cuFFT plan can be generated. After
    # the fft-ifft round trip, we cast the output dtype to the correct one.
    out_dtype = cupy.result_type(a1, a2)
    dtype = _output_dtype(out_dtype, 'C2C' if is_c2c else 'R2C')
    a1 = a1.astype(dtype, copy=False)
    a2 = a2.astype(dtype, copy=False)

    out_size = cupyx.scipy.fft.next_fast_len(n1 + n2 - 1)
    # skip calling get_fft_plan() as we know the args exactly
    if is_c2c:
        fft_t = cufft.CUFFT_C2C if dtype == cupy.complex64 else cufft.CUFFT_Z2Z
        fft_plan = cufft.Plan1d(out_size, fft_t, 1)
        ifft_plan = fft_plan
    else:
        fft_t = cufft.CUFFT_R2C if dtype == cupy.float32 else cufft.CUFFT_D2Z
        fft_plan = cufft.Plan1d(out_size, fft_t, 1)
        # this is a no-op context manager
        # TODO(leofang): use contextlib.nullcontext() for PY37+?
        ifft_plan = contextlib.suppress()
    with fft_plan:
        fa1 = fft(a1, out_size)
        fa2 = fft(a2, out_size)
    with ifft_plan:
        out = ifft(fa1 * fa2, out_size)

    out = out[start:end]

    # Decide on rounding from the *requested* result dtype.  ``out`` is the
    # result of the inverse FFT, so testing ``out.dtype`` for an integer
    # kind does not identify integer results, which would then be truncated
    # by the cast below.
    if out_dtype.kind in 'iu':
        out = cupy.around(out)

    return out.astype(out_dtype, copy=False)
def __init__(self, Nr, Nz, use_cuda=False, nthreads=None):
    """
    Initialize an FFT object

    Parameters
    ----------
    Nr: int
        Number of grid points along the r axis (axis -1)

    Nz: int
        Number of grid points along the z axis (axis 0)

    use_cuda: bool, optional
        Whether to perform the Fourier transform on the GPU.
        If CUDA is requested but not installed, a notice is printed
        and the transform is set up on the CPU instead.

    nthreads : int, optional
        Number of threads for the FFTW transform.
        If None, the default number of threads of numba is used
        (environment variable NUMBA_NUM_THREADS)
    """
    # Fall back to the CPU when CUDA was requested but is unavailable
    self.use_cuda = use_cuda
    if use_cuda is True and cuda_installed is False:
        self.use_cuda = False
        print('** Cuda not available for Fourier transform.')
        print('** Performing the Fourier transform on the CPU.')

    # MKL is used on the CPU whenever it is installed
    self.use_mkl = mkl_installed

    # GPU path
    if self.use_cuda:
        # Optimal number of CUDA threads per block for the copy 1d/2d
        # kernels (determined empirically)
        threads_per_block = (8, 32) if cuda_gpu_model == "V100" else (2, 16)
        self.dim_grid, self.dim_block = cuda_tpb_bpg_2d(
            Nz, Nr, *threads_per_block)
        # Flat 1d buffers for cufft
        flat_shape = (Nz * Nr, )
        self.buffer1d_in = cupy.empty(flat_shape, dtype=np.complex128)
        self.buffer1d_out = cupy.empty(flat_shape, dtype=np.complex128)
        # CUDA FFT plan object
        self.fft = cufft.Plan1d(Nz, cufft.CUFFT_Z2Z, Nr)
        # For normalization of the iFFT
        self.inv_Nz = 1. / Nz
        return

    grid_shape = (Nz, Nr)

    # CPU path, MKL FFT: the plan is initialized with a dummy array
    if self.use_mkl:
        self.mklfft = MKLFFT(np.zeros(grid_shape, dtype=np.complex128))
        return

    # CPU path, FFTW
    if nthreads is None:
        # Get the default number of threads for numba
        nthreads = numba.config.NUMBA_NUM_THREADS
    # The FFT plans are initialized with dummy arrays
    interp_buffer = np.zeros(grid_shape, dtype=np.complex128)
    spect_buffer = np.zeros(grid_shape, dtype=np.complex128)
    shared_options = {'axes': (0, ), 'threads': nthreads}
    self.fft = pyfftw.FFTW(interp_buffer, spect_buffer,
                           direction='FFTW_FORWARD', **shared_options)
    self.ifft = pyfftw.FFTW(spect_buffer, interp_buffer,
                            direction='FFTW_BACKWARD', **shared_options)
def get_fft_plan(a, shape=None, axes=None, value_type='C2C'):
    """ Generate a CUDA FFT plan for transforming up to three axes.

    Args:
        a (cupy.ndarray): Array to be transformed, assumed to be either C- or
            F- contiguous.
        shape (None or tuple of ints): Shape of the transformed axes of the
            output. If ``shape`` is not given, the lengths of the input along
            the axes specified by ``axes`` are used.
        axes (None or int or tuple of int):  The axes of the array to
            transform. If `None`, it is assumed that all axes are transformed.

            Currently, for performing N-D transform these must be a set of up
            to three adjacent axes, and must include either the first or the
            last axis of the array.
        value_type ('C2C'): The FFT type to perform.
            Currently only complex-to-complex transforms are supported.

    Returns:
        a cuFFT plan for either 1D transform (``cupy.cuda.cufft.Plan1d``) or
        N-D transform (``cupy.cuda.cufft.PlanNd``).

    Raises:
        ValueError: If ``a`` is not contiguous, a single axis is out of
            range, more than three axes are given, or ``shape`` and
            ``axes`` have different lengths.
        NotImplementedError: If an N-D transform other than C2C/Z2Z is
            requested.

    .. note::
        The returned plan can not only be passed as one of the arguments of
        the functions in ``cupyx.scipy.fftpack``, but also be used as a
        context manager for both ``cupy.fft`` and ``cupyx.scipy.fftpack``
        functions:

        .. code-block:: python

            x = cupy.random.random(16).reshape(4, 4).astype(complex)
            plan = cupyx.scipy.fftpack.get_fft_plan(x)
            with plan:
                y = cupy.fft.fftn(x)
                # alternatively:
                y = cupyx.scipy.fftpack.fftn(x)  # no explicit plan is given!
            # alternatively:
            y = cupyx.scipy.fftpack.fftn(x, plan=plan)  # pass plan explicitly

        In the first case, no cuFFT plan will be generated automatically,
        even if ``cupy.fft.config.enable_nd_planning = True`` is set.

    .. warning::
        This API is a deviation from SciPy's, is currently experimental, and
        may be changed in the future version.
    """
    # check input array
    if a.flags.c_contiguous:
        order = 'C'
    elif a.flags.f_contiguous:
        order = 'F'
    else:
        raise ValueError('Input array a must be contiguous')

    # check axes
    # n=1: 1d (need axis1D); n>1: Nd
    if axes is None:
        n = a.ndim
        axes = tuple(range(n))
    else:
        if isinstance(axes, int):
            axes = (axes, )
        n = len(axes)
        if n > 3:
            raise ValueError('Only up to three axes is supported')
    if n == 1:
        # Validate the single axis no matter whether it was given as an int
        # or as a 1-tuple, so that both spellings fail the same way.
        axis1D = axes[0]
        if axis1D >= a.ndim or axis1D < -a.ndim:
            raise ValueError('The chosen axis ({0}) exceeds the number of '
                             'dimensions of a ({1})'.format(axis1D, a.ndim))

    # check shape
    if isinstance(shape, int):
        shape = (shape, )
    if (shape is not None) and len(shape) != n:
        raise ValueError('Shape and axes have different lengths.')
    # Note that "shape" here refers to the shape along transformed axes, not
    # the shape of the output array, and we need to convert it to the latter.
    # The result is as if "a=_cook_shape(a); return a.shape" is called.
    transformed_shape = shape
    shape = list(a.shape)
    if transformed_shape is not None:
        for s, axis in zip(transformed_shape, axes):
            shape[axis] = s
    shape = tuple(shape)

    # check value_type
    fft_type = _convert_fft_type(a, value_type)
    if n > 1 and fft_type not in [cufft.CUFFT_C2C, cufft.CUFFT_Z2Z]:
        raise NotImplementedError('Only C2C and Z2Z are supported for N-dim'
                                  ' transform.')

    # generate plan
    if n > 1:  # ND transform
        plan = _get_cufft_plan_nd(shape, fft_type, axes=axes, order=order)
    else:  # 1D transform
        out_size = shape[axis1D]
        batch = prod(shape) // out_size
        plan = cufft.Plan1d(out_size, fft_type, batch)

    return plan
def get_fft_plan(a, shape=None, axes=None, value_type='C2C'):
    """ Generate a CUDA FFT plan for transforming up to three axes.

    Args:
        a (cupy.ndarray): Array to be transformed, assumed to be either C- or
            F- contiguous.
        shape (None or tuple of ints): Shape of the transformed axes of the
            output. If ``shape`` is not given, the lengths of the input along
            the axes specified by ``axes`` are used.
        axes (None or int or tuple of int):  The axes of the array to
            transform. If `None`, it is assumed that all axes are transformed.

            Currently, for performing N-D transform these must be a set of up
            to three adjacent axes, and must include either the first or the
            last axis of the array.
        value_type ('C2C'): The FFT type to perform.
            Currently only complex-to-complex transforms are supported.

    Returns:
        plan: a cuFFT plan for either 1D transform (cupy.cuda.cufft.Plan1d)
        or N-D transform (cupy.cuda.cufft.PlanNd).

    Raises:
        ValueError: If ``a`` is not contiguous, a single axis is out of
            range, more than three axes are given, or ``shape`` and
            ``axes`` have different lengths.
        NotImplementedError: If an N-D transform other than C2C/Z2Z is
            requested.
    """
    # check input array
    if a.flags.c_contiguous:
        order = 'C'
    elif a.flags.f_contiguous:
        order = 'F'
    else:
        raise ValueError('Input array a must be contiguous')

    # check axes
    # n=1: 1d (need axis1D); n>1: Nd
    if axes is None:
        n = a.ndim
        axes = tuple(range(n))
    else:
        if isinstance(axes, int):
            axes = (axes, )
        n = len(axes)
        if n > 3:
            raise ValueError('Only up to three axes is supported')
    if n == 1:
        # Validate the single axis no matter whether it was given as an int
        # or as a 1-tuple, so that both spellings fail the same way.
        axis1D = axes[0]
        if axis1D >= a.ndim or axis1D < -a.ndim:
            raise ValueError('The chosen axis ({0}) exceeds the number of '
                             'dimensions of a ({1})'.format(axis1D, a.ndim))

    # check shape
    if isinstance(shape, int):
        shape = (shape, )
    if (shape is not None) and len(shape) != n:
        raise ValueError('Shape and axes have different lengths.')
    # Note that "shape" here refers to the shape along transformed axes, not
    # the shape of the output array, and we need to convert it to the latter.
    # The result is as if "a=_cook_shape(a); return a.shape" is called.
    transformed_shape = shape
    shape = list(a.shape)
    if transformed_shape is not None:
        for s, axis in zip(transformed_shape, axes):
            shape[axis] = s
    shape = tuple(shape)

    # check value_type
    fft_type = _convert_fft_type(a, value_type)
    if n > 1 and fft_type not in [cufft.CUFFT_C2C, cufft.CUFFT_Z2Z]:
        raise NotImplementedError('Only C2C and Z2Z are supported for N-dim'
                                  ' transform.')

    # generate plan
    if n > 1:  # ND transform
        plan = _get_cufft_plan_nd(shape, fft_type, axes=axes, order=order)
    else:  # 1D transform
        out_size = shape[axis1D]
        batch = prod(shape) // out_size
        plan = cufft.Plan1d(out_size, fft_type, batch)

    return plan