Esempio n. 1
0
 def get_Omega(self):
     """
     Solve ``-1/2 Laplacian(Omega) = omega`` for Omega in Fourier space.

     The real FFT of ``self.get_omega()`` is divided, mode by mode, by
     ``ky ** 2 + kx ** 2`` and multiplied by 2 before transforming back.
     The first flattened entry (presumably the k = 0 mode, where the divisor
     would vanish -- depends on ``Freq``) is excluded from the division.

     :return: real-space map of shape ``self.shape``
     """
     rfft_Om = np.fft.rfft2(self.get_omega())
     rs = rfft_Om.shape
     ky = (2. * np.pi) / self.lsides[0] * Freq(np.arange(self.shape[0]), self.shape[0])
     # Floor division: the slice index must stay an integer, also under
     # Python 3 / ``from __future__ import division``.
     ky[self.shape[0] // 2:] *= -1.
     kx = (2. * np.pi) / self.lsides[1] * Freq(np.arange(rs[1]), self.shape[1])
     # Squared wavenumber on the rfft grid, flattened like the coefficients.
     k_sqd = (np.outer(ky ** 2, np.ones(rs[1])) + np.outer(np.ones(rs[0]), kx ** 2)).flatten()
     rfft_Om = rfft_Om.flatten()
     rfft_Om[1:] /= k_sqd[1:]
     return np.fft.irfft2(2 * rfft_Om.reshape(rs), self.shape)
Esempio n. 2
0
def apply_FDxiDtFt_GPU_inplace_timed(type,
                                     lib_alm_dat,
                                     lib_alm_sky,
                                     alms_unlCMB,
                                     f,
                                     f_inv,
                                     cls_unl,
                                     func='bicubic',
                                     double_precision_ffts=False):
    """
    Apply F D xi D^t F^t to ``alms_unlCMB`` on the GPU, in place, printing timings.

    First pass, for each input field: spline prefilter, inverse rfft, lensing
    with the ``f_inv`` displacement, multiplication with the magnification,
    forward rfft, then accumulation into host-side alms weighted by the
    unlensed spectral matrix from ``get_Pmat``.
    Second pass, for each output field: spline prefilter, inverse rfft,
    lensing with the ``f`` displacement, forward rfft; the result overwrites
    ``alms_unlCMB[field]``.

    Note that the first call might be substantially slower than subsequent
    calls, as it caches the fft and ifft plans for subsequent usage.

    :param type: 'T', 'QU' or 'TQU'
    :param lib_alm_dat: alm library of the input / output alms
    :param lib_alm_sky: alm library used for the intermediate alms
    :param alms_unlCMB: ffs_alms to apply FDxiDtFt to; overwritten with the result
    :param f: forward displacement (provides ``get_dx_ingridunits`` / ``get_dy_ingridunits``)
    :param f_inv: inverse displacement, same interface as ``f``
    :param cls_unl: unlensed CMB cls dictionary (used in get_Pmat)
    :param func: 'bicubic' or 'bilinear'
    :param double_precision_ffts: use complex128 / float64 FFTs instead of complex64 / float32
    :return: ``alms_unlCMB`` (the same array), of shape (len(type), lib_alm_dat.alm_size)
    """
    ti = time.time()
    assert func in ['bicubic', 'bilinear'], func
    assert alms_unlCMB.shape == (len(type), lib_alm_dat.alm_size)
    assert lib_alm_dat.ell_mat.shape == lib_alm_sky.ell_mat.shape
    assert lib_alm_dat.ell_mat.lsides == lib_alm_sky.ell_mat.lsides

    # Useful declarations :
    nfields = len(type)
    rshape = lib_alm_sky.ell_mat.rshape
    shape = (rshape[0], 2 * (rshape[1] - 1))
    flat_shape = np.prod(shape)
    mpix = flat_shape / 1e6
    single_prec = not double_precision_ffts

    # Floor division: grid dimensions must be integers on Python 2 and 3 alike.
    GPU_grid = (shape[0] // GPU_block[0], shape[1] // GPU_block[1], 1)
    launch = dict(block=GPU_block, grid=GPU_grid)

    assert shape[0] % GPU_block[0] == 0
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert f.shape == shape, (f.shape, shape)
    assert f_inv.shape == shape, (f_inv.shape, shape)
    assert np.all(np.array(shape) % GPU_block[0] == 0), shape

    if shape[0] > 4096:
        print("--- Exercise caution, array shapes larger than 4096 have never been tested so far ---")

    # One-element list so that the nested helper can reset the lap timer
    # (no ``nonlocal`` on Python 2).
    lap_start = [time.time()]

    def _report(template, *lead):
        # Print the throughput since the previous report and restart the lap.
        dt = time.time() - lap_start[0]
        print(template % (lead + (mpix / dt, dt)))
        lap_start[0] = time.time()

    # TODO : some get_Pij method
    unlPmat = get_Pmat(type, lib_alm_sky, cls_unl)
    _report("     unl Pmat at %s Mpixel / sec, ex. time %s sec.")

    # 2D texture references :
    unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
    dx_tex = CUDA_module.get_texref("tex_dx")
    dy_tex = CUDA_module.get_texref("tex_dy")

    # loading fft plans :
    plan, plan_inv = get_rfft_plans(shape,
                                    double_precision=double_precision_ffts)
    # Function references :
    # Spline bicubic prefiltering, bicubic interpolation and multiplication with magnification.
    prefilter = CUDA_module.get_function(
        "cf_outer_w" if single_prec else "cdd_outer_w")
    lens_func = CUDA_module.get_function("%slensKernel_normtex" % func)
    magn_func = CUDA_module.get_function("detmagn_normtex")

    cplx_type = np.complex64 if single_prec else np.complex128
    f_type = np.float32 if single_prec else np.float64

    # We will store in host memory some maps for convenience
    temp_alms = np.zeros((nfields, lib_alm_sky.alm_size), dtype=cplx_type)

    # Setting up the texture references to the (inverse) displacement
    # (This is what contributes most to the cost actually)
    setup_texture_nparr(dx_tex, f_inv.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f_inv.get_dy_ingridunits())
    coeffs_gpu = gpuarray.empty(shape, dtype=f_type, order='C')
    rfft2_unlCMB_gpu = gpuarray.empty(rshape, dtype=cplx_type)
    _report("  First tex. setup at %s Mpixel / sec, ex. time %s sec.")

    wx_gpu = (6. / (2. * np.cos(
        2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.) /
              shape[0])
    wx_gpu = gpuarray.to_gpu(wx_gpu.astype(f_type))

    for _f in range(nfields):
        # Multiplying with the spline coefficients and Fourier transforming
        rfft2_unlCMB_gpu.set(
            lib_alm_dat.alm2rfft(alms_unlCMB[_f]).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  **launch)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # coeffs_gpu now contains the prefiltered map to be interpolated.
        # The texture and the kernels work on a float32 buffer; in double
        # precision use a separate copy so that coeffs_gpu keeps the dtype
        # expected by the fft plans.
        lens_gpu = coeffs_gpu if single_prec else coeffs_gpu.astype(np.float32)
        setup_texture_gpuarr(unl_CMB_tex, lens_gpu)
        _report("     CMB field %s texture setup at %s Mpixel / sec, ex. time %s sec.", _f)

        # Now interpolation with inverse displacement, and mult. with magnification.
        lens_func(lens_gpu,
                  np.int32(shape[0]),
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex],
                  **launch)
        magn_func(lens_gpu,
                  np.int32(shape[0]),
                  np.int32(flat_shape),
                  texrefs=[dx_tex, dy_tex],
                  **launch)
        _report("     CMB field %s lensed and magnified at %s Mpixel / sec, ex. time %s sec.", _f)

        fft(lens_gpu if single_prec else lens_gpu.astype(f_type),
            rfft2_unlCMB_gpu, plan)

        # To be GPU memory friendly these maps are in the host memory :
        # TODO : should be possible to adapt the code to do everything on the GPU, by using 4 displacement textures.
        temp_alm = lib_alm_sky.rfftmap2alm(rfft2_unlCMB_gpu.get())
        for _g in range(nfields):
            temp_alms[_g] += temp_alm * unlPmat[:, _g, _f]
        _report("     CMB field %s built temp_alms at %s Mpixel / sec, ex. time %s sec.", _f)

    # We now lens and then fft each map, and return.
    # We lens now with the forward displacement :
    setup_texture_nparr(dx_tex, f.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f.get_dy_ingridunits())
    _report("     Setup of forw. displ. textures at %s Mpixel / sec, ex. time %s sec.")

    # Output buffer of the lensing kernel; always float32 and never rebound,
    # so that every field is lensed into a buffer of the right dtype.
    lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
    for _g in range(nfields):
        rfft2_unlCMB_gpu.set(
            lib_alm_sky.alm2rfft(temp_alms[_g]).astype(cplx_type))
        _report("     Pushing temp alm field %s at %s Mpixel / sec, ex. time %s sec.", _g)

        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  **launch)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # Lensing. coeffs_gpu itself is left untouched (see first loop), it
        # is the ifft output buffer of the next field as well.
        tex_src = coeffs_gpu if single_prec else coeffs_gpu.astype(np.float32)
        setup_texture_gpuarr(unl_CMB_tex, tex_src)
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex],
                  **launch)

        # lenCMB_gpu is now D xi D^t. Turn this to Fourier space :
        fft(lenCMB_gpu if single_prec else lenCMB_gpu.astype(f_type),
            rfft2_unlCMB_gpu, plan)
        _report("     Lensing + rfft of field %s at %s Mpixel / sec, ex. time %s sec.", _g)

        # Pulling result from GPU to CPU
        alms_unlCMB[_g] = lib_alm_dat.rfftmap2alm(rfft2_unlCMB_gpu.get())
        _report("     Pulling back field %s at %s Mpixel / sec, ex. time %s sec.", _g)

    dt = time.time() - ti
    print("GPU TQU did G D xi D^t G^t at %s Mpixel / sec, ex. time %s sec." % (
        mpix / dt, dt))

    return alms_unlCMB
Esempio n. 3
0
def _get_inverse_chk_N(args):
    """
    Returns inverse displacement in chunk N.

    Uses periodic boundary conditions, which is not applicable to chunks, thus
    there will be boundary effects on the edges (2 or 4 pixels depending on the
    rule). Make sure the buffer is large enough.

    :param args: sequence of exactly 15 entries,
        (N, path_to_map, path_to_dx, path_to_dy, buff0, buff1, lside0, lside1,
        HD_res0, HD_res1, NR_iter, kspl, LD0, LD1, do_not_prefilter).
        ``path_to_dx`` / ``path_to_dy`` are ``.npy`` files opened memory-mapped.
        ``path_to_map``, ``kspl`` and ``do_not_prefilter`` are unpacked but not
        used in this function.
    :return: tuple ``(ex * rmin1, ey * rmin0)``: the inverse displacement
        components of the chunk (buffers included), rescaled from grid units
        by the pixel sizes ``rmin1 = lside1 / 2**HD_res1`` and
        ``rmin0 = lside0 / 2**HD_res0``.
    """
    assert len(args) == 15, args

    N, path_to_map, path_to_dx, path_to_dy, buff0, buff1, \
    lside0, lside1, HD_res0, HD_res1, NR_iter, kspl, LD0, LD1, do_not_prefilter = args

    HD_res = (HD_res0, HD_res1)
    LD = (LD0, LD1)
    s = (2**LD[0] + 2 * buff0, 2**LD[1] + 2 * buff1)  # Chunk shape

    rmin0 = lside0 / 2**HD_res[0]
    rmin1 = lside1 / 2**HD_res[1]

    # Get magn. matrix of the chunk:
    # Extra margin to avoid surprises with the periodic derivatives
    extra_buff = np.array((5, 5))
    # dx, dy displ. in grid units of each chunk (typ. (256 * 256) )
    dx = np.zeros(s + 2 * extra_buff)
    dy = np.zeros(s + 2 * extra_buff)
    sLDs, sHDs = periodicmap_spliter().get_slices_chk_N(
        N, LD, HD_res, (buff0 + extra_buff[0], buff1 + extra_buff[1]))
    # Open the memory maps once rather than once per slice.
    dx_full = np.load(path_to_dx, mmap_mode='r')
    dy_full = np.load(path_to_dy, mmap_mode='r')
    for sLD, sHD in zip(sLDs, sHDs):
        # Need grid units displacement for the bicubic spline
        dx[sLD] = dx_full[sHD] / rmin1
        dy[sLD] = dy_full[sHD] / rmin0

    # Jacobian matrix of the chunk (extra margin cropped after differentiation) :
    sl0 = slice(extra_buff[0], dx.shape[0] - extra_buff[0])
    sl1 = slice(extra_buff[1], dx.shape[1] - extra_buff[1])

    Minv_yy = -(PartialDerivativePeriodic(dx, axis=1)[sl0, sl1] + 1.)
    Minv_xx = -(PartialDerivativePeriodic(dy, axis=0)[sl0, sl1] + 1.)
    Minv_xy = PartialDerivativePeriodic(dy, axis=1)[sl0, sl1]
    Minv_yx = PartialDerivativePeriodic(dx, axis=0)[sl0, sl1]
    det = Minv_yy * Minv_xx - Minv_xy * Minv_yx
    if not np.all(det > 0.):
        print("ffs_displ::Negative value in det k : something's weird, you'd better check that")
    # Inverse magn. elements. (with a minus sign) We may need to spline these later for further NR iterations :
    Minv_xx /= det
    Minv_yy /= det
    Minv_xy /= det
    Minv_yx /= det
    del det

    dx = dx[sl0, sl1]  # Getting rid of extra buffer
    dy = dy[sl0, sl1]  # Getting rid of extra buffer
    # Starting estimate of the inverse displacement.
    ex = (Minv_xx * dx + Minv_xy * dy)
    ey = (Minv_yx * dx + Minv_yy * dy)

    if NR_iter == 0:
        return ex * rmin1, ey * rmin0

    # Setting up a bunch of splines to interpolate the increment to the displacement according to Newton-Raphson.
    # Needed are splines of the forward displacement and of the (inverse, as implemented here) magnification matrix.
    # Hopefully the map resolution is enough to spline the magnification matrix.
    s0, s1 = dx.shape
    assert s0 == s1, 'Havent checked how this works with rectangular maps'
    r0 = s0
    # rfft shape. Floor division: used as an array length, must be an integer.
    r1 = s1 // 2 + 1

    w0 = 6. / (2. * np.cos(2. * np.pi * Freq(np.arange(r0), s0) / s0) + 4.)
    w1 = 6. / (2. * np.cos(2. * np.pi * Freq(np.arange(r1), s1) / s1) + 4.)
    spline_weights = np.outer(w0, w1)

    # FIXME: switch to pyfftw :
    def bic_filter(_map):
        # Bicubic spline prefilter applied in Fourier space.
        return np.fft.irfft2(np.fft.rfft2(_map) * spline_weights)

    dx = bic_filter(dx)
    dy = bic_filter(dy)
    Minv_xy = bic_filter(Minv_xy)
    Minv_yx = bic_filter(Minv_yx)
    Minv_xx = bic_filter(Minv_xx)
    Minv_yy = bic_filter(Minv_yy)

    # NOTE(review): the header is looked up relative to the current working
    # directory, so this only works when run from the expected location.
    header = r' "%s/lensit/gpu/bicubicspline.h" ' % os.path.abspath(os.curdir)
    # C snippet for one Newton-Raphson update; it modifies ex and ey in place.
    iterate = r"\
        double fx,fy;\
        double ex_len_dx,ey_len_dy,len_Mxx,len_Mxy,len_Myx,len_Myy;\
        int i = 0;\
        for(int y= 0; y < width; y++ )\
           {\
           for(int x = 0; x < width; x++,i++)\
            {\
            fx = x +  ex[i];\
            fy = y +  ey[i];\
            ex_len_dx = ex[i] +  bicubiclensKernel(dx,fx,fy,width);\
            ey_len_dy = ey[i] +  bicubiclensKernel(dy,fx,fy,width);\
            len_Mxx =  bicubiclensKernel(Minv_xx,fx,fy,width);\
            len_Myy =  bicubiclensKernel(Minv_yy,fx,fy,width);\
            len_Mxy =  bicubiclensKernel(Minv_xy,fx,fy,width);\
            len_Myx =  bicubiclensKernel(Minv_yx,fx,fy,width);\
            ex[i] += len_Mxx * ex_len_dx + len_Mxy * ey_len_dy;\
            ey[i] += len_Myx * ex_len_dx + len_Myy * ey_len_dy;\
            }\
        }\
        "

    width = int(s0)
    for _ in range(NR_iter):
        weave.inline(iterate, [
            'ex', 'ey', 'dx', 'dy', 'Minv_xx', 'Minv_yy', 'Minv_xy', 'Minv_yx',
            'width'
        ],
                     headers=[header])
    return ex * rmin1, ey * rmin0
Esempio n. 4
0
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False):
    """
    Lens the input unlensed alms on the GPU using the pyCUDA interface.

    Will probably crash for too large maps, with need to split the job.
    Works for 4096 x 4096 at least on my laptop.

    Cost dominated by texture setup. # FIXME : try get rid of texture
    Note that the first call might be substantially slower than subsequent
    calls, as it caches the fft and ifft plans for subsequent usage.

    The kernel variant (no texture, a single texture for the map, or three
    textures for map and displacements) is selected by the module-level
    ``texture_count`` (0, 1 or 3); timings are printed when the module-level
    ``timed`` is true.

    :param lib_alm: alm library; ``lib_alm.ell_mat.shape`` fixes the map shape,
        which must be square, a power of two and a multiple of ``GPU_block[0]``
    :param unlalm: unlensed alms, turned into a map with ``lib_alm.alm2rfft``
    :param dx_gu: x displacement in grid units (``f.get_dx_ingridunits()`` e.g.).
        Can be path to array, array or memmap (anything ``load_map`` accepts).
    :param dy_gu: y displacement in grid units, same conventions as ``dx_gu``
    :param do_not_prefilter: skip the spline prefiltering step if set
    :return: lensed map, float64 array of shape ``lib_alm.ell_mat.shape``
    :raises AssertionError: on unsupported shapes or ``texture_count``
    """
    if timed:
        ti = time.time()
    shape = lib_alm.ell_mat.shape
    # Floor division: kernel arguments and grid sizes must be integers.
    rshape = (shape[0], shape[1] // 2 + 1)
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    # Resolve the displacement inputs once (they may be paths to files).
    dx_map = load_map(dx_gu)
    dy_map = load_map(dy_gu)
    assert dx_map.shape == shape, (dx_map.shape, shape)
    assert dy_map.shape == shape, (dy_map.shape, shape)

    assert np.all(np.array(shape) % GPU_block[0] == 0), shape
    if shape[0] > 4096:
        print("--- Exercise caution, array shapes larger than 4096 have never been tested so far ---")

    GPU_grid = (shape[0] // GPU_block[0], shape[1] // GPU_block[1], 1)

    # Prefiltering forces the interpolant to pass through the samples and increase accuracy, but dominates the cost.
    rfft2_unlCMB_gpu = gpuarray.to_gpu(
        lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64))
    coeffs_gpu = gpuarray.empty(shape, dtype=np.float32)
    plan, plan_inv = get_rfft_plans(shape)

    if not do_not_prefilter:
        # The prefilter makes sure the spline is exact at the nodes.
        # Uncomments this to put coeffs_gpu on pitched memory to allow later for 2D texture binding :
        # alloc,pitch  = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32
        wx = (6. / (2. * np.cos(
            2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.))
        wx_gpu = gpuarray.to_gpu(wx.astype(np.float32))
        prefilter = CUDA_module.get_function("cf_outer_w")
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        del wx_gpu

    ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)

    # Binding arrays to textures and getting lensing func. ``kernel_args``
    # collects the device arrays the selected kernel takes between the output
    # map and the map width.
    if texture_count == 0:
        lens_func = CUDA_module.get_function("bicubiclensKernel_notex")
        tex_refs = []
        kernel_args = [coeffs_gpu,
                       gpuarray.to_gpu(dx_map.astype(np.float32)),
                       gpuarray.to_gpu(dy_map.astype(np.float32))]
    elif texture_count == 1:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        tex_refs = [unl_CMB_tex]
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        kernel_args = [gpuarray.to_gpu(dx_map.astype(np.float32)),
                       gpuarray.to_gpu(dy_map.astype(np.float32))]
        lens_func = CUDA_module.get_function(
            "bicubiclensKernel_normtex_singletex")
    elif texture_count == 3:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        dx_tex = CUDA_module.get_texref("tex_dx")
        dy_tex = CUDA_module.get_texref("tex_dy")
        tex_refs = [unl_CMB_tex, dx_tex, dy_tex]
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        cuda.matrix_to_texref(dx_map.astype(np.float32), dx_tex, order="C")
        cuda.matrix_to_texref(dy_map.astype(np.float32), dy_tex, order="C")
        kernel_args = []
        lens_func = CUDA_module.get_function("bicubiclensKernel_normtex")
    else:
        # Raised explicitly (rather than ``assert 0``) so that the check is
        # not stripped under ``python -O``, which would return an
        # uninitialised map.
        raise AssertionError(
            "unsupported texture_count: %s" % (texture_count,))

    # Wraping, important for periodic boundary conditions.
    # Note that WRAP has not effect for unnormalized texture coordinates.
    for tex_ref in tex_refs:
        tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
        tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
        tex_ref.set_filter_mode(cuda.filter_mode.POINT)
        tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if timed:
        t0 = time.time()

    len_CMB = np.empty(shape, dtype=np.float32)
    kernel_args.append(np.int32(shape[0]))
    lens_func(cuda.Out(len_CMB),
              *kernel_args,
              block=GPU_block,
              grid=GPU_grid,
              texrefs=tex_refs)

    if timed:
        dt = time.time() - t0
        t_tot = time.time() - ti
        print("     GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % (
            np.prod(shape) / 1e6 / dt, dt))
        print(" Total ex. time at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / t_tot, t_tot))
    return len_CMB.astype(np.float64)
Esempio n. 5
0
def apply_cond3_GPU_inplace(type,
                            lib_alm_dat,
                            alms_unlCMB,
                            f,
                            f_inv,
                            cls_unl,
                            cl_transf,
                            cls_noise,
                            func='bicubic',
                            double_precision_ffts=False):
    """
    Apply conditioner 3 to ``alms_unlCMB`` on the GPU, in place.

    cond3 is F D^-t (B xi B^t + N)D^-1 F^t

    First pass, for each input field: spline prefilter, inverse rfft, lensing
    with the ``f_inv`` displacement, forward rfft, then accumulation into
    host-side alms weighted by the matrix returned by ``get_Pmat`` (called
    with ``inverse=True``). Second pass, for each output field: spline
    prefilter, inverse rfft, lensing with the ``f`` displacement, multiplication
    with the magnification, forward rfft; the result overwrites
    ``alms_unlCMB[field]``.

    Note that the first call might be substantially slower than subsequent
    calls, as it caches the fft and ifft plans for subsequent usage, if not
    already in the fft plans (See __init__.py)

    :param type: 'T', 'QU' or 'TQU'
    :param lib_alm_dat: alm library of the alms
    :param alms_unlCMB: ffs_alms of shape (len(type), lib_alm_dat.alm_size);
        overwritten with the result
    :param f: forward displacement (provides ``get_dx_ingridunits`` / ``get_dy_ingridunits``)
    :param f_inv: inverse displacement, same interface as ``f``
    :param cls_unl: unlensed CMB cls dictionary (used in get_Pmat)
    :param cl_transf: transfer function, passed on to get_Pmat
    :param cls_noise: noise spectra, passed on to get_Pmat
    :param func: 'bicubic' or 'bilinear'
    :param double_precision_ffts: use complex128 / float64 FFTs instead of complex64 / float32
    :return: None; the output is written into ``alms_unlCMB``
    """
    if timed:
        ti = time.time()

    assert func in ['bicubic', 'bilinear'], func
    assert alms_unlCMB.shape == (len(type), lib_alm_dat.alm_size)

    # Useful declarations :
    nfields = len(type)
    rshape = lib_alm_dat.ell_mat.rshape
    shape = (rshape[0], 2 * (rshape[1] - 1))
    flat_shape = np.prod(shape)
    single_prec = not double_precision_ffts

    # Floor division: grid dimensions must be integers on Python 2 and 3 alike.
    GPU_grid = (shape[0] // GPU_block[0], shape[1] // GPU_block[1], 1)
    launch = dict(block=GPU_block, grid=GPU_grid)

    assert shape[0] % GPU_block[0] == 0, shape
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert f.shape == shape, (f.shape, shape)
    assert f_inv.shape == shape, (f_inv.shape, shape)
    assert f.lsides == lib_alm_dat.ell_mat.lsides, (f.lsides,
                                                    lib_alm_dat.ell_mat.lsides)
    assert f_inv.lsides == lib_alm_dat.ell_mat.lsides, (
        f_inv.lsides, lib_alm_dat.ell_mat.lsides)
    assert np.all(np.array(shape) % GPU_block[0] == 0), shape

    if shape[0] > 4096:
        print("--- Exercise caution, array shapes larger than 4096 have never been tested so far ---")

    unlPmat = get_Pmat(type,
                       lib_alm_dat,
                       cls_unl,
                       cl_transf=cl_transf,
                       cls_noise=cls_noise,
                       inverse=True)

    # 2D texture references :
    unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
    dx_tex = CUDA_module.get_texref("tex_dx")
    dy_tex = CUDA_module.get_texref("tex_dy")

    # loading fft plans :
    plan, plan_inv = get_rfft_plans(shape,
                                    double_precision=double_precision_ffts)
    # Function references :
    prefilter = CUDA_module.get_function(
        "cf_outer_w" if single_prec else "cdd_outer_w")
    lens_func = CUDA_module.get_function("%slensKernel_normtex" % func)
    magn_func = CUDA_module.get_function("detmagn_normtex")

    cplx_type = np.complex64 if single_prec else np.complex128
    f_type = np.float32 if single_prec else np.float64

    # We will store in host memory some maps for convenience
    temp_alms = np.zeros((nfields, lib_alm_dat.alm_size), dtype=cplx_type)

    # Setting up the texture references to the displacement
    # (This is what  contribute most to the cost actually)
    setup_texture_nparr(dx_tex, f_inv.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f_inv.get_dy_ingridunits())
    # Building spline coefficients (1 / shape[0] comes from ifft convention)
    wx_gpu = (6. / (2. * np.cos(
        2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.) /
              shape[0])
    wx_gpu = gpuarray.to_gpu(wx_gpu.astype(f_type))
    coeffs_gpu = gpuarray.empty(shape, dtype=f_type, order='C')

    def _bind_map_texture():
        # Bind the prefiltered map to the texture. The lensing kernels write
        # float32 output, so the texture is fed a float32 copy in double
        # precision (NOTE(review): mirrors apply_FDxiDtFt_GPU_inplace_timed;
        # confirm setup_texture_gpuarr does not already cast). coeffs_gpu
        # itself keeps the dtype expected by the fft plans.
        tex_src = coeffs_gpu if single_prec else coeffs_gpu.astype(np.float32)
        setup_texture_gpuarr(unl_CMB_tex, tex_src)

    for _f in range(nfields):
        # Multiplying with the spline coefficients and Fourier transforming
        rfft2_unlCMB_gpu = gpuarray.to_gpu(
            lib_alm_dat.alm2rfft(alms_unlCMB[_f]).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  **launch)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # coeffs_gpu now contains the prefiltered map to be now interpolated

        # Now interpolation with inverse displacement.
        _bind_map_texture()
        lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex],
                  **launch)
        if not single_prec:
            lenCMB_gpu = lenCMB_gpu.astype(f_type)

        # Back to Fourier space :
        rfft2_unlCMB_gpu = gpuarray.empty(rshape, dtype=cplx_type, order='C')
        fft(lenCMB_gpu, rfft2_unlCMB_gpu, plan)

        # We construct the map P_ij m_j which we will have to lens afterwards.
        # To be GPU memory friendly these maps are in the host memory.
        # Pull the result from the GPU once per input field, not once per
        # output field.
        lensed_alm = lib_alm_dat.rfftmap2alm(rfft2_unlCMB_gpu.get())
        for _g in range(nfields):
            temp_alms[_g] += lensed_alm * unlPmat[:, _g, _f]

    # We now lens and then fft each map, and return.
    # We lens now with the forward displacement :
    setup_texture_nparr(dx_tex, f.get_dx_ingridunits())
    setup_texture_nparr(dy_tex, f.get_dy_ingridunits())
    for _g in range(nfields):
        rfft2_unlCMB_gpu = gpuarray.to_gpu(
            lib_alm_dat.alm2rfft(temp_alms[_g]).astype(cplx_type))
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  **launch)
        ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)
        # Lensing by forward displacement, and multiplication by magnification :
        _bind_map_texture()
        lenCMB_gpu = gpuarray.empty(shape, dtype=np.float32, order='C')
        lens_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  texrefs=[unl_CMB_tex, dx_tex, dy_tex],
                  **launch)
        magn_func(lenCMB_gpu,
                  np.int32(shape[0]),
                  np.int32(flat_shape),
                  texrefs=[dx_tex, dy_tex],
                  **launch)
        if not single_prec:
            lenCMB_gpu = lenCMB_gpu.astype(f_type)
        # lenCMB_gpu is now the lensed and magnified map. Turn this to Fourier space :
        fft(lenCMB_gpu, rfft2_unlCMB_gpu, plan)
        # Pulling result from GPU to CPU
        alms_unlCMB[_g] = lib_alm_dat.rfftmap2alm(
            rfft2_unlCMB_gpu.get().astype(np.complex128))
    if timed:
        dt = time.time() - ti
        print("GPU TQU did conditioner 3 at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / dt, dt))
    return