Exemple #1
0
    def setup(self, size, units, lam=0.5, n0=1.0, use_fresnel_approx=False):
        """
            sets up the internal variables e.g. propagators etc...

            calls Bpm3d_Base.setup first, then builds the OpenCL program,
            uploads the propagator and the weight arrays to the device,
            allocates one float32 entry per z slice for the scatter cross
            section and the g factor and creates the reduction kernel

            :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
            :param units: the physical units of each voxel in microns (dx,dy,dz)
            :param lam: the wavelength of light in microns
            :param n0:  the refractive index of the surrounding media
            :param use_fresnel_approx:  if True, uses fresnel approximation for propagator


        """
        Bpm3d_Base.setup(self, size, units, lam=lam, n0=n0, use_fresnel_approx=use_fresnel_approx)

        # setting up the gpu buffers and kernels
        self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        # Nz has to be unpacked as well: it sizes the per-slice buffers below
        # (before, only Nx and Ny were bound and Nz raised a NameError)
        Nx, Ny, Nz = self.size[:3]
        # NOTE(review): this plan is built from an empty shape and is never
        # used inside this method -- confirm whether fft_plan((Ny, Nx)) was meant
        plan = fft_plan(())
        # presumably self._H and the weight arrays are prepared by
        # Bpm3d_Base.setup -- verify against the base class
        self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

        self.scatter_weights_g = OCLArray.from_array(self.scatter_weights.astype(np.float32))
        self.gfactor_weights_g = OCLArray.from_array(self.gfactor_weights.astype(np.float32))

        # one accumulator entry per z slice
        self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        self.gfactor_g = OCLArray.zeros(Nz, "float32")

        # sums weights[i]*|field[i] - plain*(i==0)|^2 over all elements,
        # i.e. "plain" is subtracted from the element at index 0 only
        self.reduce_kernel = OCLReductionKernel(
            np.float32,
            neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain",
        )
Exemple #2
0
    def _setup_gpu(self):
        """creates the device handles, buffers, fft plan and kernels

        stores queue and context of the current device, allocates the
        (Ny, Nx) work buffers and the intensity accumulators, binds the
        kernels of kernels/bpm_3d_kernels.cl to attributes and finally
        calls self._fill_propagator(self.n0)
        """
        device = get_device()
        self._queue = device.queue
        self._ctx = device.context
        program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        # the buffers/ images
        Nx, Ny = self.simul_xy
        # not used further down, but the unpacking still requires
        # self.shape[:2] to hold exactly two entries
        _Nx0, _Ny0 = self.shape[:2]

        plane_shape = (Ny, Nx)
        self._plan = fft_plan(plane_shape, **self.fftplan_kwargs)
        self._buf_plane = OCLArray.empty(plane_shape, np.complex64)
        self._buf_H = OCLArray.empty(plane_shape, np.complex64)
        self._img_xy = OCLImage.empty(plane_shape,
                                      dtype=np.float32,
                                      num_channels=2)

        # buffer for the weighted dn average
        slab_shape = (1, Ny, Nx)
        self.intens_g = OCLArray.empty(slab_shape, dtype=Bpm3d._real_type)
        self.intens_dn_g = OCLArray.empty(slab_shape, dtype=Bpm3d._real_type)
        self.intens_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)
        self.intens_dn_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)

        # the propagator kernels take one untyped argument followed by five
        # float32 scalars (the buffer variant has two more untyped arguments)
        five_floats = (np.float32, ) * 5

        propagator = program.compute_propagator
        self._kernel_compute_propagator = propagator
        propagator.set_scalar_arg_dtypes((None, ) + five_floats)

        propagator_buf = program.compute_propagator_buf
        self._kernel_compute_propagator_buf = propagator_buf
        propagator_buf.set_scalar_arg_dtypes((None, ) + five_floats +
                                             (None, ) * 2)

        # attribute name -> kernel name inside the OpenCL program
        kernel_bindings = (
            ("_kernel_mult_complex", "mult"),
            ("_kernel_im_to_buf_field", "img_to_buf_field"),
            ("_kernel_im_to_buf_intensity", "img_to_buf_intensity"),
            ("_kernel_im_to_im_intensity", "img_to_img_intensity"),
            ("_kernel_buf_to_buf_field", "buf_to_buf_field"),
            ("_kernel_buf_to_buf_intensity", "buf_to_buf_intensity"),
            ("_kernel_mult_dn_img_float", "mult_dn_image"),
            ("_kernel_mult_dn_buf_float", "mult_dn"),
            ("_kernel_mult_dn_img_complex", "mult_dn_image_complex"),
            ("_kernel_mult_dn_buf_complex", "mult_dn_complex"),
            ("_kernel_mult_dn_img_float_local", "mult_dn_image_local"),
            ("_kernel_mult_dn_buf_float_local", "mult_dn_local"),
            ("_kernel_mult_dn_img_complex_local",
             "mult_dn_image_complex_local"),
            ("_kernel_mult_dn_buf_complex_local", "mult_dn_complex_local"),
        )
        for attr_name, kernel_name in kernel_bindings:
            setattr(self, attr_name, getattr(program, kernel_name))

        # sums two float buffers in a single pass
        self._kernel_reduction = OCLMultiReductionKernel(
            np.float32,
            neutral="0",
            reduce_expr="a+b",
            map_exprs=["a[i]", "b[i]"],
            arguments="__global float *a, __global float *b")

        self._fill_propagator(self.n0)
Exemple #3
0
def time_gpu(dshape, niter=100, fast_math=False):
    """times the in-place gpu fft of a complex64 buffer of shape dshape

    the device queue is finished before the clock starts and again before
    it stops, then the mean time per transform is printed in milliseconds

    :param dshape: shape of the buffer to transform
    :param niter: number of transforms that are averaged
    :param fast_math: passed on to fft_plan
    :return: the mean time per transform in seconds
    """
    d_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)
    t = time()
    # range/print() instead of xrange/print statement: the old forms do not
    # even parse under Python 3
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    get_device().queue.finish()
    t = (time() - t) / niter
    print("GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t))
    return t
def time_gpu(dshape, niter=100, fast_math=False):
    """measures the mean duration of an in-place gpu fft

    a complex64 buffer of shape dshape is transformed niter times; the
    device queue is finished before the clock starts and before it stops

    :param dshape: shape of the buffer to transform
    :param niter: number of transforms that are averaged
    :param fast_math: passed on to fft_plan
    :return: the mean time per transform in seconds (also printed in ms)
    """
    buffer_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()
    gpu_plan = fft_plan(dshape, fast_math=fast_math)

    started = time()
    for _ in range(niter):
        fft(buffer_g, inplace=True, plan=gpu_plan)
    get_device().queue.finish()
    seconds_per_fft = (time() - started) / niter

    report = "GPU (fast_math = %s)\t%s\t\t%.2f ms" % (
        fast_math, dshape, 1000. * seconds_per_fft)
    print(report)
    return seconds_per_fft
Exemple #5
0
def _deconv_rl_np_fft(data, h, Niter = 10,
                h_is_fftshifted = False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape


    via lucy richardson deconvolution

    :param data: the numpy array to deconvolve
    :param h: the psf, same shape as data
    :param Niter: number of iterations
    :param h_is_fftshifted: if False, h is passed through np.fft.fftshift first
    :return: np.abs of the final estimate, as numpy array
    :raises ValueError: if data and h differ in shape
    """

    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)


    # reverses the first two axes only
    # NOTE(review): for input with more than two axes the remaining axes
    # stay unflipped -- confirm that only 2d data is expected here
    hflip = h[::-1,::-1]

    #set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))

    tmp_g = OCLArray.empty(data.shape,np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    # hflipped_g = OCLArray.from_array(h.astype(np.complex64))

    # NOTE(review): the plan is created but not handed to any call below
    plan = fft_plan(data.shape)

    #transform psf
    fft(hf_g,inplace = True)
    fft(hflip_f_g,inplace = True)

    for i in range(Niter):
        # print() call instead of the print statement, which is a syntax
        # error under Python 3
        print(i)
        fft_convolve(u_g, hf_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)

        # presumably leaves y/tmp in tmp_g -- verify against the helper
        _complex_divide_inplace(y_g,tmp_g)

        fft_convolve(tmp_g,hflip_f_g,
                     inplace = True,
                     kernel_is_fft = True)

        # presumably scales the estimate u_g by tmp_g -- verify against the helper
        _complex_multiply_inplace(u_g,tmp_g)


    return np.abs(u_g.get())
Exemple #6
0
    def _setup_gpu(self):
        """prepares everything that lives on the gpu

        keeps queue and context of the current device, creates the fft plan
        and the (Ny, Nx) buffers/image, the intensity accumulators and the
        kernel handles taken from kernels/bpm_3d_kernels.cl, and ends with a
        call to self._fill_propagator(self.n0)
        """
        gpu = get_device()
        self._queue = gpu.queue
        self._ctx = gpu.context
        cl_prog = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        # the buffers/ images
        Nx, Ny = self.simul_xy
        # unused below; kept because the unpacking checks self.shape[:2]
        _unused_nx0, _unused_ny0 = self.shape[:2]

        yx_shape = (Ny, Nx)
        self._plan = fft_plan(yx_shape, **self.fftplan_kwargs)
        self._buf_plane = OCLArray.empty(yx_shape, np.complex64)
        self._buf_H = OCLArray.empty(yx_shape, np.complex64)
        self._img_xy = OCLImage.empty(yx_shape, dtype=np.float32, num_channels=2)

        # buffer for the weighted dn average
        single_slice = (1, Ny, Nx)
        self.intens_g = OCLArray.empty(single_slice, dtype=Bpm3d._real_type)
        self.intens_dn_g = OCLArray.empty(single_slice, dtype=Bpm3d._real_type)
        self.intens_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)
        self.intens_dn_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)

        # the kernels: both propagator kernels get one untyped argument and
        # five float32 scalars, the buffer variant two more untyped arguments
        scalar_floats = (np.float32,)*5

        kernel = cl_prog.compute_propagator
        self._kernel_compute_propagator = kernel
        kernel.set_scalar_arg_dtypes((None,)+scalar_floats)

        kernel_buf = cl_prog.compute_propagator_buf
        self._kernel_compute_propagator_buf = kernel_buf
        kernel_buf.set_scalar_arg_dtypes((None,)+scalar_floats+(None,)*2)

        # (attribute suffix, kernel name in the OpenCL program)
        plain_kernels = [
            ("mult_complex", "mult"),
            ("im_to_buf_field", "img_to_buf_field"),
            ("im_to_buf_intensity", "img_to_buf_intensity"),
            ("im_to_im_intensity", "img_to_img_intensity"),
            ("buf_to_buf_field", "buf_to_buf_field"),
            ("buf_to_buf_intensity", "buf_to_buf_intensity"),
            ("mult_dn_img_float", "mult_dn_image"),
            ("mult_dn_buf_float", "mult_dn"),
            ("mult_dn_img_complex", "mult_dn_image_complex"),
            ("mult_dn_buf_complex", "mult_dn_complex"),
            ("mult_dn_img_float_local", "mult_dn_image_local"),
            ("mult_dn_buf_float_local", "mult_dn_local"),
            ("mult_dn_img_complex_local", "mult_dn_image_complex_local"),
            ("mult_dn_buf_complex_local", "mult_dn_complex_local"),
        ]
        for suffix, cl_name in plain_kernels:
            setattr(self, "_kernel_" + suffix, getattr(cl_prog, cl_name))

        # reduces two float buffers by summation in one pass
        self._kernel_reduction = OCLMultiReductionKernel(
            np.float32,
            neutral="0",
            reduce_expr="a+b",
            map_exprs=["a[i]", "b[i]"],
            arguments="__global float *a, __global float *b",
        )

        self._fill_propagator(self.n0)
Exemple #7
0
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """ lucy richardson deconvolution of data with the psf (kernel) h

    data and h have to be same shape, otherwise a ValueError is raised

    h is fftshifted first unless h_is_fftshifted is set; the result is the
    absolute value of the estimate after Niter iterations (numpy array)
    """
    shape = data.shape
    if shape != h.shape:
        raise ValueError("data and h have to be same shape")

    psf = h if h_is_fftshifted else np.fft.fftshift(h)
    # reversed along the first two axes
    psf_flipped = psf[::-1, ::-1]

    # gpu buffers: measured data, running estimate and a scratch buffer
    measured_g = OCLArray.from_array(data.astype(np.complex64))
    estimate_g = OCLArray.from_array(data.astype(np.complex64))
    scratch_g = OCLArray.empty(shape, np.complex64)

    psf_f_g = OCLArray.from_array(psf.astype(np.complex64))
    psf_flipped_f_g = OCLArray.from_array(psf_flipped.astype(np.complex64))

    # created as in the original code, although no call below receives it
    unused_plan = fft_plan(shape)

    # both kernels are kept in fourier space
    for kernel_g in (psf_f_g, psf_flipped_f_g):
        fft(kernel_g, inplace=True)

    for iteration in range(Niter):
        logger.info("Iteration: {}".format(iteration))

        # blur the current estimate into the scratch buffer
        fft_convolve(estimate_g, psf_f_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(measured_g, scratch_g)

        # correlate with the flipped psf and update the estimate
        fft_convolve(scratch_g, psf_flipped_f_g, inplace=True,
                     kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    estimate = estimate_g.get()
    return np.abs(estimate)
Exemple #8
0
def _deconv_rl_gpu_fft(data_g, h_g, Niter = 10):
    """
    using fft_convolve

    lucy richardson iterations on device buffers

    :param data_g: device buffer holding the data
    :param h_g: device buffer holding the psf, same shape as data_g; it is
        passed to fft(..., inplace=True) below, so presumably the caller's
        buffer holds the transformed psf afterwards
    :param Niter: number of iterations
    :return: the device buffer holding the estimate
    :raises ValueError: if data_g and h_g differ in shape
    """


    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")


    #set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape,np.complex64)

    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape,np.complex64)

    #fix this
    # the flip takes a round trip through host memory and reverses the
    # first two axes only
    hflip_g = OCLArray.from_array((h_g.get()[::-1,::-1]).copy())

    # NOTE(review): the plan is created but not handed to any call below
    plan = fft_plan(data_g.shape)

    #transform psf
    fft(h_g,inplace = True)
    fft(hflip_g,inplace = True)

    for i in range(Niter):
        # print() call instead of the print statement, which is a syntax
        # error under Python 3
        print(i)
        fft_convolve(u_g, h_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)


        # presumably leaves data/tmp in tmp_g -- verify against the helper
        _complex_divide_inplace(data_g,tmp_g)


        fft_convolve(tmp_g,hflip_g,
                     inplace = True,
                     kernel_is_fft = True)

        _complex_multiply_inplace(u_g,tmp_g)

    return u_g
Exemple #9
0
def get_gpu(N = 256, niter=100, sig = 1.):
    """relative error of repeated gpu fft / inverse fft round trips

    a seeded (N,N) gaussian random array is transformed forth and back
    niter times on the device; after every round trip the maximal absolute
    deviation from the initial array, divided by the maximal absolute value
    of the initial array, is recorded

    :return: numpy array with one relative error per round trip
    """
    np.random.seed(0)
    reference = np.random.normal(0,sig,(N,N)).astype(np.complex64)
    start = (1.*reference.copy()).astype(np.complex64)

    spectrum_g = OCLArray.empty_like(start)
    signal_g = OCLArray.from_array(start)
    gpu_plan = fft_plan((N,N), fast_math = False)

    errors = []
    for _ in range(niter):
        # forward into the spectrum buffer, inverse back into the signal buffer
        fft(signal_g,res_g = spectrum_g, plan = gpu_plan)
        fft(spectrum_g, res_g = signal_g, inverse = True, plan = gpu_plan)

        deviation = np.amax(np.abs(reference-signal_g.get()))
        errors.append(deviation/np.amax(np.abs(reference)))

    return np.array(errors)
Exemple #10
0
def get_gpu(N=256, niter=100, sig=1.):
    """tracks how the error grows over gpu fft / inverse fft round trips

    starts from a seeded (N, N) gaussian random array; every iteration
    applies a forward and an inverse transform on the device and stores
    max|a - current| / max|a| with a being the initial array

    :return: numpy array holding the niter relative errors
    """
    np.random.seed(0)
    a = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    initial = (1. * a.copy()).astype(np.complex64)

    freq_g = OCLArray.empty_like(initial)
    data_g = OCLArray.from_array(initial)
    plan_2d = fft_plan((N, N), fast_math=False)

    def _round_trip_error():
        """runs one forward/inverse pair and returns the relative error"""
        fft(data_g, res_g=freq_g, plan=plan_2d)
        fft(freq_g, res_g=data_g, inverse=True, plan=plan_2d)
        return np.amax(np.abs(a - data_g.get())) / np.amax(np.abs(a))

    return np.array([_round_trip_error() for _ in range(niter)])
Exemple #11
0
    def _setup_impl(self):
        """builds the OpenCL program and moves the working data to the gpu

        creates the (Ny, Nx) fft plan, uploads the propagator, dn (only if
        it is set and n_volumes == 1) and both weight arrays, and allocates
        zeroed float32 buffers with one entry per z slice
        """
        self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        n_x, n_y, n_z = self.size
        self._plan = fft_plan((n_y, n_x))

        propagator = self._H.astype(np.complex64)
        self._H_g = OCLArray.from_array(propagator)

        if self.dn is not None and self.n_volumes == 1:
            self.dn_g = OCLArray.from_array(self.dn)

        # float32 device copies of the weights
        scatter_w = self.scatter_weights.astype(np.float32)
        self.scatter_weights_g = OCLArray.from_array(scatter_w)
        gfactor_w = self.gfactor_weights.astype(np.float32)
        self.gfactor_weights_g = OCLArray.from_array(gfactor_w)

        # per-slice result buffers
        self.scatter_cross_sec_g = OCLArray.zeros(n_z, "float32")
        self.gfactor_g = OCLArray.zeros(n_z, "float32")
Exemple #12
0
    def _setup_impl(self):
        """allocates the gpu side of the simulation

        compiles kernels/bpm_3d_kernels.cl, plans the (Ny,Nx) fft, copies
        the propagator, the optional dn volume and the weights to the
        device and creates the per-slice output buffers
        """
        self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        Nx, Ny, Nz = self.size
        self._plan = fft_plan((Ny, Nx))

        self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

        # dn is only uploaded when it exists and a single volume is used
        has_dn = self.dn is not None
        if has_dn and self.n_volumes == 1:
            self.dn_g = OCLArray.from_array(self.dn)

        # upload "<name>" as float32 into "<name>_g"
        for weight_name in ("scatter_weights", "gfactor_weights"):
            host_weights = getattr(self, weight_name).astype(np.float32)
            setattr(self, weight_name + "_g",
                    OCLArray.from_array(host_weights))

        # one float32 entry per z slice, initialised with zero
        self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        self.gfactor_g = OCLArray.zeros(Nz, "float32")
Exemple #13
0
    def __init__(self,
                 psf: np.ndarray,
                 psf_is_fftshifted: bool = False,
                 n_iter=10):
        """ prepare the gpu buffers for deconvolving with the given psf

        :param psf: the point spread function; its shape becomes self.shape
        :param psf_is_fftshifted: if False, psf is passed through
            np.fft.fftshift first
        :param n_iter: stored as self.n_iter
        """
        self.shape = psf.shape
        centered = psf if psf_is_fftshifted else np.fft.fftshift(psf)

        self.n_iter = n_iter

        # the psf reversed along its first two axes is kept next to the psf
        # NOTE(review): only two axes are reversed, so a 3d psf keeps its
        # last axis unflipped -- confirm whether 3d input is supported
        mirrored = centered[::-1, ::-1]

        self.psf_g = OCLArray.from_array(centered.astype(np.complex64))
        self.psfflip_f_g = OCLArray.from_array(mirrored.astype(np.complex64))
        self.plan = fft_plan(self.shape)

        # both kernels are transformed once, in place
        for kernel_g in (self.psf_g, self.psfflip_f_g):
            fft(kernel_g, inplace=True)

        # scratch buffer of the same shape as the psf
        self.tmp_g = OCLArray.empty(centered.shape, np.complex64)
Exemple #14
0
    def setup(self, size, units, lam = .5, n0 = 1.,
              use_fresnel_approx = False):
        """
            sets up the internal variables e.g. propagators etc...

            calls Bpm3d_Base.setup first, then builds the OpenCL program,
            uploads the propagator and the weight arrays to the device,
            allocates one float32 entry per z slice for the scatter cross
            section and the g factor and creates the reduction kernel

            :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
            :param units: the physical units of each voxel in microns (dx,dy,dz)
            :param lam: the wavelength of light in microns
            :param n0:  the refractive index of the surrounding media
            :param use_fresnel_approx:  if True, uses fresnel approximation for propagator


        """
        Bpm3d_Base.setup(self,size, units, lam = lam, n0 = n0,
              use_fresnel_approx = use_fresnel_approx)

        #setting up the gpu buffers and kernels
        self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        # Nz has to be unpacked as well: it sizes the per-slice buffers below
        # (before, only Nx and Ny were bound and Nz raised a NameError)
        Nx, Ny, Nz = self.size[:3]
        # NOTE(review): this plan is built from an empty shape and is never
        # used inside this method -- confirm whether fft_plan((Ny, Nx)) was meant
        plan = fft_plan(())
        # presumably self._H and the weight arrays are prepared by
        # Bpm3d_Base.setup -- verify against the base class
        self._H_g = OCLArray.from_array(self._H.astype(np.complex64))


        self.scatter_weights_g = OCLArray.from_array(self.scatter_weights.astype(np.float32))
        self.gfactor_weights_g = OCLArray.from_array(self.gfactor_weights.astype(np.float32))

        # one accumulator entry per z slice
        self.scatter_cross_sec_g = OCLArray.zeros(Nz,"float32")
        self.gfactor_g = OCLArray.zeros(Nz,"float32")



        # sums weights[i]*|field[i] - plain*(i==0)|^2 over all elements,
        # i.e. "plain" is subtracted from the element at index 0 only
        self.reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")
Exemple #15
0
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """
    lucy richardson iterations on device buffers, using fft_convolve

    data_g and h_g have to be same shape (ValueError otherwise); h_g is
    handed to fft(..., inplace=True); the device buffer with the estimate
    is returned
    """
    shape = data_g.shape
    if shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    # running estimate, initialised with the data
    estimate_g = OCLArray.empty(shape, np.complex64)
    estimate_g.copy_buffer(data_g)

    scratch_g = OCLArray.empty(shape, np.complex64)

    #fix this
    # the flipped psf is built on the host (first two axes reversed)
    flipped_host = h_g.get()[::-1, ::-1].copy()
    h_flipped_g = OCLArray.from_array(flipped_host)

    # created as in the original code, although no call below receives it
    unused_plan = fft_plan(shape)

    # keep both kernels in fourier space
    for kernel_g in (h_g, h_flipped_g):
        fft(kernel_g, inplace=True)

    iteration = 0
    while iteration < Niter:
        logger.info("Iteration: {}".format(iteration))

        fft_convolve(estimate_g, h_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(data_g, scratch_g)

        fft_convolve(scratch_g, h_flipped_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

        iteration += 1

    return estimate_g
def _convolve_spatial2(im, hs,
                      mode = "constant",
                      grid_dim = None,
                      pad_factor = 2,
                      plan = None,
                      return_plan = False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0


    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    if grid_dim is given it is used as (Gy,Gx) and hs is uploaded as a
    single float32 array from which the psf patches are cut on the device
    (NOTE(review): presumably hs is then one image of shape (Ny,Nx) --
    confirm against the fill_psf_grid2 kernel)

    pad_factor scales the block size before it is rounded up to the patch
    size via _next_power_of_2

    plan can be a precomputed fft plan for (Npatch_y,Npatch_x); with
    return_plan = True the tuple (result, plan) is returned instead of
    the result alone
    """

    if grid_dim:
        Gs = tuple(grid_dim)
    else:
        Gs = hs.shape[:2]


    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = Gs


    # the size of each block within the grid
    # (floor division: sizes are used as kernel work sizes and have to stay
    # integers, "/" yields floats under Python 3)
    Nblock_y, Nblock_x = Ny//Gy, Nx//Gx


    # the size of the overlapping patches with safety padding
    Npatch_x, Npatch_y = _next_power_of_2(pad_factor*Nblock_x), _next_power_of_2(pad_factor*Nblock_y)


    prog = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y,Npatch_x))

    # origin of every block along x and y
    x0s = Nblock_x*np.arange(Gx)
    y0s = Nblock_y*np.arange(Gy)


    patches_g = OCLArray.empty((Gy,Gx,Npatch_y,Npatch_x),np.complex64)

    #prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros((Gy,Gx,Npatch_y,Npatch_x),np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy = False))
        for i,_x0 in enumerate(x0s):
            for j,_y0 in enumerate(y0s):
                prog.run_kernel("fill_psf_grid2",
                                (Nblock_x,Nblock_y),None,
                        tmp_g.data,
                        np.int32(Nx),
                        np.int32(i*Nblock_x),
                        np.int32(j*Nblock_y),
                        h_g.data,
                        np.int32(Npatch_x),
                        np.int32(Npatch_y),
                        np.int32(-Nblock_x//2+Npatch_x//2),
                        np.int32(-Nblock_y//2+Npatch_y//2),
                        # offset of patch (j,i) inside the flat buffer
                        np.int32(i*Npatch_x*Npatch_y+j*Gx*Npatch_x*Npatch_y)
                            )
    else:
        hs = np.fft.fftshift(pad_to_shape(hs,(Gy,Gx,Npatch_y,Npatch_x)),axes=(2,3))
        h_g = OCLArray.from_array(hs.astype(np.complex64))


    #prepare image
    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    for i,_x0 in enumerate(x0s):
        for j,_y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2",(Npatch_x,Npatch_y),None,
                    im_g,
                    np.int32(_x0+Nblock_x//2-Npatch_x//2),
                    np.int32(_y0+Nblock_y//2-Npatch_y//2),
                    patches_g.data,
                    np.int32(i*Npatch_x*Npatch_y+j*Gx*Npatch_x*Npatch_y))


    #return np.abs(patches_g.get())
    # convolution: multiply patches and psfs in fourier space
    fft(patches_g,inplace=True, batch = Gx*Gy, plan = plan)
    fft(h_g,inplace=True, batch = Gx*Gy, plan = plan)
    prog.run_kernel("mult_inplace",(Npatch_x*Npatch_y*Gx*Gy,),None,
                    patches_g.data, h_g.data)
    fft(patches_g,inplace=True, inverse = True, batch = Gx*Gy, plan = plan)


    print(Nblock_x, Npatch_x)
    #return np.abs(patches_g.get())
    #accumulate
    res_g = OCLArray.empty(im.shape,np.float32)

    # one launch per grid node, hence Gy+1 and Gx+1
    for j in range(Gy+1):
        for i in range(Gx+1):
            prog.run_kernel("interpolate2",(Nblock_x,Nblock_y),None,
                            patches_g.data,res_g.data,
                            np.int32(i),np.int32(j),
                            np.int32(Gx),np.int32(Gy),
                            np.int32(Npatch_x),np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
Exemple #17
0
def _convolve_spatial2(im, hs, mode="constant", grid_dim=None, pad_factor=2,
                       plan=None, return_plan=False):
    """
    convolves a 2d image with a psf that varies over a 2d grid

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    im is cut into (Gy,Gx) blocks and hs[j,i] is the psf at the center of
    block (i,j); every image dimension has to be divisible by the grid
    dimension, i.e. Nx % Gx == 0 and Ny % Gy == 0

    mode:
    "constant" - values outside the image are assumed to be zero
    "wrap" - periodic boundary condition

    grid_dim, if given, is used as (Gy,Gx) and the psf patches are filled
    on the device from hs; pad_factor scales the block size before it is
    rounded up to the patch size; plan may hold a precomputed fft plan and
    return_plan = True returns (result, plan) instead of the result alone
    """
    grid = tuple(grid_dim) if grid_dim else hs.shape[:2]

    address_modes = {"constant": "CLK_ADDRESS_CLAMP",
                     "wrap": "CLK_ADDRESS_REPEAT"}

    n_y, n_x = im.shape
    g_y, g_x = grid

    # the size of each block within the grid
    block_y = n_y // g_y
    block_x = n_x // g_x

    # the size of the overlapping patches with safety padding
    patch_x = _next_power_of_2(pad_factor * block_x)
    patch_y = _next_power_of_2(pad_factor * block_y)
    patch_len = patch_x * patch_y
    stack_shape = (g_y, g_x, patch_y, patch_x)

    build_options = ["-D", "ADDRESSMODE=%s" % address_modes[mode]]
    program = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                         build_options=build_options)

    if plan is None:
        plan = fft_plan(stack_shape, axes=(-2, -1))

    # origin of every block along x and y
    x0s = block_x * np.arange(g_x)
    y0s = block_y * np.arange(g_y)

    patches_g = OCLArray.empty(stack_shape, np.complex64)

    #prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(stack_shape, np.complex64)
        psf_source_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        psf_shift_x = -block_x // 2 + patch_x // 2
        psf_shift_y = -block_y // 2 + patch_y // 2
        for i, _ in enumerate(x0s):
            for j, _ in enumerate(y0s):
                # (i + j * g_x) * patch_len is the start of patch (j,i)
                # inside the flat buffer
                program.run_kernel(
                    "fill_psf_grid2", (block_x, block_y), None,
                    psf_source_g.data,
                    np.int32(n_x),
                    np.int32(i * block_x),
                    np.int32(j * block_y),
                    h_g.data,
                    np.int32(patch_x),
                    np.int32(patch_y),
                    np.int32(psf_shift_x),
                    np.int32(psf_shift_y),
                    np.int32((i + j * g_x) * patch_len))
    else:
        padded = pad_to_shape(hs, stack_shape)
        shifted = np.fft.fftshift(padded, axes=(2, 3))
        h_g = OCLArray.from_array(shifted.astype(np.complex64))

    #prepare image
    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # shift from a block origin to the origin of the patch around it
    shift_x = block_x // 2 - patch_x // 2
    shift_y = block_y // 2 - patch_y // 2
    for i, x0 in enumerate(x0s):
        for j, y0 in enumerate(y0s):
            program.run_kernel(
                "fill_patch2", (patch_x, patch_y), None,
                im_g,
                np.int32(x0 + shift_x),
                np.int32(y0 + shift_y),
                patches_g.data,
                np.int32((i + j * g_x) * patch_len))

    # convolution: multiply patches and psfs in fourier space
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    program.run_kernel("mult_inplace", (patch_len * g_x * g_y, ), None,
                       patches_g.data, h_g.data)
    fft(patches_g, inplace=True, inverse=True, plan=plan)

    logger.debug("Nblock_x: {}, Npatch_x: {}".format(block_x, patch_x))

    #accumulate: one launch per grid node, hence g_y + 1 and g_x + 1
    res_g = OCLArray.empty(im.shape, np.float32)
    for j in range(g_y + 1):
        for i in range(g_x + 1):
            program.run_kernel(
                "interpolate2", (block_x, block_y), None,
                patches_g.data, res_g.data,
                np.int32(i), np.int32(j),
                np.int32(g_x), np.int32(g_y),
                np.int32(patch_x), np.int32(patch_y))

    result = res_g.get()
    return (result, plan) if return_plan else result
def convolve_spatial3(im, hs,
                      mode = "constant",
                      plan = None,
                      return_plan = False,
                      pad_factor = 2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im) = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan can be a precomputed fft plan for the patch shape; with
    return_plan = True the tuple (result, plan) is returned instead of
    the result alone

    pad_factor scales the block size before it is rounded up to the patch
    size via _next_power_of_2

    raises ValueError if im is not 3d or hs is not 6d and
    NotImplementedError if the image shape is not divisible by the grid
    """
    if im.ndim !=3 or hs.ndim !=6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n%g==0 for n,g in zip(im.shape,hs.shape[:3])]):
        raise NotImplementedError("shape of image has to be divisible by Gx Gy  = %s !"%(str(hs.shape[:3])))


    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])


    # the size of each block within the grid
    # (floor division: the sizes are used as kernel work sizes and have to
    # stay integers, "/" yields floats under Python 3)
    Nblocks = [n//g for n,g  in zip(Ns,Gs)]


    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor*nb) for nb in Nblocks])

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs,Gs+Npatchs),axes=(3,4,5))



    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs+Npatchs,np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    # origin of every block along z, y and x
    Xs = [nb*np.arange(g) for nb, g in zip(Nblocks,Gs)]




    print(Nblocks)
    # this loops over all i,j,k
    for (k,_z0), (j,_y0),(i,_x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3",Npatchs[::-1],None,
                im_g,
                    np.int32(_x0+Nblocks[2]//2-Npatchs[2]//2),
                    np.int32(_y0+Nblocks[1]//2-Npatchs[1]//2),
                    np.int32(_z0+Nblocks[0]//2-Npatchs[0]//2),
                    patches_g.data,
                    # offset of patch (k,j,i) inside the flat buffer
                    np.int32(i*np.prod(Npatchs)+
                             j*Gs[2]*np.prod(Npatchs)+
                             k*Gs[2]*Gs[1]*np.prod(Npatchs)))



    print(patches_g.shape, h_g.shape)




    # convolution: multiply patches and psfs in fourier space
    fft(patches_g,inplace=True, batch = np.prod(Gs), plan = plan)
    fft(h_g,inplace=True, batch = np.prod(Gs), plan = plan)
    prog.run_kernel("mult_inplace",(np.prod(Npatchs)*np.prod(Gs),),None,
                    patches_g.data, h_g.data)

    fft(patches_g,
        inplace=True,
        inverse = True,
        batch = np.prod(Gs),
        plan = plan)

    #return patches_g.get()
    #accumulate
    res_g = OCLArray.zeros(im.shape,np.float32)

    # one launch per grid node, hence g+1 along every axis
    for k, j, i in product(*[range(g+1) for g in Gs]):
        prog.run_kernel("interpolate3",Nblocks[::-1],None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i),np.int32(j),np.int32(k),
                        np.int32(Gs[2]),np.int32(Gs[1]),np.int32(Gs[0]),
                        np.int32(Npatchs[2]),np.int32(Npatchs[1]),np.int32(Npatchs[0]))


    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial3(im,
                      hs,
                      mode="constant",
                      plan=None,
                      return_plan=False,
                      pad_factor=2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im) = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan can be a precomputed fft plan for the patch shape; with
    return_plan = True the tuple (result, plan) is returned instead of
    the result alone

    pad_factor scales the block size before it is rounded up to the patch
    size via _next_power_of_2

    raises ValueError if im is not 3d or hs is not 6d and
    NotImplementedError if the image shape is not divisible by the grid
    """
    if im.ndim != 3 or hs.ndim != 6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:3])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s !" %
            (str(hs.shape[:3])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP", "wrap": "CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])

    # the size of each block within the grid
    # (floor division: "/" is true division under Python 3 and would turn
    # the block sizes, and with them the kernel work sizes, into floats)
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor * nb) for nb in Nblocks])

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D",
                                     "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # origin of every block along z, y and x
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    print(Nblocks)
    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        # integer halves keep the patch origins exact integers
        prog.run_kernel(
            "fill_patch3", Npatchs[::-1], None, im_g,
            np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
            np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
            np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2), patches_g.data,
            # offset of patch (k,j,i) inside the flat buffer
            np.int32(i * np.prod(Npatchs) + j * Gs[2] * np.prod(Npatchs) +
                     k * Gs[2] * Gs[1] * np.prod(Npatchs)))

    print(patches_g.shape, h_g.shape)

    # convolution: multiply patches and psfs in fourier space
    fft(patches_g, inplace=True, batch=np.prod(Gs), plan=plan)
    fft(h_g, inplace=True, batch=np.prod(Gs), plan=plan)
    prog.run_kernel("mult_inplace", (np.prod(Npatchs) * np.prod(Gs), ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=np.prod(Gs), plan=plan)

    #return patches_g.get()
    #accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    # one launch per grid node, hence g + 1 along every axis
    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None, patches_g.data,
                        res_g.data, np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]),
                        np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial2(im, hs, mode="constant", plan=None, return_plan=False):
    """
    spatial varying convolution of a 2d image with a 2d grid of psfs

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan        - fft plan to reuse; if None a new one is created for
                  patches of shape (Npatch_y,Npatch_x)
    return_plan - if True, the plan is returned together with the result

    returns the content of a float32 device buffer of shape im.shape,
    or the tuple (result, plan) if return_plan is True

    raises ValueError if im is not 2d or hs is not 4d,
    NotImplementedError if the image shape is not divisible by the grid and
    KeyError if mode is not one of the keys listed above
    """

    if im.ndim != 2 or hs.ndim != 4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:2])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s shape mismatch"
            % (str(hs.shape[:2])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP", "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]

    # the size of each block within the grid
    # (floor division: sizes are used as kernel launch dimensions and
    # buffer offsets, so they have to stay integers)
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(3 * Nblock_x)
    Npatch_y = _next_power_of_2(3 * Nblock_y)
    patch_size = Npatch_x * Npatch_y

    # pad every psf to the patch size and move its center to the origin
    hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)),
                         axes=(2, 3))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D",
                                     "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # the origin of every block within the image
    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    # cut out one padded patch per block, centered on the block center
    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            prog.run_kernel(
                "fill_patch2", (Npatch_x, Npatch_y), None, im_g,
                np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2), patches_g.data,
                # flat offset of patch (j,i) inside patches_g
                np.int32(i * patch_size + j * Gx * patch_size))

    # convolution
    fft(patches_g, inplace=True, batch=Gx * Gy, plan=plan)
    fft(h_g, inplace=True, batch=Gx * Gy, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * Gx * Gy, ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=Gx * Gy, plan=plan)

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for i in range(Gx + 1):
        for j in range(Gy + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y),
                            None, patches_g.data, res_g.data, np.int32(i),
                            np.int32(j), np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
Exemple #21
0
def _bpm_3d2(size,
            units,
            lam = .5,
            u0 = None,
            dn = None,
            subsample = 1,
            n0 = 1.,
            return_scattering = False,
            return_g = False,
            return_full = True,
            return_field = True,
            use_fresnel_approx = False,
            absorbing_width = 0,
            scattering_plane_ind = 0,
            return_last_plane = False,
            store_dn_as_half = False):
    """
    simulates the propagation of monochromatic wave of wavelength lam with initial conditions u0 along z in a media filled with dn

    size     -    the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    units    -    the unit lengths of each dimensions in microns
    lam      -    the wavelength
    u0       -    the initial field distribution, if u0 = None an incident  plane wave is assumed
    dn       -    the refractive index of the medium (can be complex),
                  either a numpy array or an OCLArray that is already on the device
    subsample  -  has to be 1 (NotImplementedError otherwise)
    n0       -    the refractive index of the surrounding media
    return_scattering  -  if True, additionally return the per plane scattering values p
    return_g -    if True, additionally return g (requires return_scattering)
    return_full  -  if True, return the whole (Nz,Ny,Nx) volume, otherwise only the last plane
    return_field -  if True, return the complex field, otherwise
                    the output of the "copy_intens" kernel (full volume)
                    or abs(u)**2 (last plane)
    use_fresnel_approx  -  if True, uses the fresnel approximation for the propagator
    absorbing_width  -  forwarded to the dn multiplication kernel (unused if dn is None)
    scattering_plane_ind  -  plane index offset entering the phase of the
                             reference plane wave used for the scattering values
    return_last_plane  -  if True, the last plane is appended to the result
    store_dn_as_half  -  if True, a real valued numpy dn is uploaded as float16

    returns u, (u,p) or (u,p,g) depending on return_scattering/return_g,
    with the last plane appended if return_last_plane is True

    raises NotImplementedError if subsample != 1 and
    ValueError if return_g is set without return_scattering
    """

    if subsample != 1:
        raise NotImplementedError("subsample still has to be 1")

    # g is normalized by p, which is only computed when the scattering is
    # tracked - fail early instead of after the whole simulation has run
    if return_g and not return_scattering:
        raise ValueError("return_g = True requires return_scattering = True")

    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    #setting up the propagator
    k0 = 2.*np.pi/lam

    kxs = 2.*np.pi*np.fft.fftfreq(Nx,dx)
    kys = 2.*np.pi*np.fft.fftfreq(Ny,dy)

    KY, KX = np.meshgrid(kys,kxs, indexing= "ij")

    # real valued sqrt: frequencies outside of the propagating range become nan
    H0 = np.sqrt(n0**2*k0**2-KX**2-KY**2)

    if use_fresnel_approx:
        H0  = 0.j+n0*k0-.5*(KX**2+KY**2)/n0/k0

    outsideInds = np.isnan(H0)

    H = np.exp(-1.j*dz*H0)

    # suppress everything that does not propagate
    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny,Nx),np.complex64)

    # setting up the gpu buffers and kernels

    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny,Nx))
    plane_g = OCLArray.from_array(u0.astype(np.complex64, copy = False))
    h_g = OCLArray.from_array(H.astype(np.complex64))

    complex_types = (np.complex64,np.complex128)
    dn_kernel = None

    if dn is not None:
        # also set for device arrays, which carry a dtype as well
        isComplexDn = dn.dtype.type in complex_types

        if isinstance(dn,OCLArray):
            dn_g = dn
        elif isComplexDn:
            dn_g = OCLArray.from_array(dn.astype(np.complex64,copy= False))
        elif store_dn_as_half:
            dn_g = OCLArray.from_array(dn.astype(np.float16,copy= False))
        else:
            dn_g = OCLArray.from_array(dn.astype(np.float32,copy= False))

        # the kernel only depends on the dtype of dn, so pick it once
        if isComplexDn:
            dn_kernel = "mult_dn_complex"
        elif dn_g.dtype.type == np.float16:
            dn_kernel = "mult_dn_half"
        else:
            dn_kernel = "mult_dn"
    else:
        #dummy dn
        dn_g = OCLArray.empty((1,)*3,np.float32)


    if return_scattering:
        # = cos(theta)
        cos_theta = np.real(H0)/n0/k0

        scatter_weights = cos_theta

        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2

        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz,"float32")
        gfactor_g = OCLArray.zeros(Nz,"float32")

        # zero frequency component of the unperturbed plane wave for every plane
        plain_wave_dct = Nx*Ny*np.exp(-1.j*k0*n0*(scattering_plane_ind+np.arange(Nz))*dz).astype(np.complex64)

        # weighted sum of |field - plain|^2, where plain is only
        # subtracted from the zero frequency component (i==0)
        reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")

    if return_full:
        if return_field:
            u_g = OCLArray.empty((Nz,Ny,Nx),dtype=np.complex64)
            u_g[0] = plane_g
        else:
            u_g = OCLArray.empty((Nz,Ny,Nx),dtype=np.float32)
            program.run_kernel("copy_intens",(Nx*Ny,),None,
                           plane_g.data,u_g.data, np.int32(0))


    clock.toc("setup")

    clock.tic("run")

    for i in range(Nz-1):
        # propagate by dz in fourier space
        fft(plane_g,inplace = True, plan  = plan)

        program.run_kernel("mult",(Nx*Ny,),None,
                           plane_g.data,h_g.data)

        if return_scattering:
            scatter_cross_sec_g[i+1] = reduce_kernel(plane_g,
                                                     scatter_weights_g,
                                                     plain_wave_dct[i+1])
            gfactor_g[i+1] = reduce_kernel(plane_g,
                                                     gfactor_weights_g,
                                                     plain_wave_dct[i+1])

        fft(plane_g,inplace = True, inverse = True,  plan  = plan)

        # apply the refractive index of plane i+1 in real space
        if dn_kernel is not None:
            program.run_kernel(dn_kernel,(Nx,Ny,),None,
                                   plane_g.data,dn_g.data,
                                   np.float32(k0*dz),
                                   np.int32(Nx*Ny*(i+1)),
                               np.int32(absorbing_width))

        if return_full:
            if return_field:
                u_g[i+1] = plane_g
            else:
                program.run_kernel("copy_intens",(Nx*Ny,),None,
                           plane_g.data,u_g.data, np.int32(Nx*Ny*(i+1)))

    clock.toc("run")

    print(clock)

    if return_full:
        u = u_g.get()
    else:
        u = plane_g.get()
        if not return_field:
            u = np.abs(u)**2

    if return_scattering:
        # normalizing prefactor dkx = dx/Nx
        prefac = 1./Nx/Ny*dx*dy
        p = prefac*scatter_cross_sec_g.get()

        if return_g:
            g = prefac*gfactor_g.get()/p
            result = u,  p, g
        else:
            result =  u,  p
    else:
        result = u

    if return_last_plane:
        if isinstance(result,tuple):
            result = result + (plane_g.get(),)
        else:
            result = (result, plane_g.get())


    return result
Exemple #22
0
def _bpm_3d_image(size,
            units,
            lam = .5,
            u0 = None, dn = None,
            subsample = 1,
            n0 = 1.,
            return_scattering = False,
            return_g = False,
            return_full_last = False,
            use_fresnel_approx = False,
            ):
    """
    simulates the propagation of monochromatic wave of wavelength lam with initial conditions u0 along z in a media filled with dn

    size     -    the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    units    -    the unit lengths of each dimensions in microns
    lam      -    the wavelength
    u0       -    the initial field distribution, if u0 = None an incident  plane wave is assumed
    dn       -    the refractive index of the medium (can be complex),
                  either a numpy array or an OCLImage that is already on the device
    subsample  -  the simulation runs on a grid with subsample times more
                  pixels per dimension and subsample times smaller units
    n0       -    the refractive index of the surrounding media
    return_scattering  -  if True, append the per plane scattering values p to the result
    return_g -    if True, append g to the result (requires return_scattering)
    return_full_last  -  if True, append the last plane on the subsampled grid to the result
    use_fresnel_approx  -  if True, uses the fresnel approximation for the propagator

    returns the tuple (u, dn) where u is the (Nz,Ny,Nx) complex field and
    dn the content of the device side refractive index buffer,
    followed by p, g and the last plane if requested

    raises ValueError if return_g is set without return_scattering
    """

    # g is normalized by p, which is only computed when the scattering is
    # tracked - fail early instead of after the whole simulation has run
    if return_g and not return_scattering:
        raise ValueError("return_g = True requires return_scattering = True")

    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    # subsampling
    Nx2, Ny2, _ = (subsample*N for N in size)
    dx2, dy2, dz2 = (1.*d/subsample for d in units)

    #setting up the propagator
    k0 = 2.*np.pi/lam

    kxs = 2.*np.pi*np.fft.fftfreq(Nx2,dx2)
    kys = 2.*np.pi*np.fft.fftfreq(Ny2,dy2)

    KY, KX = np.meshgrid(kys,kxs, indexing= "ij")

    # real valued sqrt: frequencies outside of the propagating range become nan
    H0 = np.sqrt(n0**2*k0**2-KX**2-KY**2)

    if use_fresnel_approx:
        # first order expansion of the square root above around KX = KY = 0:
        # sqrt(a^2-b) ~ a - b/(2a)  with a = n0*k0
        H0  = 0.j+n0*k0-.5*(KX**2+KY**2)/n0/k0


    outsideInds = np.isnan(H0)

    H = np.exp(-1.j*dz2*H0)

    # suppress everything that does not propagate
    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny2,Nx2),np.complex64)
    elif subsample >1:
        # real and imaginary part are upsampled separately
        u0 = zoom(np.real(u0),subsample) + 1.j*zoom(np.imag(u0),subsample)

    # setting up the gpu buffers and kernels

    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny2,Nx2))
    plane_g = OCLArray.from_array(u0.astype(np.complex64))

    h_g = OCLArray.from_array(H.astype(np.complex64))

    dn_kernel = None

    if dn is not None:
        isComplexDn = dn.dtype.type in (np.complex64,np.complex128)

        if isinstance(dn,OCLImage):
            dn_g = dn
        elif isComplexDn:
            # complex values are stored as two float channels (real, imag)
            dn_complex = np.zeros(dn.shape+(2,),np.float32)
            dn_complex[...,0] = np.real(dn)
            dn_complex[...,1] = np.imag(dn)
            dn_g = OCLImage.from_array(dn_complex)
        else:
            dn_g = OCLImage.from_array(dn.astype(np.float32))

        # the kernel only depends on the dtype of dn, so pick it once
        if isComplexDn:
            dn_kernel = "mult_dn_complex_image"
        else:
            dn_kernel = "mult_dn_image"
    else:
        #dummy dn
        dn_g = OCLArray.empty((1,)*3,np.float16)


    if return_scattering:
        # = cos(theta)
        cos_theta = np.real(H0)/n0/k0

        scatter_weights = cos_theta

        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2

        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz,"float32")
        gfactor_g = OCLArray.zeros(Nz,"float32")

        # zero frequency component of the unperturbed plane wave for every
        # output plane (output planes are dz apart)
        plain_wave_dct = Nx2*Ny2*np.exp(-1.j*k0*n0*np.arange(Nz)*dz).astype(np.complex64)

        # weighted sum of |field - plain|^2, where plain is only
        # subtracted from the zero frequency component (i==0)
        reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")


    u_g = OCLArray.empty((Nz,Ny,Nx),dtype=np.complex64)

    program.run_kernel("copy_subsampled_buffer",(Nx,Ny),None,
                           u_g.data,plane_g.data,
                           np.int32(subsample),
                           np.int32(0))


    clock.toc("setup")

    clock.tic("run")

    for i in range(Nz-1):
        # every output plane is reached by subsample steps of dz2
        for substep in range(subsample):
            fft(plane_g,inplace = True, plan  = plan)

            program.run_kernel("mult",(Nx2*Ny2,),None,
                               plane_g.data,h_g.data)

            # the scattering is only recorded at the output planes
            if return_scattering and substep == (subsample-1):
                scatter_cross_sec_g[i+1] = reduce_kernel(plane_g,
                                                     scatter_weights_g,
                                                     plain_wave_dct[i+1])
                gfactor_g[i+1] = reduce_kernel(plane_g,
                                                     gfactor_weights_g,
                                                     plain_wave_dct[i+1])

            fft(plane_g,inplace = True, inverse = True,  plan  = plan)

            if dn_kernel is not None:
                program.run_kernel(dn_kernel,(Nx2,Ny2),None,
                                   plane_g.data,dn_g,
                                   np.float32(k0*dz2),
                                   np.float32(n0),
                                   np.int32(subsample*(i+1.)+substep),
                                   np.int32(subsample))


        program.run_kernel("copy_subsampled_buffer",(Nx,Ny),None,
                           u_g.data,plane_g.data,
                           np.int32(subsample),
                           np.int32((i+1)*Nx*Ny))


    clock.toc("run")

    print(clock)
    result = (u_g.get(), dn_g.get(),)

    if return_scattering:
        # normalizing prefactor dkx = dx2/Nx2
        prefac = 1./Nx2/Ny2*dx2*dy2
        p = prefac*scatter_cross_sec_g.get()
        result += (p,)

        if return_g:
            g = prefac*gfactor_g.get()/p
            result += (g,)

    if return_full_last:
        result += (plane_g.get(),)

    return result
def convolve_spatial2(im, hs,
                      mode = "constant",
                      plan = None,
                      return_plan = False):
    """
    spatial varying convolution of a 2d image with a 2d grid of psfs

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan        - fft plan to reuse; if None a new one is created for
                  patches of shape (Npatch_y,Npatch_x)
    return_plan - if True, the plan is returned together with the result

    returns the content of a float32 device buffer of shape im.shape,
    or the tuple (result, plan) if return_plan is True

    raises ValueError if im is not 2d or hs is not 4d,
    NotImplementedError if the image shape is not divisible by the grid and
    KeyError if mode is not one of the keys listed above
    """

    if im.ndim !=2 or hs.ndim !=4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n%g==0 for n,g in zip(im.shape,hs.shape[:2])]):
        raise NotImplementedError("shape of image has to be divisible by Gx Gy  = %s shape mismatch"%(str(hs.shape[:2])))


    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]


    # the size of each block within the grid
    # (explicit floor division, so the sizes are integers on python 2 and 3)
    Nblock_y, Nblock_x = Ny//Gy, Nx//Gx


    # the size of the overlapping patches with safety padding
    Npatch_x, Npatch_y = _next_power_of_2(3*Nblock_x), _next_power_of_2(3*Nblock_y)
    Npatch_total = Npatch_x*Npatch_y

    # pad every psf to the patch size and move its center to the origin
    hs = np.fft.fftshift(pad_to_shape(hs,(Gy,Gx,Npatch_y,Npatch_x)),axes=(2,3))


    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y,Npatch_x))


    patches_g = OCLArray.empty((Gy,Gx,Npatch_y,Npatch_x),np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    # the origin of every block within the image
    x0s = Nblock_x*np.arange(Gx)
    y0s = Nblock_y*np.arange(Gy)

    # cut out one padded patch per block, centered on the block center
    for i,_x0 in enumerate(x0s):
        for j,_y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2",(Npatch_x,Npatch_y),None,
                    im_g,
                    np.int32(_x0+Nblock_x//2-Npatch_x//2),
                    np.int32(_y0+Nblock_y//2-Npatch_y//2),
                    patches_g.data,
                    # flat offset of patch (j,i) inside patches_g
                    np.int32(i*Npatch_total+j*Gx*Npatch_total))

    # convolution
    fft(patches_g,inplace=True, batch = Gx*Gy, plan = plan)
    fft(h_g,inplace=True, batch = Gx*Gy, plan = plan)
    prog.run_kernel("mult_inplace",(Npatch_total*Gx*Gy,),None,
                    patches_g.data, h_g.data)

    fft(patches_g,inplace=True, inverse = True, batch = Gx*Gy, plan = plan)

    #accumulate
    res_g = OCLArray.empty(im.shape,np.float32)

    for i in range(Gx+1):
        for j in range(Gy+1):
            prog.run_kernel("interpolate2",(Nblock_x,Nblock_y),None,
                            patches_g.data,res_g.data,
                            np.int32(i),np.int32(j),
                            np.int32(Gx),np.int32(Gy),
                            np.int32(Npatch_x),np.int32(Npatch_y))


    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial3(im,
                       hs,
                       mode="constant",
                       grid_dim=None,
                       plan=None,
                       return_plan=False,
                       pad_factor=2):
    """
    spatial varying convolution of a 3d image with a 3d grid of psfs

    im is subdivided into Gs = (Gz,Gy,Gx) blocks, every image dimension
    has to be divisible by the respective grid dimension.

    hs is either
    - a 6d array whose first three dimensions define the grid Gs, or
    - if grid_dim is given, a 3d array of the same shape as im that is
      handed block by block to the "fill_psf_grid3" kernel
      (presumably one psf per block, laid out on the image grid)

    mode        - one of "constant", "wrap", "edge", "reflect", selects
                  the ADDRESSMODE the kernels are built with
    grid_dim    - the grid Gs when hs is given as a 3d array
    plan        - fft plan to reuse; if None a new one is created
    return_plan - if True, the plan is returned together with the result
    pad_factor  - every patch has the size of the next power of two of
                  pad_factor times the block size

    returns the content of a float32 device buffer of shape im.shape,
    or the tuple (result, plan) if return_plan is True

    raises ValueError on wrong input dimensions/shapes,
    NotImplementedError if the image shape is not divisible by the grid and
    KeyError if mode is not one of the keys listed above
    """
    if im.ndim != 3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim == 6 or (hs.ndim == 3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
    else:
        # hs.ndim == 6 is already guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n % g == 0 for n, g in zip(im.shape, Gs)]):
        # report the grid that was actually tested (hs.shape is the image
        # shape when grid_dim is used)
        raise NotImplementedError(
            "shape of image has to be divisible by Gz Gy Gx = %s shape mismatch"
            % (str(tuple(Gs))))

    mode_str = {
        "constant": "CLK_ADDRESS_CLAMP",
        "wrap": "CLK_ADDRESS_REPEAT",
        "edge": "CLK_ADDRESS_CLAMP_TO_EDGE",
        "reflect": "CLK_ADDRESS_MIRRORED_REPEAT"
    }

    Ns = im.shape

    # the size of each block within the grid
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([next_power_of_2(pad_factor * nb) for nb in Nblocks])

    # number of voxels per patch, used for the flat patch offsets below
    patch_size = np.prod(Npatchs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D",
                                     "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Gs + Npatchs, axes=(-3, -2, -1))

    # the origin of every block along z, y, x
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs + Npatchs, np.complex64)

        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        for (k, _z0), (j, _y0), (i,
                                 _x0) in product(*[enumerate(X) for X in Xs]):

            prog.run_kernel(
                "fill_psf_grid3", Nblocks[::-1], None, tmp_g.data,
                np.int32(im.shape[2]), np.int32(im.shape[1]),
                np.int32(i * Nblocks[2]), np.int32(j * Nblocks[1]),
                np.int32(k * Nblocks[0]), h_g.data, np.int32(Npatchs[2]),
                np.int32(Npatchs[1]), np.int32(Npatchs[0]),
                np.int32(-Nblocks[2] // 2 + Npatchs[2] // 2),
                np.int32(-Nblocks[1] // 2 + Npatchs[1] // 2),
                np.int32(-Nblocks[0] // 2 + Npatchs[0] // 2),
                # flat offset of patch (k,j,i) inside h_g
                np.int32(i * patch_size + j * Gs[2] * patch_size +
                         k * Gs[2] * Gs[1] * patch_size))

    else:
        # pad every psf to the patch size and move its center to the origin
        hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel(
            "fill_patch3", Npatchs[::-1], None, im_g,
            np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
            np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
            np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2), patches_g.data,
            # flat offset of patch (k,j,i) inside patches_g
            np.int32(i * patch_size + j * Gs[2] * patch_size +
                     k * Gs[2] * Gs[1] * patch_size))

    # convolution
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * np.prod(Gs), ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[list(range(g + 1)) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None, patches_g.data,
                        res_g.data, np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]),
                        np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial3(im, hs,
                      mode = "constant",
                      grid_dim = None,
                      plan = None,
                      return_plan = False,
                      pad_factor = 2):
    """
    spatial varying convolution of a 3d image with a 3d grid of psfs

    im is subdivided into Gs = (Gz,Gy,Gx) blocks, every image dimension
    has to be divisible by the respective grid dimension.

    hs is either
    - a 6d array whose first three dimensions define the grid Gs, or
    - if grid_dim is given, a 3d array of the same shape as im that is
      handed block by block to the "fill_psf_grid3" kernel
      (presumably one psf per block, laid out on the image grid)

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    grid_dim    - the grid Gs when hs is given as a 3d array
    plan        - fft plan to reuse; if None a new one is created for
                  a single patch
    return_plan - if True, the plan is returned together with the result
    pad_factor  - every patch has the size of the next power of two of
                  pad_factor times the block size

    returns the content of a float32 device buffer of shape im.shape,
    or the tuple (result, plan) if return_plan is True

    raises ValueError on wrong input dimensions/shapes,
    NotImplementedError if the image shape is not divisible by the grid and
    KeyError if mode is not one of the keys listed above
    """

    if im.ndim !=3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim==6 or (hs.ndim==3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
    else:
        # hs.ndim == 6 is already guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n%g==0 for n,g in zip(im.shape,Gs)]):
        # report the grid that was actually tested (hs.shape is the image
        # shape when grid_dim is used)
        raise NotImplementedError("shape of image has to be divisible by Gz Gy Gx = %s shape mismatch"%(str(tuple(Gs))))


    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ns = im.shape


    # the size of each block within the grid
    # (explicit floor division, so the sizes are integers on python 2 and 3)
    Nblocks = [n//g for n,g  in zip(Ns,Gs)]


    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor*nb) for nb in Nblocks])

    # number of voxels per patch, used for the flat patch offsets below
    Npatch_total = np.prod(Npatchs)
    # number of patches, i.e. the batch size of the ffts
    Ngrid_total = np.prod(Gs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)


    # the origin of every block along z, y, x
    Xs = [nb*np.arange(g) for nb, g in zip(Nblocks,Gs)]

    patches_g = OCLArray.empty(Gs+Npatchs,np.complex64)

    #prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs+Npatchs,np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy = False))
        for (k,_z0), (j,_y0),(i,_x0) in product(*[enumerate(X) for X in Xs]):
            prog.run_kernel("fill_psf_grid3",
                        Nblocks[::-1],None,
                        tmp_g.data,
                        np.int32(im.shape[2]),
                        np.int32(im.shape[1]),
                        np.int32(i*Nblocks[2]),
                        np.int32(j*Nblocks[1]),
                        np.int32(k*Nblocks[0]),
                        h_g.data,
                        np.int32(Npatchs[2]),
                        np.int32(Npatchs[1]),
                        np.int32(Npatchs[0]),
                        np.int32(-Nblocks[2]//2+Npatchs[2]//2),
                        np.int32(-Nblocks[1]//2+Npatchs[1]//2),
                        np.int32(-Nblocks[0]//2+Npatchs[0]//2),
                        # flat offset of patch (k,j,i) inside h_g
                        np.int32(i*Npatch_total+
                         j*Gs[2]*Npatch_total+
                         k*Gs[2]*Gs[1]*Npatch_total))

    else:
        # pad every psf to the patch size and move its center to the origin
        hs = np.fft.fftshift(pad_to_shape(hs,Gs+Npatchs),axes=(3,4,5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))


    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    # this loops over all i,j,k
    for (k,_z0), (j,_y0),(i,_x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3",Npatchs[::-1],None,
                im_g,
                    np.int32(_x0+Nblocks[2]//2-Npatchs[2]//2),
                    np.int32(_y0+Nblocks[1]//2-Npatchs[1]//2),
                    np.int32(_z0+Nblocks[0]//2-Npatchs[0]//2),
                    patches_g.data,
                    # flat offset of patch (k,j,i) inside patches_g
                    np.int32(i*Npatch_total+
                             j*Gs[2]*Npatch_total+
                             k*Gs[2]*Gs[1]*Npatch_total))


    # convolution
    fft(patches_g,inplace=True, batch = Ngrid_total, plan = plan)
    fft(h_g,inplace=True, batch = Ngrid_total, plan = plan)
    prog.run_kernel("mult_inplace",(Npatch_total*Ngrid_total,),None,
                    patches_g.data, h_g.data)

    fft(patches_g,
        inplace=True,
        inverse = True,
        batch = Ngrid_total,
        plan = plan)

    #accumulate
    res_g = OCLArray.zeros(im.shape,np.float32)

    for k, j, i in product(*[range(g+1) for g in Gs]):
        prog.run_kernel("interpolate3",Nblocks[::-1],None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i),np.int32(j),np.int32(k),
                        np.int32(Gs[2]),np.int32(Gs[1]),np.int32(Gs[0]),
                        np.int32(Npatchs[2]),np.int32(Npatchs[1]),np.int32(Npatchs[0]))


    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res