Ejemplo n.º 1
0
def _deconv_rl_gpu_conv(data_g, h_g, Niter=10):
    """Iterative (Richardson-Lucy style) deconvolution on GPU buffers via ``convolve``.

    Parameters
    ----------
    data_g : OCLArray
        Measured data; it is copied into the initial estimate.
    h_g : OCLArray
        Convolution kernel (PSF).
    Niter : int
        Number of update iterations.

    Returns
    -------
    OCLArray
        float32 buffer of shape ``data_g.shape`` holding the final estimate.
    """
    # current estimate, initialised with the measured data
    u_g = OCLArray.empty(data_g.shape, np.float32)
    u_g.copy_buffer(data_g)

    # scratch buffers reused in every iteration
    tmp_g = OCLArray.empty(data_g.shape, np.float32)
    tmp2_g = OCLArray.empty(data_g.shape, np.float32)

    # Mirror the kernel along *every* axis. The former ``[::-1, ::-1]`` only
    # reversed the first two axes, so it raised for 1D kernels and left the
    # last axis of 3D kernels unflipped. 2D behaviour is unchanged.
    # TODO: the flip still round-trips through host memory.
    h = h_g.get()
    hflip_g = OCLArray.from_array(h[(slice(None, None, -1), ) * h.ndim].copy())

    for _ in range(Niter):
        # tmp = u (*) h
        convolve(u_g, h_g, res_g=tmp_g)

        # presumably leaves data / tmp in tmp_g -- confirm against the helper
        _divide_inplace(data_g, tmp_g)

        # tmp2 = tmp (*) flipped h
        convolve(tmp_g, hflip_g, res_g=tmp2_g)

        # presumably scales u_g by tmp2_g in place -- confirm against the helper
        _multiply_inplace(u_g, tmp2_g)

    return u_g
Ejemplo n.º 2
0
def _deconv_rl_gpu_conv(data_g, h_g, Niter = 10):
    """Iterative (Richardson-Lucy style) deconvolution on GPU buffers via ``convolve``.

    Parameters
    ----------
    data_g : OCLArray
        Measured data; it is copied into the initial estimate.
    h_g : OCLArray
        Convolution kernel (PSF).
    Niter : int
        Number of update iterations.

    Returns
    -------
    OCLArray
        float32 buffer of shape ``data_g.shape`` holding the final estimate.
    """
    # the running estimate starts out as a copy of the data
    u_g = OCLArray.empty(data_g.shape, np.float32)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape, np.float32)
    tmp2_g = OCLArray.empty(data_g.shape, np.float32)

    # Reverse the kernel along all of its axes (previously only the first two
    # were reversed, which failed for 1D kernels and was incomplete for 3D
    # ones). For 2D kernels the result is identical to before.
    # TODO: avoid the device -> host -> device round trip.
    h_host = h_g.get()
    reverse_all = (slice(None, None, -1), ) * h_host.ndim
    hflip_g = OCLArray.from_array(h_host[reverse_all].copy())

    for _ in range(Niter):
        convolve(u_g, h_g, res_g=tmp_g)

        # presumably stores data_g / tmp_g in tmp_g -- confirm against the helper
        _divide_inplace(data_g, tmp_g)

        convolve(tmp_g, hflip_g, res_g=tmp2_g)

        # presumably multiplies u_g by tmp2_g in place -- confirm against the helper
        _multiply_inplace(u_g, tmp2_g)

    return u_g
Ejemplo n.º 3
0
    def _setup_gpu(self):
        """Allocate the GPU buffers and look up the OpenCL kernels for this object.

        Stores on ``self``: the device queue and context, an FFT plan, working
        buffers sized from ``self.simul_xy``, and handles to the kernels of
        ``kernels/bpm_3d_kernels.cl``. Finishes by calling
        ``self._fill_propagator(self.n0)``.
        """
        device = get_device()
        self._queue = device.queue
        self._ctx = device.context
        prog = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        Nx, Ny = self.simul_xy
        # The values are not used below; the unpacking is retained so that a
        # malformed ``self.shape`` still fails at this point.
        Nx0, Ny0 = self.shape[:2]

        # single-plane buffers / image
        plane = (Ny, Nx)
        self._plan = fft_plan(plane, **self.fftplan_kwargs)
        self._buf_plane = OCLArray.empty(plane, np.complex64)
        self._buf_H = OCLArray.empty(plane, np.complex64)
        self._img_xy = OCLImage.empty(plane, dtype=np.float32, num_channels=2)

        # buffers for the weighted dn average
        real_t = Bpm3d._real_type
        slab = (1, Ny, Nx)
        self.intens_g = OCLArray.empty(slab, dtype=real_t)
        self.intens_dn_g = OCLArray.empty(slab, dtype=real_t)
        self.intens_sum_g = OCLArray.zeros((), dtype=real_t)
        self.intens_dn_sum_g = OCLArray.zeros((), dtype=real_t)

        # the two propagator kernels get explicit scalar argument dtypes
        five_floats = (np.float32, ) * 5
        self._kernel_compute_propagator = prog.compute_propagator
        self._kernel_compute_propagator.set_scalar_arg_dtypes(
            (None, ) + five_floats)
        self._kernel_compute_propagator_buf = prog.compute_propagator_buf
        self._kernel_compute_propagator_buf.set_scalar_arg_dtypes(
            (None, ) + five_floats + (None, ) * 2)

        # remaining kernels: (attribute on self, kernel name in the program)
        plain_kernels = (
            ("_kernel_mult_complex", "mult"),
            ("_kernel_im_to_buf_field", "img_to_buf_field"),
            ("_kernel_im_to_buf_intensity", "img_to_buf_intensity"),
            ("_kernel_im_to_im_intensity", "img_to_img_intensity"),
            ("_kernel_buf_to_buf_field", "buf_to_buf_field"),
            ("_kernel_buf_to_buf_intensity", "buf_to_buf_intensity"),
            ("_kernel_mult_dn_img_float", "mult_dn_image"),
            ("_kernel_mult_dn_buf_float", "mult_dn"),
            ("_kernel_mult_dn_img_complex", "mult_dn_image_complex"),
            ("_kernel_mult_dn_buf_complex", "mult_dn_complex"),
            ("_kernel_mult_dn_img_float_local", "mult_dn_image_local"),
            ("_kernel_mult_dn_buf_float_local", "mult_dn_local"),
            ("_kernel_mult_dn_img_complex_local", "mult_dn_image_complex_local"),
            ("_kernel_mult_dn_buf_complex_local", "mult_dn_complex_local"),
        )
        for attr_name, kernel_name in plain_kernels:
            setattr(self, attr_name, getattr(prog, kernel_name))

        # one reduction kernel that sums two float buffers in a single pass
        self._kernel_reduction = OCLMultiReductionKernel(
            np.float32,
            neutral="0",
            reduce_expr="a+b",
            map_exprs=["a[i]", "b[i]"],
            arguments="__global float *a, __global float *b")

        self._fill_propagator(self.n0)
Ejemplo n.º 4
0
def focus_field_lattice(shape,
                        units,
                        lam=.5,
                        NA1=.4,
                        NA2=.5,
                        sigma=.1,
                        Npoly=6,
                        n0=1.,
                        n_integration_steps=100):
    """Run the ``debye_wolf_lattice`` kernel on a regular grid and return ``ex``.

    Parameters
    ----------
    shape : tuple
        Grid size ``(Nx, Ny, Nz)``.
    units : tuple
        Grid spacing ``(dx, dy, dz)``.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA1, NA2 : float
        Converted to the angles ``arcsin(NA / n0)`` passed to the kernel;
        their mean also scales the points returned by ``poly_points``.
    sigma : float
        Forwarded unchanged to the kernel.
    Npoly : int
        Argument of ``poly_points``.
    n0 : float
        Refractive index used in the conversions above.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    ndarray
        complex64 array of shape ``(Nz, Ny, Nx)``: the ``ex`` output buffer.
        The other kernel outputs (``u``, ``ey``, ``ez``) are computed but
        not returned.
    """
    kxs, kys = .5 * (NA1 + NA2) * poly_points(Npoly)

    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=[
                       "-I",
                       absPath("kernels"), "-D",
                       "INT_STEPS=%s" % n_integration_steps
                   ])

    kxs = np.array(kxs)
    kys = np.array(kys)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha1 = np.arcsin(NA1 / n0)
    alpha2 = np.arcsin(NA2 / n0)

    u_g = OCLArray.empty((Nz, Ny, Nx), np.float32)
    ex_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    t = time.time()

    # the grid is centred on the origin: each axis spans -d*(N-1)/2 .. +d*(N-1)/2
    p.run_kernel(
        "debye_wolf_lattice", (Nx, Ny, Nz), None,
        ex_g.data, ey_g.data, ez_g.data, u_g.data, np.float32(1.),
        np.float32(0.), np.float32(-dx * (Nx - 1) / 2.),
        np.float32(dx * (Nx - 1) / 2.), np.float32(-dy * (Ny - 1) / 2.),
        np.float32(dy * (Ny - 1) / 2.), np.float32(-dz * (Nz - 1) / 2.),
        np.float32(dz * (Nz - 1) / 2.), np.float32(1. * lam / n0),
        np.float32(alpha1), np.float32(alpha2), kxs_g.data, kys_g.data,
        np.int32(len(kxs)), np.float32(sigma))

    ex = ex_g.get()

    # single-argument form prints the same text on Python 2 and Python 3
    print("time in secs: %s" % (time.time() - t))
    return ex
Ejemplo n.º 5
0
def _integral3_buf(x_g, res_g = None, tmp_g = None):
    """Scan a 3D buffer along each of its three axes in turn.

    Uses the ``scan3d`` / ``add_sums3d`` kernels of
    ``kernels/integral_image.cl`` (presumably yielding the 3D integral image
    of ``x_g`` -- confirm against the kernels).

    Parameters
    ----------
    x_g : OCLArray
        3D input; its dtype has to be a key of ``_output_type_dict``. It is
        cast to the mapped output dtype when that differs.
    res_g, tmp_g : OCLArray, optional
        Result and scratch buffer of the output dtype; allocated with
        ``x_g.shape`` when omitted.

    Returns
    -------
    OCLArray
        ``res_g``.

    Raises
    ------
    ValueError
        If the input dtype is not supported.
    """
    in_type = x_g.dtype.type
    if in_type not in _output_type_dict:
        raise ValueError("dtype %s currently not supported! (%s)" % (in_type, str(_output_type_dict.keys())))

    dtype_out = _output_type_dict[in_type]
    # not used further; the lookup is kept so an unmapped input type still fails here
    cl_dtype_in = cl_buffer_datatype_dict[in_type]
    cl_dtype_out = cl_buffer_datatype_dict[dtype_out]

    dtype_itemsize = np.dtype(dtype_out).itemsize

    max_local_size = get_device().get_info("MAX_WORK_GROUP_SIZE")
    prog = OCLProgram(abspath("kernels/integral_image.cl"),
                      build_options=["-D", "DTYPE=%s" % cl_dtype_out])

    if x_g.dtype.type != dtype_out:
        x_g = x_g.astype(dtype_out)

    if tmp_g is None:
        tmp_g = OCLArray.empty(x_g.shape, dtype_out)
    if res_g is None:
        res_g = OCLArray.empty(x_g.shape, dtype_out)

    assert_bufs_type(dtype_out, tmp_g, res_g)

    nz, ny, nx = x_g.shape

    def _scan_axis(src, dst, extents, strides):
        # extents / strides list the scanned axis first, then the two others
        n_scan, n_mid, n_outer = extents
        s_scan, s_mid, s_outer = strides

        # every work group of `loc` items handles a block of 2 * loc elements
        loc = min(next_power_of_2(n_scan // 2), max_local_size // 2)
        block_len = 2 * loc
        padded_len = math.ceil(n_scan / block_len) * block_len
        nblocks = math.ceil(padded_len // 2 / loc)

        block_sums = OCLArray.empty((n_outer, n_mid, nblocks), dst.dtype)
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)

        for b in range(nblocks):
            prog.run_kernel("scan3d", (loc, n_mid, n_outer), (loc, 1, 1),
                            src.data, dst.data, block_sums.data, shared,
                            np.int32(block_len),
                            np.int32(s_scan), np.int32(s_mid), np.int32(s_outer),
                            np.int32(b * loc), np.int32(b),
                            np.int32(nblocks), np.int32(n_mid), np.int32(n_scan))

        if nblocks > 1:
            # scan the per-block sums themselves, then add them back onto dst
            _scan_axis(block_sums, block_sums,
                       (nblocks, n_mid, n_outer),
                       (1, nblocks, nblocks * n_mid))
            prog.run_kernel("add_sums3d", (padded_len, n_mid, n_outer), (block_len, 1, 1),
                            block_sums.data, dst.data,
                            np.int32(s_scan), np.int32(s_mid), np.int32(s_outer),
                            np.int32(nblocks), np.int32(n_mid), np.int32(n_scan))

    # ping-pong between res_g and tmp_g so the last pass lands in res_g
    _scan_axis(x_g, res_g, (nx, ny, nz), (1, nx, nx * ny))
    _scan_axis(res_g, tmp_g, (ny, nx, nz), (nx, 1, nx * ny))
    _scan_axis(tmp_g, res_g, (nz, nx, ny), (ny * nx, 1, nx))

    return res_g
Ejemplo n.º 6
0
    def _setup_gpu(self):
        """Prepare all OpenCL state this object needs.

        Stores the device queue/context, builds ``kernels/bpm_3d_kernels.cl``,
        allocates the plane-sized buffers (dimensions from ``self.simul_xy``),
        keeps handles to the individual kernels on ``self`` and finally calls
        ``self._fill_propagator(self.n0)``.
        """
        ocl_dev = get_device()
        self._queue = ocl_dev.queue
        self._ctx = ocl_dev.context
        program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

        Nx, Ny = self.simul_xy
        # unused, but the unpacking is kept so a malformed self.shape still fails here
        Nx0, Ny0 = self.shape[:2]

        # plane-sized FFT plan, complex work buffers and a 2-channel image
        shape_2d = (Ny, Nx)
        self._plan = fft_plan(shape_2d, **self.fftplan_kwargs)
        self._buf_plane = OCLArray.empty(shape_2d, np.complex64)
        self._buf_H = OCLArray.empty(shape_2d, np.complex64)
        self._img_xy = OCLImage.empty(shape_2d, dtype=np.float32, num_channels=2)

        # buffers for the weighted dn average
        real_dtype = Bpm3d._real_type
        shape_3d = (1, Ny, Nx)
        self.intens_g = OCLArray.empty(shape_3d, dtype=real_dtype)
        self.intens_dn_g = OCLArray.empty(shape_3d, dtype=real_dtype)
        self.intens_sum_g = OCLArray.zeros((), dtype=real_dtype)
        self.intens_dn_sum_g = OCLArray.zeros((), dtype=real_dtype)

        # propagator kernels: one leading non-scalar argument, five float
        # scalars and, for the buffer variant, two trailing non-scalar ones
        scalar_types = (None,) + (np.float32,) * 5
        scalar_types_buf = scalar_types + (None,) * 2

        self._kernel_compute_propagator = program.compute_propagator
        self._kernel_compute_propagator.set_scalar_arg_dtypes(scalar_types)
        self._kernel_compute_propagator_buf = program.compute_propagator_buf
        self._kernel_compute_propagator_buf.set_scalar_arg_dtypes(scalar_types_buf)

        self._kernel_mult_complex = program.mult

        # conversions between image/buffer representations
        self._kernel_im_to_buf_field = program.img_to_buf_field
        self._kernel_im_to_buf_intensity = program.img_to_buf_intensity
        self._kernel_im_to_im_intensity = program.img_to_img_intensity
        self._kernel_buf_to_buf_field = program.buf_to_buf_field
        self._kernel_buf_to_buf_intensity = program.buf_to_buf_intensity

        # dn multiplication, image/buffer x float/complex variants
        self._kernel_mult_dn_img_float = program.mult_dn_image
        self._kernel_mult_dn_buf_float = program.mult_dn
        self._kernel_mult_dn_img_complex = program.mult_dn_image_complex
        self._kernel_mult_dn_buf_complex = program.mult_dn_complex

        # ... and their "_local" counterparts
        self._kernel_mult_dn_img_float_local = program.mult_dn_image_local
        self._kernel_mult_dn_buf_float_local = program.mult_dn_local
        self._kernel_mult_dn_img_complex_local = program.mult_dn_image_complex_local
        self._kernel_mult_dn_buf_complex_local = program.mult_dn_complex_local

        # sums two float buffers with a single reduction kernel
        reduction_spec = dict(neutral="0",
                              reduce_expr="a+b",
                              map_exprs=["a[i]", "b[i]"],
                              arguments="__global float *a, __global float *b")
        self._kernel_reduction = OCLMultiReductionKernel(np.float32, **reduction_spec)

        self._fill_propagator(self.n0)
Ejemplo n.º 7
0
def focus_field_cylindrical(shape,
                            units,
                            lam=.5,
                            NA=.3,
                            n0=1.,
                            n_integration_steps=100):
    """Compute the focus field of a cylindrical lens with the given NA.

    see:
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review

    Appl. Opt. 52, 538-545 (2013)

    The ``psf_cylindrical`` kernel is evaluated on the (z, y) plane only and
    the result is replicated ``Nx`` times along a new last axis.

    Parameters
    ----------
    shape : tuple
        Grid size ``(Nx, Ny, Nz)``.
    units : tuple
        Grid spacing ``(dx, dy, dz)``; ``dx`` is not used.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float
        Numerical aperture, passed to the kernel as ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    tuple of ndarray
        ``(u, ex, ey, ez)``, each of shape ``(Nz, Ny, Nx)``, with ``u`` being
        the intensity.
    """

    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"),
                   build_options=str("-I %s -D INT_STEPS=%s" %
                                     (absPath("."), n_integration_steps)))

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha = np.arcsin(NA / n0)

    u_g = OCLArray.empty((Nz, Ny), np.float32)
    ex_g = OCLArray.empty((Nz, Ny), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny), np.complex64)

    t = time.time()

    # y and z ranges are centred on the origin
    p.run_kernel("psf_cylindrical", u_g.shape[::-1], None,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(-dy * (Ny - 1) / 2.),
                 np.float32(dy * (Ny - 1) / 2.),
                 np.float32(-dz * (Nz - 1) / 2.),
                 np.float32(dz * (Nz - 1) / 2.), np.float32(lam / n0),
                 np.float32(alpha))

    # np.repeat already returns a fresh array, so no extra np.array() copy
    u = np.repeat(u_g.get()[..., np.newaxis], Nx, axis=-1)
    ex = np.repeat(ex_g.get()[..., np.newaxis], Nx, axis=-1)
    ey = np.repeat(ey_g.get()[..., np.newaxis], Nx, axis=-1)
    ez = np.repeat(ez_g.get()[..., np.newaxis], Nx, axis=-1)

    # single-argument form prints the same text on Python 2 and Python 3
    print("time in secs: %s" % (time.time() - t))

    return u, ex, ey, ez
Ejemplo n.º 8
0
def focus_field_cylindrical(shape,units,lam = .5,NA = .3, n0=1.,
                            n_integration_steps = 100):
    """Compute the focus field of a cylindrical lens with the given NA.

    see:
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review

    Appl. Opt. 52, 538-545 (2013)

    Only the (z, y) plane is evaluated by the ``psf_cylindrical`` kernel;
    the result is then repeated ``Nx`` times along a new last axis.

    Parameters
    ----------
    shape : tuple
        Grid size ``(Nx, Ny, Nz)``.
    units : tuple
        Grid spacing ``(dx, dy, dz)``; ``dx`` is not used.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float
        Numerical aperture, passed to the kernel as ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    tuple of ndarray
        ``(u, ex, ey, ez)``, each of shape ``(Nz, Ny, Nx)``, with ``u`` being
        the intensity.
    """
    build_options = str("-I %s -D INT_STEPS=%s" % (absPath("."), n_integration_steps))
    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"), build_options=build_options)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha = np.arcsin(NA/n0)

    u_g = OCLArray.empty((Nz, Ny), np.float32)
    ex_g = OCLArray.empty((Nz, Ny), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny), np.complex64)

    t = time.time()

    # y and z ranges are centred on the origin
    p.run_kernel("psf_cylindrical", u_g.shape[::-1], None,
                 ex_g.data,
                 ey_g.data,
                 ez_g.data,
                 u_g.data,
                 np.float32(-dy*(Ny-1)/2.), np.float32(dy*(Ny-1)/2.),
                 np.float32(-dz*(Nz-1)/2.), np.float32(dz*(Nz-1)/2.),
                 np.float32(lam/n0),
                 np.float32(alpha))

    # np.repeat already allocates its result; the former np.array() wrapper
    # only made a second, redundant copy
    u = np.repeat(u_g.get()[..., np.newaxis], Nx, axis=-1)
    ex = np.repeat(ex_g.get()[..., np.newaxis], Nx, axis=-1)
    ey = np.repeat(ey_g.get()[..., np.newaxis], Nx, axis=-1)
    ez = np.repeat(ez_g.get()[..., np.newaxis], Nx, axis=-1)

    # single-argument form prints the same text on Python 2 and Python 3
    print("time in secs: %s" % (time.time() - t))

    return u, ex, ey, ez
Ejemplo n.º 9
0
def create_dn_buffer(size,
                     units,
                     points,
                     dn_inner=.0,
                     rad_inner=0,
                     dn_outer=.1,
                     rad_outer=.4):
    """Fill a float32 GPU buffer by running the ``fill_dn`` kernel.

    Parameters
    ----------
    size : tuple
        Grid size ``(Nx, Ny, Nz)``.
    units : tuple
        Grid spacing ``(dx, dy, dz)``.
    points : array-like
        2D array-like with one point per row; rows are sorted by their third
        column before upload.
    dn_inner, rad_inner, dn_outer, rad_outer : float
        Forwarded to the kernel as float32 scalars.

    Returns
    -------
    OCLArray
        float32 buffer of shape ``(Nz, Ny, Nx)``.
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))
    dn_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)

    # order the points by their z coordinate (third column)
    pts = np.array(points)
    z_order = np.argsort(pts[:, 2])
    pts = pts[z_order, :]
    n_points = pts.shape[0]

    points_g = OCLArray.from_array(pts.flatten().astype(np.float32))

    # all remaining kernel arguments are float32 scalars, in this order
    scalar_args = [
        np.float32(value)
        for value in (dx, dy, dz, dn_inner, rad_inner, dn_outer, rad_outer)
    ]
    program.run_kernel("fill_dn", (Nx, Ny, Nz), None, dn_g.data,
                       points_g.data, np.int32(n_points), *scalar_args)

    return dn_g
Ejemplo n.º 10
0
def scale(data, scale=(1., 1., 1.), interp="linear"):
    """Return an interpolated, scaled version of data.

    Parameters
    ----------
    data : ndarray
        3D input array.
    scale : float or sequence of 3 floats
        ``(scale_z, scale_y, scale_x)``, or a single factor used for all
        three axes. The output shape is ``data.shape * scale`` truncated to
        integers.
    interp : str
        ``"linear"`` or ``"nearest"``.

    Returns
    -------
    ndarray
        float32 array with the scaled shape.

    Raises
    ------
    KeyError
        If ``interp`` is not one of the supported modes.
    ValueError
        If ``scale`` is a sequence whose length is not 3.
    """

    # build option selecting the interpolation mode in the kernel
    bop = {"linear": "", "nearest": "-D USENEAREST"}

    if interp not in bop:
        raise KeyError("interp = '%s' not defined ,valid: %s" %
                       (interp, bop.keys()))

    if not isinstance(scale, (tuple, list, np.ndarray)):
        scale = (scale, ) * 3

    if len(scale) != 3:
        # Wrap in a 1-tuple: with a bare tuple on the right-hand side the
        # % operator unpacked it and raised TypeError instead of this error.
        raise ValueError("scale = %s misformed" % (scale, ))

    d_im = OCLImage.from_array(data)

    nshape = np.array(data.shape) * np.array(scale)
    # builtin int: the ``np.int`` alias no longer exists in current NumPy
    nshape = tuple(nshape.astype(int))

    res_g = OCLArray.empty(nshape, np.float32)

    prog = OCLProgram(abspath("kernels/scale.cl"), build_options=[bop[interp]])

    prog.run_kernel("scale", res_g.shape[::-1], None, d_im, res_g.data)

    return res_g.get()
Ejemplo n.º 11
0
    def time_simple(N, nargs, niter=100):
        """Time ``nargs`` independent single-output reduction kernels.

        One ``OCLReductionKernel`` computing ``sum(i * x)`` is built per
        index ``i``; each is run ``niter`` times on its own buffer of ``N``
        ones.

        Returns
        -------
        float
            Mean wall-clock time in seconds for one pass over all kernels.
        """
        from time import time

        from gputools import OCLReductionKernel

        # range() instead of xrange(): works on Python 2 and Python 3
        ks = [
            OCLReductionKernel(np.float32,
                               neutral="0",
                               reduce_expr="a+b",
                               map_expr="%s*x[i]" % i,
                               arguments="__global float *x")
            for i in range(nargs)
        ]

        ins = [
            OCLArray.from_array(np.ones(N, np.float32))
            for _ in range(nargs)
        ]
        outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

        t = time()
        for _ in range(niter):
            for k, inn, out in zip(ks, ins, outs):
                k(inn, out=out)
        # wait for the queue to finish before stopping the clock
        get_device().queue.finish()
        t = (time() - t) / niter

        # index the single element explicitly: float() on a 1-element array
        # is deprecated in recent NumPy
        results = [float(out.get()[0]) for out in outs]
        # single-argument print() emits the same text on Python 2 and 3
        print("simple reduction: result = %s" % (results, ))
        print("simple reduction:\t\t%.2f ms" % (1000 * t))
        return t
Ejemplo n.º 12
0
def gpu_kuwahara(data, N=5):
    """Filter an image with the ``kuwahara`` OpenCL kernel on the GPU.

    Parameters
    ----------
    data : ndarray
        Input image; it is converted to float32 before upload.
    N : int
        Forwarded to the kernel as an int32; has to be odd.

    Returns
    -------
    ndarray
        float32 array of shape ``(data.shape[0], data.shape[1])``.

    Raises
    ------
    ValueError
        If ``N`` is even.
    """
    n_is_even = (N % 2 == 0)
    if n_is_even:
        raise ValueError("Data has to be a (2n+1)x(2n+1) array.")

    # input and output buffers on the device
    data_g = OCLArray.from_array(data.astype(float32))
    out_shape = (data.shape[0], data.shape[1])
    res_g = OCLArray.empty(out_shape, float32)

    # NOTE(review): the kernel path is relative to the current working directory
    prog = OCLProgram("./OpenCL/gpu_kernels/gpu_kuwahara.cl")

    global_size = data_g.shape[::-1]  # one work item per pixel
    local_size = None                 # work-group size is not specified
    prog.run_kernel("kuwahara", global_size, local_size,
                    data_g.data, res_g.data,
                    int32(N))

    return res_g.get()
Ejemplo n.º 13
0
def create_dn_buffer(size, units,points,
                     dn_inner = .0, rad_inner = 0,
                     dn_outer = .1, rad_outer = .4):
    """Run the ``fill_dn`` kernel and return the filled float32 GPU buffer.

    Parameters
    ----------
    size : tuple
        Grid size ``(Nx, Ny, Nz)``.
    units : tuple
        Grid spacing ``(dx, dy, dz)``.
    points : array-like
        2D array-like, one point per row; sorted by the third column before
        being uploaded as a flat float32 buffer.
    dn_inner, rad_inner, dn_outer, rad_outer : float
        Passed through to the kernel as float32 scalars.

    Returns
    -------
    OCLArray
        float32 buffer of shape ``(Nz, Ny, Nx)``.
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))
    dn_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)

    # sort the rows by z (third column)
    unsorted = np.array(points)
    by_z = unsorted[np.argsort(unsorted[:, 2]), :]
    point_count = by_z.shape[0]

    flat_points_g = OCLArray.from_array(by_z.flatten().astype(np.float32))

    spacing = (np.float32(dx), np.float32(dy), np.float32(dz))
    inner = (np.float32(dn_inner), np.float32(rad_inner))
    outer = (np.float32(dn_outer), np.float32(rad_outer))

    kernel_args = (dn_g.data, flat_points_g.data, np.int32(point_count))
    kernel_args += spacing + inner + outer
    program.run_kernel("fill_dn", (Nx, Ny, Nz), None, *kernel_args)

    return dn_g
Ejemplo n.º 14
0
def _convolve_buf(data_g, h_g, res_g=None):
    """Convolve two float32 GPU buffers (buffer variant).

    Parameters
    ----------
    data_g : OCLArray
        float32 input data.
    h_g : OCLArray
        float32 convolution kernel.
    res_g : OCLArray, optional
        Output buffer; a float32 buffer of ``data_g.shape`` is allocated when
        omitted.

    Returns
    -------
    OCLArray
        ``res_g``.
    """
    assert_bufs_type(np.float32, data_g, h_g)

    prog = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    Nhs = [np.int32(n) for n in h_g.shape]
    n_dims = len(data_g.shape)

    try:
        prog.run_kernel("convolve%sd_buf" % n_dims, data_g.shape[::-1], None,
                        data_g.data, h_g.data, res_g.data,
                        *Nhs)

    # ``cl.LogicError`` is the documented top-level name of the exception;
    # ``cl.cffi_cl`` only exists in the CFFI-based pyopencl builds, so
    # referring to it made this handler itself fail on other builds.
    except cl.LogicError as e:
        # -52 is CL_INVALID_KERNEL_ARGS: raised when the convolution kernel
        # is too big for constant memory; anything else is propagated.
        if e.code != -52:
            raise  # bare raise keeps the original traceback

        # retry with the "_global" variant of the kernel
        prog.run_kernel("convolve%sd_buf_global" % n_dims,
                        data_g.shape[::-1], None,
                        data_g.data, h_g.data, res_g.data,
                        *Nhs)

    return res_g
Ejemplo n.º 15
0
    def time_multi(N, nargs, niter=100):
        """Time one multi-output reduction kernel with ``nargs`` map expressions.

        A single ``OCLReductionKernel2`` computing ``sum(i * x_i)`` for every
        index ``i`` is run ``niter`` times on ``nargs`` buffers of ``N`` ones.

        Returns
        -------
        float
            Mean wall-clock time in seconds per kernel invocation.
        """
        from time import time

        # range() instead of xrange(): works on Python 2 and Python 3
        map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
        arguments = ",".join("__global float *x%s" % i for i in range(nargs))

        k = OCLReductionKernel2(np.float32,
                                neutral="0",
                                reduce_expr="a+b",
                                map_exprs=map_exprs,
                                arguments=arguments)

        ins = [
            OCLArray.from_array(np.ones(N, np.float32))
            for _ in range(nargs)
        ]
        outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

        t = time()
        for _ in range(niter):
            k(*ins, outs=outs)
        # wait for the queue to finish before stopping the clock
        get_device().queue.finish()
        t = (time() - t) / niter

        # index the single element explicitly: float() on a 1-element array
        # is deprecated in recent NumPy
        results = [float(out.get()[0]) for out in outs]
        # single-argument print() emits the same text on Python 2 and 3
        print("multi reduction: result = %s" % (results, ))
        print("multi reduction:\t\t%.2f ms" % (1000 * t))
        return t
Ejemplo n.º 16
0
def _fft_convolve_gpu(data_g, h_g, res_g = None,
                      plan = None, inplace = False,
                      kernel_is_fft = False):
    """FFT-based convolution of two complex64 GPU buffers of equal shape.

    Parameters
    ----------
    data_g : OCLArray
        complex64 data; overwritten with the result when ``inplace`` is set.
    h_g : OCLArray
        complex64 kernel, or already its FFT if ``kernel_is_fft`` is set.
        It is not modified.
    res_g : OCLArray, optional
        Output buffer used when not working in place; allocated if omitted.
    plan : optional
        FFT plan; created with ``fft_plan(data_g.shape)`` when omitted.
    inplace : bool
        Write the result into ``data_g``.
    kernel_is_fft : bool
        Skip the forward transform of ``h_g``.

    Returns
    -------
    OCLArray
        The buffer holding the result.

    Raises
    ------
    ValueError
        If the shapes of ``data_g`` and ``h_g`` differ.
    """
    # elementwise a[i] <- b[i] * a[i] for complex values
    multiply_spectra = OCLElementwiseKernel(
        "cfloat_t *a, cfloat_t * b",
        "a[i] = cfloat_mul(b[i],a[i])","mult")

    # the returned device is not needed here; the call itself is kept
    get_device()

    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    # pick the buffer all transforms are applied to
    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)

    # spectrum of the kernel; transform a copy so h_g stays untouched
    if kernel_is_fft:
        kern_g = h_g
    else:
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)

    # forward transform, multiply in Fourier domain, transform back
    fft(res_g, inplace=True, plan=plan)
    multiply_spectra(res_g, kern_g)
    fft(res_g, inplace=True, inverse=True, plan=plan)

    return res_g
Ejemplo n.º 17
0
def _ocl_fft_gpu(plan, ocl_arr,res_arr = None, inverse = False, batch = 1):
    """Execute ``plan`` on a complex64 GPU buffer.

    Parameters
    ----------
    plan
        Object whose ``execute`` method performs the transform.
    ocl_arr : OCLArray
        complex64 input buffer.
    res_arr : OCLArray, optional
        Output buffer; a complex64 buffer of the input's shape is allocated
        when omitted.
    inverse : bool
        Forwarded to ``plan.execute``.
    batch : int
        Forwarded to ``plan.execute``.

    Returns
    -------
    OCLArray
        The output buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    out_arr = res_arr
    if out_arr is None:
        out_arr = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, out_arr.data, inverse=inverse, batch=batch)
    return out_arr
Ejemplo n.º 18
0
def focus_field_debye_at(x,y,z,lam, NA, n0 = 1., n_integration_steps = 200):
    """Same as focus_field_debye, but for the coordinates given in x, y, z.

    Slower than focus_field_debye, as it does not assume the coordinates to
    lie on a grid.

    Parameters
    ----------
    x, y, z : ndarray
        Coordinate arrays of identical shape.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float or sequence of float
        A scalar is treated as ``[0., NA]``; a sequence must have an even
        number of entries. Converted to angles via ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    tuple of ndarray
        ``(u, ex, ey, ez)``, each with the shape of ``x``.
    """

    # parenthesised single argument: identical output on Python 2 and 3
    print(absPath("kernels/psf_debye.cl"))
    p = OCLProgram(absPath("kernels/psf_debye.cl"),
                   build_options = str("-I %s -D INT_STEPS=%s"%(absPath("."),n_integration_steps)))

    if np.isscalar(NA):
        NA = [0.,NA]

    alphas = np.arcsin(np.array(NA)/n0)
    assert len(alphas)%2 ==0

    assert x.shape == y.shape == z.shape
    dshape = x.shape
    N = np.prod(dshape)

    # the kernel works on flat float32 coordinate lists
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N,np.float32)
    ex_g = OCLArray.empty(N,np.complex64)
    ey_g = OCLArray.empty(N,np.complex64)
    ez_g = OCLArray.empty(N,np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at",(N,),None,
                 x_g.data,y_g.data,z_g.data,
                 ex_g.data,ey_g.data,ez_g.data, u_g.data,
                 np.float32(1.),np.float32(0.),
                 np.float32(lam/n0),
                 alpha_g.data, np.int32(len(alphas)))

    # restore the shape of the input coordinates
    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
Ejemplo n.º 19
0
def _deconv_rl_gpu_fft(data_g, h_g, Niter = 10):
    """Iterative (Richardson-Lucy style) deconvolution using ``fft_convolve``.

    Parameters
    ----------
    data_g : OCLArray
        Measured data; copied into a complex64 estimate buffer
        (presumably has to be complex64 itself -- confirm).
    h_g : OCLArray
        Kernel (PSF) of the same shape as ``data_g``. It is no longer
        modified: the Fourier transform is taken on a copy.
    Niter : int
        Number of iterations; the iteration index is printed each time.

    Returns
    -------
    OCLArray
        complex64 buffer with the final estimate.

    Raises
    ------
    ValueError
        If ``data_g`` and ``h_g`` differ in shape.
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    # current estimate, initialised with the data
    u_g = OCLArray.empty(data_g.shape, np.complex64)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape, np.complex64)

    # Reverse the kernel along all axes (previously only the first two, which
    # failed for 1D and was incomplete for 3D input; 2D is unchanged).
    # TODO: the flip still round-trips through host memory.
    h = h_g.get()
    hflip_g = OCLArray.from_array(h[(slice(None, None, -1), ) * h.ndim].copy())

    plan = fft_plan(data_g.shape)

    # Transform the PSFs once, up front. The forward FFT is done on a copy so
    # the caller's h_g is not silently replaced by its spectrum, and the plan
    # built above is actually used.
    hf_g = OCLArray.empty(h_g.shape, h_g.dtype)
    hf_g.copy_buffer(h_g)
    fft(hf_g, inplace=True, plan=plan)
    fft(hflip_g, inplace=True, plan=plan)

    for i in range(Niter):
        # progress output; print() form works on Python 2 and Python 3
        print(i)

        fft_convolve(u_g, hf_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        # presumably leaves data_g / tmp_g in tmp_g -- confirm against the helper
        _complex_divide_inplace(data_g, tmp_g)

        fft_convolve(tmp_g, hflip_g,
                     inplace=True,
                     kernel_is_fft=True)

        # presumably multiplies u_g by tmp_g in place -- confirm against the helper
        _complex_multiply_inplace(u_g, tmp_g)

    return u_g
Ejemplo n.º 20
0
def _ocl_star_dist(a, n_rays=32):
    """Run the ``star_dist`` OpenCL kernel on ``a``.

    Parameters
    ----------
    a : ndarray
        Input image; uploaded as a uint16 OpenCL image.
    n_rays : int
        Positive scalar; compiled into the kernel as ``N_RAYS`` and used as
        the size of the last output axis.

    Returns
    -------
    ndarray
        float32 array of shape ``a.shape + (n_rays,)``.
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    n_rays_ok = np.isscalar(n_rays) and 0 < int(n_rays)
    if not n_rays_ok:
        _raise(ValueError())
    n_rays = int(n_rays)

    out_shape = a.shape + (n_rays, )
    src = OCLImage.from_array(a.astype(np.uint16, copy=False))
    dst = OCLArray.empty(out_shape, dtype=np.float32)

    build_options = ['-D', 'N_RAYS=%d' % n_rays]
    program = OCLProgram(path_absolute("kernels/stardist2d.cl"),
                         build_options=build_options)
    program.run_kernel('star_dist', src.shape, None, dst.data, src)

    return dst.get()
Ejemplo n.º 21
0
def focus_field_debye_at(x, y, z, lam, NA, n0=1., n_integration_steps=200):
    """Same as focus_field_debye, but for the coordinates given in x, y, z.

    Slower than focus_field_debye, as it does not assume the coordinates to
    lie on a grid.

    Parameters
    ----------
    x, y, z : ndarray
        Coordinate arrays of identical shape.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float or sequence of float
        A scalar is treated as ``[0., NA]``; a sequence must have an even
        number of entries. Converted to angles via ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    tuple of ndarray
        ``(u, ex, ey, ez)``, each with the shape of ``x``.
    """

    # parenthesised single argument: identical output on Python 2 and 3
    print(absPath("kernels/psf_debye.cl"))
    p = OCLProgram(absPath("kernels/psf_debye.cl"),
                   build_options=str("-I %s -D INT_STEPS=%s" %
                                     (absPath("."), n_integration_steps)))

    if np.isscalar(NA):
        NA = [0., NA]

    alphas = np.arcsin(np.array(NA) / n0)
    assert len(alphas) % 2 == 0

    assert x.shape == y.shape == z.shape
    dshape = x.shape
    N = np.prod(dshape)

    # the kernel works on flat float32 coordinate lists
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N, np.float32)
    ex_g = OCLArray.empty(N, np.complex64)
    ey_g = OCLArray.empty(N, np.complex64)
    ez_g = OCLArray.empty(N, np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at", (N, ), None, x_g.data, y_g.data, z_g.data,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data, np.float32(1.),
                 np.float32(0.), np.float32(lam / n0), alpha_g.data,
                 np.int32(len(alphas)))

    # restore the shape of the input coordinates
    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
Ejemplo n.º 22
0
def time_gpu(dshape, niter=100, fast_math=False):
    """Time in-place complex64 FFTs of shape ``dshape`` on the GPU.

    Parameters
    ----------
    dshape : tuple
        Shape of the (uninitialised) test buffer.
    niter : int
        Number of transforms to average over.
    fast_math : bool
        Forwarded to ``fft_plan``.

    Returns
    -------
    float
        Mean wall-clock time per transform in seconds (previously nothing
        was returned; the value is also printed in milliseconds).
    """
    d_g = OCLArray.empty(dshape, np.complex64)
    # drain the queue so earlier work is not attributed to the FFTs
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)
    t = time()
    # range() instead of xrange(): works on Python 2 and Python 3
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    # wait for the queued transforms before stopping the clock
    get_device().queue.finish()
    t = (time()-t)/niter
    # single-argument print() emits the same text on Python 2 and 3
    print("GPU (fast_math = %s)\t%s\t\t%.2f ms"%(fast_math, dshape, 1000.*t))
    return t
Ejemplo n.º 23
0
def _fft_convolve_gpu(data_g, h_g, res_g = None,
                      plan = None, inplace = False,
                      kernel_is_fft = False):
    """FFT-based convolution of two complex64 GPU buffers of equal shape.

    Parameters
    ----------
    data_g : OCLArray
        complex64 data; overwritten with the result when ``inplace`` is set.
    h_g : OCLArray
        complex64 kernel, or already its FFT if ``kernel_is_fft`` is set.
        It is not modified.
    res_g : OCLArray, optional
        Output buffer used when not working in place; allocated if omitted.
    plan : optional
        FFT plan; created with ``fft_plan(data_g.shape)`` when omitted.
    inplace : bool
        Write the result into ``data_g``.
    kernel_is_fft : bool
        Skip the forward transform of ``h_g``.

    Returns
    -------
    OCLArray
        The buffer holding the result.

    Raises
    ------
    ValueError
        If the shapes of ``data_g`` and ``h_g`` differ.
    """
    # the returned device is not needed here; the call itself is kept
    get_device()

    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)

        res_g.copy_buffer(data_g)

    if not kernel_is_fft:
        # transform a copy of the kernel so h_g stays untouched
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)
    else:
        kern_g = h_g

    fft(res_g, inplace=True, plan=plan)

    # multiply in fourier domain
    # debug output kept as-is, in a form valid on Python 2 and Python 3
    print("%s %s" % (res_g.dtype, res_g.nbytes))
    _complex_multiply_kernel(res_g, kern_g)

    fft(res_g, inplace=True, inverse=True, plan=plan)

    return res_g
Ejemplo n.º 24
0
def time_gpu(dshape, niter=100, fast_math=False):
    """Benchmark in-place complex64 FFTs of shape ``dshape`` on the GPU.

    Parameters
    ----------
    dshape : tuple
        Shape of the (uninitialised) test buffer.
    niter : int
        Number of transforms to average over.
    fast_math : bool
        Forwarded to ``fft_plan``.

    Returns
    -------
    float
        Mean wall-clock time per transform in seconds; it is also printed
        in milliseconds.
    """
    d_g = OCLArray.empty(dshape, np.complex64)

    # make sure nothing else is pending before the clock starts
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)

    started = time()
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    # wait for the queued transforms before stopping the clock
    get_device().queue.finish()
    elapsed = time() - started

    t = elapsed / niter
    report = "GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t)
    print(report)
    return t
Ejemplo n.º 25
0
def perlin2(size, units, repeat=(10., ) * 2):
    """Evaluate the ``perlin2d`` OpenCL kernel on a 2D grid.

    Parameters
    ----------
    size : tuple
        Grid size; the result has shape ``size[::-1]``.
    units : tuple
        ``(dx, dy)``, forwarded to the kernel.
    repeat : tuple
        ``(wx, wy)``, forwarded to the kernel.

    Returns
    -------
    ndarray
        float32 array of shape ``size[::-1]``.
    """
    wx, wy = repeat
    dx, dy = units

    prog = OCLProgram(abspath("perlin.cl"))

    out_g = OCLArray.empty(size[::-1], np.float32)
    scalars = [np.float32(value) for value in (dx, dy, wx, wy)]
    prog.run_kernel("perlin2d", out_g.shape[::-1], None, out_g.data, *scalars)

    return out_g.get()
Ejemplo n.º 26
0
def scale_bicubic(data, scale=(1., 1., 1.)):
    """
    returns an interpolated, scaled version of data, computed by the
    ``scale_bicubic`` kernel

    the output shape is scaled too.

    Parameters
    ----------
    data: ndarray
        3d input array of dtype uint8, uint16 or float32
    scale: float, tuple
        scaling factor along each axis, or a single factor used for all
        three axes (NOTE(review): the axis order is whatever
        ``_scale_shape`` expects -- confirm)

    Returns
    -------
        scaled output, same dtype as the input

    Raises
    ------
    ValueError
        if data is not a 3d ndarray, its dtype is unsupported, or scale is
        a sequence whose length is not 3

    """

    if not (isinstance(data, np.ndarray) and data.ndim == 3):
        raise ValueError("input data has to be a 3d array!")

    # per-dtype build options selecting buffer type and image read function
    # NOTE(review): uint16 is mapped to the signed type name "short" --
    # confirm this is what the kernel expects.
    options_types = {
        np.uint8: ["-D", "TYPENAME=uchar", "-D", "READ_IMAGE=read_imageui"],
        np.uint16: ["-D", "TYPENAME=short", "-D", "READ_IMAGE=read_imageui"],
        np.float32: ["-D", "TYPENAME=float", "-D", "READ_IMAGE=read_imagef"],
    }

    dtype = data.dtype.type

    if dtype not in options_types:
        raise ValueError("type %s not supported! Available: %s" %
                         (dtype, str(list(options_types.keys()))))

    if not isinstance(scale, (tuple, list, np.ndarray)):
        scale = (scale, ) * 3

    if len(scale) != 3:
        # Wrap in a 1-tuple: with a bare tuple on the right-hand side the
        # % operator unpacked it and raised TypeError instead of this error.
        raise ValueError("scale = %s misformed" % (scale, ))

    d_im = OCLImage.from_array(data)

    nshape = _scale_shape(data.shape, scale)

    res_g = OCLArray.empty(nshape, dtype)

    prog = OCLProgram(abspath("kernels/scale.cl"),
                      build_options=options_types[dtype])

    prog.run_kernel("scale_bicubic", res_g.shape[::-1], None, d_im, res_g.data)

    return res_g.get()
Ejemplo n.º 27
0
def stardist_from_labels(a, n_rays=32):
    """Run the ``star_dist`` kernel on a label image.

    Assumes ``a`` to be a label image with integer values that encode object
    ids; id 0 denotes background.

    Parameters
    ----------
    a : ndarray
        Label image; uploaded as a uint16 OpenCL image.
    n_rays : int
        Compiled into the kernel as ``N_RAYS``; size of the last output axis.

    Returns
    -------
    ndarray
        float32 array of shape ``a.shape + (n_rays,)``.
    """
    labels_img = OCLImage.from_array(a.astype(np.uint16, copy=False))
    dist_g = OCLArray.empty(a.shape + (n_rays, ), dtype=np.float32)

    # the kernel source comes from the module-level string ``kernel``
    build_options = ["-D", "N_RAYS=%d" % n_rays]
    program = OCLProgram(src_str=kernel, build_options=build_options)
    program.run_kernel('star_dist', labels_img.shape, None, dist_g.data, labels_img)

    return dist_g.get()
Ejemplo n.º 28
0
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """Deconvolve data with the given psf (kernel) h via Lucy-Richardson iterations.

    Parameters
    ----------
    data: ndarray
        the measured image
    h: ndarray
        the psf, has to have the same shape as data
    Niter: int
        number of iterations to run
    h_is_fftshifted: bool
        if False, h is passed through np.fft.fftshift before use

    Returns
    -------
    ndarray
        the absolute value of the final estimate

    Raises
    ------
    ValueError
        if data and h do not have the same shape
    """

    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # NOTE(review): only the first two axes are flipped -- confirm for 3d input
    hflip = h[::-1, ::-1]

    # set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))

    tmp_g = OCLArray.empty(data.shape, np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    # NOTE(review): the plan is created but not handed to any call below;
    # the call is kept in case creating it has side effects callers rely on
    plan = fft_plan(data.shape)

    # transform both psf variants once, outside the iteration loop
    fft(hf_g, inplace=True)
    fft(hflip_f_g, inplace=True)

    for i in range(Niter):
        # print(i) with a single argument behaves the same on Python 2 and 3
        print(i)
        fft_convolve(u_g, hf_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        _complex_divide_inplace(y_g, tmp_g)

        fft_convolve(tmp_g, hflip_f_g,
                     inplace=True,
                     kernel_is_fft=True)

        _complex_multiply_inplace(u_g, tmp_g)

    return np.abs(u_g.get())
Ejemplo n.º 29
0
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Execute an fft plan on a complex64 GPU buffer.

    When no plan is given, one is created for ocl_arr.shape on
    get_device().queue; when res_arr is None a new complex64 buffer of the
    same shape is allocated.  Returns the buffer that was written to.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    fft_plan_ = plan if plan is not None else Plan(ocl_arr.shape,
                                                   queue=get_device().queue)
    out_arr = res_arr if res_arr is not None else OCLArray.empty(
        ocl_arr.shape, np.complex64)

    fft_plan_.execute(ocl_arr.data, out_arr.data, inverse=inverse)
    return out_arr
Ejemplo n.º 30
0
def perlin2(size, units, repeat=(10.,)*2):
    """Run the 'perlin2d' kernel and return the result as a host array.

    The output buffer is float32 with shape size[::-1]; units (dx, dy) and
    repeat (wx, wy) are handed to the kernel as float32 scalars.
    """
    wx, wy = repeat
    dx, dy = units

    perlin_prog = OCLProgram(abspath("perlin.cl"))
    noise_g = OCLArray.empty(size[::-1], np.float32)

    # kernel scalars in the order the kernel is called with: dx, dy, wx, wy
    scalars = [np.float32(v) for v in (dx, dy, wx, wy)]
    perlin_prog.run_kernel("perlin2d", noise_g.shape[::-1], None,
                           noise_g.data, *scalars)

    return noise_g.get()
Ejemplo n.º 31
0
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Transform the complex64 buffer ocl_arr with an fft plan.

    A missing plan is built for ocl_arr.shape on get_device().queue and a
    missing result buffer is allocated as complex64 with the input shape.
    The (possibly newly allocated) result buffer is returned.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is not None:
        active_plan = plan
    else:
        active_plan = Plan(ocl_arr.shape, queue=get_device().queue)

    if res_arr is not None:
        target = res_arr
    else:
        target = OCLArray.empty(ocl_arr.shape, np.complex64)

    active_plan.execute(ocl_arr.data, target.data, inverse=inverse)
    return target
Ejemplo n.º 32
0
def _ocl_star_dist(lbl, n_rays=32, grid=(1, 1)):
    """Run the 2d 'star_dist' kernel on the label image lbl.

    n_rays has to be a positive scalar, otherwise a ValueError is handed
    to _raise.  The float32 result has one entry per grid-subsampled pixel
    and ray, i.e. shape res_shape + (n_rays,), and is returned via .get().
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    valid_n_rays = np.isscalar(n_rays) and 0 < int(n_rays)
    if not valid_n_rays:
        _raise(ValueError())
    n_rays = int(n_rays)

    # same extent as slicing with tuple(slice(0, None, g) for g in grid)
    res_shape = tuple((s - 1) // g + 1 for s, g in zip(lbl.shape, grid))

    labels_im = OCLImage.from_array(lbl.astype(np.uint16, copy=False))
    dist_g = OCLArray.empty(res_shape + (n_rays, ), dtype=np.float32)

    compile_flags = ['-D', 'N_RAYS=%d' % n_rays]
    star_prog = OCLProgram(path_absolute("kernels/stardist2d.cl"),
                           build_options=compile_flags)
    star_prog.run_kernel('star_dist', res_shape[::-1], None, dist_g.data,
                         labels_im, np.int32(grid[0]), np.int32(grid[1]))
    return dist_g.get()
Ejemplo n.º 33
0
def focus_field_cylindrical_plane(shape=(128, 128),
                                  units=(.1, .1),
                                  z=0.,
                                  lam=.5,
                                  NA=.6,
                                  n0=1.,
                                  ex_g=None,
                                  n_integration_steps=200):
    """
    calculates the x component of the electric field at a given z position
    for a perfect, aberration free optical system via the vectorial debye
    diffraction integral for a cylindrical lens

    see
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review

    Appl. Opt. 52, 538-545 (2013)

    if ex_g is given, it is filled in place and None is returned,
    otherwise a new (Ny, Nx) complex64 buffer is used and its host copy
    is returned
    """

    kernel_file = absPath("kernels/psf_cylindrical.cl")
    compile_flags = str("-I %s -D INT_STEPS=%s" %
                        (absPath("."), n_integration_steps))
    p = OCLProgram(kernel_file, build_options=compile_flags)

    Nx, Ny = shape
    dx, dy = units

    alpha = np.arcsin(NA / n0)

    use_buffer = ex_g is not None
    if not use_buffer:
        ex_g = OCLArray.empty((Ny, Nx), np.complex64)

    assert ex_g.shape[::-1] == shape

    # only the y extent is handed to the kernel: [-y_half, y_half]
    y_half = dy * (Ny - 1) / 2.

    p.run_kernel("psf_cylindrical_plane", (Nx, Ny), None, ex_g.data,
                 np.float32(-y_half), np.float32(y_half), np.float32(z),
                 np.float32(lam / n0), np.float32(alpha))

    if use_buffer:
        return None
    return ex_g.get()
Ejemplo n.º 34
0
def focus_field_cylindrical_plane(shape=(128, 128),
                                  units=(.1, .1),
                                  z=0.,
                                  lam=.5, NA=.6, n0=1.,
                                  ex_g=None,
                                  n_integration_steps=200):
    """
    x component of the electric field at position z for a perfect,
    aberration free optical system, computed via the vectorial debye
    diffraction integral for a cylindrical lens

    see
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review

    Appl. Opt. 52, 538-545 (2013)

    a passed ex_g buffer is filled and nothing is returned; without one
    the field is returned as the host copy of a fresh complex64 buffer
    """

    source_path = absPath("kernels/psf_cylindrical.cl")
    options = str("-I %s -D INT_STEPS=%s" % (absPath("."), n_integration_steps))
    p = OCLProgram(source_path, build_options=options)

    Nx, Ny = shape
    dx, dy = units

    alpha = np.arcsin(NA/n0)

    if ex_g is not None:
        use_buffer = True
    else:
        use_buffer = False
        ex_g = OCLArray.empty((Ny, Nx), np.complex64)

    assert ex_g.shape[::-1] == shape

    # kernel scalars: lower/upper y bound, z, wavelength in the medium, half angle
    y_upper = dy*(Ny-1)/2.
    kernel_scalars = [np.float32(v)
                      for v in (-y_upper, y_upper, z, lam/n0, alpha)]

    p.run_kernel("psf_cylindrical_plane", (Nx, Ny), None,
                 ex_g.data,
                 *kernel_scalars)

    if not use_buffer:
        return ex_g.get()
Ejemplo n.º 35
0
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """
    Richardson-Lucy style iterations on GPU buffers using fft_convolve

    data_g and h_g have to have the same shape (ValueError otherwise).
    Note that h_g is fourier transformed in place, i.e. the caller's
    buffer is modified.  Returns the complex64 estimate buffer.
    """

    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    shape = data_g.shape

    # current estimate, initialised with the data
    estimate_g = OCLArray.empty(shape, np.complex64)
    estimate_g.copy_buffer(data_g)

    scratch_g = OCLArray.empty(shape, np.complex64)

    # fix this (flip is done on the host, and only along the first two axes)
    flipped_host = h_g.get()[::-1, ::-1]
    hflip_g = OCLArray.from_array(flipped_host.copy())

    # created but not handed to the calls below
    plan = fft_plan(shape)

    # transform both psf variants once, up front
    for psf_g in (h_g, hflip_g):
        fft(psf_g, inplace=True)

    for i in range(Niter):
        logger.info("Iteration: {}".format(i))
        fft_convolve(estimate_g, h_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(data_g, scratch_g)
        fft_convolve(scratch_g, hflip_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    return estimate_g
Ejemplo n.º 36
0
def resample_buf(data, new_shape):
    """Resample data to new_shape via an intermediate GPU image.

    Parameters
    ----------
    data: ndarray
        float32 or complex64 input array
    new_shape: tuple
        shape of the resampled output

    Returns
    -------
    the host copy of the resampled buffer (same dtype as data)

    Raises
    ------
    TypeError
        if data is neither float32 nor complex64
    """

    # validate before touching the GPU: previously an unsupported dtype left
    # `im` unassigned and only failed later with an UnboundLocalError
    dtype = data.dtype.type
    if dtype == np.float32:
        image_kwargs = {}
    elif dtype == np.complex64:
        # complex values are stored as a two channel float image
        image_kwargs = {"num_channels": 2}
    else:
        raise TypeError(
            "data type %s not supported, please convert to float32 or complex64"
            % dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape, data.dtype)

    im = OCLImage.empty(data.shape, dtype=np.float32, **image_kwargs)

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
Ejemplo n.º 37
0
def resample_buf(data, new_shape):
    """Resample data to new_shape on the GPU.

    The input is uploaded to a buffer, copied into a float32 image (two
    channels for complex64 input) and read back resampled into a buffer of
    shape new_shape, whose host copy is returned.

    Raises
    ------
    TypeError
        if data is neither float32 nor complex64
    """

    # extra OCLImage.empty keyword arguments per supported element type
    image_kwargs_by_type = {
        np.float32: {},
        np.complex64: {"num_channels": 2},
    }

    dtype = data.dtype.type
    if dtype not in image_kwargs_by_type:
        # fail early and clearly; without this check `im` was never assigned
        # and the function died with an UnboundLocalError
        raise TypeError(
            "data type %s not supported, please convert to float32 or complex64"
            % dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape, data.dtype)

    im = OCLImage.empty(data.shape, dtype=np.float32,
                        **image_kwargs_by_type[dtype])

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
Ejemplo n.º 38
0
    def _get_min_max(self):
        """Return (min, max) of the currently rendered data."""
        # np.amax is too slow for big arrays, so the reduction is tried on
        # the gpu first, with a numpy fallback on the first data item.
        # NOTE(review): if self.dataModel is falsy, neither value is ever
        # assigned and the return statement raises UnboundLocalError.

        if self.dataModel:
            try:
                img = self.renderer.dataImg
                buf_g = OCLArray.empty(img.shape, img.dtype)
                buf_g.copy_image(img)
                mi, ma = (float(cl_array.min(buf_g).get()),
                          float(cl_array.max(buf_g).get()))
            except Exception as err:
                print(err)
                mi, ma = (np.amin(self.dataModel[0]),
                          np.amax(self.dataModel[0]))
        return mi, ma
Ejemplo n.º 39
0
def _perlin3_single(size, units=(1.,)*3, repeat=(10.,)*3, offz=0, Nz0=None):
    """Run the 'perlin3d' kernel once and return the result as a host array.

    The float32 output buffer has shape size[::-1]; offz is passed as an
    int32 and units (dx, dy, dz) / repeat (wx, wy, wz) as float32 scalars.
    """
    if Nz0 is None:
        # defaulted here but not used further down
        Nz0 = size[-1]

    dx, dy, dz = units
    wx, wy, wz = repeat

    perlin_prog = OCLProgram(abspath("perlin.cl"))
    noise_g = OCLArray.empty(size[::-1], np.float32)

    scalars = [np.int32(offz)]
    scalars += [np.float32(v) for v in (dx, dy, dz, wx, wy, wz)]
    perlin_prog.run_kernel("perlin3d", noise_g.shape[::-1], None,
                           noise_g.data, *scalars)

    return noise_g.get()
Ejemplo n.º 40
0
    def _get_min_max(self):
        """Compute the (min, max) pair of the rendered image data."""
        # amax on the host is too slow for big arrays, so do it on the gpu
        # and only fall back to numpy (first data item) if that fails.
        # NOTE(review): with a falsy self.dataModel both values stay
        # unassigned and the final return raises UnboundLocalError.

        if self.dataModel:
            try:
                image = self.renderer.dataImg
                scratch = OCLArray.empty(image.shape, image.dtype)
                scratch.copy_image(image)
                lowest = cl_array.min(scratch).get()
                mi = float(lowest)
                highest = cl_array.max(scratch).get()
                ma = float(highest)
            except Exception as exc:
                print(exc)
                mi = np.amin(self.dataModel[0])
                ma = np.amax(self.dataModel[0])
        return mi, ma
Ejemplo n.º 41
0
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """ lucy richardson deconvolution of data with the psf (kernel) h

    data and h have to be of the same shape, otherwise a ValueError is
    raised; unless h_is_fftshifted is set, h is fftshifted first.

    returns the absolute value of the final estimate
    """

    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    psf = h if h_is_fftshifted else np.fft.fftshift(h)
    psf_flipped = psf[::-1, ::-1]

    def _upload(arr):
        # every host array goes to the gpu as complex64
        return OCLArray.from_array(arr.astype(np.complex64))

    y_g = _upload(data)
    u_g = _upload(data)

    tmp_g = OCLArray.empty(data.shape, np.complex64)

    hf_g = _upload(psf)
    hflip_f_g = _upload(psf_flipped)

    # created but not handed to the calls below
    plan = fft_plan(data.shape)

    # transform both psf variants once, before iterating
    for psf_g in (hf_g, hflip_f_g):
        fft(psf_g, inplace=True)

    for i in range(Niter):
        logger.info("Iteration: {}".format(i))
        fft_convolve(u_g, hf_g, res_g=tmp_g, kernel_is_fft=True)
        _complex_divide_inplace(y_g, tmp_g)
        fft_convolve(tmp_g, hflip_f_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(u_g, tmp_g)

    estimate = u_g.get()
    return np.abs(estimate)
Ejemplo n.º 42
0
def create(dimensions):
    '''
    Convenience method for allocating float32 images on the GPU, basically
    mirroring what CLIJ does:

    https://github.com/clij/clij2/blob/master/src/main/java/net/haesleinhuepf/clij2/CLIJ2.java#L156

    :param dimensions: size of the image; either an OCLArray (its shape is
        reused as is) or a sequence whose entries are taken in reversed
        order (two entries, or otherwise the first three)
    :return: OCLArray, potentially with random values
    '''
    if isinstance(dimensions, OCLArray):
        shape = dimensions.shape
    elif len(dimensions) == 2:
        shape = (dimensions[1], dimensions[0])
    else:
        shape = (dimensions[2], dimensions[1], dimensions[0])

    return OCLArray.empty(shape, np.float32)
Ejemplo n.º 43
0
def gpu_structure(data):
    """Run the "structure" filter kernel on an image on the GPU.

    data is uploaded as float32; the result buffer has shape
    (data.shape[0], data.shape[1], 2) and its host copy is returned.
    """
    image_g = OCLArray.from_array(data.astype(float32))

    result_shape = (data.shape[0], data.shape[1], 2)
    result_g = OCLArray.empty(result_shape, float32)

    structure_prog = OCLProgram("./OpenCL/gpu_kernels/gpu_structure.cl")

    # global size is the reversed input shape, e.g. (128,128,);
    # the local size is left to None
    global_size = image_g.shape[::-1]
    structure_prog.run_kernel("structure", global_size, None,
                              image_g.data, result_g.data)

    return result_g.get()
Ejemplo n.º 44
0
def _perlin3_single(size,
                    units=(1., ) * 3,
                    repeat=(10., ) * 3,
                    offz=0,
                    Nz0=None):
    """Compute one block of 3d perlin noise with the 'perlin3d' kernel.

    Allocates a float32 buffer of shape size[::-1], launches the kernel with
    the z offset (int32) followed by the units and repeat triples (float32)
    and returns the host copy of the buffer.
    """
    # Nz0 falls back to the last entry of size; it is not used afterwards
    Nz0 = size[-1] if Nz0 is None else Nz0

    dx, dy, dz = units
    wx, wy, wz = repeat

    program = OCLProgram(abspath("perlin.cl"))

    out_g = OCLArray.empty(size[::-1], np.float32)
    global_size = out_g.shape[::-1]
    program.run_kernel(
        "perlin3d", global_size, None, out_g.data,
        np.int32(offz),
        np.float32(dx), np.float32(dy), np.float32(dz),
        np.float32(wx), np.float32(wy), np.float32(wz),
    )

    return out_g.get()
Ejemplo n.º 45
0
def _convolve_buf(data_g, h_g, res_g=None):
    """
    buffer variant of the convolution

    data_g and h_g have to be float32 buffers.  The kernel is picked by the
    number of dimensions of data_g and gets the kernel shape as trailing
    int32 arguments.  The result goes into res_g (allocated as float32 with
    the shape of data_g if not given), which is returned.
    """
    assert_bufs_type(np.float32, data_g, h_g)

    conv_prog = OCLProgram(abspath("kernels/convolve.cl"))

    out_g = res_g
    if out_g is None:
        out_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    kernel_dims = [np.int32(extent) for extent in h_g.shape]
    ndim = len(data_g.shape)

    conv_prog.run_kernel("convolve%sd_buf" % (ndim),
                         data_g.shape[::-1], None,
                         data_g.data, h_g.data, out_g.data,
                         *kernel_dims)

    return out_g
Ejemplo n.º 46
0
def gpu_mean(data, Nx=10, Ny=10):
    """Run the "mean" filter kernel on an image on the GPU.

    data is uploaded as float32, Nx and Ny are handed to the kernel as
    int32 values, and the host copy of the float32 result (same shape as
    data) is returned.
    """
    image_g = OCLArray.from_array(data.astype(float32))
    result_g = OCLArray.empty(data.shape, float32)

    mean_prog = OCLProgram("./OpenCL/gpu_kernels/gpu_mean.cl")

    # global size is the reversed input shape, e.g. (128,128,);
    # the local size is left to None
    global_size = image_g.shape[::-1]
    window = (int32(Nx), int32(Ny))
    mean_prog.run_kernel("mean", global_size, None,
                         image_g.data, result_g.data,
                         *window)

    return result_g.get()
Ejemplo n.º 47
0
def perlin2(size, units=None, repeat=(10.,)*2, scale=None, shift=(0, 0)):
    """
    2d perlin noise

    Either scale, e.g. (10.,10.), or units, e.g. (5.,5.), has to be given.

    Parameters
    ----------
    size:
        size of the output; the buffer is allocated with shape size[::-1]
    units: tuple, optional
        (dx, dy) handed to the kernel; replaced by (1., 1.) if scale is given
    repeat: tuple
        (wx, wy) handed to the kernel; replaced by scale if scale is given
    scale: float or tuple, optional
        the characteristic length in pixels, a scalar is used for both axes
    shift: tuple
        (offset_x, offset_y) handed to the kernel

    Returns
    -------
    the host copy of the float32 noise buffer

    Raises
    ------
    ValueError
        if neither scale nor units is given
    """

    if scale:
        if np.isscalar(scale):
            scale = (scale,)*2
        repeat = scale
        units = (1.,)*2

    if units is None:
        # previously this surfaced as an opaque "cannot unpack NoneType" error
        raise ValueError("either scale or units have to be given")

    wx, wy = repeat
    dx, dy = units
    offset_x, offset_y = shift

    prog = OCLProgram(abspath("kernels/perlin.cl"))

    d = OCLArray.empty(size[::-1], np.float32)
    prog.run_kernel("perlin2d", d.shape[::-1], None,
                    d.data,
                    np.float32(dx), np.float32(dy),
                    np.float32(wx), np.float32(wy),
                    np.float32(offset_x), np.float32(offset_y),
                    )

    return d.get()
Ejemplo n.º 48
0
def perlin2(size, units=None, repeat=(10.,)*2, scale=None, shift=(0, 0)):
    """
    2d perlin noise computed with the 'perlin2d' kernel

    One of scale, e.g. (10.,10.), or units, e.g. (5.,5.), is required.

    Parameters
    ----------
    size:
        size of the output; the buffer is allocated with shape size[::-1]
    units: tuple, optional
        (dx, dy) kernel arguments; set to (1., 1.) when scale is given
    repeat: tuple
        (wx, wy) kernel arguments; overridden by scale when scale is given
    scale: float or tuple, optional
        the characteristic length in pixels (a scalar applies to both axes)
    shift: tuple
        (offset_x, offset_y) kernel arguments

    Returns
    -------
    the host copy of the float32 noise buffer

    Raises
    ------
    ValueError
        if neither scale nor units is given
    """

    if scale:
        if np.isscalar(scale):
            scale = (scale,)*2
        repeat = scale
        units = (1.,)*2

    # without this guard the tuple unpacking below failed with an
    # unhelpful TypeError about a non-iterable NoneType
    if units is None:
        raise ValueError("either scale or units have to be given")

    wx, wy = repeat
    dx, dy = units
    offset_x, offset_y = shift

    prog = OCLProgram(abspath("kernels/perlin.cl"))

    d = OCLArray.empty(size[::-1], np.float32)
    prog.run_kernel("perlin2d", d.shape[::-1], None,
                    d.data,
                    np.float32(dx), np.float32(dy),
                    np.float32(wx), np.float32(wy),
                    np.float32(offset_x), np.float32(offset_y),
                    )

    return d.get()
Ejemplo n.º 49
0
    def time_multi(N, nargs, niter=100):
        """Time a single reduction kernel that evaluates nargs map expressions.

        Builds one OCLReductionKernel2 summing i*x_i[j] for each of the nargs
        inputs (arrays of N ones), calls it niter times, waits for the queue
        to finish, prints the results and the mean time per call, and returns
        that mean time in seconds.
        """
        map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
        arguments = ",".join("__global float *x%s" % i for i in range(nargs))

        k = OCLReductionKernel2(np.float32,
                                neutral="0", reduce_expr="a+b",
                                map_exprs=map_exprs,
                                arguments=arguments)

        ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(len(map_exprs))]
        outs = [OCLArray.empty(1, np.float32) for _ in range(len(map_exprs))]

        from time import time
        t = time()
        for _ in range(niter):
            k(*ins, outs=outs)
        # wait for all queued kernels so the measured time covers the actual work
        get_device().queue.finish()
        t = (time()-t)/niter
        # single-argument print calls produce the same output on Python 2 and 3
        print("multi reduction: result = %s" % ([float(out.get()) for out in outs],))
        print("multi reduction:\t\t%.2f ms" % (1000*t))
        return t
Ejemplo n.º 50
0
    def _scan_single(src, dst, ns, strides):
        """Launch the 'scan2d' kernel block by block along x, recursing on the block sums.

        ns is (nx, ny) and strides is (stride_x, stride_y); both pairs are
        forwarded to the kernels.  prog, max_local_size and dtype_itemsize
        are taken from the enclosing scope (not visible here).
        NOTE(review): what exactly the kernels compute is defined in the .cl
        source -- the comments below only describe the host side.
        """
        nx, ny = ns
        stride_x, stride_y = strides
        # work-group size along x, capped at half the maximal local size
        loc = min(next_power_of_2(nx // 2), max_local_size // 2)
        # number of elements handled per block (twice the work-group size)
        nx_block = 2 * loc
        # nx rounded up to a multiple of the block size
        nx_pad = math.ceil(nx / nx_block) * nx_block

        nblocks = math.ceil(nx_pad // 2 / loc)
        # one entry per row and block; presumably receives the per-block
        # totals from scan2d -- confirm against the kernel
        sum_blocks = OCLArray.empty((ny, nblocks), dst.dtype)
        # local memory sized for 2 * loc elements of the buffer dtype
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)
        # one kernel launch per block, each with a single work group along x
        for b in range(nblocks):
            offset = b * loc
            prog.run_kernel("scan2d", (loc, ny), (loc, 1),
                            src.data, dst.data, sum_blocks.data, shared,
                            np.int32(nx_block), np.int32(stride_x), np.int32(stride_y), np.int32(offset), np.int32(b),
                            np.int32(nblocks), np.int32(nx))
        if nblocks > 1:
            # scan the block sums in place, then let 'add_sums2d' combine
            # them with dst
            _scan_single(sum_blocks, sum_blocks, (nblocks, ny), (1, nblocks))
            prog.run_kernel("add_sums2d", (nx_pad, ny), (nx_block, 1),
                            sum_blocks.data, dst.data,
                            np.int32(stride_x), np.int32(stride_y), np.int32(nblocks), np.int32(nx))
Ejemplo n.º 51
0
def _ocl_star_dist3D(lbl, rays, grid=(1, 1, 1)):
    """Run the 'stardist3d' kernel on the label volume lbl.

    grid is normalized with _normalize_grid; the float32 result has shape
    (s // g for each axis) + (len(rays),) and is returned via .get().
    rays.vertices is uploaded as a float32 buffer.
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    grid = _normalize_grid(grid, 3)

    res_shape = tuple(extent // step for extent, step in zip(lbl.shape, grid))

    labels_im = OCLImage.from_array(lbl.astype(np.uint16, copy=False))
    out_g = OCLArray.empty(res_shape + (len(rays), ), dtype=np.float32)
    vertices_g = OCLArray.from_array(
        rays.vertices.astype(np.float32, copy=False))

    # the number of rays is a compile-time define of the kernel
    compile_flags = ['-D', 'N_RAYS=%d' % len(rays)]
    star_prog = OCLProgram(path_absolute("kernels/stardist3d.cl"),
                           build_options=compile_flags)

    grid_args = [np.int32(grid[0]), np.int32(grid[1]), np.int32(grid[2])]
    star_prog.run_kernel('stardist3d', res_shape[::-1], None, labels_im,
                         vertices_g.data, out_g.data, *grid_args)

    return out_g.get()
Ejemplo n.º 52
0
def affine(data, mat = np.identity(4), mode ="linear"):
    """affine transform of data with the matrix mat on the GPU

    mode is either "linear" or "nearest" (KeyError otherwise).  The inverse
    of mat is uploaded as float32 and handed to the 'affine' kernel; the
    float32 result has the shape of data and is returned via .get().
    """

    build_flag_by_mode = {"linear":"","nearest":"-D USENEAREST"}

    if mode not in build_flag_by_mode:
        raise KeyError("mode = '%s' not defined ,valid: %s"%(mode, build_flag_by_mode.keys()))

    image = OCLImage.from_array(data)
    out_g = OCLArray.empty(data.shape, np.float32)

    inverse = np.linalg.inv(mat)
    inverse_g = OCLArray.from_array(inverse.astype(np.float32, copy=False))

    transform_prog = OCLProgram(abspath("kernels/transformations.cl"),
                                build_options=[build_flag_by_mode[mode]])

    transform_prog.run_kernel("affine", data.shape[::-1], None,
                              image, out_g.data, inverse_g.data)

    return out_g.get()
Ejemplo n.º 53
0
def _convolve3_old(data, h, dev=None):
    """convolve 3d data with the kernel h on the GPU

    boundary conditions are clamping to edge.
    h is converted to float32; data has to be float32 or uint16
    (TypeError otherwise).

    if dev is None the default device is looked up, and a ValueError is
    raised if none is found.  returns the float32 result as a host array.
    """

    dev = get_device() if dev is None else dev
    if dev is None:
        raise ValueError("no OpenCLDevice found...")

    # compile-time option per supported input type
    dtypes_options = {np.float32:"",
                      np.uint16:"-D SHORTTYPE"}

    dtype = data.dtype.type
    if dtype not in dtypes_options:
        raise TypeError("data type %s not supported yet, please convert to:"%dtype,dtypes_options.keys())

    conv_prog = OCLProgram(abspath("kernels/convolve3.cl"),
                           build_options=dtypes_options[dtype])

    kernel_g = OCLArray.from_array(h.astype(np.float32))
    image = OCLImage.from_array(data)
    out_g = OCLArray.empty(data.shape, dtype=np.float32)

    # data extents followed by kernel extents, all as int32
    extents = [np.int32(n) for n in data.shape+h.shape]

    conv_prog.run_kernel("convolve3d", image.shape, None,
                         image, kernel_g.data, out_g.data,
                         *extents)

    return out_g.get()
Ejemplo n.º 54
0
def scale(data, scale = (1.,1.,1.), interp = "linear"):
    """returns an interpolated, scaled version of data

    Parameters
    ----------
    data: ndarray
        the input volume
    scale: float or sequence of 3 floats
        scale = (scale_z,scale_y,scale_x) or a single factor for all axes
    interp: str
        "linear" | "nearest"

    Returns
    -------
    the scaled float32 array, of shape data.shape * scale truncated to
    integers

    Raises
    ------
    KeyError
        if interp is not one of the supported modes
    ValueError
        if scale does not have exactly three entries
    """

    bop = {"linear":[],"nearest":["-D","USENEAREST"]}

    if not interp in bop.keys():
        raise KeyError("interp = '%s' not defined ,valid: %s"%(interp,bop.keys()))

    if not isinstance(scale,(tuple, list, np.ndarray)):
        scale = (scale,)*3

    if len(scale) != 3:
        raise ValueError("scale = %s misformed"%scale)

    d_im = OCLImage.from_array(data)

    nshape = np.array(data.shape)*np.array(scale)
    # np.int was only an alias of the builtin int and was removed in
    # NumPy 1.24; the builtin gives the identical result
    nshape = tuple(nshape.astype(int))

    res_g = OCLArray.empty(nshape,np.float32)

    prog = OCLProgram(abspath("kernels/scale.cl"), build_options=bop[interp])

    prog.run_kernel("scale",
                    res_g.shape[::-1],None,
                    d_im,res_g.data)

    return res_g.get()
Ejemplo n.º 55
0
    def time_simple(N, nargs, niter=100):
        """Time nargs separate single-expression reduction kernels.

        Builds one OCLReductionKernel per index i (summing i*x[j] over an
        array of N ones), runs all of them niter times, waits for the queue
        to finish, prints the results and the mean time per iteration, and
        returns that mean time in seconds.
        """
        from gputools import OCLReductionKernel

        map_exprs = ["%s*x[i]" % i for i in range(nargs)]

        ks = [OCLReductionKernel(np.float32,
                                 neutral="0", reduce_expr="a+b",
                                 map_expr="%s*x[i]" % i,
                                 arguments="__global float *x") for i in range(len(map_exprs))]

        ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(len(map_exprs))]
        outs = [OCLArray.empty(1, np.float32) for _ in range(len(map_exprs))]

        from time import time
        t = time()
        for _ in range(niter):
            for k, inn, out in zip(ks, ins, outs):
                k(inn, out=out)
        # wait for all queued kernels so the measured time covers the actual work
        get_device().queue.finish()
        t = (time()-t)/niter
        # single-argument print calls produce the same output on Python 2 and 3
        print("simple reduction: result = %s" % ([float(out.get()) for out in outs],))
        print("simple reduction:\t\t%.2f ms" % (1000*t))
        return t
Ejemplo n.º 56
0
def focus_field_lattice_plane(shape=(256, 256),
                              units=(.1, .1),
                              z=0.,
                              lam=.5,
                              NA1=.4, NA2=.5,
                              sigma=.1,
                              kpoints=6,
                              n0=1.,
                              apodization_bound=10,
                              ex_g=None,
                              n_integration_steps=100):
    """calculates the complex 2d input field at position -z for a bessel lattice beam.


    Parameters
    ----------

    shape: Nx,Ny
        the shape of the geometry
    units: dx,dy
        the pixel sizes in microns
    z:  float
        defocus position in microns, such that the beam would focus at z
        e.g. an input field with z = 10. would have its focal spot after 10 microns
    lam: float
        the wavelength of light used in microns
    NA1: float/list
        the numerical aperture of the inner ring
    NA2: float/list
        the numerical aperture of the outer ring
    sigma: float
        the standard deviation of the gaussian smear function applied to each point on the aperture
        (the bigger sigma, the tighter the sheet in y)
    kpoints: int/ (2,N) array
        defines the set of points on the aperture that create the lattice, can be
        - a (2,N) ndarray, such that kpoints[:,i] are the coordinates of the ith point
        - a single int, defining points on a regular polygon (e.g. 4 for a square lattice, 6 for a hex lattice)
        :math:`k_i = \\arcsin\\frac{NA_1+NA_2}{2 n_0} \\begin{pmatrix} \\cos \\phi_i \\\\ \\sin \\phi_i \\end{pmatrix}\\quad, \\phi_i = \\frac{\\pi}{2}+\\frac{2i}{N}`
    n0: float
        the refractive index of the medium
    apodization_bound: int
        width of the region where the input field is tapered to zero (with a hamming window) on the +/- x borders
    ex_g: OCLArray, optional
        if given, has to be of shape (Ny, Nx); it is filled in place and
        nothing is returned
    n_integration_steps: int
        number of integration steps to perform

    Returns
    -------
    u: ndarray
        the 2d complex field (only if ex_g is None, otherwise None)

    Example
    -------

    >>> u = focus_field_lattice_plane((128,128), (0.1,0.1), z = 2., lam=.5, NA1 = .44, NA2 = .55, kpoints = 6)

    See also
    --------
    biobeam.focus_field_lattice: the corresponding 3d function

    """

    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=["-I", absPath("kernels"), "-D", "INT_STEPS=%s"%n_integration_steps])

    Nx, Ny = shape
    dx, dy = units

    alpha1 = np.arcsin(1.*NA1/n0)
    alpha2 = np.arcsin(1.*NA2/n0)

    if np.isscalar(kpoints):
        kxs, kys = np.arcsin(.5*(NA1+NA2)/n0)*_poly_points(kpoints)
    else:
        kxs, kys = 1.*kpoints/n0

    if ex_g is None:
        use_buffer = False
        ex_g = OCLArray.empty((Ny, Nx), np.complex64)
    else:
        use_buffer = True

    assert ex_g.shape[::-1]==shape

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    # half extents of the grid in physical units. These have to use true
    # division: the former floor division (//2.) rounded the float bounds
    # to whole numbers and made the interval asymmetric, e.g. [-13., 12.]
    # instead of [-12.75, 12.75] for dx=.1, Nx=256
    x_half = dx*(Nx-1)/2.
    y_half = dy*(Ny-1)/2.

    t = time.time()

    p.run_kernel("debye_wolf_lattice_plane", (Nx, Ny),
                 None,
                 ex_g.data,
                 np.float32(1.), np.float32(0.),
                 np.float32(-x_half), np.float32(x_half),
                 np.float32(-y_half), np.float32(y_half),
                 np.float32(-z),
                 np.float32(1.*lam/n0),
                 np.float32(alpha1),
                 np.float32(alpha2),
                 kxs_g.data,
                 kys_g.data,
                 np.int32(len(kxs)),
                 np.float32(sigma),
                 np.int32(apodization_bound),
                 )

    if not use_buffer:
        res = ex_g.get()
        print("time in secs:", time.time()-t)
        return res
Ejemplo n.º 57
0
def focus_field_lattice(shape=(128, 128, 128),
                        units=(0.1, 0.1, 0.1),
                        lam=.5, NA1=.4, NA2=.5,
                        sigma=.1,
                        kpoints=6,
                        return_all_fields=False,
                        n0=1., n_integration_steps=100):
    """Calculates the focus field for a bessel lattice.

    The pupil function consists of discrete points (kpoints) superimposed on an annulus (NA1<NA2)
    which are smeared out by a 1d gaussian of given sigma creating an array of bessel beams in the
    focal plane (see [3]_ ).


    Parameters
    ----------

    shape: Nx,Ny,Nz
        the shape of the geometry
    units: dx,dy,dz
        the pixel sizes in microns
    lam: float
        the wavelength of light used in microns
    NA1: float/list
        the numerical aperture of the inner ring
    NA2: float/list
        the numerical aperture of the outer ring
    sigma: float
        the standard deviation of the gaussian smear function applied to each point on the aperture
        (the bigger sigma, the tighter the sheet in y)
    kpoints: int/ (2,N) array
        defines the set of points on the aperture that create the lattice, can be
        - a (2,N) ndarray (or an equivalent nested list/tuple), such that kpoints[:,i] are the
          coordinates of the ith point
        - a single int, defining points on a regular polygon (e.g. 4 for a square lattice, 6 for a hex lattice)
        :math:`k_i = \\arcsin\\frac{NA_1+NA_2}{2 n_0} \\begin{pmatrix} \\cos \\phi_i \\\\ \\sin \\phi_i \\end{pmatrix}\\quad, \\phi_i = \\frac{\\pi}{2}+\\frac{2i}{N}`

    n0: float
        the refractive index of the medium
    n_integration_steps: int
        number of integration steps to perform (passed to the kernel build as INT_STEPS)
    return_all_fields: boolean
        if True, returns u,ex,ey,ez where ex/ey/ez are the complex vector field components

    Returns
    -------
    u: ndarray
        the intensity of the focus field
    (u,ex,ey,ez): list(ndarray)
        the intensity of the focus field and the complex field components (if return_all_fields is True)

    Example
    -------

    >>> u = focus_field_lattice((128,128,128), (0.1,0.1,.1), lam=.5, NA1 = .44, NA2 = .55, kpoints = 6)

    References
    ----------

    .. [3] Chen et al. Lattice light-sheet microscopy: imaging molecules to embryos at high spatiotemporal resolution. Science 346, (2014).


    """

    # half opening angles of the inner and outer ring of the annulus
    alpha1 = np.arcsin(1.*NA1/n0)
    alpha2 = np.arcsin(1.*NA2/n0)

    if np.isscalar(kpoints):
        # regular polygon placed at the mean aperture angle of the annulus
        kxs, kys = np.arcsin(.5*(NA1+NA2)/n0)*_poly_points(kpoints)
    else:
        # np.asarray makes plain nested lists/tuples work as well;
        # ndarray inputs pass through unchanged
        kxs, kys = 1.*np.asarray(kpoints)/n0

    # the number of integration steps is a compile time constant of the kernel
    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=["-I", absPath("kernels"), "-D", "INT_STEPS=%s"%n_integration_steps])

    kxs = np.array(kxs)
    kys = np.array(kys)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    # output buffers, allocated with shape (Nz, Ny, Nx)
    u_g = OCLArray.empty((Nz, Ny, Nx), np.float32)
    ex_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    p.run_kernel("debye_wolf_lattice", (Nx, Ny, Nz),
                 None,
                 ex_g.data,
                 ey_g.data,
                 ez_g.data,
                 u_g.data,
                 # two constant float arguments; their meaning is defined by the
                 # kernel signature, which is not visible here
                 np.float32(1.), np.float32(0.),
                 # presumably the (lower, upper) coordinate bounds per axis - TODO confirm
                 # against the kernel; for even N they evaluate to -N/2*d and (N/2-1)*d,
                 # i.e. they are not symmetric around zero
                 np.float32(dx*(-Nx//2)), np.float32(dx*(Nx//2-1)),
                 np.float32(dy*(-Ny//2)), np.float32(dy*(Ny//2-1)),
                 np.float32(dz*(-Nz//2)), np.float32(dz*(Nz//2-1)),
                 # wavelength inside the medium
                 np.float32(1.*lam/n0),
                 np.float32(alpha1),
                 np.float32(alpha2),
                 kxs_g.data,
                 kys_g.data,
                 np.int32(len(kxs)),
                 np.float32(sigma)
                 )

    u = u_g.get()

    if return_all_fields:
        return u, ex_g.get(), ey_g.get(), ez_g.get()
    return u
Ejemplo n.º 58
0
    def _propagate_core(self,
                        u0=None,
                        dn_ind_start=0,
                        dn_ind_end=1,
                        dn_ind_offset=0,
                        return_comp="field",
                        return_shape="full",
                        free_prop=False,
                        dn_mean_method="none",
                        **kwargs):
        """
        The core propagation loop.

        The refractive index dn is assumed to be already residing in gpu
        memory and the initial field is taken from self._buf_plane.

        NOTE: u0 and **kwargs are accepted but not used by this method.

        Every one of the (dn_ind_end-dn_ind_start-1) plane steps consists of
        self.simul_z sub-steps, each doing: forward fft, multiplication with
        the propagator buffer self._buf_H, inverse fft and (unless free_prop)
        the refractive index multiplication kernel.

        Arguments:
            dn_ind_start, dn_ind_end:
                first plane index and end index; Nz = dn_ind_end-dn_ind_start
                planes are returned in "full" mode
            dn_ind_offset:
                additional offset used when indexing self.dn_mean
            return_comp in ["field", "intens"]:
                selects the complex or the real result dtype
            return_shape in ["last", "full"]:
                "full" returns all planes, "last" only the final content of
                self._buf_plane
            free_prop = False | True:
                if True (or if self.dn is None) the refractive index
                multiplication is skipped
            dn_mean_method = "none", "global", "local":
                how the mean refractive index entering the propagator is
                updated after each plane

        Raises ValueError for an unknown return_comp or return_shape.
        """

        print("mean method: ", dn_mean_method)

        free_prop = free_prop or (self.dn is None)

        if return_comp=="intens":
            res_type = Bpm3d._real_type
        elif return_comp=="field":
            res_type = Bpm3d._complex_type
        else:
            raise ValueError(return_comp)

        if return_shape not in ["last", "full"]:
            raise ValueError()

        keep_all_planes = return_shape=="full"
        local_mean = dn_mean_method=="local"
        global_mean = dn_mean_method=="global"

        Nx, Ny, _ = self.shape
        Nz = dn_ind_end-dn_ind_start

        assert dn_ind_start>=0

        def store_plane(dest, flat_offset, via_image):
            # copies the current plane into dest at the given flat offset,
            # optionally going through the intermediate image self._img_xy
            if via_image:
                self._img_xy.copy_buffer(self._buf_plane)
                self._copy_down_img(self._img_xy, dest, flat_offset)
            else:
                self._copy_down_buf(self._buf_plane, dest, flat_offset)

        if keep_all_planes:
            u = OCLArray.empty((Nz, Ny, Nx), dtype=res_type)
            # the first plane is the initial field itself
            store_plane(u, 0, self._is_subsampled)

        dn0 = 0

        if local_mean and self.dn is not None and not free_prop:
            # running sums used by the local mean estimate, seeded with the
            # precomputed mean of the first plane
            self.intens_sum_g = OCLArray.from_array(np.ones(1,dtype=Bpm3d._real_type))
            self.intens_dn_sum_g = OCLArray.from_array((self.dn_mean[dn_ind_start+dn_ind_offset]*
                                                       np.ones(1)).astype(dtype=Bpm3d._real_type))

        self._fill_propagator(self.n0)

        for i in range(Nz-1):

            for j in range(self.simul_z):
                fft(self._buf_plane, inplace=True, plan=self._plan)
                self._mult_complex(self._buf_plane, self._buf_H)
                fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

                if free_prop:
                    continue

                #FIXME here we make  a slight error for the first time point, as we
                #FIXME set dn0 first and the compute the new propagator
                # fractional plane position of the current sub-step
                z_pos = i+dn_ind_start+(j+1.)/self.simul_z
                if local_mean:
                    self._mult_dn_local(self._buf_plane, z_pos,
                                        self.intens_sum_g,
                                        self.intens_dn_sum_g,
                                        self.intens_g,
                                        self.intens_dn_g)
                else:
                    self._mult_dn(self._buf_plane, z_pos, dn0)

            # update the propagator for the next plane
            if self.dn is not None and not free_prop:
                if local_mean:
                    self._kernel_reduction(self.intens_g, self.intens_dn_g,
                                           outs=[self.intens_sum_g, self.intens_dn_sum_g])
                    self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g)
                elif global_mean:
                    plane_mean = self.dn_mean[i+dn_ind_start+dn_ind_offset]
                    # only refill the propagator when the mean actually changed
                    if plane_mean!=dn0:
                        dn0 = plane_mean
                        self._fill_propagator(self.n0+dn0)

            if keep_all_planes:
                store_plane(u, (i+1)*(Nx*Ny),
                            self._is_subsampled and self.simul_xy!=self.shape[:2])

        if keep_all_planes:
            return u.get()
        return self._buf_plane.get()
Ejemplo n.º 59
0
    def _propagate(self, u0=None, offset=0,
                   return_comp="field",
                   return_shape="full",
                   free_prop=False,
                   slow_mean=False,
                   **kwargs):
        """
        Propagates the field u0 from plane index offset to the last plane.

        If u0 is None, self.u0_plane() is used as initial field. The field is
        converted to complex64 and written to self._buf_plane before the loop.
        **kwargs is accepted but not used.

        Arguments:
            offset:
                index of the starting plane, has to satisfy 0 <= offset < Nz-1
            return_comp in ["field", "intens"]:
                selects the complex or the real result dtype
            return_shape in ["last", "full"]:
                "full" returns all Nz-offset planes, "last" only the final
                content of self._buf_plane
            free_prop = False | True:
                if True (or if self.dn is None) the refractive index
                multiplication is skipped
            slow_mean = False | True:
                if True, the mean refractive index is recomputed on the host
                for every plane; not implemented for return_shape "full"

        Raises ValueError for an unknown return_comp or return_shape and
        NotImplementedError for slow_mean together with return_shape "full".
        """

        free_prop = free_prop or (self.dn is None)

        if return_comp=="intens":
            res_type = Bpm3d._real_type
        elif return_comp=="field":
            res_type = Bpm3d._complex_type
        else:
            raise ValueError(return_comp)

        if return_shape not in ["last", "full"]:
            raise ValueError()

        keep_all_planes = return_shape=="full"

        if u0 is None:
            u0 = self.u0_plane()

        u0 = u0.astype(np.complex64, copy=False)

        Nx, Ny, Nz = self.shape

        assert offset>=0 and offset<(Nz-1)

        def store_plane(dest, flat_offset, via_image):
            # copies the current plane into dest at the given flat offset,
            # optionally going through the intermediate image self._img_xy
            if via_image:
                self._img_xy.copy_buffer(self._buf_plane)
                self._copy_down_img(self._img_xy, dest, flat_offset)
            else:
                self._copy_down_buf(self._buf_plane, dest, flat_offset)

        if keep_all_planes:
            u = OCLArray.empty((Nz-offset, Ny, Nx), dtype=res_type)

        self._buf_plane.write_array(u0)

        # the first plane is the initial field itself
        if keep_all_planes:
            store_plane(u, 0, self._is_subsampled)

        dn0 = 0

        # NOTE(review): the propagator is only (re)filled inside the loop, so
        # self._buf_H is presumably initialised elsewhere - confirm
        for i in range(Nz-1-offset):
            if self.dn is not None and not free_prop:
                if not slow_mean:
                    # precomputed plane means: refill only when the mean changed
                    if self.dn_mean[i+offset]!=dn0:
                        dn0 = self.dn_mean[i+offset]
                        self._fill_propagator(self.n0+dn0)
                elif keep_all_planes:
                    raise NotImplementedError()
                else:
                    # fetch the current plane and weight it with |dn| on the host
                    tmp = OCLArray.empty((1, Ny, Nx), dtype=res_type)
                    store_plane(tmp, 0, self._is_subsampled)

                    weights = np.abs(self.dn[i])
                    dn0 = np.sum(weights*tmp.get())/np.sum(weights+1.e-10)

                    self._fill_propagator(self.n0+dn0)

            for j in range(self.simul_z):
                fft(self._buf_plane, inplace=True, plan=self._plan)
                self._mult_complex(self._buf_plane, self._buf_H)
                fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

                if free_prop:
                    continue
                # fractional plane position of the current sub-step
                self._mult_dn(self._buf_plane, (i+offset+(j+1.)/self.simul_z), dn0)

            if keep_all_planes:
                store_plane(u, (i+1)*(Nx*Ny),
                            self._is_subsampled and self.simul_xy!=self.shape[:2])

        if keep_all_planes:
            return u.get()
        return self._buf_plane.get()