def _deconv_rl_gpu_conv(data_g, h_g, Niter=10):
    """Richardson-Lucy deconvolution on GPU buffers, using convolve.

    Parameters
    ----------
    data_g : OCLArray
        Measured data; it is also copied into the initial estimate.
    h_g : OCLArray
        Convolution kernel (psf).
    Niter : int
        Number of update iterations.

    Returns
    -------
    OCLArray
        The float32 estimate after ``Niter`` iterations.
    """
    # set up some gpu buffers: the estimate and two scratch arrays
    u_g = OCLArray.empty(data_g.shape, np.float32)
    u_g.copy_buffer(data_g)
    tmp_g = OCLArray.empty(data_g.shape, np.float32)
    tmp2_g = OCLArray.empty(data_g.shape, np.float32)

    # The correction step needs the kernel mirrored along *every* axis;
    # mirroring only the first two leaves 3d kernels partly unflipped and
    # fails outright for 1d kernels.
    # TODO: flip on the device instead of taking a host round trip
    h = h_g.get()
    hflip_g = OCLArray.from_array(h[(slice(None, None, -1),) * h.ndim].copy())

    for _ in range(Niter):
        convolve(u_g, h_g, res_g=tmp_g)
        # presumably leaves data / blurred-estimate in tmp_g -- confirm
        # against _divide_inplace
        _divide_inplace(data_g, tmp_g)
        convolve(tmp_g, hflip_g, res_g=tmp2_g)
        _multiply_inplace(u_g, tmp2_g)

    return u_g
def _deconv_rl_gpu_conv(data_g, h_g, Niter = 10):
    """
    Richardson-Lucy deconvolution of data_g with kernel h_g, using convolve.

    data_g seeds the estimate, which is refined Niter times and returned
    as a new float32 OCLArray.
    """
    shape = data_g.shape

    # estimate plus two scratch buffers
    u_g = OCLArray.empty(shape, np.float32)
    u_g.copy_buffer(data_g)
    ratio_g = OCLArray.empty(shape, np.float32)
    correction_g = OCLArray.empty(shape, np.float32)

    # mirror the kernel along all of its axes (the former [::-1, ::-1]
    # only handled the first two, i.e. was incomplete for 3d kernels)
    # TODO: avoid the host round trip
    h_host = h_g.get()
    mirror = tuple(slice(None, None, -1) for _ in range(h_host.ndim))
    hflip_g = OCLArray.from_array(h_host[mirror].copy())

    for _ in range(Niter):
        convolve(u_g, h_g, res_g = ratio_g)
        # presumably ratio_g <- data_g / ratio_g; confirm in _divide_inplace
        _divide_inplace(data_g, ratio_g)
        convolve(ratio_g, hflip_g, res_g = correction_g)
        _multiply_inplace(u_g, correction_g)

    return u_g
def _setup_gpu(self):
    """Allocate the GPU resources used by this instance.

    Stores the device queue and context, builds the program from
    ``kernels/bpm_3d_kernels.cl``, creates the FFT plan and the plane-sized
    buffers, binds the kernels to ``self._kernel_*`` attributes and finally
    calls ``self._fill_propagator(self.n0)``.
    """
    device = get_device()
    self._queue = device.queue
    self._ctx = device.context
    prog = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # the buffers/ images, all laid out as (Ny, Nx)
    Nx, Ny = self.simul_xy
    Nx0, Ny0 = self.shape[:2]  # not used below; unpacking kept as in the original
    plane = (Ny, Nx)

    self._plan = fft_plan(plane, **self.fftplan_kwargs)
    self._buf_plane = OCLArray.empty(plane, np.complex64)
    self._buf_H = OCLArray.empty(plane, np.complex64)
    self._img_xy = OCLImage.empty(plane, dtype=np.float32, num_channels=2)

    # buffers for the weighted dn average
    real_t = Bpm3d._real_type
    slab = (1,) + plane
    self.intens_g = OCLArray.empty(slab, dtype=real_t)
    self.intens_dn_g = OCLArray.empty(slab, dtype=real_t)
    self.intens_sum_g = OCLArray.zeros((), dtype=real_t)
    self.intens_dn_sum_g = OCLArray.zeros((), dtype=real_t)

    # the two propagator kernels need explicit scalar argument dtypes
    five_floats = (None,) + (np.float32,) * 5

    propagator = prog.compute_propagator
    propagator.set_scalar_arg_dtypes(five_floats)
    self._kernel_compute_propagator = propagator

    propagator_buf = prog.compute_propagator_buf
    propagator_buf.set_scalar_arg_dtypes(five_floats + (None,) * 2)
    self._kernel_compute_propagator_buf = propagator_buf

    # remaining kernels: (attribute on self, kernel name in the program)
    kernel_table = (
        ("_kernel_mult_complex", "mult"),
        ("_kernel_im_to_buf_field", "img_to_buf_field"),
        ("_kernel_im_to_buf_intensity", "img_to_buf_intensity"),
        ("_kernel_im_to_im_intensity", "img_to_img_intensity"),
        ("_kernel_buf_to_buf_field", "buf_to_buf_field"),
        ("_kernel_buf_to_buf_intensity", "buf_to_buf_intensity"),
        ("_kernel_mult_dn_img_float", "mult_dn_image"),
        ("_kernel_mult_dn_buf_float", "mult_dn"),
        ("_kernel_mult_dn_img_complex", "mult_dn_image_complex"),
        ("_kernel_mult_dn_buf_complex", "mult_dn_complex"),
        ("_kernel_mult_dn_img_float_local", "mult_dn_image_local"),
        ("_kernel_mult_dn_buf_float_local", "mult_dn_local"),
        ("_kernel_mult_dn_img_complex_local", "mult_dn_image_complex_local"),
        ("_kernel_mult_dn_buf_complex_local", "mult_dn_complex_local"),
    )
    for attr_name, kernel_name in kernel_table:
        setattr(self, attr_name, getattr(prog, kernel_name))

    # sums two float buffers in a single reduction pass
    self._kernel_reduction = OCLMultiReductionKernel(
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_exprs=["a[i]", "b[i]"],
        arguments="__global float *a, __global float *b")

    self._fill_propagator(self.n0)
def focus_field_lattice(shape, units, lam=.5, NA1=.4, NA2=.5, sigma=.1,
                        Npoly=6, n0=1., n_integration_steps=100):
    """Compute the x component of a lattice focus field on a regular grid.

    Runs the ``debye_wolf_lattice`` kernel from ``kernels/psf_lattice.cl``
    and prints the elapsed time.

    Parameters
    ----------
    shape : tuple
        Grid size as (Nx, Ny, Nz).
    units : tuple
        Voxel size as (dx, dy, dz).
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA1, NA2 : float
        Numerical apertures, converted to the angles ``arcsin(NA / n0)``.
    sigma : float
        Passed to the kernel unchanged.
    Npoly : int
        Argument of ``poly_points``; its output, scaled by the mean of NA1
        and NA2, provides the kx/ky points handed to the kernel.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    ndarray
        ``ex`` as fetched from the (Nz, Ny, Nx) complex64 buffer. The
        intensity and the ey/ez components are computed on the device but
        not returned.
    """
    kxs, kys = .5 * (NA1 + NA2) * poly_points(Npoly)

    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=["-I", absPath("kernels"),
                                  "-D", "INT_STEPS=%s" % n_integration_steps])

    kxs = np.array(kxs)
    kys = np.array(kys)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha1 = np.arcsin(NA1 / n0)
    alpha2 = np.arcsin(NA2 / n0)

    u_g = OCLArray.empty((Nz, Ny, Nx), np.float32)
    ex_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    t = time.time()

    # the grid is centred on the origin along every axis
    p.run_kernel("debye_wolf_lattice", (Nx, Ny, Nz), None,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(1.), np.float32(0.),
                 np.float32(-dx * (Nx - 1) / 2.), np.float32(dx * (Nx - 1) / 2.),
                 np.float32(-dy * (Ny - 1) / 2.), np.float32(dy * (Ny - 1) / 2.),
                 np.float32(-dz * (Nz - 1) / 2.), np.float32(dz * (Nz - 1) / 2.),
                 np.float32(1. * lam / n0),
                 np.float32(alpha1), np.float32(alpha2),
                 kxs_g.data, kys_g.data, np.int32(len(kxs)),
                 np.float32(sigma))

    ex = ex_g.get()

    # single-argument form: same output on Python 2 and Python 3
    print("time in secs: %s" % (time.time() - t))
    return ex
def _integral3_buf(x_g, res_g = None, tmp_g = None):
    """Run a scan pass along each axis of a 3d GPU buffer.

    Uses the ``scan3d`` / ``add_sums3d`` kernels of
    ``kernels/integral_image.cl``; presumably the result is the 3d integral
    image (cumulative sum along all axes) of ``x_g`` -- confirm against the
    kernel source.

    Parameters
    ----------
    x_g : OCLArray
        3d input buffer; its dtype has to be a key of ``_output_type_dict``.
        It is cast to the mapped output dtype if that differs.
    res_g : OCLArray, optional
        Output buffer of the output dtype; allocated if None.
    tmp_g : OCLArray, optional
        Scratch buffer of the output dtype; allocated if None.

    Returns
    -------
    OCLArray
        ``res_g``

    Raises
    ------
    ValueError
        If the dtype of ``x_g`` is not supported.
    """
    if not x_g.dtype.type in _output_type_dict:
        raise ValueError("dtype %s currently not supported! (%s)" % (x_g.dtype.type, str(_output_type_dict.keys())))

    dtype_out = _output_type_dict[x_g.dtype.type]
    # NOTE(review): cl_dtype_in is not used below
    cl_dtype_in = cl_buffer_datatype_dict[x_g.dtype.type]
    cl_dtype_out = cl_buffer_datatype_dict[dtype_out]

    dtype_itemsize = np.dtype(dtype_out).itemsize

    max_local_size = get_device().get_info("MAX_WORK_GROUP_SIZE")

    # the kernels are compiled for the output element type
    prog = OCLProgram(abspath("kernels/integral_image.cl"),
                      build_options=["-D", "DTYPE=%s" % cl_dtype_out])

    if x_g.dtype.type != dtype_out:
        x_g = x_g.astype(dtype_out)

    if tmp_g is None:
        tmp_g = OCLArray.empty(x_g.shape, dtype_out)
    if res_g is None:
        res_g = OCLArray.empty(x_g.shape, dtype_out)

    assert_bufs_type(dtype_out, tmp_g, res_g)

    nz, ny, nx = x_g.shape

    def _scan_single(src, dst, ns, strides):
        # Scan along the first entry of ns; strides are the element strides
        # of the three logical axes in the flat buffer.
        nx, ny, nz = ns
        stride_x, stride_y, stride_z = strides

        # each work group of `loc` items handles a block of 2*loc elements
        loc = min(next_power_of_2(nx // 2), max_local_size // 2)
        nx_block = 2 * loc
        # NOTE(review): nx / nx_block relies on true division (Python 3);
        # with Python 2 integer division the ceil would have no effect
        nx_pad = math.ceil(nx / nx_block) * nx_block

        nblocks = math.ceil(nx_pad // 2 / loc)
        # one entry per block and (y, z) line, filled by scan3d
        sum_blocks = OCLArray.empty((nz, ny, nblocks), dst.dtype)
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)

        for b in range(nblocks):
            offset = b * loc
            prog.run_kernel("scan3d", (loc, ny, nz), (loc, 1, 1), src.data, dst.data, sum_blocks.data, shared, np.int32(nx_block), np.int32(stride_x), np.int32(stride_y), np.int32(stride_z), np.int32(offset), np.int32(b), np.int32(nblocks), np.int32(ny), np.int32(nx))

        if nblocks > 1:
            # scan the per-block entries themselves (recursively, in place),
            # then let add_sums3d apply them to dst
            _scan_single(sum_blocks, sum_blocks, (nblocks, ny, nz), (1, nblocks, nblocks * ny))
            prog.run_kernel("add_sums3d", (nx_pad, ny, nz), (nx_block, 1, 1), sum_blocks.data, dst.data, np.int32(stride_x), np.int32(stride_y), np.int32(stride_z), np.int32(nblocks), np.int32(ny), np.int32(nx))

    # one pass per axis, ping-ponging between res_g and tmp_g:
    # stride 1 (x), stride nx (y), stride nx*ny (z)
    # (assumes a C-contiguous (nz, ny, nx) layout -- TODO confirm)
    _scan_single(x_g, res_g, (nx, ny, nz), (1, nx, nx * ny))
    _scan_single(res_g, tmp_g, (ny, nx, nz), (nx, 1, nx * ny))
    _scan_single(tmp_g, res_g, (nz, nx, ny), (ny * nx, 1, nx))
    return res_g
def _setup_gpu(self):
    """Create queue/context handles, buffers, the FFT plan and all kernels.

    Everything is stored on ``self``; the method ends by calling
    ``self._fill_propagator(self.n0)``.
    """
    dev = get_device()
    self._queue, self._ctx = dev.queue, dev.context

    prog = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    def bind(suffix, kernel_name):
        # fetch a kernel from the program and store it as self._kernel_<suffix>
        kernel = getattr(prog, kernel_name)
        setattr(self, "_kernel_" + suffix, kernel)
        return kernel

    # the buffers/ images
    Nx, Ny = self.simul_xy
    Nx0, Ny0 = self.shape[:2]  # not used below; unpacking kept as in the original
    shape_2d = (Ny, Nx)
    shape_3d = (1, Ny, Nx)

    self._plan = fft_plan(shape_2d, **self.fftplan_kwargs)
    self._buf_plane = OCLArray.empty(shape_2d, np.complex64)
    self._buf_H = OCLArray.empty(shape_2d, np.complex64)
    self._img_xy = OCLImage.empty(shape_2d, dtype=np.float32, num_channels=2)

    # buffer for the weighted dn average
    self.intens_g = OCLArray.empty(shape_3d, dtype=Bpm3d._real_type)
    self.intens_dn_g = OCLArray.empty(shape_3d, dtype=Bpm3d._real_type)
    self.intens_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)
    self.intens_dn_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)

    # the kernels; the propagator ones take five float32 scalars
    float_args = (np.float32,) * 5
    bind("compute_propagator", "compute_propagator").set_scalar_arg_dtypes(
        (None,) + float_args)
    bind("compute_propagator_buf", "compute_propagator_buf").set_scalar_arg_dtypes(
        (None,) + float_args + (None,) * 2)

    bind("mult_complex", "mult")

    # image/buffer conversions
    bind("im_to_buf_field", "img_to_buf_field")
    bind("im_to_buf_intensity", "img_to_buf_intensity")
    bind("im_to_im_intensity", "img_to_img_intensity")
    bind("buf_to_buf_field", "buf_to_buf_field")
    bind("buf_to_buf_intensity", "buf_to_buf_intensity")

    # dn multiplication: image/buffer x float/complex
    bind("mult_dn_img_float", "mult_dn_image")
    bind("mult_dn_buf_float", "mult_dn")
    bind("mult_dn_img_complex", "mult_dn_image_complex")
    bind("mult_dn_buf_complex", "mult_dn_complex")

    # ... and their *_local counterparts
    bind("mult_dn_img_float_local", "mult_dn_image_local")
    bind("mult_dn_buf_float_local", "mult_dn_local")
    bind("mult_dn_img_complex_local", "mult_dn_image_complex_local")
    bind("mult_dn_buf_complex_local", "mult_dn_complex_local")

    reduction_options = dict(
        neutral="0",
        reduce_expr="a+b",
        map_exprs=["a[i]", "b[i]"],
        arguments="__global float *a, __global float *b")
    self._kernel_reduction = OCLMultiReductionKernel(np.float32, **reduction_options)

    self._fill_propagator(self.n0)
def focus_field_cylindrical(shape, units, lam=.5, NA=.3, n0=1.,
                            n_integration_steps=100):
    """computes focus field of cylindrical lens with given NA

    see:
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review
    Appl. Opt. 52, 538-545 (2013)

    The field is evaluated on a (Nz, Ny) grid and then repeated Nx times
    along a new last axis. The elapsed time is printed.

    Parameters
    ----------
    shape : tuple
        Grid size as (Nx, Ny, Nz).
    units : tuple
        Voxel size as (dx, dy, dz); dx is not used.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float
        Numerical aperture, converted to the angle ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    u, ex, ey, ez : ndarray
        u being the intensity; each array has shape (Nz, Ny, Nx).
    """
    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"),
                   build_options=str("-I %s -D INT_STEPS=%s" % (absPath("."), n_integration_steps)))

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha = np.arcsin(NA / n0)

    u_g = OCLArray.empty((Nz, Ny), np.float32)
    ex_g = OCLArray.empty((Nz, Ny), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny), np.complex64)

    t = time.time()

    # the grid is centred on the origin in y and z
    p.run_kernel("psf_cylindrical", u_g.shape[::-1], None,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(-dy * (Ny - 1) / 2.), np.float32(dy * (Ny - 1) / 2.),
                 np.float32(-dz * (Nz - 1) / 2.), np.float32(dz * (Nz - 1) / 2.),
                 np.float32(lam / n0),
                 np.float32(alpha))

    # np.repeat already returns a fresh array, so no extra copy is needed
    u = np.repeat(u_g.get()[..., np.newaxis], Nx, axis=-1)
    ex = np.repeat(ex_g.get()[..., np.newaxis], Nx, axis=-1)
    ey = np.repeat(ey_g.get()[..., np.newaxis], Nx, axis=-1)
    ez = np.repeat(ez_g.get()[..., np.newaxis], Nx, axis=-1)

    # single-argument form: same output on Python 2 and Python 3
    print("time in secs: %s" % (time.time() - t))

    return u, ex, ey, ez
def focus_field_cylindrical(shape,units,lam = .5,NA = .3, n0=1.,
                            n_integration_steps = 100):
    """computes focus field of cylindrical lens with given NA

    see:
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review
    Appl. Opt. 52, 538-545 (2013)

    shape is (Nx, Ny, Nz) and units is (dx, dy, dz). The kernel works on a
    (Nz, Ny) grid; the result is repeated Nx times along a new last axis.
    Prints the elapsed time.

    return u,ex,ey,ez   with u being the intensity, each of shape (Nz, Ny, Nx)
    """
    build_options = str("-I %s -D INT_STEPS=%s"%(absPath("."),n_integration_steps))
    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"), build_options = build_options)

    Nx, Ny, Nz = shape
    dx, dy, dz = units  # dx is not needed

    alpha = np.arcsin(NA/n0)

    def _to_volume(buf_g):
        # fetch a (Nz, Ny) buffer and repeat it Nx times along a new last
        # axis; np.repeat returns a new array, no further copy required
        return np.repeat(buf_g.get()[...,np.newaxis], Nx, axis=-1)

    u_g = OCLArray.empty((Nz,Ny),np.float32)
    ex_g = OCLArray.empty((Nz,Ny),np.complex64)
    ey_g = OCLArray.empty((Nz,Ny),np.complex64)
    ez_g = OCLArray.empty((Nz,Ny),np.complex64)

    t = time.time()

    # extents of the origin-centred grid in y and z
    y_half = dy*(Ny-1)/2.
    z_half = dz*(Nz-1)/2.

    p.run_kernel("psf_cylindrical",u_g.shape[::-1],None,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(-y_half),np.float32(y_half),
                 np.float32(-z_half),np.float32(z_half),
                 np.float32(lam/n0),
                 np.float32(alpha))

    u = _to_volume(u_g)
    ex = _to_volume(ex_g)
    ey = _to_volume(ey_g)
    ez = _to_volume(ez_g)

    # written so that it prints the same text on Python 2 and Python 3
    print("time in secs: %s" % (time.time()-t))

    return u, ex, ey, ez
def create_dn_buffer(size, units, points, dn_inner=.0, rad_inner=0,
                     dn_outer=.1, rad_outer=.4):
    """Fill a (Nz, Ny, Nx) float32 GPU buffer via the ``fill_dn`` kernel.

    Parameters
    ----------
    size : tuple
        Grid size as (Nx, Ny, Nz).
    units : tuple
        Voxel size as (dx, dy, dz).
    points : array-like
        2d array-like of points; the rows are sorted by their third column
        before being uploaded.
    dn_inner, rad_inner, dn_outer, rad_outer : float
        Passed to the kernel unchanged.

    Returns
    -------
    OCLArray
        The filled buffer (left on the device).
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))
    dn_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)

    # sort by z
    pts = np.array(points)
    z_order = np.argsort(pts[:, 2])
    pts = pts[z_order, :]
    n_points = pts.shape[0]

    points_g = OCLArray.from_array(pts.flatten().astype(np.float32))

    scalar_args = [np.float32(v) for v in (dx, dy, dz,
                                           dn_inner, rad_inner,
                                           dn_outer, rad_outer)]
    program.run_kernel("fill_dn", (Nx, Ny, Nz), None,
                       dn_g.data, points_g.data, np.int32(n_points),
                       *scalar_args)

    return dn_g
def scale(data, scale=(1., 1., 1.), interp="linear"):
    """returns a interpolated, scaled version of data

    Parameters
    ----------
    data : ndarray
        Input array; uploaded as an OCLImage.
    scale : float or sequence of 3 floats
        scale = (scale_z,scale_y,scale_x)
        or
        scale = scale_all
    interp : str
        "linear" | "nearest"

    Returns
    -------
    ndarray
        float32 array whose shape is ``data.shape * scale`` truncated to
        integers.

    Raises
    ------
    KeyError
        If ``interp`` is not a known interpolation mode.
    ValueError
        If ``scale`` does not have exactly three entries.
    """
    bop = {"linear": "", "nearest": "-D USENEAREST"}

    if interp not in bop:
        raise KeyError("interp = '%s' not defined ,valid: %s" % (interp, bop.keys()))

    if not isinstance(scale, (tuple, list, np.ndarray)):
        scale = (scale, ) * 3

    if len(scale) != 3:
        raise ValueError("scale = %s misformed" % scale)

    d_im = OCLImage.from_array(data)

    nshape = np.array(data.shape) * np.array(scale)
    # np.int was only an alias of the builtin int and has been removed from
    # NumPy; the builtin gives the same result on every version
    nshape = tuple(nshape.astype(int))

    res_g = OCLArray.empty(nshape, np.float32)

    prog = OCLProgram(abspath("kernels/scale.cl"), build_options=[bop[interp]])

    prog.run_kernel("scale", res_g.shape[::-1], None, d_im, res_g.data)

    return res_g.get()
def time_simple(N, nargs, niter=100):
    """Time ``nargs`` independent single-output reduction kernels.

    Kernel ``i`` reduces ``i*x[i]`` with ``a+b`` over an all-ones float32
    array of length ``N``. Every kernel is run ``niter`` times and the queue
    is finished before the clock is stopped.

    Prints the reduction results and the mean time per iteration in ms.

    Returns
    -------
    float
        Mean time per iteration in seconds.
    """
    from gputools import OCLReductionKernel
    from time import time

    map_exprs = ["%s*x[i]" % i for i in range(nargs)]

    ks = [OCLReductionKernel(np.float32,
                             neutral="0", reduce_expr="a+b",
                             map_expr=map_expr,
                             arguments="__global float *x")
          for map_expr in map_exprs]

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in map_exprs]
    outs = [OCLArray.empty(1, np.float32) for _ in map_exprs]

    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    get_device().queue.finish()
    t = (time() - t) / niter

    # single-argument print calls give the same text on Python 2 and 3
    print("simple reduction: result = %s" % ([float(out.get()) for out in outs],))
    print("simple reduction:\t\t%.2f ms" % (1000 * t))
    return t
def gpu_kuwahara(data, N=5):
    """Function to convolve an image with the Kuwahara filter on GPU.

    Parameters
    ----------
    data : ndarray
        Input image; it is converted to float32 before upload.
    N : int
        Odd integer passed to the kernel (presumably the side length of
        the filter window -- confirm against the kernel source).

    Returns
    -------
    ndarray
        float32 array of shape ``data.shape[:2]`` holding the result.

    Raises
    ------
    ValueError
        If ``N`` is even.
    """
    if N % 2 == 0:
        raise ValueError("Data has to be a (2n+1)x(2n+1) array.")

    # upload the input and allocate the output
    data_g = OCLArray.from_array(data.astype(float32))
    res_g = OCLArray.empty((data.shape[0], data.shape[1]), float32)

    # NOTE(review): the kernel path is relative to the working directory
    prog = OCLProgram("./OpenCL/gpu_kernels/gpu_kuwahara.cl")

    # start kernel on gpu
    prog.run_kernel("kuwahara",          # the name of the kernel in the cl file
                    data_g.shape[::-1],  # global size, the number of threads e.g. (128,128,)
                    None,                # local size, just leave it to None
                    data_g.data, res_g.data,
                    int32(N))

    # hand the result back, like gpu_structure and gpu_mean do
    return res_g.get()
def create_dn_buffer(size, units,points,
                     dn_inner = .0, rad_inner = 0,
                     dn_outer = .1, rad_outer = .4):
    """
    Runs the fill_dn kernel on a new (Nz,Ny,Nx) float32 buffer.

    size is (Nx,Ny,Nz), units is (dx,dy,dz). points is a 2d array-like
    whose rows get sorted by their third column before the upload. The
    dn_*/rad_* values are handed to the kernel as float32.

    returns the buffer as OCLArray
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))

    dn_g = OCLArray.empty((Nz,Ny,Nx),dtype=np.float32)

    # sort by z
    sorted_points = np.array(points)
    sorted_points = sorted_points[np.argsort(sorted_points[:,2]),:]

    flat_points = sorted_points.flatten().astype(np.float32)
    pointsBuf = OCLArray.from_array(flat_points)

    kernel_args = (dn_g.data,
                   pointsBuf.data,
                   np.int32(sorted_points.shape[0]),
                   np.float32(dx),np.float32(dy),np.float32(dz),
                   np.float32(dn_inner),np.float32(rad_inner),
                   np.float32(dn_outer),np.float32(rad_outer))

    program.run_kernel("fill_dn",(Nx,Ny,Nz),None, *kernel_args)

    return dn_g
def _convolve_buf(data_g, h_g, res_g=None):
    """
    buffer variant

    Convolves the float32 buffer ``data_g`` with the float32 kernel ``h_g``
    using the ``convolve<ndim>d_buf`` kernel, falling back to the
    ``..._global`` variant if the first launch is rejected with error
    code -52.

    Parameters
    ----------
    data_g, h_g : OCLArray
        float32 input and kernel buffers.
    res_g : OCLArray, optional
        Output buffer; a float32 buffer of the input shape is allocated if
        None.

    Returns
    -------
    OCLArray
        ``res_g``
    """
    assert_bufs_type(np.float32, data_g, h_g)

    prog = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    Nhs = [np.int32(n) for n in h_g.shape]

    kernel_name = "convolve%sd_buf" % (len(data_g.shape))

    try:
        prog.run_kernel(kernel_name, data_g.shape[::-1], None,
                        data_g.data, h_g.data, res_g.data,
                        *Nhs)
    except cl.LogicError as e:
        # cl.LogicError is the public name of the exception; the private
        # cl.cffi_cl module does not exist in current PyOpenCL releases.
        # Code -52 (CL_INVALID_KERNEL_ARGS) is what we get if the kernel is
        # too big for constant memory; anything else is a real error.
        if e.code != -52:
            raise

        kernel_name = "convolve%sd_buf_global" % (len(data_g.shape))
        prog.run_kernel(kernel_name, data_g.shape[::-1], None,
                        data_g.data, h_g.data, res_g.data,
                        *Nhs)

    return res_g
def time_multi(N, nargs, niter=100):
    """Time one multi-output reduction kernel with ``nargs`` inputs.

    Map expression ``i`` is ``i*x<i>[i]``; all are reduced with ``a+b`` over
    all-ones float32 arrays of length ``N``. The kernel is run ``niter``
    times and the queue is finished before the clock is stopped.

    Prints the reduction results and the mean time per iteration in ms.

    Returns
    -------
    float
        Mean time per iteration in seconds.
    """
    from time import time

    map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
    arguments = ",".join("__global float *x%s" % i for i in range(nargs))

    k = OCLReductionKernel2(np.float32,
                            neutral="0", reduce_expr="a+b",
                            map_exprs=map_exprs,
                            arguments=arguments)

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in map_exprs]
    outs = [OCLArray.empty(1, np.float32) for _ in map_exprs]

    t = time()
    for _ in range(niter):
        k(*ins, outs=outs)
    get_device().queue.finish()
    t = (time() - t) / niter

    # single-argument print calls give the same text on Python 2 and 3
    print("multi reduction: result = %s" % ([float(out.get()) for out in outs],))
    print("multi reduction:\t\t%.2f ms" % (1000 * t))
    return t
def _fft_convolve_gpu(data_g, h_g, res_g = None,
                      plan = None, inplace = False,
                      kernel_is_fft = False):
    """
    FFT based convolution of two complex64 gpu buffers of equal shape.

    data_g is transformed, multiplied elementwise with the transformed
    kernel and transformed back.

    res_g:          output buffer (ignored if inplace is set); a new one
                    is created if it is None
    plan:           fft plan for data_g.shape, created if None
    inplace:        if set, data_g is overwritten and returned
    kernel_is_fft:  if set, h_g is taken to be transformed already and is
                    used as is; otherwise a transformed copy is made

    returns the buffer holding the result
    """
    multiply = OCLElementwiseKernel(
        "cfloat_t *a, cfloat_t * b",
        "a[i] = cfloat_mul(b[i],a[i])","mult")

    dev = get_device()  # not used below; call kept as in the original

    assert_bufs_type(np.complex64,data_g,h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    # pick the buffer that receives the result
    if inplace:
        out_g = data_g
    else:
        out_g = OCLArray.empty(data_g.shape,data_g.dtype) if res_g is None else res_g
        out_g.copy_buffer(data_g)

    # kernel in fourier space
    if kernel_is_fft:
        kern_g = h_g
    else:
        kern_g = OCLArray.empty(h_g.shape,h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g,inplace=True, plan = plan)

    # forward transform, multiply in fourier domain, transform back
    fft(out_g,inplace=True, plan = plan)
    multiply(out_g,kern_g)
    fft(out_g,inplace = True, inverse = True, plan = plan)

    return out_g
def _ocl_fft_gpu(plan, ocl_arr,res_arr = None, inverse = False, batch = 1):
    """
    Executes plan on the complex64 buffer ocl_arr.

    The result is written to res_arr, or to a newly created complex64
    buffer of the same shape if res_arr is None. inverse and batch are
    forwarded to plan.execute.

    returns the output buffer
    """
    assert_bufs_type(np.complex64,ocl_arr)

    out_arr = res_arr
    if out_arr is None:
        out_arr = OCLArray.empty(ocl_arr.shape,np.complex64)

    plan.execute(ocl_arr.data, out_arr.data,
                 inverse = inverse, batch = batch)

    return out_arr
def focus_field_debye_at(x,y,z,lam, NA, n0 = 1., n_integration_steps = 200):
    """ the same as focus_field_debye but for the coordinates given in x, y, z (arrays of same shape)

    slower than focus_field_debye as it doesnt assume the coordinates to be on a grid

    x, y, z:  coordinate arrays of identical shape
    lam:      wavelength; the kernel receives lam/n0
    NA:       a scalar (used as [0.,NA]) or a sequence with an even
              number of entries; converted to angles arcsin(NA/n0)
    n0:       refractive index
    n_integration_steps: compiled into the kernel as INT_STEPS

    returns u, ex, ey, ez, each reshaped to the shape of x

    Prints the path of the kernel file on every call.
    """
    # single-argument print: same output on Python 2 and Python 3
    print(absPath("kernels/psf_debye.cl"))

    p = OCLProgram(absPath("kernels/psf_debye.cl"),
                   build_options = str("-I %s -D INT_STEPS=%s"%(absPath("."),n_integration_steps)))

    if np.isscalar(NA):
        NA = [0.,NA]

    alphas = np.arcsin(np.array(NA)/n0)
    assert len(alphas)%2 ==0

    assert x.shape == y.shape == z.shape
    dshape = x.shape
    N = np.prod(dshape)

    # the kernel works on flat float32 coordinate lists
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N,np.float32)
    ex_g = OCLArray.empty(N,np.complex64)
    ey_g = OCLArray.empty(N,np.complex64)
    ez_g = OCLArray.empty(N,np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at",(N,),None,
                 x_g.data,y_g.data,z_g.data,
                 ex_g.data,ey_g.data,ez_g.data, u_g.data,
                 np.float32(1.),np.float32(0.),
                 np.float32(lam/n0),
                 alpha_g.data, np.int32(len(alphas)))

    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
def _deconv_rl_gpu_fft(data_g, h_g, Niter = 10):
    """
    Richardson-Lucy deconvolution on gpu buffers, using fft_convolve.

    data_g:  data buffer; copied into the initial (complex64) estimate
    h_g:     psf buffer of the same shape.
             NOTE: it is transformed in place, i.e. it holds the fft of
             the psf when the function returns.
    Niter:   number of iterations; the iteration index is printed

    returns the estimate as complex64 OCLArray

    raises ValueError if data_g and h_g differ in shape
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    #set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape,np.complex64)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape,np.complex64)

    # the correction step needs the psf mirrored along every axis; the
    # former [::-1,::-1] only mirrored the first two
    # TODO: flip on the device instead of taking a host round trip
    h = h_g.get()
    hflip_g = OCLArray.from_array(h[(slice(None,None,-1),)*h.ndim].copy())

    plan = fft_plan(data_g.shape)

    #transform psf (reusing the plan instead of leaving it unused)
    fft(h_g,inplace = True, plan = plan)
    fft(hflip_g,inplace = True, plan = plan)

    for i in range(Niter):
        # same output on Python 2 and Python 3
        print(i)
        fft_convolve(u_g, h_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)

        # presumably tmp_g <- data_g / tmp_g; confirm in _complex_divide_inplace
        _complex_divide_inplace(data_g,tmp_g)

        fft_convolve(tmp_g,hflip_g,
                     inplace = True,
                     kernel_is_fft = True)

        _complex_multiply_inplace(u_g,tmp_g)

    return u_g
def _ocl_star_dist(a, n_rays=32):
    """Run the ``star_dist`` OpenCL kernel on the array ``a``.

    Parameters
    ----------
    a : ndarray
        Input array; uploaded as a uint16 image.
    n_rays : int
        Positive scalar, compiled into the kernel as ``N_RAYS``.

    Returns
    -------
    ndarray
        float32 array of shape ``a.shape + (n_rays,)``.
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    n_rays_is_valid = np.isscalar(n_rays) and 0 < int(n_rays)
    if not n_rays_is_valid:
        _raise(ValueError())
    n_rays = int(n_rays)

    out_shape = a.shape + (n_rays, )
    src = OCLImage.from_array(a.astype(np.uint16, copy=False))
    dst = OCLArray.empty(out_shape, dtype=np.float32)

    build_options = ['-D', 'N_RAYS=%d' % n_rays]
    program = OCLProgram(path_absolute("kernels/stardist2d.cl"),
                         build_options=build_options)
    program.run_kernel('star_dist', src.shape, None, dst.data, src)

    return dst.get()
def focus_field_debye_at(x, y, z, lam, NA, n0=1., n_integration_steps=200):
    """ the same as focus_field_debye but for the coordinates given in x, y, z (arrays of same shape)

    slower than focus_field_debye as it doesnt assume the coordinates to be on a grid

    Parameters
    ----------
    x, y, z : ndarray
        Coordinate arrays of identical shape.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float or sequence
        A scalar is used as ``[0., NA]``; a sequence needs an even number
        of entries. The values are converted to ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    u, ex, ey, ez : ndarray
        Each reshaped to the shape of ``x``.

    Notes
    -----
    The path of the kernel file is printed on every call.
    """
    kernel_file = absPath("kernels/psf_debye.cl")
    # single-argument print: same output on Python 2 and Python 3
    print(kernel_file)

    p = OCLProgram(absPath("kernels/psf_debye.cl"),
                   build_options=str("-I %s -D INT_STEPS=%s" % (absPath("."), n_integration_steps)))

    if np.isscalar(NA):
        NA = [0., NA]

    alphas = np.arcsin(np.array(NA) / n0)
    assert len(alphas) % 2 == 0

    assert x.shape == y.shape == z.shape
    dshape = x.shape
    N = np.prod(dshape)

    # flat float32 coordinate lists for the kernel
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N, np.float32)
    ex_g = OCLArray.empty(N, np.complex64)
    ey_g = OCLArray.empty(N, np.complex64)
    ez_g = OCLArray.empty(N, np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at", (N, ), None,
                 x_g.data, y_g.data, z_g.data,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(1.), np.float32(0.),
                 np.float32(lam / n0),
                 alpha_g.data, np.int32(len(alphas)))

    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
def time_gpu(dshape, niter=100, fast_math=False):
    """Time in-place GPU FFTs of a complex64 buffer of shape ``dshape``.

    The plan is created outside of the timed region; the queue is finished
    before the clock is started and before it is stopped. Prints the mean
    time per transform in ms.

    Returns
    -------
    float
        Mean time per transform in seconds.
    """
    d_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)

    t = time()
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    get_device().queue.finish()
    t = (time() - t) / niter

    # single-argument print: same output on Python 2 and Python 3
    print("GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t))
    return t
def _fft_convolve_gpu(data_g, h_g, res_g = None,
                      plan = None, inplace = False,
                      kernel_is_fft = False):
    """ fft convolve for gpu buffer

    data_g, h_g:    complex64 buffers of the same shape
    res_g:          output buffer (ignored if inplace is set); created if None
    plan:           fft plan for data_g.shape, created if None
    inplace:        if set, the result overwrites data_g
    kernel_is_fft:  if set, h_g is used as is; otherwise a transformed
                    copy of it is made

    returns the buffer holding the result

    raises ValueError if the shapes differ

    Prints dtype and byte size of the result buffer on every call.
    """
    assert_bufs_type(np.complex64,data_g,h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape,data_g.dtype)

        res_g.copy_buffer(data_g)

    if not kernel_is_fft:
        kern_g = OCLArray.empty(h_g.shape,h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g,inplace=True, plan = plan)
    else:
        kern_g = h_g

    fft(res_g,inplace=True, plan = plan)

    #multiply in fourier domain
    # debug output, written so that it is valid (and prints the same text)
    # on Python 2 and Python 3
    print("%s %s" % (res_g.dtype, res_g.nbytes))
    _complex_multiply_kernel(res_g,kern_g)

    fft(res_g,inplace = True, inverse = True, plan = plan)

    return res_g
def time_gpu(dshape, niter=100, fast_math=False):
    """Benchmark in-place FFTs of a complex64 GPU buffer.

    Parameters
    ----------
    dshape : tuple
        Shape of the buffer to transform.
    niter : int
        Number of transforms to run.
    fast_math : bool
        Forwarded to ``fft_plan``.

    Returns
    -------
    float
        Mean time per transform in seconds (also printed in ms).
    """
    buf_g = OCLArray.empty(dshape, np.complex64)
    # make sure nothing is pending before the plan is built
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)

    start = time()
    remaining = niter
    while remaining > 0:
        fft(buf_g, inplace=True, plan=plan)
        remaining -= 1
    # wait for the queued transforms before stopping the clock
    get_device().queue.finish()
    elapsed = time() - start

    t = elapsed / niter
    report = "GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t)
    print(report)
    return t
def perlin2(size, units, repeat=(10., ) * 2):
    """Run the ``perlin2d`` kernel and return its output.

    Parameters
    ----------
    size : tuple
        Two-element size; the output array has shape ``size[::-1]``.
    units : tuple
        (dx, dy), passed to the kernel as float32.
    repeat : tuple
        (wx, wy), passed to the kernel as float32.

    Returns
    -------
    ndarray
        float32 array of shape ``size[::-1]``.
    """
    wx, wy = repeat
    dx, dy = units

    prog = OCLProgram(abspath("perlin.cl"))
    out_g = OCLArray.empty(size[::-1], np.float32)

    scalars = [np.float32(value) for value in (dx, dy, wx, wy)]
    prog.run_kernel("perlin2d", out_g.shape[::-1], None, out_g.data, *scalars)

    return out_g.get()
def scale_bicubic(data, scale=(1., 1., 1.)):
    """
    returns a scaled version of data, computed by the scale_bicubic kernel

    the output shape is scaled too.

    Parameters
    ----------
    data: ndarray
        3d input array of dtype uint8, uint16 or float32
    scale: float, tuple
        scaling factor along each axis (a scalar is used for all three axes)
        NOTE(review): the axis order is whatever _scale_shape expects -- confirm

    Returns
    -------
        scaled output, same dtype as the input

    Raises
    ------
    ValueError
        if data is not a 3d ndarray, its dtype is not supported, or scale
        does not have three entries
    """
    is_3d_array = isinstance(data, np.ndarray) and data.ndim == 3
    if not is_3d_array:
        raise ValueError("input data has to be a 3d array!")

    # build options per supported element type
    options_types = {
        np.uint8: ["-D", "TYPENAME=uchar", "-D", "READ_IMAGE=read_imageui"],
        np.uint16: ["-D", "TYPENAME=short", "-D", "READ_IMAGE=read_imageui"],
        np.float32: ["-D", "TYPENAME=float", "-D", "READ_IMAGE=read_imagef"],
    }

    dtype = data.dtype.type
    if dtype not in options_types:
        supported = str(list(options_types.keys()))
        raise ValueError("type %s not supported! Available: %s" % (dtype, supported))

    if not isinstance(scale, (tuple, list, np.ndarray)):
        scale = (scale, ) * 3

    if len(scale) != 3:
        raise ValueError("scale = %s misformed" % scale)

    d_im = OCLImage.from_array(data)
    res_g = OCLArray.empty(_scale_shape(data.shape, scale), dtype)

    prog = OCLProgram(abspath("kernels/scale.cl"),
                      build_options=options_types[dtype])
    prog.run_kernel("scale_bicubic", res_g.shape[::-1], None,
                    d_im, res_g.data)

    return res_g.get()
def stardist_from_labels(a, n_rays=32):
    """ assumes a to be a label image with integer values that encode object ids. id 0 denotes background.

    The image is uploaded as uint16 and handed to the 'star_dist' kernel,
    which is built from the module level source string `kernel` with
    N_RAYS set to n_rays.

    returns a float32 array of shape a.shape + (n_rays,)
    """
    labels_im = OCLImage.from_array(a.astype(np.uint16, copy=False))
    dist_g = OCLArray.empty(a.shape + (n_rays, ), dtype=np.float32)

    build_options = ["-D", "N_RAYS=%d" % n_rays]
    program = OCLProgram(src_str=kernel, build_options=build_options)
    program.run_kernel('star_dist', labels_im.shape, None,
                       dist_g.data, labels_im)

    return dist_g.get()
def _deconv_rl_np_fft(data, h, Niter = 10, h_is_fftshifted = False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape

    via lucy richardson deconvolution

    data:             numpy array holding the measurement
    h:                psf, same shape as data
    Niter:            number of iterations; the iteration index is printed
    h_is_fftshifted:  if False, np.fft.fftshift is applied to h first

    returns the absolute value of the estimate as numpy array

    raises ValueError if data and h differ in shape
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # mirror the psf along every axis; the former [::-1,::-1] only
    # mirrored the first two and was thus incomplete for 3d input
    hflip = h[(slice(None,None,-1),)*h.ndim]

    #set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))

    tmp_g = OCLArray.empty(data.shape,np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    plan = fft_plan(data.shape)

    #transform psf (reusing the plan instead of leaving it unused)
    fft(hf_g,inplace = True, plan = plan)
    fft(hflip_f_g,inplace = True, plan = plan)

    for i in range(Niter):
        # same output on Python 2 and Python 3
        print(i)
        fft_convolve(u_g, hf_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)

        # presumably tmp_g <- y_g / tmp_g; confirm in _complex_divide_inplace
        _complex_divide_inplace(y_g,tmp_g)

        fft_convolve(tmp_g,hflip_f_g,
                     inplace = True,
                     kernel_is_fft = True)

        _complex_multiply_inplace(u_g,tmp_g)

    return np.abs(u_g.get())
def _ocl_fft_gpu(ocl_arr,res_arr = None,inverse = False, plan = None):
    """
    Transforms the complex64 buffer ocl_arr with plan.execute.

    plan:     created for ocl_arr.shape on the device queue if None
    res_arr:  output buffer; a complex64 buffer of the input shape is
              created if None
    inverse:  forwarded to plan.execute

    returns the output buffer
    """
    assert_bufs_type(np.complex64,ocl_arr)

    fft_plan_ = plan
    if fft_plan_ is None:
        fft_plan_ = Plan(ocl_arr.shape, queue = get_device().queue)

    out_arr = res_arr
    if out_arr is None:
        out_arr = OCLArray.empty(ocl_arr.shape,np.complex64)

    fft_plan_.execute(ocl_arr.data, out_arr.data, inverse = inverse)

    return out_arr
def perlin2(size, units, repeat = (10.,)*2):
    """
    Fills a float32 buffer of shape size[::-1] with the perlin2d kernel.

    units is (dx,dy) and repeat is (wx,wy); all four values are handed to
    the kernel as float32.

    returns the result as numpy array
    """
    (dx, dy), (wx, wy) = units, repeat

    prog = OCLProgram(abspath("perlin.cl"))

    result_g = OCLArray.empty(size[::-1],np.float32)
    global_size = result_g.shape[::-1]

    prog.run_kernel("perlin2d", global_size, None,
                    result_g.data,
                    np.float32(dx),np.float32(dy),
                    np.float32(wx),np.float32(wy))

    return result_g.get()
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Execute an FFT plan on a complex64 GPU buffer.

    Parameters
    ----------
    ocl_arr : OCLArray
        complex64 input buffer.
    res_arr : OCLArray, optional
        Output buffer; a complex64 buffer of the input shape is allocated
        if None.
    inverse : bool
        Forwarded to ``plan.execute``.
    plan : Plan, optional
        Plan to execute; built for ``ocl_arr.shape`` on the device queue if
        None.

    Returns
    -------
    OCLArray
        The output buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        queue = get_device().queue
        plan = Plan(ocl_arr.shape, queue=queue)

    target = OCLArray.empty(ocl_arr.shape, np.complex64) if res_arr is None else res_arr
    plan.execute(ocl_arr.data, target.data, inverse=inverse)

    return target
def _ocl_star_dist(lbl, n_rays=32, grid=(1, 1)):
    """Run the ``star_dist`` OpenCL kernel on a subsampled grid of ``lbl``.

    Parameters
    ----------
    lbl : ndarray
        Input array; uploaded as a uint16 image.
    n_rays : int
        Positive scalar, compiled into the kernel as ``N_RAYS``.
    grid : tuple
        Subsampling step per axis; both entries are passed to the kernel.

    Returns
    -------
    ndarray
        float32 array of shape ``res_shape + (n_rays,)``, where
        ``res_shape`` is the shape of ``lbl`` sliced with step ``grid``.
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    if not (np.isscalar(n_rays) and 0 < int(n_rays)):
        _raise(ValueError())
    n_rays = int(n_rays)

    # slicing with grid is done with tuple(slice(0, None, g) for g in grid),
    # which leaves (s - 1) // g + 1 entries along an axis of length s
    res_shape = tuple([(extent - 1) // step + 1
                       for extent, step in zip(lbl.shape, grid)])

    src = OCLImage.from_array(lbl.astype(np.uint16, copy=False))
    dst = OCLArray.empty(res_shape + (n_rays, ), dtype=np.float32)

    program = OCLProgram(path_absolute("kernels/stardist2d.cl"),
                         build_options=['-D', 'N_RAYS=%d' % n_rays])

    grid_args = (np.int32(grid[0]), np.int32(grid[1]))
    program.run_kernel('star_dist', res_shape[::-1], None,
                       dst.data, src, *grid_args)

    return dst.get()
def focus_field_cylindrical_plane(shape=(128, 128), units=(.1, .1), z=0.,
                                  lam=.5, NA=.6, n0=1., ex_g=None,
                                  n_integration_steps=200):
    """Compute the x component of the electric field in the plane at z.

    The field is that of a perfect, aberration free optical system with a
    cylindrical lens, evaluated via the vectorial debye diffraction
    integral, see

    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review
    Appl. Opt. 52, 538-545 (2013)

    Parameters
    ----------
    shape : tuple
        Plane size as (Nx, Ny).
    units : tuple
        Pixel size as (dx, dy); only dy is handed to the kernel.
    z : float
        Position of the plane.
    lam : float
        Wavelength; the kernel receives ``lam / n0``.
    NA : float
        Numerical aperture, converted to the angle ``arcsin(NA / n0)``.
    n0 : float
        Refractive index.
    ex_g : OCLArray, optional
        Buffer to fill; its reversed shape has to equal ``shape``.
    n_integration_steps : int
        Compiled into the kernel as ``INT_STEPS``.

    Returns
    -------
    ndarray or None
        None if ``ex_g`` was given (it is filled in place), otherwise ex
        as a numpy array.
    """
    build_options = str("-I %s -D INT_STEPS=%s" % (absPath("."), n_integration_steps))
    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"),
                   build_options=build_options)

    Nx, Ny = shape
    dx, dy = units

    alpha = np.arcsin(NA / n0)

    use_buffer = ex_g is not None
    if not use_buffer:
        ex_g = OCLArray.empty((Ny, Nx), np.complex64)

    assert ex_g.shape[::-1] == shape

    # the y axis is centred on the origin
    y_half = dy * (Ny - 1) / 2.
    p.run_kernel("psf_cylindrical_plane", (Nx, Ny), None,
                 ex_g.data,
                 np.float32(-y_half), np.float32(y_half),
                 np.float32(z),
                 np.float32(lam / n0),
                 np.float32(alpha))

    if use_buffer:
        return None
    return ex_g.get()
def focus_field_cylindrical_plane(shape = (128,128),
                                  units = (.1,.1),
                                  z = 0.,
                                  lam = .5, NA = .6, n0 = 1.,
                                  ex_g = None,
                                  n_integration_steps = 200):
    """
    x component of the electric field at position z behind a cylindrical
    lens (perfect, aberration free system), calculated via the vectorial
    debye diffraction integral

    see
    Colin J. R. Sheppard,
    Cylindrical lenses—focusing and imaging: a review
    Appl. Opt. 52, 538-545 (2013)

    shape is (Nx,Ny), units is (dx,dy)

    if ex_g is a valid OCLArray (with ex_g.shape[::-1] == shape) it is
    filled and None is returned,
    otherwise ex is returned as a numpy array
    """
    p = OCLProgram(absPath("kernels/psf_cylindrical.cl"),
                   build_options = str("-I %s -D INT_STEPS=%s"%(absPath("."),n_integration_steps)))

    Nx, Ny = shape
    dx, dy = units  # dx is not needed

    alpha = np.arcsin(NA/n0)

    caller_owns_buffer = ex_g is not None
    target_g = ex_g if caller_owns_buffer else OCLArray.empty((Ny,Nx),np.complex64)

    assert target_g.shape[::-1] == shape

    kernel_args = (target_g.data,
                   np.float32(-dy*(Ny-1)/2.),np.float32(dy*(Ny-1)/2.),
                   np.float32(z),
                   np.float32(lam/n0),
                   np.float32(alpha))
    p.run_kernel("psf_cylindrical_plane",(Nx,Ny),None, *kernel_args)

    # a caller supplied buffer was filled in place, nothing to hand back
    if not caller_owns_buffer:
        return target_g.get()
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """Richardson-Lucy deconvolution on GPU buffers, using fft_convolve.

    Parameters
    ----------
    data_g : OCLArray
        Data buffer; it is copied into the initial (complex64) estimate.
    h_g : OCLArray
        PSF buffer of the same shape. NOTE: it is transformed in place, so
        it holds the FFT of the psf when the function returns.
    Niter : int
        Number of iterations; each one is logged at info level.

    Returns
    -------
    OCLArray
        The complex64 estimate.

    Raises
    ------
    ValueError
        If ``data_g`` and ``h_g`` differ in shape.
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    # set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape, np.complex64)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape, np.complex64)

    # The correction step needs the psf mirrored along every axis; the
    # former [::-1, ::-1] only mirrored the first two.
    # TODO: flip on the device instead of taking a host round trip
    h = h_g.get()
    hflip_g = OCLArray.from_array(h[(slice(None, None, -1),) * h.ndim].copy())

    plan = fft_plan(data_g.shape)

    # transform psf (reusing the plan instead of leaving it unused)
    fft(h_g, inplace=True, plan=plan)
    fft(hflip_g, inplace=True, plan=plan)

    for i in range(Niter):
        logger.info("Iteration: {}".format(i))
        fft_convolve(u_g, h_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        # presumably tmp_g <- data_g / tmp_g; confirm in _complex_divide_inplace
        _complex_divide_inplace(data_g, tmp_g)

        fft_convolve(tmp_g, hflip_g,
                     inplace=True,
                     kernel_is_fft=True)

        _complex_multiply_inplace(u_g, tmp_g)

    return u_g
def resample_buf(data, new_shape):
    """Resample ``data`` to ``new_shape`` via an OpenCL image.

    The data is uploaded to a buffer, copied into an image of the same
    shape and read back resampled into a buffer of the new shape.

    Parameters
    ----------
    data : ndarray
        float32 or complex64 input array.
    new_shape : tuple
        Shape of the output.

    Returns
    -------
    ndarray
        Array of shape ``new_shape`` with the dtype of ``data``.

    Raises
    ------
    ValueError
        If the dtype of ``data`` is neither float32 nor complex64.
    """
    # complex data is stored as a two channel float image
    if data.dtype.type == np.float32:
        image_kwargs = dict(dtype=np.float32)
    elif data.dtype.type == np.complex64:
        image_kwargs = dict(dtype=np.float32, num_channels=2)
    else:
        # used to fall through and fail later with an unbound `im`
        raise ValueError("dtype %s not supported (only float32 and complex64)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape, data.dtype)

    im = OCLImage.empty(data.shape, **image_kwargs)

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
def resample_buf(data, new_shape):
    """resamples data to new_shape via an OpenCL image

    data:       float32 or complex64 numpy array
    new_shape:  shape of the result

    returns a numpy array of shape new_shape and the dtype of data

    raises ValueError for any other dtype
    """
    dtype = data.dtype.type
    if dtype not in (np.float32, np.complex64):
        # used to fall through and fail later with an unbound `im`
        raise ValueError("dtype %s not supported (only float32 and complex64)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape,data.dtype)

    if dtype == np.float32:
        im = OCLImage.empty(data.shape,dtype = np.float32)
    else:
        # complex values are stored as a two channel float image
        im = OCLImage.empty(data.shape,dtype = np.float32, num_channels=2)

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
def _get_min_max(self):
    """Return the minimum and maximum of the current data as ``(mi, ma)``.

    The values are computed on the GPU from ``self.renderer.dataImg``; if
    anything goes wrong there, the error is printed and numpy is used on
    ``self.dataModel[0]`` instead.

    Raises
    ------
    ValueError
        If no (non-empty) data model is set. This used to surface as an
        UnboundLocalError at the return statement.
    """
    if not self.dataModel:
        raise ValueError("no data model set, cannot compute min/max")

    # as amax is too slow for big arrays, do it on the gpu
    try:
        im = self.renderer.dataImg
        tmp_buf = OCLArray.empty(im.shape, im.dtype)
        tmp_buf.copy_image(im)
        mi = float(cl_array.min(tmp_buf).get())
        ma = float(cl_array.max(tmp_buf).get())
    except Exception as e:
        # deliberate best effort: report the problem and fall back to the cpu
        print(e)
        mi = np.amin(self.dataModel[0])
        ma = np.amax(self.dataModel[0])

    return mi, ma
def _perlin3_single(size,units = (1.,)*3,repeat = (10.,)*3,offz = 0,Nz0 = None):
    """
    Fills a float32 buffer of shape size[::-1] with the perlin3d kernel.

    units is (dx,dy,dz), repeat is (wx,wy,wz); offz is handed to the kernel
    as int32. Nz0 defaults to size[-1] but is not used any further here.

    returns the result as numpy array
    """
    if Nz0 is None:
        Nz0 = size[-1]

    (dx, dy, dz), (wx, wy, wz) = units, repeat

    prog = OCLProgram(abspath("perlin.cl"))

    noise_g = OCLArray.empty(size[::-1],np.float32)

    float_args = [np.float32(v) for v in (dx, dy, dz, wx, wy, wz)]
    prog.run_kernel("perlin3d",noise_g.shape[::-1],None,
                    noise_g.data,
                    np.int32(offz),
                    *float_args)

    return noise_g.get()
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape

    via lucy richardson deconvolution

    Parameters
    ----------
    data : ndarray
        The measurement.
    h : ndarray
        The psf, same shape as ``data``.
    Niter : int
        Number of iterations; each one is logged at info level.
    h_is_fftshifted : bool
        If False, ``np.fft.fftshift`` is applied to ``h`` first.

    Returns
    -------
    ndarray
        Absolute value of the estimate.

    Raises
    ------
    ValueError
        If ``data`` and ``h`` differ in shape.
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # mirror the psf along every axis; the former [::-1, ::-1] only
    # mirrored the first two and was thus incomplete for 3d input
    hflip = h[(slice(None, None, -1),) * h.ndim]

    # set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))

    tmp_g = OCLArray.empty(data.shape, np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    plan = fft_plan(data.shape)

    # transform psf (reusing the plan instead of leaving it unused)
    fft(hf_g, inplace=True, plan=plan)
    fft(hflip_f_g, inplace=True, plan=plan)

    for i in range(Niter):
        logger.info("Iteration: {}".format(i))
        fft_convolve(u_g, hf_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        # presumably tmp_g <- y_g / tmp_g; confirm in _complex_divide_inplace
        _complex_divide_inplace(y_g, tmp_g)

        fft_convolve(tmp_g, hflip_f_g,
                     inplace=True,
                     kernel_is_fft=True)

        _complex_multiply_inplace(u_g, tmp_g)

    return np.abs(u_g.get())
def create(dimensions):
    '''
    Convenience method for creating float32 images on the GPU.

    This method basically does the same as in CLIJ:
    https://github.com/clij/clij2/blob/master/src/main/java/net/haesleinhuepf/clij2/CLIJ2.java#L156

    :param dimensions: either an OCLArray (its shape is reused as is) or a
        size with 2 entries; any other length is treated as 3 entries.
        Sizes are reversed before the array is allocated.
    :return: OCLArray allocated with OCLArray.empty, i.e. potentially with random values
    '''
    if isinstance(dimensions, OCLArray):
        shape = dimensions.shape
    elif len(dimensions) == 2:
        shape = (dimensions[1], dimensions[0])
    else:
        shape = (dimensions[2], dimensions[1], dimensions[0])

    return OCLArray.empty(shape, np.float32)
def gpu_structure(data):
    """Run the "structure" kernel on an image and return its output.

    The input is uploaded as float32; the result buffer has shape
    (data.shape[0], data.shape[1], 2) and is returned as a host array.
    """
    # upload the input and allocate the output buffer
    image_g = OCLArray.from_array(data.astype(float32))
    result_shape = (data.shape[0], data.shape[1], 2)
    result_g = OCLArray.empty(result_shape, float32)

    program = OCLProgram("./OpenCL/gpu_kernels/gpu_structure.cl")

    # launch the kernel: the global size is the reversed input shape,
    # the local size is left to None
    global_size = image_g.shape[::-1]
    program.run_kernel("structure", global_size, None,
                       image_g.data, result_g.data)

    return result_g.get()
def _perlin3_single(size, units=(1., ) * 3, repeat=(10., ) * 3, offz=0, Nz0=None):
    """Compute a single 3d perlin noise volume with the "perlin3d" kernel.

    The output buffer is allocated with shape size[::-1] and the kernel is
    launched with global size ``size``. ``units`` and ``repeat`` each have to
    contain exactly three values, which are handed to the kernel as float32;
    ``offz`` is handed over as int32. ``Nz0`` falls back to size[-1] but is
    not passed on to the kernel.
    """
    if Nz0 is None:
        Nz0 = size[-1]

    dx, dy, dz = units
    wx, wy, wz = repeat

    kernel_prog = OCLProgram(abspath("perlin.cl"))
    out_g = OCLArray.empty(size[::-1], np.float32)

    kernel_args = (
        out_g.data,
        np.int32(offz),
        np.float32(dx), np.float32(dy), np.float32(dz),
        np.float32(wx), np.float32(wy), np.float32(wz),
    )
    kernel_prog.run_kernel("perlin3d", out_g.shape[::-1], None, *kernel_args)

    return out_g.get()
def _convolve_buf(data_g, h_g, res_g=None):
    """Buffer variant of the convolution.

    data_g and h_g are checked to be float32 buffers. The kernel
    "convolve<ndim>d_buf" is chosen by the number of dimensions of data_g and
    receives the extents of h_g as int32 arguments. The result is written to
    res_g (allocated with the shape of data_g if not given) and returned.
    """
    assert_bufs_type(np.float32, data_g, h_g)

    program = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    kernel_extents = list(map(np.int32, h_g.shape))
    ndim = len(data_g.shape)

    program.run_kernel("convolve%sd_buf" % ndim,
                       data_g.shape[::-1], None,
                       data_g.data, h_g.data, res_g.data,
                       *kernel_extents)

    return res_g
def gpu_mean(data, Nx=10, Ny=10):
    """Run the "mean" kernel on an image and return its output.

    The input is uploaded as float32, the result buffer has the same shape
    as ``data``. Nx and Ny are passed to the kernel as int32.
    """
    # upload the input and allocate the output buffer
    image_g = OCLArray.from_array(data.astype(float32))
    result_g = OCLArray.empty(data.shape, float32)

    program = OCLProgram("./OpenCL/gpu_kernels/gpu_mean.cl")

    # launch the kernel: the global size is the reversed input shape,
    # the local size is left to None
    global_size = image_g.shape[::-1]
    program.run_kernel("mean", global_size, None,
                       image_g.data, result_g.data,
                       int32(Nx), int32(Ny))

    return result_g.get()
def perlin2(size, units=None, repeat=(10.,)*2, scale=None, shift=(0, 0)):
    """2d perlin noise computed on the GPU.

    Either scale, e.g. (10., 10.), or units, e.g. (5., 5.), has to be given.
    scale is the characteristic length in pixels.

    Parameters
    ----------
    size: tuple of 2 ints
        global size of the kernel launch; the result has shape size[::-1]
    units: tuple of 2 floats
        (dx, dy), ignored if scale is given
    repeat: tuple of 2 floats
        (wx, wy), ignored if scale is given
    scale: float or tuple of 2 floats
        if given, it is used as repeat and units is set to (1., 1.)
    shift: tuple of 2 floats
        (offset_x, offset_y) passed to the kernel

    Returns
    -------
    ndarray
        float32 array of shape size[::-1]

    Raises
    ------
    ValueError
        if neither scale nor units is given
    """
    if scale:
        if np.isscalar(scale):
            scale = (scale,)*2
        repeat = scale
        units = (1.,)*2

    if units is None:
        # the defaults alone are not enough; fail with a clear message
        # instead of an unpacking error on None
        raise ValueError("either scale or units have to be given")

    wx, wy = repeat
    dx, dy = units
    offset_x, offset_y = shift

    prog = OCLProgram(abspath("kernels/perlin.cl"))

    d = OCLArray.empty(size[::-1], np.float32)
    prog.run_kernel("perlin2d", d.shape[::-1], None,
                    d.data,
                    np.float32(dx), np.float32(dy),
                    np.float32(wx), np.float32(wy),
                    np.float32(offset_x), np.float32(offset_y),
                    )

    return d.get()
def time_multi(N, nargs, niter=100):
    """Time a single multi-output reduction kernel.

    One OCLReductionKernel2 with ``nargs`` map expressions ("i*xi[i]") is
    built and called ``niter`` times on ``nargs`` input arrays of N ones.
    The results and the mean time per call are printed.

    Parameters
    ----------
    N: int
        length of every input array
    nargs: int
        number of inputs/outputs of the kernel
    niter: int
        number of kernel calls that are timed

    Returns
    -------
    float
        mean time per call in seconds
    """
    from time import time

    # range/print(...) with a single argument behave the same on python 2 and 3
    map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
    arguments = ",".join("__global float *x%s" % i for i in range(nargs))

    k = OCLReductionKernel2(np.float32, neutral="0",
                            reduce_expr="a+b",
                            map_exprs=map_exprs,
                            arguments=arguments)

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(nargs)]
    outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

    t = time()
    for _ in range(niter):
        k(*ins, outs=outs)
    # wait for all queued kernels before stopping the clock
    get_device().queue.finish()
    t = (time()-t)/niter

    results = [float(out.get()) for out in outs]
    print("multi reduction: result = %s" % (results,))
    print("multi reduction:\t\t%.2f ms" % (1000*t))
    return t
def _scan_single(src, dst, ns, strides):
    """Launch the "scan2d" kernel block by block along x and, if more than
    one block was needed, recurse on the per-block sums and add them back
    with "add_sums2d".

    src, dst: buffers whose ``.data`` is handed to the kernels
    ns: (nx, ny)
    strides: (stride_x, stride_y), passed to the kernels as int32
    """
    nx, ny = ns
    stride_x, stride_y = strides

    # work group size along x, and the derived block/padding sizes
    half_block = min(next_power_of_2(nx // 2), max_local_size // 2)
    block_len = 2 * half_block
    padded_nx = math.ceil(nx / block_len) * block_len
    n_blocks = math.ceil(padded_nx // 2 / half_block)

    block_sums = OCLArray.empty((ny, n_blocks), dst.dtype)
    scratch = cl.LocalMemory(2 * dtype_itemsize * half_block)

    for block in range(n_blocks):
        prog.run_kernel("scan2d", (half_block, ny), (half_block, 1),
                        src.data, dst.data, block_sums.data, scratch,
                        np.int32(block_len),
                        np.int32(stride_x), np.int32(stride_y),
                        np.int32(block * half_block),
                        np.int32(block), np.int32(n_blocks), np.int32(nx))

    if n_blocks <= 1:
        return

    # scan the block sums in place, then add them to the output
    _scan_single(block_sums, block_sums, (n_blocks, ny), (1, n_blocks))
    prog.run_kernel("add_sums2d", (padded_nx, ny), (block_len, 1),
                    block_sums.data, dst.data,
                    np.int32(stride_x), np.int32(stride_y),
                    np.int32(n_blocks), np.int32(nx))
def _ocl_star_dist3D(lbl, rays, grid=(1, 1, 1)):
    """Run the "stardist3d" OpenCL kernel on a label image.

    lbl: label array, uploaded as a uint16 image
    rays: object with ``vertices`` (uploaded as float32) and a length, which
        is compiled into the kernel as N_RAYS
    grid: normalized with _normalize_grid(grid, 3); the output shape is
        lbl.shape floor-divided by it

    Returns the float32 host array of shape res_shape + (len(rays),).
    """
    from gputools import OCLProgram, OCLArray, OCLImage

    grid = _normalize_grid(grid, 3)
    n_rays = len(rays)

    out_shape = tuple(extent // step for extent, step in zip(lbl.shape, grid))

    labels_img = OCLImage.from_array(lbl.astype(np.uint16, copy=False))
    dist_buf = OCLArray.empty(out_shape + (n_rays, ), dtype=np.float32)
    rays_buf = OCLArray.from_array(rays.vertices.astype(np.float32, copy=False))

    kernel_prog = OCLProgram(path_absolute("kernels/stardist3d.cl"),
                             build_options=['-D', 'N_RAYS=%d' % n_rays])
    grid_args = [np.int32(grid[axis]) for axis in range(3)]
    kernel_prog.run_kernel('stardist3d', out_shape[::-1], None,
                           labels_img, rays_buf.data, dist_buf.data,
                           *grid_args)

    return dist_buf.get()
def affine(data, mat = np.identity(4), mode ="linear"):
    """Affine transform of data with the matrix mat on the GPU.

    data: input array, uploaded as an image
    mat: transformation matrix; its inverse (as float32) is given to the kernel
    mode: "linear" or "nearest", selects the build option of the kernel

    Returns a float32 host array with the shape of data.
    Raises KeyError if mode is not one of the valid values.
    """
    build_flags = {"linear":"","nearest":"-D USENEAREST"}

    if mode not in build_flags:
        raise KeyError("mode = '%s' not defined ,valid: %s"%(mode, build_flags.keys()))

    image_in = OCLImage.from_array(data)
    out_g = OCLArray.empty(data.shape, np.float32)

    inverse = np.linalg.inv(mat).astype(np.float32, copy=False)
    inverse_g = OCLArray.from_array(inverse)

    program = OCLProgram(abspath("kernels/transformations.cl"),
                         build_options=[build_flags[mode]])
    program.run_kernel("affine",
                       data.shape[::-1], None,
                       image_in, out_g.data, inverse_g.data)

    return out_g.get()
def _convolve3_old(data,h, dev = None): """convolves 3d data with kernel h on the GPU Device dev boundary conditions are clamping to edge. h is converted to float32 if dev == None the default one is used """ if dev is None: dev = get_device() if dev is None: raise ValueError("no OpenCLDevice found...") dtype = data.dtype.type dtypes_options = {np.float32:"", np.uint16:"-D SHORTTYPE"} if not dtype in dtypes_options.keys(): raise TypeError("data type %s not supported yet, please convert to:"%dtype,dtypes_options.keys()) prog = OCLProgram(abspath("kernels/convolve3.cl"), build_options = dtypes_options[dtype]) hbuf = OCLArray.from_array(h.astype(np.float32)) img = OCLImage.from_array(data) res = OCLArray.empty(data.shape,dtype=np.float32) Ns = [np.int32(n) for n in data.shape+h.shape] prog.run_kernel("convolve3d",img.shape,None, img,hbuf.data,res.data, *Ns) return res.get()
def scale(data, scale = (1.,1.,1.), interp = "linear"):
    """Return an interpolated, scaled version of data.

    Parameters
    ----------
    data: ndarray
        3d input array, uploaded as an image
    scale: float or tuple/list/ndarray of 3 floats
        scale = (scale_z,scale_y,scale_x) or scale = scale_all
    interp: str
        "linear" | "nearest"

    Returns
    -------
    float32 host array of shape int(data.shape * scale)

    Raises
    ------
    KeyError
        if interp is not a valid interpolation mode
    ValueError
        if scale does not have exactly 3 entries
    """
    bop = {"linear":[],"nearest":["-D","USENEAREST"]}

    if interp not in bop:
        raise KeyError("interp = '%s' not defined ,valid: %s"%(interp,bop.keys()))

    if not isinstance(scale,(tuple, list, np.ndarray)):
        scale = (scale,)*3

    if len(scale) != 3:
        # wrap in a tuple: formatting a tuple directly with a single %s
        # raised a TypeError instead of this ValueError
        raise ValueError("scale = %s misformed"%(scale,))

    d_im = OCLImage.from_array(data)

    nshape = np.array(data.shape)*np.array(scale)
    # np.int was only an alias of the builtin int and is gone in recent numpy
    nshape = tuple(nshape.astype(int))

    res_g = OCLArray.empty(nshape,np.float32)

    prog = OCLProgram(abspath("kernels/scale.cl"), build_options=bop[interp])

    prog.run_kernel("scale",
                    res_g.shape[::-1],None,
                    d_im,res_g.data)

    return res_g.get()
def time_simple(N, nargs, niter=100):
    """Time ``nargs`` separate single-output reduction kernels.

    One OCLReductionKernel per map expression ("i*x[i]") is built; every
    iteration calls each kernel on its own input array of N ones.
    The results and the mean time per iteration are printed.

    Parameters
    ----------
    N: int
        length of every input array
    nargs: int
        number of kernels/inputs/outputs
    niter: int
        number of iterations that are timed

    Returns
    -------
    float
        mean time per iteration in seconds
    """
    from gputools import OCLReductionKernel
    from time import time

    # range/print(...) with a single argument behave the same on python 2 and 3
    ks = [OCLReductionKernel(np.float32, neutral="0",
                             reduce_expr="a+b",
                             map_expr="%s*x[i]" % i,
                             arguments="__global float *x")
          for i in range(nargs)]

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(nargs)]
    outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    # wait for all queued kernels before stopping the clock
    get_device().queue.finish()
    t = (time()-t)/niter

    results = [float(out.get()) for out in outs]
    print("simple reduction: result = %s" % (results,))
    print("simple reduction:\t\t%.2f ms" % (1000*t))
    return t
def focus_field_lattice_plane(shape=(256, 256),
                              units=(.1, .1),
                              z=0.,
                              lam=.5,
                              NA1=.4, NA2=.5,
                              sigma=.1,
                              kpoints=6,
                              n0=1.,
                              apodization_bound=10,
                              ex_g=None,
                              n_integration_steps=100):
    r"""calculates the complex 2d input field at position -z for a bessel lattice beam.

    Parameters
    ----------
    shape: Nx,Ny
        the shape of the geometry
    units: dx,dy
        the pixel sizes in microns
    z: float
        defocus position in microns, such that the beam would focus at z
        e.g. an input field with z = 10. would have its focal spot after 10 microns
    lam: float
        the wavelength of light used in microns
    NA1: float/list
        the numerical aperture of the inner ring
    NA2: float/list
        the numerical aperture of the outer ring
    sigma: float
        the standard deviation of the gaussian smear function applied to each point on the aperture
        (the bigger sigma, the tighter the sheet in y)
    kpoints: int/ (2,N) array
        defines the set of points on the aperture that create the lattice, can be
        - a (2,N) ndarray (or nested list), such that kpoints[:,i] are the coordinates of the ith point
        - a single int, defining points on a regular polygon (e.g. 4 for a square lattice, 6 for a hex lattice)
        :math:`k_i = \arcsin\frac{NA_1+NA_2}{2 n_0} \begin{pmatrix} \cos \phi_i \\ \sin \phi_i \end{pmatrix}\quad, \phi_i = \frac{\pi}{2}+\frac{2i}{N}`
    n0: float
        the refractive index of the medium
    apodization_bound: int
        width of the region where the input field is tapered to zero (with a hamming window) on the +/- x borders
    ex_g: OCLArray or None
        optional complex64 buffer of shape (Ny, Nx) the field is written into;
        if None a temporary buffer is allocated and its content returned
    n_integration_steps: int
        number of integration steps to perform

    Returns
    -------
    u: ndarray
        the 2d complex field, if ex_g is None.
        If ex_g is given the field is left in ex_g and None is returned.

    Example
    -------

    >>> u = focus_field_lattice_plane((128,128), (0.1,0.1), z = 2., lam=.5, NA1 = .44, NA2 = .55, kpoints = 6)

    See also
    --------
    biobeam.focus_field_lattice: the corresponding 3d function

    """
    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=["-I", absPath("kernels"), "-D", "INT_STEPS=%s"%n_integration_steps])

    Nx, Ny = shape
    dx, dy = units

    alpha1 = np.arcsin(1.*NA1/n0)
    alpha2 = np.arcsin(1.*NA2/n0)

    if np.isscalar(kpoints):
        kxs, kys = np.arcsin(.5*(NA1+NA2)/n0)*_poly_points(kpoints)
    else:
        # asarray so that nested lists work as well as ndarrays
        kxs, kys = 1.*np.asarray(kpoints)/n0

    kxs = np.asarray(kxs)
    kys = np.asarray(kys)

    use_buffer = ex_g is not None
    if use_buffer:
        assert ex_g.shape[::-1]==shape
    else:
        ex_g = OCLArray.empty((Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    # half extents of the field in microns; this needs true division, floor
    # division rounded the extents down to whole microns (and asymmetrically
    # so for the negative bound)
    half_x = dx*(Nx-1)/2.
    half_y = dy*(Ny-1)/2.

    t = time.time()

    p.run_kernel("debye_wolf_lattice_plane", (Nx, Ny),
                 None,
                 ex_g.data,
                 np.float32(1.), np.float32(0.),
                 np.float32(-half_x), np.float32(half_x),
                 np.float32(-half_y), np.float32(half_y),
                 np.float32(-z),
                 np.float32(1.*lam/n0),
                 np.float32(alpha1),
                 np.float32(alpha2),
                 kxs_g.data,
                 kys_g.data,
                 np.int32(len(kxs)),
                 np.float32(sigma),
                 np.int32(apodization_bound),
                 )

    if use_buffer:
        # the caller owns ex_g and reads the result from it
        return None

    res = ex_g.get()
    print("time in secs:", time.time()-t)
    return res
def focus_field_lattice(shape=(128, 128, 128),
                        units=(0.1, 0.1, 0.1),
                        lam=.5, NA1=.4, NA2=.5,
                        sigma=.1,
                        kpoints=6,
                        return_all_fields=False,
                        n0=1.,
                        n_integration_steps=100):
    r"""Calculates the focus field for a bessel lattice.

    The pupil function consists out of discrete points (kpoints) superimposed on an annulus (NA1<NA2)
    which are smeared out by a 1d gaussian of given sigma creating an array of bessel beams in the
    focal plane (see [3]_ ).

    Parameters
    ----------
    shape: Nx,Ny,Nz
        the shape of the geometry
    units: dx,dy,dz
        the pixel sizes in microns
    lam: float
        the wavelength of light used in microns
    NA1: float/list
        the numerical aperture of the inner ring
    NA2: float/list
        the numerical aperture of the outer ring
    sigma: float
        the standard deviation of the gaussian smear function applied to each point on the aperture
        (the bigger sigma, the tighter the sheet in y)
    kpoints: int/ (2,N) array
        defines the set of points on the aperture that create the lattice, can be
        - a (2,N) ndarray (or nested list), such that kpoints[:,i] are the coordinates of the ith point
        - a single int, defining points on a regular polygon (e.g. 4 for a square lattice, 6 for a hex lattice)
        :math:`k_i = \arcsin\frac{NA_1+NA_2}{2 n_0} \begin{pmatrix} \cos \phi_i \\ \sin \phi_i \end{pmatrix}\quad, \phi_i = \frac{\pi}{2}+\frac{2i}{N}`
    n0: float
        the refractive index of the medium
    n_integration_steps: int
        number of integration steps to perform
    return_all_fields: boolean
        if True, returns u,ex,ey,ez where ex/ey/ez are the complex vector field components

    Returns
    -------
    u: ndarray
        the intensity of the focus field
    (u,ex,ey,ez): list(ndarray)
        the intensity of the focus field and the complex field components (if return_all_fields is True)

    Example
    -------

    >>> u = focus_field_lattice((128,128,128), (0.1,0.1,.1), lam=.5, NA1 = .44, NA2 = .55, kpoints = 6)

    References
    ----------

    .. [3] Chen et al. Lattice light-sheet microscopy: imaging molecules to embryos at high spatiotemporal resolution. Science 346, (2014).

    """
    alpha1 = np.arcsin(1.*NA1/n0)
    alpha2 = np.arcsin(1.*NA2/n0)

    if np.isscalar(kpoints):
        kxs, kys = np.arcsin(.5*(NA1+NA2)/n0)*_poly_points(kpoints)
    else:
        # asarray so that nested lists work as well as ndarrays
        kxs, kys = 1.*np.asarray(kpoints)/n0

    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=["-I", absPath("kernels"), "-D", "INT_STEPS=%s"%n_integration_steps])

    kxs = np.array(kxs)
    kys = np.array(kys)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    # buffers are laid out as (z, y, x)
    u_g = OCLArray.empty((Nz, Ny, Nx), np.float32)
    ex_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    p.run_kernel("debye_wolf_lattice", (Nx, Ny, Nz),
                 None,
                 ex_g.data,
                 ey_g.data,
                 ez_g.data,
                 u_g.data,
                 np.float32(1.), np.float32(0.),
                 # lower/upper coordinate bound along every axis
                 np.float32(dx*(-Nx//2)), np.float32(dx*(Nx//2-1)),
                 np.float32(dy*(-Ny//2)), np.float32(dy*(Ny//2-1)),
                 np.float32(dz*(-Nz//2)), np.float32(dz*(Nz//2-1)),
                 np.float32(1.*lam/n0),
                 np.float32(alpha1),
                 np.float32(alpha2),
                 kxs_g.data,
                 kys_g.data,
                 np.int32(len(kxs)),
                 np.float32(sigma)
                 )

    u = u_g.get()

    if return_all_fields:
        ex = ex_g.get()
        ey = ey_g.get()
        ez = ez_g.get()
        return u, ex, ey, ez
    else:
        return u
def _propagate_core(self, u0=None, dn_ind_start=0, dn_ind_end=1, dn_ind_offset=0, return_comp="field", return_shape="full", free_prop=False, dn_mean_method="none", **kwargs): """ the core propagation method, the refractive index dn is assumed to be already residing in gpu memory if u0 is None, assumes that the initial field to be residing in self._buf_plane kwargs: return_comp in ["field", "intens"] return_shape in ["last", "full"] free_prop = False | True dn_mean_method = "none", "global", "local" """ print("mean method: ", dn_mean_method) free_prop = free_prop or (self.dn is None) if return_comp=="field": res_type = Bpm3d._complex_type elif return_comp=="intens": res_type = Bpm3d._real_type else: raise ValueError(return_comp) if not return_shape in ["last", "full"]: raise ValueError() Nx, Ny, _ = self.shape Nz = dn_ind_end-dn_ind_start assert dn_ind_start>=0 # if not u0 is None: # print "huhu" # self._buf_plane.write_array(u0.astype(np.complex64,copy=False)) if return_shape=="full": u = OCLArray.empty((Nz, Ny, Nx), dtype=res_type) # copy the first plane if return_shape=="full": if self._is_subsampled: self._img_xy.copy_buffer(self._buf_plane) self._copy_down_img(self._img_xy, u, 0) else: self._copy_down_buf(self._buf_plane, u, 0) dn0 = 0 if dn_mean_method=="local" and not self.dn is None and not free_prop: self.intens_sum_g = OCLArray.from_array(np.ones(1,dtype=Bpm3d._real_type)) self.intens_dn_sum_g = OCLArray.from_array((self.dn_mean[dn_ind_start+dn_ind_offset]* np.ones(1)).astype(dtype=Bpm3d._real_type)) #self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g) self._fill_propagator(self.n0) for i in range(Nz-1): for j in range(self.simul_z): fft(self._buf_plane, inplace=True, plan=self._plan) self._mult_complex(self._buf_plane, self._buf_H) fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan) if not free_prop: #FIXME here we make a slight error for the first time point, as we #FIXME set dn0 first and the compute the new 
propagator if dn_mean_method=="local": self._mult_dn_local(self._buf_plane, (i+dn_ind_start+(j+1.)/self.simul_z), self.intens_sum_g, self.intens_dn_sum_g, self.intens_g, self.intens_dn_g) else: self._mult_dn(self._buf_plane, (i+dn_ind_start+(j+1.)/self.simul_z), dn0) if not self.dn is None and not free_prop: if dn_mean_method=="local": self._kernel_reduction(self.intens_g, self.intens_dn_g, outs=[self.intens_sum_g, self.intens_dn_sum_g]) self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g) #print(self.intens_dn_sum_g.get(), self.n0) #print("mean dn: ",self.intens_dn_sum_g.get()/self.intens_sum_g.get()) elif dn_mean_method=="global": if self.dn_mean[i+dn_ind_start+dn_ind_offset]!=dn0: dn0 = self.dn_mean[i+dn_ind_start+dn_ind_offset] self._fill_propagator(self.n0+dn0) if return_shape=="full": if self._is_subsampled and self.simul_xy!=self.shape[:2]: self._img_xy.copy_buffer(self._buf_plane) self._copy_down_img(self._img_xy, u, (i+1)*(Nx*Ny)) else: self._copy_down_buf(self._buf_plane, u, (i+1)*(Nx*Ny)) if return_shape=="full": return u.get() else: return self._buf_plane.get()
def _propagate(self, u0=None, offset=0, return_comp="field", return_shape="full", free_prop=False, slow_mean=False, **kwargs): """ kwargs: return_comp in ["field", "intens"] return_shape in ["last", "full"] free_prop = False | True """ free_prop = free_prop or (self.dn is None) if return_comp=="field": res_type = Bpm3d._complex_type elif return_comp=="intens": res_type = Bpm3d._real_type else: raise ValueError(return_comp) if not return_shape in ["last", "full"]: raise ValueError() if u0 is None: u0 = self.u0_plane() u0 = u0.astype(np.complex64, copy=False) Nx, Ny, Nz = self.shape assert offset>=0 and offset<(Nz-1) if return_shape=="full": u = OCLArray.empty((Nz-offset, Ny, Nx), dtype=res_type) self._buf_plane.write_array(u0) # copy the first plane if return_shape=="full": if self._is_subsampled: self._img_xy.copy_buffer(self._buf_plane) self._copy_down_img(self._img_xy, u, 0) else: self._copy_down_buf(self._buf_plane, u, 0) dn0 = 0 for i in range(Nz-1-offset): if not self.dn is None and not free_prop: if slow_mean: if return_shape=="full": raise NotImplementedError() else: tmp = OCLArray.empty((1, Ny, Nx), dtype=res_type) if self._is_subsampled: self._img_xy.copy_buffer(self._buf_plane) self._copy_down_img(self._img_xy, tmp, 0) else: self._copy_down_buf(self._buf_plane, tmp, 0) dn0 = np.sum(np.abs(self.dn[i])*tmp.get())/np.sum(np.abs(self.dn[i])+1.e-10) self._fill_propagator(self.n0+dn0) else: if self.dn_mean[i+offset]!=dn0: dn0 = self.dn_mean[i+offset] self._fill_propagator(self.n0+dn0) for j in range(self.simul_z): fft(self._buf_plane, inplace=True, plan=self._plan) self._mult_complex(self._buf_plane, self._buf_H) fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan) if not free_prop: self._mult_dn(self._buf_plane, (i+offset+(j+1.)/self.simul_z), dn0) if return_shape=="full": if self._is_subsampled and self.simul_xy!=self.shape[:2]: self._img_xy.copy_buffer(self._buf_plane) self._copy_down_img(self._img_xy, u, (i+1)*(Nx*Ny)) else: 
self._copy_down_buf(self._buf_plane, u, (i+1)*(Nx*Ny)) if return_shape=="full": return u.get() else: return self._buf_plane.get()