def test_parseval():
    """Record the signal energy after repeated GPU and numpy FFT passes.

    A random 512x512 complex64 field is transformed 10 times on the GPU
    (forward FFT only, in place) and 10 times on the host (forward followed
    by inverse FFT).  After every pass the sum of squared magnitudes is
    stored.  The elapsed wall time of the GPU loop is printed.

    Returns:
        (s1, s2): energies after each GPU pass and after each numpy pass.
    """
    from time import time

    side = 512
    n_rounds = 10

    d = np.random.uniform(-1, 1, (side, side)).astype(np.complex64)
    d_g = OCLArray.from_array(d.astype(np.complex64))

    s1 = []
    s2 = []

    started = time()
    for _ in range(n_rounds):
        # only the forward transform is active here; the inverse and the
        # fast_math variants were disabled in the original experiment
        fft(d_g, inplace=True)
        energy = np.abs(d_g.get()) ** 2
        s1.append(np.sum(energy))
    print(time() - started)

    for _ in range(n_rounds):
        spectrum = np.fft.fftn(d).astype(np.complex64)
        d = np.fft.ifftn(spectrum).astype(np.complex64)
        s2.append(np.sum(np.abs(d) ** 2))

    return s1, s2
def test_parseval():
    """Record the signal energy after repeated GPU and numpy FFT passes.

    A random 512x512 complex64 field is transformed 10 times on the GPU
    (forward FFT only, in place) and 10 times on the host (forward followed
    by inverse FFT).  The iteration index is printed on every pass and the
    elapsed wall time of the GPU loop is printed once it has finished.

    Returns:
        (s1, s2): sum of squared magnitudes after each GPU pass and after
        each numpy pass.
    """
    from time import time

    Nx = 512
    Nz = 10

    d = np.random.uniform(-1, 1, (Nx, Nx)).astype(np.complex64)
    d_g = OCLArray.from_array(d.astype(np.complex64))

    s1, s2 = [], []

    t = time()
    for i in range(Nz):
        # print() call instead of the Python 2 print statement, so the
        # function also parses on Python 3
        print(i)
        # only the forward transform is active; the inverse pass and the
        # fast_math variants were disabled in the original experiment
        fft(d_g, inplace=True)
        s1.append(np.sum(np.abs(d_g.get()) ** 2))
    print(time() - t)

    for i in range(Nz):
        print(i)
        d = np.fft.fftn(d).astype(np.complex64)
        d = np.fft.ifftn(d).astype(np.complex64)
        s2.append(np.sum(np.abs(d) ** 2))

    return s1, s2
def time_gpu(dshape, niter=100, fast_math=False):
    """Time the in-place GPU fft for arrays of shape ``dshape``.

    An uninitialised complex64 device array is transformed ``niter`` times
    with a plan built once up front.  The device queue is drained before the
    clock starts and again before it stops.

    Args:
        dshape: shape of the array to transform.
        niter: number of fft calls to average over.
        fast_math: forwarded to ``fft_plan``.

    Returns:
        Mean wall time per fft call in seconds (also printed in ms).
    """
    d_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()

    plan = fft_plan(dshape, fast_math=fast_math)

    t = time()
    # range/print() instead of xrange/print statement: valid on Python 2 and 3
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    get_device().queue.finish()
    t = (time() - t) / niter

    print("GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t))
    # hand the measurement back so callers can collect timings
    return t
def time_gpu(dshape, niter=100, fast_math=False):
    """Benchmark the in-place GPU fft on an array of shape ``dshape``.

    The plan is created once, the device queue is drained before and after
    the timed loop, and the mean time per call is printed in milliseconds.

    Returns:
        Mean wall time per fft call in seconds.
    """
    buf_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()

    gpu_plan = fft_plan(dshape, fast_math=fast_math)

    start = time()
    remaining = niter
    while remaining > 0:
        fft(buf_g, inplace=True, plan=gpu_plan)
        remaining -= 1
    get_device().queue.finish()
    elapsed = time() - start

    t = elapsed / niter
    report = "GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t)
    print(report)
    return t
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """Deconvolve ``data`` with the psf ``h`` by Lucy-Richardson iteration.

    ``data`` and ``h`` have to be the same shape.  Both are uploaded to the
    GPU as complex64; the convolutions are done with ``fft_convolve`` using
    pre-transformed kernels.

    Args:
        data: host array to deconvolve.
        h: psf (kernel), same shape as ``data``.
        Niter: number of iterations.
        h_is_fftshifted: if False, ``h`` is passed through
            ``np.fft.fftshift`` first.

    Returns:
        ``np.abs`` of the final estimate, as a host array.

    Raises:
        ValueError: if ``data`` and ``h`` differ in shape.
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # mirrored psf for the back-projection step (flips the first two axes)
    hflip = h[::-1, ::-1]

    # set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))
    tmp_g = OCLArray.empty(data.shape, np.complex64)
    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    # NOTE(review): the plan is built but not handed to any call below;
    # kept in case fft_plan has side effects - confirm and drop if not
    plan = fft_plan(data.shape)

    # transform psf
    fft(hf_g, inplace=True)
    fft(hflip_f_g, inplace=True)

    for i in range(Niter):
        # print() call instead of the Python 2 print statement
        print(i)
        fft_convolve(u_g, hf_g, res_g=tmp_g, kernel_is_fft=True)
        _complex_divide_inplace(y_g, tmp_g)
        fft_convolve(tmp_g, hflip_f_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(u_g, tmp_g)

    return np.abs(u_g.get())
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """Lucy-Richardson deconvolution of ``data`` with the psf ``h``.

    Both inputs must share one shape.  They are uploaded as complex64 device
    arrays and every iteration runs two ``fft_convolve`` calls against the
    pre-transformed psf and mirrored psf.

    Returns:
        Magnitude (``np.abs``) of the final estimate as a host array.

    Raises:
        ValueError: if the shapes of ``data`` and ``h`` differ.
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    kernel = h if h_is_fftshifted else np.fft.fftshift(h)
    mirrored = kernel[::-1, ::-1]

    # device buffers: measured data, running estimate, scratch space
    measured_g = OCLArray.from_array(data.astype(np.complex64))
    estimate_g = OCLArray.from_array(data.astype(np.complex64))
    scratch_g = OCLArray.empty(data.shape, np.complex64)
    kernel_f_g = OCLArray.from_array(kernel.astype(np.complex64))
    mirrored_f_g = OCLArray.from_array(mirrored.astype(np.complex64))

    # built as in the original, although no call below receives it
    plan = fft_plan(data.shape)

    # move both kernels to Fourier space once
    for kernel_g in (kernel_f_g, mirrored_f_g):
        fft(kernel_g, inplace=True)

    for step in range(Niter):
        logger.info("Iteration: {}".format(step))
        fft_convolve(estimate_g, kernel_f_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(measured_g, scratch_g)
        fft_convolve(scratch_g, mirrored_f_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    estimate = estimate_g.get()
    return np.abs(estimate)
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """Lucy-Richardson deconvolution on device arrays, using fft_convolve.

    Args:
        data_g: device array holding the measured data.
        h_g: device array holding the psf, same shape as ``data_g``.
            It is Fourier transformed in place by this function.
        Niter: number of iterations.

    Returns:
        The device array holding the final estimate.

    Raises:
        ValueError: if ``data_g`` and ``h_g`` differ in shape.
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    # set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape, np.complex64)
    u_g.copy_buffer(data_g)
    tmp_g = OCLArray.empty(data_g.shape, np.complex64)

    # fix this: the mirrored psf is built through a host round trip
    hflip_g = OCLArray.from_array((h_g.get()[::-1, ::-1]).copy())

    # NOTE(review): the plan is built but not handed to any call below;
    # kept in case fft_plan has side effects - confirm and drop if not
    plan = fft_plan(data_g.shape)

    # transform psf
    fft(h_g, inplace=True)
    fft(hflip_g, inplace=True)

    for i in range(Niter):
        # print() call instead of the Python 2 print statement
        print(i)
        fft_convolve(u_g, h_g, res_g=tmp_g, kernel_is_fft=True)
        _complex_divide_inplace(data_g, tmp_g)
        fft_convolve(tmp_g, hflip_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(u_g, tmp_g)

    return u_g
def get_gpu(N=256, niter=100, sig=1.):
    """Track the round-trip error of repeated GPU fft / inverse fft passes.

    A seeded (seed 0) normal random NxN complex64 field is transformed
    forward and back ``niter`` times on the GPU.  After each round trip the
    maximum absolute deviation from the initial field, divided by the
    maximum magnitude of the initial field, is recorded.

    Returns:
        numpy array with one relative error per round trip.
    """
    np.random.seed(0)
    reference = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    host_copy = (1. * reference.copy()).astype(np.complex64)

    spectrum_g = OCLArray.empty_like(host_copy)
    field_g = OCLArray.from_array(host_copy)
    gpu_plan = fft_plan((N, N), fast_math=False)

    errors = []
    for _ in range(niter):
        fft(field_g, res_g=spectrum_g, plan=gpu_plan)
        fft(spectrum_g, res_g=field_g, inverse=True, plan=gpu_plan)
        deviation = np.amax(np.abs(reference - field_g.get()))
        errors.append(deviation / np.amax(np.abs(reference)))

    return np.array(errors)
def get_gpu(N=256, niter=100, sig=1.):
    """Measure how the GPU fft round-trip error evolves over ``niter`` passes.

    Starting from a seeded (seed 0) normal random NxN complex64 field, each
    pass applies a forward and an inverse fft on the device and stores the
    maximum absolute difference to the start field relative to the start
    field's maximum magnitude.

    Returns:
        numpy array of relative errors, one entry per pass.
    """
    np.random.seed(0)
    start = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    working = (1. * start.copy()).astype(np.complex64)

    freq_g = OCLArray.empty_like(working)
    space_g = OCLArray.from_array(working)
    p = fft_plan((N, N), fast_math=False)

    def round_trip_error():
        # one forward + inverse pass, then compare against the start field
        fft(space_g, res_g=freq_g, plan=p)
        fft(freq_g, res_g=space_g, inverse=True, plan=p)
        return np.amax(np.abs(start - space_g.get())) / np.amax(np.abs(start))

    return np.array([round_trip_error() for _ in range(niter)])
def test_parseval():
    """Compare signal energy across repeated fft round trips (GPU vs numpy).

    A random 512x512 complex64 field goes through 100 forward+inverse fft
    round trips on the GPU (in place, fast_math off) and 100 on the host.
    The iteration index is printed each time and the sum of squared
    magnitudes is stored after every round trip.

    Returns:
        (s1, s2): energies from the GPU loop and from the numpy loop.
    """
    side, n_rounds = 512, 100

    d = np.random.uniform(-1, 1, (side, side)).astype(np.complex64)
    d_g = OCLArray.from_array(d.astype(np.complex64))

    s1 = []
    for step in range(n_rounds):
        print(step)
        fft(d_g, inplace=True, fast_math=False)
        fft(d_g, inverse=True, inplace=True, fast_math=False)
        magnitude = np.abs(d_g.get())
        s1.append(np.sum(magnitude ** 2))

    s2 = []
    for step in range(n_rounds):
        print(step)
        spectrum = np.fft.fftn(d).astype(np.complex64)
        d = np.fft.ifftn(spectrum).astype(np.complex64)
        s2.append(np.sum(np.abs(d) ** 2))

    return s1, s2
def test_parseval():
    """Log the energy of a field after each of 100 fft round trips.

    The same random 512x512 complex64 start field is round-tripped
    (forward, then inverse fft) in place on the GPU with fast_math disabled,
    and separately on the host with numpy.  Each loop prints its iteration
    index and appends the sum of squared magnitudes.

    Returns:
        (s1, s2): per-iteration energies for the GPU and the numpy loop.
    """
    Nx = 512
    Nz = 100

    d = np.random.uniform(-1, 1, (Nx, Nx)).astype(np.complex64)
    d_g = OCLArray.from_array(d.astype(np.complex64))

    def energy(field):
        # sum of squared magnitudes
        return np.sum(np.abs(field) ** 2)

    gpu_energies = []
    host_energies = []

    for i in range(Nz):
        print(i)
        for inverse in (False, True):
            fft(d_g, inverse=inverse, inplace=True, fast_math=False)
        gpu_energies.append(energy(d_g.get()))

    for i in range(Nz):
        print(i)
        d = np.fft.fftn(d).astype(np.complex64)
        d = np.fft.ifftn(d).astype(np.complex64)
        host_energies.append(energy(d))

    return gpu_energies, host_energies
def _propagate_to_img(self, u0=None, im=None, free_prop=False, **kwargs):
    """Propagate the field plane by plane and store each plane in ``im``.

    Args:
        u0: initial field; defaults to ``self.u0_plane()``.
        im: destination image; defaults to ``self.result_im``.
        free_prop: if True (or if ``self.dn`` is None) the ``_mult_dn`` step
            is skipped.
        **kwargs: accepted but not used here.

    Returns:
        The destination image ``im``.
    """
    skip_medium = free_prop or (self.dn is None)
    res_type = Bpm3d._real_type  # not used further in this method

    if u0 is None:
        u0 = self.u0_plane()

    _, _, n_planes = self.shape

    if im is None:
        im = self.result_im

    self._buf_plane.write_array(u0)

    # plane 0 is the initial field itself
    self._img_xy.copy_buffer(self._buf_plane)
    self._copy_down_img_to_img(self._img_xy, im, 0)

    for i in range(n_planes - 1):
        for j in range(self.simul_z):
            # one propagation sub-step: to Fourier space, multiply by the
            # propagator buffer, back to real space
            fft(self._buf_plane, inplace=True, plan=self._plan)
            self._mult_complex(self._buf_plane, self._buf_H)
            fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

            if skip_medium:
                continue
            z_position = (i + (j + 1.) / self.simul_z)
            self._mult_dn(self._buf_plane, z_position, self.n0)

        self._img_xy.copy_buffer(self._buf_plane)
        self._copy_down_img_to_img(self._img_xy, im, i + 1)

    return im
def apply(self, data):
    """Return the scaled, centred fft magnitude of ``data``.

    ``data`` is cast to complex64 and padded with
    ``gputools.pad_to_power2`` (mode "wrap").  The fft magnitude is divided
    by the square root of the padded size, fftshifted and passed through
    ``gputools.pad_to_shape`` with the input's shape.  If ``self.log`` is
    truthy, ``log2(0.001 + result)`` is returned instead.
    """
    target_shape = data.shape

    padded = gputools.pad_to_power2(data.astype(np.complex64), mode="wrap")
    magnitude = abs(gputools.fft(padded))
    spectrum = 1. / np.sqrt(padded.size) * np.fft.fftshift(magnitude)
    spectrum = gputools.pad_to_shape(spectrum, target_shape)

    if not self.log:
        return spectrum
    return np.log2(0.001 + spectrum)
def apply(self, data):
    """Compute the normalised fft magnitude spectrum of ``data``.

    Steps: cast to complex64, ``gputools.pad_to_power2`` with mode "wrap",
    ``gputools.fft``, magnitude, fftshift, scale by 1/sqrt(padded size),
    then ``gputools.pad_to_shape`` to the shape of ``data``.  With
    ``self.log`` set, the result is mapped through ``log2(0.001 + x)``.
    """
    original_shape = data.shape
    work = gputools.pad_to_power2(data.astype(np.complex64), mode="wrap")
    scale = 1. / np.sqrt(work.size)
    work = scale * np.fft.fftshift(abs(gputools.fft(work)))
    work = gputools.pad_to_shape(work, original_shape)
    return np.log2(0.001 + work) if self.log else work
def __init__(self, psf: np.ndarray, psf_is_fftshifted: bool = False, n_iter=10):
    """Set up deconvolution buffers for a psf of a given shape.

    Args:
        psf: point spread function; its shape fixes ``self.shape``.
        psf_is_fftshifted: if False, ``psf`` is passed through
            ``np.fft.fftshift`` first.
        n_iter: stored as ``self.n_iter``.

    Creates ``self.psf_g`` and ``self.psfflip_f_g`` (both Fourier
    transformed in place), ``self.plan`` and the scratch buffer
    ``self.tmp_g``.
    """
    self.shape = psf.shape
    if not psf_is_fftshifted:
        psf = np.fft.fftshift(psf)

    self.n_iter = n_iter

    # The mirrored psf is the psf reversed along every axis.  The previous
    # hard-coded ``psf[::-1, ::-1]`` reversed only the first two axes, which
    # is the same for 2D input but raised for 1D and left the last axis of
    # a 3D psf unmirrored.
    psfflip = psf[(slice(None, None, -1),) * psf.ndim]

    self.psf_g = OCLArray.from_array(psf.astype(np.complex64))
    self.psfflip_f_g = OCLArray.from_array(psfflip.astype(np.complex64))
    self.plan = fft_plan(self.shape)

    # transform psf
    fft(self.psf_g, inplace=True)
    fft(self.psfflip_f_g, inplace=True)

    # get temp
    self.tmp_g = OCLArray.empty(psf.shape, np.complex64)
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """Run Lucy-Richardson iterations on device arrays via fft_convolve.

    ``h_g`` is Fourier transformed in place.  The mirrored psf is built from
    a host copy of ``h_g`` with its first two axes reversed.

    Returns:
        Device array with the final estimate.

    Raises:
        ValueError: if ``data_g`` and ``h_g`` have different shapes.
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    shape = data_g.shape

    # running estimate starts as a copy of the data; scratch holds
    # intermediate convolution results
    estimate_g = OCLArray.empty(shape, np.complex64)
    estimate_g.copy_buffer(data_g)
    scratch_g = OCLArray.empty(shape, np.complex64)

    # fix this: mirrored psf goes through the host
    mirrored_host = (h_g.get()[::-1, ::-1]).copy()
    mirrored_g = OCLArray.from_array(mirrored_host)

    # built as in the original, although no call below receives it
    plan = fft_plan(shape)

    # both kernels to Fourier space
    for kernel_g in (h_g, mirrored_g):
        fft(kernel_g, inplace=True)

    for step in range(Niter):
        logger.info("Iteration: {}".format(step))
        fft_convolve(estimate_g, h_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(data_g, scratch_g)
        fft_convolve(scratch_g, mirrored_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    return estimate_g
def _propagate_to_img(self, u0=None, im=None, free_prop=False, **kwargs):
    """Step the field through all planes, copying every plane into ``im``.

    ``u0`` falls back to ``self.u0_plane()`` and ``im`` to
    ``self.result_im``.  The ``_mult_dn`` step is skipped when ``free_prop``
    is set or ``self.dn`` is None.  Extra keyword arguments are ignored.

    Returns:
        The image ``im`` that received the planes.
    """
    free_prop = free_prop or (self.dn is None)
    res_type = Bpm3d._real_type  # kept from the original; unused below

    u0 = self.u0_plane() if u0 is None else u0
    Nx, Ny, Nz = self.shape
    im = self.result_im if im is None else im

    plane = self._buf_plane
    plane.write_array(u0)

    def store(index):
        # copy the current plane into slot ``index`` of the output image
        self._img_xy.copy_buffer(plane)
        self._copy_down_img_to_img(self._img_xy, im, index)

    # copy the first plane
    store(0)

    for i in range(Nz - 1):
        for j in range(self.simul_z):
            fft(plane, inplace=True, plan=self._plan)
            self._mult_complex(plane, self._buf_H)
            fft(plane, inplace=True, inverse=True, plan=self._plan)
            if not free_prop:
                self._mult_dn(plane, (i + (j + 1.) / self.simul_z), self.n0)
        store(i + 1)

    return im
def convolve(self, convolved_glyph_image):
    """Filter the magnitude of a glyph image with the two filter banks.

    Per the original note, the input is a complex64 image of shape
    <s, o, h, w>, and a distance and a fullness image are returned for
    each.  The magnitude is zero-padded on the last two axes, the
    orientation axis is tiled twice, and the result is multiplied in Fourier
    space (on the GPU) with ``self.filter1_fft`` and ``self.filter2_fft``.

    Returns:
        (filtered1, filtered2), each cropped on the last two axes to
        ``self.box_height`` x ``self.box_width``.
    """
    pad_top = int(np.ceil(self.hh))
    pad_left = int(np.ceil(self.ww))

    magnitude = np.abs(convolved_glyph_image)
    padding = [[0, 0], [0, 0], [pad_top, int(self.hh)], [pad_left, int(self.ww)]]
    padded_input = np.pad(magnitude, padding)
    padded_input = np.tile(padded_input, [1, 2, 1, 1])  # double up the orientations

    # A pure-numpy version of this convolution (fft2 / ifft2 with shifts on
    # axes [2, 3]) ran out of memory according to the original notes, hence
    # the GPU path below.

    # TODO MAKE GPU PLAN FIRST, and reuse it
    # TODO Make the broadcasting more efficient
    # TODO Reuse the filter bank across multiple letters.
    shifted = np.fft.ifftshift(padded_input, (-2, -1)).astype(np.complex64)
    first_ocl = gputools.OCLArray.from_array(shifted)
    second_ocl = gputools.OCLArray.from_array(shifted)

    for buf in (first_ocl, second_ocl):
        gputools.fft(buf, axes=(-2, -1), inplace=True)

    first_ocl *= gputools.OCLArray.from_array(self.filter1_fft)
    second_ocl *= gputools.OCLArray.from_array(self.filter2_fft)

    for buf in (first_ocl, second_ocl):
        gputools.fft(buf, axes=(-2, -1), inplace=True, inverse=True)

    # NOTE(review): this fftshift acts on all axes, whereas the ifftshift
    # above only touched the last two - confirm this is intended
    filtered1 = np.fft.fftshift(first_ocl.get())
    filtered2 = np.fft.fftshift(second_ocl.get())

    rows = slice(pad_top, int(self.box_height + np.ceil(self.hh)))
    cols = slice(pad_left, int(self.box_width + np.ceil(self.ww)))

    return (
        filtered1[:, :, rows, cols],
        filtered2[:, :, rows, cols],
    )
def _propagate_core(self, u0=None, dn_ind_start=0, dn_ind_end=1, dn_ind_offset=0, return_comp="field", return_shape="full", free_prop=False, dn_mean_method="none", **kwargs):
    """
    the core propagation method, the refractive index dn is assumed to be
    already residing in gpu memory

    the initial field is taken from self._buf_plane; u0 is not read by the
    code below (the write of u0 into the buffer is commented out)

    kwargs:
        dn_ind_start, dn_ind_end:
            Nz = dn_ind_end-dn_ind_start planes are produced,
            dn_ind_start has to be >= 0
        dn_ind_offset:
            added to the plane index when reading self.dn_mean
        return_comp in ["field", "intens"]
            selects complex or real result type, anything else raises
            ValueError
        return_shape in ["last", "full"]
            "full" returns all planes, "last" only the final buffer plane,
            anything else raises ValueError
        free_prop = False | True
            if True (or if self.dn is None) the dn multiplication and the
            propagator updates are skipped
        dn_mean_method = "none", "global", "local"

    returns a host array (u.get() or self._buf_plane.get())

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed listing - verify against the original file
    """
    print("mean method: ", dn_mean_method)
    free_prop = free_prop or (self.dn is None)

    if return_comp=="field":
        res_type = Bpm3d._complex_type
    elif return_comp=="intens":
        res_type = Bpm3d._real_type
    else:
        raise ValueError(return_comp)

    if not return_shape in ["last", "full"]:
        raise ValueError()

    Nx, Ny, _ = self.shape
    Nz = dn_ind_end-dn_ind_start

    assert dn_ind_start>=0

    # if not u0 is None:
    #     print "huhu"
    #     self._buf_plane.write_array(u0.astype(np.complex64,copy=False))

    if return_shape=="full":
        u = OCLArray.empty((Nz, Ny, Nx), dtype=res_type)

    # copy the first plane
    if return_shape=="full":
        if self._is_subsampled:
            self._img_xy.copy_buffer(self._buf_plane)
            self._copy_down_img(self._img_xy, u, 0)
        else:
            self._copy_down_buf(self._buf_plane, u, 0)

    # dn0 is the mean index offset currently baked into the propagator
    dn0 = 0
    if dn_mean_method=="local" and not self.dn is None and not free_prop:
        # running sums used by the "local" mean method
        self.intens_sum_g = OCLArray.from_array(np.ones(1,dtype=Bpm3d._real_type))
        self.intens_dn_sum_g = OCLArray.from_array((self.dn_mean[dn_ind_start+dn_ind_offset]*
                                                    np.ones(1)).astype(dtype=Bpm3d._real_type))
        #self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g)

    self._fill_propagator(self.n0)

    for i in range(Nz-1):
        for j in range(self.simul_z):
            # one sub-step: fft, multiply with the propagator buffer, inverse fft
            fft(self._buf_plane, inplace=True, plan=self._plan)
            self._mult_complex(self._buf_plane, self._buf_H)
            fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

            if not free_prop:
                #FIXME here we make a slight error for the first time point, as we
                #FIXME set dn0 first and the compute the new propagator
                if dn_mean_method=="local":
                    self._mult_dn_local(self._buf_plane, (i+dn_ind_start+(j+1.)/self.simul_z),
                                        self.intens_sum_g,
                                        self.intens_dn_sum_g,
                                        self.intens_g,
                                        self.intens_dn_g)
                else:
                    self._mult_dn(self._buf_plane, (i+dn_ind_start+(j+1.)/self.simul_z), dn0)

        # refresh the propagator for the next plane
        if not self.dn is None and not free_prop:
            if dn_mean_method=="local":
                self._kernel_reduction(self.intens_g, self.intens_dn_g,
                                       outs=[self.intens_sum_g, self.intens_dn_sum_g])
                self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g)
                #print(self.intens_dn_sum_g.get(), self.n0)
                #print("mean dn: ",self.intens_dn_sum_g.get()/self.intens_sum_g.get())
            elif dn_mean_method=="global":
                # only rebuild the propagator when the mean actually changed
                if self.dn_mean[i+dn_ind_start+dn_ind_offset]!=dn0:
                    dn0 = self.dn_mean[i+dn_ind_start+dn_ind_offset]
                    self._fill_propagator(self.n0+dn0)

        if return_shape=="full":
            # store plane i+1 at element offset (i+1)*(Nx*Ny) of u
            if self._is_subsampled and self.simul_xy!=self.shape[:2]:
                self._img_xy.copy_buffer(self._buf_plane)
                self._copy_down_img(self._img_xy, u, (i+1)*(Nx*Ny))
            else:
                self._copy_down_buf(self._buf_plane, u, (i+1)*(Nx*Ny))

    if return_shape=="full":
        return u.get()
    else:
        return self._buf_plane.get()
def convolve_spatial3(im, hs, mode="constant", plan=None, return_plan=False, pad_factor=2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im)  = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    pad_factor scales the block size before it is rounded up to the next
    power of two to give the patch size

    if plan is None a new fft plan for the patch shape is created;
    with return_plan = True the tuple (result, plan) is returned

    raises ValueError for wrong input dimensions and NotImplementedError
    if the image shape is not divisible by the grid shape
    """
    if im.ndim != 3 or hs.ndim != 6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:3])]):
        raise NotImplementedError("shape of image has to be divisible by Gx Gy = %s !" % (str(hs.shape[:3])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])

    # the size of each block within the grid
    # (floor division keeps these integers on Python 3; the divisibility
    # check above guarantees the division is exact)
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor * nb) for nb in Nblocks])

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # start coordinate of every block along each axis
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    print(Nblocks)

    # number of elements in one patch, used for the flat patch offsets
    patch_size = np.prod(Npatchs)

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3", Npatchs[::-1], None,
                        im_g,
                        np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
                        np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
                        np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
                        patches_g.data,
                        np.int32(i * patch_size +
                                 j * Gs[2] * patch_size +
                                 k * Gs[2] * Gs[1] * patch_size))

    print(patches_g.shape, h_g.shape)

    # convolution
    fft(patches_g, inplace=True, batch=np.prod(Gs), plan=plan)
    fft(h_g, inplace=True, batch=np.prod(Gs), plan=plan)
    prog.run_kernel("mult_inplace", (np.prod(Npatchs) * np.prod(Gs),), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=np.prod(Gs), plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def fft_gpu(d_g):
    """Run ``gputools.fft`` on ``d_g`` in place and return its result."""
    result = gputools.fft(d_g, inplace=True)
    return result
def _propagate(self, u0=None, offset=0, return_comp="field", return_shape="full", free_prop=False, slow_mean=False, **kwargs):
    """
    propagates the initial field u0 through the planes offset..Nz-1

    u0 defaults to self.u0_plane() and is cast to complex64

    kwargs:
        offset:
            index of the first plane, has to satisfy 0 <= offset < Nz-1
        return_comp in ["field", "intens"]
            selects complex or real result type, anything else raises
            ValueError
        return_shape in ["last", "full"]
            "full" returns all planes, "last" only the final buffer plane,
            anything else raises ValueError
        free_prop = False | True
            if True (or if self.dn is None) the dn multiplication and the
            propagator updates are skipped
        slow_mean = False | True
            if True, dn0 is recomputed for every plane from self.dn[i] and
            the current plane; raises NotImplementedError together with
            return_shape = "full"

    returns a host array (u.get() or self._buf_plane.get())

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed listing - verify against the original file
    """
    free_prop = free_prop or (self.dn is None)

    if return_comp=="field":
        res_type = Bpm3d._complex_type
    elif return_comp=="intens":
        res_type = Bpm3d._real_type
    else:
        raise ValueError(return_comp)

    if not return_shape in ["last", "full"]:
        raise ValueError()

    if u0 is None:
        u0 = self.u0_plane()

    u0 = u0.astype(np.complex64, copy=False)

    Nx, Ny, Nz = self.shape

    assert offset>=0 and offset<(Nz-1)

    if return_shape=="full":
        u = OCLArray.empty((Nz-offset, Ny, Nx), dtype=res_type)

    self._buf_plane.write_array(u0)

    # copy the first plane
    if return_shape=="full":
        if self._is_subsampled:
            self._img_xy.copy_buffer(self._buf_plane)
            self._copy_down_img(self._img_xy, u, 0)
        else:
            self._copy_down_buf(self._buf_plane, u, 0)

    # dn0 is the mean index offset currently baked into the propagator
    dn0 = 0

    for i in range(Nz-1-offset):
        if not self.dn is None and not free_prop:
            if slow_mean:
                if return_shape=="full":
                    raise NotImplementedError()
                else:
                    # pull the current plane into a temporary to weight dn with it
                    tmp = OCLArray.empty((1, Ny, Nx), dtype=res_type)
                    if self._is_subsampled:
                        self._img_xy.copy_buffer(self._buf_plane)
                        self._copy_down_img(self._img_xy, tmp, 0)
                    else:
                        self._copy_down_buf(self._buf_plane, tmp, 0)

                    # NOTE(review): self.dn is indexed with i here, while the
                    # other branch uses i+offset - confirm this is intended
                    dn0 = np.sum(np.abs(self.dn[i])*tmp.get())/np.sum(np.abs(self.dn[i])+1.e-10)
                    self._fill_propagator(self.n0+dn0)
            else:
                # only rebuild the propagator when the mean actually changed
                if self.dn_mean[i+offset]!=dn0:
                    dn0 = self.dn_mean[i+offset]
                    self._fill_propagator(self.n0+dn0)

        for j in range(self.simul_z):
            # one sub-step: fft, multiply with the propagator buffer, inverse fft
            fft(self._buf_plane, inplace=True, plan=self._plan)
            self._mult_complex(self._buf_plane, self._buf_H)
            fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)
            if not free_prop:
                self._mult_dn(self._buf_plane, (i+offset+(j+1.)/self.simul_z), dn0)

        if return_shape=="full":
            # store plane i+1 at element offset (i+1)*(Nx*Ny) of u
            if self._is_subsampled and self.simul_xy!=self.shape[:2]:
                self._img_xy.copy_buffer(self._buf_plane)
                self._copy_down_img(self._img_xy, u, (i+1)*(Nx*Ny))
            else:
                self._copy_down_buf(self._buf_plane, u, (i+1)*(Nx*Ny))

    if return_shape=="full":
        return u.get()
    else:
        return self._buf_plane.get()
def _single_batched(d, axes):
    """Transform ``d`` along ``axes`` with numpy and with ``fft``.

    Returns:
        (numpy result, ``fft`` result), in that order.
    """
    reference = np.fft.fftn(d, axes=axes)
    candidate = fft(d, axes=axes)
    return reference, candidate
#init_device(id_platform = 0, id_device = 1)


def report_str(success):
    """Return a colored tab-prefixed "[OK]" (blue) or "[FAIL]" (red) tag."""
    if success:
        return colored("\t[OK]", "blue")
    return colored("\t[FAIL]", "red")


def _compare_fft_np(d):
    """Return (numpy fftn of d, ``fft`` of d computed with fast_math=True)."""
    reference = np.fft.fftn(d)
    candidate = fft(d, fast_math=True)
    return reference, candidate


def test_compare():
    """Check ``fft`` against numpy for 1D, 2D and 3D shapes built from 32/64/128.

    Every shape gets a fresh uniform random complex64 input; the two results
    must agree within rtol=1 and atol=0.1.
    """
    sizes = [32, 64, 128]
    for ndim in (1, 2, 3):
        for dshape in product(sizes, repeat=ndim):
            d = np.random.uniform(-1, 1, dshape).astype(np.complex64)
            expected, actual = _compare_fft_np(d)
            print("validating fft of size", d.shape)
            npt.assert_allclose(expected, actual, rtol=1.e-0, atol=1.e-1)


if __name__ == '__main__':
    # test_compare()

    # manual comparison on one seeded 128x128 input
    dshape = (128, 128)
    np.random.seed(0)
    d = np.random.uniform(-1, 1, dshape).astype(np.complex64)

    res1 = np.fft.fftn(d)
    res2 = fft(d)
def _convolve_spatial2(im, hs, mode="constant", grid_dim=None, pad_factor=2, plan=None, return_plan=False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im)  = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    grid_dim:
        if given, the grid shape (Gy,Gx) is taken from it instead of from
        hs.shape[:2], and the psf patches are filled on the gpu by the
        kernel "fill_psf_grid2" from hs uploaded as float32
        (presumably hs is then a single 2d image the size of im - TODO confirm)

    pad_factor:
        scales the block size before it is rounded up to the next power of
        two to give the patch size

    plan:
        fft plan to reuse; if None a plan over the last two axes of the
        (Gy,Gx,Npatch_y,Npatch_x) patch array is created

    return_plan:
        if True, (result, plan) is returned instead of just the result

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed listing - verify against the original file
    """
    if grid_dim:
        Gs = tuple(grid_dim)
    else:
        Gs = hs.shape[:2]

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = Gs

    # the size of each block within the grid
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x, Npatch_y = _next_power_of_2(
        pad_factor * Nblock_x), _next_power_of_2(pad_factor * Nblock_y)

    # the boundary mode is compiled into the kernels as ADDRESSMODE
    prog = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Gy, Gx, Npatch_y, Npatch_x), axes=(-2, -1))

    # start coordinate of every block along x and y
    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    #prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros((Gy, Gx, Npatch_y, Npatch_x), np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        for i, _x0 in enumerate(x0s):
            for j, _y0 in enumerate(y0s):
                # last argument is the flat offset of patch (j,i) inside h_g
                prog.run_kernel(
                    "fill_psf_grid2", (Nblock_x, Nblock_y), None,
                    tmp_g.data,
                    np.int32(Nx),
                    np.int32(i * Nblock_x),
                    np.int32(j * Nblock_y),
                    h_g.data,
                    np.int32(Npatch_x),
                    np.int32(Npatch_y),
                    np.int32(-Nblock_x // 2 + Npatch_x // 2),
                    np.int32(-Nblock_y // 2 + Npatch_y // 2),
                    np.int32(i * Npatch_x * Npatch_y + j * Gx * Npatch_x * Npatch_y))
    else:
        hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)), axes=(2, 3))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    #prepare image
    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            # patch origin is chosen so that the patch is centred on its block;
            # last argument is the flat offset of patch (j,i) inside patches_g
            prog.run_kernel(
                "fill_patch2", (Npatch_x, Npatch_y), None,
                im_g,
                np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2),
                patches_g.data,
                np.int32(i * Npatch_x * Npatch_y + j * Gx * Npatch_x * Npatch_y))

    #return np.abs(patches_g.get())

    # convolution: transform patches and psfs, multiply, transform back
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    prog.run_kernel("mult_inplace", (Npatch_x * Npatch_y * Gx * Gy, ), None,
                    patches_g.data, h_g.data)
    fft(patches_g, inplace=True, inverse=True, plan=plan)

    logger.debug("Nblock_x: {}, Npatch_x: {}".format(Nblock_x, Npatch_x))

    #return np.abs(patches_g.get())

    #accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    # the interpolation kernel is launched for (Gy+1) x (Gx+1) grid positions
    for j in range(Gy + 1):
        for i in range(Gx + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial2(im, hs, mode="constant", plan=None, return_plan=False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im)  = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    if plan is None a new fft plan for a single (Npatch_y,Npatch_x) patch is
    created; with return_plan = True the tuple (result, plan) is returned

    raises ValueError for wrong input dimensions and NotImplementedError
    if the image shape is not divisible by the grid shape
    """
    if im.ndim != 2 or hs.ndim != 4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:2])]):
        raise NotImplementedError("shape of image has to be divisible by Gx Gy = %s shape mismatch" % (str(hs.shape[:2])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]

    # the size of each block within the grid
    # (floor division keeps these integers on Python 3; the divisibility
    # check above guarantees the division is exact)
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x, Npatch_y = _next_power_of_2(3 * Nblock_x), _next_power_of_2(3 * Nblock_y)

    print(Nblock_x, Npatch_x)

    hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)), axes=(2, 3))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # start coordinate of every block along x and y
    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    print(x0s)

    # number of elements in one patch, used for the flat patch offsets
    patch_size = Npatch_x * Npatch_y

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2", (Npatch_x, Npatch_y), None,
                            im_g,
                            np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                            np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2),
                            patches_g.data,
                            np.int32(i * patch_size + j * Gx * patch_size))

    # convolution
    fft(patches_g, inplace=True, batch=Gx * Gy, plan=plan)
    fft(h_g, inplace=True, batch=Gx * Gy, plan=plan)
    prog.run_kernel("mult_inplace", (Npatch_x * Npatch_y * Gx * Gy,), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=Gx * Gy, plan=plan)

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for i in range(Gx + 1):
        for j in range(Gy + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _bpm_3d_image(size, units, lam=.5, u0=None, dn=None,
                  subsample=1,
                  n0=1.,
                  return_scattering=False,
                  return_g=False,
                  return_full_last=False,
                  use_fresnel_approx=False,
                  ):
    """
    simulates the propagation of monochromatic wave of wavelength lam with
    initial conditions u0 along z in a media filled with dn

    size     -    the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    units    -    the unit lengths of each dimensions in microns
    lam      -    the wavelength
    u0       -    the initial field distribution, if u0 = None an incident
                  plane wave is assumed
    dn       -    the refractive index of the medium (can be complex),
                  either a host array or an OCLImage
    subsample -   integer factor by which the lateral grid is refined and
                  each z step is split
    n0       -    the background refractive index
    return_scattering - additionally return the scattering cross section
    return_g -    additionally return the g factor (needs return_scattering)
    return_full_last - additionally return the full (subsampled) last plane
    use_fresnel_approx - use the paraxial expansion of the propagator

    returns the tuple (u, dn, [p], [g], [last plane])

    raises ValueError if return_g is requested without return_scattering
    """
    # g is normalised by the scattering cross section p, which is only
    # computed when return_scattering is set
    if return_g and not return_scattering:
        raise ValueError("return_g requires return_scattering = True")

    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    # subsampling
    Nx2, Ny2, Nz2 = (subsample*N for N in size)
    dx2, dy2, dz2 = (1.*d/subsample for d in units)

    # setting up the propagator
    k0 = 2.*np.pi/lam

    kxs = 2.*np.pi*np.fft.fftfreq(Nx2, dx2)
    kys = 2.*np.pi*np.fft.fftfreq(Ny2, dy2)

    KY, KX = np.meshgrid(kys, kxs, indexing="ij")

    # evanescent components give a negative radicand and hence nan here;
    # they are zeroed below via outsideInds
    H0 = np.sqrt(n0**2*k0**2-KX**2-KY**2)

    if use_fresnel_approx:
        # paraxial expansion sqrt(n0^2 k0^2 - k_perp^2) ~ n0 k0 - k_perp^2/(2 n0 k0);
        # the previous expression (n0**2*k0 - .5*k_perp**2) mixed terms of
        # different physical dimension
        H0 = 0.j+n0*k0-.5*(KX**2+KY**2)/n0/k0

    outsideInds = np.isnan(H0)

    H = np.exp(-1.j*dz2*H0)

    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny2, Nx2), np.complex64)
    else:
        if subsample > 1:
            u0 = zoom(np.real(u0), subsample) + 1.j*zoom(np.imag(u0), subsample)

    # setting up the gpu buffers and kernels
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny2, Nx2))
    plane_g = OCLArray.from_array(u0.astype(np.complex64))

    h_g = OCLArray.from_array(H.astype(np.complex64))

    # defined up front so the kernel selection below never hits an unbound
    # name when dn is handed in as an OCLImage
    isComplexDn = False

    if dn is not None:
        if isinstance(dn, OCLImage):
            dn_g = dn
        else:
            isComplexDn = dn.dtype.type in (np.complex64, np.complex128)
            if isComplexDn:
                # complex dn is stored as a two channel float image (re, im)
                dn_complex = np.zeros(dn.shape+(2,), np.float32)
                dn_complex[..., 0] = np.real(dn)
                dn_complex[..., 1] = np.imag(dn)
                dn_g = OCLImage.from_array(dn_complex)
            else:
                dn_g = OCLImage.from_array(dn.astype(np.float32))
    else:
        # dummy dn
        dn_g = OCLArray.empty((1,)*3, np.float16)

    if return_scattering:
        cos_theta = np.real(H0)/n0/k0

        # = cos(theta)
        scatter_weights = cos_theta
        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2
        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        gfactor_g = OCLArray.zeros(Nz, "float32")

        plain_wave_dct = Nx2*Ny2*np.exp(-1.j*k0*n0*np.arange(Nz)*dz).astype(np.complex64)

        reduce_kernel = OCLReductionKernel(
            np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")

    u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.complex64)

    # store the initial plane at offset 0
    program.run_kernel("copy_subsampled_buffer", (Nx, Ny), None,
                       u_g.data, plane_g.data,
                       np.int32(subsample),
                       np.int32(0))

    clock.toc("setup")

    clock.tic("run")

    # both dn kernels take identical arguments, only the name differs
    dn_kernel = "mult_dn_complex_image" if isComplexDn else "mult_dn_image"

    for i in range(Nz-1):
        for substep in range(subsample):
            fft(plane_g, inplace=True, plan=plan)

            program.run_kernel("mult", (Nx2*Ny2,), None,
                               plane_g.data, h_g.data)

            # scattering quantities are taken in Fourier space on the last
            # substep of every plane
            if return_scattering and substep == (subsample-1):
                scatter_cross_sec_g[i+1] = reduce_kernel(plane_g,
                                                         scatter_weights_g,
                                                         plain_wave_dct[i+1])
                gfactor_g[i+1] = reduce_kernel(plane_g,
                                               gfactor_weights_g,
                                               plain_wave_dct[i+1])

            fft(plane_g, inplace=True, inverse=True, plan=plan)

            if dn is not None:
                program.run_kernel(dn_kernel, (Nx2, Ny2), None,
                                   plane_g.data, dn_g,
                                   np.float32(k0*dz2),
                                   np.float32(n0),
                                   np.int32(subsample*(i+1.)+substep),
                                   np.int32(subsample))

        program.run_kernel("copy_subsampled_buffer", (Nx, Ny), None,
                           u_g.data, plane_g.data,
                           np.int32(subsample),
                           np.int32((i+1)*Nx*Ny))

    clock.toc("run")

    # print() call instead of the Python 2 print statement
    print(clock)

    result = (u_g.get(), dn_g.get(),)

    if return_scattering:
        # normalizing prefactor
        prefac = 1./Nx2/Ny2*dx2*dy2
        p = prefac*scatter_cross_sec_g.get()
        result += (p,)

    if return_g:
        prefac = 1./Nx2/Ny2*dx2*dy2
        g = prefac*gfactor_g.get()/p
        result += (g,)

    if return_full_last:
        result += (plane_g.get(),)

    return result
def _bpm_3d2(size, units, lam=.5, u0=None, dn=None,
             subsample=1,
             n0=1.,
             return_scattering=False,
             return_g=False,
             return_full=True,
             return_field=True,
             use_fresnel_approx=False,
             absorbing_width=0,
             scattering_plane_ind=0,
             return_last_plane=False,
             store_dn_as_half=False):
    """
    simulates the propagation of monochromatic wave of wavelength lam with
    initial conditions u0 along z in a media filled with dn

    Every z step multiplies the plane with the propagator H in Fourier space
    and afterwards runs one of the "mult_dn*" kernels with dn in real space.

    size     - the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    units    - the unit lengths of each dimensions in microns
    lam      - the wavelength
    u0       - the initial field distribution, if u0 = None an incident
               plane wave is assumed
    dn       - the refractive index of the medium (can be complex), either a
               numpy array or an OCLArray that is then used as is
    subsample - has to be 1
    n0       - the refractive index of the surrounding medium
    return_scattering - additionally return the per-plane weighted reduction p
    return_g - additionally return g (only honoured if return_scattering is set)
    return_full  - return all Nz planes, otherwise only the last one
    return_field - return the complex field, otherwise float32 values written
               by the "copy_intens" kernel (or |u|^2 of the last plane)
    use_fresnel_approx - use the paraxial expression for the propagator
    absorbing_width, scattering_plane_ind - passed on to the dn kernels /
               the plane wave reference respectively
    return_last_plane - append the last complex plane to the result
    store_dn_as_half  - upload a real numpy dn as float16 instead of float32

    returns u, or a tuple (u, p), (u, p, g), each optionally extended by the
    last plane

    raises NotImplementedError if subsample != 1
    """
    if subsample != 1:
        raise NotImplementedError("subsample still has to be 1")

    clock = StopWatch()
    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    # setting up the propagator
    k0 = 2.*np.pi/lam

    kxs = 2.*np.pi*np.fft.fftfreq(Nx, dx)
    kys = 2.*np.pi*np.fft.fftfreq(Ny, dy)

    KY, KX = np.meshgrid(kys, kxs, indexing="ij")

    # real valued sqrt on purpose: frequencies outside the propagating range
    # become nan and are zeroed below
    H0 = np.sqrt(n0**2*k0**2-KX**2-KY**2)

    if use_fresnel_approx:
        H0 = 0.j+n0*k0-.5*(KX**2+KY**2)/n0/k0

    outsideInds = np.isnan(H0)

    H = np.exp(-1.j*dz*H0)

    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny, Nx), np.complex64)

    # setting up the gpu buffers and kernels
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny, Nx))
    plane_g = OCLArray.from_array(u0.astype(np.complex64, copy=False))
    h_g = OCLArray.from_array(H.astype(np.complex64))

    if dn is None:
        # dummy dn
        dn_g = OCLArray.empty((1,)*3, np.float32)
        dn_kernel = None
    else:
        if isinstance(dn, OCLArray):
            dn_g = dn
        elif dn.dtype.type in (np.complex64, np.complex128):
            dn_g = OCLArray.from_array(dn.astype(np.complex64, copy=False))
        elif store_dn_as_half:
            dn_g = OCLArray.from_array(dn.astype(np.float16, copy=False))
        else:
            dn_g = OCLArray.from_array(dn.astype(np.float32, copy=False))

        # pick the kernel from the dtype of the device buffer, so that a dn
        # that already is an OCLArray is handled as well (the kernel choice
        # does not change during the loop)
        dn_type = dn_g.dtype.type
        if dn_type in (np.complex64, np.complex128):
            dn_kernel = "mult_dn_complex"
        elif dn_type == np.float16:
            dn_kernel = "mult_dn_half"
        else:
            dn_kernel = "mult_dn"

    if return_scattering:
        # = cos(theta)
        cos_theta = np.real(H0)/n0/k0

        scatter_weights = cos_theta
        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2
        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        gfactor_g = OCLArray.zeros(Nz, "float32")

        # reference value that the reduction subtracts from element 0 of the
        # transformed plane
        plain_wave_dct = Nx*Ny*np.exp(-1.j*k0*n0*(scattering_plane_ind+np.arange(Nz))*dz).astype(np.complex64)

        reduce_kernel = OCLReductionKernel(
            np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")

    if return_full:
        if return_field:
            u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.complex64)
            u_g[0] = plane_g
        else:
            u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)
            program.run_kernel("copy_intens", (Nx*Ny,), None,
                               plane_g.data, u_g.data, np.int32(0))

    clock.toc("setup")

    clock.tic("run")

    for i in range(Nz-1):
        fft(plane_g, inplace=True, plan=plan)

        program.run_kernel("mult", (Nx*Ny,), None,
                           plane_g.data, h_g.data)

        if return_scattering:
            scatter_cross_sec_g[i+1] = reduce_kernel(plane_g,
                                                     scatter_weights_g,
                                                     plain_wave_dct[i+1])
            gfactor_g[i+1] = reduce_kernel(plane_g,
                                           gfactor_weights_g,
                                           plain_wave_dct[i+1])

        fft(plane_g, inplace=True, inverse=True, plan=plan)

        if dn_kernel is not None:
            program.run_kernel(dn_kernel, (Nx, Ny,), None,
                               plane_g.data, dn_g.data,
                               np.float32(k0*dz),
                               np.int32(Nx*Ny*(i+1)),
                               np.int32(absorbing_width))

        if return_full:
            if return_field:
                u_g[i+1] = plane_g
            else:
                program.run_kernel("copy_intens", (Nx*Ny,), None,
                                   plane_g.data, u_g.data,
                                   np.int32(Nx*Ny*(i+1)))

    clock.toc("run")

    print(clock)

    if return_full:
        u = u_g.get()
    else:
        u = plane_g.get()
        if not return_field:
            u = np.abs(u)**2

    if return_scattering:
        # normalizing prefactor
        prefac = 1./Nx/Ny*dx*dy
        p = prefac*scatter_cross_sec_g.get()
        result = (u, p)
        if return_g:
            # g is only defined relative to p, hence only computed here
            result += (prefac*gfactor_g.get()/p,)
    else:
        result = u

    if return_last_plane:
        if isinstance(result, tuple):
            result = result + (plane_g.get(),)
        else:
            result = (result, plane_g.get())

    return result
def _propagate_single(self, u0 = None,
                      return_full = True,
                      return_intensity = False,
                      absorbing_width = 0, **kwargs):
    """
    Propagate a single input plane through all z planes of the volume.

    :param u0: initial complex field distribution, if None, plane wave is assumed
    :param return_full: if True return every plane, else only the last one
    :param return_intensity: if True (and return_full) the planes are stored as
        float32 via the "fill_with_energy" kernel instead of as complex field
    :param absorbing_width: forwarded to the dn multiplication kernel
    :param kwargs: ignored
    :return: array of shape (Nz,Ny,Nx) if return_full, else the last complex plane
    """
    # plane wave if none
    if u0 is None:
        u0 = np.ones(self.size2d[::-1], np.complex64)

    Nx, Ny, Nz = self.size
    _, _, dz = self.units

    plane_g = OCLArray.from_array(u0.astype(np.complex64, copy = False))

    def store_plane(index):
        # writes the current plane to slot `index` of the output stack
        if return_intensity:
            self.bpm_program.run_kernel("fill_with_energy", (Nx*Ny,), None,
                                        u_g.data, plane_g.data,
                                        np.int32(index*Nx*Ny))
        else:
            u_g[index] = plane_g

    if return_full:
        stack_dtype = np.float32 if return_intensity else np.complex64
        u_g = OCLArray.empty((Nz, Ny, Nx), dtype = stack_dtype)
        store_plane(0)

    for index in range(1, Nz):
        # propagator step in Fourier space
        fft(plane_g, inplace = True, plan = self._plan)
        self.bpm_program.run_kernel("mult", (Nx*Ny,), None,
                                    plane_g.data, self._H_g.data)
        fft(plane_g, inplace = True, inverse = True, plan = self._plan)

        # refractive index step in real space
        if self.dn is not None:
            kernel_str = "mult_dn_complex" if self._is_complex_dn else "mult_dn"
            self.bpm_program.run_kernel(kernel_str, (Nx, Ny,), None,
                                        plane_g.data, self.dn_g.data,
                                        np.float32(self.k0*dz),
                                        np.int32(Nx*Ny*index),
                                        np.int32(absorbing_width))

        if return_full:
            store_plane(index)

    return u_g.get() if return_full else plane_g.get()
def _propagate_core(self, u0=None, dn_ind_start=0, dn_ind_end=1, dn_ind_offset=0, return_comp="field", return_shape="full", free_prop=False, dn_mean_method="none", **kwargs):
    """
    the core propagation method, the refractive index dn is assumed to be
    already residing in gpu memory

    The field is always taken from self._buf_plane and propagated in place;
    u0 is currently NOT written to that buffer (the corresponding code is
    commented out below), so the argument has no effect here.

    Nz = dn_ind_end - dn_ind_start planes are produced, i.e. Nz - 1
    propagation steps are done, each consisting of self.simul_z sub steps.

    kwargs:
        dn_ind_start, dn_ind_end: plane index range that is propagated
        dn_ind_offset: additional offset used when indexing self.dn_mean
        return_comp in ["field", "intens"]
        return_shape in ["last", "full"]
        free_prop = False | True  (forced to True if self.dn is None)
        dn_mean_method = "none", "global", "local"

    returns the (Nz, Ny, Nx) stack if return_shape == "full", otherwise the
    content of self._buf_plane after the last step

    raises ValueError for an unknown return_comp or return_shape
    """
    print("mean method: ", dn_mean_method)
    free_prop = free_prop or (self.dn is None)

    if return_comp == "field":
        res_type = Bpm3d._complex_type
    elif return_comp == "intens":
        res_type = Bpm3d._real_type
    else:
        raise ValueError(return_comp)

    if not return_shape in ["last", "full"]:
        raise ValueError()

    Nx, Ny, _ = self.shape
    Nz = dn_ind_end - dn_ind_start

    assert dn_ind_start >= 0

    # if not u0 is None:
    #     print "huhu"
    #     self._buf_plane.write_array(u0.astype(np.complex64,copy=False))

    if return_shape == "full":
        u = OCLArray.empty((Nz, Ny, Nx), dtype=res_type)

    # copy the first plane
    if return_shape == "full":
        if self._is_subsampled:
            self._img_xy.copy_buffer(self._buf_plane)
            self._copy_down_img(self._img_xy, u, 0)
        else:
            self._copy_down_buf(self._buf_plane, u, 0)

    # mean refractive index offset the propagator currently corresponds to
    dn0 = 0

    if dn_mean_method == "local" and not self.dn is None and not free_prop:
        # single element device buffers, updated by the reduction in the loop
        self.intens_sum_g = OCLArray.from_array(
            np.ones(1, dtype=Bpm3d._real_type))
        self.intens_dn_sum_g = OCLArray.from_array(
            (self.dn_mean[dn_ind_start + dn_ind_offset] * np.ones(1)).astype(dtype=Bpm3d._real_type))
        #self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g)

    # start with the propagator for the background index (matches dn0 = 0)
    self._fill_propagator(self.n0)

    for i in range(Nz - 1):
        for j in range(self.simul_z):
            # propagator step in Fourier space
            fft(self._buf_plane, inplace=True, plan=self._plan)
            self._mult_complex(self._buf_plane, self._buf_H)
            fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

            if not free_prop:
                #FIXME here we make a slight error for the first time point, as we
                #FIXME set dn0 first and then compute the new propagator
                if dn_mean_method == "local":
                    self._mult_dn_local(
                        self._buf_plane,
                        (i + dn_ind_start + (j + 1.) / self.simul_z),
                        self.intens_sum_g, self.intens_dn_sum_g,
                        self.intens_g, self.intens_dn_g)
                else:
                    self._mult_dn(self._buf_plane,
                                  (i + dn_ind_start + (j + 1.) / self.simul_z),
                                  dn0)

        # update the propagator for the next plane according to the mean method
        if not self.dn is None and not free_prop:
            if dn_mean_method == "local":
                self._kernel_reduction(
                    self.intens_g, self.intens_dn_g,
                    outs=[self.intens_sum_g, self.intens_dn_sum_g])
                self._fill_propagator_buf(self.n0, self.intens_dn_sum_g, self.intens_sum_g)
                #print(self.intens_dn_sum_g.get(), self.n0)
                #print("mean dn: ",self.intens_dn_sum_g.get()/self.intens_sum_g.get())
            elif dn_mean_method == "global":
                # only rebuild the propagator if the plane mean changed
                if self.dn_mean[i + dn_ind_start + dn_ind_offset] != dn0:
                    dn0 = self.dn_mean[i + dn_ind_start + dn_ind_offset]
                    self._fill_propagator(self.n0 + dn0)

        if return_shape == "full":
            # NOTE(review): this condition differs from the one used for the
            # first plane above (which only checks self._is_subsampled) - confirm
            if self._is_subsampled and self.simul_xy != self.shape[:2]:
                self._img_xy.copy_buffer(self._buf_plane)
                self._copy_down_img(self._img_xy, u, (i + 1) * (Nx * Ny))
            else:
                self._copy_down_buf(self._buf_plane, u, (i + 1) * (Nx * Ny))

    if return_shape == "full":
        return u.get()
    else:
        return self._buf_plane.get()
def bpm_3d_free(size, units, dz, lam = .5, u0 = None,
                n0 = 1.,
                use_fresnel_approx = False):
    """propagates the field u0 to distance dz

    size  - (Nx, Ny), the size of the plane in pixels
    units - (dx, dy), the pixel sizes
    dz    - the distance to propagate
    lam   - the wavelength
    u0    - the initial complex field of shape (Ny, Nx), if None a plane wave
            (all ones) is used
    n0    - accepted for interface compatibility, it does not enter the
            propagator that is built here
    use_fresnel_approx - use the paraxial expression for the propagator

    returns the propagated complex field of shape (Ny, Nx)
    """
    clock = StopWatch()

    clock.tic("setup")
    Nx, Ny = size
    dx, dy = units

    # setting up the propagator
    # frequencies in cycles per pixel, already in fft order. For even sizes
    # this equals the former arange(-N/2,N/2)/N followed by fftshift, for
    # odd sizes it additionally puts the zero frequency exactly on a bin
    kxs = np.fft.fftfreq(Nx)
    kys = np.fft.fftfreq(Ny)

    # "ij" indexing with (kys, kxs) gives arrays of shape (Ny, Nx), matching
    # the layout of the field plane
    KY, KX = np.meshgrid(kys, kxs, indexing= "ij")

    H0 = np.sqrt(0.j+(1./lam)**2-KX**2/dx**2-KY**2/dy**2)

    if use_fresnel_approx:
        H0 = 1./lam*(0.j+1.-.5*lam**2*(KX**2/dx**2+KY**2/dy**2))

    outsideInds = np.isnan(H0)

    H = np.exp(2.j*np.pi*dz*H0)
    H[outsideInds] = 0.

    H = H.astype(np.complex64)

    if u0 is None:
        u0 = np.ones((Ny,Nx),np.complex64)

    # setting up the gpu buffers and kernels
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))
    # program = OCLProgram(src_str = kernel_str)

    plan = ocl_fft_plan((Ny,Nx))
    plane_g = OCLArray.from_array(u0.astype(np.complex64))

    h_g = OCLArray.from_array(H.astype(np.complex64))

    clock.toc("setup")
    clock.tic("run")

    # multiply with the propagator in Fourier space
    fft(plane_g,inplace = True, plan = plan)

    program.run_kernel("mult",(Nx*Ny,),None,
                       plane_g.data,h_g.data)

    fft(plane_g,inplace = True, inverse = True, plan = plan)

    clock.toc("run")

    return plane_g.get()
def test_fft_np():
    """Smoke test: fft has to accept a plain 2d numpy array of ones."""
    side = 128
    ones_2d = np.ones((side, side))
    # only checks that the call goes through, the result is not inspected
    res = fft(ones_2d)
def _convolve_spatial2(im, hs,
                       mode = "constant",
                       grid_dim = None,
                       pad_factor = 2,
                       plan = None,
                       return_plan = False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    if grid_dim = (Gy,Gx) is given, hs is instead uploaded as a single float32
    array from which the "fill_psf_grid2" kernel fills one psf per block

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    pad_factor - the patch size is the next power of 2 of pad_factor*block size
    plan - an existing fft plan for shape (Npatch_y,Npatch_x), created if None
    return_plan - if True return (result, plan) instead of just the result
    """
    if grid_dim:
        Gs = tuple(grid_dim)
    else:
        Gs = hs.shape[:2]

    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = Gs

    # the size of each block within the grid
    # (integer division: the sizes are used as kernel sizes and offsets)
    Nblock_y, Nblock_x = Ny//Gy, Nx//Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(pad_factor*Nblock_x)
    Npatch_y = _next_power_of_2(pad_factor*Nblock_y)
    patch_size = Npatch_x*Npatch_y

    prog = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y,Npatch_x))

    x0s = Nblock_x*np.arange(Gx)
    y0s = Nblock_y*np.arange(Gy)

    patches_g = OCLArray.empty((Gy,Gx,Npatch_y,Npatch_x),np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros((Gy,Gx,Npatch_y,Npatch_x),np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy = False))
        for i in range(Gx):
            for j in range(Gy):
                prog.run_kernel("fill_psf_grid2",
                                (Nblock_x,Nblock_y),None,
                                tmp_g.data,
                                np.int32(Nx),
                                np.int32(i*Nblock_x),
                                np.int32(j*Nblock_y),
                                h_g.data,
                                np.int32(Npatch_x),
                                np.int32(Npatch_y),
                                np.int32(-Nblock_x//2+Npatch_x//2),
                                np.int32(-Nblock_y//2+Npatch_y//2),
                                np.int32(i*patch_size+j*Gx*patch_size)
                                )
    else:
        hs = np.fft.fftshift(pad_to_shape(hs,(Gy,Gx,Npatch_y,Npatch_x)),axes=(2,3))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    # prepare image
    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    for i,_x0 in enumerate(x0s):
        for j,_y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2",(Npatch_x,Npatch_y),None,
                            im_g,
                            np.int32(_x0+Nblock_x//2-Npatch_x//2),
                            np.int32(_y0+Nblock_y//2-Npatch_y//2),
                            patches_g.data,
                            np.int32(i*patch_size+j*Gx*patch_size))

    # convolution
    fft(patches_g,inplace=True, batch = Gx*Gy, plan = plan)
    fft(h_g,inplace=True, batch = Gx*Gy, plan = plan)
    prog.run_kernel("mult_inplace",(patch_size*Gx*Gy,),None,
                    patches_g.data, h_g.data)

    fft(patches_g,inplace=True, inverse = True, batch = Gx*Gy, plan = plan)

    print(Nblock_x, Npatch_x)

    # accumulate
    res_g = OCLArray.empty(im.shape,np.float32)

    for j in range(Gy+1):
        for i in range(Gx+1):
            prog.run_kernel("interpolate2",(Nblock_x,Nblock_y),None,
                            patches_g.data,res_g.data,
                            np.int32(i),np.int32(j),
                            np.int32(Gx),np.int32(Gy),
                            np.int32(Npatch_x),np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _compare_fft_np(d):
    """Return the tuple (np.fft.fftn(d), gputools.fft(d, fast_math=True))."""
    reference = np.fft.fftn(d)
    gpu_result = gputools.fft(d, fast_math=True)
    return reference, gpu_result
def _propagate_single(self,
                      u0=None,
                      return_full=True,
                      return_intensity=False,
                      absorbing_width=0,
                      **kwargs):
    """
    Propagates one input plane through the whole volume, plane by plane.

    :param u0: initial complex field distribution, if None, plane wave is assumed
    :param return_full: return all Nz planes if True, only the last plane otherwise
    :param return_intensity: with return_full, store float32 planes written by
        the "fill_with_energy" kernel instead of the complex field
    :param absorbing_width: handed to the dn multiplication kernel
    :param kwargs: not used
    :return: the (Nz, Ny, Nx) stack or the last complex plane
    """
    # plane wave if none
    if u0 is None:
        u0 = np.ones(self.size2d[::-1], np.complex64)

    Nx, Ny, Nz = self.size
    dx, dy, dz = self.units
    n_pixels = Nx * Ny

    plane_g = OCLArray.from_array(u0.astype(np.complex64, copy=False))

    # output stack with the initial plane in slot 0
    if return_full and return_intensity:
        u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)
        self.bpm_program.run_kernel("fill_with_energy", (n_pixels, ), None,
                                    u_g.data, plane_g.data, np.int32(0))
    elif return_full:
        u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.complex64)
        u_g[0] = plane_g

    for i in range(Nz - 1):
        z = i + 1

        # free space part: multiply with the propagator in Fourier space
        fft(plane_g, inplace=True, plan=self._plan)
        self.bpm_program.run_kernel("mult", (n_pixels, ), None,
                                    plane_g.data, self._H_g.data)
        fft(plane_g, inplace=True, inverse=True, plan=self._plan)

        # medium part: multiply with plane z of dn
        if self.dn is not None:
            if self._is_complex_dn:
                kernel_str = "mult_dn_complex"
            else:
                kernel_str = "mult_dn"
            self.bpm_program.run_kernel(kernel_str, (Nx, Ny, ), None,
                                        plane_g.data, self.dn_g.data,
                                        np.float32(self.k0 * dz),
                                        np.int32(n_pixels * z),
                                        np.int32(absorbing_width))

        if not return_full:
            continue

        if return_intensity:
            self.bpm_program.run_kernel("fill_with_energy", (n_pixels, ),
                                        None, u_g.data, plane_g.data,
                                        np.int32(z * n_pixels))
        else:
            u_g[z] = plane_g

    if return_full:
        return u_g.get()
    return plane_g.get()
def _convolve_spatial3(im, hs,
                       mode="constant",
                       grid_dim=None,
                       plan=None,
                       return_plan=False,
                       pad_factor=2):
    """
    spatial varying convolution of a 3d image with a 3d grid of psfs

    shape(im) = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx), or, if grid_dim = (Gz,Gy,Gx) is given,
                the same shape as im

    each image dimension has to be divisible by the respective grid dimension

    mode is one of the keys "constant", "wrap", "edge", "reflect" and selects
    the OpenCL address mode the kernels are built with

    plan - an existing fft plan, created if None
    return_plan - if True return (result, plan) instead of just the result
    pad_factor - the patch size is the next power of 2 of pad_factor*block size

    raises ValueError for wrong input dimensions and NotImplementedError if
    the image shape is not divisible by the grid
    """
    if im.ndim != 3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim == 6 or (hs.ndim == 3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
    else:
        # hs.ndim == 6 is guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n % g == 0 for n, g in zip(im.shape, Gs)]):
        # report the grid that was actually used (hs.shape[:2] is not the grid
        # if grid_dim is given and misses one dimension otherwise)
        raise NotImplementedError(
            "shape of image has to be divisible by Gz Gy Gx = %s shape mismatch" % (str(Gs)))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT",
                "edge": "CLK_ADDRESS_CLAMP_TO_EDGE",
                "reflect": "CLK_ADDRESS_MIRRORED_REPEAT"}

    Ns = im.shape

    # the size of each block within the grid
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([next_power_of_2(pad_factor * nb) for nb in Nblocks])
    patch_size = np.prod(Npatchs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Gs + Npatchs, axes=(-3, -2, -1))

    # the start coordinates of the blocks along z, y, x
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs + Npatchs, np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
            prog.run_kernel("fill_psf_grid3",
                            Nblocks[::-1], None,
                            tmp_g.data,
                            np.int32(im.shape[2]),
                            np.int32(im.shape[1]),
                            np.int32(i * Nblocks[2]),
                            np.int32(j * Nblocks[1]),
                            np.int32(k * Nblocks[0]),
                            h_g.data,
                            np.int32(Npatchs[2]),
                            np.int32(Npatchs[1]),
                            np.int32(Npatchs[0]),
                            np.int32(-Nblocks[2] // 2 + Npatchs[2] // 2),
                            np.int32(-Nblocks[1] // 2 + Npatchs[1] // 2),
                            np.int32(-Nblocks[0] // 2 + Npatchs[0] // 2),
                            np.int32(i * patch_size
                                     + j * Gs[2] * patch_size
                                     + k * Gs[2] * Gs[1] * patch_size))
    else:
        hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3", Npatchs[::-1], None,
                        im_g,
                        np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
                        np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
                        np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
                        patches_g.data,
                        np.int32(i * patch_size
                                 + j * Gs[2] * patch_size
                                 + k * Gs[2] * Gs[1] * patch_size))

    # convolution
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * np.prod(Gs), ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial3(im, hs,
                       mode = "constant",
                       grid_dim = None,
                       plan = None,
                       return_plan = False,
                       pad_factor = 2):
    """
    spatial varying convolution of a 3d image with a 3d grid of psfs

    shape(im) = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx), or, if grid_dim = (Gz,Gy,Gx) is given,
                the same shape as im

    each image dimension has to be divisible by the respective grid dimension

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan - an existing fft plan for the patch shape, created if None
    return_plan - if True return (result, plan) instead of just the result
    pad_factor - the patch size is the next power of 2 of pad_factor*block size

    raises ValueError for wrong input dimensions and NotImplementedError if
    the image shape is not divisible by the grid
    """
    if im.ndim !=3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim==6 or (hs.ndim==3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
    else:
        # hs.ndim == 6 is guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n%g==0 for n,g in zip(im.shape,Gs)]):
        # report the grid that was actually used (hs.shape[:2] is not the grid
        # if grid_dim is given and misses one dimension otherwise)
        raise NotImplementedError("shape of image has to be divisible by Gz Gy Gx = %s shape mismatch"%(str(Gs)))

    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ns = im.shape

    # the size of each block within the grid
    # (integer division: the sizes are used as kernel sizes and offsets)
    Nblocks = [n//g for n,g in zip(Ns,Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor*nb) for nb in Nblocks])
    patch_size = np.prod(Npatchs)
    n_batch = np.prod(Gs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    # the start coordinates of the blocks along z, y, x
    Xs = [nb*np.arange(g) for nb, g in zip(Nblocks,Gs)]

    patches_g = OCLArray.empty(Gs+Npatchs,np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs+Npatchs,np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy = False))
        for (k,_z0), (j,_y0),(i,_x0) in product(*[enumerate(X) for X in Xs]):
            prog.run_kernel("fill_psf_grid3",
                            Nblocks[::-1],None,
                            tmp_g.data,
                            np.int32(im.shape[2]),
                            np.int32(im.shape[1]),
                            np.int32(i*Nblocks[2]),
                            np.int32(j*Nblocks[1]),
                            np.int32(k*Nblocks[0]),
                            h_g.data,
                            np.int32(Npatchs[2]),
                            np.int32(Npatchs[1]),
                            np.int32(Npatchs[0]),
                            np.int32(-Nblocks[2]//2+Npatchs[2]//2),
                            np.int32(-Nblocks[1]//2+Npatchs[1]//2),
                            np.int32(-Nblocks[0]//2+Npatchs[0]//2),
                            np.int32(i*patch_size+
                                     j*Gs[2]*patch_size+
                                     k*Gs[2]*Gs[1]*patch_size))
    else:
        hs = np.fft.fftshift(pad_to_shape(hs,Gs+Npatchs),axes=(3,4,5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32,copy=False))

    # this loops over all i,j,k
    for (k,_z0), (j,_y0),(i,_x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3",Npatchs[::-1],None,
                        im_g,
                        np.int32(_x0+Nblocks[2]//2-Npatchs[2]//2),
                        np.int32(_y0+Nblocks[1]//2-Npatchs[1]//2),
                        np.int32(_z0+Nblocks[0]//2-Npatchs[0]//2),
                        patches_g.data,
                        np.int32(i*patch_size+
                                 j*Gs[2]*patch_size+
                                 k*Gs[2]*Gs[1]*patch_size))

    # convolution
    fft(patches_g,inplace=True, batch = n_batch, plan = plan)
    fft(h_g,inplace=True, batch = n_batch, plan = plan)
    prog.run_kernel("mult_inplace",(patch_size*n_batch,),None,
                    patches_g.data, h_g.data)

    fft(patches_g,
        inplace=True,
        inverse = True,
        batch = n_batch,
        plan = plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape,np.float32)

    for k, j, i in product(*[range(g+1) for g in Gs]):
        prog.run_kernel("interpolate3",Nblocks[::-1],None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i),np.int32(j),np.int32(k),
                        np.int32(Gs[2]),np.int32(Gs[1]),np.int32(Gs[0]),
                        np.int32(Npatchs[2]),np.int32(Npatchs[1]),np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial2(im, hs, mode="constant", plan=None, return_plan=False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan - an existing fft plan for shape (Npatch_y,Npatch_x), created if None
    return_plan - if True return (result, plan) instead of just the result

    raises ValueError for wrong input dimensions and NotImplementedError if
    the image shape is not divisible by the grid
    """
    if im.ndim != 2 or hs.ndim != 4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:2])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s shape mismatch" % (str(hs.shape[:2])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]

    # the size of each block within the grid
    # (integer division: the sizes are used as array shapes, kernel sizes and
    # offsets and therefore have to stay integers)
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(3 * Nblock_x)
    Npatch_y = _next_power_of_2(3 * Nblock_y)
    patch_size = Npatch_x * Npatch_y

    print(Nblock_x, Npatch_x)

    hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)),
                         axes=(2, 3))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # the start coordinates of the blocks
    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    print(x0s)

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2", (Npatch_x, Npatch_y), None,
                            im_g,
                            np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                            np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2),
                            patches_g.data,
                            np.int32(i * patch_size + j * Gx * patch_size))

    # convolution
    fft(patches_g, inplace=True, batch=Gx * Gy, plan=plan)
    fft(h_g, inplace=True, batch=Gx * Gy, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * Gx * Gy, ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=Gx * Gy, plan=plan)

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for i in range(Gx + 1):
        for j in range(Gy + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial3(im, hs,
                      mode="constant",
                      plan=None,
                      return_plan=False,
                      pad_factor=2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im) = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    plan - an existing fft plan for the patch shape, created if None
    return_plan - if True return (result, plan) instead of just the result
    pad_factor - the patch size is the next power of 2 of pad_factor*block size

    raises ValueError for wrong input dimensions and NotImplementedError if
    the image shape is not divisible by the grid
    """
    if im.ndim != 3 or hs.ndim != 6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:3])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s !" % (str(hs.shape[:3])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])

    # the size of each block within the grid
    # (integer division: the sizes are used as kernel sizes and offsets and
    # therefore have to stay integers)
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor * nb) for nb in Nblocks])
    patch_size = np.prod(Npatchs)
    n_batch = np.prod(Gs)

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # the start coordinates of the blocks along z, y, x
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    print(Nblocks)

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3", Npatchs[::-1], None,
                        im_g,
                        np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
                        np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
                        np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
                        patches_g.data,
                        np.int32(i * patch_size
                                 + j * Gs[2] * patch_size
                                 + k * Gs[2] * Gs[1] * patch_size))

    print(patches_g.shape, h_g.shape)

    # convolution
    fft(patches_g, inplace=True, batch=n_batch, plan=plan)
    fft(h_g, inplace=True, batch=n_batch, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * n_batch, ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=n_batch, plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _propagate(self, u0=None, offset=0,
               return_comp="field",
               return_shape="full",
               free_prop=False,
               slow_mean=False,
               **kwargs):
    """
    Propagates the plane u0 from plane index offset to the end of the volume.

    u0: the initial plane, self.u0_plane() is used if None
    offset: index of the plane the propagation starts at, 0 <= offset < Nz-1

    kwargs:
        return_comp in ["field", "intens"]
        return_shape in ["last", "full"]
        free_prop = False | True  (forced to True if self.dn is None)
        slow_mean = False | True  (recompute the mean index offset from the
                    current plane in every step, only for return_shape "last")

    returns the (Nz-offset, Ny, Nx) stack for return_shape "full", otherwise
    the content of self._buf_plane after the last step
    """
    free_prop = free_prop or (self.dn is None)

    if return_comp not in ("field", "intens"):
        raise ValueError(return_comp)
    res_type = Bpm3d._complex_type if return_comp == "field" else Bpm3d._real_type

    want_full = return_shape == "full"
    if not want_full and return_shape != "last":
        raise ValueError()

    if u0 is None:
        u0 = self.u0_plane()
    u0 = u0.astype(np.complex64, copy=False)

    Nx, Ny, Nz = self.shape

    assert 0 <= offset < (Nz - 1)

    def copy_down(via_image, target, flat_offset):
        # stores the current plane in target, either via the image or directly
        if via_image:
            self._img_xy.copy_buffer(self._buf_plane)
            self._copy_down_img(self._img_xy, target, flat_offset)
        else:
            self._copy_down_buf(self._buf_plane, target, flat_offset)

    if want_full:
        u = OCLArray.empty((Nz - offset, Ny, Nx), dtype=res_type)

    self._buf_plane.write_array(u0)

    # copy the first plane
    if want_full:
        copy_down(self._is_subsampled, u, 0)

    # mean index offset the propagator currently corresponds to
    dn0 = 0

    for step in range(Nz - 1 - offset):
        if self.dn is not None and not free_prop:
            if not slow_mean:
                plane_mean = self.dn_mean[step + offset]
                if plane_mean != dn0:
                    dn0 = self.dn_mean[step + offset]
                    self._fill_propagator(self.n0 + dn0)
            elif want_full:
                raise NotImplementedError()
            else:
                tmp = OCLArray.empty((1, Ny, Nx), dtype=res_type)
                copy_down(self._is_subsampled, tmp, 0)

                abs_dn = np.abs(self.dn[step])
                dn0 = np.sum(abs_dn * tmp.get()) / np.sum(np.abs(self.dn[step]) + 1.e-10)

                self._fill_propagator(self.n0 + dn0)

        for sub in range(self.simul_z):
            fft(self._buf_plane, inplace=True, plan=self._plan)
            self._mult_complex(self._buf_plane, self._buf_H)
            fft(self._buf_plane, inplace=True, inverse=True, plan=self._plan)

            if not free_prop:
                self._mult_dn(self._buf_plane,
                              (step + offset + (sub + 1.) / self.simul_z),
                              dn0)

        if want_full:
            copy_down(self._is_subsampled and self.simul_xy != self.shape[:2],
                      u, (step + 1) * (Nx * Ny))

    if want_full:
        return u.get()
    return self._buf_plane.get()
def convolve(self, input_image):
    """
    Convolve input_image with self.filter_bank by multiplication in Fourier
    space (FFTs are done with gputools).

    Input image should have dimensions <h, w> or <s, o, h, w> or
    <b, s, o, h, w, d>.
    Filter bank should have dimensions <s, o, h, w>

    Returns <s, o, h, w> for 2 and 4 dimensional input and
    <b, s, o, h, w, d> for 6 dimensional input, with h, w cropped to
    (self.box_height, self.box_width).

    Raises ValueError if input_image has any other number of dimensions.
    """
    n_dims = len(input_image.shape)
    if n_dims == 2:
        bdsohw_input_image = input_image[None, None, None, None, :, :]
    elif n_dims == 4:
        bdsohw_input_image = input_image[None, None, :, :, :, :]
    elif n_dims == 6:
        bdsohw_input_image = np.einsum("bsohwd->bdsohw", input_image)
    else:
        raise ValueError(
            "input_image has to have 2, 4 or 6 dimensions, got %d" % n_dims)

    # padding before/after along h and w, and the window that removes it again
    pad_h = [int(np.ceil(self.box_height / 2)), int(self.box_height / 2)]
    pad_w = [int(np.ceil(self.box_width / 2)), int(self.box_width / 2)]
    rows = slice(int(np.ceil(self.box_height / 2)),
                 int(self.box_height + np.ceil(self.box_height / 2)))
    cols = slice(int(np.ceil(self.box_width / 2)),
                 int(self.box_width + np.ceil(self.box_width / 2)))

    padded_input = np.pad(
        bdsohw_input_image,
        [[0, 0], [0, 0], [0, 0], [0, 0], pad_h, pad_w])
    padded_input = np.tile(
        padded_input, [1, 1, self.n_scales, self.n_orientations, 1, 1])

    padded_input_ocl = gputools.OCLArray.from_array(
        np.fft.ifftshift(padded_input, (-2, -1)).astype(np.complex64))

    # TODO MAKE GPU PLAN FIRST, and reuse it
    # TODO Make the broadcasting more efficient
    # TODO Reuse the filter bank across multiple letters.
    gputools.fft(padded_input_ocl, axes=(-2, -1), inplace=True)
    filter_bank_ocl_fft = gputools.OCLArray.from_array(
        self.filter_bank[None, None, :, :, :, :])
    padded_input_ocl *= filter_bank_ocl_fft
    gputools.fft(padded_input_ocl, axes=(-2, -1), inplace=True, inverse=True)

    # NOTE(review): this fftshift acts on ALL six axes (also b, d, s, o) and
    # the shifts below act on axes [2, 3] of arrays with a different number
    # of dimensions - kept as is, but the intended axes should be confirmed
    padded_result = np.fft.fftshift(padded_input_ocl.get())

    if n_dims == 6:
        presult = np.einsum("bdsohw->bsohwd",
                            np.fft.fftshift(padded_result, axes=[2, 3]))
        # after the einsum h and w are axes 3 and 4 (d is the last axis)
        return presult[:, :, :, rows, cols, :]

    # 2 and 4 dimensional input: return <s, o, h, w>
    presult = np.fft.fftshift(padded_result[0, 0, :, :, :, :], axes=[2, 3])
    return presult[:, :, rows, cols]