def __init__(self, ctx, queue, par, kwidth=3, overgridfactor=2,
             fft_dim=(1, 2), klength=200, DTYPE=np.complex64,
             DTYPE_real=np.float32):
    """Set up device buffers, the FFT plan and the OpenCL program.

    Parameters
    ----------
    ctx, queue :
        OpenCL context and command queue used for every buffer and kernel.
    par : dict
        Reads the keys "NScan", "NC", "NSlice", "N", "traj", "dcf" and
        "Nproj".
    kwidth, overgridfactor, klength :
        Forwarded to ``calckbkernel`` together with ``par["N"]``; half of
        ``kwidth`` is stored as ``self.kwidth``.
    fft_dim : tuple
        Axes handed to the FFT plan; ``fft_dim[0]`` also selects the
        trailing dimensions whose product defines ``self.fft_scale``.
    DTYPE, DTYPE_real :
        Complex and real dtypes used for the device arrays.
    """
    print("Setting up PyOpenCL NUFFT.")
    self.DTYPE = DTYPE
    self.DTYPE_real = DTYPE_real
    self.fft_shape = (par["NScan"] * par["NC"] * par["NSlice"],
                      par["N"], par["N"])
    self.traj = par["traj"]
    self.dcf = par["dcf"]
    self.Nproj = par["Nproj"]
    self.ctx = ctx
    self.queue = queue
    self.overgridfactor = overgridfactor

    self.kerneltable, self.kerneltable_FT, self.u = calckbkernel(
        kwidth, overgridfactor, par["N"], klength)
    self.kernelpoints = self.kerneltable.size
    self.fft_scale = DTYPE_real(
        np.sqrt(np.prod(self.fft_shape[fft_dim[0]:])))
    self.deapo = 1 / self.kerneltable_FT.astype(DTYPE_real)
    self.kwidth = kwidth / 2

    read_only_copy = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    self.cl_kerneltable = cl.Buffer(
        self.ctx, read_only_copy,
        hostbuf=self.kerneltable.astype(DTYPE_real).data)
    self.deapo_cl = cl.Buffer(
        self.ctx, read_only_copy, hostbuf=self.deapo.data)

    self.dcf = clarray.to_device(self.queue, self.dcf)
    self.traj = clarray.to_device(self.queue, self.traj)
    self.tmp_fft_array = clarray.empty(
        self.queue, self.fft_shape, dtype=DTYPE)

    # Alternating +1/-1 vector handed to the fftshift kernel.
    self.check = np.ones(par["N"], dtype=DTYPE_real)
    self.check[1::2] = -1
    self.check = clarray.to_device(self.queue, self.check)

    # Number of leading-axis entries transformed per FFT call; the plan
    # is built once on the first such slice of the temporary array.
    self.par_fft = int(self.fft_shape[0] / par["NScan"])
    self.fft = FFT(ctx, queue,
                   self.tmp_fft_array[0:self.par_fft, ...],
                   out_array=self.tmp_fft_array[0:self.par_fft, ...],
                   axes=fft_dim)

    self.gridsize = par["N"]
    self.fwd_NUFFT = self.NUFFT
    self.adj_NUFFT = self.NUFFTH

    # Read the kernel source through a context manager so the file handle
    # is closed deterministically instead of being left to the GC.
    kernel_path = resource_filename(
        'rrsg_cgreco', 'kernels/opencl_nufft_kernels.c')
    with open(kernel_path) as kernel_file:
        kernel_source = kernel_file.read()
    self.prg = Program(self.ctx, kernel_source)
def initClBuffers(self):
    """Allocate the device arrays and FFT plans used by the reconstruction.

    Reads ``self.Et``, ``self.I_w_tau``, ``self.N``, ``self.dtype_c``,
    ``self.ctx`` and ``self.q``. Creates N x N complex host arrays together
    with their device copies, one FFT plan per transform direction (both
    along axis 1), and finally calls ``initClBuffersGP``.
    """
    grid = (self.N, self.N)

    self.Et_cla = cla.to_device(self.q, self.Et)

    self.Esig_t_tau = np.zeros(grid, self.dtype_c)
    self.Esig_t_tau_cla = cla.to_device(self.q, self.Esig_t_tau)

    self.Esig_w_tau = np.zeros(grid, self.dtype_c)
    self.Esig_w_tau_cla = cla.to_device(self.q, self.Esig_w_tau)
    # Plan: Esig_t_tau -> Esig_w_tau
    self.Esig_w_tau_fft = FFT(self.ctx, self.q, (self.Esig_t_tau_cla,),
                              (self.Esig_w_tau_cla,), axes=[1])

    self.I_w_tau_cla = cla.to_device(self.q, self.I_w_tau)

    self.Esig_t_tau_p = np.zeros(grid, self.dtype_c)
    self.Esig_t_tau_p_cla = cla.to_device(self.q, self.Esig_t_tau_p)
    # Plan: Esig_w_tau -> Esig_t_tau_p
    self.Esig_t_tau_p_fft = FFT(self.ctx, self.q, (self.Esig_w_tau_cla,),
                                (self.Esig_t_tau_p_cla,), axes=[1])

    self.initClBuffersGP()
class FrogCalculation(object):
    """FROG pulse reconstruction with OpenCL kernels and NumPy fallbacks.

    Typical use, as far as this class shows: create the object, initialise a
    start field (``initPulseFieldRandom`` / ``initPulseFieldGaussian``), load
    or condition a measured trace (``loadFrogTrace`` /
    ``conditionFrogTrace``), then iterate with ``runCycleVanilla``,
    ``runCycleGP`` or ``runComplete``.

    The code is written to run on both Python 2 and Python 3: index
    arithmetic uses ``//`` and timing goes through ``_now``.
    """

    def __init__(self, useCPU=False):
        """Store defaults and create the OpenCL context via ``initCl``."""
        self.useCPU = useCPU
        self.useCL = True
        # There is a peculiarity in the fft calculation of a set of vectors
        # so that the first fft ends up at the end. Set this switch to roll
        # back the last line of the Esig_w_tau fft matrix.
        self.rollFFT = True

        self.dtype_c = np.complex64
        self.dtype_r = np.float32

        self.Esignal_w = None
        self.Esignal_t = None
        self.Et_cla = None

        self.initCl(useCPU=useCPU)

    @staticmethod
    def _now():
        """Return a timestamp in seconds for elapsed-time logging.

        ``time.clock`` was removed in Python 3.8, so ``time.perf_counter`` is
        preferred and ``time.clock`` is only used where the former is missing.
        """
        timer = getattr(time, "perf_counter", None) or time.clock
        return timer()

    @staticmethod
    def _vendorTag(platform):
        """Map a platform to "amd", "nvidia" or, for anything else, "intel"."""
        vendor = platform.vendor.lower()
        if "amd" in vendor:
            return "amd"
        if "nvidia" in vendor:
            return "nvidia"
        return "intel"

    def initCl(self, useCPU=False):
        """Pick OpenCL devices and create context, queue and kernels.

        Without ``useCPU`` the devices of the highest-ranked platform are
        used (amd > nvidia > everything else). With ``useCPU`` the first
        platform that reports CPU devices is used.

        Raises
        ------
        RuntimeError
            If no device could be selected.
        """
        root.debug("Initializing opencl")
        pl = cl.get_platforms()
        d = None
        v = None
        root.debug("".join(("Found ", str(len(pl)), " platforms")))
        vendorDict = {"amd": 3, "nvidia": 2, "intel": 1}
        if not useCPU:
            for p in pl:
                root.debug(p.vendor.lower())
                vTmp = self._vendorTag(p)
                if v is None or vendorDict[vTmp] > vendorDict[v]:
                    d = p.get_devices()
                    v = vTmp
        else:
            for p in pl:
                d = p.get_devices(device_type=cl.device_type.CPU)
                if d:
                    v = self._vendorTag(p)
                    break
        if not d:
            # Previously this case died with a TypeError while building the
            # debug message below (v was still None).
            raise RuntimeError("No usable OpenCL device found")
        root.debug("".join(("Using device ", str(d), " from ", str(v))))
        self.ctx = cl.Context(devices=d)
        self.q = cl.CommandQueue(self.ctx)
        self.progs = FrogClKernels.FrogClKernels(self.ctx)

    def initClBuffers(self):
        """Allocate the device arrays and FFT plans used by the reconstruction.

        Requires ``self.Et``, ``self.I_w_tau`` and ``self.N`` to be set.
        """
        grid = (self.N, self.N)

        self.Et_cla = cla.to_device(self.q, self.Et)

        self.Esig_t_tau = np.zeros(grid, self.dtype_c)
        self.Esig_t_tau_cla = cla.to_device(self.q, self.Esig_t_tau)

        self.Esig_w_tau = np.zeros(grid, self.dtype_c)
        self.Esig_w_tau_cla = cla.to_device(self.q, self.Esig_w_tau)
        # Plan: Esig_t_tau -> Esig_w_tau along axis 1
        self.Esig_w_tau_fft = FFT(self.ctx, self.q, (self.Esig_t_tau_cla,),
                                  (self.Esig_w_tau_cla,), axes=[1])

        self.I_w_tau_cla = cla.to_device(self.q, self.I_w_tau)

        self.Esig_t_tau_p = np.zeros(grid, self.dtype_c)
        self.Esig_t_tau_p_cla = cla.to_device(self.q, self.Esig_t_tau_p)
        # Plan: Esig_w_tau -> Esig_t_tau_p along axis 1
        self.Esig_t_tau_p_fft = FFT(self.ctx, self.q, (self.Esig_w_tau_cla,),
                                    (self.Esig_t_tau_p_cla,), axes=[1])

        self.initClBuffersGP()

    def initClBuffersGP(self):
        """Allocate the extra device arrays used by the GP algorithm."""
        # Gradient vector for the functional distance in the generalized
        # projection
        self.dZ_cla = cla.zeros(self.q, (self.N), self.dtype_c)

        # Vectors for intermediate results of the error minimization
        self.X0_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X1_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X2_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X3_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X4_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X5_cla = cla.zeros(self.q, (self.N), self.dtype_r)
        self.X6_cla = cla.zeros(self.q, (self.N), self.dtype_r)

        self.Esig_t_tau_norm = np.zeros((self.N, self.N), self.dtype_r)
        self.Esig_t_tau_norm_cla = cla.to_device(self.q, self.Esig_t_tau_norm)

    def _initAxes(self, N, t_res, l0):
        """Set ``N``, ``dw``, ``w``, ``w0``, ``dt`` and ``t``; return t_span."""
        self.N = np.int32(N)
        t_span = N * t_res

        # Frequency resolution required by the time span
        w_res = 2 * np.pi / t_span
        # Frequency span is given by the time resolution
        f_max = 1 / (2 * t_res)
        w_span = f_max * 2 * 2 * np.pi
        c = 299792458.0
        w0 = 2 * np.pi * c / l0
        w_spectrum = np.linspace(-w_span / 2, -w_span / 2 + w_res * N, N)
        self.dw = w_res
        self.w = w_spectrum
        self.w0 = w0

        # Time vector
        self.dt = t_res
        self.t = np.linspace(-t_span / 2, t_span / 2, N)
        return t_span

    def initPulseFieldRandom(self, N, t_res, l0, seed=0):
        """
        Initiate signal field with parameters:
        t_res: time resolution of the reconstruction
        N: number of points in time and wavelength axes
        l0: center wavelength
        seed: seed passed to np.random.seed before drawing the phase

        Creates the following variables:
        self.w
        self.dw
        self.t
        self.dt
        self.tau
        self.Et
        """
        np.random.seed(seed)
        t_span = self._initAxes(N, t_res, l0)

        self.tau_start_ind = 0
        self.tau_stop_ind = N - 1
        self.tau = self.t

        root.info("".join(("t_span ", str(t_span))))
        root.info("".join(("t_res ", str(t_res))))

        # Unit-amplitude field with uniformly random phase
        self.Et = (np.exp(1j * 2 * np.pi * np.random.rand(N))).astype(self.dtype_c)
        root.info("Finished")

    def initPulseFieldGaussian(self, N, t_res, l0, tau_pulse):
        """
        Initiate signal field with parameters:
        t_res: time resolution of the reconstruction
        N: number of points in time and wavelength axes
        l0: center wavelength
        tau_pulse: passed to the simulated pulse generator

        Creates the following variables:
        self.w
        self.dw
        self.t
        self.dt
        self.tau
        self.Et
        """
        t_span = self._initAxes(N, t_res, l0)

        p = sp.SimulatedFrogTrace(N, self.dt, tau=tau_pulse, l0=l0)
        p.pulse.generateGaussian(tau_pulse)

        self.tau_start_ind = 0
        self.tau_stop_ind = N - 1
        # tau keeps the linspace time vector; self.t is replaced below.
        self.tau = self.t

        root.info("".join(("t_span ", str(t_span))))
        root.info("".join(("t_res ", str(t_res))))

        # Amplitude of the simulated pulse with uniformly random phase
        self.Et = (np.abs(p.pulse.Et) * np.exp(1j * 2 * np.pi * np.random.rand(N))).astype(self.dtype_c)
        self.t = p.pulse.t
        self.p = p
        root.info("Finished")

    def loadFrogTrace(self, filename, thr=0.0, lStartPixel=0, lStopPixel=-1, tStartPixel=0, tStopPixel=-1):
        """Load a trace image plus its axis files and condition it.

        The first three "_"-separated parts of ``filename`` form the root to
        which "_timevector.txt", "_wavelengthvector.txt" and "_image.png" are
        appended. A stop pixel of -1 selects the last pixel of that axis.
        """
        fNameRoot = "_".join((filename.split("_")[0:3]))
        tData = np.loadtxt("".join((fNameRoot, "_timevector.txt")))
        tData = tData - tData.mean()
        # Wavelength file values are scaled by 1e-9
        lData = np.loadtxt("".join((fNameRoot, "_wavelengthvector.txt"))) * 1e-9
        pic = np.float32(imread("".join((fNameRoot, "_image.png"))))
        picN = pic / pic.max()

        if tStopPixel == -1:
            tStopPixel = picN.shape[0] - 1
        if lStopPixel == -1:
            lStopPixel = picN.shape[1] - 1

        picF = self.filterFrogTrace(picN, 3, thr)
        self.conditionFrogTrace(
            picF[tStartPixel:tStopPixel, lStartPixel:lStopPixel],
            lData[lStartPixel],
            lData[lStopPixel],
            tData[tStartPixel],
            tData[tStopPixel],
        )

    def conditionFrogTrace(self, Idata, l_start, l_stop, tau_start, tau_stop):
        """
        Take the measured intensity data and interpolate it to the
        internal w, tau grid.

        Idata.shape[0] = number of tau points
        Idata.shape[1] = number of spectrum points

        Stores the result in self.I_w_tau and returns
        (Idata_i, w_data, tau_data).
        """
        tau_data = np.linspace(tau_start, tau_stop, Idata.shape[0])
        if l_start > l_stop:
            w_start = 2 * np.pi * 299792458.0 / l_start
            w_stop = 2 * np.pi * 299792458.0 / l_stop
            w0 = 2 * np.pi * 299792458.0 / ((l_stop + l_start) / 2)
            Idata_i = Idata.copy()
        else:
            # Ascending wavelength: flip so the frequency axis ascends.
            w_start = 2 * np.pi * 299792458.0 / l_stop
            w_stop = 2 * np.pi * 299792458.0 / l_start
            w0 = 2 * np.pi * 299792458.0 / ((l_stop + l_start) / 2)
            Idata_i = np.fliplr(Idata).copy()
        root.debug("".join(("w_start: ", str(w_start))))
        root.debug("".join(("w_stop: ", str(w_stop))))

        w_data = np.linspace(w_start, w_stop, Idata.shape[1]) - w0
        Idata_i = np.flipud(Idata_i).copy()
        # Zero a two-pixel border on every side
        Idata_i[0:2, :] = 0.0
        Idata_i[-2:, :] = 0.0
        Idata_i[:, 0:2] = 0.0
        Idata_i[:, -2:] = 0.0
        Idata_i = Idata_i / Idata_i.max()

        root.info("Creating interpolator")
        t0 = self._now()
        Idata_interp = si.RectBivariateSpline(tau_data, w_data, Idata_i)
        root.info("".join(("Time spent: ", str(self._now() - t0))))

        root.info("".join(("Interpolating frog trace to ", str(self.tau.shape[0]), "x", str(self.w.shape[0]))))
        t0 = self._now()
        self.I_w_tau = np.fft.fftshift(np.maximum(Idata_interp(self.tau, self.w), 0.0), axes=1).astype(self.dtype_r)
        if self.rollFFT:
            self.I_w_tau = np.roll(self.I_w_tau, 1, axis=0)
        root.info("".join(("Time spent: ", str(self._now() - t0))))

        return Idata_i, w_data, tau_data

    def filterFrogTrace(self, Idata, kernel=5, thr=0.1):
        """Median-filter ``Idata``, subtract ``thr`` and clip at zero."""
        Idata_f = medfilt2d(Idata, kernel) - thr
        Idata_f[Idata_f < 0.0] = 0.0
        return Idata_f

    def generateEsig_t_tau_SHG(self):
        """
        Generate the time shifted E-field matrix for the SHG process.

        Output:
        self.Esig_t_tau, a n_tau x n_t matrix where each row is Esig(t,tau)
        """
        root.debug("Generating new Esig_t_tau from SHG")
        t0 = self._now()
        krn = self.progs.progs["generateEsig_t_tau_SHG"].generateEsig_t_tau_SHG
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Et_cla.data, self.Esig_t_tau_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau.shape, None)
        ev.wait()
        root.debug("".join(("Time spent: ", str(self._now() - t0))))

    def generateEsig_t_tau_SD(self):
        """
        Generate the time shifted E-field matrix for the SD process.

        Output:
        self.Esig_t_tau, a n_tau x n_t matrix where each row is Esig(t,tau)
        """
        root.debug("Generating new Esig_t_tau from SD")
        t0 = self._now()
        krn = self.progs.progs["generateEsig_t_tau_SD"].generateEsig_t_tau_SD
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Et_cla.data, self.Esig_t_tau_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau.shape, None)
        ev.wait()
        root.debug("".join(("Time spent: ", str(self._now() - t0))))

    def generateEsig_w_tau(self):
        """
        Generate the fft of the time shifted E(t)
        """
        root.debug("Generating Esig_w_tau")
        tic = self._now()
        if self.useCL:
            events = self.Esig_w_tau_fft.enqueue()
            for e in events:
                e.wait()
        else:
            # Host fallback: numpy FFT along axis 1, optionally rolled by one
            # row to match the row order of the OpenCL result (see rollFFT).
            Esig_t_tau = self.Esig_t_tau_cla.get()
            Esig_w_tau = np.fft.fft(Esig_t_tau, axis=1).astype(self.dtype_c)
            if self.rollFFT:
                Esig_w_tau = np.roll(Esig_w_tau, 1, axis=0)
            self.Esig_w_tau_cla.set(Esig_w_tau.copy())
        toc = self._now()
        root.debug("".join(("Time spent: ", str(toc - tic))))

    def applyIntensityData(self, I_w_tau=None):
        """Run the "applyIntensityData" kernel on Esig_w_tau and I_w_tau.

        ``I_w_tau`` is accepted for interface compatibility but not used; the
        device array ``self.I_w_tau_cla`` is always the data source.
        """
        root.debug("Applying intensity data from experiment")
        t0 = self._now()
        krn = self.progs.progs["applyIntensityData"].applyIntensityData
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Esig_w_tau_cla.data, self.I_w_tau_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_w_tau.shape, None)
        ev.wait()
        root.debug("".join(("Time spent: ", str(self._now() - t0))))

    def updateEt_vanilla(self, algo="SHG"):
        """Update Et from the inverse transform of Esig_w_tau.

        ``algo`` selects the "SD" variant; any other value uses SHG.
        """
        root.debug("Updating Et using vanilla algorithm")
        t0 = self._now()
        if self.useCL:
            events = self.Esig_t_tau_p_fft.enqueue(forward=False)
            for e in events:
                e.wait()
            if algo == "SD":
                krn = self.progs.progs["updateEtVanillaSumSD"].updateEtVanillaSumSD
                krn.set_scalar_arg_dtypes((None, None, np.int32))
                krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
                ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
                ev.wait()
                Et = self.Et_cla.get()
                self.Et_cla.set(-np.conj(Et).astype(self.dtype_c).copy())
            else:
                krn = self.progs.progs["updateEtVanillaSumSHG"].updateEtVanillaSumSHG
                krn.set_scalar_arg_dtypes((None, None, np.int32))
                krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
                ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
                ev.wait()

            # NOTE(review): normalisation is applied for both algorithms,
            # mirroring the host branch below - confirm against the
            # original indentation.
            krn = self.progs.progs["updateEtVanillaNorm"].updateEtVanillaNorm
            krn.set_scalar_arg_dtypes((None, np.int32))
            krn.set_args(self.Et_cla.data, self.N)
            ev = cl.enqueue_nd_range_kernel(self.q, krn, [1], None)
            ev.wait()
        else:
            self.Esig_t_tau_p_cla.set(np.fft.ifft(self.Esig_w_tau_cla.get(), axis=1).astype(self.dtype_c).copy())
            Esig_t_tau_p = self.Esig_t_tau_p_cla.get()
            if algo == "SD":
                Et = np.sqrt(Esig_t_tau_p.sum(axis=0))
            else:
                Et = Esig_t_tau_p.sum(axis=0)
            Et = Et / np.abs(Et).max()
            self.Et_cla.set(Et)
        root.debug("".join(("Time spent: ", str(self._now() - t0))))

    def gradZSHG_naive(self):
        """Host (for-loop) computation of the SHG gradient dZ."""
        root.debug("Calculating dZ for SHG using for loops")
        Et = self.Et_cla.get()
        Esigp = self.Esig_t_tau_p_cla.get()
        dZ = np.zeros_like(Et)
        N = Esigp.shape[0]
        half = N // 2  # integer offset; N / 2 is a float on Python 3
        sz = N * N
        for t0 in range(N):
            T = 0.0 + 1j * 0.0
            for tau in range(N):
                tp = t0 - (tau - half)
                if tp >= 0 and tp < N:
                    T += (Et[t0] * Et[tp] - Esigp[tau, t0]) * np.conj(Et[tp])
                tp = t0 + (tau - half)
                if tp >= 0 and tp < N:
                    T += (Et[t0] * Et[tp] - Esigp[tau, tp]) * np.conj(Et[tp])
            dZ[t0] = -T / sz
        self.dZ_cla.set(dZ.copy())

    def gradZSHG_gpu(self):
        """Run the "gradZSHG" kernel to fill ``self.dZ_cla``."""
        root.debug("Calculating dZ for SHG using gpu")
        krn = self.progs.progs["gradZSHG"].gradZSHG
        krn.set_scalar_arg_dtypes((None, None, None, np.int32))
        krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.dZ_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
        ev.wait()

    def gradZSD_naive(self):
        """Host (for-loop) computation of the SD gradient dZ."""
        # Todo: fix this algorithm
        root.debug("Calculating dZ for SD using for loops")
        Et = self.Et_cla.get()
        Esigp = self.Esig_t_tau_p_cla.get()
        dZ = np.zeros_like(Et)
        N = Esigp.shape[0]
        half = N // 2  # integer offset; N / 2 is a float on Python 3
        sz = N * N
        for t0 in range(N):
            T = 0.0 + 1j * 0.0
            for tau in range(N):
                tp = t0 - (tau - half)
                if tp >= 0 and tp < N:
                    EtEtp = np.conj(Et[t0]) * Et[tp]
                    T += 4 * (Et[t0] * np.conj(EtEtp) - Esigp[tau, t0]) * EtEtp
                tp = t0 + (tau - half)
                if tp >= 0 and tp < N:
                    EtpEtp = Et[tp] * Et[tp]
                    T += 2 * (Et[t0] * np.conj(EtpEtp) - np.conj(Esigp[tau, tp])) * EtpEtp
            dZ[t0] = -T / sz
        self.dZ_cla.set(dZ.copy())

    def gradZSD_gpu(self):
        """Run the "gradZSD" kernel to fill ``self.dZ_cla``."""
        root.debug("Calculating dZ for SD using gpu")
        krn = self.progs.progs["gradZSD"].gradZSD
        krn.set_scalar_arg_dtypes((None, None, None, np.int32))
        krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.dZ_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
        ev.wait()

    def minZerrKernSHG_naive(self):
        """Host computation of the five SHG error-polynomial coefficients.

        Returns an array whose first entry is the highest-order coefficient.
        """
        Et0 = self.Et_cla.get()
        Esig = self.Esig_t_tau_p_cla.get()
        dZ = self.dZ_cla.get()
        N = Esig.shape[0]
        half = N // 2  # integer offset; N / 2 is a float on Python 3

        mx = 0.0
        X = np.zeros(5)
        for t in range(N):
            for tau in range(N):
                T = np.abs(Esig[tau, t]) ** 2
                if mx < T:
                    mx = T
                tp = t - (tau - half)
                if tp >= 0 and tp < N:
                    dZdZ = dZ[t] * dZ[tp]
                    dZE = dZ[t] * Et0[tp] + dZ[tp] * Et0[t]
                    DEsig = Et0[t] * Et0[tp] - Esig[tau, t]

                    X[0] += np.abs(dZdZ) ** 2
                    X[1] += 2.0 * np.real(dZE * np.conj(dZdZ))
                    X[2] += 2.0 * np.real(DEsig * np.conj(dZdZ)) + np.abs(dZE) ** 2
                    X[3] += 2.0 * np.real(DEsig * np.conj(dZE))
                    X[4] += np.abs(DEsig) ** 2
        # Normalise by N^2 times the largest |Esig|^2
        T = N * N * mx
        X /= T

        root.debug("".join(("Esig_t_tau_p norm max: ", str(mx))))

        return X

    def minZerrKernSHG_gpu(self):
        """Device computation of the five SHG error-polynomial coefficients."""
        krn = self.progs.progs["minZerrSHG"].minZerrSHG
        krn.set_scalar_arg_dtypes((None, None, None, None, None, None, None, None, np.int32))
        krn.set_args(
            self.Esig_t_tau_p_cla.data,
            self.Et_cla.data,
            self.dZ_cla.data,
            self.X0_cla.data,
            self.X1_cla.data,
            self.X2_cla.data,
            self.X3_cla.data,
            self.X4_cla.data,
            self.N,
        )
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
        ev.wait()

        krn = self.progs.progs["normEsig"].normEsig
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau_p.shape, None)
        ev.wait()
        mx = cla.max(self.Esig_t_tau_norm_cla).get() * self.N * self.N

        X0 = cla.sum(self.X0_cla, queue=self.q).get() / mx
        X1 = cla.sum(self.X1_cla, queue=self.q).get() / mx
        X2 = cla.sum(self.X2_cla, queue=self.q).get() / mx
        X3 = cla.sum(self.X3_cla, queue=self.q).get() / mx
        X4 = cla.sum(self.X4_cla, queue=self.q).get() / mx

        root.debug("".join(("X0=", str(X0), ", type ", str(type(X0)))))

        root.debug(
            "".join(("Poly: ", str(X4), " x^4 + ", str(X3), " x^3 + ", str(X2), " x^2 + ", str(X1), " x + ", str(X0)))
        )
        # Polynomial in dZ (expansion of differential)
        X = np.array([X0, X1, X2, X3, X4]).astype(np.double)

        root.debug("".join(("Esig_t_tau_p norm max: ", str(mx / (self.N * self.N)))))

        return X

    def minZerrKernSD_naive(self):
        """Host computation of the seven SD error-polynomial coefficients."""
        # Todo: fix this algorithm
        Et0 = self.Et_cla.get()
        Esig = self.Esig_t_tau_p_cla.get()
        dZ = self.dZ_cla.get()
        N = Esig.shape[0]
        half = N // 2  # integer offset; N / 2 is a float on Python 3

        mx = 0.0
        X = np.zeros(7)
        for t in range(N):
            for tau in range(N):
                T = np.abs(Esig[tau, t]) ** 2
                if mx < T:
                    mx = T
                tp = t - (tau - half)
                if tp >= 0 and tp < N:
                    a0 = Esig[tau, t] - Et0[t] * Et0[t] * np.conj(Et0[tp])
                    a1 = -(2 * Et0[t] * dZ[t] * np.conj(Et0[tp]) + Et0[t] * Et0[t] * np.conj(dZ[tp]))
                    a2 = -(dZ[t] * dZ[t] * np.conj(Et0[tp]) + 2 * Et0[t] * np.conj(dZ[tp]) * dZ[t])
                    a3 = -dZ[t] * dZ[t] * np.conj(dZ[tp])

                    X[0] += np.real(a3 * np.conj(a3))
                    X[1] += np.real(a2 * np.conj(a3) + a3 * np.conj(a2))
                    X[2] += np.real(a1 * np.conj(a3) + a3 * np.conj(a1) + a2 * np.conj(a2))
                    X[3] += np.real(a0 * np.conj(a3) + a3 * np.conj(a0) + a1 * np.conj(a2) + a2 * np.conj(a1))
                    X[4] += np.real(a0 * np.conj(a2) + a2 * np.conj(a0) + a1 * np.conj(a1))
                    X[5] += np.real(a0 * np.conj(a1) + a1 * np.conj(a0))
                    X[6] += np.real(a0 * np.conj(a0))

        T = N * N * mx
        X = X / T

        root.debug("".join(("Esig_t_tau_p norm max: ", str(mx))))

        return X

    def minZerrKernSD_gpu(self):
        """Device computation of the seven SD error-polynomial coefficients."""
        krn = self.progs.progs["minZerrSD"].minZerrSD
        krn.set_scalar_arg_dtypes((None, None, None, None, None, None, None, None, None, None, np.int32))
        krn.set_args(
            self.Esig_t_tau_p_cla.data,
            self.Et_cla.data,
            self.dZ_cla.data,
            self.X0_cla.data,
            self.X1_cla.data,
            self.X2_cla.data,
            self.X3_cla.data,
            self.X4_cla.data,
            self.X5_cla.data,
            self.X6_cla.data,
            self.N,
        )
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
        ev.wait()

        krn = self.progs.progs["normEsig"].normEsig
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau_p.shape, None)
        ev.wait()
        mx = cla.max(self.Esig_t_tau_norm_cla).get() * self.N * self.N

        X0 = cla.sum(self.X0_cla, queue=self.q).get() / mx
        X1 = cla.sum(self.X1_cla, queue=self.q).get() / mx
        X2 = cla.sum(self.X2_cla, queue=self.q).get() / mx
        X3 = cla.sum(self.X3_cla, queue=self.q).get() / mx
        X4 = cla.sum(self.X4_cla, queue=self.q).get() / mx
        X5 = cla.sum(self.X5_cla, queue=self.q).get() / mx
        X6 = cla.sum(self.X6_cla, queue=self.q).get() / mx

        root.debug("".join(("X0=", str(X0), ", type ", str(type(X0)))))

        root.debug(
            "".join(
                (
                    "Poly: ",
                    str(X6),
                    " x^6 + ",
                    str(X5),
                    " x^5 + ",
                    str(X4),
                    " x^4 + ",
                    str(X3),
                    " x^3 + ",
                    str(X2),
                    " x^2 + ",
                    str(X1),
                    " x + ",
                    str(X0),
                )
            )
        )
        # Polynomial in dZ (expansion of differential)
        X = np.array([X0, X1, X2, X3, X4, X5, X6]).astype(np.double)

        root.debug("".join(("Esig_t_tau_p norm max: ", str(mx / (self.N * self.N)))))

        return X

    def updateEt_gp(self, algo="SHG"):
        """Update Et with one generalized-projection step.

        Returns the square root of the minimised polynomial value.

        Raises
        ------
        ValueError
            If ``algo`` is neither "SHG" nor "SD".
        """
        root.debug("Updating Et using GP algorithm")
        tic = self._now()

        events = self.Esig_t_tau_p_fft.enqueue(forward=False)
        for e in events:
            e.wait()

        # Gradient of the functional distance, then the error minimization
        # polynomial, each on the device or on the host.
        if algo == "SHG":
            if self.useCL:
                self.gradZSHG_gpu()
                p1 = self.minZerrKernSHG_gpu()
            else:
                self.gradZSHG_naive()
                p1 = self.minZerrKernSHG_naive()
        elif algo == "SD":
            if self.useCL:
                self.gradZSD_gpu()
                p1 = self.minZerrKernSD_gpu()
            else:
                self.gradZSD_naive()
                p1 = self.minZerrKernSD_naive()
        else:
            # Previously this fell through to a NameError on p1.
            raise ValueError("".join(("Unknown algo: ", str(algo))))
        root.debug(
            "".join(
                (
                    "Poly: ",
                    str(p1[4]),
                    " x^4 + ",
                    str(p1[3]),
                    " x^3 + ",
                    str(p1[2]),
                    " x^2 + ",
                    str(p1[1]),
                    " x + ",
                    str(p1[0]),
                )
            )
        )

        # Root finding of the polynomial in the gradient expansion
        p = np.polyder(p1)
        r = np.roots(p)
        X = r[np.abs(r.imag) < 1e-9].real
        root.debug("".join(("Real roots: ", str(X))))
        Z1 = np.polyval(p1, X)
        minZInd = Z1.argmin()

        Z = np.maximum(3e-16 * X[-1], Z1[minZInd])
        Z = np.sqrt(Z)
        X = X[minZInd].astype(self.dtype_r)

        # Update Et
        if self.useCL:
            krn = self.progs.progs["updateEtGP"].updateEtGP
            krn.set_scalar_arg_dtypes((None, None, self.dtype_r, np.int32))
            krn.set_args(self.Et_cla.data, self.dZ_cla.data, X, self.N)
            ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
            ev.wait()
        else:
            root.debug("".join(("Moving distance X=", str(X))))
            Et = self.Et_cla.get()
            Et_new = Et + X * self.dZ_cla.get()
            self.Et_cla.set(Et_new.copy())

        toc = self._now()
        root.debug("".join(("Time spent: ", str(toc - tic))))
        return Z

    def centerPeakTime(self):
        """Circularly shift Et so its largest magnitude sits at index N // 2."""
        Et = self.Et_cla.get()
        ind = np.argmax(abs(Et))
        # np.roll needs an integer shift; "/" gives a float on Python 3.
        shift = Et.shape[0] // 2 - ind
        Et = np.roll(Et, shift)
        self.Et_cla.set(Et)

    def calcReconstructionError(self):
        """Return the RMS difference between measured and reconstructed trace.

        The reconstructed intensity is scaled so that both maxima agree.
        """
        root.debug("Calculating reconstruction error")
        tic = self._now()
        Esig_w_tau = self.Esig_w_tau_cla.get()
        I_rec_w_tau = np.real(Esig_w_tau * np.conj(Esig_w_tau))
        I_w_tau = self.I_w_tau_cla.get()
        my = I_w_tau.max() / I_rec_w_tau.max()
        root.debug("".join(("My=", str(my))))
        G = np.sqrt(((I_w_tau - my * I_rec_w_tau) ** 2).sum() / (I_rec_w_tau.shape[0] * I_rec_w_tau.shape[1]))
        toc = self._now()
        root.debug("".join(("Time spent: ", str(toc - tic))))
        return G

    def getData(self):
        """Read four device arrays back to the host; the results are discarded."""
        root.debug("Retrieving data from opencl buffers")
        tic = self._now()
        self.Esig_t_tau_cla.get()
        self.Et_cla.get()
        self.Esig_t_tau_p_cla.get()
        self.Esig_w_tau_cla.get()
        toc = self._now()
        root.debug("".join(("Time spent: ", str(toc - tic))))

    def getTraceAbs(self):
        """Center the pulse and return the magnitude of Et."""
        self.centerPeakTime()
        return np.abs(self.Et_cla.get())

    def getTracePhase(self):
        """Center the pulse and return the phase of Et relative to its center."""
        self.centerPeakTime()
        Et = self.Et_cla.get()
        ph0 = np.angle(Et[Et.shape[0] // 2])
        return np.angle(Et) - ph0

    def getT(self):
        """Return the time vector ``self.t``."""
        return self.t

    def setupVanillaAlgorithm(self):
        """Allocate the device buffers if that has not happened yet."""
        if self.Et_cla is None:
            self.initClBuffers()

    def runCycleVanilla(self, cycles=1, algo="SHG", useCL=None):
        """Run ``cycles`` iterations of the vanilla algorithm.

        ``useCL``, when given, overrides both ``self.useCL`` and
        ``self.rollFFT``. Returns an array with the error G of each cycle.
        """
        root.debug("Starting FROG reconstruction cycle using the vanilla algorithm")
        if useCL is not None:
            self.useCL = useCL
            self.rollFFT = useCL
        t0 = self._now()
        er = []
        self.setupVanillaAlgorithm()
        for c in range(cycles):
            root.debug("".join(("Cycle ", str(c + 1), "/", str(cycles))))
            if algo == "SD":
                self.generateEsig_t_tau_SD()
            else:
                self.generateEsig_t_tau_SHG()
            self.generateEsig_w_tau()
            G = self.calcReconstructionError()
            self.applyIntensityData()
            # Use the requested algorithm for the update as well; this used
            # to be hard-coded to "SD" even when Esig was generated for SHG.
            self.updateEt_vanilla(algo)
            root.debug("-------------------------------------------")
            root.debug("".join(("Error G = ", str(G))))
            root.debug("-------------------------------------------")
            er.append(G)
        deltaT = self._now() - t0
        root.debug("".join(("Total runtime ", str(deltaT))))
        root.debug("".join((str(cycles / deltaT), " iterations/s")))
        print("".join((str(cycles / deltaT), " iterations/s")))
        return np.array(er)

    def setupGPAlgorithm(self):
        """Allocate the device buffers if that has not happened yet."""
        if self.Et_cla is None:
            self.initClBuffers()

    def runCycleGP(self, cycles=1, algo="SHG", useCL=None):
        """Run ``cycles`` iterations of the GP algorithm.

        ``useCL``, when given, overrides both ``self.useCL`` and
        ``self.rollFFT``. Returns an array with the error G of each cycle.
        """
        root.debug("Starting FROG reconstruction cycle using the GP algorithm")
        if useCL is not None:
            self.useCL = useCL
            self.rollFFT = useCL
        t0 = self._now()
        er = []
        self.setupGPAlgorithm()
        for c in range(cycles):
            root.debug("".join(("Cycle ", str(c + 1), "/", str(cycles))))
            if algo == "SD":
                self.generateEsig_t_tau_SD()
            else:
                self.generateEsig_t_tau_SHG()
            self.generateEsig_w_tau()
            G = self.calcReconstructionError()
            self.applyIntensityData()
            self.updateEt_gp(algo)
            root.debug("-------------------------------------------")
            root.debug("".join(("Error G = ", str(G))))
            root.debug("-------------------------------------------")
            er.append(G)
        deltaT = self._now() - t0
        root.debug("".join(("Total runtime ", str(deltaT))))
        root.debug("".join((str(cycles / deltaT), " iterations/s")))
        print("".join((str(cycles / deltaT), " iterations/s")))
        return np.array(er)

    def runComplete(self):
        """Run 30 vanilla cycles, then GP epochs of 30 cycles each.

        GP epochs repeat while the best error improves by more than 1e-5,
        for at most 20 additional epochs.
        """
        tic = self._now()
        er = self.runCycleVanilla(30)
        oldEr = np.min(er)
        er = self.runCycleGP(30)
        newEr = np.min(er)
        epochs = 0
        while oldEr - newEr > 1e-5 and epochs < 20:
            oldEr = newEr
            er = self.runCycleGP(30)
            newEr = np.min(er)
            epochs += 1
            # Joined with single spaces to reproduce the spacing of the
            # former multi-argument print statements.
            print(" ".join(("Epoch ", str(epochs), ", error ", str(newEr))))
        print(" ".join(("Epochs: ", str(epochs))))
        toc = self._now()
        print(" ".join(("Total reconstruction time ", str(toc - tic), " s")))
# Micro-benchmark: time a 512x512 complex64 FFT along axis 1 on an OpenCL
# device, the device-to-host transfer, and the equivalent numpy FFT.

# time.clock() was removed in Python 3.8; prefer perf_counter when available
# and fall back to clock only on interpreters that lack it.
_timer = getattr(time, "perf_counter", None) or time.clock

pl = cl.get_platforms()
# Platform 1 is used when present; on single-platform machines fall back to
# platform 0 instead of failing with an IndexError.
d = pl[min(1, len(pl) - 1)].get_devices()
context = cl.Context(devices=d)
# context = cl.create_some_context()
queue = cl.CommandQueue(context)

dataRe = np.random.rand(512, 512)
dataIm = np.random.rand(512, 512)
nd_dataC = (dataRe + 1j * dataIm).astype(np.complex64)
dataC = cla.to_device(queue, nd_dataC)
nd_result = np.zeros_like(nd_dataC, dtype=np.complex64)
resultC = cla.to_device(queue, nd_result)

transform = FFT(context, queue, (dataC,), (resultC,), axes=[1])

# Device FFT, timed until every returned event has completed
tic = _timer()
events = transform.enqueue()
for e in events:
    e.wait()
toc = _timer()
clTime = toc - tic
# Joined with a single space to reproduce the former print-statement spacing
print(" ".join(("clTime: ", str(clTime))))

# Device -> host transfer
tic = _timer()
resultCl = resultC.get()
toc = _timer()
print(" ".join(("transfer time: ", str(toc - tic))))

# Reference FFT on the host
ticNp = _timer()
resultNp = np.fft.fft(nd_dataC, axis=1).astype(np.complex64)
tocNp = _timer()
class PyOpenCLNUFFT:
    """Non-uniform FFT built from PyOpenCL kernels and an FFT plan.

    ``NUFFT`` / ``fwd_NUFFT`` goes from gridded data ``sg`` to samples ``s``;
    ``NUFFTH`` / ``adj_NUFFT`` goes the other way. Both enqueue their work on
    ``self.queue`` and return the event of the last kernel.
    """

    def __init__(self, ctx, queue, par, kwidth=3, overgridfactor=2,
                 fft_dim=(1, 2), klength=200, DTYPE=np.complex64,
                 DTYPE_real=np.float32):
        """Set up device buffers, the FFT plan and the OpenCL program.

        Parameters
        ----------
        ctx, queue :
            OpenCL context and command queue used for all buffers/kernels.
        par : dict
            Reads the keys "NScan", "NC", "NSlice", "N", "traj", "dcf" and
            "Nproj".
        kwidth, overgridfactor, klength :
            Forwarded to ``calckbkernel`` together with ``par["N"]``; half
            of ``kwidth`` is stored as ``self.kwidth``.
        fft_dim : tuple
            Axes handed to the FFT plan; ``fft_dim[0]`` also selects the
            trailing dimensions whose product defines ``self.fft_scale``.
        DTYPE, DTYPE_real :
            Complex and real dtypes used for the device arrays.
        """
        print("Setting up PyOpenCL NUFFT.")
        self.DTYPE = DTYPE
        self.DTYPE_real = DTYPE_real
        self.fft_shape = (par["NScan"] * par["NC"] * par["NSlice"],
                          par["N"], par["N"])
        self.traj = par["traj"]
        self.dcf = par["dcf"]
        self.Nproj = par["Nproj"]
        self.ctx = ctx
        self.queue = queue
        self.overgridfactor = overgridfactor

        self.kerneltable, self.kerneltable_FT, self.u = calckbkernel(
            kwidth, overgridfactor, par["N"], klength)
        self.kernelpoints = self.kerneltable.size
        self.fft_scale = DTYPE_real(
            np.sqrt(np.prod(self.fft_shape[fft_dim[0]:])))
        self.deapo = 1 / self.kerneltable_FT.astype(DTYPE_real)
        self.kwidth = kwidth / 2

        read_only_copy = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
        self.cl_kerneltable = cl.Buffer(
            self.ctx, read_only_copy,
            hostbuf=self.kerneltable.astype(DTYPE_real).data)
        self.deapo_cl = cl.Buffer(
            self.ctx, read_only_copy, hostbuf=self.deapo.data)

        self.dcf = clarray.to_device(self.queue, self.dcf)
        self.traj = clarray.to_device(self.queue, self.traj)
        self.tmp_fft_array = clarray.empty(
            self.queue, self.fft_shape, dtype=DTYPE)

        # Alternating +1/-1 vector handed to the fftshift kernel.
        self.check = np.ones(par["N"], dtype=DTYPE_real)
        self.check[1::2] = -1
        self.check = clarray.to_device(self.queue, self.check)

        # Number of leading-axis entries transformed per FFT call; the plan
        # is built once on the first such slice of the temporary array.
        self.par_fft = int(self.fft_shape[0] / par["NScan"])
        self.fft = FFT(ctx, queue,
                       self.tmp_fft_array[0:self.par_fft, ...],
                       out_array=self.tmp_fft_array[0:self.par_fft, ...],
                       axes=fft_dim)

        self.gridsize = par["N"]
        self.fwd_NUFFT = self.NUFFT
        self.adj_NUFFT = self.NUFFTH

        # Read the kernel source through a context manager so the file
        # handle is closed deterministically.
        kernel_path = resource_filename(
            'rrsg_cgreco', 'kernels/opencl_nufft_kernels.c')
        with open(kernel_path) as kernel_file:
            kernel_source = kernel_file.read()
        self.prg = Program(self.ctx, kernel_source)

    def __del__(self):
        """Drop the references to device resources held by this object.

        Attributes are removed in the original order but tolerate being
        absent, so a partially failed ``__init__`` does not raise here.
        """
        for name in ("traj", "dcf", "tmp_fft_array", "cl_kerneltable", "fft",
                     "deapo_cl", "check", "queue", "ctx"):
            self.__dict__.pop(name, None)

    def _zero_tmp(self, s, sg, wait_for):
        """Enqueue the kernel that zeroes the temporary FFT array."""
        self.tmp_fft_array.add_event(
            self.prg.zero_tmp(
                self.queue, (self.tmp_fft_array.size, ), None,
                self.tmp_fft_array.data,
                wait_for=(s.events + sg.events
                          + self.tmp_fft_array.events + wait_for)))

    def _shifted_fft(self, n_batches, forward):
        """Enqueue fftshift, ``n_batches`` in-place FFTs, then fftshift."""
        self.tmp_fft_array.add_event(
            self.prg.fftshift(
                self.queue, self.fft_shape, None,
                self.tmp_fft_array.data, self.check.data))
        for j in range(n_batches):
            lo = j * self.par_fft
            hi = (j + 1) * self.par_fft
            self.tmp_fft_array.add_event(
                self.fft.enqueue_arrays(
                    data=self.tmp_fft_array[lo:hi, ...],
                    result=self.tmp_fft_array[lo:hi, ...],
                    forward=forward)[0])
        self.tmp_fft_array.add_event(
            self.prg.fftshift(
                self.queue, self.fft_shape, None,
                self.tmp_fft_array.data, self.check.data))

    def NUFFTH(self, sg, s, wait_for=None):
        """Adjoint transform: grid the samples ``s`` and write into ``sg``.

        Parameters
        ----------
        sg :
            Output device array; indexed with five dimensions here.
        s :
            Input device array holding the samples.
        wait_for : list, optional
            Events every enqueued kernel additionally waits for.

        Returns
        -------
        The event of the final ``deapo_adj`` kernel.
        """
        # A None default avoids sharing one mutable list between calls.
        wait_for = [] if wait_for is None else list(wait_for)

        # Zero tmp arrays
        self._zero_tmp(s, sg, wait_for)
        # Grid k-space
        self.tmp_fft_array.add_event(
            self.prg.grid_lut(
                self.queue,
                (s.shape[0], s.shape[1] * s.shape[2],
                 s.shape[-2] * self.gridsize),
                None,
                self.tmp_fft_array.data,
                s.data,
                self.traj.data,
                np.int32(self.gridsize),
                self.DTYPE_real(self.kwidth / self.gridsize),
                self.dcf.data,
                self.cl_kerneltable,
                np.int32(self.kernelpoints),
                wait_for=(wait_for + sg.events + s.events
                          + self.tmp_fft_array.events)))
        # FFT
        self._shifted_fft(s.shape[0], forward=False)
        # Deapodization and scaling
        return self.prg.deapo_adj(
            self.queue,
            (sg.shape[0] * sg.shape[1] * sg.shape[2],
             sg.shape[3], sg.shape[4]),
            None,
            sg.data,
            self.tmp_fft_array.data,
            self.deapo_cl,
            np.int32(self.tmp_fft_array.shape[-1]),
            self.DTYPE_real(self.fft_scale),
            self.DTYPE_real(self.overgridfactor),
            wait_for=(wait_for + sg.events + s.events
                      + self.tmp_fft_array.events))

    def NUFFT(self, s, sg, wait_for=None):
        """Forward transform: resample gridded ``sg`` onto the samples ``s``.

        Parameters
        ----------
        s :
            Output device array receiving the samples.
        sg :
            Input device array; indexed with five dimensions here.
        wait_for : list, optional
            Events every enqueued kernel additionally waits for.

        Returns
        -------
        The event of the final ``invgrid_lut`` kernel.
        """
        # A None default avoids sharing one mutable list between calls.
        wait_for = [] if wait_for is None else list(wait_for)

        # Zero tmp arrays
        self._zero_tmp(s, sg, wait_for)
        # Deapodization and Scaling
        self.tmp_fft_array.add_event(
            self.prg.deapo_fwd(
                self.queue,
                (sg.shape[0] * sg.shape[1] * sg.shape[2],
                 sg.shape[3], sg.shape[4]),
                None,
                self.tmp_fft_array.data,
                sg.data,
                self.deapo_cl,
                np.int32(self.tmp_fft_array.shape[-1]),
                self.DTYPE_real(1 / self.fft_scale),
                self.DTYPE_real(self.overgridfactor),
                wait_for=wait_for + sg.events + self.tmp_fft_array.events))
        # FFT
        self._shifted_fft(s.shape[0], forward=True)
        # Resample on Spoke
        return self.prg.invgrid_lut(
            self.queue,
            (s.shape[0], s.shape[1] * s.shape[2],
             s.shape[-2] * self.gridsize),
            None,
            s.data,
            self.tmp_fft_array.data,
            self.traj.data,
            np.int32(self.gridsize),
            self.DTYPE_real(self.kwidth / self.gridsize),
            self.dcf.data,
            self.cl_kerneltable,
            np.int32(self.kernelpoints),
            wait_for=s.events + wait_for + self.tmp_fft_array.events)
# Minimal gpyfft example: transform a (4, 1024, 1024) complex64 array over
# axes (2, 1) on an OpenCL device and copy the result back to the host.
import numpy as np
import pyopencl as cl
import pyopencl.array as cla
from gpyfft.fft import FFT

# Device setup
context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Host input (all zeros unless filled below), then its device copy
data_host = np.zeros(shape=(4, 1024, 1024), dtype=np.complex64)
# data_host[:] = some_useful_data
data_gpu = cla.to_device(queue, data_host)

# Only one array is handed to the plan; the result is read back from that
# same device array further down.
transform = FFT(context, queue, data_gpu, axes=(2, 1))

# Unpacking requires enqueue() to return exactly one event.
(event,) = transform.enqueue()
event.wait()

result_host = data_gpu.get()