def cufft_conv(x, y):
    """Multiply the FFTs of two equally shaped arrays on the GPU and invert.

    Both inputs are converted to ``complex64``, transformed with cuFFT,
    multiplied element-wise in the frequency domain and transformed back
    (the inverse transform is called with ``scale=True``).

    :param x: first input array
    :param y: second input array, must have the same shape as ``x``
    :return: the ``complex64`` host array read back from the GPU, or ``-1``
        when the shapes of ``x`` and ``y`` differ (sentinel kept for
        backward compatibility with existing callers)
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # Check the shapes before doing any conversion or GPU work.
    if x.shape != y.shape:
        return -1

    x = x.astype(np.complex64)
    y = y.astype(np.complex64)

    # A complex-to-complex plan carries no direction, so a single plan serves
    # both the forward and the inverse transforms.
    plan = fft.Plan(x.shape, np.complex64, np.complex64)

    buffers = []
    try:
        x_gpu = gpuarray.to_gpu(x)
        buffers.append(x_gpu)
        y_gpu = gpuarray.to_gpu(y)
        buffers.append(y_gpu)
        x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64)
        buffers.append(x_fft)
        y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64)
        buffers.append(y_fft)
        out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64)
        buffers.append(out_gpu)

        fft.fft(x_gpu, x_fft, plan)
        fft.fft(y_gpu, y_fft, plan)
        # overwrite=True stores the product in the second operand (y_fft).
        linalg.multiply(x_fft, y_fft, overwrite=True)
        fft.ifft(y_fft, out_gpu, plan, scale=True)
        return out_gpu.get()
    finally:
        # Release device memory even when one of the GPU calls raised.
        for buf in buffers:
            buf.gpudata.free()
def propagate_eager(self, wavelength, wavefront):
    """
    'Not-Too-Good' version of the propagation on the GPU (lots of memory issues...).
    Remove in the future.

    The complex pupil is Fourier transformed, replicated ``self.N_slices``
    times, multiplied by the slicer masks, inverse transformed, multiplied by
    the pupil mirror masks and transformed again. Free GPU memory is printed
    at several points along the way.

    :param wavelength: key into ``self.pupil_masks``; also divides the phase
    :param wavefront: phase map applied to the pupil as
        ``exp(1j * 2 * pi * wavefront / wavelength)``
    :return: ``fftshift`` of the sum over slices of the squared modulus of the
        final field
    """
    N = self.N_PIX

    free, total = cuda.mem_get_info()
    print("Free: %.2f percent" % (free / total * 100))

    # Pupil Plane -> Image Slicer
    # The phase term uses `wavefront`. The previous code put the pupil mask
    # itself in the exponent and never read the `wavefront` argument.
    pupil_mask = self.pupil_masks[wavelength]
    complex_pupil = pupil_mask * np.exp(1j * 2 * np.pi * wavefront / wavelength)
    complex_pupil_gpu = gpuarray.to_gpu(np.asarray(complex_pupil, np.complex64))
    plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
    cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

    # Add N_slices copies to be masked; the single frame is released first
    # so that it does not sit on the GPU next to the stacked copy.
    complex_slicer_cpu = complex_pupil_gpu.get()
    complex_pupil_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("*Free: %.2f percent" % (free / total * 100))

    complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
    complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
    slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
    # overwrite=True leaves the product in the second operand.
    clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
    slicer_masks_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("**Free: %.2f percent" % (free / total * 100))

    # Slicer -> Pupil Mirror (batched plan: one (N, N) transform per slice)
    plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
    cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
    # NOTE(review): the whole `pupil_mirror_masks_fft` attribute is uploaded
    # here, not indexed by wavelength - confirm this is intended.
    mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
    clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

    # Pupil Mirror -> Slits
    cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
    slits = complex_slicer_gpu.get()
    complex_slicer_gpu.gpudata.free()
    mirror_mask_gpu.gpudata.free()

    slit = fftshift(np.sum((np.abs(slits))**2, axis=0))
    free, total = cuda.mem_get_info()
    print("***Free: %.2f percent" % (free / total * 100))
    return slit
def _impl_test_multiply(self, N, dtype):
    """Check ``linalg.multiply`` against NumPy's ``*`` on random N x N data.

    :param N: side length of the square test matrices
    :param dtype: element type; complex types also get a random imaginary part
    """
    def random_square(size, kind):
        # Uniform random square matrix cast to the requested dtype.
        return np.asarray(np.random.rand(size, size), kind)

    lhs = random_square(N, dtype)
    rhs = random_square(N, dtype)
    if np.iscomplexobj(lhs):
        lhs += 1j * random_square(N, dtype)
        rhs += 1j * random_square(N, dtype)

    product_gpu = linalg.multiply(gpuarray.to_gpu(lhs), gpuarray.to_gpu(rhs))
    assert np.allclose(lhs * rhs, product_gpu.get())
def filter(self):
    """Filter ``self.series[0]`` with the window ``self.series[1]`` on the GPU.

    A fresh CUDA context is created for the call. Both series are zero-padded
    to ``determine_size(len(signal) + len(window) - 1)`` samples, transformed
    with a real-to-complex FFT, multiplied element-wise, scaled by 2.0 and
    transformed back with a complex-to-complex inverse FFT (``scale=True``).

    :return: host array of dtype ``self.precision['complex']`` holding the
        inverse-transformed product
    """
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context

    cuda.init()
    context = make_default_context()
    try:
        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        real_t = self.precision['float']
        complex_t = self.precision['complex']
        nfft = determine_size(len(signal) + len(window) - 1)

        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=real_t)
        win_zero_pad = np.zeros(nfft, dtype=real_t)
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape, dtype=real_t)
        win_gpu = gpuarray.zeros(win_zero_pad.shape, dtype=real_t)
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        win_fft_gpu = gpuarray.zeros(nfft, dtype=complex_t)
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape, real_t, complex_t)
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape, real_t, complex_t)
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, complex_t, complex_t)
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), complex_t)
        out_gpu.get(out_np)
        return out_np
    finally:
        # Always pop the context pushed by make_default_context(), otherwise
        # a failure above would leave it on this thread's context stack.
        context.pop()
def filter_fft_cuda(signal: np.array, window: np.array, prec: dict):
    """
    Filter a series with a window through FFTs computed with pycuda / skcuda.
    Importing ``pycuda.autoinit`` inside the function initialises pycuda.

    :param signal: The input series
    :param window: The input window
    :param prec: Mapping with the ``'float'`` and ``'complex'`` dtypes to use
    :return: The filtered signal as a host array of dtype ``prec['complex']``
    """
    import pycuda.autoinit  # Here because it initialises a new cuda environment every trial.
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    import skcuda.linalg as linalg

    linalg.init()
    nfft = determine_size(len(signal) + len(window) - 1)
    real_t = prec['float']

    def upload_padded(samples):
        # Zero-pad `samples` to nfft on the host, then copy it to the device.
        host = np.zeros(nfft, dtype=real_t)
        device = gpuarray.zeros(host.shape, dtype=real_t)
        host[0:len(samples)] = samples
        device.set(host)
        return device

    signal_dev = upload_padded(signal)
    window_dev = upload_padded(window)

    # Forward real-to-complex transforms
    complex_t = prec['complex']
    signal_spec = gpuarray.zeros(nfft, dtype=complex_t)
    window_spec = gpuarray.zeros(nfft, dtype=complex_t)
    signal_plan = cu_fft.Plan(signal_spec.shape, real_t, complex_t)
    window_plan = cu_fft.Plan(window_spec.shape, real_t, complex_t)
    cu_fft.fft(signal_dev, signal_spec, signal_plan)
    cu_fft.fft(window_dev, window_spec, window_plan)

    # Product of the spectra, then a factor of two
    product_spec = linalg.multiply(signal_spec, window_spec, overwrite=True)
    linalg.scale(2.0, product_spec)

    # Inverse complex-to-complex transform and copy back to the host
    result_dev = gpuarray.zeros_like(product_spec)
    inverse_plan = cu_fft.Plan(product_spec.shape, complex_t, complex_t)
    cu_fft.ifft(product_spec, result_dev, inverse_plan, True)
    result_host = np.zeros(len(result_dev), complex_t)
    result_dev.get(result_host)
    return result_host
def logis(y,x):
    """Iteratively re-fit regression coefficients on the GPU.

    The starting coefficients are ``(X^T X)^-1 X^T y``. Each pass builds a
    weighted update from ``mu = sapply(x, beta)`` and stops after 10 passes or
    once the norm of the coefficient change drops below 1e-5. The coefficients
    returned are the ones held *before* the final update was computed.

    :param y: response values (converted to float32)
    :param x: design matrix (converted to float32)
    :return: dict with keys ``"iteraciones"`` (pass counter), ``"Betas"``
        (coefficients copied to the host) and ``"time"`` (elapsed seconds)
    """
    x = x.astype(np.float32)
    y = y.astype(np.float32)
    t_begin = time.time()

    # Move the variables to the GPU
    x_gpu = gpuarray.to_gpu(x)
    y_gpu = gpuarray.to_gpu(y)
    linalg.init()

    # Transpose of X
    x_gpu_T = linalg.transpose(x_gpu)
    gram_inv = linalg.inv(linalg.dot(x_gpu_T, x_gpu))
    beta_gpu = linalg.dot(linalg.dot(gram_inv, x_gpu_T), y_gpu)

    iteration = 1
    while True:
        # presumably sapply evaluates the model mean for every row - confirm
        mu_host = sapply(x, beta_gpu.get()).astype(np.float32)
        mu_gpu = gpuarray.to_gpu(mu_host)
        weights_gpu = linalg.diag(mu_gpu)

        variance_gpu = linalg.multiply(mu_gpu, 1 - mu_gpu)
        inv_variance_gpu = linalg.diag(1 / variance_gpu)
        residual_gpu = y_gpu - mu_gpu
        adjust_gpu = linalg.dot(inv_variance_gpu, residual_gpu)

        # Clean NaNs on the host with nanValue before continuing.
        adjust_host = adjust_gpu.get()
        if np.isnan(adjust_host).any():
            adjust_host = nanValue(adjust_host)
            adjust_gpu = gpuarray.to_gpu(adjust_host.astype(np.float32))

        working_gpu = linalg.dot(x_gpu, beta_gpu) + adjust_gpu
        weighted_gram = linalg.dot(linalg.dot(x_gpu_T, weights_gpu), x_gpu)
        projector = linalg.dot(linalg.dot(linalg.inv(weighted_gram), x_gpu_T), weights_gpu)
        beta_next_gpu = linalg.dot(projector, working_gpu)

        delta = np.absolute(linalg.norm(beta_next_gpu - beta_gpu))
        if iteration == 10 or delta < 0.00001:
            break
        beta_gpu = beta_next_gpu
        iteration = iteration + 1

    elapsed = time.time() - t_begin
    return {"iteraciones":iteration,"Betas":beta_gpu.get(),"time":elapsed}
def step_2(matrix_1, matrix_image_pan):
    """Return the element-wise product of the two matrices.

    ``linalg.multiply`` performs the element-by-element multiplication.
    """
    return linalg.multiply(matrix_1, matrix_image_pan)
def _sub_kmeans_gpu(X, k):
    """Run the subspace k-means loop with the linear algebra on the GPU.

    Runs on both Python 2 and Python 3: integer division, ``range`` and the
    builtin ``int`` / ``max`` replace the former ``d / 2``, ``xrange``,
    ``np.int`` and ``np.max(dict.values())``.

    :param X: data matrix of shape ``(n, d)``; converted to float32
    :param k: number of clusters
    :return: tuple ``(C, V, m)`` - the host copy of the last assignment
        vector, the host copy of the last matrix returned by ``sorted_eig``
        (or the initial ``random_V`` one), and the last subspace size ``m``
    """
    import skcuda
    import skcuda.linalg as LA
    import skcuda.misc  # used below as skcuda.misc.*; import it explicitly
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    LA.init()
    n, d = X.shape
    X = X.astype(np.float32)
    V_gpu = random_V(d, mode='gpu')
    # Floor division: m is used as an array dimension and must be an int.
    m = d // 2

    X_gpu = gpuarray.to_gpu(X)
    mu_D_gpu = skcuda.misc.mean(X_gpu, axis=0, keepdims=True)
    sub_gpu = skcuda.misc.subtract(X_gpu, mu_D_gpu)
    S_D_gpu = LA.dot(sub_gpu, sub_gpu, transa='T')
    # Initial centres: k rows of X picked at random (np.random.choice default
    # samples with replacement).
    mu_is_gpu = gpuarray.to_gpu(X[np.random.choice(n, k)])

    itr = 1
    assignment_unchanged = 0
    C_gpu = None
    MAX_ITER = 100
    while itr < MAX_ITER:
        Pc_gpu = projection_matrix(d, m, mode='gpu')
        PcV_gpu = LA.dot(Pc_gpu, V_gpu, transa='T', transb='T')

        # Project every centre with PcV.
        PcVmu_is_gpu = gpuarray.empty((k, m), dtype=np.float32)
        for i in range(k):
            PcVmu_is_gpu[i] = LA.dot(PcV_gpu, mu_is_gpu[i][:, None]).ravel()

        global_temp = LA.dot(X_gpu, PcV_gpu, transb='T')

        # Convergence is only checked on even iterations, so remember the
        # previous assignment before it is overwritten below.
        if itr % 2 == 0:
            C_old = C_gpu.get()

        X_transformed_gpu = gpuarray.empty((n, k, m), dtype=np.float32)
        for i in range(n):
            temp = global_temp[i]
            X_transformed_gpu[i] = skcuda.misc.subtract(PcVmu_is_gpu, temp)

        # Squared distance of every point to every projected centre.
        X_transformed_squared_gpu = LA.multiply(X_transformed_gpu,
                                                X_transformed_gpu)
        X_transformed_squared_gpu = X_transformed_squared_gpu.reshape(
            (n * k, m))
        X_transformed_sum_gpu = skcuda.misc.sum(
            X_transformed_squared_gpu, axis=-1, keepdims=True)
        X_transformed_sum_gpu = X_transformed_sum_gpu.reshape((n, k))
        C_gpu = skcuda.misc.argmin(X_transformed_sum_gpu, axis=1)

        if itr % 2 == 0:
            Cnew = C_gpu.get()
            points_changed = np.sum(
                1 - np.equal(C_old, Cnew).astype(np.uint8))
            if points_changed == 0:
                assignment_unchanged += 1
            if assignment_unchanged >= 2:
                break
            print('[i] Itr %d: %d points changed' % (itr, points_changed))

        # Recompute the centres on the host from the new assignment.
        C = C_gpu.get()
        counts = {i: 0 for i in range(k)}
        mu_is = np.zeros((k, d)).astype(np.float32)
        for i in range(n):
            C_id = int(C[i])
            mu_is[C_id] += X[i]
            counts[C_id] += 1
        mu_is = np.array([mu_is[i] / counts[i] for i in range(k)])
        mu_is_gpu = gpuarray.to_gpu(mu_is)

        # Per-cluster scatter matrices: gather the centred points of every
        # cluster into one padded host array, upload it once, then take
        # points^T * points per cluster.
        S_is_gpu = gpuarray.zeros((k, d, d), dtype=np.float32)
        maxv = max(counts.values())
        storage = np.empty((k, int(maxv), d)).astype(np.float32)
        counter = np.zeros(k, dtype=np.uint32)
        for i in range(n):
            C_id = int(C[i])
            X_minus_mu_isi = (X[i] - mu_is[C_id])[:, None]
            storage[C_id, int(counter[C_id]), :] = X_minus_mu_isi.ravel()
            counter[C_id] += 1
        storage_gpu = gpuarray.to_gpu(storage)
        for i in range(k):
            curr_cluster_points = storage_gpu[i, :int(counter[i]), :]
            S_is_gpu[i] = LA.dot(curr_cluster_points, curr_cluster_points,
                                 transa='T')

        S_is_sum_gpu = S_is_gpu.reshape((k, d * d))
        S_is_sum_gpu = skcuda.misc.sum(S_is_sum_gpu, axis=0, keepdims=True)
        S_is_sum_gpu = S_is_sum_gpu.reshape((d, d))
        S_is_diff_gpu = skcuda.misc.subtract(S_is_sum_gpu, S_D_gpu)

        w, V_gpu = sorted_eig(S_is_diff_gpu, mode='gpu')
        # New subspace size: number of values in w whose ratio to min(w)
        # exceeds 1e-3, and at least 1.
        maxVal = min(w)
        m = np.sum([1 for i in w if i / maxVal > 1e-3])
        m = max(1, m)
        itr += 1

    return C_gpu.get(), V_gpu.get(), m
def fitSlcGPU(slc, srcFatT2, t2, b1, ff):
    """Fit one slice on the GPU by picking the best-correlating signal.

    The slice is processed in chunks of ``ROWSTEP`` rows. For every chunk the
    squared signals are multiplied with ``signalsGPU`` and, per pixel, the
    index of the best entry selects a row of ``parameterCombinations`` whose
    first three values are written to ``t2``, ``b1`` and ``ff``.

    When the GPU runs out of memory the global ``ROWSTEP`` is decreased by one
    and the whole slice is processed again from the first row.

    NOTE(review): ``ff`` is both read (fat-fraction input) and overwritten, so
    a retry after a partially processed slice reads the already fitted values
    for the finished rows - confirm this is acceptable.

    :param slc: index of the slice in ``dicomStack``
    :param srcFatT2: not used in this function
    :param t2: output array, written in place at ``[row, col, slc]``
    :param b1: output array, written in place at ``[row, col, slc]``
    :param ff: input/output array; any positive entry in the slice selects
        the ``findmax_ff`` path
    :raises pycuda._driver.MemoryError: when even ``ROWSTEP == 1`` does not fit
    """
    global ROWSTEP
    print("Fitting slice", slc)
    yValues = dicomStack[:, :, slc, :].squeeze()
    slcShape = yValues.shape
    nrows = slcShape[0]
    ncols = slcShape[1]
    sigLen = slcShape[2]

    success = False
    ffParams_gpu = None
    ffValues_gpu = None
    # Bound up front so that the except handler below can always reference
    # them, even if the very first GPU upload fails.
    slcGPU = None
    corrMatrixGPU = None

    if np.any(ff[:, :, slc] > 0):
        useFF = True
        ffParams_gpu = findmax_ff.prepareAndLoadParams(parameterCombinations)
    else:
        useFF = False

    while not success:
        try:
            for r in range(0, nrows, ROWSTEP):
                rowMax = min(r + ROWSTEP, nrows)
                slcLin = yValues[r:rowMax, :, :].reshape(
                    ncols * (rowMax - r), sigLen).astype(np.float32)
                slcGPU = None
                slcGPU = pycuda.gpuarray.to_gpu(slcLin)
                slcGPU = sklinalg.multiply(slcGPU, slcGPU)
                corrMatrixGPU = sklinalg.mdot(slcGPU, signalsGPU)  # correlation
                tryFree(slcGPU)
                slcGPU = None
                if useFF:
                    ffValues_gpu = findmax_ff.prepareAndLoadFF(ff[r:rowMax, :, slc])
                    corrMax = findmax_ff.findmax_gpu(corrMatrixGPU, ffValues_gpu, ffParams_gpu)
                else:
                    corrMaxGPU = skmisc.argmax(corrMatrixGPU, 1)
                    corrMax = corrMaxGPU.get()
                    tryFree(corrMaxGPU)
                tryFree(corrMatrixGPU)
                corrMatrixGPU = None
                tryFree(ffValues_gpu)
                ffValues_gpu = None

                # Scatter the winning parameter triples back into the maps.
                for row in range(r, rowMax):
                    for c in range(ncols):
                        ind = (row - r) * ncols + c
                        best = parameterCombinations[corrMax[ind]]
                        t2[row, c, slc] = best[0]
                        b1[row, c, slc] = best[1]
                        ff[row, c, slc] = best[2]

                if DOPLOT >= 1:
                    plotImages()
            success = True
        except pycuda._driver.MemoryError:
            tryFree(slcGPU)
            tryFree(corrMatrixGPU)
            tryFree(ffValues_gpu)
            slcGPU = None
            corrMatrixGPU = None
            ffValues_gpu = None
            gc.collect()
            if ROWSTEP <= 1:
                # A step of 0 would make range() raise an unrelated
                # ValueError; surface the real out-of-memory error instead.
                raise
            ROWSTEP -= 1
            print("Not enough GPU Mem: decreasing ROWSTEP to", ROWSTEP)
# Benchmark: FFT-based product of `image` and `kernel` on the CPU (NumPy)
# versus the GPU (cu_fft / culinalg), followed by a comparison of the results.

# --- CPU reference: multiply the 2-D spectra and keep the real part ---
start = time.time()
xf = np.fft.fft2(image) * np.fft.fft2(kernel)
conv_cpu = np.real(np.fft.ifft2(xf))
cpu_time = time.time() - start
print('CPU FFT in ', cpu_time)

# --- GPU setup (uploads, buffers and plans are excluded from the timing) ---
shape = image.shape
image_gpu = gpuarray.to_gpu(image)
# NOTE(review): the plans below are float32 -> complex64, so `image` and
# `kernel` are presumably float32 arrays - confirm.
# NOTE(review): the spectrum buffers are allocated with the full input shape;
# check this against the output shape skcuda expects for real-to-complex
# transforms.
xf_gpu = gpuarray.empty(shape, np.complex64)
image_plan_forward = cu_fft.Plan(shape, np.float32, np.complex64)
kernel_gpu = gpuarray.to_gpu(kernel)
kf_gpu = gpuarray.empty(shape, np.complex64)
kernel_plan_forward = cu_fft.Plan(shape, np.float32, np.complex64)
plan_inverse = cu_fft.Plan(shape, np.complex64, np.float32)

# --- GPU run: two forward FFTs, element-wise product, scaled inverse FFT ---
start = time.time()
cu_fft.fft(image_gpu, xf_gpu, image_plan_forward)
cu_fft.fft(kernel_gpu, kf_gpu, kernel_plan_forward)
cf_gpu = culinalg.multiply(xf_gpu, kf_gpu)
# The inverse transform writes its result into image_gpu, reusing that buffer.
cu_fft.ifft(cf_gpu, image_gpu, plan_inverse, True)
# NOTE(review): the timer stops before the result is copied back with .get();
# if the GPU calls return before the work has finished, gpu_time
# under-reports - confirm whether a synchronisation is needed here.
gpu_time = time.time() - start
conv_gpu = image_gpu.get()
print('GPU FFT in ', gpu_time)

# --- Compare both results within an absolute tolerance ---
tol = 1e-4
print('Success status: ', np.allclose(conv_cpu, conv_gpu, atol=tol), "; atol=", tol)
def propagate_gpu_wavelength(self, wavelength, wavefront, N):
    """
    Propagation from Pupil Plane to Exit Slit on the GPU for a single wavelength.
    Repeated N times to show how it runs much faster on the GPU when we want
    to compute many PSF images.

    :param wavelength: key into ``self.pupil_masks`` and
        ``self.pupil_mirror_masks_fft``; also divides the phase
    :param wavefront: indexable with ``0 .. N-1``; ``wavefront[i]`` is the
        phase map used for the i-th image
    :param N: number of PSF images to compute
    :return: ``fftshift`` over axes (1, 2) of the stacked slit images, one
        per entry of ``wavefront`` used
    """
    # It is a pain to handle the memory properly on the GPU when you have
    # [N_slices, N_pix, N_pix] arrays
    print("\nPropagating on the GPU")

    # GPU memory management
    free, total = cuda.mem_get_info()
    print("Memory Start | Free: %.2f percent" % (free / total * 100))

    n_pix = self.N_PIX
    n_slices = self.N_slices
    gpu_buffers = []
    PSF_images = []
    try:
        slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
        gpu_buffers.append(slicer_masks_gpu)
        mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft[wavelength])
        gpu_buffers.append(mirror_mask_gpu)

        # The pupil is a single (N_PIX, N_PIX) frame and needs its own plan.
        # Running the batched plan on it makes cuFFT process N_slices frames
        # starting at that buffer, i.e. touch memory past its end.
        plan_single = cu_fft.Plan((n_pix, n_pix), np.complex64, np.complex64)
        plan_batch = cu_fft.Plan((n_pix, n_pix), np.complex64, np.complex64, n_slices)

        # Allocate GPU arrays that will be overwritten with
        # skcuda.misc.set_realloc to save memory
        _pupil = np.zeros((n_pix, n_pix), dtype=np.complex64)
        complex_pupil_gpu = gpuarray.to_gpu(_pupil)
        gpu_buffers.append(complex_pupil_gpu)
        _slicer = np.zeros((n_slices, n_pix, n_pix), dtype=np.complex64)
        complex_slicer_gpu = gpuarray.to_gpu(_slicer)
        gpu_buffers.append(complex_slicer_gpu)

        pupil_mask = self.pupil_masks[wavelength]
        for i in range(N):
            print(i)
            # Pupil Plane -> Image Slicer
            complex_pupil = pupil_mask * np.exp(1j * 2 * np.pi * wavefront[i] / wavelength)
            skcuda.misc.set_realloc(complex_pupil_gpu,
                                    np.asarray(complex_pupil, np.complex64))
            cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan_single)

            # Add N_slices copies to be masked
            complex_slicer_cpu = complex_pupil_gpu.get()
            complex_slicer_cpu = np.stack([complex_slicer_cpu] * n_slices)
            skcuda.misc.set_realloc(complex_slicer_gpu, complex_slicer_cpu)
            # overwrite=True leaves the product in the second operand.
            clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)

            # Image Slicer -> Pupil Mirror
            cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan_batch, True)
            clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

            # Pupil Mirror -> Exit Slits
            cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan_batch)
            _slits = complex_slicer_gpu.get()
            slits = np.sum((np.abs(_slits))**2, axis=0)
            PSF_images.append(slits)
    finally:
        # Make sure you clean up the memory so that it doesn't blow up!!
        # Runs on errors too, so a failed call does not leak the buffers.
        for buf in gpu_buffers:
            buf.gpudata.free()

    free, total = cuda.mem_get_info()
    print("Memory Final | Free: %.2f percent" % (free / total * 100))
    return fftshift(np.array(PSF_images), axes=(1, 2))