def test_cublasCsscal(self):
    """cublasCsscal: scale a complex64 vector by a real float32 scalar.

    BUG FIX: the original called cublas.cublasCscal (complex alpha) even
    though the test name and the float32 alpha target cublasCsscal
    (real alpha applied to a complex vector).
    """
    x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex64)
    x_gpu = gpuarray.to_gpu(x)
    alpha = np.float32(np.random.rand())
    cublas.cublasCsscal(self.cublas_handle, x_gpu.size, alpha,
                        x_gpu.gpudata, 1)
    assert np.allclose(x_gpu.get(), alpha * x)
def bicgstabMemory(cublasHandle, x_gpu, b_gpu, Xprime_gpu, X_gpu, XX_gpu,
                   Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq,
                   FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime,
                   sizePartitionr, sizePartitionc, M, max_it, tol):
    """Unpreconditioned BiCGSTAB solve on the GPU via cuBLAS level-1 calls.

    The system operator is applied as computeAx followed by computeAdy
    (forward pass then adjoint pass — presumably the normal-equations
    operator A^H A; confirm against those helpers' definitions elsewhere
    in this file).

    Parameters
    ----------
    cublasHandle : cuBLAS context handle.
    x_gpu : gpuarray, complex64 — initial guess; updated in place.
    b_gpu : gpuarray, complex64 — right-hand side.
    Xprime_gpu .. sizePartitionc : forwarded verbatim to
        computeAx / computeAdy (geometry/frequency data of the operator).
    M : unused here; kept for signature compatibility.
    max_it : int — maximum number of iterations.
    tol : float — relative-residual convergence tolerance.

    Returns
    -------
    (x_gpu, error, iter, flag) with flag:
         0  solution found to tolerance
         1  no convergence within max_it
        -1  breakdown: rho = 0
        -2  breakdown: omega = 0
    """

    def _apply_operator(vec_gpu):
        # Forward pass then adjoint pass; this chain is identical at every
        # call site, so it is factored out here.
        y_gpu = computeAx(vec_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu,
                          Y_gpu, YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu,
                          c, Deltaxprime, Deltayprime, Deltazprime,
                          sizePartitionc, XX_gpu.size)
        return computeAdy(cublasHandle, y_gpu, Xprime_gpu, XX_gpu,
                          Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu,
                          c, Deltaxprime * Deltayprime * Deltazprime,
                          sizePartitionr, b_gpu.size)

    # --- BUG FIX: the original read `xcg_gpu.size` — an undefined name
    # (the parameter is `x_gpu`), so the function crashed on entry.
    N = x_gpu.size

    # --- Initializations. rho is pre-set so the flag classification below
    # is safe even when max_it == 0 and the loop never runs.
    iter = np.float32(0)
    flag = np.float32(0)
    alpha = np.float32(0)
    rho = np.float32(0)
    rho_1 = np.float32(0)
    omega = np.float32(1.0)

    # --- BUG FIX: all solver vectors are complex64; the original allocated
    # these two as float32. (Both are rebound before first use, but the
    # wrong dtype was misleading and unsafe if that ever changed.)
    v_gpu = gpuarray.zeros(N, dtype=np.complex64)
    p_gpu = gpuarray.zeros(N, dtype=np.complex64)

    # Loop-local buffers, pre-declared so the cleanup at the bottom is safe
    # even when a breakdown exits before they are bound (the original
    # crashed with NameError in that case).
    p_hat_gpu = s_hat_gpu = s_gpu = t_gpu = None

    bnrm2 = cublas.cublasScnrm2(cublasHandle, N, b_gpu.gpudata, 1)
    if bnrm2 == np.float32(0.0):
        bnrm2 = np.float32(1.0)

    # Initial residual r = b - A x.
    r_gpu = b_gpu - _apply_operator(x_gpu)
    error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2
    if error < tol:
        return x_gpu, error, iter, flag

    r_tld_gpu = r_gpu.copy()  # shadow residual, fixed for the whole run

    for iter in range(max_it):
        rho = cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1,
                                 r_gpu.gpudata, 1)
        if rho == np.float32(0.0):
            break  # breakdown

        # --- direction vector: p = r + beta * (p - omega * v)
        if iter > 0:
            beta = (rho / rho_1) * (alpha / omega)
            cublas.cublasCaxpy(cublasHandle, N, -omega, v_gpu.gpudata, 1,
                               p_gpu.gpudata, 1)
            cublas.cublasCscal(cublasHandle, N, beta, p_gpu.gpudata, 1)
            cublas.cublasCaxpy(cublasHandle, N, np.float32(1.0),
                               r_gpu.gpudata, 1, p_gpu.gpudata, 1)
        else:
            p_gpu = r_gpu.copy()

        # No preconditioner: p_hat = p.
        p_hat_gpu = p_gpu.copy()
        v_gpu = _apply_operator(p_hat_gpu)

        alpha = rho / cublas.cublasCdotc(cublasHandle, N,
                                         r_tld_gpu.gpudata, 1,
                                         v_gpu.gpudata, 1)

        # s = r - alpha * v
        s_gpu = r_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -alpha, v_gpu.gpudata, 1,
                           s_gpu.gpudata, 1)

        norms = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata, 1)
        if norms < tol:
            # --- early convergence: x += alpha * p_hat, then stop.
            cublas.cublasCaxpy(cublasHandle, N, np.float32(alpha),
                               p_hat_gpu.gpudata, 1, x_gpu.gpudata, 1)
            break

        # --- stabilizer: t = A s_hat, omega = <t, s> / <t, t>
        s_hat_gpu = s_gpu.copy()
        t_gpu = _apply_operator(s_hat_gpu)
        omega = (cublas.cublasCdotc(cublasHandle, N, t_gpu.gpudata, 1,
                                    s_gpu.gpudata, 1) /
                 cublas.cublasCdotc(cublasHandle, N, t_gpu.gpudata, 1,
                                    t_gpu.gpudata, 1))

        # --- update approximation: x += alpha * p_hat + omega * s_hat
        cublas.cublasCaxpy(cublasHandle, N, alpha, p_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)
        cublas.cublasCaxpy(cublasHandle, N, omega, s_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)

        # r = s - omega * t
        r_gpu = s_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -omega, t_gpu.gpudata, 1,
                           r_gpu.gpudata, 1)

        # --- check convergence
        error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata,
                                    1) / bnrm2
        if error <= tol:
            break
        if omega == np.float32(0.0):
            break  # breakdown

        rho_1 = rho
        print("iteration")

    # --- classify the exit condition
    if s_gpu is not None:
        temp = np.sqrt(gpuarray.max(s_gpu.real * s_gpu.real +
                                    s_gpu.imag * s_gpu.imag).get())
    else:
        temp = np.inf  # loop never produced s; rely on `error` alone
    if (error <= np.float32(tol)) or temp <= tol:
        # --- converged
        if temp <= tol:
            error = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata,
                                        1) / bnrm2
        flag = 0
    elif omega == np.float32(0.0):
        # --- breakdown
        flag = -2
    elif rho == np.float32(0.0):
        flag = -1
    else:
        # --- no convergence
        flag = 1

    # Release scratch GPU buffers (guarded — see pre-declaration above).
    for buf in (p_hat_gpu, s_hat_gpu, v_gpu, t_gpu):
        if buf is not None:
            buf.gpudata.free()

    # --- BUG FIX: the original returned `xcg_gpu, 0, 0, 0` — an undefined
    # name, and it discarded the computed error/iteration/flag, breaking
    # the documented (x, error, iter, flag) contract that the early-return
    # path above honors.
    return x_gpu, error, iter, flag