# Example #1
 def test_cublasCsscal(self):
     """cublasCsscal scales a complex64 vector by a real float32 scalar."""
     x = (np.random.rand(5)+1j*np.random.rand(5)).astype(np.complex64)
     x_gpu = gpuarray.to_gpu(x)
     alpha = np.float32(np.random.rand())
     # Fixed: this test targets Csscal (real scalar x complex vector) but
     # previously called cublasCscal, which takes a complex scalar.
     cublas.cublasCsscal(self.cublas_handle, x_gpu.size, alpha,
                         x_gpu.gpudata, 1)
     assert np.allclose(x_gpu.get(), alpha*x)
def bicgstabMemory(cublasHandle, x_gpu, b_gpu, Xprime_gpu, X_gpu, XX_gpu,
                   Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero, freq,
                   FREQ_gpu, c, Deltaxprime, Deltayprime, Deltazprime,
                   sizePartitionr, sizePartitionc, M, max_it, tol):
    """Matrix-free BiCGSTAB solve of A x = b on the GPU.

    The operator A is applied implicitly via computeAx (forward) and
    computeAdy (adjoint); all vector updates go through cuBLAS
    single-precision-complex routines.

    Parameters are forwarded verbatim to computeAx / computeAdy except for:
        cublasHandle : cuBLAS context handle
        x_gpu        : initial guess, updated in place (complex64 gpuarray)
        b_gpu        : right-hand side (complex64 gpuarray)
        max_it       : maximum number of iterations
        tol          : relative-residual convergence tolerance

    Returns
    -------
    (x_gpu, error, iteration, flag) where flag is:
         0 = solution found to tolerance
         1 = no convergence within max_it
        -1 = breakdown: rho = 0
        -2 = breakdown: omega = 0
    """
    # Fixed: was `xcg_gpu.size` — an undefined name (NameError).
    N = x_gpu.size

    # --- Initializations
    iteration = 0          # renamed from `iter` (shadowed the builtin)
    flag = 0
    alpha = np.float32(0)
    rho = np.float32(0)
    rho_1 = np.float32(0)
    omega = np.float32(1.0)
    # Fixed: work vectors must be complex64 — every cuBLAS call below is a
    # cublasC* routine that treats each of the N elements as a complex64;
    # float32 buffers would be half the required size (out-of-bounds access).
    v_gpu = gpuarray.zeros(N, dtype=np.complex64)
    p_gpu = gpuarray.zeros(N, dtype=np.complex64)

    # Normalizer for the relative residual; guard against a zero RHS.
    bnrm2 = cublas.cublasScnrm2(cublasHandle, N, b_gpu.gpudata, 1)
    if bnrm2 == np.float32(0.0):
        bnrm2 = np.float32(1.0)

    # --- r = b - A x  (forward operator then adjoint, as in the original)
    yprime_gpu = computeAx(x_gpu, Xprime_gpu, X_gpu, XX_gpu, Yprime_gpu, Y_gpu,
                           YY_gpu, Zprime_gpu, zzero, freq, FREQ_gpu, c,
                           Deltaxprime, Deltayprime, Deltazprime,
                           sizePartitionc, XX_gpu.size)
    xprime_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                            Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                            Deltaxprime * Deltayprime * Deltazprime,
                            sizePartitionr, b_gpu.size)
    r_gpu = b_gpu - xprime_gpu
    error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2
    if error < tol:
        return x_gpu, error, iteration, flag

    r_tld_gpu = r_gpu.copy()
    # Pre-bind loop-local buffers so the post-loop code is safe even when the
    # loop breaks before (or without) defining them (e.g. rho == 0 on the
    # first pass, or max_it == 0) — previously a NameError.
    s_gpu = r_gpu.copy()
    p_hat_gpu = None
    s_hat_gpu = None
    t_gpu = None

    for iteration in range(max_it):

        rho = cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1,
                                 r_gpu.gpudata, 1)  # direction vector
        if rho == np.float32(0.0):
            break

        if iteration > 0:
            # p = r + beta * (p - omega * v)
            beta = (rho / rho_1) * (alpha / omega)
            cublas.cublasCaxpy(cublasHandle, N, -omega, v_gpu.gpudata, 1,
                               p_gpu.gpudata, 1)
            cublas.cublasCscal(cublasHandle, N, beta, p_gpu.gpudata, 1)
            cublas.cublasCaxpy(cublasHandle, N, np.float32(1.0), r_gpu.gpudata,
                               1, p_gpu.gpudata, 1)
        else:
            p_gpu = r_gpu.copy()

        # v = A p_hat
        p_hat_gpu = p_gpu.copy()
        yprime_gpu = computeAx(p_hat_gpu, Xprime_gpu, X_gpu, XX_gpu,
                               Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero,
                               freq, FREQ_gpu, c, Deltaxprime, Deltayprime,
                               Deltazprime, sizePartitionc, XX_gpu.size)
        v_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                           Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                           Deltaxprime * Deltayprime * Deltazprime,
                           sizePartitionr, b_gpu.size)

        alpha = rho / cublas.cublasCdotc(cublasHandle, N, r_tld_gpu.gpudata, 1,
                                         v_gpu.gpudata, 1)
        # s = r - alpha * v
        s_gpu = r_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -alpha, v_gpu.gpudata, 1,
                           s_gpu.gpudata, 1)
        norms = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata, 1)
        if norms < tol:  # --- early convergence check
            # Fixed: alpha is complex (ratio of cublasCdotc results); the
            # old np.float32(alpha) cast discarded its imaginary part.
            cublas.cublasCaxpy(cublasHandle, N, alpha,
                               p_hat_gpu.gpudata, 1, x_gpu.gpudata, 1)
            break

        # --- stabilizer: t = A s_hat, omega = (t, s) / (t, t)
        s_hat_gpu = s_gpu.copy()
        yprime_gpu = computeAx(s_hat_gpu, Xprime_gpu, X_gpu, XX_gpu,
                               Yprime_gpu, Y_gpu, YY_gpu, Zprime_gpu, zzero,
                               freq, FREQ_gpu, c, Deltaxprime, Deltayprime,
                               Deltazprime, sizePartitionc, XX_gpu.size)
        t_gpu = computeAdy(cublasHandle, yprime_gpu, Xprime_gpu, XX_gpu,
                           Yprime_gpu, YY_gpu, Zprime_gpu, zzero, FREQ_gpu, c,
                           Deltaxprime * Deltayprime * Deltazprime,
                           sizePartitionr, b_gpu.size)
        omega = cublas.cublasCdotc(cublasHandle, N, t_gpu.gpudata, 1,
                                   s_gpu.gpudata, 1) / cublas.cublasCdotc(
                                       cublasHandle, N, t_gpu.gpudata, 1,
                                       t_gpu.gpudata, 1)

        # --- update approximation: x += alpha * p_hat + omega * s_hat
        cublas.cublasCaxpy(cublasHandle, N, alpha, p_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)
        cublas.cublasCaxpy(cublasHandle, N, omega, s_hat_gpu.gpudata, 1,
                           x_gpu.gpudata, 1)

        # r = s - omega * t
        r_gpu = s_gpu.copy()
        cublas.cublasCaxpy(cublasHandle, N, -omega, t_gpu.gpudata, 1,
                           r_gpu.gpudata, 1)

        error = cublas.cublasScnrm2(cublasHandle, N, r_gpu.gpudata, 1) / bnrm2
        # --- check convergence
        if error <= tol:
            break

        if omega == np.float32(0.0):
            break

        rho_1 = rho

    # --- classify the exit condition
    temp = np.sqrt(
        gpuarray.max(s_gpu.real * s_gpu.real + s_gpu.imag * s_gpu.imag).get())
    if (error <= np.float32(tol)) or temp <= tol:  # --- converged
        if temp <= tol:
            error = cublas.cublasScnrm2(cublasHandle, N, s_gpu.gpudata,
                                        1) / bnrm2
        flag = 0
    elif omega == np.float32(0.0):  # --- breakdown
        flag = -2
    elif rho == np.float32(0.0):
        flag = -1
    else:  # --- no convergence
        flag = 1

    # --- release scratch buffers (guarded: some may never have been created)
    for buf in (p_hat_gpu, s_hat_gpu, v_gpu, t_gpu):
        if buf is not None:
            buf.gpudata.free()

    # Fixed: previously returned the undefined `xcg_gpu` and constant zeros,
    # discarding the computed error / iteration count / status flag.
    return x_gpu, error, iteration, flag