Beispiel #1
0
def runTest(vlength = 128,loops = 1):
    """Benchmark CUBLAS SAXPY on the GPU against the CPU reference.

    Two host vectors of ``vlength*vlength`` single-precision floats are
    filled by ``vectorInit``, copied to the device, and ``cublasSaxpy`` is
    run ``loops`` times with alpha fixed at 0.5.  The same number of
    ``cpuSAXPY`` calls is then timed on the host vectors, and the device
    result is read back and compared with ``checkError``.

    Timings, gigaflop rates, the CPU/GPU time ratio and the average and
    maximum relative error are printed; nothing is returned.

    Args:
        vlength: Side length; each vector holds ``vlength*vlength`` floats.
        loops: Number of SAXPY repetitions timed on each side.
    """
    n2 = vlength*vlength
    alfa = c_float(.5)

    def ratio(num, den):
        # A coarse timer can report zero elapsed time for a short run;
        # report an infinite rate rather than raising ZeroDivisionError.
        if den > 0:
            return num/den
        return float('inf')

    cublasInit()
    try:
        h_X = (c_float*n2)()
        h_Y = (c_float*n2)()
        g_Y = (c_float*n2)()   # receives the device result for comparison
        vectorInit(h_X)
        vectorInit(h_Y)

        d_X = c_void_p()
        d_Y = c_void_p()
        # Only buffers whose allocation call returned are freed on exit.
        allocated = []
        try:
            cublasAlloc(n2, sizeof(c_float), byref(d_X))
            allocated.append(d_X)
            cublasAlloc(n2, sizeof(c_float), byref(d_Y))
            allocated.append(d_Y)

            cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
            cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

            # One multiply and one add per element, expressed in Gflop.
            flops = (2.e-9*n2)*float(loops)
            t0 = time()
            for _ in range(loops):
                cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1)
            # Synchronize before stopping the clock so queued GPU work is
            # included in the measurement.
            cudaThreadSynchronize()
            t0 = time()-t0

            print("Processing time: %.3g sec" % t0)
            print("Gigaflops GPU: %.2f (%d)" % (ratio(flops, t0),n2))

            # NOTE(review): presumably cpuSAXPY updates h_Y in place, which
            # is what the comparison with g_Y below relies on - confirm.
            t1 = time()
            for _ in range(loops):
                cpuSAXPY(alfa,h_X,h_Y)
            t1 = time()-t1

            print("\nProcessing time: %.3g sec" % t1)
            print("Gigaflops CPU: %.2f" % ratio(flops, t1))
            print("GPU vs. CPU  : %.2f" % ratio(t1, t0))

            cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1)
            err,mxe = checkError(h_Y,g_Y)
            print("\nAvg and max rel error = %.2e %.2e" % (err,mxe))
        finally:
            # Release device memory even if the benchmark raised or was
            # interrupted part-way through.
            for d_ptr in allocated:
                cublasFree(d_ptr)
    finally:
        cublasShutdown()
Beispiel #2
0
def runTest(vlength = 128,loops = 1):
    """Benchmark CUBLAS SDOT on the GPU against the CPU reference.

    Two host vectors of ``vlength*vlength`` single-precision floats are
    filled by ``vectorInit`` and copied to the device.  ``cublasSdot`` is
    called ``loops`` times and its results are summed; ``cpuSDOT`` is then
    called the same number of times on the host vectors.  The relative
    difference between the two sums is reported as the error.

    Timings, gigaflop rates, the CPU/GPU time ratio and the error are
    printed; nothing is returned.

    Args:
        vlength: Side length; each vector holds ``vlength*vlength`` floats.
        loops: Number of dot products timed on each side.
    """
    n2 = vlength*vlength
    alfa = c_float(.5)

    def ratio(num, den):
        # A coarse timer can report zero elapsed time for a short run;
        # report an infinite rate rather than raising ZeroDivisionError.
        if den > 0:
            return num/den
        return float('inf')

    cublasInit()
    try:
        h_X = (c_float*n2)()
        h_Y = (c_float*n2)()
        vectorInit(h_X)
        vectorInit(h_Y)

        d_X = c_void_p()
        d_Y = c_void_p()
        # Only buffers whose allocation call returned are freed on exit.
        allocated = []
        try:
            cublasAlloc(n2, sizeof(c_float), byref(d_X))
            allocated.append(d_X)
            cublasAlloc(n2, sizeof(c_float), byref(d_Y))
            allocated.append(d_Y)

            cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
            cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

            # One multiply and one add per element, expressed in Gflop.
            flops = (2.e-9*n2)*float(loops)
            s0 = 0.
            t0 = time()
            for _ in range(loops):
                s0 += cublasSdot(n2, d_X, 1, d_Y, 1)
            cudaThreadSynchronize()
            t0 = time()-t0

            print("Processing time: %.3g sec" % t0)
            print("Gigaflops GPU: %.2f (%d)" % (ratio(flops, t0),n2))

            s1 = 0.
            t1 = time()
            for _ in range(loops):
                s1 += cpuSDOT(h_X,h_Y)
            t1 = time()-t1
            print("\nProcessing time: %.3g sec" % t1)
            print("Gigaflops CPU: %.2f" % ratio(flops, t1))
            print("GPU vs. CPU  : %.2f" % ratio(t1, t0))

            # Relative difference of the accumulated sums; the 1e-7 floor
            # keeps the denominator away from zero when both sums vanish.
            sx = max(1.e-7,max(abs(s0),abs(s1)))
            err = abs(s1-s0)/sx
            print("\nError = %.2e" % err)
        finally:
            # Release device memory even if the benchmark raised or was
            # interrupted part-way through.
            for d_ptr in allocated:
                cublasFree(d_ptr)
    finally:
        cublasShutdown()
Beispiel #3
0
def main(N = 1024,L = 100):
    """Benchmark CUBLAS SGEMM on the GPU against the CPU ``sgemm`` helper.

    The problem dimensions are derived from the ``N`` argument: ``M = N``,
    ``K = N/2`` and the matrix dimension ``N`` becomes ``2*N``.  Host
    buffers of ``M*K``, ``K*N`` and ``M*N`` single-precision floats are
    filled by ``arrayInit`` and copied to the device, then ``cublasSgemm``
    is run ``L`` times with alpha = 1 and beta = 0.  The CPU ``sgemm``
    helper is timed for the same number of iterations and the device result
    is compared with the host result through ``checkError``.

    Set-up overhead, timings, gigaflop rates, the speedup and the average
    and maximum relative error are printed; nothing is returned.

    Args:
        N: Base size from which M, N and K are derived.
        L: Number of matrix multiplications timed on each side.
    """
    M = N
    K = N >> 1
    N = N << 1
    # 2*M*N*K operations per multiplication, L repetitions, in Gflop.
    flops = (2.e-9*M*N)*float(K*L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops))
    na,nb,nc,alfa,beta = M*K,K*N,M*N,1.,0.

    def ratio(num, den):
        # A coarse timer can report zero elapsed time for a short run;
        # report an infinite rate rather than raising ZeroDivisionError.
        if den > 0:
            return num/den
        return float('inf')

    # The overhead measurement covers initialisation, host buffer set-up,
    # device allocation and the host-to-device copies.
    t0 = time()
    cublasInit()
    try:
        h_A = (c_float*na)()
        h_B = (c_float*nb)()
        h_C = (c_float*nc)()
        g_C = (c_float*nc)()   # receives the device result for comparison

        arrayInit(h_A,na)
        arrayInit(h_B,nb)
        arrayInit(h_C,nc)

        d_A = c_void_p()
        d_B = c_void_p()
        d_C = c_void_p()

        # Only buffers whose allocation call returned are freed on exit.
        allocated = []
        try:
            cublasAlloc(na, sizeof(c_float), byref(d_A))
            allocated.append(d_A)
            cublasAlloc(nb, sizeof(c_float), byref(d_B))
            allocated.append(d_B)
            cublasAlloc(nc, sizeof(c_float), byref(d_C))
            allocated.append(d_C)

            cublasSetVector(na, sizeof(c_float), h_A, 1, d_A, 1)
            cublasSetVector(nb, sizeof(c_float), h_B, 1, d_B, 1)
            cublasSetVector(nc, sizeof(c_float), h_C, 1, d_C, 1)
            tt = t0 = time()-t0
            print("Overhead CUBLAS: %.3f sec\n" % t0)

            t0 = time()
            for _ in range(L):
                cublasSgemm('n', 'n', M, N, K, alfa, d_A, M, d_B, K, beta, d_C, M)
            # Synchronize before stopping the clock so queued GPU work is
            # included in the measurement.
            cudaThreadSynchronize()
            t0 = time()-t0
            tt += t0   # total = set-up overhead + compute time

            print("Processing time: %.3g (%.3g) sec" % (t0,tt))
            print("Gigaflops GPU: %.2f (%.2f)" % (ratio(flops, t0),ratio(flops, tt)))

            # NOTE(review): presumably sgemm writes the product into h_C,
            # which is what the comparison with g_C below relies on - confirm.
            t1 = time()
            for _ in range(L):
                sgemm(h_C,h_A,h_B,M,N,K)
            t1 = time()-t1
            print("\nProcessing time: %.3g sec" % t1)
            print("Gigaflops CPU  : %.2f" % ratio(flops, t1))
            print("Speedup GPU/CPU: %.2f" % ratio(t1, t0))

            cublasGetVector(nc, sizeof(c_float), d_C, 1, g_C, 1)
            err,mxe = checkError(h_C,g_C,nc)
            print("\nAvg and max rel error = %.2e %.2e" % (err,mxe))
        finally:
            # Release device memory even if the benchmark raised or was
            # interrupted part-way through.
            for d_ptr in allocated:
                cublasFree(d_ptr)
    finally:
        cublasShutdown()