def runTest(vlength=128, loops=1):
    """Time ``cublasSaxpy`` against ``cpuSAXPY`` and print the comparison.

    Two host vectors of ``vlength * vlength`` ``c_float`` elements are filled
    by ``vectorInit`` and copied into device buffers. The SAXPY call is then
    issued ``loops`` times on the device and ``loops`` times on the host.
    Elapsed times, GFLOP rates (counted as 2 operations per element per
    iteration) and the average / maximum relative error returned by
    ``checkError`` are printed.

    Device buffers and the CUBLAS context are released even if an exception
    interrupts the benchmark.

    Args:
        vlength: Side length; each vector holds ``vlength ** 2`` elements.
        loops: Number of times each SAXPY variant is executed.

    Returns:
        None. All results are written to standard output.
    """

    def _ratio(num, den):
        # A coarse clock can report zero elapsed time for a very short run;
        # report an infinite rate instead of raising ZeroDivisionError.
        return num / den if den > 0 else float("inf")

    n2 = vlength * vlength
    alfa = c_float(.5)

    cublasInit()
    d_X = c_void_p()
    d_Y = c_void_p()
    try:
        h_X = (c_float * n2)()
        h_Y = (c_float * n2)()
        g_Y = (c_float * n2)()  # receives the device result for comparison
        vectorInit(h_X)
        vectorInit(h_Y)

        cublasAlloc(n2, sizeof(c_float), byref(d_X))
        cublasAlloc(n2, sizeof(c_float), byref(d_Y))
        cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
        cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

        # Total work in units of 1e9 operations: 2 per element per iteration.
        flops = (2.e-9 * n2) * float(loops)

        t0 = time()
        for _ in range(loops):
            cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1)
        # Wait for the device before stopping the clock
        # (presumably the calls above return before the work completes).
        cudaThreadSynchronize()
        t0 = time() - t0

        print("Processing time: %.3g sec" % t0)
        print("Gigaflops GPU: %.2f (%d)" % (_ratio(flops, t0), n2))

        t1 = time()
        for _ in range(loops):
            # presumably updates h_Y in place, mirroring the device-side
            # accumulation into d_Y -- confirm against cpuSAXPY
            cpuSAXPY(alfa, h_X, h_Y)
        t1 = time() - t1

        print("\nProcessing time: %.3g sec" % t1)
        print("Gigaflops CPU: %.2f" % _ratio(flops, t1))
        print("GPU vs. CPU : %.2f" % _ratio(t1, t0))

        cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1)
        err, mxe = checkError(h_Y, g_Y)
        print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
    finally:
        try:
            for ptr in (d_X, d_Y):
                # Only free buffers that were actually allocated.
                if ptr.value:
                    cublasFree(ptr)
        finally:
            cublasShutdown()
def runTest(vlength=128, loops=1):
    """Time ``cublasSdot`` against ``cpuSDOT`` and print the comparison.

    Two host vectors of ``vlength * vlength`` ``c_float`` elements are filled
    by ``vectorInit`` and copied into device buffers. The dot product is then
    evaluated ``loops`` times on the device and ``loops`` times on the host,
    summing the returned values on each side. Elapsed times, GFLOP rates
    (counted as 2 operations per element per iteration) and the relative
    difference between the two sums are printed.

    Device buffers and the CUBLAS context are released even if an exception
    interrupts the benchmark.

    Args:
        vlength: Side length; each vector holds ``vlength ** 2`` elements.
        loops: Number of times each dot-product variant is executed.

    Returns:
        None. All results are written to standard output.
    """

    def _ratio(num, den):
        # A coarse clock can report zero elapsed time for a very short run;
        # report an infinite rate instead of raising ZeroDivisionError.
        return num / den if den > 0 else float("inf")

    n2 = vlength * vlength
    alfa = c_float(.5)  # kept from the original; not used by the dot product

    cublasInit()
    d_X = c_void_p()
    d_Y = c_void_p()
    try:
        h_X = (c_float * n2)()
        h_Y = (c_float * n2)()
        vectorInit(h_X)
        vectorInit(h_Y)

        cublasAlloc(n2, sizeof(c_float), byref(d_X))
        cublasAlloc(n2, sizeof(c_float), byref(d_Y))
        cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
        cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

        # Total work in units of 1e9 operations: 2 per element per iteration.
        flops = (2.e-9 * n2) * float(loops)

        s0 = 0.
        t0 = time()
        for _ in range(loops):
            s0 += cublasSdot(n2, d_X, 1, d_Y, 1)
        cudaThreadSynchronize()
        t0 = time() - t0

        print("Processing time: %.3g sec" % t0)
        print("Gigaflops GPU: %.2f (%d)" % (_ratio(flops, t0), n2))

        s1 = 0.
        t1 = time()
        for _ in range(loops):
            s1 += cpuSDOT(h_X, h_Y)
        t1 = time() - t1

        print("\nProcessing time: %.3g sec" % t1)
        print("Gigaflops CPU: %.2f" % _ratio(flops, t1))
        print("GPU vs. CPU : %.2f" % _ratio(t1, t0))

        # Relative difference; the 1e-7 floor keeps the denominator non-zero
        # when both sums are zero.
        sx = max(1.e-7, max(abs(s0), abs(s1)))
        err = abs(s1 - s0) / sx
        print("\nError = %.2e" % err)
    finally:
        try:
            for ptr in (d_X, d_Y):
                # Only free buffers that were actually allocated.
                if ptr.value:
                    cublasFree(ptr)
        finally:
            cublasShutdown()
def main(N=1024, L=100):
    """Time ``cublasSgemm`` against the host ``sgemm`` and print the comparison.

    The problem dimensions are derived from ``N``: ``M = N``, ``K = N / 2``
    and the effective ``N`` is doubled, so the product is (M x K) * (K x N).
    The three host arrays are filled by ``arrayInit`` and copied to device
    buffers. The multiplication is then run ``L`` times on the device and
    ``L`` times on the host. Setup overhead, elapsed times, GFLOP rates and
    the average / maximum relative error returned by ``checkError`` are
    printed.

    Device buffers and the CUBLAS context are released even if an exception
    interrupts the benchmark.

    Args:
        N: Base dimension from which M, N and K are derived.
        L: Number of times each matrix multiplication is executed.

    Returns:
        None. All results are written to standard output.
    """

    def _ratio(num, den):
        # A coarse clock can report zero elapsed time for a very short run;
        # report an infinite rate instead of raising ZeroDivisionError.
        return num / den if den > 0 else float("inf")

    M = N
    K = N >> 1
    N = N << 1

    # Total work in units of 1e9 operations: 2*M*N*K per multiplication.
    flops = (2.e-9 * M * N) * float(K * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L, flops))

    na, nb, nc, alfa, beta = M * K, K * N, M * N, 1., 0.

    # The overhead timer covers CUBLAS start-up, allocation and upload.
    t0 = time()
    cublasInit()
    d_A = c_void_p()
    d_B = c_void_p()
    d_C = c_void_p()
    try:
        h_A = (c_float * na)()
        h_B = (c_float * nb)()
        h_C = (c_float * nc)()
        g_C = (c_float * nc)()  # receives the device result for comparison
        arrayInit(h_A, na)
        arrayInit(h_B, nb)
        arrayInit(h_C, nc)

        cublasAlloc(na, sizeof(c_float), byref(d_A))
        cublasAlloc(nb, sizeof(c_float), byref(d_B))
        cublasAlloc(nc, sizeof(c_float), byref(d_C))
        cublasSetVector(na, sizeof(c_float), h_A, 1, d_A, 1)
        cublasSetVector(nb, sizeof(c_float), h_B, 1, d_B, 1)
        cublasSetVector(nc, sizeof(c_float), h_C, 1, d_C, 1)
        tt = t0 = time() - t0
        print("Overhead CUBLAS: %.3f sec\n" % t0)

        t0 = time()
        for _ in range(L):
            cublasSgemm('n', 'n', M, N, K, alfa, d_A, M, d_B, K, beta, d_C, M)
        cudaThreadSynchronize()
        t0 = time() - t0
        tt += t0  # tt = setup overhead + compute time

        print("Processing time: %.3g (%.3g) sec" % (t0, tt))
        print("Gigaflops GPU: %.2f (%.2f)" % (_ratio(flops, t0), _ratio(flops, tt)))

        t1 = time()
        for _ in range(L):
            # presumably writes the product into h_C -- confirm against sgemm
            sgemm(h_C, h_A, h_B, M, N, K)
        t1 = time() - t1

        print("\nProcessing time: %.3g sec" % t1)
        print("Gigaflops CPU : %.2f" % _ratio(flops, t1))
        print("Speedup GPU/CPU: %.2f" % _ratio(t1, t0))

        cublasGetVector(nc, sizeof(c_float), d_C, 1, g_C, 1)
        err, mxe = checkError(h_C, g_C, nc)
        print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
    finally:
        try:
            for ptr in (d_A, d_B, d_C):
                # Only free buffers that were actually allocated.
                if ptr.value:
                    cublasFree(ptr)
        finally:
            cublasShutdown()