def main(N=1024, L=100):
    """Benchmark the custom GPU SGEMM kernel (CUDA runtime API) against the CPU.

    The problem is derived from ``N``: A is M x K, B is K x N' and C is
    M x N', with M = N, K = N / 2 and N' = 2 * N.  The kernel is launched
    ``L`` times, the result is copied back, and the same product is then
    computed ``L`` times on the host with ``sgemm`` for timing and error
    comparison.  All results are printed; nothing is returned.

    Args:
        N: base dimension from which M, K and the final N are derived.
        L: number of repetitions of the multiplication on each side.
    """
    M = N
    K = N >> 1
    N = N << 1
    # Total work for all L repetitions, in units of 1e9 floating point ops.
    flops = (2.e-9 * M * N) * float(K * L)
    # Single-argument print(...) produces identical output under the
    # Python 2 print statement and the Python 3 print function.
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L, flops))

    # --- setup (timed as "overhead") -------------------------------------
    t0 = time()
    sizeA = M * K
    sizeB = K * N
    sizeC = M * N
    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    # NOTE(review): getMemory presumably uploads a host array when given one
    # and only allocates when given an element count -- confirm in helper.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim = dim3(BLOCK_SIZE, BLOCK_SIZE, 1)
    # Floor division keeps the grid extents integral regardless of whether
    # true division is in effect.
    gridDim = dim3(N // BLOCK_SIZE, M // BLOCK_SIZE, 1)
    sharedMem = S4 * 2 * BLOCK_SIZE * BLOCK_SIZE
    tt = t0 = time() - t0
    print("Overhead runtime API: %.3f sec\n" % t0)

    # --- GPU timing --------------------------------------------------------
    t0 = time()
    cudaThreadSynchronize()
    for i in range(L):
        cudaConfigureCall(gridDim, blockDim, sharedMem, 0)
        gpuSGEMM(d_C, d_A, d_B, K, N)
    # Wait for the queued launches so the wall-clock delta covers them.
    cudaThreadSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cudaMemcpy(h_C, d_C, S4 * sizeC, cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)
    cudaThreadExit()

    # Second figure in parentheses includes the setup overhead.
    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # --- CPU reference -----------------------------------------------------
    ref = (c_float * sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
kr = int(sys.argv[1]) dims = tuple([int(x) for x in sys.argv[2].split(",")]) except IndexError: sys.exit() doComplex = False if kr < 0: kr = -kr doComplex = True size = reduce(lambda x, y: x * y, dims) if doComplex: r = (c_float * (size * 2))() else: r = (c_float * size)() arrayInit(r) sz = 1.e6 / float(size) fftw_start = time.clock() wall_start = time.time() xr = float(.5) / float(kr) if doComplex: text = "complex" rcfftx = xfft.ccfft crfftx = xfft.icfft else: text = " real" rcfftx = xfft.rcfft
def main(N = 1024,L = 100):
    """Time SGEMM through CUBLAS and compare it with the host ``sgemm``.

    Matrix shapes are derived from ``N``: A is M x K, B is K x N' and C is
    M x N' where M = N, K = N / 2 and N' = 2 * N.  ``cublasSgemm`` is issued
    ``L`` times, then the host ``sgemm`` is run ``L`` times, and timings,
    throughput and the relative error between both results are printed.

    Args:
        N: base dimension from which the three matrix extents are derived.
        L: number of repetitions on the GPU and on the CPU.
    """
    m, k, n = N, N >> 1, N << 1
    # Work of all L repetitions expressed in 1e9 floating point operations.
    gflop = (2.e-9 * m * n) * float(k * L)
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (m, n, k, L, gflop))
    len_a, len_b, len_c = m * k, k * n, m * n
    alpha, beta = 1., 0.

    # ---- setup: counted as CUBLAS overhead ----
    started = time()
    cublasInit()
    host_a = (c_float * len_a)()
    host_b = (c_float * len_b)()
    host_c = (c_float * len_c)()
    gpu_result = (c_float * len_c)()
    for buf, count in ((host_a, len_a), (host_b, len_b), (host_c, len_c)):
        arrayInit(buf, count)

    dev_a = c_void_p()
    dev_b = c_void_p()
    dev_c = c_void_p()
    transfers = ((len_a, host_a, dev_a), (len_b, host_b, dev_b), (len_c, host_c, dev_c))
    # Allocate all three device buffers first, then upload, as before.
    for count, _, dev in transfers:
        cublasAlloc(count, sizeof(c_float), byref(dev))
    for count, host, dev in transfers:
        cublasSetVector(count, sizeof(c_float), host, 1, dev, 1)
    overhead = time() - started
    total = overhead
    print("Overhead CUBLAS: %.3f sec\n" % overhead)

    # ---- GPU run ----
    started = time()
    for _ in range(L):
        cublasSgemm('n', 'n', m, n, k, alpha, dev_a, m, dev_b, k, beta, dev_c, m)
    # Block until the queued GEMMs are done before reading the clock.
    cudaThreadSynchronize()
    gpu_time = time() - started
    total += gpu_time
    print("Processing time: %.3g (%.3g) sec" % (gpu_time, total))
    print("Gigaflops GPU: %.2f (%.2f)" % (gflop / gpu_time, gflop / total))

    # ---- CPU run (overwrites host_c with the reference product) ----
    started = time()
    for _ in range(L):
        sgemm(host_c, host_a, host_b, m, n, k)
    cpu_time = time() - started
    print("\nProcessing time: %.3g sec" % cpu_time)
    print("Gigaflops CPU : %.2f" % (gflop / cpu_time))
    print("Speedup GPU/CPU: %.2f" % (cpu_time / gpu_time))

    # ---- compare and release ----
    cublasGetVector(len_c, sizeof(c_float), dev_c, 1, gpu_result, 1)
    avg_err, max_err = checkError(host_c, gpu_result, len_c)
    print("\nAvg and max rel error = %.2e %.2e" % (avg_err, max_err))

    for dev in (dev_a, dev_b, dev_c):
        cublasFree(dev)
    cublasShutdown()
def main(check=False, doComplex=False, dims=(128, )):
    """Time a forward + inverse FFT on the GPU and report round-trip error.

    A host array is initialised, transformed forward and back on the device
    through the ``gf`` helpers, copied back and rescaled by 1/size, and then
    compared with the original input.  With ``check`` set, the same round
    trip is performed on the host through ``xf`` for timing and comparison.
    All results are printed; nothing is returned.

    Args:
        check: also run and time the CPU round trip.
        doComplex: use complex-to-complex transforms instead of
            real-to-complex / complex-to-real.
        dims: transform extents; the banner table supports 1 to 3 entries.
    """
    # Single-argument print(...) is output-identical under the Python 2
    # print statement and the Python 3 print function.
    print("+------------------------+")
    print("| Fast Fourier Transform |")
    print("| using CUDA runtime API |")
    print("+------------------------+\n")
    dims = tuple(dims)
    ndim = len(dims)
    v = ("", "NX = %d", "NX = %d NY = %d", "NX = %d NY = %d NZ = %d")

    # SC: number of points; SR: float count of the half-spectrum buffer,
    # i.e. leading extents times 2 * (last // 2 + 1).  Floor division is
    # required because SR is used as a buffer length.
    SC = reduce(lambda x, y: x * y, dims)
    SR = reduce(lambda x, y: x * y, dims[:-1], 1)
    SR *= 2 * (dims[-1] // 2 + 1)
    print(v[ndim] % dims)
    print("< doComplex: %s >\n" % doComplex)

    rz = 1. / float(SC)
    # 2 * 5 * n * log2(n) * 1e-9 -- presumably the customary FFT operation
    # estimate counted for both directions.
    flops = 2. * 5. * SC * log(SC) / log(2.) * 1.e-9
    if doComplex:
        # Complex data occupies two floats per point.
        SC *= 2
    S4 = sizeof(c_float)

    h_A = (c_float * SC)()
    g_A = (c_float * SC)()
    arrayInit(h_A)
    d_A = getMemory(h_A)

    # NOTE(review): `allocate` is hard-wired to True, so the non-allocating
    # branch further down never runs; it is kept as a manual toggle.
    allocate = True
    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims, CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims, CUFFT_R2C)
        plan2 = gf.makePlan(dims, CUFFT_C2R)

    # --- timed GPU round trip ----------------------------------------------
    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()
    if doComplex:
        d_B = gf.ccfft(plan, d_A, None, d_B)
        d_A = gf.icfft(plan, d_B, None, d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1, d_A, None, d_B)
            d_A = gf.crfft(plan2, d_B, None, d_A)
        else:
            d_B = gf.rcfft(plan1, d_A, SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2, d_B, SR)
    cudaThreadSynchronize()
    t0 = time() - t0
    x1 = ReadTimestampCounter()

    # Tick-to-microsecond factor for an assumed 2.8 GHz counter rate.
    counter_ghz = 2.8
    fc = 1.e-3 / counter_ghz
    print("RDTSC: %.0f µs" % ((x1 - x0) * fc))

    cudaMemcpy(g_A, d_A, S4 * SC, cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)
    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)
    cudaThreadExit()

    # Rescale by 1/size so the round-trip result is comparable to the input.
    scale(g_A, rz)

    print("\nProcessing time: %.3g sec" % t0)
    print("Gigaflops GPU : %.2f" % (flops / t0))
    gflops = (flops / t0, )

    print("\nError CPU initial vs GPU")
    err, mxe = checkError(h_A, g_A)
    stats = err, mxe
    print("Avg and max rel error = %.2e %.2e\n" % (err, mxe))

    # Speed-up column of the summary line; -1 marks "CPU run not performed".
    f = (-1., )
    if check:
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A, dims)
            h_B = xf.icfft(h_B, dims)
        else:
            h_B = xf.rcfft(h_A, dims)
            h_B = xf.crfft(h_B, dims)
        t1 = time() - t1
        print("Processing time: %.3g sec" % t1)
        print("Gigaflops CPU : %.2f" % (flops / t1))
        print("Speedup GPU/CPU: %.2f" % (t1 / t0))

        print("\nError CPU final vs CPU initial")
        err, mxe = checkError(h_B, h_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))

        print("\nError CPU final vs GPU")
        err, mxe = checkError(h_B, g_A)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))
        f = (t1 / t0, )

    # Summary line: extents, GPU gigaflops, speed-up, GPU round-trip errors.
    fmt = "\n## " + " ".join(len(dims) * ["%3d"]) + " : %.1f %.1f: %.2e %.2e"
    print(fmt % (dims + gflops + f + stats))
def main(N=1024, L=100):
    """Benchmark the custom GPU SGEMM kernel (CUDA driver API) against the CPU.

    The kernel ``gpuSGEMM`` is loaded from ``gpuFunctions.cubin`` and
    launched ``L`` times on matrices derived from ``N`` (A is M x K, B is
    K x N', C is M x N' with M = N, K = N / 2, N' = 2 * N).  The result is
    copied back and compared with ``L`` host ``sgemm`` runs.  All results
    are printed; nothing is returned.

    Args:
        N: base dimension from which M, K and the final N are derived.
        L: number of repetitions of the multiplication on each side.
    """
    M = N
    K = N >> 1
    N = N << 1
    # Total work for all L repetitions, in units of 1e9 floating point ops.
    flops = (2.e-9 * M * N) * float(K * L)
    # Single-argument print(...) produces identical output under the
    # Python 2 print statement and the Python 3 print function.
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M, N, K, L, flops))

    # --- setup (timed as "overhead") -------------------------------------
    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M * K
    sizeB = K * N
    sizeC = M * N
    h_A = (c_float * sizeA)()
    h_B = (c_float * sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    # NOTE(review): getMemory presumably uploads a host array when given one
    # and only allocates when given an element count -- confirm in helper.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    cuFuncSetBlockShape(gpuSGEMM, BLOCK_SIZE, BLOCK_SIZE, 1)
    cuFuncSetSharedSize(gpuSGEMM, 2 * BLOCK_SIZE * BLOCK_SIZE * S4)
    # Kernel arguments are packed at consecutive 4-byte offsets (20 bytes).
    # NOTE(review): this stores the three device pointers as 4-byte values;
    # confirm it is valid for the pointer width of the target platform.
    cuParamSeti(gpuSGEMM, 0, d_C)
    cuParamSeti(gpuSGEMM, 4, d_A)
    cuParamSeti(gpuSGEMM, 8, d_B)
    cuParamSeti(gpuSGEMM, 12, K)
    cuParamSeti(gpuSGEMM, 16, N)
    cuParamSetSize(gpuSGEMM, 20)
    tt = t0 = time() - t0
    print("Overhead driver API: %.3f sec\n" % t0)

    # --- GPU timing --------------------------------------------------------
    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        # Floor division keeps the grid extents integral regardless of
        # whether true division is in effect.
        cuLaunchGrid(gpuSGEMM, N // BLOCK_SIZE, M // BLOCK_SIZE)
    # Wait for the queued launches so the wall-clock delta covers them.
    cuCtxSynchronize()
    t0 = time() - t0
    tt += t0

    h_C = (c_float * sizeC)()
    cuMemcpyDtoH(h_C, d_C, S4 * sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    # Second figure in parentheses includes the setup overhead.
    print("Processing time: %.3g (%.3g) sec" % (t0, tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops / t0, flops / tt))

    # --- CPU reference -----------------------------------------------------
    ref = (c_float * sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref, h_A, h_B, M, N, K)
    t1 = time() - t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU : %.2f" % (flops / t1))
    print("Speedup GPU/CPU: %.2f" % (t1 / t0))

    err, mxe = checkError(ref, h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
def main(check=False,doComplex=False,dims=(128,)):
    """Run a GPU FFT round trip, time it, and print error statistics.

    The input array is transformed forward and then back on the device via
    the ``gf`` helpers, copied to the host, multiplied by 1/size and
    compared with the untouched input.  When ``check`` is true the same
    forward/inverse pair is executed on the host via ``xf`` so that CPU
    time, speed-up and CPU-vs-GPU error can be reported as well.  The
    function only prints; it returns nothing.

    Args:
        check: additionally run and time the host round trip.
        doComplex: select complex-to-complex transforms; otherwise
            real-to-complex followed by complex-to-real is used.
        dims: transform extents; the banner table supports 1 to 3 entries.
    """
    # print(...) with one argument behaves the same as the Python 2 print
    # statement, so the output is unchanged.
    print("+------------------------+")
    print("| Fast Fourier Transform |")
    print("| using CUDA runtime API |")
    print("+------------------------+\n")
    dims = tuple(dims)
    ndim = len(dims)
    v = ("","NX = %d","NX = %d NY = %d","NX = %d NY = %d NZ = %d")

    # SC counts the points; SR is the float count of the half-spectrum
    # buffer: product of the leading extents times 2*(last//2+1).  It is
    # used as a buffer length, hence the explicit floor division.
    SC = reduce(lambda x,y:x*y,dims)
    SR = reduce(lambda x,y:x*y,dims[:-1],1)
    SR *= 2*(dims[-1]//2+1)
    print(v[ndim] % dims)
    print("< doComplex: %s >\n" % doComplex)

    rz = 1./float(SC)
    # 2*5*n*log2(n)*1e-9 -- presumably the usual FFT operation estimate,
    # doubled for the forward and inverse pass.
    flops = 2.*5.*SC*log(SC)/log(2.)*1.e-9
    if doComplex:
        # Two floats per complex point.
        SC *= 2
    S4 = sizeof(c_float)

    h_A = (c_float*SC)()
    g_A = (c_float*SC)()
    arrayInit(h_A)
    d_A = getMemory(h_A)

    # NOTE(review): `allocate` is fixed to True, which makes the
    # non-allocating branch below unreachable; kept as a manual switch.
    allocate = True
    if doComplex:
        d_B = getMemory(SC)
    elif allocate:
        d_B = getMemory(SR)

    if doComplex:
        plan = gf.makePlan(dims,CUFFT_C2C)
    else:
        plan1 = gf.makePlan(dims,CUFFT_R2C)
        plan2 = gf.makePlan(dims,CUFFT_C2R)

    # ---- timed section: forward + inverse transform on the device ----
    t0 = time()
    x0 = ReadTimestampCounter()
    cudaThreadSynchronize()
    if doComplex:
        d_B = gf.ccfft(plan,d_A,None,d_B)
        d_A = gf.icfft(plan,d_B,None,d_A)
    else:
        if allocate:
            d_B = gf.rcfft(plan1,d_A,None,d_B)
            d_A = gf.crfft(plan2,d_B,None,d_A)
        else:
            d_B = gf.rcfft(plan1,d_A,SR)
            cuMemFree(d_A)
            d_A = gf.crfft(plan2,d_B,SR)
    cudaThreadSynchronize()
    t0 = time()-t0
    x1 = ReadTimestampCounter()

    # Converts counter ticks to microseconds for an assumed 2.8 GHz rate.
    counter_ghz = 2.8
    fc = 1.e-3/counter_ghz
    print("RDTSC: %.0f µs" % ((x1-x0)*fc))

    cudaMemcpy(g_A,d_A,S4*SC,cudaMemcpyDeviceToHost)

    cudaFree(d_A)
    cudaFree(d_B)
    if doComplex:
        cufftDestroy(plan)
    else:
        cufftDestroy(plan1)
        cufftDestroy(plan2)
    cudaThreadExit()

    # Apply the 1/size factor so the result can be compared with the input.
    scale(g_A,rz)

    print("\nProcessing time: %.3g sec" % t0)
    print("Gigaflops GPU : %.2f" % (flops/t0))
    gflops = (flops/t0,)

    print("\nError CPU initial vs GPU")
    err,mxe = checkError(h_A,g_A)
    stats = err,mxe
    print("Avg and max rel error = %.2e %.2e\n" % (err,mxe))

    # Speed-up entry of the summary line; -1 means no CPU run was made.
    f = (-1.,)
    if check:
        t1 = time()
        if doComplex:
            h_B = xf.ccfft(h_A,dims)
            h_B = xf.icfft(h_B,dims)
        else:
            h_B = xf.rcfft(h_A,dims)
            h_B = xf.crfft(h_B,dims)
        t1 = time()-t1
        print("Processing time: %.3g sec" % t1)
        print("Gigaflops CPU : %.2f" % (flops/t1))
        print("Speedup GPU/CPU: %.2f" % (t1/t0))

        print("\nError CPU final vs CPU initial")
        err,mxe = checkError(h_B,h_A)
        print("Avg and max rel error = %.2e %.2e" % (err,mxe))

        print("\nError CPU final vs GPU")
        err,mxe = checkError(h_B,g_A)
        print("Avg and max rel error = %.2e %.2e" % (err,mxe))
        f = (t1/t0,)

    # Summary: extents, GPU gigaflops, speed-up, GPU round-trip errors.
    fmt = "\n## "+" ".join(len(dims)*["%3d"])+" : %.1f %.1f: %.2e %.2e"
    print(fmt % (dims+gflops+f+stats))
def main(N = 1024,L = 100):
    """Run the runtime-API SGEMM benchmark and print GPU vs. CPU results.

    From ``N`` the shapes M = N, K = N / 2 and N' = 2 * N are derived, so
    that A is M x K, B is K x N' and C is M x N'.  The GPU kernel is
    launched ``L`` times, its output is fetched, and the host ``sgemm`` is
    then executed ``L`` times to obtain a reference timing and result.
    Nothing is returned; all figures are printed.

    Args:
        N: base dimension from which M, K and the final N are derived.
        L: number of repetitions of the multiplication on each side.
    """
    M = N
    K = N >> 1
    N = N << 1
    # Work of all L repetitions, expressed in 1e9 floating point operations.
    flops = (2.e-9*M*N)*float(K*L)
    # print(...) with one argument behaves the same as the Python 2 print
    # statement, so the output is unchanged.
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops))

    # ---- setup, reported as "overhead" ----
    t0 = time()
    sizeA = M*K
    sizeB = K*N
    sizeC = M*N
    h_A = (c_float*sizeA)()
    h_B = (c_float*sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    # NOTE(review): getMemory appears to accept either a host array (upload)
    # or an element count (allocation only) -- verify against the helper.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    blockDim = dim3(BLOCK_SIZE,BLOCK_SIZE,1)
    # Explicit floor division: grid extents must stay integers even when
    # true division is active.
    gridDim = dim3(N//BLOCK_SIZE,M//BLOCK_SIZE,1)
    sharedMem = S4*2*BLOCK_SIZE*BLOCK_SIZE
    tt = t0 = time()-t0
    print("Overhead runtime API: %.3f sec\n" % t0)

    # ---- GPU run ----
    t0 = time()
    cudaThreadSynchronize()
    for i in range(L):
        cudaConfigureCall(gridDim,blockDim,sharedMem,0)
        gpuSGEMM(d_C,d_A,d_B,K,N)
    # Synchronize so the measured interval includes the queued launches.
    cudaThreadSynchronize()
    t0 = time()-t0
    tt += t0

    h_C = (c_float*sizeC)()
    cudaMemcpy(h_C,d_C,S4*sizeC,cudaMemcpyDeviceToHost)
    cudaThreadSynchronize()

    cudaFree(d_A)
    cudaFree(d_B)
    cudaFree(d_C)
    cudaThreadExit()

    # The parenthesised values include the setup overhead.
    print("Processing time: %.3g (%.3g) sec" % (t0,tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt))

    # ---- CPU reference ----
    ref = (c_float*sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref,h_A,h_B,M,N,K)
    t1 = time()-t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU : %.2f" % (flops/t1))
    print("Speedup GPU/CPU: %.2f" % (t1/t0))

    err,mxe = checkError(ref,h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err,mxe))
kr = int(sys.argv[1]) dims = tuple([int(x) for x in sys.argv[2].split(",")]) except IndexError: sys.exit() doComplex = False if kr < 0: kr = -kr doComplex = True size = reduce(lambda x,y:x*y,dims) if doComplex: r = (c_float*(size*2))() else: r = (c_float*size)() arrayInit(r) sz = 1.e6/float(size) fftw_start = time.clock() wall_start = time.time() xr = float(.5 )/float(kr) if doComplex: text = "complex" rcfftx = xfft.ccfft crfftx = xfft.icfft else: text = " real"
def main(N = 1024,L = 100):
    """Run the driver-API SGEMM benchmark and print GPU vs. CPU results.

    The ``gpuSGEMM`` kernel is fetched from ``gpuFunctions.cubin`` and
    launched ``L`` times.  From ``N`` the shapes M = N, K = N / 2 and
    N' = 2 * N are derived, so that A is M x K, B is K x N' and C is
    M x N'.  The device result is fetched and compared with ``L`` host
    ``sgemm`` runs.  Nothing is returned; all figures are printed.

    Args:
        N: base dimension from which M, K and the final N are derived.
        L: number of repetitions of the multiplication on each side.
    """
    M = N
    K = N >> 1
    N = N << 1
    # Work of all L repetitions, expressed in 1e9 floating point operations.
    flops = (2.e-9*M*N)*float(K*L)
    # print(...) with one argument behaves the same as the Python 2 print
    # statement, so the output is unchanged.
    print("M = %d, N = %d, K = %d, L = %d; GFlops = %.1f\n" % (M,N,K,L,flops))

    # ---- setup, reported as "overhead" ----
    t0 = time()
    device = cu_CUDA()
    device.getSourceModule("gpuFunctions.cubin")
    gpuSGEMM = device.getFunction("gpuSGEMM")

    sizeA = M*K
    sizeB = K*N
    sizeC = M*N
    h_A = (c_float*sizeA)()
    h_B = (c_float*sizeB)()
    arrayInit(h_A)
    arrayInit(h_B)
    # NOTE(review): getMemory appears to accept either a host array (upload)
    # or an element count (allocation only) -- verify against the helper.
    d_A = getMemory(h_A)
    d_B = getMemory(h_B)
    d_C = getMemory(sizeC)

    cuFuncSetBlockShape(gpuSGEMM,BLOCK_SIZE,BLOCK_SIZE,1)
    cuFuncSetSharedSize(gpuSGEMM,2*BLOCK_SIZE*BLOCK_SIZE*S4)
    # Five kernel arguments at successive 4-byte offsets, 20 bytes in all.
    # NOTE(review): the device pointers are written as 4-byte values here;
    # check that this matches the pointer width of the target platform.
    cuParamSeti(gpuSGEMM,0,d_C)
    cuParamSeti(gpuSGEMM,4,d_A)
    cuParamSeti(gpuSGEMM,8,d_B)
    cuParamSeti(gpuSGEMM,12,K)
    cuParamSeti(gpuSGEMM,16,N)
    cuParamSetSize(gpuSGEMM,20)
    tt = t0 = time()-t0
    print("Overhead driver API: %.3f sec\n" % t0)

    # ---- GPU run ----
    t0 = time()
    cuCtxSynchronize()
    for i in range(L):
        # Explicit floor division: grid extents must stay integers even
        # when true division is active.
        cuLaunchGrid(gpuSGEMM,N//BLOCK_SIZE,M//BLOCK_SIZE)
    # Synchronize so the measured interval includes the queued launches.
    cuCtxSynchronize()
    t0 = time()-t0
    tt += t0

    h_C = (c_float*sizeC)()
    cuMemcpyDtoH(h_C,d_C,S4*sizeC)
    cuCtxSynchronize()

    cuMemFree(d_A)
    cuMemFree(d_B)
    cuMemFree(d_C)
    cuCtxDetach(device.context)

    # The parenthesised values include the setup overhead.
    print("Processing time: %.3g (%.3g) sec" % (t0,tt))
    print("Gigaflops GPU: %.2f (%.2f)" % (flops/t0,flops/tt))

    # ---- CPU reference ----
    ref = (c_float*sizeC)()
    t1 = time()
    for i in range(L):
        sgemm(ref,h_A,h_B,M,N,K)
    t1 = time()-t1
    print("\nProcessing time: %.3g sec" % t1)
    print("Gigaflops CPU : %.2f" % (flops/t1))
    print("Speedup GPU/CPU: %.2f" % (t1/t0))

    err,mxe = checkError(ref,h_C)
    print("\nAvg and max rel error = %.2e %.2e" % (err,mxe))