def main(vlength=128, loops=1): n2 = vlength ## Vector length h_X = (c_float * n2)() h_Y = (c_float * n2)() h_Z = (c_float * n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) blockDim = dim3(BLOCK_SIZE, 1, 1) gridDim = dim3(GRID_SIZE, 1, 1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim, blockDim, 0, 0) gpuTRIG(d_Y, d_Z, d_X, n2) cudaThreadSynchronize() t0 = time() - t0 flops = (2.e-9 * n2) * float(loops) g_Y = (c_float * n2)() cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost) cudaThreadSynchronize() flops = (8.e-9 * n2) * float(loops) g_Y = (c_float * n2)() g_Z = (c_float * n2)() cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost) cudaMemcpy(g_Z, d_Z, S4 * n2, cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaFree(d_Z) cudaThreadExit() t1 = time() for i in range(loops): cpuTRIG(h_Y, h_Z, h_X) t1 = time() - t1 print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y, n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe) err, mxe = checkError(h_Z, g_Z, n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
def main(vlength = 128,loops = 1): n2 = vlength ## Vector length h_X = (c_float*n2)() h_Y = (c_float*n2)() h_Z = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) blockDim = dim3(BLOCK_SIZE,1,1) gridDim = dim3(GRID_SIZE,1,1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim,blockDim,0,0) gpuTRIG(d_Y,d_Z,d_X,n2) cudaThreadSynchronize() t0 = time()-t0 flops = (2.e-9*n2)*float(loops) g_Y = (c_float*n2)() cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() flops = (8.e-9*n2)*float(loops) g_Y = (c_float*n2)() g_Z = (c_float*n2)() cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaMemcpy(g_Z,d_Z,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaFree(d_Z) cudaThreadExit() t1 = time() for i in range(loops): cpuTRIG(h_Y,h_Z,h_X) t1 = time()-t1 print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y,n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe) err,mxe = checkError(h_Z,g_Z,n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
def main(device,vlength = 128,loops = 1): n2 = vlength ## Vector length gpuTRIG = device.functions["gpuTRIG"] h_X = (c_float*n2)() h_Y = (c_float*n2)() h_Z = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) cuFuncSetBlockShape(gpuTRIG,BLOCK_SIZE,1,1) cuParamSeti(gpuTRIG,0,d_Y) cuParamSeti(gpuTRIG,4,d_Z) cuParamSeti(gpuTRIG,8,d_X) cuParamSeti(gpuTRIG,12,n2) cuParamSetSize(gpuTRIG,16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuTRIG,GRID_SIZE,1) cuCtxSynchronize() t0 = time()-t0 flops = (8.e-9*n2)*float(loops) g_Y = (c_float*n2)() g_Z = (c_float*n2)() cuMemcpyDtoH(g_Y,d_Y,S4*n2) cuMemcpyDtoH(g_Z,d_Z,S4*n2) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) cuMemFree(d_Z) t1 = time() for i in range(loops): cpuTRIG(h_Y,h_Z,h_X) t1 = time()-t1 print "%10d%6.2f%6.2f GFlops" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y,n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err,mxe) err,mxe = checkError(h_Z,g_Z,n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err,mxe)
def main(device, vlength=128, loops=1): n2 = vlength ## Vector length gpuTRIG = device.functions["gpuTRIG"] h_X = (c_float * n2)() h_Y = (c_float * n2)() h_Z = (c_float * n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) d_Z = getMemory(h_Z) cuFuncSetBlockShape(gpuTRIG, BLOCK_SIZE, 1, 1) cuParamSeti(gpuTRIG, 0, d_Y) cuParamSeti(gpuTRIG, 4, d_Z) cuParamSeti(gpuTRIG, 8, d_X) cuParamSeti(gpuTRIG, 12, n2) cuParamSetSize(gpuTRIG, 16) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(gpuTRIG, GRID_SIZE, 1) cuCtxSynchronize() t0 = time() - t0 flops = (8.e-9 * n2) * float(loops) g_Y = (c_float * n2)() g_Z = (c_float * n2)() cuMemcpyDtoH(g_Y, d_Y, S4 * n2) cuMemcpyDtoH(g_Z, d_Z, S4 * n2) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) cuMemFree(d_Z) t1 = time() for i in range(loops): cpuTRIG(h_Y, h_Z, h_X) t1 = time() - t1 print "%10d%6.2f%6.2f GFlops" % (vlength, flops / t1, flops / t0) if checkErrorFlag: err, mxe = checkError(h_Y, g_Y, n2) print "Avg and max rel error (cos) = %.2e %.2e" % (err, mxe) err, mxe = checkError(h_Z, g_Z, n2) print "Avg and max rel error (sin) = %.2e %.2e" % (err, mxe)
def main(device,vlength = 128,loops = 1,m1 = 1): print "%5d %5d %5d" % (l,loops,m1), alfa = c_float(.5) n2 = vlength ## Vector length mp = 1 << (m1-1) print "%5d" % (mp*psize), fcn = "gpuPOLY%d"%(mp*psize) gpuPOLY = device.functions[fcn] h_X = (c_float*n2)() h_Y = (c_float*n2)() g_Y = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) cuFuncSetBlockShape(gpuPOLY,BLOCK_SIZE,1,1) cuParamSeti(gpuPOLY,0,d_X) cuParamSeti(gpuPOLY,4,d_Y) cuParamSeti(gpuPOLY,8,n2) cuParamSetSize(gpuPOLY,12) cuCtxSynchronize() cuLaunchGrid(gpuPOLY,GRID_SIZE,1) t0 = time() for i in range(loops): cuLaunchGrid(gpuPOLY,GRID_SIZE,1) cuCtxSynchronize() t0 = time()-t0 flops = (2.e-9*m1*n2*(psize-1))*float(loops) cuMemcpyDtoH(g_Y,d_Y,n2*S4) cuCtxSynchronize() cuMemFree(d_X) cuMemFree(d_Y) cpuPOLY = eval("cpuPOLY%d" % (mp*psize)) t1 = time() for i in range(loops): cpuPOLY(h_X,h_Y) t1 = time()-t1 print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y) print "Avg and max rel error = %.2e %.2e" % (err,mxe)
def runTest(vlength = 128,loops = 1): n2 = vlength*vlength alfa = c_float(.5) cublasInit() h_X = (c_float*n2)() h_Y = (c_float*n2)() g_Y = (c_float*n2)() vectorInit(h_X) vectorInit(h_Y) d_X = c_void_p() d_Y = c_void_p() cublasAlloc(n2, sizeof(c_float), byref(d_X)) cublasAlloc(n2, sizeof(c_float), byref(d_Y)) cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1) cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1) flops = (2.e-9*n2)*float(loops) t0 = time() for i in range(loops): cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1) cudaThreadSynchronize() t0 = time()-t0 print "Processing time: %.3g sec" % t0 print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2) t1 = time() for i in range(loops): cpuSAXPY(alfa,h_X,h_Y) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU: %.2f" % (flops/t1) print "GPU vs. CPU : %.2f" % (t1/t0) cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1) err,mxe = checkError(h_Y,g_Y) print "\nAvg and max rel error = %.2e %.2e" % (err,mxe) cublasFree(d_X) cublasFree(d_Y) cublasShutdown()
def main(vlength = 128,loops = 1,m1 = 1): print "%5d %5d %5d" % (l,loops,m1), alfa = c_float(.5) n2 = vlength ## Vector length mp = 1 << (m1-1) print "%5d" % (mp*psize), gpuPOLY = eval("gpuPOLY%d"%(mp*psize)) h_X = (c_float*n2)() h_Y = (c_float*n2)() g_Y = (c_float*n2)() vectorInit(h_X) d_X = getMemory(h_X) d_Y = getMemory(h_Y) blockDim = dim3(BLOCK_SIZE,1,1) gridDim = dim3(GRID_SIZE,1,1) t0 = time() cudaThreadSynchronize() for i in range(loops): cudaConfigureCall(gridDim,blockDim,0,0) gpuPOLY(d_X,d_Y,n2) cudaThreadSynchronize() t0 = time()-t0 flops = (2.e-9*m1*n2*(psize-1))*float(loops) cudaMemcpy(g_Y,d_Y,S4*n2,cudaMemcpyDeviceToHost) cudaThreadSynchronize() cudaFree(d_X) cudaFree(d_Y) cudaThreadExit() cpuPOLY = eval("cpuPOLY%d" % (mp*psize)) t1 = time() for i in range(loops): cpuPOLY(h_X,h_Y) t1 = time()-t1 print "%10d%6.2f%6.2f" % (vlength,flops/t1,flops/t0) if checkErrorFlag: err,mxe = checkError(h_Y,g_Y) print "Avg and max rel error = %.2e %.2e" % (err,mxe)
def runTest(vlength = 128,loops = 1): n2 = vlength*vlength alfa = c_float(.5) cublasInit() h_X = (c_float*n2)() h_Y = (c_float*n2)() vectorInit(h_X) vectorInit(h_Y) d_X = c_void_p() d_Y = c_void_p() cublasAlloc(n2, sizeof(c_float), byref(d_X)) cublasAlloc(n2, sizeof(c_float), byref(d_Y)) cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1) cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1) flops = (2.e-9*n2)*float(loops) s0 = 0. t0 = time() for i in range(loops): s0 += cublasSdot(n2, d_X, 1, d_Y, 1) cudaThreadSynchronize() t0 = time()-t0 print "Processing time: %.3g sec" % t0 print "Gigaflops GPU: %.2f (%d)" % (flops/t0,n2) s1 = 0. t1 = time() for i in range(loops): s1 += cpuSDOT(h_X,h_Y) t1 = time()-t1 print "\nProcessing time: %.3g sec" % t1 print "Gigaflops CPU: %.2f" % (flops/t1) print "GPU vs. CPU : %.2f" % (t1/t0) sx = max(1.e-7,max(abs(s0),abs(s1))) err = abs(s1-s0)/sx print "\nError = %.2e" % err cublasFree(d_X) cublasFree(d_Y) cublasShutdown()
def main(device,vlength = 128,loops = 1): print "+-----------------------+" print "| Simple TRIG Test |" print "| using CUDA driver API |" print "+-----------------------+" print "params: %2d %5dK %3d\n" % (log2n,vlength >> 10,loops), n2 = vlength ## Vector length # TRIGTex is about 1.5x faster than TRIG # name = "TRIG" name = "TRIGTex" TRIG = device.functions[name] mod0 = device.modules[0] sizeV = S4*n2 h_Arg = (c_float*n2)() h_Cos = (c_float*n2)() h_Sin = (c_float*n2)() vectorInit(h_Arg) d_Arg = getMemory(h_Arg) d_Cos = getMemory(n2) d_Sin = getMemory(n2) tex = devMemToTex(mod0,"Arg",d_Arg,sizeV) cuFuncSetBlockShape(TRIG,BLOCK_SIZE,1,1) cuParamSeti(TRIG,0,d_Cos) cuParamSeti(TRIG,4,d_Sin) if name != "TRIGTex": cuParamSeti(TRIG,8,d_Arg) cuParamSeti(TRIG,12,n2) cuParamSetSize(TRIG,16) else: cuParamSetTexRef(TRIG,CU_PARAM_TR_DEFAULT,tex) cuParamSeti(TRIG,8,n2) cuParamSetSize(TRIG,12) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(TRIG,GRID_SIZE,1) cuCtxSynchronize() t0 = time()-t0 g_Cos = (c_float*n2)() g_Sin = (c_float*n2)() cuMemcpyDtoH(g_Cos,d_Cos,sizeV) cuMemcpyDtoH(g_Sin,d_Sin,sizeV) cuCtxSynchronize() cuMemFree(d_Arg) cuMemFree(d_Cos) cuMemFree(d_Sin) t1 = time() for i in range(loops): cpuTRIG(h_Cos,h_Sin,h_Arg) t1 = time()-t1 flopsg = (2.e-6*n2)*float(loops) flopsc = flopsg t0 *= 1.e3; t1 *= 1.e3; print "\n time[msec] GFlops\n" print "GPU: %12.1f%10.2f" % (t0,flopsg/t0) print "CPU: %12.1f%10.2f" % (t1,flopsc/t1) print " %12.1f" % (t1/t0) x = float(1 << 23) e,m = checkTrig(g_Cos,g_Sin) print "\n",name, "internal check GPU" print "%8.1e %8.1e" % (e,m) print "%8.1f %8.1f" % (e*x,m*x) e,m = checkTrig(h_Cos,h_Sin) print "\n",name, "internal check CPU" print "%8.1e %8.1e" % (e,m) print "%8.1f %8.1f" % (e*x,m*x) print "\n","check between CPU and GPU" err,mxe = checkError(h_Cos,g_Cos) print "Avg and max abs error (cos) = %8.1e %8.1e" % (err,mxe) print " %8.1f %8.1f" % (err*x,mxe*x) err,mxe = checkError(h_Sin,g_Sin) print "Avg and max abs error (sin) = %8.1e %8.1e" % (err,mxe) print " %8.1f %8.1f" % (err*x,mxe*x)
def main(device, vlength=128, loops=1): print "+-----------------------+" print "| Simple TRIG Test |" print "| using CUDA driver API |" print "+-----------------------+" print "params: %2d %5dK %3d\n" % (log2n, vlength >> 10, loops), n2 = vlength ## Vector length # TRIGTex is about 1.5x faster than TRIG # name = "TRIG" name = "TRIGTex" TRIG = device.functions[name] mod0 = device.modules[0] sizeV = S4 * n2 h_Arg = (c_float * n2)() h_Cos = (c_float * n2)() h_Sin = (c_float * n2)() vectorInit(h_Arg) d_Arg = getMemory(h_Arg) d_Cos = getMemory(n2) d_Sin = getMemory(n2) tex = devMemToTex(mod0, "Arg", d_Arg, sizeV) cuFuncSetBlockShape(TRIG, BLOCK_SIZE, 1, 1) cuParamSeti(TRIG, 0, d_Cos) cuParamSeti(TRIG, 4, d_Sin) if name != "TRIGTex": cuParamSeti(TRIG, 8, d_Arg) cuParamSeti(TRIG, 12, n2) cuParamSetSize(TRIG, 16) else: cuParamSetTexRef(TRIG, CU_PARAM_TR_DEFAULT, tex) cuParamSeti(TRIG, 8, n2) cuParamSetSize(TRIG, 12) cuCtxSynchronize() t0 = time() for i in range(loops): cuLaunchGrid(TRIG, GRID_SIZE, 1) cuCtxSynchronize() t0 = time() - t0 g_Cos = (c_float * n2)() g_Sin = (c_float * n2)() cuMemcpyDtoH(g_Cos, d_Cos, sizeV) cuMemcpyDtoH(g_Sin, d_Sin, sizeV) cuCtxSynchronize() cuMemFree(d_Arg) cuMemFree(d_Cos) cuMemFree(d_Sin) t1 = time() for i in range(loops): cpuTRIG(h_Cos, h_Sin, h_Arg) t1 = time() - t1 flopsg = (2.e-6 * n2) * float(loops) flopsc = flopsg t0 *= 1.e3 t1 *= 1.e3 print "\n time[msec] GFlops\n" print "GPU: %12.1f%10.2f" % (t0, flopsg / t0) print "CPU: %12.1f%10.2f" % (t1, flopsc / t1) print " %12.1f" % (t1 / t0) x = float(1 << 23) e, m = checkTrig(g_Cos, g_Sin) print "\n", name, "internal check GPU" print "%8.1e %8.1e" % (e, m) print "%8.1f %8.1f" % (e * x, m * x) e, m = checkTrig(h_Cos, h_Sin) print "\n", name, "internal check CPU" print "%8.1e %8.1e" % (e, m) print "%8.1f %8.1f" % (e * x, m * x) print "\n", "check between CPU and GPU" err, mxe = checkError(h_Cos, g_Cos) print "Avg and max abs error (cos) = %8.1e %8.1e" % (err, mxe) print " %8.1f %8.1f" % (err * x, mxe * x) err, mxe = checkError(h_Sin, g_Sin) print "Avg and max abs error (sin) = %8.1e %8.1e" % (err, mxe) print " %8.1f %8.1f" % (err * x, mxe * x)