def runTest(vlength=128, loops=1):
    """Time CUBLAS SAXPY on the GPU against cpuSAXPY and print the results.

    Parameters:
        vlength -- the vectors hold vlength * vlength single-precision
                   elements.
        loops   -- number of SAXPY calls timed on each side.

    Prints the elapsed time and gigaflops for both sides, the CPU/GPU time
    ratio, and the two error figures returned by checkError. Returns None.

    CUBLAS resources are released even when a step in between raises.
    """
    n2 = vlength * vlength
    alfa = c_float(.5)

    def safe_div(numerator, denominator):
        # A clock too coarse to resolve a short run reports 0.0 elapsed;
        # report an infinite value instead of raising ZeroDivisionError.
        if denominator > 0:
            return numerator / denominator
        return float("inf")

    cublasInit()
    try:
        h_X = (c_float * n2)()
        h_Y = (c_float * n2)()
        g_Y = (c_float * n2)()  # receives the GPU result for comparison
        vectorInit(h_X)
        vectorInit(h_Y)

        d_X = c_void_p()
        d_Y = c_void_p()
        cublasAlloc(n2, sizeof(c_float), byref(d_X))
        cublasAlloc(n2, sizeof(c_float), byref(d_Y))
        try:
            cublasSetVector(n2, sizeof(c_float), h_X, 1, d_X, 1)
            cublasSetVector(n2, sizeof(c_float), h_Y, 1, d_Y, 1)

            # 2 floating-point operations per element, expressed in units
            # of 1e9 operations.
            flops = (2.e-9 * n2) * float(loops)

            t0 = time()
            for _ in range(loops):
                cublasSaxpy(n2, alfa, d_X, 1, d_Y, 1)
            # Wait for the queued GPU work before stopping the clock.
            cudaThreadSynchronize()
            t0 = time() - t0

            print("Processing time: %.3g sec" % t0)
            print("Gigaflops GPU: %.2f (%d)" % (safe_div(flops, t0), n2))

            t1 = time()
            for _ in range(loops):
                cpuSAXPY(alfa, h_X, h_Y)
            t1 = time() - t1

            print("\nProcessing time: %.3g sec" % t1)
            print("Gigaflops CPU: %.2f" % safe_div(flops, t1))
            print("GPU vs. CPU  : %.2f" % safe_div(t1, t0))

            cublasGetVector(n2, sizeof(c_float), d_Y, 1, g_Y, 1)
            err, mxe = checkError(h_Y, g_Y)
            print("\nAvg and max rel error = %.2e %.2e" % (err, mxe))
        finally:
            cublasFree(d_X)
            cublasFree(d_Y)
    finally:
        cublasShutdown()
def main(device, vlength=128, loops=1):
    """Benchmark the "gpuSAXPY" kernel against cpuSAXPY and print one row.

    Parameters:
        device  -- object whose ``functions`` mapping holds the "gpuSAXPY"
                   kernel handle.
        vlength -- number of single-precision elements per vector.
        loops   -- number of kernel launches / CPU calls that are timed.

    Prints vlength, CPU gigaflops and GPU gigaflops on one line. When the
    module-level checkErrorFlag is true, also prints the two error figures
    returned by checkError. Returns None.
    """
    alfa = c_float(.5)
    n2 = vlength  ## Vector length
    gpuSAXPY = device.functions["gpuSAXPY"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    g_Y = (c_float * n2)()  # receives the GPU result for comparison
    fixedInit(h_X)

    # NOTE(review): getMemory presumably allocates device memory and uploads
    # the host array -- confirm against its definition.
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    try:
        cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1)
        # Kernel argument block: alfa at byte offset 0, then d_X, d_Y and n2
        # at offsets 4, 8 and 12 -- 16 bytes in total.
        cuParamSetf(gpuSAXPY, 0, alfa)
        cuParamSeti(gpuSAXPY, 4, d_X)
        cuParamSeti(gpuSAXPY, 8, d_Y)
        cuParamSeti(gpuSAXPY, 12, n2)
        cuParamSetSize(gpuSAXPY, 16)

        # Drain pending work so that only the launches below are timed.
        cuCtxSynchronize()
        t0 = time()
        for _ in range(loops):
            cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1)
        cuCtxSynchronize()
        t0 = time() - t0

        cuMemcpyDtoH(g_Y, d_Y, n2 * S4)
        cuCtxSynchronize()
    finally:
        # Release the device buffers even if setup, launch or copy failed.
        cuMemFree(d_X)
        cuMemFree(d_Y)

    # 2 floating-point operations per element, in units of 1e9 operations.
    flops = (2.e-9 * n2) * float(loops)

    t1 = time()
    for _ in range(loops):
        cpuSAXPY(alfa, h_X, h_Y)
    t1 = time() - t1

    # A clock too coarse to resolve a short run reports 0.0 elapsed; print
    # an infinite rate instead of raising ZeroDivisionError.
    if t1 > 0:
        cpu_gflops = flops / t1
    else:
        cpu_gflops = float("inf")
    if t0 > 0:
        gpu_gflops = flops / t0
    else:
        gpu_gflops = float("inf")
    print("%10d%6.2f%6.2f" % (vlength, cpu_gflops, gpu_gflops))

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))
def main(device, vlength=128, loops=1):
    """Benchmark the "gpuSAXPY" kernel against cpuSAXPY and print one row.

    Parameters:
        device  -- object whose ``functions`` mapping holds the "gpuSAXPY"
                   kernel handle.
        vlength -- number of single-precision elements per vector.
        loops   -- number of kernel launches / CPU calls that are timed.

    Prints vlength, CPU gigaflops and GPU gigaflops on one line. When the
    module-level checkErrorFlag is true, also prints the two error figures
    returned by checkError. Returns None.
    """
    alfa = c_float(0.5)
    n2 = vlength  ## Vector length
    gpuSAXPY = device.functions["gpuSAXPY"]

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    g_Y = (c_float * n2)()  # receives the GPU result for comparison
    fixedInit(h_X)

    # NOTE(review): getMemory presumably allocates device memory and uploads
    # the host array -- confirm against its definition.
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    try:
        cuFuncSetBlockShape(gpuSAXPY, BLOCK_SIZE, 1, 1)
        # Kernel argument block: alfa at byte offset 0, then d_X, d_Y and n2
        # at offsets 4, 8 and 12 -- 16 bytes in total.
        cuParamSetf(gpuSAXPY, 0, alfa)
        cuParamSeti(gpuSAXPY, 4, d_X)
        cuParamSeti(gpuSAXPY, 8, d_Y)
        cuParamSeti(gpuSAXPY, 12, n2)
        cuParamSetSize(gpuSAXPY, 16)

        # Drain pending work so that only the launches below are timed.
        cuCtxSynchronize()
        t0 = time()
        for _ in range(loops):
            cuLaunchGrid(gpuSAXPY, GRID_SIZE, 1)
        cuCtxSynchronize()
        t0 = time() - t0

        cuMemcpyDtoH(g_Y, d_Y, n2 * S4)
        cuCtxSynchronize()
    finally:
        # Release the device buffers even if setup, launch or copy failed.
        cuMemFree(d_X)
        cuMemFree(d_Y)

    # 2 floating-point operations per element, in units of 1e9 operations.
    flops = (2.0e-9 * n2) * float(loops)

    t1 = time()
    for _ in range(loops):
        cpuSAXPY(alfa, h_X, h_Y)
    t1 = time() - t1

    # A clock too coarse to resolve a short run reports 0.0 elapsed; print
    # an infinite rate instead of raising ZeroDivisionError.
    if t1 > 0:
        cpu_gflops = flops / t1
    else:
        cpu_gflops = float("inf")
    if t0 > 0:
        gpu_gflops = flops / t0
    else:
        gpu_gflops = float("inf")
    print("%10d%6.2f%6.2f" % (vlength, cpu_gflops, gpu_gflops))

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))
def main(vlength=128, loops=1):
    """Benchmark gpuSAXPY (runtime-API launch) against cpuSAXPY.

    Parameters:
        vlength -- number of single-precision elements per vector.
        loops   -- number of kernel launches / CPU calls that are timed.

    Prints vlength, CPU gigaflops and GPU gigaflops on one line. When the
    module-level checkErrorFlag is true, also prints the two error figures
    returned by checkError. Returns None.
    """
    alfa = c_float(.5)
    n2 = vlength  ## Vector length

    h_X = (c_float * n2)()
    h_Y = (c_float * n2)()
    g_Y = (c_float * n2)()  # receives the GPU result for comparison
    fixedInit(h_X)

    # NOTE(review): getMemory presumably allocates device memory and uploads
    # the host array -- confirm against its definition.
    d_X = getMemory(h_X)
    d_Y = getMemory(h_Y)
    try:
        blockDim = dim3(BLOCK_SIZE, 1, 1)
        gridDim = dim3(GRID_SIZE, 1, 1)

        # Drain pending work BEFORE starting the clock so that only the
        # launches below are timed.
        cudaThreadSynchronize()
        t0 = time()
        for _ in range(loops):
            cudaConfigureCall(gridDim, blockDim, 0, 0)
            gpuSAXPY(alfa, d_X, d_Y, n2)
        cudaThreadSynchronize()
        t0 = time() - t0

        cudaMemcpy(g_Y, d_Y, S4 * n2, cudaMemcpyDeviceToHost)
        cudaThreadSynchronize()
    finally:
        # Release the device buffers and the CUDA thread state even if the
        # launch or the copy back failed.
        cudaFree(d_X)
        cudaFree(d_Y)
        cudaThreadExit()

    # 2 floating-point operations per element, in units of 1e9 operations.
    flops = (2.e-9 * n2) * float(loops)

    t1 = time()
    for _ in range(loops):
        cpuSAXPY(alfa, h_X, h_Y)
    t1 = time() - t1

    # A clock too coarse to resolve a short run reports 0.0 elapsed; print
    # an infinite rate instead of raising ZeroDivisionError.
    if t1 > 0:
        cpu_gflops = flops / t1
    else:
        cpu_gflops = float("inf")
    if t0 > 0:
        gpu_gflops = flops / t0
    else:
        gpu_gflops = float("inf")
    print("%10d%6.2f%6.2f" % (vlength, cpu_gflops, gpu_gflops))

    if checkErrorFlag:
        err, mxe = checkError(h_Y, g_Y)
        print("Avg and max rel error = %.2e %.2e" % (err, mxe))