def main(device, loops=1):
    """Time the ``gpuGFLOPS`` kernel against ``cpuGFLOPS`` and print a summary.

    device -- object whose ``functions`` mapping holds the "gpuGFLOPS" kernel
    loops  -- how many kernel launches and how many CPU reference calls to time

    Prints one line: CPU seconds, CPU rate, GPU seconds, GPU rate and, in
    brackets, the hard-coded theoretical GPU peak.  Rates are the 1e-9-scaled
    operation counts divided by the elapsed wall-clock seconds.
    """
    kernel = device.functions["gpuGFLOPS"]
    cuFuncSetBlockShape(kernel, BLOCK_SIZE_G, 1, 1)

    # NOTE(review): the closing synchronize is placed after the launch loop so
    # the timer covers all queued launches -- confirm against the original
    # layout.
    gpu_start = time()
    for _ in range(loops):
        cuCtxSynchronize()
        cuLaunchGrid(kernel, GRID_SIZE_G, 1)
    cuCtxSynchronize()
    gpu_seconds = time() - gpu_start

    # 1.e-9 turns raw operation counts into giga-operations; the counts are
    # also multiplied by the number of timed repetitions.
    giga_scale = 1.e-9 * float(loops)
    cpu_gflop = 4096. * ITERATIONS_C * BLOCK_SIZE_C * giga_scale
    gpu_gflop = 4096. * ITERATIONS_G * BLOCK_SIZE_G * GRID_SIZE_G * giga_scale

    cpu_start = time()
    for _ in range(loops):
        cpuGFLOPS()
    cpu_seconds = time() - cpu_start

    # Theoretical peak of a 9800GT:
    #   14MP * 8SP/MP * 2flops/SP/clock * clock[GHz]
    # (for an 8600GTS the figures are 4. * 8. * 2. * 1.458)
    gpu_peak = 14. * 8. * 2. * 1.512

    # A single pre-formatted argument prints identically under the Python 2
    # print statement.
    summary = "%8.3f%8.2f%8.3f%8.2f [%.2f]" % (
        cpu_seconds, cpu_gflop / cpu_seconds,
        gpu_seconds, gpu_gflop / gpu_seconds,
        gpu_peak)
    print(summary)
def main(loops=1):
    """Run the GPU and CPU GFLOPS benchmarks and print one result line.

    loops -- how many times the kernel is launched and how many times
             ``cpuGFLOPS`` is called inside the timed sections

    Output columns: CPU seconds, CPU rate, GPU seconds, GPU rate and, in
    brackets, the hard-coded theoretical GPU peak.
    """
    block_shape = dim3(BLOCK_SIZE, 1, 1)
    grid_shape = dim3(GRID_SIZE, 1, 1)

    # NOTE(review): one synchronize before and one after the launch loop is
    # the assumed layout -- confirm against the original indentation.
    started = time()
    cudaThreadSynchronize()
    for _ in range(loops):
        cudaConfigureCall(grid_shape, block_shape, 0, 0)
        gpuGFLOPS()
    cudaThreadSynchronize()
    gpu_elapsed = time() - started
    cudaThreadExit()

    # Work of a single block; the GPU runs GRID_SIZE such blocks.
    per_block = 4096. * ITERATIONS * BLOCK_SIZE
    # 1.e-9 converts operation counts to giga-operations over all repetitions.
    scale = 1.e-9 * float(loops)
    gpu_work = per_block * GRID_SIZE * scale
    cpu_work = per_block * scale

    started = time()
    for _ in range(loops):
        cpuGFLOPS()
    cpu_elapsed = time() - started

    # MP count * 8SP/MP * 2flops/SP/clock * clock[GHz]
    # NOTE(review): the leading 4. is taken to be the multiprocessor count of
    # the target card -- confirm.
    peak = 4. * 8. * 2. * 1.458

    # A single pre-formatted argument prints identically under the Python 2
    # print statement.
    report = "%8.3f%8.2f%8.3f%8.2f [%.2f]" % (
        cpu_elapsed, cpu_work / cpu_elapsed,
        gpu_elapsed, gpu_work / gpu_elapsed,
        peak)
    print(report)
def main(device, loops=1):
    """Benchmark ``device.functions["gpuGFLOPS"]`` against ``cpuGFLOPS``.

    device -- provides the kernel through its ``functions`` mapping
    loops  -- repetition count for both the GPU launches and the CPU calls

    Emits a single formatted line with, in order: CPU time in seconds, CPU
    rate, GPU time in seconds, GPU rate, and the theoretical GPU peak in
    square brackets.
    """
    gpu_kernel = device.functions["gpuGFLOPS"]
    cuFuncSetBlockShape(gpu_kernel, BLOCK_SIZE_G, 1, 1)

    # --- GPU timing -------------------------------------------------------
    # NOTE(review): assumed layout is "synchronize + launch" per iteration
    # followed by one final synchronize -- confirm against the original.
    tick = time()
    for _ in range(loops):
        cuCtxSynchronize()
        cuLaunchGrid(gpu_kernel, GRID_SIZE_G, 1)
    cuCtxSynchronize()
    gpu_time = time() - tick

    # --- operation counts, scaled by 1.e-9 and by the repetition count ------
    repeat_scale = 1.e-9 * float(loops)
    cpu_ops = 4096. * ITERATIONS_C * BLOCK_SIZE_C
    gpu_ops = 4096. * ITERATIONS_G * BLOCK_SIZE_G * GRID_SIZE_G
    cpu_ops = cpu_ops * repeat_scale
    gpu_ops = gpu_ops * repeat_scale

    # --- CPU timing -------------------------------------------------------
    tick = time()
    for _ in range(loops):
        cpuGFLOPS()
    cpu_time = time() - tick

    # 9800GT peak: 14MP * 8SP/MP * 2flops/SP/clock * clock[GHz].
    # An 8600GTS would instead be 4. * 8. * 2. * 1.458.
    theoretical_peak = 14. * 8. * 2. * 1.512

    columns = (cpu_time, cpu_ops / cpu_time,
               gpu_time, gpu_ops / gpu_time,
               theoretical_peak)
    # A single pre-formatted argument prints identically under the Python 2
    # print statement.
    print("%8.3f%8.2f%8.3f%8.2f [%.2f]" % columns)
def main(loops=1):
    """Time the GPU kernel and the CPU reference, then print two result lines.

    loops -- how many kernel launches and ``cpuGFLOPS`` calls are timed

    First line: CPU seconds, CPU rate, GPU seconds, GPU rate and the
    theoretical GPU peak in brackets.  Second line: the CPU rate multiplied by
    2.8 and the GPU rate multiplied by 1.512 and divided by 112.
    """
    threads = dim3(BLOCK_SIZE_G, 1, 1)
    blocks = dim3(GRID_SIZE_G, 1, 1)

    # NOTE(review): one synchronize before and one after the launch loop is
    # the assumed layout -- confirm against the original indentation.
    gpu_t = time()
    cudaThreadSynchronize()
    for _ in range(loops):
        cudaConfigureCall(blocks, threads, 0, 0)
        gpuGFLOPS()
    cudaThreadSynchronize()
    gpu_t = time() - gpu_t
    cudaThreadExit()

    # Operation counts, scaled by 1.e-9 and by the repetition count.
    factor = 1.e-9 * float(loops)
    cpu_total = 4096. * ITERATIONS_C * BLOCK_SIZE_C * factor
    gpu_total = 4096. * ITERATIONS_G * BLOCK_SIZE_G * GRID_SIZE_G * factor

    cpu_t = time()
    for _ in range(loops):
        cpuGFLOPS()
    cpu_t = time() - cpu_t

    # 9800GT peak: 14MP * 8SP/MP * 2flops/SP/clock * clock[GHz].
    # (8600GTS: 4. * 8. * 2. * 1.458)
    peak_gpu = 14. * 8. * 2. * 1.512

    cpu_rate = cpu_total / cpu_t
    gpu_rate = gpu_total / gpu_t

    # Each print receives one pre-formatted string, which emits the same text
    # under the Python 2 print statement.
    print("%8.3f%8.2f%8.3f%8.2f [%.2f]" % (
        cpu_t, cpu_rate, gpu_t, gpu_rate, peak_gpu))
    # NOTE(review): 2.8 looks like a CPU clock in GHz and 112 equals 14 * 8;
    # the purpose of these scalings is not evident here -- confirm.
    print("%8.3f%8.2f" % (cpu_rate * 2.8, gpu_rate * 1.512 / 112))
def main(loops=1):
    """Benchmark ``gpuGFLOPS`` versus ``cpuGFLOPS`` and report the results.

    loops -- number of timed repetitions for the kernel launch and for the
             CPU reference call

    Prints two lines.  The first holds CPU seconds, CPU rate, GPU seconds,
    GPU rate and the bracketed theoretical GPU peak; the second holds the CPU
    rate times 2.8 and the GPU rate times 1.512 divided by 112.
    """
    block_dim = dim3(BLOCK_SIZE_G, 1, 1)
    grid_dim = dim3(GRID_SIZE_G, 1, 1)

    # GPU section.
    # NOTE(review): the trailing synchronize is assumed to sit after the
    # launch loop -- confirm against the original indentation.
    begin = time()
    cudaThreadSynchronize()
    for _ in range(loops):
        cudaConfigureCall(grid_dim, block_dim, 0, 0)
        gpuGFLOPS()
    cudaThreadSynchronize()
    elapsed_gpu = time() - begin
    cudaThreadExit()

    # Raw operation counts first, then scaled by 1.e-9 and by the repetitions.
    ops_cpu = 4096. * ITERATIONS_C * BLOCK_SIZE_C
    ops_gpu = 4096. * ITERATIONS_G * BLOCK_SIZE_G * GRID_SIZE_G
    giga = 1.e-9 * float(loops)
    ops_cpu *= giga
    ops_gpu *= giga

    # CPU section.
    begin = time()
    for _ in range(loops):
        cpuGFLOPS()
    elapsed_cpu = time() - begin

    # Peak for a 9800GT = 14MP * 8SP/MP * 2flops/SP/clock * clock[GHz];
    # an 8600GTS would be 4. * 8. * 2. * 1.458.
    peak = 14. * 8. * 2. * 1.512

    # Both prints receive one pre-formatted string, which emits the same text
    # under the Python 2 print statement.
    first_line = "%8.3f%8.2f%8.3f%8.2f [%.2f]" % (
        elapsed_cpu, ops_cpu / elapsed_cpu,
        elapsed_gpu, ops_gpu / elapsed_gpu,
        peak)
    # NOTE(review): 2.8 resembles a CPU clock in GHz and 112 equals 14 * 8;
    # the purpose of these scalings is not evident here -- confirm.
    second_line = "%8.3f%8.2f" % (
        ops_cpu / elapsed_cpu * 2.8,
        ops_gpu / elapsed_gpu * 1.512 / 112)
    print(first_line)
    print(second_line)