def doit(nvecs, nwarps, i_chunk, k_chunk):
    # Benchmark one (nvecs, nwarps, i_chunk, k_chunk) configuration of the
    # CUDA Legendre-transform kernel and print timing/accuracy diagnostics.
    #
    # NOTE(review): Python 2 code (print statements).  Relies on many names
    # defined elsewhere in the file: np, la, sys, cuda_profile,
    # CudaLegendreKernel, nblocks, nk, ni, nnz, repeat, m, odd, plan,
    # Lambda_0, Lambda_1, i_stops, q, check, a0 -- confirm against the full
    # module before reusing this function.
    #
    # Returns the transformed output array `a`; when `check` is truthy the
    # returned array is the 2-D slice a[1, :, :], otherwise the full 3-D
    # strided view -- NOTE(review): this asymmetry looks intentional for
    # interactive use, but verify at the call sites.
    nthreads = 32 * nwarps  # a CUDA warp is 32 threads
    print
    print
    print '=== nvecs={nvecs}, nthreads={nthreads}, i_chunk={i_chunk}, k_chunk={k_chunk}'.format(**locals())
    # Output buffer: one (2 * nk, nvecs) slab per block, C-ordered doubles.
    out = np.zeros((nblocks, 2 * nk, nvecs), dtype=np.double, order='C')
    kernel = CudaLegendreKernel(max_ni=ni, nthreads=nthreads,
                                nvecs=nvecs, k_chunk=k_chunk, i_chunk=i_chunk)
    if 0:  # deliberately disabled reduction micro-benchmark; flip to 1 to enable
        print '======== Reduction '
        with cuda_profile() as prof:
            for rep in range(repeat):
                output = np.zeros((nblocks, 2, 16, nwarps))
                kernel.test_reduce_kernel(output, repeat=1000, nblocks=nblocks)
        print prof.format('test_reduce_kernel',
                          nflops=nblocks * 2 * 16 * nthreads * 1000,
                          nwarps=nwarps)
    print '======== Legendre transform '
    # Time `repeat` launches of the transform under the CUDA profiler.
    with cuda_profile() as prof:
        for rep in range(repeat):
            kernel.transpose_legendre_transform(m, m + odd, plan.x_squared,
                                                Lambda_0, Lambda_1,
                                                i_stops, q, out)
    print prof.format('transpose_legendre_transform',
                      nflops=nblocks * nnz * (5 + 2 * nvecs),
                      nwarps=nwarps)
    # Output is stored in strided format
    a = out[:, ::2, :]
    if check:
        # All vector columns are expected to hold identical results
        # (same input replicated per j); warn when they diverge.
        if not np.all(a[:, :, 0:1] == a):
            print 'NOT ALL j EQUAL!'
        a = a[1, :, :]
        # Relative error against the reference result a0 computed elsewhere.
        print 'Error', la.norm(a - a0) / la.norm(a0)
    sys.stdout.flush()
    return a
# --- Healpix cuFFT benchmark (top-level script code) ---
# NOTE(review): Python 2 (print statements, dict.iteritems).  Depends on
# names not visible in this chunk: drv (presumably pycuda.driver), np,
# npix, nrings, lmax, cuda_profile, time -- verify against the full file.

# Page-locked host buffers (required for fast / asynchronous DMA transfers).
map = drv.pagelocked_zeros(npix, np.float64)  # NOTE: shadows the builtin `map`
buf = drv.pagelocked_zeros((nrings, (lmax + 1) // 2 + 1), np.complex128)
# Raw device allocations: 8 bytes per float64 pixel, 16 per complex128 coeff.
map_gpu = drv.mem_alloc(npix * 8)
buf_gpu = drv.mem_alloc(nrings * ((lmax + 1) // 2 + 1) * 16)
drv.memcpy_htod(map_gpu, map)

from wavemoth.cuda import cufft

print 'ctoring plan'
# NOTE(review): arguments look like (nside=2048, some count=8) hard-coded
# rather than derived from npix/lmax above -- confirm intent.
plan = cufft.HealpixCuFFTPlan(2048, 8)
repeats = 1
print 'plan ctored'

# Wall-clock the full plan execution under the CUDA profiler.
with cuda_profile() as prof:
    t0 = time()
    for i in range(repeats):
        plan.execute(map_gpu, buf_gpu)
    dt = time() - t0
print dt / repeats  # mean wall time per execution
print 'benchmark done'
#print prof.kernels

# Sum per-kernel GPU times from the profiler for comparison with wall time.
dt = 0
for kernel, stats in prof.kernels.iteritems():
    dt += sum(stats['times'])
print dt
print prof.kernels
# Copy the transformed coefficients back to the page-locked host buffer.
drv.memcpy_dtoh(buf, buf_gpu)
    # NOTE(review): these first three statements continue a loop whose header
    # is above this chunk (presumably iterating per-transform CUDA streams,
    # binding stream/q_gpu/a_gpu/q_slice/a_slice) -- confirm in the full file.
    # Queue the whole pipeline asynchronously on this stream:
    # upload input, run the transform, download the result.
    cuda.memcpy_htod_async(q_gpu, q_slice, stream=stream)
    plan.execute_transpose_legendre(q_gpu, a_gpu, stream=stream)
    cuda.memcpy_dtoh_async(a_slice, a_gpu, stream=stream)

print 'Wall-time taken to set up instruction streams ("Python overhead"): %e' % (time() - t0)

# Block until every stream has drained, then report end-to-end timing.
for stream, q_gpu, a_gpu in stream_objects:
    stream.synchronize()
dt = time() - t0
print 'Wall-time taken to end of execution: %f total, %f per transform' % (dt, dt / ntransforms)
print 'Host-to-host compute rate: %f GFLOP/s' % (ntransforms * plan.get_flops() / dt / 1e9)

# Profiled, synchronous run
print
print '== Profiled run at nside=%d' % nside
with cuda_profile() as prof:
    for rep in range(3):
        # Reuse the first stream's device buffers; transfers here are
        # synchronous so the profiler attributes time per operation.
        stream, q_gpu, a_gpu = stream_objects[0]
        cuda.memcpy_htod(q_gpu, q[0, ...])
        plan.execute_transpose_legendre(q_gpu, a_gpu)
        cuda.memcpy_dtoh(a[0, ...], a_gpu)
# NOTE(review): for the memcpy rows `nflops` is actually a byte count --
# presumably so prof.format prints GB/s through the same code path; confirm.
print 'Transfer in: ', prof.format('memcpyHtoD', nflops=q.nbytes // ntransforms)
print 'Compute: ', prof.format('all_transpose_legendre_transforms', nflops=plan.get_flops(), nwarps=2)
print 'Transfer out: ', prof.format('memcpyDtoH', nflops=a.nbytes // ntransforms)

# Check result
print
print '== Accuracy table (m, odd, relative error)'