def profileNCSC(n_reps):
    """
    The C reference version for comparison.

    Solves n_reps randomly generated sub-regions one after another with
    ncsC.NCSCSubRegion and prints the elapsed wall-clock time. The random
    generator is seeded so that every run uses the same input.

    n_reps - Number of sub-regions to solve.

    Uses the module level values n_pts (sub-region size) and alpha.
    """
    numpy.random.seed(1)

    # One image per repetition, but a single gamma map shared by all of them.
    data = numpy.random.uniform(
        low=10.0, high=20.0,
        size=(n_reps, n_pts, n_pts)).astype(dtype=numpy.float32)
    gamma = numpy.random.uniform(
        low=2.0, high=4.0,
        size=(n_pts, n_pts)).astype(dtype=numpy.float32)

    otf_mask = numpy.fft.fftshift(pyRef.createOTFMask().reshape(16, 16))

    ref_u = numpy.zeros_like(data)
    ncs_sr = ncsC.NCSCSubRegion(r_size=n_pts)
    try:
        # Only the solver loop is timed; setup and cleanup are excluded.
        start_time = time.time()
        for i in range(n_reps):
            ncs_sr.newRegion(data[i, :, :], gamma)
            ncs_sr.setOTFMask(otf_mask)
            ref_u[i, :, :] = ncs_sr.cSolve(alpha, verbose=False)
        e_time = time.time() - start_time
    finally:
        # Release the solver even if one of the solves raised.
        ncs_sr.cleanup()

    print("CNSC {0:.6f} seconds".format(e_time))
def test_ncs_noise_reduction_2():
    """
    Compare OpenCL noise reduction of 10 sub-regions to the NCSC solver.

    Every sub-region gets its own random image and gamma map. For each
    sub-region the largest absolute difference between the OpenCL result
    and the ncsC.NCSCSubRegion result, divided by the largest reference
    value, must be below 1.0e-2.

    Uses the module level values n_pts and alpha, and the module level
    OpenCL context, queue and program.
    """
    # Setup
    numpy.random.seed(1)
    n_reps = 10
    stack_shape = (n_reps, n_pts, n_pts)

    data = numpy.random.uniform(
        low=10.0, high=20.0, size=stack_shape).astype(dtype=numpy.float32)
    gamma = numpy.random.uniform(
        low=2.0, high=4.0, size=stack_shape).astype(dtype=numpy.float32)
    otf_mask_shift = pyRef.createOTFMask()

    # OpenCL Setup.
    u = numpy.zeros(stack_shape, dtype=numpy.float32)
    iters = numpy.zeros(n_reps, dtype=numpy.int32)
    status = numpy.zeros(n_reps, dtype=numpy.int32)

    read_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    write_flags = cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR

    data_buffer = cl.Buffer(context, read_flags, hostbuf=data)
    gamma_buffer = cl.Buffer(context, read_flags, hostbuf=gamma)
    otf_mask_buffer = cl.Buffer(context, read_flags, hostbuf=otf_mask_shift)
    u_buffer = cl.Buffer(context, write_flags, hostbuf=u)
    iters_buffer = cl.Buffer(context, write_flags, hostbuf=iters)
    status_buffer = cl.Buffer(context, write_flags, hostbuf=status)

    # OpenCL noise reduction, launched as n_reps work-groups of 16
    # work-items (presumably one work-group per sub-region).
    program.ncsReduceNoise(queue, (n_reps * 16,), (16,),
                           data_buffer, gamma_buffer, otf_mask_buffer,
                           u_buffer, iters_buffer, status_buffer,
                           numpy.float32(alpha))
    cl.enqueue_copy(queue, u, u_buffer).wait()
    cl.enqueue_copy(queue, iters, iters_buffer).wait()
    cl.enqueue_copy(queue, status, status_buffer).wait()
    queue.finish()

    # NCSC noise reduction.
    otf_mask = numpy.fft.fftshift(otf_mask_shift.reshape(16, 16))
    ref_u = numpy.zeros_like(data)
    ncs_sr = ncsC.NCSCSubRegion(r_size=n_pts)
    try:
        for i in range(n_reps):
            ncs_sr.newRegion(data[i, :, :], gamma[i, :, :])
            ncs_sr.setOTFMask(otf_mask)
            ref_u[i, :, :] = ncs_sr.cSolve(alpha, verbose=False)
    finally:
        # Release the solver even if one of the solves raised.
        ncs_sr.cleanup()

    # Per sub-region comparison so a failure reports which one was off.
    for i in range(n_reps):
        norm_diff = (numpy.max(numpy.abs(u[i, :, :] - ref_u[i, :, :]))
                     / numpy.max(ref_u[i, :, :]))
        assert (norm_diff < 1.0e-2), "failed {0:d} {1:.3f}".format(i, norm_diff)
def profile(n_reps):
    """
    Report how long it takes to reduce the noise in X sub-regions.

    Runs the OpenCL ncsReduceNoise kernel on n_reps seeded random
    sub-regions and prints the kernel execution time taken from the
    OpenCL event profiling information.

    n_reps - Number of sub-regions to process.

    Uses the module level values n_pts and alpha, and the module level
    OpenCL context, queue and program.
    """
    # Setup
    numpy.random.seed(1)
    region_shape = (n_pts, n_pts)
    stack_shape = (n_reps,) + region_shape

    data = numpy.random.uniform(10.0, 20.0, stack_shape).astype(numpy.float32)
    # NOTE(review): gamma holds a single region while data holds n_reps
    # regions; confirm the kernel does not index gamma per sub-region.
    gamma = numpy.random.uniform(2.0, 4.0, region_shape).astype(numpy.float32)
    otf_mask_shift = pyRef.createOTFMask()

    # OpenCL Setup.
    u = numpy.zeros(stack_shape, dtype=numpy.float32)
    iters = numpy.zeros(n_reps, dtype=numpy.int32)
    status = numpy.zeros(n_reps, dtype=numpy.int32)

    mem = cl.mem_flags
    in_flags = mem.READ_ONLY | mem.COPY_HOST_PTR
    out_flags = mem.WRITE_ONLY | mem.COPY_HOST_PTR

    data_buffer = cl.Buffer(context, in_flags, hostbuf=data)
    gamma_buffer = cl.Buffer(context, in_flags, hostbuf=gamma)
    otf_mask_buffer = cl.Buffer(context, in_flags, hostbuf=otf_mask_shift)
    u_buffer = cl.Buffer(context, out_flags, hostbuf=u)
    iters_buffer = cl.Buffer(context, out_flags, hostbuf=iters)
    status_buffer = cl.Buffer(context, out_flags, hostbuf=status)

    # n_reps work-groups containing a single work-item each.
    kernel_args = (data_buffer, gamma_buffer, otf_mask_buffer,
                   u_buffer, iters_buffer, status_buffer,
                   numpy.float32(alpha))
    ev1 = program.ncsReduceNoise(queue, (n_reps,), (1,), *kernel_args)

    # Copy the results back to the host, waiting for each transfer.
    for host_array, device_buffer in ((u, u_buffer),
                                      (iters, iters_buffer),
                                      (status, status_buffer)):
        cl.enqueue_copy(queue, host_array, device_buffer).wait()
    queue.finish()

    # The profiling counters are scaled by 1.0e-9 to report seconds.
    # NOTE(review): this needs a queue created with profiling enabled;
    # the queue is created outside this function.
    elapsed_ticks = ev1.profile.end - ev1.profile.start
    e_time = 1.0e-9 * elapsed_ticks
    print("OpenCL {0:.6f} seconds".format(e_time))
def test_calc_nc():
    """
    Check the OpenCL noise contribution calculation against two references.

    For each of 100 random 16 x 16 images the value computed by the
    calc_nc_test kernel must agree, to a relative tolerance of 1.0e-3,
    with (1) ncsC.NCSCSubRegion.calcNoiseContribution() and (2)
    pyRef.calcNoiseContribution() applied to the pyRef 16 x 16 FFT.

    Uses the module level OpenCL context, queue and program.
    """
    n_pts = 16

    read_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    write_flags = cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR

    for _ in range(100):

        # OpenCL
        u = numpy.random.uniform(
            low=1.0, high=10.0,
            size=(n_pts, n_pts)).astype(dtype=numpy.float32)
        otf_mask_shift = pyRef.createOTFMask()
        nc = numpy.zeros(1, dtype=numpy.float32)

        u_buffer = cl.Buffer(context, read_flags, hostbuf=u)
        otf_mask_buffer = cl.Buffer(context, read_flags,
                                    hostbuf=otf_mask_shift)
        nc_buffer = cl.Buffer(context, write_flags, hostbuf=nc)

        program.calc_nc_test(queue, (1,), (1,),
                             u_buffer, otf_mask_buffer, nc_buffer)
        cl.enqueue_copy(queue, nc, nc_buffer).wait()
        queue.finish()

        # Reference 1
        otf_mask = numpy.fft.fftshift(otf_mask_shift.reshape(16, 16))
        ncs_sr = ncsC.NCSCSubRegion(r_size=n_pts)
        try:
            ncs_sr.setOTFMask(otf_mask)
            ncs_sr.setU(u)
            ref1_nc = ncs_sr.calcNoiseContribution()
        finally:
            # Release the solver even if the reference calculation raised.
            ncs_sr.cleanup()

        norm_diff = abs(nc[0] - ref1_nc) / abs(ref1_nc)
        assert (norm_diff < 1.0e-3), "Difference in results! {0:.6f}".format(norm_diff)

        # Reference 2
        #
        # Real / imaginary parts are kept in separate flat arrays; the
        # imaginary input is all zeros.
        u_r = numpy.copy(u).flatten()
        u_c = numpy.zeros_like(u_r)
        u_fft_r = numpy.zeros_like(u_r)
        u_fft_c = numpy.zeros_like(u_c)
        otf_mask_sqr = (otf_mask_shift * otf_mask_shift).flatten()

        pyRef.fft_16x16(u_r, u_c, u_fft_r, u_fft_c)
        ref2_nc = pyRef.calcNoiseContribution(u_fft_r, u_fft_c, otf_mask_sqr)

        norm_diff = abs(nc[0] - ref2_nc) / abs(ref2_nc)
        assert (norm_diff < 1.0e-3), "Difference in results! {0:.6f}".format(norm_diff)
def test_calc_nc_grad_1():
    """
    Check the OpenCL noise contribution gradient against two references.

    For each of 10 random 16 x 16 images the gradient computed by the
    calc_nc_grad_test kernel is compared element by element with (1)
    ncsC.NCSCSubRegion.calcNCGradient() and (2)
    pyRef.calcNCGradientIFFT(). Differences are divided by the magnitude
    of the reference value, or by 1.0 where that magnitude is smaller,
    and the largest of them must be below 1.0e-5.

    Uses the module level OpenCL context, queue and program.
    """
    n_pts = 16

    read_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    write_flags = cl.mem_flags.WRITE_ONLY | cl.mem_flags.COPY_HOST_PTR

    for _ in range(10):

        # OpenCL gradient calculation.
        u = numpy.random.uniform(
            low=1.0, high=10.0,
            size=(n_pts, n_pts)).astype(dtype=numpy.float32)
        otf_mask_shift = pyRef.createOTFMask()
        grad = numpy.zeros((n_pts, n_pts)).astype(numpy.float32)

        u_buffer = cl.Buffer(context, read_flags, hostbuf=u)
        otf_mask_buffer = cl.Buffer(context, read_flags,
                                    hostbuf=otf_mask_shift)
        grad_buffer = cl.Buffer(context, write_flags, hostbuf=grad)

        # A single work-group of 16 work-items.
        program.calc_nc_grad_test(queue, (16,), (16,),
                                  u_buffer, otf_mask_buffer, grad_buffer)
        cl.enqueue_copy(queue, grad, grad_buffer).wait()
        queue.finish()

        # Reference 1
        otf_mask = numpy.fft.fftshift(otf_mask_shift.reshape(16, 16))
        ncs_sr = ncsC.NCSCSubRegion(r_size=n_pts)
        try:
            ncs_sr.setOTFMask(otf_mask)
            ncs_sr.setU(u)
            # Called for its effect on the solver state before asking
            # for the gradient; the returned value is not used here.
            ncs_sr.calcNoiseContribution()
            ref1_grad = ncs_sr.calcNCGradient().reshape(grad.shape)
        finally:
            # Release the solver even if the reference calculation raised.
            ncs_sr.cleanup()

        # Clamp the denominator at 1.0 so that small reference values
        # are compared by absolute rather than relative difference.
        ref_norm = numpy.maximum(numpy.abs(ref1_grad), 1.0)
        max_diff = numpy.max(numpy.abs(grad - ref1_grad) / ref_norm)
        assert (max_diff < 1.0e-5), "Difference in results! {0:.8f}".format(max_diff)

        # Reference 2
        #
        # Real / imaginary parts are kept in separate flat arrays; the
        # imaginary input is all zeros.
        u_r = numpy.copy(u).flatten()
        u_c = numpy.zeros_like(u_r)
        u_fft_r = numpy.zeros_like(u_r)
        u_fft_c = numpy.zeros_like(u_r)
        ref2_grad = numpy.zeros_like(u_r)
        otf_mask_sqr = otf_mask_shift * otf_mask_shift

        pyRef.fft_16x16(u_r, u_c, u_fft_r, u_fft_c)
        pyRef.calcNCGradientIFFT(u_fft_r, u_fft_c, otf_mask_sqr, ref2_grad)

        ref_norm = numpy.maximum(numpy.abs(ref2_grad), 1.0)
        max_diff = numpy.max(numpy.abs(grad.flatten() - ref2_grad) / ref_norm)
        assert (max_diff < 1.0e-5), "Difference in results! {0:.8f}".format(max_diff)
def test_ncs_noise_reduction_2():
    """
    Compare CUDA noise reduction of 10 sub-regions to the NCSC solver.

    Every sub-region gets its own random image and gamma map. For each
    sub-region the largest absolute difference between the CUDA result
    and the ncsC.NCSCSubRegion result, divided by the largest reference
    value, must be below 1.0e-2.

    Uses the module level values n_pts and alpha, and the module level
    ncsReduceNoise kernel function.
    """
    # Setup
    numpy.random.seed(1)
    n_reps = 10
    stack_shape = (n_reps, n_pts, n_pts)

    data = numpy.random.uniform(
        low=10.0, high=20.0, size=stack_shape).astype(dtype=numpy.float32)
    gamma = numpy.random.uniform(
        low=2.0, high=4.0, size=stack_shape).astype(dtype=numpy.float32)
    otf_mask_shift = pyRef.createOTFMask()

    # CUDA Setup.
    u = numpy.zeros(stack_shape, dtype=numpy.float32)
    iters = numpy.zeros(n_reps, dtype=numpy.int32)
    status = numpy.zeros(n_reps, dtype=numpy.int32)

    # CUDA noise reduction, launched as n_reps blocks of 16 threads
    # (presumably one block per sub-region).
    ncsReduceNoise(drv.In(data), drv.In(gamma), drv.In(otf_mask_shift),
                   drv.Out(u), drv.Out(iters), drv.Out(status),
                   numpy.float32(alpha),
                   block=(16, 1, 1), grid=(n_reps, 1))

    # NCSC noise reduction.
    otf_mask = numpy.fft.fftshift(otf_mask_shift.reshape(16, 16))
    ref_u = numpy.zeros_like(data)
    ncs_sr = ncsC.NCSCSubRegion(r_size=n_pts)
    try:
        for i in range(n_reps):
            ncs_sr.newRegion(data[i, :, :], gamma[i, :, :])
            ncs_sr.setOTFMask(otf_mask)
            ref_u[i, :, :] = ncs_sr.cSolve(alpha, verbose=False)
    finally:
        # Release the solver even if one of the solves raised.
        ncs_sr.cleanup()

    # Per sub-region comparison so a failure reports which one was off.
    for i in range(n_reps):
        norm_diff = (numpy.max(numpy.abs(u[i, :, :] - ref_u[i, :, :]))
                     / numpy.max(ref_u[i, :, :]))
        assert (norm_diff < 1.0e-2), "failed {0:d} {1:.3f}".format(i, norm_diff)
def test_ncs_noise_reduction_1():
    """
    Compare CUDA noise reduction of one sub-region to the Python reference.

    The largest absolute difference between the CUDA result and the
    pyRef.ncsReduceNoise() result, divided by the largest reference
    value, must be below 1.0e-2.

    Uses the module level values n_pts and alpha, and the module level
    ncsReduceNoise kernel function.
    """
    # Setup
    numpy.random.seed(1)
    region_shape = (n_pts, n_pts)
    data = numpy.random.uniform(10.0, 20.0, region_shape).astype(numpy.float32)
    gamma = numpy.random.uniform(2.0, 4.0, region_shape).astype(numpy.float32)
    otf_mask_shift = pyRef.createOTFMask()

    # CUDA Setup.
    u = numpy.zeros(region_shape, dtype=numpy.float32)
    iters = numpy.zeros(1, dtype=numpy.int32)
    status = numpy.zeros(1, dtype=numpy.int32)

    # CUDA noise reduction, a single block of 16 threads.
    kernel_inputs = [drv.In(data), drv.In(gamma), drv.In(otf_mask_shift)]
    kernel_outputs = [drv.Out(u), drv.Out(iters), drv.Out(status)]
    ncsReduceNoise(*kernel_inputs, *kernel_outputs, numpy.float32(alpha),
                   block=(16, 1, 1), grid=(1, 1))

    # Python reference version. The result arrays are handed to pyRef
    # as arguments and read back afterwards.
    ref_u = numpy.zeros(data.size)
    ref_iters = numpy.zeros_like(iters)
    ref_status = numpy.zeros_like(status)

    fft_grad_r, fft_grad_c = pyRef.createUFFTGrad()
    pyRef.ncsReduceNoise(fft_grad_r, fft_grad_c,
                         data, gamma, otf_mask_shift,
                         ref_u, ref_iters, ref_status,
                         numpy.float32(alpha))

    # The reference works on a flat array, restore the image shape.
    ref_u = ref_u.reshape(data.shape)

    largest_error = numpy.max(numpy.abs(u - ref_u))
    norm_diff = largest_error / numpy.max(ref_u)
    assert (norm_diff < 1.0e-2), str(norm_diff)
def test_ncs_noise_reduction_1():
    """
    Compare OpenCL noise reduction of one sub-region to the Python reference.

    The largest absolute difference between the OpenCL result and the
    pyRef.ncsReduceNoise() result, divided by the largest reference
    value, must be below 1.0e-2.

    Uses the module level values n_pts and alpha, and the module level
    OpenCL context, queue and program.
    """
    # Setup
    numpy.random.seed(1)
    region_shape = (n_pts, n_pts)
    data = numpy.random.uniform(10.0, 20.0, region_shape).astype(numpy.float32)
    gamma = numpy.random.uniform(2.0, 4.0, region_shape).astype(numpy.float32)
    otf_mask_shift = pyRef.createOTFMask()

    # OpenCL Setup.
    u = numpy.zeros(region_shape, dtype=numpy.float32)
    iters = numpy.zeros(1, dtype=numpy.int32)
    status = numpy.zeros(1, dtype=numpy.int32)

    mem = cl.mem_flags
    in_flags = mem.READ_ONLY | mem.COPY_HOST_PTR
    out_flags = mem.WRITE_ONLY | mem.COPY_HOST_PTR

    data_buffer = cl.Buffer(context, in_flags, hostbuf=data)
    gamma_buffer = cl.Buffer(context, in_flags, hostbuf=gamma)
    otf_mask_buffer = cl.Buffer(context, in_flags, hostbuf=otf_mask_shift)
    u_buffer = cl.Buffer(context, out_flags, hostbuf=u)
    iters_buffer = cl.Buffer(context, out_flags, hostbuf=iters)
    status_buffer = cl.Buffer(context, out_flags, hostbuf=status)

    # OpenCL noise reduction, a single work-group with one work-item.
    kernel_args = (data_buffer, gamma_buffer, otf_mask_buffer,
                   u_buffer, iters_buffer, status_buffer,
                   numpy.float32(alpha))
    program.ncsReduceNoise(queue, (1,), (1,), *kernel_args)

    # Copy the results back to the host, waiting for each transfer.
    for host_array, device_buffer in ((u, u_buffer),
                                      (iters, iters_buffer),
                                      (status, status_buffer)):
        cl.enqueue_copy(queue, host_array, device_buffer).wait()
    queue.finish()

    # Python reference version. The result arrays are handed to pyRef
    # as arguments and read back afterwards.
    ref_u = numpy.zeros(data.size)
    ref_iters = numpy.zeros_like(iters)
    ref_status = numpy.zeros_like(status)

    fft_grad_r, fft_grad_c = pyRef.createUFFTGrad()
    pyRef.ncsReduceNoise(fft_grad_r, fft_grad_c,
                         data, gamma, otf_mask_shift,
                         ref_u, ref_iters, ref_status,
                         numpy.float32(alpha))

    # The reference works on a flat array, restore the image shape.
    ref_u = ref_u.reshape(data.shape)

    largest_error = numpy.max(numpy.abs(u - ref_u))
    norm_diff = largest_error / numpy.max(ref_u)
    assert (norm_diff < 1.0e-2), str(norm_diff)