def _iter(self, rdr, gnm, gprof, dim, tc):
    """Clear the accumulation buffers and run the 'iter' kernel for one
    frame's worth of samples on stream_a, then flush the atomic staging
    buffer into the front accumulator.

    Args (assumed from usage; confirm against callers):
        rdr:    renderer object exposing a compiled CUDA module (rdr.mod).
        gnm:    genome; not used directly in this method.
        gprof:  profile; gprof.spp(tc) yields samples-per-pixel at time tc.
        dim:    dimensions with .w/.h (output) and .ah/.astride (accumulator).
        tc:     central time of the frame being rendered.
    """
    # Bind the flat palette texture so the iter kernel can sample it via
    # the 'flatpal' surface reference.
    tref = rdr.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    nbins = dim.ah * dim.astride
    # Small helper: fill a device buffer of `s` words with value `v`
    # (default 0) asynchronously on stream_a.
    fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_side, 2 * nbins)
    # Seed the point buffer with NaN — presumably the kernel treats NaN as
    # "no point yet, generate a fresh random one"; TODO confirm in kernel.
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    # Each grid row appears to account for 256*256 samples per temporal
    # sample; round up so at least nsamps samples are generated.
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    def launch_iter(n):
        # Launch `n` grid rows of the iter kernel; skip degenerate launches.
        if n == 0: return
        launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                self.fb.d_front, self.fb.d_side,
                self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                self.info_a.d_params)
    # Split the launch into multiple rounds, possibly (slightly) reducing
    # work overlap but avoiding stalls when working on a device with an
    # active X session. TODO: characterize performance impact, autodetect
    BLOCK_SIZE = 16
    for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
        launch_iter(BLOCK_SIZE)
    # Launch whatever remainder was not covered by the full-size batches.
    launch_iter(nrounds%BLOCK_SIZE)

    # Merge the atomic staging buffer (d_side) into the front accumulator.
    # The grid is the smallest square of 256-thread blocks covering nbins.
    nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
    launch('flush_atom', self.mod, self.stream_a,
            256, (nblocks, nblocks),
            u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
def _iter(self, rdr, gnm, gprof, dim, tc):
    """Run the 'iter' kernel for one frame's worth of samples, alternating
    between two streams so that iteration on one side can overlap the
    atomic flush of the other (double-buffered left/right accumulators).

    Args (assumed from usage; confirm against callers):
        rdr:    renderer exposing a compiled CUDA module (rdr.mod).
        gnm:    genome; not used directly in this method.
        gprof:  profile; gprof.spp(tc) yields samples-per-pixel at time tc.
        dim:    dimensions with .w/.h (output) and .ah/.astride (accumulator).
        tc:     central time of the frame being rendered.
    """
    # Bind the flat palette texture for the iter kernel.
    tref = rdr.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    nbins = dim.ah * dim.astride
    # Helper: zero (or otherwise fill) a device buffer async on stream_a.
    fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_left, 4 * nbins)
    fill(self.fb.d_right, 4 * nbins)
    # NaN-seed the point buffer; presumably the kernel treats NaN as
    # "unseeded, pick a fresh random point" — TODO confirm in kernel.
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
    fill(self.fb.d_uleft, nbins / 2)
    fill(self.fb.d_uright, nbins / 2)
    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    # Each grid row appears to cover 256*256 samples per temporal sample;
    # round up so at least nsamps samples are generated.
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    # Split the launch into multiple rounds, to prevent a system on older
    # GPUs from locking up and to give us a chance to flush some stuff.
    hidden_stream = cuda.Stream()
    iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
    block_size = 4
    while nrounds:
        n = min(nrounds, block_size)
        now = time.time()
        # NOTE: the launch always names d_left/d_uleft; fb.flip_side()
        # below presumably swaps which physical buffers those names alias,
        # so alternate rounds write to alternate buffers — TODO confirm.
        launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
               self.fb.d_front, self.fb.d_left,
               self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
               self.fb.d_uleft, self.info_a.d_params)
        delta = time.time() - now
        if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

        # Make sure the other stream is done flushing before we start
        iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))

        # Merge this side's atomic staging buffer into the front accumulator.
        launch('flush_atom', rdr.mod, iter_stream_left,
               (16, 16, 1), (dim.astride / 16, dim.ah / 16),
               u64(self.fb.d_front), u64(self.fb.d_left),
               u64(self.fb.d_uleft), i32(nbins))
        self.fb.flip_side()
        # Swap streams so the next round's iteration overlaps this flush.
        iter_stream_left, iter_stream_right = (
                iter_stream_right, iter_stream_left)
        nrounds -= n
        # Grow the batch size geometrically (x1.5) to amortize overhead
        # once the pipeline is warmed up.
        block_size += block_size / 2

    # Always wait on all events in the hidden stream before continuing on A
    self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
def _iter(self, rdr, gnm, gprof, dim, tc):
    """Run the 'iter' kernel for one frame's worth of samples, alternating
    between two streams so that iteration on one side can overlap the
    atomic flush of the other (double-buffered left/right accumulators).

    Args (assumed from usage; confirm against callers):
        rdr:    renderer exposing a compiled CUDA module (rdr.mod).
        gnm:    genome; not used directly in this method.
        gprof:  profile; gprof.spp(tc) yields samples-per-pixel at time tc.
        dim:    dimensions with .w/.h (output) and .ah/.astride (accumulator).
        tc:     central time of the frame being rendered.
    """
    # Bind the flat palette texture for the iter kernel.
    tref = rdr.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    nbins = dim.ah * dim.astride
    # Helper: zero (or otherwise fill) a device buffer async on stream_a.
    fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
    fill(self.fb.d_front, 4 * nbins)
    fill(self.fb.d_left, 4 * nbins)
    fill(self.fb.d_right, 4 * nbins)
    # NaN-seed the point buffer; presumably the kernel treats NaN as
    # "unseeded, pick a fresh random point" — TODO confirm in kernel.
    fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
    fill(self.fb.d_uleft, nbins / 2)
    fill(self.fb.d_uright, nbins / 2)
    nts = self.info_a.ntemporal_samples
    nsamps = (gprof.spp(tc) * dim.w * dim.h)
    # Each grid row appears to cover 256*256 samples per temporal sample;
    # round up so at least nsamps samples are generated.
    nrounds = int(nsamps / (nts * 256. * 256)) + 1

    # Split the launch into multiple rounds, to prevent a system on older
    # GPUs from locking up and to give us a chance to flush some stuff.
    hidden_stream = cuda.Stream()
    iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
    block_size = 4
    while nrounds:
        n = min(nrounds, block_size)
        now = time.time()
        # NOTE: the launch always names d_left/d_uleft; fb.flip_side()
        # below presumably swaps which physical buffers those names alias,
        # so alternate rounds write to alternate buffers — TODO confirm.
        launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
               self.fb.d_front, self.fb.d_left,
               self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
               self.fb.d_uleft, self.info_a.d_params)
        delta = time.time() - now
        if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

        # Make sure the other stream is done flushing before we start
        iter_stream_left.wait_for_event(
                cuda.Event().record(iter_stream_right))

        # Merge this side's atomic staging buffer into the front accumulator.
        launch('flush_atom', rdr.mod, iter_stream_left,
               (16, 16, 1), (dim.astride / 16, dim.ah / 16),
               u64(self.fb.d_front), u64(self.fb.d_left),
               u64(self.fb.d_uleft), i32(nbins))
        self.fb.flip_side()
        # Swap streams so the next round's iteration overlaps this flush.
        iter_stream_left, iter_stream_right = (
                iter_stream_right, iter_stream_left)
        nrounds -= n
        # Grow the batch size geometrically (x1.5) to amortize overhead
        # once the pipeline is warmed up.
        block_size += block_size / 2

    # Always wait on all events in the hidden stream before continuing on A
    self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))
def launchC(name, mod, stream, dim, fb, *args):
    """Launch kernel `name` from `mod` on `stream` with the standard
    per-pixel configuration: 32x8 thread blocks, one block per 32x8 tile
    of the dim.w x dim.h image (rounded up), with the framebuffer's
    back/front pointers, gutter, and dimensions prepended to `args`.
    """
    grid_w = int(np.ceil(dim.w / 32.0))
    grid_h = int(np.ceil(dim.h / 8.0))
    # Standard leading arguments shared by all per-pixel kernels.
    std_args = (fb.d_back, fb.d_front, i32(fb.gutter),
                i32(dim.w), i32(dim.astride), i32(dim.h))
    launch(name, mod, stream, (32, 8, 1), (grid_w, grid_h),
           *(std_args + args))
def launch_iter(n):
    """Launch `n` grid rows of the 'iter' kernel on stream_a.

    Closure over the enclosing scope: uses `rdr`, `self`, and `nts`
    from the surrounding method.
    """
    # A zero-row grid would be an invalid launch; skip it entirely.
    if n == 0:
        return
    launch(
        "iter",
        rdr.mod,
        self.stream_a,
        (32, 8, 1),
        (nts, n),
        self.fb.d_front,
        self.fb.d_side,
        self.fb.d_rb,
        self.fb.d_seeds,
        self.fb.d_points,
        self.info_a.d_params,
    )
def _interp(self, rdr, gnm, dim, ts, td):
    """Interpolate the palette and per-temporal-sample iteration parameters
    for the time window [ts, ts+td), asynchronously on stream_a.

    Args (assumed from usage; confirm against callers):
        rdr: renderer exposing a compiled module containing the 'acc_size'
             global and the 'interp_iter_params' kernel.
        gnm: genome; not used directly in this method.
        dim: accumulator dimension tuple, copied to the device 'acc_size'.
        ts:  start time of the interpolation window.
        td:  duration of the window.
    """
    # Copy the dimension values into the device-global 'acc_size'. The
    # staging buffer comes from a pool (presumably pinned memory, so the
    # async copy is safe — TODO confirm pool allocation type).
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim), ), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
    # Rebind the palette surface before the kernel rewrites it.
    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    # One 256-thread block per palette row; each row is handed its slice
    # of the time window (td / palette_height).
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, self.info_a.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), f32(td / self.info_a.palette_height))
    nts = self.info_a.ntemporal_samples
    # Interpolate one parameter set per temporal sample; the grid is the
    # number of 256-wide blocks needed to cover nts samples.
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, np.ceil(nts / 256.),
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
def _interp(self, rdr, gnm, dim, ts, td):
    """Interpolate the palette and per-temporal-sample iteration parameters
    for the time window [ts, ts+td), asynchronously on stream_a.

    Args (assumed from usage; confirm against callers):
        rdr: renderer exposing a compiled module containing the 'acc_size'
             global and the 'interp_iter_params' kernel.
        gnm: genome; not used directly in this method.
        dim: accumulator dimension tuple, copied to the device 'acc_size'.
        ts:  start time of the interpolation window.
        td:  duration of the window.
    """
    # Copy the dimension values into the device-global 'acc_size'. The
    # staging buffer comes from a pool (presumably pinned memory, so the
    # async copy is safe — TODO confirm pool allocation type).
    d_acc_size = rdr.mod.get_global('acc_size')[0]
    p_dim = self.fb.pool.allocate((len(dim),), u32)
    p_dim[:] = dim
    cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
    # Rebind the palette surface before the kernel rewrites it.
    tref = self.mod.get_surfref('flatpal')
    tref.set_array(self.info_a.d_pal_array, 0)
    # One 256-thread block per palette row; each row is handed its slice
    # of the time window (td / palette_height).
    launch('interp_palette_flat', self.mod, self.stream_a,
           256, self.info_a.palette_height,
           self.fb.d_rb, self.fb.d_seeds,
           self.src_a.d_ptimes, self.src_a.d_pals,
           f32(ts), f32(td / self.info_a.palette_height))
    nts = self.info_a.ntemporal_samples
    # Interpolate one parameter set per temporal sample; the grid is the
    # number of 256-wide blocks needed to cover nts samples.
    launch('interp_iter_params', rdr.mod, self.stream_a,
           256, np.ceil(nts / 256.),
           self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
           f32(ts), f32(td / nts), i32(nts))
def launchC(name, mod, stream, dim, fb, *args):
    """Invoke kernel `name` with the standard per-pixel launch shape.

    Blocks are 32x8 threads; the grid tiles the dim.w x dim.h image,
    rounding up so every pixel is covered. The framebuffer's back/front
    pointers, gutter width, and image dimensions are passed before any
    extra `args`.
    """
    block_shape = (32, 8, 1)
    grid_shape = (int(np.ceil(dim.w / 32.)),
                  int(np.ceil(dim.h / 8.)))
    launch(name, mod, stream, block_shape, grid_shape,
           fb.d_back, fb.d_front, i32(fb.gutter),
           i32(dim.w), i32(dim.astride), i32(dim.h), *args)
def launch_iter(n):
    """Launch `n` grid rows of the 'iter' kernel on stream_a.

    Closure over the enclosing scope: uses `rdr`, `self`, and `nts`
    from the surrounding method.
    """
    # A zero-row grid would be an invalid launch; skip it entirely.
    if n == 0: return
    launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
           self.fb.d_front, self.fb.d_side,
           self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
           self.info_a.d_params)