class GPUPhotons(object):
    """Photon state stored in GPU arrays, with kernels to copy, select and
    propagate it."""

    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        total = nphotons * ncopies

        self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=total, dtype=np.float32)
        self.t = ga.empty(shape=total, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
        self.flags = ga.empty(shape=total, dtype=np.uint32)
        self.weights = ga.empty(shape=total, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        """Download every photon array and return them as an
        ``event.Photons`` object."""
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights)

    def get_hits(self, gpu_detector, target_flag=(0x1 << 2),
                 nthreads_per_block=64, max_blocks=1024,
                 start_photon=None, nphotons=None):
        """Return a dict mapping channel id to an ``event.Photons`` object
        holding the photons that have ``target_flag`` set in their history
        word and were detected by that channel.

        Args:
            - gpu_detector: object providing ``solid_id_map`` and
                ``detector_gpu``, which are passed to the hit kernels.
            - target_flag: history bit(s) handed to the kernels.
            - nthreads_per_block, max_blocks: kernel launch geometry.
            - start_photon: first photon index to examine (default 0).
            - nphotons: number of photons to examine (default: everything
                from ``start_photon`` to the end of the arrays).
        """
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()

        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                self.flags,
                gpu_detector.solid_id_map,
                self.last_hit_triangles,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            # The counter is reused by the copy kernel, so restart it at zero.
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, channels,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
            # The copy pass must find exactly as many hits as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        channels = channels.get()

        # Group the flat hit arrays by channel id.
        hitmap = {}
        for chan in np.unique(channels):
            mask = (channels == chan).astype(bool)
            hitmap[chan] = event.Photons(pos[mask], dir[mask], pol[mask],
                                         wavelengths[mask], t[mask],
                                         last_hit_triangles[mask],
                                         flags[mask], weights[mask])
        return hitmap

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        # range() works on both Python 2 and 3; ncopies is small.
        for i in range(self.ncopies):
            window = slice(self.true_nphotons * i, self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        Args:
            - gpu_geometry: object whose ``gpudata`` is passed to the kernel.
            - rng_states: random number generator states for the kernel.
            - max_steps: upper bound on the number of propagation steps.
            - use_weights: passed to the kernel; when true all remaining
                steps are requested in a single launch.
            - scatter_first: passed to the kernel on the first pass only;
                later passes always receive 0.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0

        # Slot 0 of each queue is a header; photon ids start at slot 1.
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for icopy in range(self.ncopies):
            input_queue[1 + icopy::self.ncopies] = (
                np.arange(self.true_nphotons, dtype=np.uint32)
                + icopy * self.true_nphotons)
        input_queue_gpu = ga.to_gpu(input_queue)

        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            if nphotons < nthreads_per_block * 16 * 8 or use_weights:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    input_queue_gpu[1:], output_queue_gpu, rng_states,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    np.int32(nsteps), np.int32(use_weights),
                    np.int32(scatter_first), gpu_geometry.gpudata,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                input_queue_gpu, output_queue_gpu = \
                    output_queue_gpu, input_queue_gpu
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different
                # strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                # The header started at 1, so subtract it to get the number
                # of queued photons. Cast to int so later arithmetic is not
                # done in unsigned 32-bit.
                nphotons = int(input_queue_gpu[:1].get()[0]) - 1

        if ga.max(self.flags).get() & (1 << 31):
            # sys.stderr.write behaves identically on Python 2 and 3.
            sys.stderr.write("WARNING: ABORTED PHOTONS\n")
        cuda.Context.get_current().synchronize()

    @profile_if_possible
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        """Return a GPUPhotonsSlice containing only photons that have a
        particular bit set in their history word.

        Args:
            - target_flag: history bit(s) handed to the kernels.
            - nthreads_per_block, max_blocks: kernel launch geometry.
            - start_photon: first photon index to examine (default 0).
            - nphotons: number of photons to examine (default: everything
                from ``start_photon`` to the end of the arrays).
        """
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()

        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                index_counter_gpu, self.flags,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            # The counter is reused by the copy kernel, so restart it at zero.
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
            # The copy pass must find exactly as many photons as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights)

    def __del__(self):
        """Drop the references to all GPU arrays and force a garbage
        collection pass."""
        # pop() instead of ``del self.x`` so a partially constructed object
        # (``__init__`` raised before every array existed) does not raise
        # here. ``weights`` is included so it is released with the rest.
        for name in ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                     'last_hit_triangles', 'weights'):
            self.__dict__.pop(name, None)
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        """Return the total number of photon slots (including copies)."""
        return self.pos.size
class GPUPhotons(object):
    """Photon state stored in GPU arrays, with kernels to copy, select and
    propagate it."""

    def __init__(self, photons, ncopies=1, copy_flags=True,
                 copy_triangles=True, copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - copy_flags: bool, *optional*
                   If False, ``photons.flags`` is not uploaded and the
                   GPU flags are initialised to 0 instead.
               - copy_triangles: bool, *optional*
                   If False, ``photons.last_hit_triangles`` is not uploaded
                   and the GPU array is filled with -1 instead.
               - copy_weights: bool, *optional*
                   If False, ``photons.weights`` is not uploaded and the
                   GPU weights are initialised to 1 instead.
        """
        nphotons = len(photons)
        total = nphotons * ncopies

        self.pos = ga.empty(shape=total, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=total, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=total, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=total, dtype=np.float32)
        self.t = ga.empty(shape=total, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=total, dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=total, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=total, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=total, dtype=np.float32)
        # evidx must be as long as every other per-photon array: it is handed
        # to photon_duplicate together with them and sliced per copy in
        # iterate_copies().
        self.evidx = ga.empty(shape=total, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        """Download every photon array and return them as an
        ``event.Photons`` object."""
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        evidx = self.evidx.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights, evidx)

    def get_hits(self, *args, **kwargs):
        """Return a dict mapping channel id (as a Python int) to the subset
        of hit photons detected by that channel.

        All arguments are forwarded unchanged to ``get_flat_hits``.
        """
        flat_hits = self.get_flat_hits(*args, **kwargs)
        hitmap = {}
        for chan in np.unique(flat_hits.channel):
            mask = (flat_hits.channel == chan).astype(bool)
            hitmap[int(chan)] = flat_hits[mask]
        return hitmap

    def get_flat_hits(self, gpu_detector, target_flag=(0x1 << 2),
                      nthreads_per_block=64, max_blocks=1024,
                      start_photon=None, nphotons=None, no_map=False):
        """Return a single ``event.Photons`` object (including channel ids)
        holding the photons that have ``target_flag`` set in their history
        word and were detected by a channel.

        Args:
            - gpu_detector: object providing ``solid_id_map`` and
                ``detector_gpu``, which are passed to the hit kernels.
            - target_flag: history bit(s) handed to the kernels.
            - nthreads_per_block, max_blocks: kernel launch geometry.
            - start_photon: first photon index to examine (default 0).
            - nphotons: number of photons to examine (default: everything
                from ``start_photon`` to the end of the arrays).
            - no_map: accepted for interface compatibility; not used by
                this method.
        """
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()

        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                self.flags,
                gpu_detector.solid_id_map,
                self.last_hit_triangles,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            # The counter is reused by the copy kernel, so restart it at zero.
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx, channels,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
            # The copy pass must find exactly as many hits as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        evidx = evidx.get()
        channels = channels.get()
        return event.Photons(pos, dir, pol, wavelengths, t,
                             last_hit_triangles, flags, weights, evidx,
                             channels)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in range(self.ncopies):
            window = slice(self.true_nphotons * i, self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window],
                evidx=self.evidx[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0, track=False):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        Args:
            - gpu_geometry: object whose ``gpudata`` is passed to the kernel.
            - rng_states: random number generator states for the kernel.
            - max_steps: upper bound on the number of propagation steps.
            - use_weights: passed to the kernel; when true (and not
                tracking) all remaining steps are requested in one launch.
            - scatter_first: passed to the kernel on the first pass only;
                later passes always receive 0.
            - track: when true, advance one step per launch and record the
                queued photon ids and a host copy of their state before the
                first launch and after every launch.

        Returns:
            ``(step_photon_ids, step_photons)`` lists when ``track`` is
            true, otherwise None.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0

        # Slot 0 of each queue is a header; photon ids start at slot 1.
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for icopy in range(self.ncopies):
            input_queue[1 + icopy::self.ncopies] = (
                np.arange(self.true_nphotons, dtype=np.uint32)
                + icopy * self.true_nphotons)
        input_queue_gpu = ga.to_gpu(input_queue)

        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        if track:
            step_photon_ids = []
            step_photons = []
            # save the first step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            # and not tracking
            finish_now = nphotons < nthreads_per_block * 16 * 8 or use_weights
            if not track and finish_now:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(
                    np.int32(first_photon), np.int32(photons_this_round),
                    input_queue_gpu[1:], output_queue_gpu, rng_states,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    np.int32(nsteps), np.int32(use_weights),
                    np.int32(scatter_first), gpu_geometry.gpudata,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

            if track:
                # save the next step for all photons in the input queue
                step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
                step_photons.append(
                    self.copy_queue(input_queue_gpu[1:], nphotons).get())

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                input_queue_gpu, output_queue_gpu = \
                    output_queue_gpu, input_queue_gpu
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different
                # strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                # The header started at 1, so subtract it to get the number
                # of queued photons. Cast to int so the comparison below and
                # the allocations in copy_queue() get a plain integer.
                nphotons = int(input_queue_gpu[:1].get()[0]) - 1
                if nphotons == 0:
                    break

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()

        if track:
            return step_photon_ids, step_photons

    @profile_if_possible
    def copy_queue(self, queue_gpu, nphotons, nthreads_per_block=64,
                   max_blocks=1024, start_photon=0):
        """Return a GPUPhotonsSlice of ``nphotons`` photons filled by the
        ``copy_photon_queue`` kernel from the ids in ``queue_gpu``.

        Args:
            - queue_gpu: GPU array of photon ids handed to the kernel.
            - nphotons: number of queue entries to copy.
            - nthreads_per_block, max_blocks: kernel launch geometry.
            - start_photon: offset added to each chunk's first index.
        """
        # Allocate new storage space
        pos = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=nphotons, dtype=np.float32)
        t = ga.empty(shape=nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=nphotons, dtype=np.int32)
        flags = ga.empty(shape=nphotons, dtype=np.uint32)
        weights = ga.empty(shape=nphotons, dtype=np.float32)
        evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if nphotons > 0:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_queue(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    queue_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))

        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    @profile_if_possible
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        """Return a GPUPhotonsSlice containing only photons that have a
        particular bit set in their history word.

        Args:
            - target_flag: history bit(s) handed to the kernels.
            - nthreads_per_block, max_blocks: kernel launch geometry.
            - start_photon: first photon index to examine (default 0).
            - nphotons: number of photons to examine (default: everything
                from ``start_photon`` to the end of the arrays).
        """
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()

        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                index_counter_gpu, self.flags,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            # The counter is reused by the copy kernel, so restart it at zero.
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    index_counter_gpu,
                    self.pos, self.dir, self.wavelengths, self.pol, self.t,
                    self.flags, self.last_hit_triangles, self.weights,
                    self.evidx,
                    pos, dir, wavelengths, pol, t, flags,
                    last_hit_triangles, weights, evidx,
                    block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
            # The copy pass must find exactly as many photons as the count pass.
            assert index_counter_gpu.get()[0] == reduced_nphotons

        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    def __del__(self):
        """Drop the references to all GPU arrays and force a garbage
        collection pass."""
        # pop() instead of ``del self.x`` so a partially constructed object
        # (``__init__`` raised before every array existed) does not raise
        # here. ``weights`` and ``evidx`` are included so they are released
        # with the rest.
        for name in ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                     'last_hit_triangles', 'weights', 'evidx'):
            self.__dict__.pop(name, None)
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        """Return the total number of photon slots (including copies)."""
        return self.pos.size
class GPUPhotonsHit(object): def __init__(self, photons, ncopies=1, max_time=4.): """Load ``photons`` onto the GPU, replicating as requested. Args: - photons: chroma.Event.Photons Photon state information to load onto GPU - ncopies: int, *optional* Number of times to replicate the photons on the GPU. This is used if you want to propagate the same event many times, for example in a likelihood calculation. The amount of GPU storage will be proportionally larger if ncopies > 1, so be careful. """ module = get_cu_module('propagate_hit.cu', options=cuda_options) propagate_hit_kernel = module.get_function('propagate_hit') propagate_hit_kernel.prepare('iiPPPPPPPPPPPiiiPPP') self.propagate_hit_kernel = propagate_hit_kernel self.gpu_funcs = GPUFuncs(module) self.max_time = max_time self.ncopies = ncopies self.true_nphotons = len(photons) self.marshall_photons(photons, ncopies) def marshall_photons_npl(self, npl): pass def marshall_photons(self, photons, ncopies): """ Assign the provided photons to the beginning (possibly the entire array if ncopies is 1 """ nphotons = len(photons) self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3) self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty(shape=nphotons * ncopies, dtype=np.int32) self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32) self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32) self.pos[:nphotons].set(to_float3(photons.pos)) self.dir[:nphotons].set(to_float3(photons.dir)) self.pol[:nphotons].set(to_float3(photons.pol)) self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32)) self.t[:nphotons].set(photons.t.astype(np.float32)) self.last_hit_triangles[:nphotons].set( photons.last_hit_triangles.astype(np.int32)) 
self.flags[:nphotons].set(photons.flags.astype(np.uint32)) self.weights[:nphotons].set(photons.weights.astype(np.float32)) # Replicate the photons to the rest of the slots if needed if ncopies > 1: max_blocks = 1024 nthreads_per_block = 64 block = (nthreads_per_block, 1, 1) for first_photon, photons_this_round, blocks in chunk_iterator( nphotons, nthreads_per_block, max_blocks): pass grid = (blocks, 1) args = ( np.int32(first_photon), np.int32(photons_this_round), self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(ncopies - 1), np.int32(nphotons), ) self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid) pass pass def get(self, npl=0, hit=0): log.info("get npl:%d hit:%d " % (npl, hit)) pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3)) dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3)) pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3)) wavelengths = self.wavelengths.get() t = self.t.get() last_hit_triangles = self.last_hit_triangles.get() flags = self.flags.get() weights = self.weights.get() if npl: nall = len(pos) a = np.zeros((nall, 4, 4), dtype=np.float32) a[:, 0, :3] = pos a[:, 0, 3] = t a[:, 1, :3] = dir a[:, 1, 3] = wavelengths a[:, 2, :3] = pol a[:, 2, 3] = weights assert len(last_hit_triangles) == len(flags) pmtid = np.zeros(nall, dtype=np.int32) # a kludge setting of pmtid into lht using the map argument of propagate_hit.cu SURFACE_DETECT = 0x1 << 2 detected = np.where(flags & SURFACE_DETECT) pmtid[detected] = last_hit_triangles[ detected] # sparsely populate, leaving zeros for undetected a[:, 3, 0] = np.arange(nall, dtype=np.int32).view(a.dtype) # photon_id a[:, 3, 1] = 0 # used in comparison againt vbo prop a[:, 3, 2] = flags.view(a.dtype) # history flags a[:, 3, 3] = pmtid.view(a.dtype) # channel_id ie PmtId if hit: return a[pmtid > 0].view(NPY) else: return a.view(NPY) pass else: # the old way return event.Photons(pos, dir, pol, 
wavelengths, t, last_hit_triangles, flags, weights) def iterate_copies(self): '''Returns an iterator that yields GPUPhotonsSlice objects corresponding to the event copies stored in ``self``.''' for i in xrange(self.ncopies): window = slice(self.true_nphotons * i, self.true_nphotons * (i + 1)) yield GPUPhotonsSlice( pos=self.pos[window], dir=self.dir[window], pol=self.pol[window], wavelengths=self.wavelengths[window], t=self.t[window], last_hit_triangles=self.last_hit_triangles[window], flags=self.flags[window], weights=self.weights[window]) def upload_queues(self, nwork): """ # Order photons initially in the queue to put the clones next to each other #. input_queue starts as [0,0,1,2,3,.....,nwork] #. output_queue starts as [1,0,0,0,0,....] """ input_queue = np.empty(shape=nwork + 1, dtype=np.uint32) input_queue[0] = 0 for copy in xrange(self.ncopies): input_queue[1 + copy::self.ncopies] = np.arange( self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons output_queue = np.zeros(shape=nwork + 1, dtype=np.uint32) output_queue[0] = 1 self.input_queue_gpu = ga.to_gpu(input_queue) self.output_queue_gpu = ga.to_gpu(output_queue) def swap_queues(self): """ Swaps queues and returns photons remaining to propagate #. output_queue[0] = 1 initially, this avoids enqueued photon_id stomping on output_queue[0] as atomicAdd returns the non-incremented:: 230 // Not done, put photon in output queue 231 if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB | NAN_ABORT)) == 0) 232 { // pulling queue ticket 233 int out_idx = atomicAdd(output_queue, 1); // atomic add 1 to slot zero value, returns non-incremented original value 234 output_queue[out_idx] = photon_id; 235 } #. At kernel tail non-completed photon threads enqueue their photon_id into a slot in the output_queue. The slot to use is obtained by atomic incrementing output_queue[0], ensuring orderly queue. #. 
after kernel completes output_queue[0] contains the number of photon_id enqued in output_queue[1:] """ temp = self.input_queue_gpu self.input_queue_gpu = self.output_queue_gpu self.output_queue_gpu = temp self.output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32)) slot0minus1 = self.input_queue_gpu[:1].get( )[0] - 1 # which was just now the output_queue before swap log.debug("swap_queues slot0minus1 %s " % slot0minus1) return slot0minus1 @profile_if_possible def propagate_hit(self, gpu_geometry, rng_states, parameters): """Propagate photons on GPU to termination or max_steps, whichever comes first. May be called repeatedly without reloading photon information if single-stepping through photon history. ..warning:: `rng_states` must have at least `nthreads_per_block`*`max_blocks` number of curandStates. got one abort:: In [1]: a = ph("hhMOCK") In [9]: f = a[:,3,2].view(np.uint32) In [12]: np.where( f & 1<<31 ) Out[12]: (array([279]),) failed to just mock that one:: RANGE=279:280 MockNuWa MOCK """ nphotons = self.pos.size nwork = nphotons nthreads_per_block = parameters['threads_per_block'] max_blocks = parameters['max_blocks'] max_steps = parameters['max_steps'] use_weights = False scatter_first = 0 self.upload_queues(nwork) solid_id_map_gpu = gpu_geometry.solid_id_map solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu small_remainder = nthreads_per_block * 16 * 8 block = (nthreads_per_block, 1, 1) results = {} results['name'] = "propagate_hit" results['nphotons'] = nphotons results['nwork'] = nwork results['nsmall'] = small_remainder results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i" step = 0 times = [] npass = 0 nabort = 0 while step < max_steps: npass += 1 if nwork < small_remainder or use_weights: nsteps = max_steps - step # Just finish the rest of the steps if the # of photons is low log.debug( "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s " % (small_remainder, nwork, nsteps, max_steps)) else: 
nsteps = 1 pass log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " % (nphotons, nwork, step, max_steps, nsteps)) abort = False for first_photon, photons_this_round, blocks in chunk_iterator( nwork, nthreads_per_block, max_blocks): if abort: nabort += 1 else: grid = (blocks, 1) args = ( np.int32(first_photon), np.int32(photons_this_round), self.input_queue_gpu[1:].gpudata, self.output_queue_gpu.gpudata, rng_states, self.pos.gpudata, self.dir.gpudata, self.wavelengths.gpudata, self.pol.gpudata, self.t.gpudata, self.flags.gpudata, self.last_hit_triangles.gpudata, self.weights.gpudata, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, solid_id_map_gpu.gpudata, solid_id_to_channel_id_gpu.gpudata, ) log.info( "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s " % (repr(grid), repr(block), first_photon, photons_this_round)) get_time = self.propagate_hit_kernel.prepared_timed_call( grid, block, *args) t = get_time() times.append(t) if t > self.max_time: abort = True log.warn( "kernel launch time %s > max_time %s : ABORTING " % (t, self.max_time)) pass pass pass log.info("step %s propagate_hit_kernel times %s " % (step, repr(times))) pass step += nsteps scatter_first = 0 # Only allow non-zero in first pass if step < max_steps: nwork = self.swap_queues() pass pass log.info("calling max ") if ga.max(self.flags).get() & (1 << 31): log.warn("ABORTED PHOTONS") log.info("done calling max ") cuda.Context.get_current().synchronize() results['npass'] = npass results['nabort'] = nabort results['nlaunch'] = len(times) results['tottime'] = sum(times) results['maxtime'] = max(times) results['mintime'] = min(times) results[ 'COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f" return results @profile_if_possible def select(self, target_flag, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None): '''Return a new GPUPhoton object containing only photons 
that have a particular bit set in their history word.''' cuda.Context.get_current().synchronize() index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32) cuda.Context.get_current().synchronize() if start_photon is None: start_photon = 0 if nphotons is None: nphotons = self.pos.size - start_photon # First count how much space we need for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.count_photons(np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), index_counter_gpu, self.flags, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) cuda.Context.get_current().synchronize() reduced_nphotons = int(index_counter_gpu.get()[0]) # Then allocate new storage space pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3) wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32) t = ga.empty(shape=reduced_nphotons, dtype=np.float32) last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32) flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32) weights = ga.empty(shape=reduced_nphotons, dtype=np.float32) # And finaly copy photons, if there are any if reduced_nphotons > 0: index_counter_gpu.fill(0) for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): self.gpu_funcs.copy_photons(np.int32(start_photon + first_photon), np.int32(photons_this_round), np.uint32(target_flag), index_counter_gpu, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) assert index_counter_gpu.get()[0] == reduced_nphotons return GPUPhotonsHitSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights) def __del__(self): 
del self.pos del self.dir del self.pol del self.wavelengths del self.t del self.flags del self.last_hit_triangles # Free up GPU memory quickly if now available gc.collect() def __len__(self): return self.pos.size