def propagate(gpu_detector, number=10, nphotons=500000, nthreads_per_block=64,
              max_blocks=1024):
    """Benchmark how many photons per second the GPU propagates.

    Every trial creates `nphotons` photons at the origin, draws their
    directions from `sample.uniform_sphere` (reordered with
    `tools.argsort_direction`), uploads them, and times a single
    `propagate` call followed by a context synchronize.

    Args:
        gpu_detector: Geometry object handed to `GPUPhotons.propagate`.
        number: Number of trials to run; the first trial is never recorded.
        nphotons: Number of photons generated per trial.
        nthreads_per_block: Forwarded to `propagate`; together with
            `max_blocks` it also sizes the RNG state array.
        max_blocks: Forwarded to `propagate`.

    Returns:
        `nphotons` divided by a `ufloat` built from the mean and standard
        deviation of the recorded wall-clock times (seconds).
    """
    rng_states = gpu.get_rng_states(nthreads_per_block * max_blocks)
    timings = []

    for trial in tools.progress(list(range(number))):
        origins = np.zeros((nphotons, 3))
        directions = sample.uniform_sphere(nphotons)
        ordering = tools.argsort_direction(directions)
        directions = directions[ordering]
        polarizations = normalize(
            np.cross(sample.uniform_sphere(nphotons), directions))
        wavelengths = np.random.uniform(400, 800, size=nphotons)
        host_photons = event.Photons(origins, directions, polarizations,
                                     wavelengths)
        gpu_photons = gpu.GPUPhotons(host_photons)

        started = time.time()
        gpu_photons.propagate(gpu_detector, rng_states, nthreads_per_block,
                              max_blocks)
        cuda.Context.get_current().synchronize()
        duration = time.time() - started

        if trial == 0:
            # first kernel call incurs some driver overhead
            continue
        timings.append(duration)

    return nphotons / ufloat((np.mean(timings), np.std(timings)))
def make_photon_with_arrays(size):
    '''Create a chroma.event.Photons holding `size` photons whose attribute
    arrays are allocated but not initialized.

    The pos, dir and pol arrays have shape (size, 3) and dtype float32;
    wavelengths and t are float32, flags is uint32 and last_hit_triangles
    is int32, each of length `size`. All arrays come from np.empty, so
    their contents are arbitrary until written.
    '''
    def vectors():
        # A fresh (size, 3) float32 array on every call.
        return np.empty((size, 3), dtype=np.float32)

    def scalars(dtype):
        # A fresh length-`size` array of the requested dtype.
        return np.empty(size, dtype=dtype)

    return event.Photons(pos=vectors(),
                         dir=vectors(),
                         pol=vectors(),
                         wavelengths=scalars(np.float32),
                         t=scalars(np.float32),
                         flags=scalars(np.uint32),
                         last_hit_triangles=scalars(np.int32))
def get(self):
    """Download every photon array and return them as an event.Photons.

    The pos, dir and pol arrays are fetched with `.get()`, viewed as
    float32 and reshaped to (N, 3), where N is the length of the
    corresponding array on `self`. The remaining arrays are returned
    exactly as `.get()` delivers them.
    """
    def rows_of_three(device_vectors):
        # Fetch, reinterpret as float32, and expose one row per photon.
        host = device_vectors.get()
        return host.view(np.float32).reshape((len(device_vectors), 3))

    position = rows_of_three(self.pos)
    direction = rows_of_three(self.dir)
    polarization = rows_of_three(self.pol)

    # Fetched in the same order the constructor takes them positionally.
    wavelengths = self.wavelengths.get()
    times = self.t.get()
    last_hit_triangles = self.last_hit_triangles.get()
    flags = self.flags.get()
    weights = self.weights.get()

    return event.Photons(position, direction, polarization, wavelengths,
                         times, last_hit_triangles, flags, weights)
def get(self):
    """Download every photon array and return them as an event.Photons.

    The pos, dir and pol arrays are fetched with `.get()`, viewed as
    float32 and reshaped to (N, ncols). `ncols` is 4 when
    `api.is_gpu_api_opencl()` is true (the vectors then include padding)
    and 3 otherwise. The padded column is not stripped here.
    """
    # must include padding
    ncols = 4 if api.is_gpu_api_opencl() else 3

    def as_rows(device_vectors):
        host = device_vectors.get()
        return host.view(np.float32).reshape((len(device_vectors), ncols))

    position = as_rows(self.pos)
    direction = as_rows(self.dir)
    polarization = as_rows(self.pol)

    wavelengths = self.wavelengths.get()
    times = self.t.get()
    last_hit_triangles = self.last_hit_triangles.get()
    flags = self.flags.get()
    weights = self.weights.get()

    return event.Photons(position, direction, polarization, wavelengths,
                         times, last_hit_triangles, flags, weights)
def get(self, npl=0, hit=0):
    """Download the photon arrays, either as event.Photons or as a packed array.

    Args:
        npl: When falsy, return an `event.Photons` (the old way). When
            truthy, return an (N, 4, 4) float32 array viewed as `NPY`.
        hit: Only consulted when `npl` is truthy. When truthy, keep only
            the rows whose pmtid entry is greater than zero.

    Packed layout for photon n (`a[n]`):
        row 0: pos xyz, t
        row 1: dir xyz, wavelength
        row 2: pol xyz, weight
        row 3: photon index, 0, flags, pmtid -- the integer values are
               stored through `.view(float32)`, not converted.
    """
    log.info("get npl:%d hit:%d " % (npl, hit))

    def xyz(device_vectors):
        host = device_vectors.get()
        return host.view(np.float32).reshape((len(device_vectors), 3))

    position = xyz(self.pos)
    direction = xyz(self.dir)
    polarization = xyz(self.pol)
    wavelengths = self.wavelengths.get()
    times = self.t.get()
    last_hit_triangles = self.last_hit_triangles.get()
    flags = self.flags.get()
    weights = self.weights.get()

    if not npl:
        # the old way
        return event.Photons(position, direction, polarization, wavelengths,
                             times, last_hit_triangles, flags, weights)

    count = len(position)
    packed = np.zeros((count, 4, 4), dtype=np.float32)

    # Rows 0..2 each hold a 3-vector plus one scalar in the last column.
    vector_rows = ((position, times),
                   (direction, wavelengths),
                   (polarization, weights))
    for row, (vec3, scalar) in enumerate(vector_rows):
        packed[:, row, :3] = vec3
        packed[:, row, 3] = scalar

    assert len(last_hit_triangles) == len(flags)

    pmtid = np.zeros(count, dtype=np.int32)

    # a kludge setting of pmtid into lht using the map argument of propagate_hit.cu
    SURFACE_DETECT = 0x1 << 2
    detected = np.where(flags & SURFACE_DETECT)
    # sparsely populate, leaving zeros for undetected
    pmtid[detected] = last_hit_triangles[detected]

    raw = packed.dtype
    packed[:, 3, 0] = np.arange(count, dtype=np.int32).view(raw)  # photon_id
    packed[:, 3, 1] = 0  # used in comparison against vbo prop
    packed[:, 3, 2] = flags.view(raw)  # history flags
    packed[:, 3, 3] = pmtid.view(raw)  # channel_id ie PmtId

    selected = packed[pmtid > 0] if hit else packed
    return selected.view(NPY)
def load_photons(number=100, nphotons=500000):
    """Benchmark how many photons per second are moved into GPU device memory.

    One host-side photon set of `nphotons` photons is built up front.
    Each trial then times the construction of a `gpu.GPUPhotons` from it,
    followed by a context synchronize.

    Args:
        number: Number of trials to run; the first trial is never recorded.
        nphotons: Number of photons in the uploaded set.

    Returns:
        `nphotons` divided by a `ufloat` built from the mean and standard
        deviation of the recorded wall-clock times (seconds).
    """
    origins = np.zeros((nphotons, 3))
    directions = sample.uniform_sphere(nphotons)
    polarizations = normalize(
        np.cross(sample.uniform_sphere(nphotons), directions))
    wavelengths = np.random.uniform(400, 800, size=nphotons)
    photons = event.Photons(origins, directions, polarizations, wavelengths)

    timings = []
    for trial in tools.progress(list(range(number))):
        started = time.time()
        # The result is otherwise unused; the name is kept so the object
        # stays referenced through the synchronize below.
        gpu_photons = gpu.GPUPhotons(photons)
        cuda.Context.get_current().synchronize()
        duration = time.time() - started

        if trial == 0:
            # first kernel call incurs some driver overhead
            continue
        timings.append(duration)

    return nphotons / ufloat((np.mean(timings), np.std(timings)))
def get_flat_hits(self, gpu_detector, target_flag=(0x1 << 2), nthreads_per_block=64,
                  max_blocks=1024, start_photon=None, nphotons=None, no_map=False):
    '''Return the hit photons as a single host-side event.Photons.

    Works in two passes over the photon range: the `count_photon_hits`
    kernel counts the qualifying photons, then output arrays of exactly
    that size are allocated and filled by the `copy_photon_hits` kernel.
    The selection itself happens inside those kernels; as originally
    described, a photon qualifies when `target_flag` is set in its history
    word and it was detected by a channel.

    Args:
        gpu_detector: Supplies `solid_id_map` and `detector_gpu` to both
            kernels.
        target_flag: Bit mask passed to the kernels as a uint32.
        nthreads_per_block: Threads per block for the kernel launches.
        max_blocks: Upper bound on blocks per launch, via `chunk_iterator`.
        start_photon: Index of the first photon to consider (default 0).
        nphotons: Number of photons to consider (default: everything from
            `start_photon` to the end of `self.pos`).
        no_map: Not referenced by this method.

    Returns:
        event.Photons built from the downloaded pos, dir, pol,
        wavelengths, t, last_hit_triangles, flags, weights, evidx and
        channels arrays. The three vector arrays are reshaped to (N, 3)
        float32.
    '''
    cuda.Context.get_current().synchronize()
    index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
    cuda.Context.get_current().synchronize()

    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = self.pos.size - start_photon

    # First count how much space we need
    for first_photon, photons_this_round, blocks in chunk_iterator(
            nphotons, nthreads_per_block, max_blocks):
        self.gpu_funcs.count_photon_hits(
            np.int32(start_photon + first_photon),
            np.int32(photons_this_round),
            np.uint32(target_flag),
            self.flags,
            gpu_detector.solid_id_map,
            self.last_hit_triangles,
            gpu_detector.detector_gpu,
            index_counter_gpu,
            block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
    cuda.Context.get_current().synchronize()
    reduced_nphotons = int(index_counter_gpu.get()[0])

    # Then allocate new storage space (device side; distinct names keep the
    # device buffers apart from the host arrays downloaded further down)
    pos_gpu = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    dir_gpu = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    pol_gpu = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
    wavelengths_gpu = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    t_gpu = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    last_hit_triangles_gpu = ga.empty(shape=reduced_nphotons, dtype=np.int32)
    flags_gpu = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    weights_gpu = ga.empty(shape=reduced_nphotons, dtype=np.float32)
    evidx_gpu = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
    channels_gpu = ga.empty(shape=reduced_nphotons, dtype=np.int32)

    # And finally copy hits, if there are any
    if reduced_nphotons > 0:
        # The counter is reused by the copy kernel, so start it from zero.
        index_counter_gpu.fill(0)
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.copy_photon_hits(
                np.int32(start_photon + first_photon),
                np.int32(photons_this_round),
                np.uint32(target_flag),
                gpu_detector.solid_id_map,
                gpu_detector.detector_gpu,
                index_counter_gpu,
                # source arrays
                self.pos, self.dir, self.wavelengths, self.pol, self.t,
                self.flags, self.last_hit_triangles, self.weights,
                self.evidx,
                # destination arrays, same order as the sources, then channels
                pos_gpu, dir_gpu, wavelengths_gpu, pol_gpu, t_gpu,
                flags_gpu, last_hit_triangles_gpu, weights_gpu,
                evidx_gpu,
                channels_gpu,
                block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        # Both passes must agree on the number of hits.
        assert index_counter_gpu.get()[0] == reduced_nphotons

    # Download everything to the host.
    pos = pos_gpu.get().view(np.float32).reshape((len(pos_gpu), 3))
    direction = dir_gpu.get().view(np.float32).reshape((len(dir_gpu), 3))
    pol = pol_gpu.get().view(np.float32).reshape((len(pol_gpu), 3))
    wavelengths = wavelengths_gpu.get()
    t = t_gpu.get()
    last_hit_triangles = last_hit_triangles_gpu.get()
    flags = flags_gpu.get()
    weights = weights_gpu.get()
    evidx = evidx_gpu.get()
    channels = channels_gpu.get()

    return event.Photons(pos, direction, pol, wavelengths, t,
                         last_hit_triangles, flags, weights, evidx, channels)
def _simulate_batch(self, batch_events, keep_photons_beg=False, keep_photons_end=False,
                    keep_hits=True, run_daq=False, max_steps=100, verbose=False):
    '''Propagate one batch of events on the GPU and yield each finished event.

    Assumes batch_events is a list of Event objects with photons_beg having
    evidx set to the index in the array. The photons of all events are
    joined, uploaded and propagated together; the results are then split
    back per event using the cumulative photon counts.

    Do not call directly.

    Args:
        batch_events: Events to simulate; each is mutated in place.
        keep_photons_beg: When false, `photons_beg` is set to None on each
            event.
        keep_photons_end: When true, the propagated photons are downloaded
            and each event receives its slice as `photons_end`.
        keep_hits: When true and `self.detector` has `num_channels`, each
            event receives a `hits` dict of channel -> photons, limited to
            channels with at least one photon for that event.
        run_daq: When true and `self` has `gpu_daq`, the DAQ is run per
            event and its result stored as `channels`.
        max_steps: Forwarded to `GPUPhotons.propagate`.
        verbose: When true, print copy/propagate/batch timings.

    Yields:
        Each element of `batch_events`, in order, after it is filled in.
    '''
    t_start = timer()

    #Idea: allocate memory on gpu and copy photons into it, instead of concatenating on CPU?
    batch_photons = event.Photons.join(
        [ev.photons_beg for ev in batch_events])
    # batch_bounds[k]:batch_bounds[k+1] is the photon range of event k.
    batch_bounds = np.cumsum(
        np.concatenate([[0], [len(ev.photons_beg) for ev in batch_events]]))

    #This copy to gpu has a _lot_ of overhead, want 100k photons at least, hence batches
    #Assume triangles, and weights are unimportant to copy to GPU
    t_copy_start = timer()
    gpu_photons = gpu.GPUPhotons(batch_photons, copy_triangles=False,
                                 copy_weights=False)
    t_copy_end = timer()
    if verbose:
        print('GPU copy took %0.2f s' % (t_copy_end - t_copy_start))

    t_prop_start = timer()
    tracking = gpu_photons.propagate(
        self.gpu_geometry, self.rng_states,
        nthreads_per_block=self.nthreads_per_block,
        max_blocks=self.max_blocks,
        max_steps=max_steps,
        track=self.photon_tracking)
    t_prop_end = timer()
    if verbose:
        print('GPU propagate took %0.2f s' % (t_prop_end - t_prop_start))

    # The batch timer stops here, before any results are downloaded.
    t_end = timer()
    if verbose:
        print('Batch took %0.2f s' % (t_end - t_start))

    if keep_photons_end:
        batch_photons_end = gpu_photons.get()

    # Decided once: batch_hits exists only if this was true up front.
    collect_hits = hasattr(self.detector, 'num_channels') and keep_hits
    if collect_hits:
        batch_hits = gpu_photons.get_hits(self.gpu_geometry)

    event_ranges = zip(batch_bounds[:-1], batch_bounds[1:])
    for ev_index, (batch_ev, (start_photon, end_photon)) in enumerate(
            zip(batch_events, event_ranges)):

        if not keep_photons_beg:
            batch_ev.photons_beg = None

        if self.photon_tracking:
            step_photon_ids, step_photons = tracking
            nphotons = end_photon - start_photon
            photon_tracks = [[] for _ in range(nphotons)]
            for step_ids, step_batch in zip(step_photon_ids, step_photons):
                mask = np.logical_and(step_ids >= start_photon,
                                      step_ids < end_photon)
                # Stop at the first step holding none of this event's photons.
                # NOTE(review): assumes later steps cannot contain them
                # either -- confirm against propagate()'s tracking output.
                if np.count_nonzero(mask) == 0:
                    break
                photon_ids = step_ids[mask] - start_photon
                photons = step_batch[mask]
                #Indexing Photons with a scalar changes the internal array shapes...
                for row, photon_id in enumerate(photon_ids):
                    photon_tracks[photon_id].append(photons[row])
            batch_ev.photon_tracks = [
                event.Photons.join(track, concatenate=False)
                if len(track) > 0 else event.Photons()
                for track in photon_tracks
            ]

        if keep_photons_end:
            batch_ev.photons_end = batch_photons_end[start_photon:end_photon]

        if collect_hits:
            #Thought: this is kind of expensive computationally, but keep_hits is for diagnostics
            event_hits = {}
            for chan in batch_hits:
                chan_hits = batch_hits[chan]
                # evidx is matched against this event's position in the batch.
                selected = chan_hits[chan_hits.evidx == ev_index]
                if len(selected) > 0:
                    event_hits[chan] = selected
            batch_ev.hits = event_hits

        if hasattr(self, 'gpu_daq') and run_daq:
            #Must run DAQ per event, or design a much more complicated daq algorithm
            self.gpu_daq.begin_acquire()
            self.gpu_daq.acquire(
                gpu_photons, self.rng_states,
                start_photon=start_photon,
                nphotons=(end_photon - start_photon),
                nthreads_per_block=self.nthreads_per_block,
                max_blocks=self.max_blocks)
            gpu_channels = self.gpu_daq.end_acquire()
            batch_ev.channels = gpu_channels.get()

        yield batch_ev