def __init__(self,
             gpu_detector,
             ntdcs=None,
             ns_per_tdc=None,
             adc_bits=None,
             ndaq=1,
             cl_context=None,
             cl_queue=None):
    """constructor.

    Args:
      gpu_detector: GPUDetector
    Keywords:
      ntdcs: int
        number of time bins per channel;
        if not supplied, the class variable GPUDaqLAr1ND.NTDC is used
      ns_per_tdc: float
        nanoseconds per time bin;
        if not supplied, the class variable GPUDaqLAr1ND.NS_PER_TDC is used
      adc_bits: int
        number of ADC bits (not used yet)
      ndaq: int
        number of daqs
      cl_context: pyopencl.Context
      cl_queue: pyopencl.CommandQueue
    Raises:
      ValueError when ntdcs or ns_per_tdc is still None after falling
        back to the class variables
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    # Always store the resolved values.  Previously the attributes were only
    # assigned when the argument was None, so a caller-supplied value was
    # never stored and the super().__init__ call below read an unset
    # attribute.
    self.ntdcs = GPUDaqLAr1ND.NTDC if ntdcs is None else ntdcs
    self.ns_per_tdc = (GPUDaqLAr1ND.NS_PER_TDC
                       if ns_per_tdc is None else ns_per_tdc)

    super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                       ntdcs=self.ntdcs,
                                       ns_per_tdc=self.ns_per_tdc,
                                       adc_bits=adc_bits,
                                       ndaq=ndaq,
                                       cl_context=cl_context,
                                       cl_queue=cl_queue)

    # Checked after the parent constructor, as in the original ordering.
    if self.ntdcs is None:
        raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
    if self.ns_per_tdc is None:
        raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

    # The kernel source sits next to this Python file; only the extension
    # differs between the two GPU APIs.
    kernel_filepath = os.path.dirname(
        os.path.realpath(__file__)) + "/daq_lar1nd"
    if api.is_gpu_api_cuda():
        self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")
    self.gpu_funcs = GPUFuncs(self.module)
def __init__(self, cl_context=None):
    """Load the pdf kernel module for the active GPU API.

    Keywords:
      cl_context: pyopencl.Context
        passed to the OpenCL module loader; not used in CUDA mode
    Raises:
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    if api.is_gpu_api_cuda():
        self.module = cutools.get_cu_module('pdf.cu',
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = cltools.get_cl_module('pdf.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        # Without this branch the failure surfaced one line later as an
        # AttributeError on self.module, hiding the real cause.
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")
    self.gpu_funcs = GPUFuncs(self.module)
def setUp(self):
    """Build the OpenCL fixtures used by the sample-CDF tests.

    Stores the context, the compiled ``test_sample_cdf.cl`` module and its
    kernel wrappers, one RNG state per thread of a 256-thread block, and a
    freshly recreated ROOT output file.
    """
    threads = 256
    context = cltools.get_last_context()
    # Add the current directory to the include path on top of the
    # package-wide build options.
    build_options = ('-I.', ) + api_options

    self.context = context
    self.nthreads_per_block = threads
    self.myoptions = build_options

    module = get_module("test_sample_cdf.cl",
                        context,
                        options=build_options,
                        include_source_directory=True)
    self.mod = module
    self.funcs = GPUFuncs(module)
    self.rng_states = clrand.get_rng_states(context, threads)
    self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")
def area_sort_nodes(gpu_geometry, layer_bounds):
    """Run the ``area_sort_child`` kernel over the BVH layers (CUDA only).

    Args:
      gpu_geometry: object passed straight to the kernel; its ``nodes``
        attribute is read back at the end
      layer_bounds: sequence of layer boundary indices; consecutive entries
        form the (start, end) range of one layer
    Returns:
      the result of ``gpu_geometry.nodes.get()``
    """
    threads = 256
    kernels = GPUFuncs(
        get_cu_module('bvh.cu',
                      options=cuda_options,
                      include_source_directory=True))

    # Pair up consecutive bounds, drop the final pair, and walk the
    # remaining layers from last to first.
    layer_ranges = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    for start, end in reversed(layer_ranges):
        kernels.area_sort_child(np.uint32(start),
                                np.uint32(end),
                                gpu_geometry,
                                block=(threads, 1, 1),
                                grid=(120, 1))
    return gpu_geometry.nodes.get()
def fill_array(context, rng_states, size):
    """Fill a float32 device array with the ``fillArray`` kernel.

    Args:
      context: pyopencl context used for the queue and the module build
      rng_states: device object whose ``.data`` buffer is handed to the kernel
      size: number of float32 elements to produce
    Returns:
      host copy of the filled array
    """
    threads = 256
    queue = cl.CommandQueue(context)
    out_gpu = cl.array.empty(queue, size, dtype=np.float32)
    kernels = GPUFuncs(
        get_cl_module("random.cl",
                      context,
                      options=cl_options,
                      include_source_directory=True))

    # One chunk of at most `threads` elements per launch; the block count
    # reported by the iterator is not needed for an OpenCL launch.
    for offset, count, _ in chunk_iterator(size, threads, max_blocks=1):
        kernels.fillArray(queue, (count, 1, 1), None, np.uint32(offset),
                          rng_states.data, out_gpu.data)
    return out_gpu.get()
def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
    """Allocate the per-channel DAQ buffers and load the daq kernels.

    Every buffer holds ``gpu_detector.nchannels * ndaq`` entries.

    Args:
      gpu_detector: detector object providing ``nchannels``,
        ``solid_id_map``, ``solid_id_to_channel_index_gpu`` and, in CUDA
        mode, ``detector_gpu``
    Keywords:
      ndaq: int
        number of daqs
      cl_context: pyopencl.Context (OpenCL mode only)
      cl_queue: pyopencl.CommandQueue (OpenCL mode only)
    Raises:
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    if api.is_gpu_api_cuda():
        nslots = gpu_detector.nchannels * ndaq
        self.earliest_time_gpu = ga.empty(nslots, dtype=np.float32)
        time_int = ga.empty(nslots, dtype=np.uint32)
        self.earliest_time_int_gpu = time_int
        self.channel_history_gpu = ga.zeros_like(time_int)
        self.channel_q_int_gpu = ga.zeros_like(time_int)
        self.channel_q_gpu = ga.zeros(len(time_int), dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.module = cutools.get_cu_module('daq.cu',
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        nslots = gpu_detector.nchannels * ndaq
        self.earliest_time_gpu = ga.empty(cl_queue, nslots, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(cl_queue,
                                              nslots,
                                              dtype=np.uint32)
        self.channel_history_gpu = ga.zeros(cl_queue, nslots, dtype=np.uint32)
        self.channel_q_int_gpu = ga.zeros(cl_queue, nslots, dtype=np.uint32)
        self.channel_q_gpu = ga.zeros(cl_queue, nslots, dtype=np.float32)
        # struct not made in opencl mode, so we keep a copy of the class
        self.detector_gpu = gpu_detector
        self.module = cltools.get_cl_module('daq.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")

    self.solid_id_map_gpu = gpu_detector.solid_id_map
    self.solid_id_to_channel_index_gpu = \
        gpu_detector.solid_id_to_channel_index_gpu
    self.gpu_funcs = GPUFuncs(self.module)
    self.ndaq = ndaq
    self.stride = gpu_detector.nchannels
def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles, flags,
             weights):
    '''Wrap slices of the GPUArrays owned by a GPUPhotons instance.

    NOTE THESE ARE NOT CPU ARRAYS!  The arguments are stored as given; no
    copy is made here.'''
    self.pos, self.dir, self.pol = pos, dir, pol
    self.wavelengths, self.t = wavelengths, t
    self.last_hit_triangles, self.flags = last_hit_triangles, flags
    self.weights = weights

    self.gpu_funcs = GPUFuncs(
        get_cu_module('propagate.cu', options=cuda_options))

    # A slice always represents exactly one copy of the event.
    self.true_nphotons = len(pos)
    self.ncopies = 1
def collapse_chains(nodes, layer_bounds):
    """Run the ``collapse_child`` kernel over the BVH layers.

    Args:
      nodes: host node array; uploaded to the device before the kernel runs
      layer_bounds: sequence of layer boundary indices; consecutive entries
        form the (start, end) range of one layer
    Returns:
      host copy of the device node array after all launches
    Raises:
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    threads = 256
    use_cuda = gpuapi.is_gpu_api_cuda()

    if use_cuda:
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA or OpenCL')
    kernels = GPUFuncs(bvh_module)

    # Upload the nodes and pick the launch convention for the active API.
    if use_cuda:
        gpu_nodes = ga.to_gpu(nodes)

        def collapse(start, end):
            kernels.collapse_child(np.uint32(start),
                                   np.uint32(end),
                                   gpu_nodes,
                                   block=(threads, 1, 1),
                                   grid=(120, 1))
    else:
        gpu_nodes = ga.to_device(queue, nodes)

        def collapse(start, end):
            # one work-item per node of the layer; wait for each launch
            kernels.collapse_child(queue, (end - start, 1, 1), None,
                                   np.uint32(start), np.uint32(end),
                                   gpu_nodes.data).wait()

    # Pair up consecutive bounds, drop the final pair, and walk the
    # remaining layers from last to first.
    layer_ranges = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    for start, end in reversed(layer_ranges):
        collapse(start, end)
    return gpu_nodes.get()
class GPUPhotons(object):
    """Photon state arrays held on the GPU, plus the propagate kernels.

    The constructor allocates one device array per photon property, uploads
    the supplied photons, and compiles ``propagate.cu`` or ``propagate.cl``
    depending on the active GPU API.
    """

    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - cl_context: pyopencl.Context, *optional*
                   Used in OpenCL mode to create the command queue and
                   to build the kernel module.
        """
        nphotons = len(photons)

        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        # NOTE(review): there is no else branch; with neither API active the
        # arrays are never created and the .set() calls below will fail.

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        # NOTE(review): this launch uses the block=/grid= keywords in both
        # API modes, while the OpenCL launch in propagate() passes a queue
        # and global size first -- ncopies > 1 looks CUDA-only; confirm.
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def define_texture_references(self, module=None):
        """Fetch the CUDA texture references declared in ``module``.

        Uses ``self.module`` when ``module`` is None.  In CUDA mode this
        looks up the node, extra-node, vertex and triangle texture
        references, sets their formats, and marks the textures as not yet
        bound.  In OpenCL mode it does nothing.
        """
        # unbound texture references declared for use with propagate
        if module == None:
            module = self.module
        if api.is_gpu_api_cuda():
            self.node_texture_ref = module.get_texref("nodevec_tex_ref")
            self.node_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32,
                                             4)

            self.extra_node_texture_ref = module.get_texref(
                "extra_node_tex_ref")
            self.extra_node_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.vertices_texture_ref = module.get_texref(
                "verticesvec_tex_ref")
            self.vertices_texture_ref.set_format(cuda.array_format.FLOAT, 4)

            self.triangles_texture_ref = module.get_texref(
                "trianglesvec_tex_ref")
            self.triangles_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            # propagate() binds the textures on first use and flips this flag
            self.node_texture_ref_bound = False
        elif api.is_gpu_api_opencl():
            # texture usage not used at the moment
            pass

    def get(self):
        """Copy the photon arrays back to the host as an ``event.Photons``.

        The three vector arrays are viewed as float32 rows: 3 columns in
        CUDA mode, 4 columns in OpenCL mode.
        """
        ncols = 3
        if api.is_gpu_api_opencl():
            ncols = 4  # must include padding
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), ncols))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), ncols))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), ncols))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in xrange(self.ncopies):
            # copy i occupies the i-th contiguous run of true_nphotons slots
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    @profile_if_possible
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        ``scatter_first`` is passed to the kernel on the first step only and
        is reset to 0 afterwards.  ``cl_context`` is used in OpenCL mode to
        create the command queue for the launches.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size

        # bind node texture reference
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support vector times right now for binding
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
        # Host-side queues have one extra leading slot (index 0) in front of
        # the photon indices.
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        # the kernels take the weights switch as an int32
        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        # adapt_factor scales max_blocks per launch; the code that would
        # lower it is commented out below, so it stays at 1.0.
        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    # the device input queue is passed without its leading slot
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    # Geometry, material and surface data are passed as
                    # individual arguments here, where the CUDA launch
                    # passes a single gpu_geometry.gpudata.  Blocks on each
                    # launch via .wait().
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize(
                    )  # ensure all threads done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # new style
                    # Read the output queue back to the host, take the new
                    # photon count from its first slot, upload it as the
                    # next input queue, and reset the output counter to 1.
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    # NOTE(review): the CUDA branch subtracts 1 from the
                    # counter and this branch does not; presumably the two
                    # kernels count differently -- confirm against
                    # propagate.cl.
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                if nphotons == 0:
                    break

        end_prop = time.time()
        print "propagation time: ", end_prop - start_prop, " secs"
        end_flags = self.flags.get()
        # Bit 31 is the top bit of a uint32 flag word, so it is set in the
        # maximum exactly when it is set in at least one photon's flags.
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print >> sys.stderr, "WARNING: ABORTED PHOTONS"
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a GPUPhotonsSlice containing only photons that
        have a particular bit set in their history word.

        ``start_photon`` defaults to 0 and ``nphotons`` to every photon from
        ``start_photon`` to the end of the arrays.

        NOTE(review): this method calls cuda.Context and the queue-less
        ga.zeros/ga.empty forms without checking the GPU API, so it looks
        CUDA-only -- confirm.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu,
                                         self.flags,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finaly copy photons, if there are any
        if reduced_nphotons > 0:
            # the same counter is reused by the copy pass, so reset it first
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon +
                                                     first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            index_counter_gpu,
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            pos,
                                            dir,
                                            wavelengths,
                                            pol,
                                            t,
                                            flags,
                                            last_hit_triangles,
                                            weights,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))
            # the copy pass must have written exactly as many photons as the
            # count pass found
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights)

    def __del__(self):
        """Drop the references to the main photon arrays, then run the GC."""
        # NOTE(review): weights, current_node_index and requested_workcode
        # are not deleted explicitly here, and each del raises
        # AttributeError if __init__ failed before creating the attribute.
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        """Return the number of photon slots (``self.pos.size``)."""
        return self.pos.size
def optimize_layer(orig_nodes):
    """Reorder the nodes of one layer on the GPU by pairwise swaps (CUDA).

    For each even position ``2*i`` the ``min_distance_to`` kernel scans up
    to 8192*50 later nodes.  If the mapped flag stays 0 the position is
    skipped; otherwise the node at ``2*i + 1`` is swapped with the candidate
    whose entry in ``min_areas`` is smallest.  Progress is printed every
    10000 positions and once more at the end.

    Args:
      orig_nodes: host node array; uploaded with ``ga.to_gpu``
    Returns:
      host copy of the reordered device node array
    """
    kernels = GPUFuncs(
        get_cu_module('bvh.cu',
                      options=cuda_options,
                      include_source_directory=True))
    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    npairs = n // 2
    areas = ga.empty(shape=npairs, dtype=np.uint64)

    threads = 128
    min_areas = ga.empty(shape=int(np.ceil(n / float(threads))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    report_every = 10000
    skip_size = 1
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)

    def pair_areas_on_host():
        # Run pair_area over all node pairs and fetch the result.
        for first, count, nblocks in \
                chunk_iterator(npairs, threads, max_blocks=10000):
            kernels.pair_area(np.uint32(first),
                              np.uint32(count),
                              nodes,
                              areas,
                              block=(threads, 1, 1),
                              grid=(nblocks, 1))
        return areas.get()

    i = 0
    skips = 0
    swaps = 0
    while i < npairs - 1:
        # How are we doing?
        if i % report_every == 0:
            host_areas = pair_areas_on_host()
            print('Area of parent layer so far (%d): %1.12e' %
                  (i * 2, host_areas.astype(float).sum()))
            print('Skips: %d, Swaps: %d' % (skips, swaps))

        probe = i * 2
        blocks_used = 0
        look_forward = min(8192 * 50, n - probe - 2)
        skip_this_round = min(skip_size, n - probe - 1)

        flag[:] = 0
        for first, count, nblocks in \
                chunk_iterator(look_forward, threads, max_blocks=10000):
            kernels.min_distance_to(np.uint32(first + probe + 2),
                                    np.uint32(count),
                                    np.uint32(probe),
                                    nodes,
                                    np.uint32(blocks_used),
                                    min_areas,
                                    min_index,
                                    cutools.Mapped(flag),
                                    block=(threads, 1, 1),
                                    grid=(nblocks, skip_this_round))
            blocks_used += nblocks
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            # Nothing flagged for this position: advance without swapping.
            nonzero = flag.nonzero()[0]
            advance = skip_size if len(nonzero) == 0 else nonzero[0]
            i += advance
            skips += advance
            continue

        # Pick the candidate from the block that reported the smallest area.
        min_areas_host = min_areas[:blocks_used].get()
        min_index_host = min_index[:blocks_used].get()
        better_i = min_index_host[min_areas_host.argmin()]
        swaps += 1

        assert 0 < better_i < len(nodes)
        assert 0 < probe + 1 < len(nodes)
        kernels.swap(np.uint32(probe + 1),
                     np.uint32(better_i),
                     nodes,
                     block=(1, 1, 1),
                     grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    host_areas = pair_areas_on_host()
    print('Final area of parent layer: %1.12e' % host_areas.sum())
    print('Skips: %d, Swaps: %d' % (skips, swaps))
    return nodes.get()
def concatenate_layers(layers): nthreads_per_block = 1024 context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) else: raise RuntimeError('API neither CUDA nor OpenCL?!') bvh_funcs = GPUFuncs(bvh_module) # Put 0 at beginning of list layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0) # allocate memory if gpuapi.is_gpu_api_cuda(): nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4) elif gpuapi.is_gpu_api_opencl(): totsize = 0 layer_pos = [] print layer_bounds[-1] for n, layer in enumerate(layers): layer_pos.append(totsize) print "LAYER ", n, " size=", len(layer), "start=", totsize totsize += len(layer) print "totsize: ", totsize nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4) nodes_iter_gpu = ga.to_device(queue, nodes_iter_np) nodeset_np = [] else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer = 0 for layer_start, layer_end, layer in zip(layer_bounds[:-1], layer_bounds[1:], layers): if layer_end == layer_bounds[-1]: # leaf nodes need no offset child_offset = 0 else: child_offset = layer_end #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset nmax_blocks = 10000 if gpuapi.is_gpu_api_opencl(): nthreads_per_block = 256 nmax_blocks = 1 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks): #print " ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start if gpuapi.is_gpu_api_cuda(): bvh_funcs.copy_and_offset(np.uint32(first_index), np.uint32(elements_this_iter), 
np.uint32(child_offset), cuda.In(layer), nodes[layer_start:], block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) elif gpuapi.is_gpu_api_opencl(): layer_gpu = ga.to_device(queue, layer) bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1), (1, 1, 1), np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(child_offset), np.uint32(layer_start), layer_gpu.data, nodes_iter_gpu.data, g_times_l=True).wait() else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer += 1 if gpuapi.is_gpu_api_cuda(): return nodes.get(), layer_bounds elif gpuapi.is_gpu_api_opencl(): return nodes_iter_gpu.get(), layer_bounds
def merge_nodes(nodes, degree, max_ratio=None):
    """Build the parent layer for ``nodes`` with the ``make_parents`` kernel.

    One parent is created per ``degree`` consecutive nodes, with one extra
    parent for a partial final group.  When ``max_ratio`` is not None, a
    second pass on the host replaces each parent whose children's summed
    area is below 0.3 of the parent's area with those children promoted
    into the parent layer.

    Args:
      nodes: host array of child nodes
      degree: number of children grouped under one parent
      max_ratio: only tested against None to enable the second pass.
        NOTE(review): its value is never used; the threshold is the
        hard-coded 0.3 below -- confirm whether that is intended.
    Returns:
      host array of parent nodes
    Raises:
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    # (integer division under Python 2; the remainder adds one more parent)
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        # the OpenCL path sizes the parent array with nparent, not nparent_pad
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Mark parents whose children cover less than 0.3 of the parent area.
        # The 'w' field packs the child count above CHILD_BITS and the first
        # child index in the bits left after masking out NCHILD_MASK.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        # Each marked parent may expand into up to `degree` promoted
        # children, i.e. at most degree - 1 additional slots.
        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # `offset` tracks how far later entries have been shifted right by
        # the children inserted so far.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            # Replace the parent by its first child, then add len(nodes) to
            # the child index stored in the promoted node's 'w' field.
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            # Opens nchild - 1 slots right after `index`; source and
            # destination slices overlap.
            new_parent_nodes[index +
                             nchild:] = new_parent_nodes[index + 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                # only nodes with a non-zero 'x' field get their child index
                # adjusted
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))
                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()

        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
def __init__(self,
             geometry,
             wavelengths=None,
             print_usage=False,
             min_free_gpu_mem=300e6,
             cl_context=None,
             cl_queue=None):
    """Upload ``geometry`` to the GPU and record memory-usage metadata.

    The constructor packages material and surface data, copies the mesh
    (vertices, triangles), the per-triangle material codes, colors and
    solid ids to the device, and then uploads as many BVH nodes as fit in
    the remaining device memory.  Nodes that do not fit are kept in
    ``self.extra_nodes``.  Every stage is logged through ``self.metadata``.

    Args:
        geometry: geometry object; this method reads ``mesh.vertices``,
            ``mesh.triangles``, ``bvh.nodes``, ``bvh.world_coords``,
            ``unique_materials``, ``unique_surfaces``, ``material1_index``,
            ``material2_index``, ``surface_index``, ``colors`` and
            ``solid_id`` from it.
        wavelengths: wavelength grid; ``standard_wavelengths`` is used when
            None.  Must be equally spaced.
        print_usage: when true, ``print_device_usage`` is called once more
            at the end.
        min_free_gpu_mem: amount of device memory to leave unused when
            deciding how many BVH nodes (and, on CUDA, whether the
            triangles/vertices) go onto the device.  Same units as the
            free-memory figure it is subtracted from
            (presumably bytes -- TODO confirm).
        cl_context: OpenCL context; only used by the OpenCL branches.
        cl_queue: OpenCL command queue; only used by the OpenCL branches.

    Raises:
        ValueError: if ``wavelengths`` is not equally spaced, or if the
            number of unique materials or surfaces is >= 0xff.
        AssertionError: if ``self.instance_count`` is not 1 after being
            incremented.
    """
    log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

    self.geometry = geometry

    # Guard against building more than one instance; on failure the assert
    # message expression prints the current stack (print_stack returns None).
    self.instance_count += 1
    assert self.instance_count == 1, traceback.print_stack()

    # Metadata records a snapshot at each stage ('a' .. 'g') of the upload.
    self.metadata = Metadata()
    self.metadata(None, 'preinfo')
    self.metadata('a', "start")
    self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

    if wavelengths is None:
        self.wavelengths = standard_wavelengths
    else:
        self.wavelengths = wavelengths

    # np.unique(...).item() only succeeds when there is exactly one distinct
    # spacing; anything else raises ValueError, which is re-raised with a
    # clearer message.
    try:
        self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    # This is where things get difficult:
    # pycuda and pyopencl give us very different methods for working with structs.
    #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    # Note that the two packaging helpers return differently shaped results:
    # three values on CUDA, (data, byte count) on OpenCL.
    if api.is_gpu_api_cuda():
        self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
        self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
    elif api.is_gpu_api_opencl():
        self.material_data, materials_bytes_cl = self._package_material_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths,
            self.wavelength_step)
        self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths,
            self.wavelength_step)

    self.metadata('b', "after materials,surfaces")
    if api.is_gpu_api_opencl():
        self.metadata[
            'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

    # Load Vertices and Triangles
    if api.is_gpu_api_cuda():
        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3,
                                     write_combined=True)
        self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                  dtype=np.float32)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3,
                                      write_combined=True)
        self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                   dtype=np.uint32)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        # 4-component copies: first three columns hold the data, the last
        # column stays zero.
        self.vertices4[:, :-1] = self.vertices.ravel().view(
            np.float32).reshape(len(self.vertices), 3)  # for textures
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        self.triangles4[:, :-1] = self.triangles.ravel().view(
            np.uint32).reshape(len(self.triangles), 3)  # for textures
    elif api.is_gpu_api_opencl():
        self.vertices = ga.empty(cl_queue,
                                 len(geometry.mesh.vertices),
                                 dtype=ga.vec.float3)
        self.triangles = ga.empty(cl_queue,
                                  len(geometry.mesh.triangles),
                                  dtype=ga.vec.uint3)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)

    # World origin: a host float3 on CUDA, a device array on OpenCL.
    if api.is_gpu_api_cuda():
        self.world_origin = ga.vec.make_float3(
            *geometry.bvh.world_coords.world_origin)
    elif api.is_gpu_api_opencl():
        self.world_origin = ga.vec.make_float3(
            *geometry.bvh.world_coords.world_origin)
        #self.world_origin = geometry.bvh.world_coords.world_origin
        self.world_origin = ga.to_device(cl_queue, self.world_origin)
        print type(self.world_origin), self.world_origin
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

    # Load material and surface indices into 8-bit codes
    # check if we've reached a complexity threshold
    if len(geometry.unique_materials) >= int(0xff):
        raise ValueError(
            'Number of materials to index has hit maximum of %d' %
            (int(0xff)))
    if len(geometry.unique_surfaces) >= int(0xff):
        raise ValueError(
            'Number of surfaces to index has hit maximum of %d' %
            (int(0xff)))
    # make bit code: material1 in bits 24-31, material2 in bits 16-23,
    # surface in bits 8-15
    material_codes = (((geometry.material1_index & 0xff) << 24) |
                      ((geometry.material2_index & 0xff) << 16) |
                      ((geometry.surface_index & 0xff) << 8)).astype(
                          np.uint32)
    if api.is_gpu_api_cuda():
        self.material_codes = ga.to_gpu(material_codes)
    elif api.is_gpu_api_opencl():
        self.material_codes = ga.to_device(cl_queue, material_codes)

    # assign color codes
    colors = geometry.colors.astype(np.uint32)
    if api.is_gpu_api_cuda():
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
    elif api.is_gpu_api_opencl():
        self.colors = ga.to_device(cl_queue, colors)
        self.solid_id_map = ga.to_device(
            cl_queue, geometry.solid_id.astype(np.uint32))

    # Limit memory usage by splitting BVH into on and off-GPU parts
    self.metadata('c', "after colors, idmap")
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        # No free-memory query is used on OpenCL: free memory is estimated as
        # the recorded total minus the sizes of everything uploaded so far.
        # NOTE(review): 'gpu_total' is not set in this method; presumably it
        # is filled in by the Metadata object -- confirm.
        gpu_total = self.metadata['gpu_total']
        meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
        self.metadata[
            'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
        gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                meshdef_nbytes_cl)

    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(geometry.bvh.nodes)
    split_index = min(
        max(
            int((gpu_free - min_free_gpu_mem) /
                geometry.bvh.nodes.itemsize), 100), n_nodes)
    print "split index=", split_index, " vs. total nodes=", n_nodes

    # push nodes to GPU
    if api.is_gpu_api_cuda():
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    elif api.is_gpu_api_opencl():
        self.nodes = ga.to_device(cl_queue,
                                  geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index))  # forbid zero size

    # left over nodes
    if api.is_gpu_api_cuda():
        self.extra_nodes = mapped_empty(shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype,
                                        write_combined=True)
    elif api.is_gpu_api_opencl():
        self.extra_nodes = ga.empty(cl_queue,
                                    shape=n_extra,
                                    dtype=geometry.bvh.nodes.dtype)

    if split_index < n_nodes:
        log.info('Splitting BVH between GPU and CPU memory at node %d' %
                 split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
        splitting = 1
    else:
        splitting = 0

    self.metadata('d', "after nodes")
    if api.is_gpu_api_opencl():
        nodes_nbytes_cl = self.nodes.nbytes
        self.metadata[
            'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
    self.metadata.array("d_nodes", geometry.bvh.nodes)
    self.metadata['d_split_index'] = split_index
    self.metadata['d_extra_nodes_count'] = n_extra
    self.metadata['d_splitting'] = splitting
    self.print_device_usage(cl_context=cl_context)

    # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']
    self.metadata.array('e_triangles', self.triangles)
    if api.is_gpu_api_cuda():
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            log.info(
                'Optimization: Sufficient memory to move triangles onto GPU'
            )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0
    elif api.is_gpu_api_opencl():
        # On OpenCL the triangles were already created as a device array
        # above, so only the flag and the log message are produced here.
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            #self.triangles = ga.to_device(cl_queue,self.triangles)
            log.info(
                'Optimization: Sufficient memory to move triangles onto GPU'
            )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0

    self.metadata('e', "after triangles")
    self.metadata['e_triangles_gpu'] = ftriangles_gpu

    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        # Still based on 'd_gpu_used': the OpenCL estimate is not updated
        # after the triangle step.
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']
    self.metadata.array('f_vertices', self.vertices)
    if api.is_gpu_api_cuda():
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            log.info(
                'Optimization: Sufficient memory to move vertices onto GPU'
            )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0
    elif api.is_gpu_api_opencl():
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            #self.vertices = ga.to_gpu(self.vertices)
            log.info(
                'Optimization: Sufficient memory to move vertices onto GPU'
            )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0

    self.metadata('f', "after vertices")
    self.metadata['f_vertices_gpu'] = vertices_gpu

    if api.is_gpu_api_cuda():
        # Build the device-side Geometry struct; its size is taken from the
        # 'Geometry' type declared in geometry_types.h.
        geometry_source = cutools.get_cu_source('geometry_types.h')
        geometry_struct_size = characterize.sizeof('Geometry',
                                                   geometry_source)
        self.gpudata = make_gpu_struct(geometry_struct_size, [
            Mapped(self.vertices),
            Mapped(self.triangles), self.material_codes, self.colors,
            self.nodes,
            Mapped(self.extra_nodes), self.material_pointer_array,
            self.surface_pointer_array, self.world_origin,
            self.world_scale,
            np.int32(len(self.nodes))
        ])
    elif api.is_gpu_api_opencl():
        # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
        # We then build a geometry struct later in the kernel
        # provided below is example/test of passing the data
        #if True: # for debugging
        if False:  # disabled debug path; flip to True to exercise it
            # print "loading geometry_structs.cl"
            geostructsmod = cltools.get_cl_module(
                "geometry_structs.cl",
                cl_context,
                options=cltools.cl_options,
                include_source_directory=True)
            geostructsfunc = GPUFuncs(geostructsmod)
            geostructsfunc.make_geostruct(
                cl_queue, (3, ), None, self.vertices.data,
                self.triangles.data, self.material_codes.data,
                self.colors.data, self.nodes.data, self.extra_nodes.data,
                np.int32(len(geometry.unique_materials)),
                self.material_data['refractive_index'].data,
                self.material_data['absorption_length'].data,
                self.material_data['scattering_length'].data,
                self.material_data['reemission_prob'].data,
                self.material_data['reemission_cdf'].data,
                np.int32(len(geometry.unique_surfaces)),
                self.surface_data['detect'].data,
                self.surface_data['absorb'].data,
                self.surface_data['reemit'].data,
                self.surface_data['reflect_diffuse'].data,
                self.surface_data['reflect_specular'].data,
                self.surface_data['eta'].data, self.surface_data['k'].data,
                self.surface_data['reemission_cdf'].data,
                self.surface_data['model'].data,
                self.surface_data['transmissive'].data,
                self.surface_data['thickness'].data,
                self.surface_data['nplanes'].data,
                self.surface_data['wire_diameter'].data,
                self.surface_data['wire_pitch'].data,
                self.world_origin.data, self.world_scale,
                np.int32(len(self.nodes)), self.material_data['n'],
                self.material_data['step'],
                self.material_data["wavelength0"])
            cl_queue.finish()
            self.material_codes.get()
            # The debug path deliberately aborts construction.
            raise RuntimeError('bail')

    if print_usage:
        self.print_device_usage(cl_context=cl_context)
    log.info(self.device_usage_str(cl_context=cl_context))
    self.metadata('g', "after geometry struct")
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16): '''Compute the leaf nodes surrounding a triangle mesh. ``mesh``: chroma.geometry.Mesh Triangles to box ``morton_bits``: int Number of bits to use per dimension when computing Morton code. ``round_to_multiple``: int Round the number of nodes created up to multiple of this number Extra nodes will be all zero. Returns (world_coords, nodes, morton_codes), where ``world_coords``: chroma.bvh.WorldCoords Defines the fixed point coordinate system ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4) List of leaf nodes. Child IDs will be set to triangle offsets. ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64) Morton codes for each triangle, using ``morton_bits`` per axis. Must be <= 16 bits. ''' # it would be nice not to duplicate code, make functions transparent... context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) bvh_funcs = GPUFuncs(bvh_module) # compute world coordinates world_origin_np = mesh.vertices.min(axis=0) world_scale = np.max( (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2) world_coords = WorldCoords(world_origin=world_origin_np, world_scale=world_scale) # Put triangles and vertices into host and device memory # unfortunately, opencl and cuda has different methods for managing memory here # we have to write divergent code if gpuapi.is_gpu_api_cuda(): # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another. 
# no explicit requests for transfers here triangles = cutools.mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3, write_combined=True) triangles[:] = to_uint3(mesh.triangles) vertices = cutools.mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3, write_combined=True) vertices[:] = to_float3(mesh.vertices) #print triangles[0:10] #print vertices[0:10] # Call GPU to compute nodes nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types world_origin = ga.vec.make_float3(*world_origin_np) world_scale = np.float32(world_scale) # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks=30000): bvh_funcs.make_leaves(np.uint32(first_index), np.uint32(elements_this_iter), cutools.Mapped(triangles), cutools.Mapped(vertices), world_origin, world_scale, nodes, morton_codes, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) morton_codes_host = morton_codes.get() >> (16 - morton_bits) elif gpuapi.is_gpu_api_opencl(): # here we need to allocate a buffer on the host and on the device triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3) copy_to_uint3(mesh.triangles, triangles) vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3) copy_to_float3(mesh.vertices, vertices) # now create a buffer object on the device and push data to it triangles_dev = ga.to_device(queue, triangles) vertices_dev = ga.to_device(queue, vertices) # Call GPU to compute nodes nodes = ga.zeros(queue, shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types #world_origin = np.array(world_origin_np,dtype=np.float32) world_origin = np.empty(1, dtype=ga.vec.float3) world_origin['x'] = 
world_origin_np[0] world_origin['y'] = world_origin_np[1] world_origin['z'] = world_origin_np[2] world_scale = np.float32(world_scale) #print world_origin, world_scale # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks): print first_index, elements_this_iter, nblocks_this_iter bvh_funcs.make_leaves( queue, (nblocks_this_iter, 1, 1), (nthreads_per_block, 1, 1), #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None, np.uint32(first_index), np.uint32(elements_this_iter), triangles_dev.data, vertices_dev.data, world_origin, world_scale, nodes.data, morton_codes.data, g_times_l=True).wait() morton_codes_host = morton_codes.get() >> (16 - morton_bits) return world_coords, nodes.get(), morton_codes_host
# Pull the geometry pieces that were uploaded by the simulation.
origin = geo.bvh.world_coords.world_origin
nodes = sim.gpu_geometry.nodes
extra_node = sim.gpu_geometry.extra_nodes
triangles = sim.gpu_geometry.triangles
vertices = sim.gpu_geometry.vertices
print vertices.shape

# Repack the vertices into rows of four float32 values; the first three
# columns receive the vertex data and the last column stays zero.
vertices4 = np.zeros((len(vertices), 4), dtype=np.float32)
print vertices.get().ravel().view(np.float32).shape
vertices4[:, :-1] = vertices.get().ravel().view(np.float32).reshape(
    len(vertices), 3)

# Load the texture test kernels and look up their texture references.
module = get_module('test_texture.cu',
                    options=api_options,
                    include_source_directory=True)
gpu_funcs = GPUFuncs(module)
node_texture_ref = module.get_texref("node_tex_ref")
extra_node_texture_ref = module.get_texref("extra_node_tex_ref")
triangles_texture_ref = module.get_texref("triangles_tex_ref")
vertices_texture_ref = module.get_texref("vertices_tex_ref")

node_vec_texture_ref = module.get_texref("nodevec_tex_ref")
node_vec_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32, 4)

# Flat uint32 view of the nodes, uploaded and bound to the scalar node texture.
ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_gpu = ga.to_gpu(ur_nodes)
ur_nodes_gpu.bind_to_texref_ext(node_texture_ref)
nodes_nbytes = ur_nodes.nbytes

# Second upload of the same flat view, kept under a separate name
# (presumably for the 4-component texture reference above -- the binding
# is not visible here).
ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_vec_gpu = ga.to_gpu(ur_nodes)
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Build ``len(first_child)`` parent nodes out of ``nodes``.

    ``first_child[i]`` is the index of the first child of parent ``i`` and
    ``nchild[i]`` is how many consecutive children it has.  The merge runs
    on the GPU through the ``make_parents_detailed`` kernel and the parent
    nodes are returned as a host array.'''
    block_size = 256

    # OpenCL needs an explicit context and command queue; CUDA does not.
    cl_ctx = None
    cl_cmd_queue = None
    if gpuapi.is_gpu_api_opencl():
        cl_ctx = cltools.get_last_context()
        cl_cmd_queue = cl.CommandQueue(cl_ctx)

    # Load the BVH kernels for whichever API is active.
    if gpuapi.is_gpu_api_cuda():
        module = get_module('bvh.cu',
                            options=api_options,
                            include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        module = get_module('bvh.cl',
                            cl_ctx,
                            options=api_options,
                            include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    kernels = GPUFuncs(module)

    # Upload the inputs, allocate the output and define how one chunk is
    # launched on this API.
    if gpuapi.is_gpu_api_cuda():
        dev_nodes = ga.to_gpu(nodes)
        dev_first_child = ga.to_gpu(first_child.astype(np.int32))
        dev_nchild = ga.to_gpu(nchild.astype(np.int32))

        n_parents = len(first_child)
        dev_parents = ga.empty(shape=n_parents, dtype=ga.vec.uint4)

        def launch(start, count, nblocks):
            kernels.make_parents_detailed(np.uint32(start),
                                          np.uint32(count),
                                          dev_nodes,
                                          dev_parents,
                                          dev_first_child,
                                          dev_nchild,
                                          block=(block_size, 1, 1),
                                          grid=(nblocks, 1))
    elif gpuapi.is_gpu_api_opencl():
        dev_nodes = ga.to_device(cl_cmd_queue, nodes)
        dev_first_child = ga.to_device(cl_cmd_queue,
                                       first_child.astype(np.int32))
        dev_nchild = ga.to_device(cl_cmd_queue, nchild.astype(np.int32))

        n_parents = len(first_child)
        zeroed_parents = np.zeros(shape=n_parents, dtype=ga.vec.uint4)
        dev_parents = ga.to_device(cl_cmd_queue, zeroed_parents)

        def launch(start, count, nblocks):
            # One work item per element; the work-group size is left to the
            # runtime (local size None).
            kernels.make_parents_detailed(cl_cmd_queue, (count, 1, 1), None,
                                          np.uint32(start),
                                          np.uint32(count),
                                          dev_nodes.data,
                                          dev_parents.data,
                                          dev_first_child.data,
                                          dev_nchild.data).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run the kernel chunk by chunk.
    for start, count, nblocks in chunk_iterator(n_parents,
                                                block_size,
                                                max_blocks=10000):
        launch(start, count, nblocks)

    return dev_parents.get()
class GPUPDF(object): def __init__(self, cl_context=None): if api.is_gpu_api_cuda(): self.module = cutools.get_cu_module('pdf.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.module = cltools.get_cl_module('pdf.cl', cl_context, options=api_options, include_source_directory=True) self.gpu_funcs = GPUFuncs(self.module) def setup_pdf(self, nchannels, tbins, trange, qbins, qrange): """Setup GPU arrays to hold PDF information. nchannels: int, number of channels tbins: number of time bins trange: tuple of (min, max) time in PDF qbins: number of charge bins qrange: tuple of (min, max) charge in PDF """ self.events_in_histogram = 0 self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32) self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins), dtype=np.uint32) self.tbins = tbins self.trange = trange self.qbins = qbins self.qrange = qrange def clear_pdf(self): """Rezero the PDF counters.""" self.hitcount_gpu.fill(0) self.pdf_gpu.fill(0) def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64): self.gpu_funcs.bin_hits( np.int32(len(self.hitcount_gpu)), gpuchannels.q, gpuchannels.t, self.hitcount_gpu, np.int32(self.tbins), np.float32(self.trange[0]), np.float32(self.trange[1]), np.int32(self.qbins), np.float32(self.qrange[0]), np.float32(self.qrange[1]), self.pdf_gpu, block=(nthreads_per_block, 1, 1), grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1)) self.events_in_histogram += 1 def get_pdfs(self): """Returns the 1D hitcount array and the 3D [channel, time, charge] histogram.""" return self.hitcount_gpu.get(), self.pdf_gpu.get() def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth, trange, min_qwidth, qrange, min_bin_content=10, time_only=True): """Setup GPU arrays to compute PDF values for the given event. The pdf_eval calculation allows the PDF to be evaluated at a single point for each channel as the Monte Carlo is run. 
The effective bin size will be as small as (`min_twidth`, `min_qwidth`) around the point of interest, but will be large enough to ensure that `min_bin_content` Monte Carlo events fall into the bin. event_hit: ndarray Hit or not-hit status for each channel in the detector. event_time: ndarray Hit time for each channel in the detector. If channel not hit, the time will be ignored. event_charge: ndarray Integrated charge for each channel in the detector. If channel not hit, the charge will be ignored. min_twidth: float Minimum bin size in the time dimension trange: (float, float) Range of time dimension in PDF min_qwidth: float Minimum bin size in charge dimension qrange: (float, float) Range of charge dimension in PDF min_bin_content: int The bin will be expanded to include at least this many events time_only: bool If True, only the time observable will be used in the PDF. """ self.event_nhit = count_nonzero(event_hit) # Define a mapping from an array of len(event_hit) to an array of length event_nhit self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype( np.uint32) self.map_hit_offset_to_channel_id_gpu = ga.to_gpu( self.map_hit_offset_to_channel_id) self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32) self.map_channel_id_to_hit_offset_gpu = ga.to_gpu( self.map_channel_id_to_hit_offset) self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32)) self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32)) self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32)) self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32) self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, dtype=np.float32) self.nearest_mc_gpu.fill(1e9) self.min_twidth = min_twidth self.trange = trange self.min_qwidth = min_qwidth self.qrange = qrange self.min_bin_content = min_bin_content assert time_only # Only support time right now 
self.time_only = time_only def clear_pdf_eval(self): "Reset PDF evaluation counters to start accumulating new Monte Carlo." self.eval_hitcount_gpu.fill(0) self.eval_bincount_gpu.fill(0) self.nearest_mc_gpu.fill(1e9) @profile_if_possible def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000, cl_queue=None): "Add the most recent results of run_daq() to the PDF evaluation." if api.is_gpu_api_cuda(): self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq + 1), dtype=np.uint32) elif api.is_gpu_api_opencl(): self.work_queues = ga.empty(cl_queue, shape=self.event_nhit * (gpuchannels.ndaq + 1), dtype=np.uint32) self.work_queues.fill(1) if api.is_gpu_api_cuda(): self.gpu_funcs.accumulate_bincount( np.int32(self.event_hit_gpu.size), np.int32(gpuchannels.ndaq), self.event_hit_gpu, self.event_time_gpu, gpuchannels.t, self.eval_hitcount_gpu, self.eval_bincount_gpu, np.float32(self.min_twidth), np.float32(self.trange[0]), np.float32(self.trange[1]), np.int32(self.min_bin_content), self.map_channel_id_to_hit_offset_gpu, self.work_queues, block=(nthreads_per_block, 1, 1), grid=(self.event_hit_gpu.size // nthreads_per_block + 1, 1)) cuda.Context.get_current().synchronize() self.gpu_funcs.accumulate_nearest_neighbor_block( np.int32(self.event_nhit), np.int32(gpuchannels.ndaq), self.map_hit_offset_to_channel_id_gpu, self.work_queues, self.event_time_gpu, gpuchannels.t, self.nearest_mc_gpu, np.int32(self.min_bin_content), block=(nthreads_per_block, 1, 1), grid=(self.event_nhit, 1)) cuda.Context.get_current().synchronize() elif api.is_gpu_api_opencl(): self.gpu_funcs.accumulate_bincount( cl_queue, (nthreads_per_block, 1, 1), (self.event_hit_gpu.size // nthreads_per_block + 1, 1), np.int32(gpuchannels.ndaq), self.event_hit_gpu.data, self.event_time_gpu.data, gpuchannels.t.data, self.eval_hitcount_gpu.data, self.eval_bincount_gpu.data, np.float32(self.min_twidth), np.float32(self.trange[0]), np.float32(self.trange[1]), 
np.int32(self.min_bin_content), self.map_channel_id_to_hit_offset_gpu.data, self.work_queues.data, g_times_l=True) #cl.enqueue_barrier( cl_queue ) self.gpu_funcs.accumulate_nearest_neighbor_block( cl_queue, (nthreads_per_block, 1, 1), (self.event_nhit, 1), np.int32(self.event_nhit), np.int32(gpuchannels.ndaq), self.map_hit_offset_to_channel_id_gpu.data, self.work_queues.data, self.event_time_gpu.daa, gpuchannels.t.data, self.nearest_mc_gpu.data, np.int32(self.min_bin_content), g_time_l=True) #cl.enqueue_barrier( cl_queue ) def get_pdf_eval(self): evhit = self.event_hit_gpu.get().astype(bool) hitcount = self.eval_hitcount_gpu.get() bincount = self.eval_bincount_gpu.get() pdf_value = np.zeros(len(hitcount), dtype=float) pdf_frac_uncert = np.zeros_like(pdf_value) # PDF value for high stats bins high_stats = (bincount >= self.min_bin_content) if high_stats.any(): if self.time_only: pdf_value[high_stats] = bincount[high_stats].astype( float) / hitcount[high_stats] / self.min_twidth else: assert Exception('Unimplemented 2D (time,charge) mode!') pdf_frac_uncert[high_stats] = 1.0 / np.sqrt(bincount[high_stats]) # PDF value for low stats bins low_stats = ~high_stats & (hitcount > 0) & evhit nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape( (self.event_nhit, self.min_bin_content)) nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32) nearest_mc.fill(1e9) nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit # Deal with the case where we did not even get min_bin_content events # in the PDF but also clamp the lower range to ensure we don't index # by a negative number in 2 lines last_valid_entry = np.maximum( 0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1) distance = nearest_mc[np.arange(len(last_valid_entry)), last_valid_entry] if low_stats.any(): if self.time_only: pdf_value[low_stats] = ( last_valid_entry[low_stats] + 1).astype(float) / hitcount[ low_stats] / distance[low_stats] / 2.0 else: assert 
Exception('Unimplemented 2D (time,charge) mode!') pdf_frac_uncert[low_stats] = 1.0 / np.sqrt( last_valid_entry[low_stats] + 1) # PDFs with no stats got zero by default during array creation print 'high_stats:', high_stats.sum(), 'low_stats', low_stats.sum() return hitcount, pdf_value, pdf_value * pdf_frac_uncert
def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                        nodes, workgroupsize, comqueue):
    """Run the ``checknode`` work-queue kernel once and print its state.

    Debugging harness: it builds ``wq_checknode.cl``, prepares the photon,
    node-tree and queue buffers, launches ``checknode`` with global and
    local size both equal to ``workgroupsize``, waits for completion and
    prints the resulting per-photon and queue state.  Nothing is returned.

    Args:
        sim: simulation object; ``sim.gpu_geometry`` and
            ``sim.detector.node_dsar_tree`` / ``sim.detector.bvh`` are read.
        photons: device photon arrays (``pos``, ``dir``,
            ``current_node_index`` are used).
        ourphotons: not used in this method.
        max_shared_nodes: budget from which the number of nodes kept in
            local memory is derived.
        nodes: ignored; immediately replaced by ``sim.gpu_geometry.nodes``.
        workgroupsize: work-group size, also used to size local buffers.
        comqueue: OpenCL command queue.

    Raises:
        RuntimeError: if the geometry has more than one extra node, i.e.
            the BVH was split between device and host.
    """
    module = get_module('wq_checknode.cl',
                        self.context,
                        options=api_options,
                        include_source_directory=True)
    gpu_funcs = GPUFuncs(module)

    # gather variables for kernel call
    gpugeo = sim.gpu_geometry
    photon_pos = photons.pos
    photon_dir = photons.dir
    photon_current_node = photons.current_node_index
    # every photon starts with tested node 1 and last result -1
    photon_tested_node = ga.to_device(
        comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32))
    photon_last_result = ga.to_device(
        comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32))
    nodes = gpugeo.nodes  # overrides the `nodes` argument
    node_parent = ga.to_device(comqueue, sim.detector.node_dsar_tree.parent)
    node_first_daughter = ga.to_device(
        comqueue, sim.detector.node_dsar_tree.first_daughter)
    node_sibling = ga.to_device(comqueue,
                                sim.detector.node_dsar_tree.sibling)
    node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt)
    world_origin = gpugeo.world_origin_gpu
    world_scale = gpugeo.world_scale

    # make queue related variables
    # The queue has two slots per photon: the first half is filled with the
    # photon indices (flag 1), the second half with -1 (flag 0).
    queue_size = np.int32(len(photons.pos) * 2)
    queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32)
    queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32)
    queue_photon_index[0:len(photons.pos)] = np.arange(0,
                                                       len(photons.pos),
                                                       dtype=np.int32)[:]
    queue_photon_index[len(photons.pos):] = (
        np.ones(len(photons.pos), dtype=np.int32) * -1)[:]
    queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos),
                                                  dtype=np.int32)[:]
    # a, b, c only exist to supply element sizes (uint4, int32, uint32)
    # for the local-memory allocations below.
    a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4)
    b = np.array(1, dtype=np.int32)
    c = np.array(1, dtype=np.uint32)
    workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize)
    workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize)
    workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize)

    # Reserve 20 entries plus three per work item, then round the node
    # budget down to a multiple of 32.
    max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
    max_nodes_can_store -= max_nodes_can_store % 32
    max_nodes_can_store = np.int32(max_nodes_can_store)

    loaded_node_start_index = np.int32(0)
    loaded_node_end_index = np.int32(1)
    node_front_start = ga.empty(comqueue, 1, dtype=np.int32)
    node_front_end = ga.empty(comqueue, 1, dtype=np.int32)
    workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1))
    workgroup_daughter = cl.LocalMemory(c.nbytes *
                                        (max_nodes_can_store + 1))
    workgroup_sibling = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
    workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
    max_loops = 32

    # extra_nodes always has at least one element, so > 1 means a real split.
    if len(gpugeo.extra_nodes) > 1:
        raise RuntimeError('did not plan for there to be a node split.')

    # Dump the pre-launch state.
    print photon_current_node
    print photon_tested_node
    print queue_photon_index
    print queue_slot_flag

    print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
    print "Max nodes in shared: ", max_nodes_can_store
    print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
    print "Available local memsize: ", self.shared_mem_size
    print "Total number of nodes: ", len(
        nodes), " (", nodes.nbytes, " bytes)"
    print "Stored node size: ", max_nodes_can_store * a.nbytes
    print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
    print sim.detector.bvh.layer_bounds

    print "PRESUB CURRENT NODES"
    print photon_current_node
    print "PRESUB TESTED NODES"
    print photon_tested_node

    # Launch and time the kernel; .wait() blocks until it has finished.
    # The positional argument order must match the kernel signature.
    start_queue = time.time()
    gpu_funcs.checknode(
        comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1),
        np.int32(max_loops), photon_pos.data, photon_dir.data,
        photon_current_node.data, photon_tested_node.data,
        photon_last_result.data, np.int32(len(nodes)), nodes.data,
        node_parent.data, node_first_daughter.data, node_sibling.data,
        node_aunt.data, world_origin.data, world_scale, queue_size,
        queue_photon_index.data, queue_slot_flag.data,
        np.int32(len(photon_pos)), np.int32(workgroupsize),
        workgroup_photons, workgroup_current_node, workgroup_tested_node,
        max_nodes_can_store, workgroup_nodes, workgroup_daughter,
        workgroup_sibling, workgroup_aunt, loaded_node_start_index,
        loaded_node_end_index, node_front_start.data,
        node_front_end.data).wait()
    end_queue = time.time()

    print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
    print "(Current node, To Test, result)"
    # Print the per-photon state ten entries per line.
    node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                      photon_last_result.get())
    for x in xrange(0, len(node_states), 10):
        y = x + 10
        if y > len(node_states):
            y = len(node_states)
        print x, ": ", node_states[x:y]

    print "LAST RESULT:"
    print photon_last_result.get()

    # Print the queue contents 32 entries per line.
    print "PHOTON QUEUE"
    photon_queue = queue_photon_index.get()
    for x in xrange(0, len(photon_queue), 32):
        y = x + 32
        if y > len(photon_queue):
            y = len(photon_queue)
        print x, ": ", photon_queue[x:y]

    print "QUEUE SLOT FLAGS"
    slot_flags = queue_slot_flag.get()
    for x in xrange(0, len(slot_flags), 32):
        y = x + 32
        if y > len(slot_flags):
            y = len(slot_flags)
        print x, ": ", slot_flags[x:y]

    print "NODE FRONT: ", node_front_start.get(
    ), " to ", node_front_end.get(
    ), node_front_end.get() - node_front_start.get()
    return
def __init__(self, photons, ncopies=1, cl_context=None):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons
            on the GPU.  This is used if you want
            to propagate the same event many times,
            for example in a likelihood calculation.

            The amount of GPU storage will be proportionally
            larger if ncopies > 1, so be careful.
        - cl_context: pyopencl.Context, *optional*
            Used to create the command queue and to build
            ``propagate.cl`` when the OpenCL API is active.
            Not used with CUDA.

    Raises:
        RuntimeError: the active GPU API is neither CUDA nor OpenCL.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies

    # The CUDA and OpenCL array constructors are called the same way here
    # except that the OpenCL ones take a command queue as first argument.
    if api.is_gpu_api_cuda():
        alloc_args = ()
    elif api.is_gpu_api_opencl():
        alloc_args = (cl.CommandQueue(cl_context), )
    else:
        # Fail here instead of with an AttributeError on self.pos below.
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")

    def _alloc(dtype, zeroed=False):
        # Allocate one per-photon-slot device array.
        maker = ga.zeros if zeroed else ga.empty
        return maker(*alloc_args, shape=nslots, dtype=dtype)

    # Allocate GPU memory for photon info
    self.pos = _alloc(ga.vec.float3)
    self.dir = _alloc(ga.vec.float3)
    self.pol = _alloc(ga.vec.float3)
    self.wavelengths = _alloc(np.float32)
    self.t = _alloc(np.float32)
    self.last_hit_triangles = _alloc(np.int32)
    self.flags = _alloc(np.uint32)
    self.weights = _alloc(np.float32)
    self.current_node_index = _alloc(np.uint32, zeroed=True)  # deprecated
    self.requested_workcode = _alloc(np.uint32)  # deprecated

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    if api.is_gpu_api_cuda():
        self.module = get_module('propagate.cu',
                                 options=api_options,
                                 include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = get_module('propagate.cl',
                                 cl_context,
                                 options=api_options,
                                 include_source_directory=True)

    # define the texture references
    self.define_texture_references()

    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        # NOTE(review): this launch uses the CUDA block=/grid= keywords
        # only; no OpenCL form of the call is present -- confirm whether
        # ncopies > 1 is expected to work under OpenCL.
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                            np.int32(photons_this_round),
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            np.int32(ncopies - 1),
                                            np.int32(nphotons),
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1, seed=None, cl_context=None): """ Generates photons from information in the steps_arr Parameters ---------- steps_arr : numpy.array with shape=(N,10) dtype=np.float contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ] in the future could generalize this to many different time components. developed for liquid argon TPCs. multiple : float scale up the number of photons generated (not implemented yet) """ self.steps_array = steps_arr self.nsteps = self.steps_array.shape[0] if multiple!=1.0: raise RuntimeError('Have not implemented scaling of the number of photons generated.') # =========================== # GEN PHOTONS tstart_genphotons = time.time() # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here) # on the CPU, we scan the steps to determine the total number of photons using poisson statistics # we assume the user has seeded the random number generator to her liking tstart_nphotons = time.time() self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 ) #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int ) self.nphotons_per_step = self.steps_array[ self._nphotons, : ] self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() ) print "NSTEPS: ",self.nsteps print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons # now we make an index array for which step we need to get info from self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 ) current_index=0 for n, n_per_step in enumerate( self.nphotons_per_step ): self.source_step_index[current_index:current_index+n_per_step] = n current_index += n_per_step # push everything to the GPU tstart_transfer = time.time() if api.is_gpu_api_cuda(): # step info self.step_pos1_gpu = 
ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio ) self.source_step_index_gpu = ga.to_gpu( self.source_step_index ) # photon info self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) ) self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) elif api.is_gpu_api_opencl(): cl_queue = cl.CommandQueue( cl_context ) # step info self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_device( cl_queue, self.step_fsratio ) self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index ) # photon info self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) ) self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) ) self.t.set( self.steps_array[:,3] ) self.ncopies = ncopies self.true_nphotons = 
self.nphotons if self.ncopies!=1: raise ValueError('support for multiple copies not supported') if api.is_gpu_api_cuda(): self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True ) elif api.is_gpu_api_opencl(): self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True ) self.gpufuncs = GPUFuncs( self.gpumod ) print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer # need random numbers tgpu = time.time() if seed==None: seed = 5 rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context) for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks): if api.is_gpu_api_cuda(): self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu, self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states, self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights, block=(nthreads_per_block,1,1), grid=(blocks, 1) ) elif api.is_gpu_api_opencl(): self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None, np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data, self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states.data, self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait() else: raise RuntimeError("GPU API is neither CUDA nor OpenCL!") if api.is_gpu_api_cuda(): cuda.Context.get_current().synchronize() tend_genphotons = time.time() print 
"GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")" # Now load modules if api.is_gpu_api_cuda(): self.module = get_module('propagate.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True) # define the texture references self.define_texture_references() # get kernel functions self.gpu_funcs = GPUFuncs(self.module)
class TestSampling(unittest.TestCase):
    """Compare values drawn by the ``test_sample_cdf`` OpenCL kernel with a
    reference ROOT histogram."""

    def setUp(self):
        """Build the test module, the RNG states and the output ROOT file."""
        self.context = cltools.get_last_context()
        self.nthreads_per_block = 256
        self.myoptions = ('-I.', ) + api_options
        self.mod = get_module("test_sample_cdf.cl",
                              self.context,
                              options=self.myoptions,
                              include_source_directory=True)
        self.funcs = GPUFuncs(self.mod)
        self.rng_states = clrand.get_rng_states(self.context,
                                                self.nthreads_per_block)
        self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")

    def compare_sampling(self, hist, reps=10):
        """Sample ``reps`` values from the CDF of ``hist`` on the device.

        Returns:
            (prob, out_h): the value of ``out_h.KolmogorovTest(hist)`` and
            the histogram filled with the sampled values.
        """
        queue = cl.CommandQueue(self.context)

        # Tabulate the CDF: x holds the bin edges, y the running integral,
        # with (low edge of bin 1, 0.0) as the first point.
        nbins = hist.GetNbinsX()
        axis = hist.GetXaxis()
        integral = hist.GetIntegral()
        bin_numbers = range(1, nbins + 1)
        cdf_x = np.array([axis.GetBinLowEdge(1)] +
                         [axis.GetBinUpEdge(b) for b in bin_numbers],
                         dtype=float)
        cdf_y = np.array([0.0] + [integral[b] for b in bin_numbers],
                         dtype=float)

        cdf_x_gpu = cl.array.to_device(queue, cdf_x.astype(np.float32))
        cdf_y_gpu = cl.array.to_device(queue, cdf_y.astype(np.float32))
        out_gpu = cl.array.empty(queue,
                                 shape=self.nthreads_per_block,
                                 dtype=np.float32)

        out_h = rt.TH1D('out_h', '', nbins, axis.GetXmin(), axis.GetXmax())
        out_h.SetLineColor(rt.kGreen)

        chunks = chunk_iterator(reps, self.nthreads_per_block, max_blocks=1)
        for _first, nelements, _nblocks in chunks:
            self.funcs.test_sample_cdf(queue, (nelements, 1, 1), None,
                                       self.rng_states.data,
                                       np.int32(len(cdf_x_gpu)),
                                       cdf_x_gpu.data, cdf_y_gpu.data,
                                       out_gpu.data)
            samples = out_gpu.get()
            # Only the first nelements entries were written this round.
            for value in samples[:nelements]:
                out_h.Fill(value)

        prob = out_h.KolmogorovTest(hist)
        out_h.Write()
        return prob, out_h

    def test_sampling(self):
        '''Verify that the CDF-based sampler on the GPU reproduces a binned
        Gaussian distribution'''
        gaus_fn = rt.TF1('f_gaussian', 'gaus(0)', -5, 5)
        gaus_fn.SetParameters(1.0 / np.sqrt(np.pi * 2), 0.0, 1.0)
        gaussian = rt.TH1D('gaussian', '', 100, -5, 5)
        gaussian.Add(gaus_fn)
        prob, out_h = self.compare_sampling(gaussian, reps=20000)
        self.outf.cd()
        gaussian.Write("gaussian")
        out_h.Write("out_h")
        assert prob > 0.01

    def tearDown(self):
        """Close the output ROOT file opened in setUp."""
        self.outf.Close()
class GPUDaq(object): def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None): if api.is_gpu_api_cuda(): self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq, dtype=np.float32) self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels * ndaq, dtype=np.uint32) self.channel_history_gpu = ga.zeros_like( self.earliest_time_int_gpu) self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu) self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32) self.detector_gpu = gpu_detector.detector_gpu self.module = cutools.get_cu_module('daq.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.earliest_time_gpu = ga.empty(cl_queue, gpu_detector.nchannels * ndaq, dtype=np.float32) self.earliest_time_int_gpu = ga.empty(cl_queue, gpu_detector.nchannels * ndaq, dtype=np.uint32) self.channel_history_gpu = ga.zeros(cl_queue, gpu_detector.nchannels * ndaq, dtype=np.uint32) self.channel_q_int_gpu = ga.zeros(cl_queue, gpu_detector.nchannels * ndaq, dtype=np.uint32) self.channel_q_gpu = ga.zeros(cl_queue, gpu_detector.nchannels * ndaq, dtype=np.float32) self.detector_gpu = gpu_detector # struct not made in opencl mode, so we keep a copy of the class self.module = cltools.get_cl_module('daq.cl', cl_context, options=api_options, include_source_directory=True) else: raise RuntimeError("GPU API is neither CUDA nor OpenCL") self.solid_id_map_gpu = gpu_detector.solid_id_map self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu self.gpu_funcs = GPUFuncs(self.module) self.ndaq = ndaq self.stride = gpu_detector.nchannels def begin_acquire(self, nthreads_per_block=64, cl_context=None): if api.is_gpu_api_cuda(): self.gpu_funcs.reset_earliest_time_int( np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block, 1, 1), grid=(len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1)) self.channel_q_int_gpu.fill(0) 
self.channel_q_gpu.fill(0) self.channel_history_gpu.fill(0) elif api.is_gpu_api_opencl(): comqueue = cl.CommandQueue(cl_context) self.gpu_funcs.reset_earliest_time_int( comqueue, (nthreads_per_block, 1, 1), (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1), np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu.data, g_times_l=True).wait() self.channel_q_int_gpu.fill(0, queue=comqueue) self.channel_q_gpu.fill(0, queue=comqueue) self.channel_history_gpu.fill(0, queue=comqueue) cl.enqueue_barrier(comqueue) def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0, cl_context=None): if start_photon is None: start_photon = 0 if nphotons is None: nphotons = len(gpuphotons.pos) - start_photon if api.is_gpu_api_opencl(): comqueue = cl.CommandQueue(cl_context) clmaxblocks = max_blocks if self.ndaq == 1: for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, nthreads_per_block, max_blocks): if api.is_gpu_api_cuda(): self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), np.int32(start_photon + first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights, self.solid_id_map_gpu, self.detector_gpu, self.earliest_time_int_gpu, self.channel_q_int_gpu, self.channel_history_gpu, np.float32(weight), block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) elif api.is_gpu_api_opencl(): #print "daq: ",start_photon,first_photon,start_photon+first_photon,(photons_this_round/nthreads_per_block,1,1), (nthreads_per_block,1,1) self.gpu_funcs.run_daq( comqueue, (photons_this_round / nthreads_per_block, 1, 1), (nthreads_per_block, 1, 1), rng_states.data, np.uint32(0x1 << 2), np.int32(start_photon + first_photon), np.int32(photons_this_round), gpuphotons.t.data, gpuphotons.flags.data, gpuphotons.last_hit_triangles.data, gpuphotons.weights.data, self.solid_id_map_gpu.data, # -- Detector struct -- 
self.solid_id_to_channel_index_gpu.data, self.detector_gpu.time_cdf_x_gpu.data, self.detector_gpu.time_cdf_y_gpu.data, self.detector_gpu.charge_cdf_x_gpu.data, self.detector_gpu.charge_cdf_y_gpu.data, self.detector_gpu.nchannels, self.detector_gpu.time_cdf_len, self.detector_gpu.charge_cdf_len, self.detector_gpu.charge_unit, # --------------------- self.earliest_time_int_gpu.data, self.channel_q_int_gpu.data, self.channel_history_gpu.data, np.float32(weight), g_times_l=True).wait() else: for first_photon, photons_this_round, blocks in \ chunk_iterator(nphotons, 1, max_blocks): if api.is_gpu_api_cuda(): self.gpu_funcs.run_daq_many( rng_states, np.uint32(0x1 << 2), np.int32(start_photon + first_photon), np.int32(photons_this_round), gpuphotons.t, gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights, self.solid_id_map_gpu, self.detector_gpu, self.earliest_time_int_gpu, self.channel_q_int_gpu, self.channel_history_gpu, np.int32(self.ndaq), np.int32(self.stride), np.float32(weight), block=(nthreads_per_block, 1, 1), grid=(blocks, 1)) elif api.is_gpu_api_opencl(): self.gpu_funcs.run_daq_many( comqueue, (nthreads_per_block, 1, 1), (blocks, 1), np.int32(start_photon + first_photon), np.int32(photons_this_round), gpuphotons.t.data, gpuphotons.flags.data, gpuphotons.last_hit_triangles.data, gpuphotons.weights.data, self.solid_id_map_gpu, # -- Detector Struct -- self.solid_id_to_channel_index_gpu.data, self.detector_gpu.time_cdf_x_gpu.data, self.detector_gpu.time_cdf_y_gpu.data, self.detector_gpu.charge_cdf_x_gpu.data, self.detector_gpu.charge_cdf_y_gpu.data, self.detector_gpu.nchannels, self.detector_gpu.time_cdf_len, self.detector_gpu.charge_cdf_len, self.detector_gpu.charge_unit, # --------------------- self.earliest_time_int_gpu.data, self.channel_q_int_gpu.data, self.channel_history_gpu.data, np.int32(self.ndaq), np.int32(self.stride), np.float32(weight), g_times_l=True).wait() if api.is_gpu_api_cuda(): cuda.Context.get_current().synchronize() elif 
api.is_gpu_api_opencl(): cl.enqueue_barrier(comqueue) def end_acquire(self, nthreads_per_block=64, cl_context=None): if api.is_gpu_api_cuda(): self.gpu_funcs.convert_sortable_int_to_float( np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block, 1, 1), grid=(len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1)) self.gpu_funcs.convert_charge_int_to_float( self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block, 1, 1), grid=(len(self.channel_q_int_gpu) // nthreads_per_block + 1, 1)) cuda.Context.get_current().synchronize() elif api.is_gpu_api_opencl(): print cl_context, nthreads_per_block comqueue = cl.CommandQueue(cl_context) self.gpu_funcs.convert_sortable_int_to_float( comqueue, (len(self.earliest_time_int_gpu), 1, 1), (nthreads_per_block, 1, 1), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu.data, self.earliest_time_gpu.data, g_times_l=True).wait() self.gpu_funcs.convert_charge_int_to_float( comqueue, (len(self.channel_q_int_gpu), 1, 1), (nthreads_per_block, 1, 1), self.detector_gpu.nchannels, self.detector_gpu.charge_unit, self.channel_q_int_gpu.data, self.channel_q_gpu.data, g_times_l=True).wait() return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride)
class GPUDaqLAr1ND(GPUDAQHist):
    """ DAQ that stores histogram of photon hits."""
    # Class-wide defaults used when the constructor arguments are omitted.
    NTDC = None
    NS_PER_TDC = None

    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """constructor.

        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel
            if not supplied, using class variable value
          ns_per_tdc: float
            nanoseconds per time bin
            if not supplied, using class variable value
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError when ntdcs and ns_per_tdc are found to be NoneType
          RuntimeError when the GPU API is neither CUDA nor OpenCL
        """
        # Fall back to the class-wide values, then store whichever value is
        # in effect.  Previously the attributes were only assigned in the
        # fallback case, so an explicitly supplied value was never stored
        # before being read for the parent constructor.
        if ntdcs is None:
            ntdcs = GPUDaqLAr1ND.NTDC
        if ns_per_tdc is None:
            ns_per_tdc = GPUDaqLAr1ND.NS_PER_TDC
        self.ntdcs = ntdcs
        self.ns_per_tdc = ns_per_tdc
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)

        if self.ntdcs is None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc is None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        # The kernel source sits next to this file.
        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")
        self.gpu_funcs = GPUFuncs(self.module)

    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        """Run the ``run_daq`` kernel over a range of photons.

        Args:
          gpuphotons: photon arrays on the device
          rng_states: device random number states
          nthreads_per_block: int
          max_blocks: int
          start_photon: int, first photon to process (default 0)
          nphotons: int, number of photons to process (default: all
            photons from ``start_photon`` on)
          weight: float, passed to the kernel
          cl_context: pyopencl.Context, needed under OpenCL to create the
            command queue
        Raises:
          RuntimeError when ``self.ndaq`` is not 1 (multi-DAQ not built)
        """
        # Only the single-DAQ path exists.  The old code tried to raise a
        # misspelled ``RunTimeError`` (a NameError at run time) and was
        # followed by a multi-DAQ loop that could never execute.
        if self.ndaq != 1:
            raise RuntimeError("Multi-DAQ not built")

        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)

        # We loop over all photons and bin them essentially
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq(rng_states,
                                       np.uint32(event.SURFACE_DETECT),
                                       np.int32(start_photon + first_photon),
                                       np.int32(photons_this_round),
                                       gpuphotons.t,
                                       gpuphotons.flags,
                                       gpuphotons.last_hit_triangles,
                                       gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.adc_gpu,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       np.float32(self.ns_per_tdc),
                                       np.float32(100.0),
                                       self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block, 1, 1),
                                       grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # NOTE(review): this passes the total nphotons where the
                # CUDA call passes photons_this_round -- left as found,
                # confirm against the kernel in daq_lar1nd.cl.
                self.gpu_funcs.run_daq(
                    comqueue,
                    (photons_this_round, 1, 1),
                    None,
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(nphotons),
                    gpuphotons.t.data,
                    gpuphotons.pos.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector struct --
                    self.solid_id_to_channel_index_gpu.data,
                    # ---------------------
                    self.uint_adc_gpu.data,
                    np.int32(self.nchannels),
                    np.int32(self.ntdcs),
                    np.float32(self.ns_per_tdc),
                    np.float32(100.0),
                    self.channel_history_gpu.data,
                    # -- Channel transforms --
                    self.channel_inverse_rot_gpu.data,
                    self.channel_inverse_trans_gpu.data,
                    # ------------------------
                    np.float32(weight),
                    g_times_l=False).wait()

        # if opencl, need to convert ADC from uint to float; done once,
        # after every chunk of photons has been binned
        if api.is_gpu_api_opencl():
            self.gpu_funcs.convert_adc(comqueue, (int(self.nchannels), 1, 1),
                                       None,
                                       self.uint_adc_gpu.data,
                                       self.adc_gpu.data,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       g_times_l=False).wait()

        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        """collect daq info and make GPUChannels instance.

        Args:
          nthreads_per_block: int
            (not used by the launches below)
          cl_context: pyopencl.Context
        Returns:
          GPUChannels
        """
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.zeros(self.nchannels, dtype=np.float32)
            # NOTE(review): launched as a single block of 1000 threads
            # whatever nchannels is -- confirm the kernel covers more than
            # 1000 channels.
            self.gpu_funcs.get_earliest_hit_time(np.int32(self.nchannels),
                                                 np.int32(self.ntdcs),
                                                 np.float32(self.ns_per_tdc),
                                                 self.adc_gpu,
                                                 self.channel_history_gpu,
                                                 self.earliest_time_gpu,
                                                 block=(1000, 1, 1),
                                                 grid=(1, 1))
            self.adc_gpu.get()
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.earliest_time_gpu = ga.zeros(comqueue,
                                              self.nchannels,
                                              dtype=np.float32)
            self.gpu_funcs.get_earliest_hit_time(
                comqueue, (int(self.nchannels), 1, 1), None,
                np.int32(self.nchannels), np.int32(self.ntdcs),
                np.float32(self.ns_per_tdc), self.adc_gpu.data,
                self.channel_history_gpu.data,
                self.earliest_time_gpu.data).wait()
            self.adc_gpu.get()

        return GPUChannels(self.earliest_time_gpu, self.adc_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)

    @classmethod
    def build_daq(cls, gpu_geometry, cl_context=None, cl_queue=None):
        """factory method.

        will be called by chroma.Simulation to build DAQ instance.

        Returns:
          instance of the class the method is called on
          (GPUDaqLAr1ND unless called on a subclass)
        """
        return cls(gpu_geometry, cl_context=cl_context, cl_queue=cl_queue)
def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize): module = get_module('wq_checknode.cu', options=api_options, include_source_directory=True) gpu_funcs = GPUFuncs(module) # gather variables for kernel call gpugeo = sim.gpu_geometry photon_pos = photons.pos photon_dir = photons.dir photon_current_node = photons.current_node_index photon_tested_node = ga.to_gpu( 1 * np.ones(len(photons.pos), dtype=np.uint32)) photon_last_result = ga.to_gpu( -1 * np.ones(len(photons.pos), dtype=np.int32)) nodes = gpugeo.nodes node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent) node_first_daughter = ga.to_gpu( sim.detector.node_dsar_tree.first_daughter) node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling) node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt) world_origin = gpugeo.world_origin world_scale = gpugeo.world_scale # make queue related variables queue_size = np.int32(len(photons.pos) * 2) queue_photon_index = ga.empty(queue_size, dtype=np.int32) queue_slot_flag = ga.zeros(queue_size, dtype=np.int32) queue_photon_index[0:len(photons.pos)].set( np.arange(0, len(photons.pos), dtype=np.int32)[:]) queue_photon_index[len(photons.pos):].set( -1 * np.ones(len(photons.pos), dtype=np.int32)) queue_slot_flag[0:len(photons.pos)].set( np.ones(len(photons.pos), dtype=np.int32)[:]) a = ga.zeros(1, dtype=ga.vec.uint4) b = np.array(1, dtype=np.int32) c = np.array(1, dtype=np.uint32) max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize) max_nodes_can_store -= max_nodes_can_store % 32 max_nodes_can_store = np.int32(max_nodes_can_store) loaded_node_start_index = np.int32(0) loaded_node_end_index = np.int32(1) node_front_start = ga.empty(1, dtype=np.int32) node_front_end = ga.empty(1, dtype=np.int32) max_loops = 1000 if len(gpugeo.extra_nodes) > 1: raise RuntimeError('did not plan for there to be a node split.') print photon_current_node print photon_tested_node print queue_photon_index print queue_slot_flag print 
"Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index print "Max nodes in shared: ", max_nodes_can_store print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")" print "Available local memsize: ", self.shared_mem_size print "Total number of nodes: ", len( nodes), " (", nodes.nbytes, " bytes)" print "Stored node size: ", max_nodes_can_store * a.nbytes print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize print sim.detector.bvh.layer_bounds print "PRESUB CURRENT NODES" print photon_current_node print "PRESUB TESTED NODES" print photon_tested_node print "STARTING QUEUE" print queue_photon_index start_queue = time.time() gpu_funcs.checknode(np.int32(max_loops), photon_pos, photon_dir, photon_current_node, photon_tested_node, photon_last_result, np.int32(len(nodes)), nodes, node_parent, node_first_daughter, node_sibling, node_aunt, world_origin, world_scale, queue_size, queue_photon_index, queue_slot_flag, np.int32(len(photon_pos)), max_nodes_can_store, loaded_node_start_index, loaded_node_end_index, node_front_start, node_front_end, block=(workgroupsize, 1, 1), grid=(1, 1), shared=4 * (7 * max_nodes_can_store + 3 * workgroupsize + 1)) cuda.Context.get_current().synchronize() end_queue = time.time() nactive = len(np.argwhere(queue_slot_flag.get() == 1)) print "CheckNode Queue returns. 
", end_queue - start_queue, " seconds" print "(Current node, To Test)" node_states = zip(photon_current_node.get(), photon_tested_node.get(), photon_last_result.get()) for x in xrange(0, len(node_states), 10): y = x + 10 if y > len(node_states): y = len(node_states) print x, ": ", node_states[x:y] print "LAST RESULT:" np_photon_results = photon_last_result.get() for x in xrange(0, len(np_photon_results), 10): y = x + 10 if y > len(np_photon_results): y = len(np_photon_results) print x, ": ", np_photon_results[x:y] print "PHOTON QUEUE" photon_queue = queue_photon_index.get() for x in xrange(0, len(photon_queue), 10): y = x + 10 if y > len(photon_queue): y = len(photon_queue) print x, ": ", photon_queue[x:y] print "QUEUE SLOT FLAGS: ", nactive, " threads" slot_flags = queue_slot_flag.get() for x in xrange(0, len(slot_flags), 10): y = x + 10 if y > len(slot_flags): y = len(slot_flags) print x, ": ", slot_flags[x:y] print "NODE FRONT: ", node_front_start.get( ), " to ", node_front_end.get( ), node_front_end.get() - node_front_start.get()
class GPUKernelPDF(object):
    """Per-channel kernel estimate of a hit time (and optionally charge) PDF.

    Usage is two passes over Monte Carlo:

    1. setup_moments() / accumulate_moments() collect per-channel hit counts
       and first/second moments, from which compute_bandwidth() derives one
       kernel bandwidth per channel for a given observed event.
    2. setup_kernel() / accumulate_kernel() evaluate the kernel sum for that
       event, and get_kernel_eval() returns the normalized PDF values.
    """

    def __init__(self, cl_context=None, cl_queue=None):
        """Compile the PDF kernels for the active GPU API.

        cl_context: pyopencl context handed to the OpenCL module loader
            (ignored under CUDA).
        cl_queue: accepted for interface compatibility; not used here.

        Raises RuntimeError if the active GPU API is neither CUDA nor OpenCL.
        """
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module('pdf.cu',
                                                options=cutools.cuda_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module('pdf.cl',
                                                cl_context,
                                                options=cltools.cl_options,
                                                include_source_directory=True)
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.module one line below.
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Allocate the GPU arrays used to accumulate per-channel moments.

        nchannels: int
            Number of channels; length of every accumulator array.
        trange: (float, float)
            Range of time dimension in PDF
        qrange: (float, float)
            Range of charge dimension in PDF
        time_only: bool
            If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulated
        moments used later for the bandwidth calculation.

        gpuchannels: object exposing per-channel GPU arrays ``t`` and ``q``.
        nthreads_per_block: int
            Threads per block for the kernel launch.
        """
        nchannels = len(gpuchannels.t)
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(nchannels),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(nchannels // nthreads_per_block + 1, 1))

    @staticmethod
    def _inverse_bandwidths(mom0, mom1, mom2, observed, dimensionality_factor,
                            rho=1.0):
        """Return the inverse kernel bandwidth per channel as a host array.

        mom0: per-channel hit counts (already clamped to >= 1 by the caller).
        mom1, mom2: per-channel sums of the observable and of its square.
        observed: observed value of the observable for each channel.
        dimensionality_factor: per-channel prefactor of the bandwidth.
        rho: overall bandwidth scale.

        The bandwidth is ``dimensionality_factor / density * rho`` where
        ``density`` is the normal density with the channel's mean and rms,
        evaluated at ``observed`` and capped at ``1 / rms``.  Channels whose
        bandwidth is not strictly positive (zero, or NaN from a zero rms)
        get an inverse bandwidth of 0.
        """
        mean = mom1 / mom0
        var = np.maximum(mom2 / mom0 - mean**2, 0.0)  # roundoff can go neg
        rms = var**0.5

        # Degenerate channels (rms == 0) produce inf/NaN intermediates that
        # are deliberately mapped to 0 below, so silence the warnings.
        with np.errstate(divide='ignore', invalid='ignore'):
            z = (observed - mean) / rms
            # Normal density: the exponent is -z**2 / 2 (z must be squared).
            gaussian_density = np.minimum(
                1.0 / rms,
                (1.0 / np.sqrt(2.0 * np.pi)) * np.exp(-0.5 * z**2) / rms)
            bandwidths = dimensionality_factor / gaussian_density * rho

        inv_bandwidths = np.zeros_like(bandwidths)
        positive = bandwidths > 0  # False for NaN as well
        inv_bandwidths[positive] = 1.0 / bandwidths[positive]
        return inv_bandwidths

    def compute_bandwidth(self,
                          event_hit,
                          event_time,
                          event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating.

        event_hit: ndarray
            Accepted for interface symmetry with setup_kernel(); not used.
        event_time: ndarray
            Observed hit time for each channel.
        event_charge: ndarray
            Observed charge for each channel (used only if not time_only).
        scale_factor: float
            Divides the hit counts before they enter the bandwidth formula.

        Stores the results in self.inv_time_bandwidths_gpu and
        self.inv_charge_bandwidths_gpu (all zeros when time_only).
        """
        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)  # avoid divide by zero

        d = 1 if self.time_only else 2
        dimensionality_factor = ((4.0 / (d + 2)) /
                                 (mom0 / scale_factor))**(-1.0 / (d + 4))

        # precompute inverse to speed up GPU evaluation
        inv_time_bandwidths = self._inverse_bandwidths(
            mom0, self.tmom1_gpu.get(), self.tmom2_gpu.get(), event_time,
            dimensionality_factor, rho)
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            # Same guarded computation as for time, so a degenerate channel
            # uploads 0 rather than inf/NaN.
            inv_charge_bandwidths = self._inverse_bandwidths(
                mom0, self.qmom1_gpu.get(), self.qmom2_gpu.get(),
                event_charge, dimensionality_factor, rho)
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                inv_charge_bandwidths.astype(np.float32))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Upload the observed event and allocate the GPU arrays that
        accumulate the kernel estimate of the PDF for each channel.

        Also resets self.hitcount_gpu, so setup_moments() must have been
        called first.

        event_hit: ndarray
            Hit or not-hit status for each channel in the detector.
        event_time: ndarray
            Hit time for each channel in the detector.  If channel
            not hit, the time will be ignored.
        event_charge: ndarray
            Integrated charge for each channel in the detector.
            If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit),
                                              dtype=np.float32)

    def clear_kernel(self):
        "Reset the hit counts and kernel sums to start a new evaluation."
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        """Return ``(hitcount, pdf_values, zeros)`` as host arrays.

        hitcount: per-channel hit counts accumulated so far.
        pdf_values: kernel sums divided by the hit count; the product of the
            time and charge estimates unless time_only is set.
        zeros: array of zeros shaped like pdf_values.
            NOTE(review): presumably a placeholder for an uncertainty
            estimate -- confirm against callers.
        """
        hitcount = self.hitcount_gpu.get()
        normalization = np.maximum(1, hitcount)  # avoid divide by zero

        time_pdf_values = self.time_pdf_values_gpu.get()
        time_pdf_values /= normalization

        charge_pdf_values = self.charge_pdf_values_gpu.get()
        charge_pdf_values /= normalization

        if self.time_only:
            pdf_values = time_pdf_values
        else:
            pdf_values = time_pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)