def get_context(*args, **kwargs):
    """Create a GPU context using whichever GPU API is currently active.

    All positional and keyword arguments are forwarded untouched to
    ``cutools.create_cuda_context`` (CUDA) or ``cltools.create_cl_context``
    (OpenCL).

    Returns:
        The object returned by the backend helper, or ``None`` when neither
        API is reported as active.
    """
    if gpuapi.is_gpu_api_cuda():
        return cutools.create_cuda_context(*args, **kwargs)
    if gpuapi.is_gpu_api_opencl():
        return cltools.create_cl_context(*args, **kwargs)
    # Neither backend selected: same implicit result as falling off the end.
    return None
def get_module(*args, **kwargs):
    """Load a GPU kernel module through the active GPU API.

    Arguments are forwarded unchanged to ``cutools.get_cu_module`` (CUDA) or
    ``cltools.get_cl_module`` (OpenCL).  The expected arguments are:
    name, options=None, include_source_directory=True,
    template_uncomment=None, template_fill=None.

    Returns:
        The module object returned by the backend helper, or ``None`` when
        neither API is reported as active.
    """
    if gpuapi.is_gpu_api_cuda():
        return cutools.get_cu_module(*args, **kwargs)
    if gpuapi.is_gpu_api_opencl():
        return cltools.get_cl_module(*args, **kwargs)
    # Neither backend selected: same implicit result as falling off the end.
    return None
def define_texture_references(self, module=None):
    """Look up and configure the unbound texture references used by propagate.

    Args:
        module: compiled GPU module to query; defaults to ``self.module``.

    Under CUDA this stores four texture references on ``self`` (nodes, extra
    nodes, vertices, triangles), each configured with four components, and
    resets ``self.node_texture_ref_bound`` to ``False``.  Under OpenCL
    nothing is done because textures are not used at the moment.
    """
    # Identity test: a module object may define its own ``__eq__``.
    if module is None:
        module = self.module

    if api.is_gpu_api_cuda():
        uint32_format = cuda.array_format.UNSIGNED_INT32
        float_format = cuda.array_format.FLOAT
        # (attribute on self, texture reference name in the module, format)
        texture_specs = (
            ("node_texture_ref", "nodevec_tex_ref", uint32_format),
            ("extra_node_texture_ref", "extra_node_tex_ref", uint32_format),
            ("vertices_texture_ref", "verticesvec_tex_ref", float_format),
            ("triangles_texture_ref", "trianglesvec_tex_ref", uint32_format),
        )
        for attribute, texref_name, array_format in texture_specs:
            texref = module.get_texref(texref_name)
            # Store first, then configure: same order as the explicit form.
            setattr(self, attribute, texref)
            texref.set_format(array_format, 4)
        self.node_texture_ref_bound = False
    elif api.is_gpu_api_opencl():
        # texture usage not used at the moment
        pass
def launchOnce(self, photons, sim, workgroupsize=32):
    """Run one kernel launch for the photons that requested this work item.

    Args:
        photons: host-side photon container; ``requested_workcode`` and
            ``current_node_index`` are read from it.
        sim: simulation object; ``sim.detector.bvh`` is read from it.
        workgroupsize: work-group size; must be a multiple of 32 and must
            not exceed ``self.work_item_sizes``.

    Returns:
        None.  Returns early, without launching, when no photon has
        ``requested_workcode == self.workid``.

    Raises:
        ValueError: if ``workgroupsize`` fails either check above.
    """
    # command queue
    if api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue( self.context )
    if workgroupsize%32!=0:
        raise ValueError('work group size must be multiple value of 32')
    # NOTE(review): the message says "smaller than" but equality is accepted.
    if workgroupsize>self.work_item_sizes:
        raise ValueError('work group size must be smaller than %d'%(self.work_item_sizes))
    # photons is instance of GPUPhotons class. contains photon info on the host side
    bvh = sim.detector.bvh
    # get the photons we need to work on
    ourphotons = np.argwhere( photons.requested_workcode==self.workid ) # get index of photons that require us to work on them
    if len(ourphotons)==0:
        return
    # get information on what they need:
    # node list
    max_shared_nodes = self.shared_mem_size/((4+7)*4) # 4 is size of uint32, each node has 4 of them, plus daugher, sibling, aunt
    # NOTE(review): original indentation was lost; it is assumed that only
    # the ``node_chunks`` assignment belongs to this branch. ``node_chunks``
    # is not read again in this function.
    if bvh.nodes.nbytes<self.shared_mem_size:
        # lucky us, we can push the entire node list onto the device (though rarely will be the case)
        node_chunks = [0,len(bvh.nodes)-1]
    nodes = np.take( photons.current_node_index, ourphotons.ravel() ) # node indices
    # planning goals. forget about shared memory for now
    # pack in as many nodes to shared memory as possible.
    # try to take current layer, daughter layer, parent layer in that order
    # prep for kernel call
    if api.is_gpu_api_cuda():
        self._call_cuda_kernel( sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize )
    elif api.is_gpu_api_opencl():
        # comqueue exists here because it was created under the same API check above
        self._call_opencl_kernel( sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize, comqueue )
def begin_acquire(self, nthreads_per_block=64, cl_context=None):
    """Reset the per-channel accumulators before an acquisition.

    Args:
        nthreads_per_block: threads per block / work-group for the reset kernel.
        cl_context: OpenCL context used to create the command queue
            (only read in OpenCL mode).

    Runs ``reset_earliest_time_int`` with the value 1e9 over
    ``self.earliest_time_int_gpu`` and zero-fills the charge and history
    arrays.
    """
    threads = (nthreads_per_block, 1, 1)

    if api.is_gpu_api_cuda():
        nslots = len(self.earliest_time_int_gpu)
        self.gpu_funcs.reset_earliest_time_int(
            np.float32(1e9),
            np.int32(nslots),
            self.earliest_time_int_gpu,
            block=threads,
            grid=(nslots // nthreads_per_block + 1, 1))
        for accumulator in (self.channel_q_int_gpu,
                            self.channel_q_gpu,
                            self.channel_history_gpu):
            accumulator.fill(0)
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        nslots = len(self.earliest_time_int_gpu)
        reset_event = self.gpu_funcs.reset_earliest_time_int(
            comqueue,
            threads,
            (nslots // nthreads_per_block + 1, 1),
            np.float32(1e9),
            np.int32(nslots),
            self.earliest_time_int_gpu.data,
            g_times_l=True)
        reset_event.wait()
        for accumulator in (self.channel_q_int_gpu,
                            self.channel_q_gpu,
                            self.channel_history_gpu):
            accumulator.fill(0, queue=comqueue)
        cl.enqueue_barrier(comqueue)
def __init__(self, gpu_detector, ntdcs=None, ns_per_tdc=None, adc_bits=None, ndaq=1, cl_context=None, cl_queue=None):
    """constructor.

    Args:
      gpu_detector: GPUDetector
    Keywords:
      ntdcs: int
        number of time bins per channel
        if not supplied, using class variable value
      ns_per_tdc: float
        nanoseconds per time bin
        if not supplied, using class variable value
      adc_bits:  int
        number of ADC bits (not used yet)
      ndaq: int
        number of daqs
      cl_context: pyopencl.Context
      cl_queue: pyopencl.CommandQueue
    Raises:
      ValueError when ntdcs and ns_per_tdc are found to be NoneType
      RuntimeError when the GPU API is neither CUDA nor OpenCL
    """
    # Always bind both attributes. Previously they were only assigned when
    # the argument was None, so an explicitly supplied value never reached
    # the base-class constructor below.
    self.ntdcs = GPUDaqLAr1ND.NTDC if ntdcs is None else ntdcs
    self.ns_per_tdc = GPUDaqLAr1ND.NS_PER_TDC if ns_per_tdc is None else ns_per_tdc

    super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                       ntdcs=self.ntdcs,
                                       ns_per_tdc=self.ns_per_tdc,
                                       adc_bits=adc_bits,
                                       ndaq=ndaq,
                                       cl_context=cl_context,
                                       cl_queue=cl_queue)

    if self.ntdcs is None:
        raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
    if self.ns_per_tdc is None:
        raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

    # The kernel source sits next to this file; only the extension differs
    # between the two APIs.
    kernel_filepath = os.path.dirname(
        os.path.realpath(__file__)) + "/daq_lar1nd"
    if api.is_gpu_api_cuda():
        self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")
    self.gpu_funcs = GPUFuncs(self.module)
def load_bvh(geometry, bvh_name="default", auto_build_bvh=True, read_bvh_cache=True, target_degree=3, update_bvh_cache=True, cache_dir=None, bvh_method='grid', cuda_device=None, cl_device=None):
    """Return the BVH for ``geometry``, loading it from cache or building it.

    Args:
        geometry: object exposing ``mesh`` (with ``md5()``).
        bvh_name: name under which the BVH is cached.
        auto_build_bvh: build a new BVH when none is read from the cache.
        read_bvh_cache: try the cache before building.
        target_degree: branching degree handed to the BVH builder.
        update_bvh_cache: store a newly built BVH in the cache.
        cache_dir: cache directory; ``None`` uses the ``Cache`` default.
        bvh_method: ``'grid'`` or ``'simple'``.
        cuda_device: device handed to ``cutools.create_cuda_context``.
        cl_device: device handed to ``cltools.create_cl_context``.

    Returns:
        The loaded or newly built BVH, or ``None`` when nothing was cached
        and ``auto_build_bvh`` is false.

    Raises:
        ValueError: ``bvh_method`` is not a supported method.
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    if cache_dir is None:
        cache = Cache()
    else:
        cache = Cache(cache_dir)

    mesh_hash = geometry.mesh.md5()
    bvh = None
    if read_bvh_cache and cache.exist_bvh(mesh_hash, bvh_name):
        logger.debug('Loading BVH "%s" for geometry from cache.' % bvh_name)
        bvh = cache.load_bvh(mesh_hash, bvh_name)
    elif auto_build_bvh:
        # Reject a bad method before any GPU context is created, so that
        # nothing needs to be cleaned up on this error path.
        if bvh_method not in ('grid', 'simple'):
            raise ValueError(
                'Requestd BVH construction method invalid: %s' % (bvh_method,))

        logger.info('Building new BVH using recursive grid algorithm.')
        start = time.time()

        # creates quick context to make BVH
        if api.is_gpu_api_cuda():
            context = cutools.create_cuda_context(cuda_device)
            release_context = context.pop
        elif api.is_gpu_api_opencl():
            context = cltools.create_cl_context(cl_device)

            def release_context():
                cltools.close_cl_context(context)
        else:
            # Without a backend no BVH can be built; previously ``None``
            # would have been written to the cache.
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        try:
            # Both APIs now honour ``target_degree``; the CUDA path used to
            # hard-code 3 (which is also the default, so default calls are
            # unchanged).
            if bvh_method == 'grid':
                bvh = make_recursive_grid_bvh(geometry.mesh,
                                              target_degree=target_degree)
            else:
                bvh = make_simple_bvh(geometry.mesh, target_degree)
        finally:
            # Release the temporary context even if construction fails.
            release_context()

        logger.info('BVH generated in %1.1f seconds.' % (time.time() - start))

        if update_bvh_cache:
            logger.info('Saving BVH (%s:%s) to cache.' % (mesh_hash, bvh_name))
            cache.save_bvh(bvh, mesh_hash, bvh_name)

    return bvh
def to_uint3(arr):
    """Returns a vec.uint3 array from an (N,3) array.

    Under CUDA the array is made C-contiguous if necessary and viewed as
    ``ga.vec.uint3``.  Under OpenCL a zero column is appended first, because
    the pyopencl vec types have a padding column.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    if gpuapi.is_gpu_api_cuda():
        if not arr.flags['C_CONTIGUOUS']:
            arr = np.asarray(arr, order='c')
        return arr.astype(np.uint32).view(ga.vec.uint3)[:, 0]
    elif gpuapi.is_gpu_api_opencl():
        n = len(arr)
        pad = np.zeros((n, 1), dtype=arr.dtype)
        arr_wpad = np.hstack((arr, pad))
        return arr_wpad.astype(np.uint32).view(ga.vec.uint3)[:, 0]
    else:
        # Fail loudly instead of returning None, as to_float3 does.
        raise RuntimeError('API is neither CUDA nor OpenCL')
def __init__(self, cl_context=None):
    """Load the PDF kernel module for the active GPU API.

    Args:
        cl_context: OpenCL context handed to ``cltools.get_cl_module``
            (only read in OpenCL mode).

    Sets ``self.module`` and wraps it as ``self.gpu_funcs``.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    if api.is_gpu_api_cuda():
        self.module = cutools.get_cu_module('pdf.cu',
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = cltools.get_cl_module('pdf.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        # Without this branch the failure surfaced later as an
        # AttributeError on the missing ``self.module``.
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")
    self.gpu_funcs = GPUFuncs(self.module)
def __getattr__(self, name):
    """Return the GPU function called ``name``, caching it on first use.

    A cached entry in ``self.funcs`` is returned directly.  Otherwise the
    function is fetched from ``self.module`` (``get_function`` under CUDA,
    ``__getattr__`` under OpenCL), stored in ``self.funcs`` and returned.
    Returns ``None`` without caching when neither API is active.
    """
    try:
        return self.funcs[name]
    except KeyError:
        # not cached yet: find and store the function on demand below
        pass

    if gpuapi.is_gpu_api_cuda():
        func = self.module.get_function(name)
    elif gpuapi.is_gpu_api_opencl():
        func = self.module.__getattr__(name)
    else:
        return None

    self.funcs[name] = func
    return func
def collapse_chains(nodes, layer_bounds):
    """Run the ``collapse_child`` kernel over the layers, deepest first.

    Args:
        nodes: host array of nodes; it is uploaded to the device.
        layer_bounds: sequence of layer boundaries; consecutive entries
            delimit one layer.

    The last layer is skipped, and the remaining layers are processed in
    reverse order.

    Returns:
        The node array copied back from the device.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    use_cuda = gpuapi.is_gpu_api_cuda()
    queue = None
    if use_cuda:
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA or OpenCL')
    bvh_funcs = GPUFuncs(bvh_module)

    if use_cuda:
        gpu_nodes = ga.to_gpu(nodes)
    else:
        gpu_nodes = ga.to_device(queue, nodes)

    # list() keeps this working where zip() returns an iterator, which
    # cannot be sliced or reversed in place.
    bounds = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    bounds.reverse()

    nthreads_per_block = 256
    for start, end in bounds:
        if use_cuda:
            bvh_funcs.collapse_child(np.uint32(start),
                                     np.uint32(end),
                                     gpu_nodes,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(120, 1))
        else:
            bvh_funcs.collapse_child(queue, (end - start, 1, 1), None,
                                     np.uint32(start), np.uint32(end),
                                     gpu_nodes.data).wait()
    return gpu_nodes.get()
def __call__(self, tag, description):
    """Record GPU memory information under ``tag``.

    With ``tag`` set to ``None`` only the total device memory is stored under
    ``'gpu_total'``.  Otherwise ``description`` is stored under the tag and
    the used memory under ``'<tag>_gpu_used'``.  OpenCL gives no free-memory
    figure, so free is taken to equal total and the used value is zero.
    """
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        devices = cltools.get_last_context().get_info(cl.context_info.DEVICES)
        gpu_total = devices[0].get_info(cl.device_info.GLOBAL_MEM_SIZE)
        # free memory info not available to opencl...
        gpu_free = gpu_total

    if tag is None:
        self['gpu_total'] = gpu_total
        return

    self['%s' % tag] = description
    self['%s_gpu_used' % tag] = gpu_total - gpu_free
def to_float3(arr):
    """Returns a vec.float3 array from an (N,3) array.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    if gpuapi.is_gpu_api_cuda():
        contiguous = arr if arr.flags['C_CONTIGUOUS'] else np.asarray(arr, order='c')
        as_float = contiguous.astype(np.float32)
        return as_float.view(ga.vec.float3)[:, 0]

    if gpuapi.is_gpu_api_opencl():
        # in the pyopencl implementation, the vec types have a padding column
        # so one extra column is appended before viewing
        padding = np.zeros((len(arr), 1), dtype=arr.dtype)
        padded = np.hstack((arr, padding))
        return padded.astype(np.float32).view(ga.vec.float3)[:, 0]

    raise RuntimeError('API is neither CUDA nor OpenCL')
def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
    """Allocate the per-channel device arrays and load the DAQ kernels.

    Args:
        gpu_detector: detector object; ``nchannels``, ``solid_id_map`` and
            ``solid_id_to_channel_index_gpu`` are read from it (plus
            ``detector_gpu`` in CUDA mode).
        ndaq: number of DAQ copies; every array holds ``nchannels * ndaq``
            entries.
        cl_context: OpenCL context for loading ``daq.cl`` (OpenCL only).
        cl_queue: OpenCL queue for the allocations (OpenCL only).

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    if api.is_gpu_api_cuda():
        nslots = gpu_detector.nchannels * ndaq
        self.earliest_time_gpu = ga.empty(nslots, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(nslots, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                      dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.module = cutools.get_cu_module('daq.cu',
                                            options=api_options,
                                            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        nslots = gpu_detector.nchannels * ndaq
        self.earliest_time_gpu = ga.empty(cl_queue, nslots, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(cl_queue, nslots, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros(cl_queue, nslots, dtype=np.uint32)
        self.channel_q_int_gpu = ga.zeros(cl_queue, nslots, dtype=np.uint32)
        self.channel_q_gpu = ga.zeros(cl_queue, nslots, dtype=np.float32)
        # struct not made in opencl mode, so we keep a copy of the class
        self.detector_gpu = gpu_detector
        self.module = cltools.get_cl_module('daq.cl',
                                            cl_context,
                                            options=api_options,
                                            include_source_directory=True)
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")

    self.solid_id_map_gpu = gpu_detector.solid_id_map
    self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
    self.gpu_funcs = GPUFuncs(self.module)
    self.ndaq = ndaq
    self.stride = gpu_detector.nchannels
def device_usage_str(self, cl_context=None):
    '''Returns a formatted string displaying the memory usage.

    Args:
        cl_context: OpenCL context whose first device is queried for its
            global memory size (only read in OpenCL mode).

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    '''
    separator = '-' * 10
    lines = [
        'device usage:',
        separator,
        format_array('nodes', self.nodes),
        '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)),
        separator,
    ]

    if api.is_gpu_api_cuda():
        free, total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        # The predicate must be *called*; the bare function object is always
        # truthy, which made this branch unconditional.
        total = cl_context.get_info(cl.context_info.DEVICES)[0].get_info(
            cl.device_info.GLOBAL_MEM_SIZE)
        # No free-memory query is made here; the value is derived from the
        # bookkeeping entry in self.metadata.
        free = total - self.metadata['d_gpu_used']
    else:
        raise RuntimeError("GPU API is neither CUDA nor OpenCL")

    lines.append('%-15s %6s %6s' % ('device total', '', format_size(total)))
    lines.append('%-15s %6s %6s' % ('device used', '', format_size(total - free)))
    lines.append('%-15s %6s %6s' % ('device free', '', format_size(free)))
    # Every line, including the last, ends with a newline.
    return '\n'.join(lines) + '\n'
def get(self):
    """Copy the photon arrays back to the host as ``chroma.event.Photons``.

    Positions, directions and polarizations are returned as
    ``(self.nphotons, 3)`` float32 arrays.  Times are shifted by -100.0, as
    in the original code.
    """
    if api.is_gpu_api_cuda():
        pos = self.pos.get().ravel().view(np.float32).reshape(self.nphotons, 3)
        pdir = self.dir.get().ravel().view(np.float32).reshape(self.nphotons, 3)
        pol = self.pol.get().ravel().view(np.float32).reshape(self.nphotons, 3)
    elif api.is_gpu_api_opencl():
        nphotons = self.nphotons

        def strip_padding(padded):
            # need to remove the padding from vectors: keep only the first
            # three components of every element.  One whole-array copy per
            # component replaces the per-photon Python loop.
            out = np.zeros(shape=(nphotons, 3), dtype=np.float32)
            fields = padded.dtype.names
            if fields is None:
                # plain 2-D array: the components are columns
                out[:, :] = padded[:nphotons, :3]
            else:
                # structured vector dtype: the components are its leading fields
                for axis, field in enumerate(fields[:3]):
                    out[:, axis] = padded[field][:nphotons]
            return out

        pos = strip_padding(self.pos.get())
        pdir = strip_padding(self.dir.get())
        pol = strip_padding(self.pol.get())

    t = self.t.get().view(np.float32) - 100.0
    wavelengths = self.wavelengths.get().view(np.float32)
    return chroma.event.Photons(pos=pos, dir=pdir, pol=pol, t=t,
                                wavelengths=wavelengths)
def end_acquire(self, nthreads_per_block=64, cl_context=None):
    """Convert the integer accumulators to floats and return the channels.

    Args:
        nthreads_per_block: threads per block / work-group for both kernels.
        cl_context: OpenCL context used to create the command queue
            (only read in OpenCL mode).

    Returns:
        GPUChannels built from the earliest-time, charge and history arrays.
    """
    threads = (nthreads_per_block, 1, 1)

    if api.is_gpu_api_cuda():
        ntimes = len(self.earliest_time_int_gpu)
        self.gpu_funcs.convert_sortable_int_to_float(
            np.int32(ntimes),
            self.earliest_time_int_gpu,
            self.earliest_time_gpu,
            block=threads,
            grid=(ntimes // nthreads_per_block + 1, 1))
        ncharges = len(self.channel_q_int_gpu)
        self.gpu_funcs.convert_charge_int_to_float(
            self.detector_gpu,
            self.channel_q_int_gpu,
            self.channel_q_gpu,
            block=threads,
            grid=(ncharges // nthreads_per_block + 1, 1))
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        # same output as printing the two values separated by a space
        print("%s %s" % (cl_context, nthreads_per_block))
        comqueue = cl.CommandQueue(cl_context)
        ntimes = len(self.earliest_time_int_gpu)
        time_event = self.gpu_funcs.convert_sortable_int_to_float(
            comqueue, (ntimes, 1, 1), threads,
            np.int32(ntimes),
            self.earliest_time_int_gpu.data,
            self.earliest_time_gpu.data,
            g_times_l=True)
        time_event.wait()
        charge_event = self.gpu_funcs.convert_charge_int_to_float(
            comqueue, (len(self.channel_q_int_gpu), 1, 1), threads,
            self.detector_gpu.nchannels,
            self.detector_gpu.charge_unit,
            self.channel_q_int_gpu.data,
            self.channel_q_gpu.data,
            g_times_l=True)
        charge_event.wait()

    return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                       self.channel_history_gpu, self.ndaq, self.stride)
def end_acquire(self, nthreads_per_block=64, cl_context=None):
    """collect daq info and make GPUChannels instance.

    Args:
      nthreads_per_block: int
        accepted for interface compatibility; not read by this method
      cl_context: pyopencl.Context
        used to create the command queue (OpenCL mode only)
    Returns:
      GPUChannels
    """
    if api.is_gpu_api_cuda():
        self.earliest_time_gpu = ga.zeros(self.nchannels, dtype=np.float32)
        # NOTE(review): the launch is fixed at one block of 1000 threads and
        # does not scale with self.nchannels or nthreads_per_block (a block
        # count used to be computed here but was never passed on) -- confirm
        # against the kernel that this covers every channel.
        self.gpu_funcs.get_earliest_hit_time(np.int32(self.nchannels),
                                             np.int32(self.ntdcs),
                                             np.float32(self.ns_per_tdc),
                                             self.adc_gpu,
                                             self.channel_history_gpu,
                                             self.earliest_time_gpu,
                                             block=(1000, 1, 1),
                                             grid=(1, 1))
        # result is discarded; kept because the original performs this copy
        self.adc_gpu.get()
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        self.earliest_time_gpu = ga.zeros(comqueue,
                                          self.nchannels,
                                          dtype=np.float32)
        self.gpu_funcs.get_earliest_hit_time(
            comqueue, (int(self.nchannels), 1, 1), None,
            np.int32(self.nchannels), np.int32(self.ntdcs),
            np.float32(self.ns_per_tdc), self.adc_gpu.data,
            self.channel_history_gpu.data,
            self.earliest_time_gpu.data).wait()
        # result is discarded; kept because the original performs this copy
        self.adc_gpu.get()

    return GPUChannels(self.earliest_time_gpu, self.adc_gpu,
                       self.channel_history_gpu, self.ndaq, self.stride)
def concatenate_layers(layers):
    """Concatenate per-layer node arrays into one array on the GPU.

    Args:
        layers: sequence of node arrays, one per layer.

    For every layer except the last, the ``copy_and_offset`` kernel is given
    the layer's end position as the child offset; the last layer (leaf
    nodes) gets offset 0.

    Returns:
        (nodes, layer_bounds): the concatenated node array copied back to
        the host, and the array of layer boundaries starting with 0.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    use_cuda = gpuapi.is_gpu_api_cuda()
    queue = None

    # Load GPU functions and pick the launch limits for the active API
    if use_cuda:
        nthreads_per_block = 1024
        nmax_blocks = 10000
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        nthreads_per_block = 256
        nmax_blocks = 1
        # don't like the last context method. trouble. trouble.
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list.  A real list is passed to cumsum so this
    # also works where map() returns an iterator.
    layer_bounds = np.insert(np.cumsum([len(layer) for layer in layers]), 0, 0)

    # allocate memory
    if use_cuda:
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    else:
        # diagnostic output kept identical to the previous print statements
        print(layer_bounds[-1])
        totsize = 0
        for n, layer in enumerate(layers):
            print("LAYER  %s  size= %s start= %s" % (n, len(layer), totsize))
            totsize += len(layer)
        print("totsize:  %s" % (totsize,))
        nodes = ga.to_device(queue, np.empty(totsize, dtype=ga.vec.uint4))

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        # OpenCL only: upload the layer once, on first use, and reuse the
        # device copy for all of its chunks instead of re-sending the whole
        # layer for every chunk.
        layer_gpu = None
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end - layer_start, nthreads_per_block,
                               max_blocks=nmax_blocks):
            if use_cuda:
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            else:
                if layer_gpu is None:
                    layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes.data,
                                          g_times_l=True).wait()

    return nodes.get(), layer_bounds
def merge_nodes(nodes, degree, max_ratio=None):
    """Build one layer of parent nodes, ``degree`` children per parent.

    Args:
        nodes: host array of child nodes.
        degree: number of children grouped under each parent.
        max_ratio: when not ``None``, enables the post-processing pass that
            replaces poorly fitting parents by their children.
            NOTE(review): only its None-ness is tested; the threshold
            actually used below is the literal 0.3.

    Returns:
        Host array of parent nodes: either the kernel output, or the
        post-processed set when that set is shorter than ``nodes``.

    Raises:
        RuntimeError: the GPU API is neither CUDA nor OpenCL.
    """
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', context, options=api_options, include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    # NOTE(review): relies on "/" being integer division for int operands.
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1) #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(degree), gpu_parent_nodes, cuda.In(nodes), np.uint32(0), np.uint32(len(nodes)), block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None, np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(degree), gpu_parent_nodes.data, gpu_nodes.data, np.uint32(0), np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        # Flag every parent whose children's summed area is less than 0.3 of
        # the parent's own area.
        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            # The 'w' field holds the child count in the bits above
            # CHILD_BITS and the first-child index in the remaining bits.
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        # Each flagged parent is replaced by up to ``degree`` children, so it
        # needs at most ``degree - 1`` additional slots.
        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots, dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        # ``offset`` tracks how far later entries have been shifted right by
        # the insertions made so far.
        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            # Put the first child in the flagged parent's slot ...
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            # ... and add len(nodes) to the child index it carries.
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index + 1:-nchild + 1]
            offset += nchild - 1
            # Copy the remaining children into the freed slots.
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1]
                # Entries whose 'x' field is zero are left without the
                # index shift.
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))
                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16 bits.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
      ``nthreads_per_block``: int
        Threads per block (CUDA) / local work size (OpenCL) for the
        ``make_leaves`` kernel.
      ``max_blocks``: int
        Block limit handed to ``chunk_iterator`` in OpenCL mode only; the
        CUDA path uses a fixed limit of 30000.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    # NOTE(review): there is no else branch; with neither API active
    # ``bvh_module`` is unbound on the GPUFuncs line below.
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    # The scale maps the largest extent of the mesh onto 2**16 - 2 steps.
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max( (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np, world_scale=world_scale)

    # Put triangles and vertices into host and device memory
    # unfortunately, opencl and cuda has different methods for managing memory here
    # we have to write divergent code
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3, write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3, write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index), np.uint32(elements_this_iter), cutools.Mapped(triangles), cutools.Mapped(vertices), world_origin, world_scale, nodes, morton_codes, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1))

        # Reduce the codes by the number of bits not requested.
        morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue, shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print first_index, elements_this_iter, nblocks_this_iter
            # g_times_l=True: the global size is given in work-groups and is
            # multiplied by the local size.
            bvh_funcs.make_leaves( queue, (nblocks_this_iter, 1, 1), (nthreads_per_block, 1, 1),
                                   #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                                   np.uint32(first_index), np.uint32(elements_this_iter), triangles_dev.data, vertices_dev.data, world_origin, world_scale, nodes.data, morton_codes.data, g_times_l=True).wait()

        # Reduce the codes by the number of bits not requested.
        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merge ``nodes`` into ``len(first_child)`` parent nodes.

    ``first_child`` gives, for each parent, the index of its first child and
    ``nchild`` gives how many children it has.  Both are sent to the device
    as int32.

    Returns the parent node array copied back from the device.
    Raises RuntimeError when the GPU API is neither CUDA nor OpenCL.
    '''
    block_size = 256

    cl_ctx = None
    cl_queue = None
    if gpuapi.is_gpu_api_opencl():
        cl_ctx = cltools.get_last_context()
        cl_queue = cl.CommandQueue(cl_ctx)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        module = get_module('bvh.cu',
                            options=api_options,
                            include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        module = get_module('bvh.cl',
                            cl_ctx,
                            options=api_options,
                            include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    funcs = GPUFuncs(module)

    # Load memory
    if gpuapi.is_gpu_api_cuda():
        dev_nodes = ga.to_gpu(nodes)
        dev_first_child = ga.to_gpu(first_child.astype(np.int32))
        dev_nchild = ga.to_gpu(nchild.astype(np.int32))
        nparent = len(first_child)
        dev_parents = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        dev_nodes = ga.to_device(cl_queue, nodes)
        dev_first_child = ga.to_device(cl_queue, first_child.astype(np.int32))
        dev_nchild = ga.to_device(cl_queue, nchild.astype(np.int32))
        nparent = len(first_child)
        dev_parents = ga.to_device(
            cl_queue, np.zeros(shape=nparent, dtype=ga.vec.uint4))
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run kernel, one chunk of parents at a time
    for chunk_start, chunk_len, chunk_blocks in \
            chunk_iterator(nparent, block_size, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            funcs.make_parents_detailed(np.uint32(chunk_start),
                                        np.uint32(chunk_len),
                                        dev_nodes,
                                        dev_parents,
                                        dev_first_child,
                                        dev_nchild,
                                        block=(block_size, 1, 1),
                                        grid=(chunk_blocks, 1))
        elif gpuapi.is_gpu_api_opencl():
            event = funcs.make_parents_detailed(cl_queue,
                                                (chunk_len, 1, 1),
                                                None,
                                                np.uint32(chunk_start),
                                                np.uint32(chunk_len),
                                                dev_nodes.data,
                                                dev_parents.data,
                                                dev_first_child.data,
                                                dev_nchild.data)
            event.wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return dev_parents.get()
import numpy as np import chroma.api as gpuapi from chroma.gpu.tools import get_module, api_options, \ chunk_iterator, to_uint3, to_float3, copy_to_uint3, copy_to_float3 from chroma.gpu.gpufuncs import GPUFuncs if gpuapi.is_gpu_api_cuda(): import pycuda.driver as cuda from pycuda import gpuarray as ga from pycuda import characterize #from chroma.gpu.cutools import Mapped import chroma.gpu.cutools as cutools elif gpuapi.is_gpu_api_opencl(): import pyopencl as cl import pyopencl.array as ga import chroma.gpu.cltools as cltools from chroma.bvh.bvh import WorldCoords, node_areas, NCHILD_MASK, CHILD_BITS def round_up_to_multiple(x, multiple): remainder = x % multiple if remainder == 0: return x else: return x + multiple - remainder def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32,
import chroma.api as api

# Print the selected GPU API, then whether it is OpenCL and whether it is
# CUDA, one value per line.
for query in (api.get_gpu_api, api.is_gpu_api_opencl, api.is_gpu_api_cuda):
    print(query())
def acquire(self, gpuphotons, rng_states, nthreads_per_block=64,
            max_blocks=1024, start_photon=None, nphotons=None, weight=1.0,
            cl_context=None):
    """Run the DAQ kernel(s) over a range of photons.

    Args:
        gpuphotons: GPU photon container; its ``t``, ``flags``,
            ``last_hit_triangles`` and ``weights`` device arrays are read,
            and ``len(gpuphotons.pos)`` supplies the default photon count.
        rng_states: device array of random-number-generator states.
        nthreads_per_block: threads (work-items) per block (work-group).
        max_blocks: maximum number of blocks per kernel launch.
        start_photon: index of the first photon to process (default 0).
        nphotons: number of photons to process (default: all photons from
            ``start_photon`` onward).
        weight: float passed through to the kernel.
        cl_context: OpenCL context used to create the command queue;
            only used when the OpenCL API is active.

    ``run_daq`` is launched when ``self.ndaq == 1``; otherwise
    ``run_daq_many`` is launched with one block per photon.
    """
    if start_photon is None:
        start_photon = 0
    if nphotons is None:
        nphotons = len(gpuphotons.pos) - start_photon

    if api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)

    if self.ndaq == 1:
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq(rng_states,
                                       np.uint32(0x1 << 2),
                                       np.int32(start_photon + first_photon),
                                       np.int32(photons_this_round),
                                       gpuphotons.t,
                                       gpuphotons.flags,
                                       gpuphotons.last_hit_triangles,
                                       gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.earliest_time_int_gpu,
                                       self.channel_q_int_gpu,
                                       self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block, 1, 1),
                                       grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # With g_times_l=True the first tuple counts work-groups.
                # Use `blocks` (same geometry as the CUDA launch) rather than
                # photons_this_round / nthreads_per_block, which floors and
                # would skip the photons of a final, partially filled block.
                self.gpu_funcs.run_daq(
                    comqueue, (blocks, 1, 1),
                    (nthreads_per_block, 1, 1),
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector struct --
                    self.solid_id_to_channel_index_gpu.data,
                    self.detector_gpu.time_cdf_x_gpu.data,
                    self.detector_gpu.time_cdf_y_gpu.data,
                    self.detector_gpu.charge_cdf_x_gpu.data,
                    self.detector_gpu.charge_cdf_y_gpu.data,
                    self.detector_gpu.nchannels,
                    self.detector_gpu.time_cdf_len,
                    self.detector_gpu.charge_cdf_len,
                    self.detector_gpu.charge_unit,
                    # ---------------------
                    self.earliest_time_int_gpu.data,
                    self.channel_q_int_gpu.data,
                    self.channel_history_gpu.data,
                    np.float32(weight),
                    g_times_l=True).wait()
    else:
        # One photon per block: chunk with a block size of 1 so that
        # `blocks` equals the number of photons in this round.
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, 1, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq_many(
                    rng_states,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t,
                    gpuphotons.flags,
                    gpuphotons.last_hit_triangles,
                    gpuphotons.weights,
                    self.solid_id_map_gpu,
                    self.detector_gpu,
                    self.earliest_time_int_gpu,
                    self.channel_q_int_gpu,
                    self.channel_history_gpu,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # Launch geometry mirrors the CUDA call: `blocks` work-groups
                # of `nthreads_per_block` work-items (the two tuples were
                # previously swapped and had different dimensionality).
                # NOTE(review): rng_states and the hit mask are passed here
                # to match both the CUDA run_daq_many call and the OpenCL
                # run_daq call above -- confirm against the kernel signature
                # in the OpenCL source.
                self.gpu_funcs.run_daq_many(
                    comqueue, (blocks, 1, 1),
                    (nthreads_per_block, 1, 1),
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    gpuphotons.t.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector Struct --
                    self.solid_id_to_channel_index_gpu.data,
                    self.detector_gpu.time_cdf_x_gpu.data,
                    self.detector_gpu.time_cdf_y_gpu.data,
                    self.detector_gpu.charge_cdf_x_gpu.data,
                    self.detector_gpu.charge_cdf_y_gpu.data,
                    self.detector_gpu.nchannels,
                    self.detector_gpu.time_cdf_len,
                    self.detector_gpu.charge_cdf_len,
                    self.detector_gpu.charge_unit,
                    # ---------------------
                    self.earliest_time_int_gpu.data,
                    self.channel_q_int_gpu.data,
                    self.channel_history_gpu.data,
                    np.int32(self.ndaq),
                    np.int32(self.stride),
                    np.float32(weight),
                    g_times_l=True).wait()

    # Make sure all launched work has been issued/completed before returning.
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
              max_blocks=1024, max_steps=10, use_weights=False,
              scatter_first=0, cl_context=None):
    """Propagate photons on GPU to termination or max_steps, whichever
    comes first.

    May be called repeatedly without reloading photon information if
    single-stepping through photon history.

    Args:
        gpu_geometry: GPU-side geometry object.  The CUDA path passes
            ``gpu_geometry.gpudata`` to the kernel; the OpenCL path passes
            the individual geometry/material/surface arrays.
        rng_states: device array of random number generator states.
        nthreads_per_block: threads per block for each kernel launch.
        max_blocks: upper limit on blocks per launch (scaled by an
            internal ``adapt_factor`` that is currently always 1.0).
        max_steps: maximum number of propagation steps.
        use_weights: converted to an int flag (1/0) for the kernel.
        scatter_first: passed to the kernel on the first step only; it is
            reset to 0 afterwards.
        cl_context: OpenCL context used to create the command queue
            (OpenCL API only).

    ..warning::
        `rng_states` must have at least `nthreads_per_block`*`max_blocks`
        number of curandStates.
    """
    nphotons = self.pos.size

    # bind node texture reference (CUDA only, done once per instance)
    if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
        # we have to unroll, as pycuda doesn't seem to support vector times right now for binding
        self.unrolled_nodes = ga.to_gpu(
            gpu_geometry.nodes.get().ravel().view(np.uint32))
        self.unrolled_extra_nodes = ga.to_gpu(
            gpu_geometry.extra_nodes.ravel().view(np.uint32))
        self.unrolled_triangles = ga.to_gpu(
            gpu_geometry.triangles.get().ravel().view(np.uint32))
        self.unrolled_triangles4 = ga.to_gpu(
            gpu_geometry.triangles4.ravel().view(np.uint32))
        self.unrolled_vertices = ga.to_gpu(
            gpu_geometry.vertices.get().ravel().view(np.float32))
        self.unrolled_vertices4 = ga.to_gpu(
            gpu_geometry.vertices4.ravel().view(np.float32))
        self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                          self.unrolled_nodes.nbytes)
        self.extra_node_texture_ref.set_address(
            self.unrolled_extra_nodes.gpudata,
            self.unrolled_extra_nodes.nbytes)
        #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
        #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
        #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
        # Note: the *4 variants are what actually get bound for triangles
        # and vertices; unrolled_triangles / unrolled_vertices are uploaded
        # above but not bound in this method.
        self.triangles_texture_ref.set_address(
            self.unrolled_triangles4.gpudata,
            self.unrolled_triangles4.nbytes)
        #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
        self.vertices_texture_ref.set_address(
            self.unrolled_vertices4.gpudata,
            self.unrolled_vertices4.nbytes)
        print "[BOUND TO TEXTURE MEMORY]"
        print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
        print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
        print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
        print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
        print "Total: ", (self.unrolled_nodes.nbytes +
                          self.unrolled_extra_nodes.nbytes +
                          self.unrolled_triangles4.nbytes +
                          self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
        self.node_texture_ref_bound = True

    # setup queue
    # Host-side queues have one extra leading slot (index 0) in front of
    # the photon indices.
    maxqueue = nphotons
    step = 0
    input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
    input_queue[0] = 0
    # Order photons initially in the queue to put the clones next to each other
    for copy in xrange(self.ncopies):
        input_queue[1 + copy::self.ncopies] = np.arange(
            self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
    if api.is_gpu_api_cuda():
        input_queue_gpu = ga.to_gpu(input_queue)
    elif api.is_gpu_api_opencl():
        comqueue = cl.CommandQueue(cl_context)
        # The OpenCL input queue is uploaded WITHOUT the leading slot,
        # whereas the CUDA path uploads the full array and slices [1:] at
        # launch time.
        input_queue_gpu = ga.to_device(comqueue,
                                       input_queue[1:])  # why the offset?
    output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
    # Slot 0 of the output queue starts at 1 and is reset to 1 after every
    # requeue below.
    output_queue[0] = 1
    if api.is_gpu_api_cuda():
        output_queue_gpu = ga.to_gpu(output_queue)
    elif api.is_gpu_api_opencl():
        output_queue_gpu = ga.to_device(comqueue, output_queue)

    if use_weights:
        iuse_weights = 1
    else:
        iuse_weights = 0

    # adapt_factor scales max_blocks; the code that would lower it is
    # commented out below, so it stays at 1.0.
    adapt_factor = 1.0
    start_prop = time.time()
    while step < max_steps:
        # Just finish the rest of the steps if the # of photons is low
        #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
        #    nsteps = max_steps - step
        #else:
        #    nsteps = 1
        nsteps = 1

        start_step = time.time()
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
            #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
            start_chunk = time.time()
            if api.is_gpu_api_cuda():
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu,
                                         rng_states,
                                         self.pos,
                                         self.dir,
                                         self.wavelengths,
                                         self.pol,
                                         self.t,
                                         self.flags,
                                         self.last_hit_triangles,
                                         self.weights,
                                         np.int32(nsteps),
                                         np.int32(iuse_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
                #cuda.Context.get_current().synchronize()
            elif api.is_gpu_api_opencl():
                # The geometry, material and surface data are passed as
                # individual kernel arguments here (the CUDA call passes a
                # single gpu_geometry.gpudata instead).  The call blocks on
                # .wait() for every chunk.
                self.gpu_funcs.propagate(
                    comqueue, (photons_this_round, 1, 1),
                    None,
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    input_queue_gpu.data,
                    output_queue_gpu.data,
                    rng_states.data,
                    self.pos.data,
                    self.dir.data,
                    self.wavelengths.data,
                    self.pol.data,
                    self.t.data,
                    self.flags.data,
                    self.last_hit_triangles.data,
                    self.weights.data,
                    np.int32(nsteps),
                    np.int32(iuse_weights),
                    np.int32(scatter_first),
                    gpu_geometry.world_scale,
                    gpu_geometry.world_origin.data,
                    np.int32(len(gpu_geometry.nodes)),
                    gpu_geometry.material_data['n'],
                    gpu_geometry.material_data['step'],
                    gpu_geometry.material_data["wavelength0"],
                    gpu_geometry.vertices.data,
                    gpu_geometry.triangles.data,
                    gpu_geometry.material_codes.data,
                    gpu_geometry.colors.data,
                    gpu_geometry.nodes.data,
                    gpu_geometry.extra_nodes.data,
                    gpu_geometry.material_data["nmaterials"],
                    gpu_geometry.material_data['refractive_index'].data,
                    gpu_geometry.material_data['absorption_length'].data,
                    gpu_geometry.material_data['scattering_length'].data,
                    gpu_geometry.material_data['reemission_prob'].data,
                    gpu_geometry.material_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['nsurfaces'],
                    gpu_geometry.surface_data['detect'].data,
                    gpu_geometry.surface_data['absorb'].data,
                    gpu_geometry.surface_data['reemit'].data,
                    gpu_geometry.surface_data['reflect_diffuse'].data,
                    gpu_geometry.surface_data['reflect_specular'].data,
                    gpu_geometry.surface_data['eta'].data,
                    gpu_geometry.surface_data['k'].data,
                    gpu_geometry.surface_data['reemission_cdf'].data,
                    gpu_geometry.surface_data['model'].data,
                    gpu_geometry.surface_data['transmissive'].data,
                    gpu_geometry.surface_data['thickness'].data,
                    gpu_geometry.surface_data['nplanes'].data,
                    gpu_geometry.surface_data['wire_diameter'].data,
                    gpu_geometry.surface_data['wire_pitch'].data,
                    g_times_l=True).wait()
            end_chunk = time.time()
            # Timing values are only consumed by the commented-out
            # diagnostics below.
            chunk_time = end_chunk - start_chunk
            #print "chunk time: ",chunk_time
            #if chunk_time>2.5:
            #    adapt_factor *= 0.5

        step += nsteps
        scatter_first = 0  # Only allow non-zero in first pass
        end_step = time.time()
        #print "step time: ",end_step-start_step

        if step < max_steps:
            # Requeue: the output queue of this step becomes the input
            # queue of the next one.
            start_requeue = time.time()
            #print "reset photon queues"
            if api.is_gpu_api_cuda():
                cuda.Context.get_current().synchronize(
                )  # ensure all threads done
                #temp = input_queue_gpu
                #input_queue_gpu = output_queue_gpu
                #output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                #nphotons = input_queue_gpu[:1].get()[0] - 1
                # new style
                output_queue_gpu.get(output_queue)
                nphotons = output_queue[0] - 1
                input_queue_gpu.set(output_queue)
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
            elif api.is_gpu_api_opencl():
                temp_out = output_queue_gpu.get()
                # NOTE(review): the CUDA branch uses output_queue[0] - 1
                # here while this branch uses temp_out[0] unmodified, even
                # though slot 0 is reset to 1 in both branches.  If the
                # OpenCL kernel treats slot 0 the same way as the CUDA one,
                # this over-counts by one and the `nphotons == 0` exit below
                # can never trigger -- confirm against the OpenCL kernel.
                nphotons = temp_out[0]
                input_queue_gpu.set(
                    temp_out[1:], queue=comqueue
                )  # set the input queue to have index of photons still need to be run
                output_queue_gpu[:1].set(
                    np.ones(shape=1, dtype=np.uint32),
                    queue=comqueue)  # reset first instance to be one
            end_requeue = time.time()
            #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
            if nphotons == 0:
                break

    end_prop = time.time()
    print "propagation time: ", end_prop - start_prop, " secs"
    # Bit 31 of the largest flag word is tested to report aborted photons.
    end_flags = self.flags.get()
    end_flag = np.max(end_flags)
    if end_flag & (1 << 31):
        print >> sys.stderr, "WARNING: ABORTED PHOTONS"
    if api.is_gpu_api_cuda():
        cuda.Context.get_current().synchronize()
    elif api.is_gpu_api_opencl():
        cl.enqueue_barrier(comqueue)
def __init__(self, photons, ncopies=1, cl_context=None):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons
            on the GPU.  This is used if you want
            to propagate the same event many times,
            for example in a likelihood calculation.

            The amount of GPU storage will be proportionally
            larger if ncopies > 1, so be careful.
        - cl_context: OpenCL context, *optional*
            Used to create the command queue and to build the
            propagation module when the OpenCL API is active.

    Raises:
        RuntimeError: if the active GPU API is neither CUDA nor OpenCL.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies

    # Pick backend-specific allocators once so the attribute list below is
    # written a single time for both APIs.
    if api.is_gpu_api_cuda():
        queue = None

        def alloc_empty(dtype):
            return ga.empty(shape=nslots, dtype=dtype)

        def alloc_zeros(dtype):
            return ga.zeros(shape=nslots, dtype=dtype)
    elif api.is_gpu_api_opencl():
        queue = cl.CommandQueue(cl_context)

        def alloc_empty(dtype):
            return ga.empty(queue, shape=nslots, dtype=dtype)

        def alloc_zeros(dtype):
            return ga.zeros(queue, shape=nslots, dtype=dtype)
    else:
        # Previously this fell through and failed later with an
        # AttributeError on self.pos.
        raise RuntimeError("GPU API is neither CUDA nor OpenCL!")

    # Allocate GPU memory for photon info
    self.pos = alloc_empty(ga.vec.float3)
    self.dir = alloc_empty(ga.vec.float3)
    self.pol = alloc_empty(ga.vec.float3)
    self.wavelengths = alloc_empty(np.float32)
    self.t = alloc_empty(np.float32)
    self.last_hit_triangles = alloc_empty(np.int32)
    self.flags = alloc_empty(np.uint32)
    self.weights = alloc_empty(np.float32)
    self.current_node_index = alloc_zeros(np.uint32)  # deprecated
    self.requested_workcode = alloc_empty(np.uint32)  # deprecated

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    if api.is_gpu_api_cuda():
        self.module = get_module('propagate.cu',
                                 options=api_options,
                                 include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = get_module('propagate.cl',
                                 cl_context,
                                 options=api_options,
                                 include_source_directory=True)
    # define the texture references
    self.define_texture_references()
    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.photon_duplicate(
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            else:
                # The CUDA-style block=/grid= launch cannot be used with an
                # OpenCL kernel; launch one work-item per photon instead,
                # in the same style as the other OpenCL launches.
                # NOTE(review): assumes propagate.cl provides a
                # photon_duplicate kernel with the same argument order as
                # the CUDA one -- confirm against the kernel source.
                self.gpu_funcs.photon_duplicate(
                    queue, (photons_this_round, 1, 1),
                    None,
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos.data,
                    self.dir.data,
                    self.wavelengths.data,
                    self.pol.data,
                    self.t.data,
                    self.flags.data,
                    self.last_hit_triangles.data,
                    self.weights.data,
                    np.int32(ncopies - 1),
                    np.int32(nphotons)).wait()

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1, seed=None, cl_context=None): """ Generates photons from information in the steps_arr Parameters ---------- steps_arr : numpy.array with shape=(N,10) dtype=np.float contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ] in the future could generalize this to many different time components. developed for liquid argon TPCs. multiple : float scale up the number of photons generated (not implemented yet) """ self.steps_array = steps_arr self.nsteps = self.steps_array.shape[0] if multiple!=1.0: raise RuntimeError('Have not implemented scaling of the number of photons generated.') # =========================== # GEN PHOTONS tstart_genphotons = time.time() # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here) # on the CPU, we scan the steps to determine the total number of photons using poisson statistics # we assume the user has seeded the random number generator to her liking tstart_nphotons = time.time() self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 ) #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int ) self.nphotons_per_step = self.steps_array[ self._nphotons, : ] self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() ) print "NSTEPS: ",self.nsteps print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons # now we make an index array for which step we need to get info from self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 ) current_index=0 for n, n_per_step in enumerate( self.nphotons_per_step ): self.source_step_index[current_index:current_index+n_per_step] = n current_index += n_per_step # push everything to the GPU tstart_transfer = time.time() if api.is_gpu_api_cuda(): # step info self.step_pos1_gpu = 
ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio ) self.source_step_index_gpu = ga.to_gpu( self.source_step_index ) # photon info self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) ) self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) elif api.is_gpu_api_opencl(): cl_queue = cl.CommandQueue( cl_context ) # step info self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_device( cl_queue, self.step_fsratio ) self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index ) # photon info self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) ) self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) ) self.t.set( self.steps_array[:,3] ) self.ncopies = ncopies self.true_nphotons = 
self.nphotons if self.ncopies!=1: raise ValueError('support for multiple copies not supported') if api.is_gpu_api_cuda(): self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True ) elif api.is_gpu_api_opencl(): self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True ) self.gpufuncs = GPUFuncs( self.gpumod ) print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer # need random numbers tgpu = time.time() if seed==None: seed = 5 rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context) for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks): if api.is_gpu_api_cuda(): self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu, self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states, self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights, block=(nthreads_per_block,1,1), grid=(blocks, 1) ) elif api.is_gpu_api_opencl(): self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None, np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data, self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states.data, self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait() else: raise RuntimeError("GPU API is neither CUDA nor OpenCL!") if api.is_gpu_api_cuda(): cuda.Context.get_current().synchronize() tend_genphotons = time.time() print 
"GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")" # Now load modules if api.is_gpu_api_cuda(): self.module = get_module('propagate.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True) # define the texture references self.define_texture_references() # get kernel functions self.gpu_funcs = GPUFuncs(self.module)
import os,sys import chroma.api as api if api.is_gpu_api_cuda(): import pycuda.driver as cuda elif api.is_gpu_api_opencl(): import pyopencl as cl import chroma.gpu.tools as tools class workQueue(object): def __init__(self, context ): # we get important information about work queues here self.context = context if api.is_gpu_api_opencl(): self.device = context.get_info( cl.context_info.DEVICES )[0] self.shared_mem_size = self.device.get_info( cl.device_info.LOCAL_MEM_SIZE ) self.work_group_size = self.device.get_info( cl.device_info.MAX_WORK_GROUP_SIZE ) self.work_item_sizes = self.device.get_info( cl.device_info.MAX_WORK_ITEM_SIZES ) self.work_item_dims = self.device.get_info( cl.device_info.MAX_WORK_ITEM_DIMENSIONS ) self.max_compute_units = self.device.get_info( cl.device_info.MAX_COMPUTE_UNITS ) else: self.device = context.get_device() self.shared_mem_size = self.device.max_shared_memory_per_block self.work_group_size = self.device.max_threads_per_block self.work_item_sizes = self.device.max_block_dim_x self.work_item_dimes = 3 self.max_compute_units = self.device.multiprocessor_count def print_dev_info(self): print self.device, self.shared_mem_size, self.work_group_size, self.work_group_size, self.max_compute_units
def __init__(self, detector, wavelengths=None, print_usage=False,
             cl_context=None, cl_queue=None):
    """Upload the detector's channel maps and time/charge CDFs to the GPU.

    Args:
        detector: detector object providing ``solid_id_to_channel_index``,
            ``solid_id_to_channel_id``, ``num_channels()``, ``time_cdf``
            and ``charge_cdf`` (each CDF is indexed as [0] = x, [1] = y).
        wavelengths: forwarded to ``GPUGeometry.__init__``.
        print_usage: forwarded to ``GPUGeometry.__init__``.
        cl_context: forwarded to ``GPUGeometry.__init__``.
        cl_queue: OpenCL command queue used for the uploads (OpenCL only);
            also forwarded to ``GPUGeometry.__init__``.

    Under CUDA the uploaded arrays are additionally packed into a
    ``Detector`` struct stored in ``self.detector_gpu``.

    Raises:
        RuntimeError: if the GPU API is neither OpenCL nor CUDA.
    """
    # print_usage was previously accepted but replaced by a hard-coded
    # False in this call, so the caller's request was silently ignored.
    GPUGeometry.__init__(self,
                         detector,
                         wavelengths=wavelengths,
                         print_usage=print_usage,
                         cl_context=cl_context,
                         cl_queue=cl_queue)

    use_cuda = api.is_gpu_api_cuda()
    if use_cuda:
        def upload(host_array, dtype):
            return ga.to_gpu(host_array.astype(dtype))
    elif api.is_gpu_api_opencl():
        def upload(host_array, dtype):
            return ga.to_device(cl_queue, host_array.astype(dtype))
    else:
        raise RuntimeError("GPU API is neither OpenCL nor CUDA")

    self.solid_id_to_channel_index_gpu = upload(
        detector.solid_id_to_channel_index, np.int32)
    self.solid_id_to_channel_id_gpu = upload(
        detector.solid_id_to_channel_id, np.int32)

    # Keep the historical attribute types: a plain count under CUDA and an
    # np.int32 (passed directly as a kernel argument) under OpenCL.
    nchannels = detector.num_channels()
    if use_cuda:
        self.nchannels = nchannels
    else:
        self.nchannels = np.int32(nchannels)

    self.time_cdf_x_gpu = upload(detector.time_cdf[0], np.float32)
    self.time_cdf_y_gpu = upload(detector.time_cdf[1], np.float32)
    self.charge_cdf_x_gpu = upload(detector.charge_cdf[0], np.float32)
    self.charge_cdf_y_gpu = upload(detector.charge_cdf[1], np.float32)

    # Scalar detector parameters.  These were only exposed as attributes
    # under OpenCL; they are now set for both backends.
    self.time_cdf_len = np.int32(len(detector.time_cdf[0]))
    self.charge_cdf_len = np.int32(len(detector.charge_cdf[0]))
    self.charge_unit = np.float32(detector.charge_cdf[0][-1] / 2**16)

    if use_cuda:
        # Pack the device pointers and scalars into the Detector struct
        # whose size is measured from detector.h.
        detector_source = cutools.get_cu_source('detector.h')
        detector_struct_size = characterize.sizeof('Detector',
                                                   detector_source)
        self.detector_gpu = make_gpu_struct(detector_struct_size, [
            self.solid_id_to_channel_index_gpu,
            self.time_cdf_x_gpu,
            self.time_cdf_y_gpu,
            self.charge_cdf_x_gpu,
            self.charge_cdf_y_gpu,
            np.int32(self.nchannels),
            self.time_cdf_len,
            self.charge_cdf_len,
            self.charge_unit,
        ])