Code Example #1
    def launchOnce(self, photons, sim, workgroupsize=32):
        # command queue
        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue( self.context )
        if workgroupsize%32!=0:
            raise ValueError('work group size must be a multiple of 32')
        if workgroupsize>self.work_item_sizes:
            raise ValueError('work group size must be smaller than %d'%(self.work_item_sizes))
        
        # photons is an instance of the GPUPhotons class; it holds photon info on the host side
        bvh = sim.detector.bvh

        # get the photons we need to work on
        ourphotons = np.argwhere( photons.requested_workcode==self.workid ) # get index of photons that require us to work on them
        if len(ourphotons)==0:
            return

        # get information on what they need:
        # node list
        max_shared_nodes = self.shared_mem_size/((4+7)*4) # uint32 is 4 bytes; each node has 4 of them, plus daughter, sibling, aunt info
        if bvh.nodes.nbytes<self.shared_mem_size:
            # lucky us, we can push the entire node list onto the device (though this will rarely be the case)
            node_chunks = [0,len(bvh.nodes)-1]

        nodes = np.take( photons.current_node_index, ourphotons.ravel() ) # node indices

        # planning goals. forget about shared memory for now
        # pack in as many nodes to shared memory as possible.
        # try to take current layer, daughter layer, parent layer in that order
        
        # prep for kernel call
        if api.is_gpu_api_cuda():
            self._call_cuda_kernel( sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize )
        elif api.is_gpu_api_opencl():
            self._call_opencl_kernel( sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize, comqueue )
Code Example #2
def get_module(*args, **kwargs):
    #def get_module(name, options=None, include_source_directory=True, template_uncomment=None, template_fill=None):
    """ arguments: name, options=None, include_source_directory=True, template_uncomment=None, template_fill=None)"""
    if gpuapi.is_gpu_api_cuda():
        return cutools.get_cu_module(*args, **kwargs)
    elif gpuapi.is_gpu_api_opencl():
        return cltools.get_cl_module(*args, **kwargs)
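
A minimal usage sketch for the dispatcher above. The import location of get_module/get_context and the kernel file names are assumptions for illustration; note that the OpenCL build additionally needs the cl.Context passed through (see Code Example #26).

# hypothetical usage; assumes chroma.api has already been set to CUDA or OpenCL
import chroma.api as gpuapi
from chroma.gpu.tools import get_module, get_context   # assumed import location

context = get_context()
if gpuapi.is_gpu_api_cuda():
    module = get_module('propagate.cu',
                        options=api_options,           # compile flags defined elsewhere in this repo
                        include_source_directory=True)
elif gpuapi.is_gpu_api_opencl():
    # OpenCL needs the context forwarded to cltools.get_cl_module
    module = get_module('propagate.cl', context,
                        options=api_options,
                        include_source_directory=True)
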
Code Example #3
File: photon.py Project: NuTufts/ChromaUBooNE
    def define_texture_references(self, module=None):
        # unbound texture references declared for use with propagate
        if module == None:
            module = self.module
        if api.is_gpu_api_cuda():
            self.node_texture_ref = module.get_texref("nodevec_tex_ref")
            self.node_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32,
                                             4)

            self.extra_node_texture_ref = module.get_texref(
                "extra_node_tex_ref")
            self.extra_node_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.vertices_texture_ref = module.get_texref(
                "verticesvec_tex_ref")
            self.vertices_texture_ref.set_format(cuda.array_format.FLOAT, 4)

            self.triangles_texture_ref = module.get_texref(
                "trianglesvec_tex_ref")
            self.triangles_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.node_texture_ref_bound = False
        elif api.is_gpu_api_opencl():
            # texture usage not used at the moment
            pass
Code Example #4
def get_context(*args, **kwargs):
    if gpuapi.is_gpu_api_cuda():
        #return cutools.get_cuda_context(device_id,context_flags)
        return cutools.create_cuda_context(*args, **kwargs)
    elif gpuapi.is_gpu_api_opencl():
        #return cltools.create_cl_context(device_id,context_flags)
        return cltools.create_cl_context(*args, **kwargs)
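
A hedged sketch of the context dispatcher in use. The cleanup calls mirror what load_bvh in Code Example #8 does (context.pop() for pycuda, cltools.close_cl_context() for pyopencl); the module aliases and import paths are assumptions.

# hypothetical usage; cutools/cltools aliases as used throughout this repo
import chroma.api as gpuapi
import chroma.gpu.cltools as cltools    # assumed import location

context = get_context()                  # forwards *args/**kwargs to the chosen backend
try:
    pass                                 # ... build modules, allocate arrays, launch kernels ...
finally:
    if gpuapi.is_gpu_api_cuda():
        context.pop()                    # pycuda contexts live on a context stack
    elif gpuapi.is_gpu_api_opencl():
        cltools.close_cl_context(context)
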
Code Example #5
File: daq.py Project: NuTufts/ChromaUBooNE
 def begin_acquire(self, nthreads_per_block=64, cl_context=None):
     if api.is_gpu_api_cuda():
         self.gpu_funcs.reset_earliest_time_int(
             np.float32(1e9),
             np.int32(len(self.earliest_time_int_gpu)),
             self.earliest_time_int_gpu,
             block=(nthreads_per_block, 1, 1),
             grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                   1, 1))
         self.channel_q_int_gpu.fill(0)
         self.channel_q_gpu.fill(0)
         self.channel_history_gpu.fill(0)
     elif api.is_gpu_api_opencl():
         comqueue = cl.CommandQueue(cl_context)
         self.gpu_funcs.reset_earliest_time_int(
             comqueue, (nthreads_per_block, 1, 1),
             (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1),
             np.float32(1e9),
             np.int32(len(self.earliest_time_int_gpu)),
             self.earliest_time_int_gpu.data,
             g_times_l=True).wait()
         self.channel_q_int_gpu.fill(0, queue=comqueue)
         self.channel_q_gpu.fill(0, queue=comqueue)
         self.channel_history_gpu.fill(0, queue=comqueue)
         cl.enqueue_barrier(comqueue)
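
The two branches above show the launch conventions used throughout this codebase: PyCUDA kernels take block=/grid= keyword arguments, while the PyOpenCL wrappers are called with a queue plus global/local work sizes. A schematic launch of a hypothetical kernel under both conventions, assuming gpu_funcs, arr_gpu and cl_context already exist:

import numpy as np

n = len(arr_gpu)                       # hypothetical device array
nthreads = 64
nblocks = n // nthreads + 1

if api.is_gpu_api_cuda():
    gpu_funcs.some_kernel(np.int32(n), arr_gpu,
                          block=(nthreads, 1, 1), grid=(nblocks, 1))
elif api.is_gpu_api_opencl():
    queue = cl.CommandQueue(cl_context)
    # pyopencl convention: kernel(queue, global_size, local_size, *args);
    # g_times_l=True multiplies the global size by the local size,
    # so the effective global size here is nblocks*nthreads work items
    gpu_funcs.some_kernel(queue, (nblocks, 1, 1), (nthreads, 1, 1),
                          np.int32(n), arr_gpu.data,
                          g_times_l=True).wait()
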
Code Example #6
    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """constructor.
        
        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel;
            if not supplied, the class variable value is used
          ns_per_tdc: float
            nanoseconds per time bin;
            if not supplied, the class variable value is used
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError if ntdcs or ns_per_tdc is still None after initialization
        """
        if ntdcs == None:
            self.ntdcs = GPUDaqLAr1ND.NTDC
        else:
            self.ntdcs = ntdcs
        if ns_per_tdc == None:
            self.ns_per_tdc = GPUDaqLAr1ND.NS_PER_TDC
        else:
            self.ns_per_tdc = ns_per_tdc
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)
        if self.ntdcs == None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc == None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.gpu_funcs = GPUFuncs(self.module)
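
A hypothetical DAQ round trip with the class above (OpenCL branch shown); gpu_detector and ctx are assumed to have been built elsewhere, and the begin_acquire/end_acquire signatures follow Code Examples #5 and #22:

daq = GPUDaqLAr1ND(gpu_detector, cl_context=ctx)
daq.begin_acquire(nthreads_per_block=64, cl_context=ctx)    # zero the per-channel buffers
# ... run the hit/charge accumulation kernels on propagated photons here ...
channels = daq.end_acquire(nthreads_per_block=64, cl_context=ctx)   # returns GPUChannels
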
Code Example #7
def mapped_alloc(pagelocked_alloc_func, shape, dtype, write_combined):
    '''Returns a pagelocked host array mapped into the CUDA device
    address space, with a gpudata field set so it just works with CUDA 
    functions.'''
    if gpuapi.is_gpu_api_opencl():
        raise RuntimeError('Command only works for CUDA api')
    return cutools.mapped_alloc(pagelocked_alloc_func, shape, dtype,
                                write_combined)
Code Example #8
def load_bvh(geometry,
             bvh_name="default",
             auto_build_bvh=True,
             read_bvh_cache=True,
             target_degree=3,
             update_bvh_cache=True,
             cache_dir=None,
             bvh_method='grid',
             cuda_device=None,
             cl_device=None):
    if cache_dir is None:
        cache = Cache()
    else:
        cache = Cache(cache_dir)

    mesh_hash = geometry.mesh.md5()
    bvh = None
    if read_bvh_cache and cache.exist_bvh(mesh_hash, bvh_name):
        logger.debug('Loading BVH "%s" for geometry from cache.' % bvh_name)
        bvh = cache.load_bvh(mesh_hash, bvh_name)
    elif auto_build_bvh:
        logger.info('Building new BVH using recursive grid algorithm.')

        start = time.time()

        # creates quick context to make BVH
        if api.is_gpu_api_cuda():
            context = cutools.create_cuda_context(cuda_device)
            if bvh_method == 'grid':
                bvh = make_recursive_grid_bvh(geometry.mesh,
                                              target_degree=target_degree)
            elif bvh_method == 'simple':
                bvh = make_simple_bvh(geometry.mesh, target_degree)
            else:
                raise ValueError(
                    'Requested BVH construction method invalid: %s' %
                    (bvh_method))
            context.pop()
        elif api.is_gpu_api_opencl():
            context = cltools.create_cl_context(cl_device)
            if bvh_method == 'grid':
                bvh = make_recursive_grid_bvh(geometry.mesh,
                                              target_degree=target_degree)
            elif bvh_method == 'simple':
                bvh = make_simple_bvh(geometry.mesh, target_degree)
            else:
                raise ValueError(
                    'Requested BVH construction method invalid: %s' %
                    (bvh_method))
            cltools.close_cl_context(context)

        logger.info('BVH generated in %1.1f seconds.' % (time.time() - start))

        if update_bvh_cache:
            logger.info('Saving BVH (%s:%s) to cache.' % (mesh_hash, bvh_name))
            cache.save_bvh(bvh, mesh_hash, bvh_name)

    return bvh
Code Example #9
def Mapped(array):
    '''Analog to pycuda.driver.InOut(), but indicates this array
    is memory mapped to the device space and should not be copied.

    To simplify coding, Mapped() will pass anything with a gpudata
    member, like a gpuarray, through unchanged.
    '''
    if gpuapi.is_gpu_api_opencl():
        raise RuntimeError('Command only works for CUDA api')
    return cutools.Mapped(array)
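
A CUDA-only sketch combining mapped_alloc() (Code Example #7) and Mapped(): allocate a page-locked host array that the device can address directly, then hand it to a kernel without an explicit copy. The gpu_funcs object and kernel name are hypothetical; pagelocked_empty is pycuda's allocator.

import numpy as np
import pycuda.driver as cuda

host_arr = mapped_alloc(cuda.pagelocked_empty, shape=1024, dtype=np.float32,
                        write_combined=False)
host_arr[:] = 0.0
# hypothetical kernel call; the kernel sees the same memory the host just wrote
gpu_funcs.some_kernel(Mapped(host_arr), np.int32(len(host_arr)),
                      block=(64, 1, 1), grid=(1024 // 64, 1))
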
Code Example #10
 def __init__(self, cl_context=None):
     if api.is_gpu_api_cuda():
         self.module = cutools.get_cu_module('pdf.cu',
                                             options=api_options,
                                             include_source_directory=True)
     elif api.is_gpu_api_opencl():
         self.module = cltools.get_cl_module('pdf.cl',
                                             cl_context,
                                             options=api_options,
                                             include_source_directory=True)
     self.gpu_funcs = GPUFuncs(self.module)
Code Example #11
def to_uint3(arr):
    "Returns a vec.uint3 array from an (N,3) array."
    if gpuapi.is_gpu_api_cuda():
        if not arr.flags['C_CONTIGUOUS']:
            arr = np.asarray(arr, order='c')
        return arr.astype(np.uint32).view(ga.vec.uint3)[:, 0]
    elif gpuapi.is_gpu_api_opencl():
        n = len(arr)
        pad = np.zeros((n, 1), dtype=arr.dtype)
        arr_wpad = np.hstack((arr, pad))
        return arr_wpad.astype(np.uint32).view(ga.vec.uint3)[:, 0]
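
A small sketch of the conversion above: under CUDA the (N,3) array is viewed directly as vec.uint3, while under OpenCL a fourth padding column is appended first because pyopencl's uint3 type is padded to 16 bytes. The triangle indices here are illustrative only; to_float3 (Code Example #16) works the same way for coordinates.

import numpy as np

triangles = np.array([[0, 1, 2],
                      [2, 3, 0]], dtype=np.uint32)
tri_vec = to_uint3(triangles)     # length 2 under either API, ready for ga.to_gpu/to_device
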
Code Example #12
 def __getattr__(self, name):
     try:
         return self.funcs[name]
     except KeyError:
         # find and then store function name on the demand
         if gpuapi.is_gpu_api_cuda():
             f = self.module.get_function(name)
             self.funcs[name] = f
             return f
         elif gpuapi.is_gpu_api_opencl():
             f = self.module.__getattr__(name)
             self.funcs[name] = f
             return f
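
A short sketch of the lazy lookup above: the first attribute access fetches the kernel from the compiled module and caches it in self.funcs, so repeated accesses are cheap. photon_duplicate is a kernel name that appears elsewhere in this repo (Code Example #26); the module object is assumed to come from get_module().

funcs = GPUFuncs(module)
k1 = funcs.photon_duplicate      # looked up via get_function()/module attribute access
k2 = funcs.photon_duplicate      # served from the self.funcs cache
assert k1 is k2
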
Code Example #13
File: bvh.py Project: NuTufts/ChromaUBooNE
def collapse_chains(nodes, layer_bounds):
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL')

    bvh_funcs = GPUFuncs(bvh_module)

    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.collapse_child(np.uint32(start),
                                     np.uint32(end),
                                     gpu_nodes,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(120, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.collapse_child(queue, (end - start, 1, 1), None,
                                     np.uint32(start), np.uint32(end),
                                     gpu_nodes.data).wait()

    return gpu_nodes.get()
Code Example #14
File: photon.py Project: NuTufts/ChromaUBooNE
 def get(self):
     ncols = 3
     if api.is_gpu_api_opencl():
         ncols = 4  # must include padding
     pos = self.pos.get().view(np.float32).reshape((len(self.pos), ncols))
     dir = self.dir.get().view(np.float32).reshape((len(self.dir), ncols))
     pol = self.pol.get().view(np.float32).reshape((len(self.pol), ncols))
     wavelengths = self.wavelengths.get()
     t = self.t.get()
     last_hit_triangles = self.last_hit_triangles.get()
     flags = self.flags.get()
     weights = self.weights.get()
     return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                          flags, weights)
Code Example #15
File: geometry.py Project: NuTufts/ChromaUBooNE
 def __call__(self, tag, description):
     if api.is_gpu_api_cuda():
         gpu_free, gpu_total = cuda.mem_get_info()
     elif api.is_gpu_api_opencl():
         ctx = cltools.get_last_context()
         device = ctx.get_info(cl.context_info.DEVICES)[0]
         gpu_total = device.get_info(cl.device_info.GLOBAL_MEM_SIZE)
         gpu_free = gpu_total  # free memory info not available to opencl...
     if tag is None:
         self['gpu_total'] = gpu_total
     else:
         self['%s' % tag] = description
         self['%s_gpu_used' % tag] = gpu_total - gpu_free
     pass
Code Example #16
def to_float3(arr):
    "Returns a vec.float3 array from an (N,3) array."
    if gpuapi.is_gpu_api_cuda():
        if not arr.flags['C_CONTIGUOUS']:
            arr = np.asarray(arr, order='c')
        return arr.astype(np.float32).view(ga.vec.float3)[:, 0]
    elif gpuapi.is_gpu_api_opencl():
        # in the pyopencl implementation, the vec types have a padding column
        # need to extend this
        n = len(arr)
        pad = np.zeros((n, 1), dtype=arr.dtype)
        arr_wpad = np.hstack((arr, pad))
        return arr_wpad.astype(np.float32).view(ga.vec.float3)[:, 0]
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL')
Code Example #17
File: daq.py Project: NuTufts/ChromaUBooNE
    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros_like(
                self.earliest_time_int_gpu)
            self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
            self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                gpu_detector.nchannels * ndaq,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue,
                                          gpu_detector.nchannels * ndaq,
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector  # struct not made in opencl mode, so we keep a copy of the class
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels
Code Example #18
 def __init__(self, context ):
     # we get important information about work queues here
     self.context = context
     if api.is_gpu_api_opencl():
         self.device = context.get_info( cl.context_info.DEVICES )[0]
         self.shared_mem_size = self.device.get_info( cl.device_info.LOCAL_MEM_SIZE )
         self.work_group_size = self.device.get_info( cl.device_info.MAX_WORK_GROUP_SIZE )
         self.work_item_sizes = self.device.get_info( cl.device_info.MAX_WORK_ITEM_SIZES )
         self.work_item_dims  = self.device.get_info( cl.device_info.MAX_WORK_ITEM_DIMENSIONS )
         self.max_compute_units = self.device.get_info( cl.device_info.MAX_COMPUTE_UNITS )
     else:
         self.device = context.get_device()
         self.shared_mem_size = self.device.max_shared_memory_per_block
         self.work_group_size = self.device.max_threads_per_block
         self.work_item_sizes = self.device.max_block_dim_x
         self.work_item_dims = 3
         self.max_compute_units = self.device.multiprocessor_count
Code Example #19
File: clrandstate.py Project: NuTufts/ChromaUBooNE
def get_struct_def(context):
    global randstate_struct_dict
    if api.is_gpu_api_opencl() == False:
        return None
    if context not in randstate_struct_dict:
        randstate_struct = np.dtype([("a", np.uint32), ("b", np.uint32),
                                     ("c", np.uint32), ("d", np.uint32)])
        #randstate_struct = np.dtype( [("a",np.int32), ("b",np.int32), ("c",np.int32), ("d",np.int32)] )
        print randstate_struct
        device = context.devices[0]
        randstate_struct, my_struct_c_decl = cl.tools.match_dtype_to_c_struct(
            device, "clrandState", randstate_struct)
        print "Defined clrandState.clrandState struct"
        print my_struct_c_decl
        randstate_struct = cl.tools.get_or_register_dtype(
            "clrandState", randstate_struct)
        print "registered with pyopencl for context ", context
        randstate_struct_dict[context] = randstate_struct
    return randstate_struct_dict[context]
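
A hedged, OpenCL-only sketch of using the registered struct dtype to build a host-side array of RNG states and push it to the device; the field seeding below is purely illustrative and not the repo's actual seeding scheme.

import numpy as np
import pyopencl as cl
import pyopencl.array as ga

struct_dtype = get_struct_def(context)     # registers "clrandState" once per context
states = np.zeros(1024, dtype=struct_dtype)
states['a'] = np.random.randint(1, 2**31 - 1, size=1024).astype(np.uint32)
queue = cl.CommandQueue(context)
states_gpu = ga.to_device(queue, states)   # ready to pass as a kernel argument
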
Code Example #20
    def get(self):
        if api.is_gpu_api_cuda():
            pos  = self.pos.get().ravel().view(np.float32).reshape( self.nphotons, 3 )
            pdir = self.dir.get().ravel().view(np.float32).reshape( self.nphotons, 3 )
            pol  = self.pol.get().ravel().view(np.float32).reshape( self.nphotons, 3 )
        elif api.is_gpu_api_opencl():
            # need to remove the padding from vectors
            pos  = np.zeros( shape=(self.nphotons,3), dtype=np.float32 )
            pdir = np.zeros( shape=(self.nphotons,3), dtype=np.float32 )
            pol  = np.zeros( shape=(self.nphotons,3), dtype=np.float32 )
            gapos = self.pos.get()
            gadir = self.dir.get()
            gapol = self.pol.get()
            for n in xrange(0,self.nphotons):
                for i in xrange(0,3):
                    pos[n,i]  = gapos[n][i]
                    pdir[n,i] = gadir[n][i]
                    pol[n,i]  = gapol[n][i]
        t = self.t.get().view(np.float32) - 100.0
        wavelengths = self.wavelengths.get().view(np.float32)

        return chroma.event.Photons( pos=pos, dir=pdir, pol=pol, t=t, wavelengths=wavelengths )
Code Example #21
File: daq.py Project: NuTufts/ChromaUBooNE
    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        if api.is_gpu_api_cuda():
            self.gpu_funcs.convert_sortable_int_to_float(
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu,
                self.earliest_time_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                      1, 1))
            self.gpu_funcs.convert_charge_int_to_float(
                self.detector_gpu,
                self.channel_q_int_gpu,
                self.channel_q_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.channel_q_int_gpu) // nthreads_per_block + 1,
                      1))
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            print cl_context, nthreads_per_block
            comqueue = cl.CommandQueue(cl_context)
            self.gpu_funcs.convert_sortable_int_to_float(
                comqueue, (len(self.earliest_time_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu.data,
                self.earliest_time_gpu.data,
                g_times_l=True).wait()
            self.gpu_funcs.convert_charge_int_to_float(
                comqueue, (len(self.channel_q_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                self.detector_gpu.nchannels,
                self.detector_gpu.charge_unit,
                self.channel_q_int_gpu.data,
                self.channel_q_gpu.data,
                g_times_l=True).wait()

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)
Code Example #22
    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        """collect daq info and make GPUChannels instance.
        
        Args:
          nthreads_per_block: int
          cl_context: pyopencl.Context
        Returns:
          GPUChannels
        """
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.zeros(self.nchannels, dtype=np.float32)
            nblocks = int(self.nchannels / nthreads_per_block) + 1
            self.gpu_funcs.get_earliest_hit_time(np.int32(self.nchannels),
                                                 np.int32(self.ntdcs),
                                                 np.float32(self.ns_per_tdc),
                                                 self.adc_gpu,
                                                 self.channel_history_gpu,
                                                 self.earliest_time_gpu,
                                                 block=(1000, 1, 1),
                                                 grid=(1, 1))
            self.adc_gpu.get()
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.earliest_time_gpu = ga.zeros(comqueue,
                                              self.nchannels,
                                              dtype=np.float32)
            self.gpu_funcs.get_earliest_hit_time(
                comqueue, (int(self.nchannels), 1, 1), None,
                np.int32(self.nchannels), np.int32(self.ntdcs),
                np.float32(self.ns_per_tdc), self.adc_gpu.data,
                self.channel_history_gpu.data,
                self.earliest_time_gpu.data).wait()
            self.adc_gpu.get()

        return GPUChannels(self.earliest_time_gpu, self.adc_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)
Code Example #23
File: bvh.py Project: NuTufts/ChromaUBooNE
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Load Memory
    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run Kernel
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes,
                                            gpu_parent_nodes,
                                            gpu_first_child,
                                            gpu_nchild,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(nblocks_this_iter, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1),
                                            None, np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes.data,
                                            gpu_parent_nodes.data,
                                            gpu_first_child.data,
                                            gpu_nchild.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return gpu_parent_nodes.get()
Code Example #24
import os,sys
import chroma.api as api
if api.is_gpu_api_cuda():
    import pycuda.driver as cuda
elif api.is_gpu_api_opencl():
    import pyopencl as cl
import chroma.gpu.tools as tools

class workQueue(object):

    def __init__(self, context ):
        # we get important information about work queues here
        self.context = context
        if api.is_gpu_api_opencl():
            self.device = context.get_info( cl.context_info.DEVICES )[0]
            self.shared_mem_size = self.device.get_info( cl.device_info.LOCAL_MEM_SIZE )
            self.work_group_size = self.device.get_info( cl.device_info.MAX_WORK_GROUP_SIZE )
            self.work_item_sizes = self.device.get_info( cl.device_info.MAX_WORK_ITEM_SIZES )
            self.work_item_dims  = self.device.get_info( cl.device_info.MAX_WORK_ITEM_DIMENSIONS )
            self.max_compute_units = self.device.get_info( cl.device_info.MAX_COMPUTE_UNITS )
        else:
            self.device = context.get_device()
            self.shared_mem_size = self.device.max_shared_memory_per_block
            self.work_group_size = self.device.max_threads_per_block
            self.work_item_sizes = self.device.max_block_dim_x
            self.work_item_dims = 3
            self.max_compute_units = self.device.multiprocessor_count
            
    def print_dev_info(self):
        print self.device, self.shared_mem_size, self.work_group_size, self.work_group_size, self.max_compute_units
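
A quick sketch of the class above in use; the context helpers are the ones used elsewhere in this repo (create_cuda_context/create_cl_context), while the cutools/cltools import paths are assumptions:

if api.is_gpu_api_cuda():
    import chroma.gpu.cutools as cutools     # assumed import location
    context = cutools.create_cuda_context()
elif api.is_gpu_api_opencl():
    import chroma.gpu.cltools as cltools     # assumed import location
    context = cltools.create_cl_context()

wq = workQueue(context)
wq.print_dev_info()
print wq.shared_mem_size, wq.work_group_size   # local/shared memory in bytes, max threads per group
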
Code Example #25
File: photon.py Project: NuTufts/ChromaUBooNE
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must contain at least `nthreads_per_block`*`max_blocks`
            curandStates.
        """
        nphotons = self.pos.size
        # bind node texture reference
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support vector types right now for binding
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize(
                    )  # ensure all threads done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # new style
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))

                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                if nphotons == 0:
                    break

        end_prop = time.time()
        print "propagation time: ", end_prop - start_prop, " secs"
        end_flags = self.flags.get()
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print >> sys.stderr, "WARNING: ABORTED PHOTONS"
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)
Code Example #26
File: photon.py Project: NuTufts/ChromaUBooNE
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
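
A hypothetical end-to-end use of GPUPhotons (OpenCL branch shown): load photons, propagate them through a GPU geometry, and read the results back as a chroma.event.Photons object. ev, gpu_geometry, rng_states and ctx are assumed to exist; the propagate() and get() signatures follow Code Examples #25 and #14.

gpu_photons = GPUPhotons(ev.photons, cl_context=ctx)
gpu_photons.propagate(gpu_geometry, rng_states,
                      nthreads_per_block=64, max_blocks=1024,
                      max_steps=10, cl_context=ctx)
photons_end = gpu_photons.get()
print len(photons_end.pos), "photons read back"
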
Code Example #27
File: bvh.py Project: NuTufts/ChromaUBooNE
def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
Code Example #28
File: bvh.py Project: NuTufts/ChromaUBooNE
def merge_nodes(nodes, degree, max_ratio=None):
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index + 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][new_parent_index] = \
                        tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt the expanded set of parent nodes if it is still
            # smaller than the child level, so this level of the tree
            # actually shrinks.
            parent_nodes = new_parent_nodes

    return parent_nodes
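
The loops above repeatedly unpack and repack the ``w`` field of each uint4 node: the high bits hold the child count and the low bits hold the index of the first child at the next level down. Below is a minimal sketch of that packing, added for illustration; it assumes only the CHILD_BITS and NCHILD_MASK constants imported from chroma.bvh.bvh, and the helper names are not part of the project.

import numpy as np
from chroma.bvh.bvh import CHILD_BITS, NCHILD_MASK

def pack_w(nchild, child_index):
    # assumes nchild fits in the bits at/above CHILD_BITS and child_index
    # fits below them, as the code above relies on
    return np.uint32((nchild << CHILD_BITS) | child_index)

def unpack_w(w):
    nchild = w >> CHILD_BITS          # child count from the high bits
    child_index = w & ~NCHILD_MASK    # first-child index from the low bits
    return nchild, child_index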
Code example #29
File: bvh.py  Project: NuTufts/ChromaUBooNE
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box.
      ``morton_bits``: int
        Number of bits to use per dimension when computing the Morton code;
        must be <= 16.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
      ``nthreads_per_block``: int
        Threads per block (CUDA) / work-items per work-group (OpenCL) used
        when launching the leaf-building kernel.
      ``max_blocks``: int
        Upper limit on the number of blocks per kernel launch (currently
        applied only on the OpenCL path).

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed-point coordinate system.
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)
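    # Added note: with this scale each vertex v maps into a 16-bit fixed-point
    # grid, roughly q = (v - world_origin) / world_scale with components in
    # [0, 2**16 - 2]; this is the coordinate system in which the Morton codes
    # computed below are defined.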

    # Put triangles and vertices into host and device memory.
    # Unfortunately, OpenCL and CUDA have different memory-management APIs here,
    # so we have to write divergent code.
    if gpuapi.is_gpu_api_cuda():
        # CUDA lets us allocate host memory that is mapped into the device
        # address space, so no explicit transfer requests are needed here.
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print first_index, elements_this_iter, nblocks_this_iter
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()
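            # Added note: g_times_l=True makes PyOpenCL multiply the global work
            # size by the local size, so (nblocks_this_iter, nthreads_per_block)
            # behaves like a CUDA (grid, block) launch configuration.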

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
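
A minimal usage sketch, added for illustration and not part of the original listing; the helper name is made up, and the Morton-curve sort is only the usual follow-on step when building a BVH bottom-up.

import numpy as np

def build_sorted_leaf_level(mesh, morton_bits=16):
    # build the leaf level for a chroma.geometry.Mesh and order it along the
    # Morton curve so that spatially nearby triangles end up adjacent
    world_coords, leaves, codes = create_leaf_nodes(mesh, morton_bits=morton_bits)
    order = np.argsort(codes)
    return world_coords, leaves[order], codes[order]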
Code example #30
File: bvh.py  Project: NuTufts/ChromaUBooNE
import numpy as np
import chroma.api as gpuapi
from chroma.gpu.tools import get_module, api_options, \
    chunk_iterator, to_uint3, to_float3, copy_to_uint3, copy_to_float3
from chroma.gpu.gpufuncs import GPUFuncs
if gpuapi.is_gpu_api_cuda():
    import pycuda.driver as cuda
    from pycuda import gpuarray as ga
    from pycuda import characterize
    #from chroma.gpu.cutools import Mapped
    import chroma.gpu.cutools as cutools
elif gpuapi.is_gpu_api_opencl():
    import pyopencl as cl
    import pyopencl.array as ga
    import chroma.gpu.cltools as cltools

from chroma.bvh.bvh import WorldCoords, node_areas, NCHILD_MASK, CHILD_BITS


def round_up_to_multiple(x, multiple):
    '''Round ``x`` up to the nearest multiple of ``multiple``,
    e.g. round_up_to_multiple(10, 4) == 12.'''
    remainder = x % multiple
    if remainder == 0:
        return x
    else:
        return x + multiple - remainder


def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,