Example #1
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)
    gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
    gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):

        bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                        np.uint32(elements_this_iter),
                                        gpu_nodes,
                                        gpu_parent_nodes,
                                        gpu_first_child,
                                        gpu_nchild,
                                        block=(nthreads_per_block, 1, 1),
                                        grid=(nblocks_this_iter, 1))

    return gpu_parent_nodes.get()
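
All of these examples lean on chunk_iterator from chroma.gpu.tools to split a large element count into successive kernel launches that respect a maximum grid size. The helper itself is not shown on this page, so the version below is only a minimal sketch of its assumed contract (yielding (first_index, elements_this_iter, nblocks_this_iter) tuples), not the project's actual implementation.

import numpy as np

def chunk_iterator_sketch(nelements, nthreads_per_block=64, max_blocks=1024):
    '''Yield (first_index, elements_this_iter, nblocks_this_iter) tuples that
    cover nelements items without launching more than max_blocks blocks of
    nthreads_per_block threads at a time.  Hypothetical stand-in for
    chroma.gpu.tools.chunk_iterator.'''
    first = 0
    while first < nelements:
        elements_left = nelements - first
        nblocks = min(max_blocks,
                      int(np.ceil(elements_left / float(nthreads_per_block))))
        elements_this_iter = min(elements_left, nblocks * nthreads_per_block)
        yield first, elements_this_iter, nblocks
        first += elements_this_iter

# e.g. chunk_iterator_sketch(1000, 256, max_blocks=2)
# yields (0, 512, 2) and then (512, 488, 2)
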
Example #2
    def __init__(self, photons, ncopies=1, max_time=4.):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """

        module = get_cu_module('propagate_hit.cu', options=cuda_options)
        propagate_hit_kernel = module.get_function('propagate_hit')
        propagate_hit_kernel.prepare('iiPPPPPPPPPPPiiiPPP')
        self.propagate_hit_kernel = propagate_hit_kernel
        self.gpu_funcs = GPUFuncs(module)

        self.max_time = max_time
        self.ncopies = ncopies
        self.true_nphotons = len(photons)
        self.marshall_photons(photons, ncopies)
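
Example #2 uses PyCUDA's prepared-invocation interface: prepare() fixes the kernel's argument types once ('i' for an int, 'P' for a device pointer), and the kernel is later launched with prepared_call(grid, block, ...) instead of the keyword-style calls used in the other examples. A minimal sketch of that pattern with a hypothetical kernel named 'scale' (not part of chroma):

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as ga
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(int n, float *x)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0f;
}
""")
scale = mod.get_function('scale')
scale.prepare('iP')   # one int argument followed by one device pointer

x = ga.to_gpu(np.arange(10, dtype=np.float32))
scale.prepared_call((1, 1), (32, 1, 1), np.int32(x.size), x.gpudata)
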
Example #3
def concatenate_layers(layers):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter, 1))
    return nodes.get(), layer_bounds
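
The layer_bounds bookkeeping above is just a prefix sum of the per-layer lengths with a leading zero, so layer i occupies nodes[layer_bounds[i]:layer_bounds[i+1]] and the final entry is the total node count. A quick worked illustration (note that the map(len, layers) call in the example assumes Python 2; under Python 3 it would need to be wrapped in list()):

import numpy as np

layers = [np.zeros(4), np.zeros(2), np.zeros(1)]   # layer lengths 4, 2, 1
layer_bounds = np.insert(np.cumsum([len(l) for l in layers]), 0, 0)
print(layer_bounds)   # [0 4 6 7]
# layer 0 -> nodes[0:4], layer 1 -> nodes[4:6], last layer (leaves) -> nodes[6:7]
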
Example #4
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)
    gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
    gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):

        bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                        np.uint32(elements_this_iter),
                                        gpu_nodes,
                                        gpu_parent_nodes,
                                        gpu_first_child,
                                        gpu_nchild,
                                        block=(nthreads_per_block,1,1),
                                        grid=(nblocks_this_iter,1))

    return gpu_parent_nodes.get()
Example #5
def concatenate_layers(layers):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter,1))
    return nodes.get(), layer_bounds
Example #6
class GPURays(object):
    """The GPURays class holds arrays of ray positions and directions
    on the GPU that are used to render a geometry."""
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)

    def rotate(self, phi, n):
        "Rotate by an angle phi around the axis `n`."
        self.transform_funcs.rotate(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))

    def rotate_around_point(self, phi, n, point):
        """"Rotate by an angle phi around the axis `n` passing through
        the point `point`."""
        self.transform_funcs.rotate_around_point(np.int32(self.pos.size), self.pos, np.float32(phi), ga.vec.make_float3(*n), ga.vec.make_float3(*point), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))
        self.transform_funcs.rotate(np.int32(self.dir.size), self.dir, np.float32(phi), ga.vec.make_float3(*n), block=(self.nblocks,1,1), grid=(self.dir.size//self.nblocks+1,1))

    def translate(self, v):
        "Translate the ray positions by the vector `v`."
        self.transform_funcs.translate(np.int32(self.pos.size), self.pos, ga.vec.make_float3(*v), block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))

    def render(self, gpu_geometry, pixels, alpha_depth=10,
               keep_last_render=False):
        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
        colors."""
        if not keep_last_render:
            self.dxlen.fill(0)

        if alpha_depth > self.max_alpha_depth:
            raise Exception('alpha_depth > max_alpha_depth')

        if not isinstance(pixels, ga.GPUArray):
            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)

        if pixels.size != self.pos.size:
            raise ValueError('`pixels`.size != number of rays')

        self.render_funcs.render(np.int32(self.pos.size), self.pos, self.dir, gpu_geometry.gpudata, np.uint32(alpha_depth), pixels, self.dx, self.dxlen, self.color, block=(self.nblocks,1,1), grid=(self.pos.size//self.nblocks+1,1))

    def snapshot(self, gpu_geometry, alpha_depth=10):
        "Render `gpu_geometry` and return a numpy array of pixel colors."
        pixels = ga.empty(self.pos.size, dtype=np.uint32)
        self.render(gpu_geometry, pixels, alpha_depth)
        return pixels.get()
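
A GPURays instance like the one above is typically driven from a camera loop: build the ray bundle once, then transform it and re-render each frame. A hedged usage sketch, assuming a gpu_geometry object already uploaded with chroma's geometry tools and arbitrary ray arrays:

import numpy as np

pos = np.zeros((640 * 480, 3), dtype=np.float32)   # one ray origin per pixel
dir = np.tile(np.array([0., 0., 1.], dtype=np.float32), (640 * 480, 1))

rays = GPURays(pos, dir, max_alpha_depth=10, nblocks=64)
rays.rotate(np.pi / 6, (0., 1., 0.))     # spin the view about the y axis
rays.translate((0., 0., -100.))          # back the camera away from the origin
pixel_colors = rays.snapshot(gpu_geometry, alpha_depth=4)
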
Example #7
    def __init__(self, gpu_detector, ndaq=1):
        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu

        self.module = get_cu_module('daq.cu', options=cuda_options, 
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels
Example #8
def area_sort_nodes(gpu_geometry, layer_bounds):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block,1,1),
                                  grid=(120,1))
    return gpu_geometry.nodes.get()
Example #9
def area_sort_nodes(gpu_geometry, layer_bounds):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(120, 1))
    return gpu_geometry.nodes.get()
Example #10
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth * self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
Example #11
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth*self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)
Example #12
def collapse_chains(nodes, layer_bounds):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    
    gpu_nodes = ga.to_gpu(nodes)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        bvh_funcs.collapse_child(np.uint32(start),
                                 np.uint32(end),
                                 gpu_nodes,
                                 block=(nthreads_per_block,1,1),
                                 grid=(120,1))
    return gpu_nodes.get()
Example #13
def collapse_chains(nodes, layer_bounds):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        bvh_funcs.collapse_child(np.uint32(start),
                                 np.uint32(end),
                                 gpu_nodes,
                                 block=(nthreads_per_block, 1, 1),
                                 grid=(120, 1))
    return gpu_nodes.get()
Example #14
    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons*ncopies, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
                                                self.pos, self.dir, self.wavelengths, self.pol, self.t, 
                                                self.flags, self.last_hit_triangles, self.weights,
                                                np.int32(ncopies-1), 
                                                np.int32(nphotons),
                                                block=(nthreads_per_block,1,1), grid=(blocks, 1))


        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
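
The duplication kernel above appears to lay the replicas out contiguously, so copy k of photon j lands at index k*nphotons + j; that is the layout the iterate_copies() method mentioned in the comment would slice back out. A small CPU-side illustration of the same layout (pure numpy, not the GPU kernel):

import numpy as np

nphotons, ncopies = 3, 4
t = np.empty(nphotons * ncopies, dtype=np.float32)
t[:nphotons] = [10., 20., 30.]              # the photons actually provided
for k in range(1, ncopies):                 # replicate into the remaining slots
    t[k * nphotons:(k + 1) * nphotons] = t[:nphotons]
# copy k of the event occupies t[k*nphotons:(k+1)*nphotons]
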
Example #15
    def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
                 flags, weights):
        '''Create new object using slices of GPUArrays from an instance
        of GPUPhotons.  NOTE THESE ARE NOT CPU ARRAYS!'''
        self.pos = pos
        self.dir = dir
        self.pol = pol
        self.wavelengths = wavelengths
        self.t = t
        self.last_hit_triangles = last_hit_triangles
        self.flags = flags
        self.weights = weights

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        self.true_nphotons = len(pos)
        self.ncopies = 1
Example #16
class GPUPDF(object):
    def __init__(self):
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins), 
                                      dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        self.gpu_funcs.bin_hits(np.int32(len(self.hitcount_gpu)),
                                gpuchannels.q,
                                gpuchannels.t,
                                self.hitcount_gpu,
                                np.int32(self.tbins),
                                np.float32(self.trange[0]),
                                np.float32(self.trange[1]),
                                np.int32(self.qbins),
                                np.float32(self.qrange[0]),
                                np.float32(self.qrange[1]),
                                self.pdf_gpu,
                                block=(nthreads_per_block,1,1), 
                                grid=(len(gpuchannels.t)//nthreads_per_block+1,1))


        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                       trange, min_qwidth, qrange, min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.event_nhit = count_nonzero(event_hit)
        
        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, 
                                             dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)
        
        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self, gpuchannels, nthreads_per_block=64, max_blocks=10000):
        "Add the most recent results of run_daq() to the PDF evaluation."
        self.work_queues = ga.empty(shape=self.event_nhit * (gpuchannels.ndaq+1), dtype=np.uint32)
        self.work_queues.fill(1)

        self.gpu_funcs.accumulate_bincount(np.int32(self.event_hit_gpu.size),
                                           np.int32(gpuchannels.ndaq),
                                           self.event_hit_gpu,
                                           self.event_time_gpu,
                                           gpuchannels.t,
                                           self.eval_hitcount_gpu,
                                           self.eval_bincount_gpu,
                                           np.float32(self.min_twidth),
                                           np.float32(self.trange[0]),
                                           np.float32(self.trange[1]),
                                           np.int32(self.min_bin_content),
                                           self.map_channel_id_to_hit_offset_gpu,
                                           self.work_queues,
                                           block=(nthreads_per_block,1,1), 
                                           grid=(self.event_hit_gpu.size//nthreads_per_block+1,1))
        cuda.Context.get_current().synchronize()

        self.gpu_funcs.accumulate_nearest_neighbor_block(np.int32(self.event_nhit),
                                                         np.int32(gpuchannels.ndaq),
                                                         self.map_hit_offset_to_channel_id_gpu,
                                                         self.work_queues,
                                                         self.event_time_gpu,
                                                         gpuchannels.t,
                                                         self.nearest_mc_gpu,
                                                         np.int32(self.min_bin_content),
                                                         block=(nthreads_per_block,1,1), 
                                                         grid=(self.event_nhit,1))
        cuda.Context.get_current().synchronize()

    def get_pdf_eval(self):
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = bincount[high_stats].astype(float) / hitcount[high_stats] / self.min_twidth
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[high_stats] = 1.0/np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape((self.event_nhit, self.min_bin_content))
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content), dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id,:] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF, but also clamp the lower range to ensure we don't index
        # by a negative number two lines below
        last_valid_entry = np.maximum(0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),last_valid_entry]
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = (last_valid_entry[low_stats] + 1).astype(float) / hitcount[low_stats] / distance[low_stats] / 2.0
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[low_stats] = 1.0/np.sqrt(last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation
        
        print 'high_stats:', high_stats.sum(), 'low_stats:', low_stats.sum()
        return hitcount, pdf_value, pdf_value * pdf_frac_uncert
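
The low-stats branch of get_pdf_eval is a nearest-neighbour density estimate: with N Monte Carlo hits in a channel and d_k the distance from the event time to the k-th nearest recorded hit time, the density is taken as roughly k / (N * 2 * d_k), which is what the (last_valid_entry + 1) / hitcount / distance / 2.0 expression computes. A standalone sketch of that estimate with made-up numbers:

import numpy as np

mc_times = np.array([4.8, 5.1, 5.3, 6.0, 7.2])   # hypothetical MC hit times (ns)
event_time = 5.0
k = 3                                            # plays the role of min_bin_content
d_k = np.sort(np.abs(mc_times - event_time))[k - 1]
pdf_value = k / (2.0 * d_k * len(mc_times))      # ~1.0 per ns for these numbers
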
Example #17
class GPUDaq(object):
    def __init__(self, gpu_detector, ndaq=1):
        self.earliest_time_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.float32)
        self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels*ndaq, dtype=np.uint32)
        self.channel_history_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
        self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu), dtype=np.float32)
        self.detector_gpu = gpu_detector.detector_gpu
        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu

        self.module = get_cu_module('daq.cu', options=cuda_options, 
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels

    def begin_acquire(self, nthreads_per_block=64):
        self.gpu_funcs.reset_earliest_time_int(np.float32(1e9), np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))
        self.channel_q_int_gpu.fill(0)
        self.channel_q_gpu.fill(0)
        self.channel_history_gpu.fill(0)

    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0):
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
                                       np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
                                       gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.earliest_time_int_gpu, 
                                       self.channel_q_int_gpu, self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block,1,1), grid=(blocks,1))
        else:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), 
                                            np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
                                            gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights,
                                            self.solid_id_map_gpu,
                                            self.detector_gpu,
                                            self.earliest_time_int_gpu, 
                                            self.channel_q_int_gpu, self.channel_history_gpu, 
                                            np.int32(self.ndaq), np.int32(self.stride),
                                            np.float32(weight),
                                            block=(nthreads_per_block,1,1), grid=(blocks,1))
        cuda.Context.get_current().synchronize()
    
    def end_acquire(self, nthreads_per_block=64):
        self.gpu_funcs.convert_sortable_int_to_float(np.int32(len(self.earliest_time_int_gpu)), self.earliest_time_int_gpu, self.earliest_time_gpu, block=(nthreads_per_block,1,1), grid=(len(self.earliest_time_int_gpu)//nthreads_per_block+1,1))

        self.gpu_funcs.convert_charge_int_to_float(self.detector_gpu, self.channel_q_int_gpu, self.channel_q_gpu, block=(nthreads_per_block,1,1), grid=(len(self.channel_q_int_gpu)//nthreads_per_block+1,1))

        cuda.Context.get_current().synchronize()

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu, self.channel_history_gpu, self.ndaq, self.stride)
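
The earliest_time_int_gpu buffer holds photon arrival times as unsigned integers so that run_daq can presumably use an integer atomicMin; for non-negative IEEE-754 floats the raw bit pattern is already monotonic in the float value, which is the property convert_sortable_int_to_float would undo at the end of the acquisition. A small host-side check of that ordering property (an assumption about what daq.cu does, not code from chroma):

import struct

def float_bits(x):
    '''Reinterpret a non-negative float32 as its unsigned 32-bit pattern.'''
    return struct.unpack('<I', struct.pack('<f', x))[0]

times = [0.0, 1.5, 2.0, 1e9]
bits = [float_bits(t) for t in times]
assert bits == sorted(bits)   # ordering preserved, so an integer min works
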
Example #18
class GPURays(object):
    """The GPURays class holds arrays of ray positions and directions
    on the GPU that are used to render a geometry."""
    def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
        self.pos = ga.to_gpu(to_float3(pos))
        self.dir = ga.to_gpu(to_float3(dir))

        self.max_alpha_depth = max_alpha_depth

        self.nblocks = nblocks

        transform_module = get_cu_module('transform.cu', options=cuda_options)
        self.transform_funcs = GPUFuncs(transform_module)

        render_module = get_cu_module('render.cu', options=cuda_options)
        self.render_funcs = GPUFuncs(render_module)

        self.dx = ga.empty(max_alpha_depth * self.pos.size, dtype=np.float32)
        self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
        self.dxlen = ga.zeros(self.pos.size, dtype=np.uint32)

    def rotate(self, phi, n):
        "Rotate by an angle phi around the axis `n`."
        self.transform_funcs.rotate(np.int32(self.pos.size),
                                    self.pos,
                                    np.float32(phi),
                                    ga.vec.make_float3(*n),
                                    block=(self.nblocks, 1, 1),
                                    grid=(self.pos.size // self.nblocks + 1,
                                          1))
        self.transform_funcs.rotate(np.int32(self.dir.size),
                                    self.dir,
                                    np.float32(phi),
                                    ga.vec.make_float3(*n),
                                    block=(self.nblocks, 1, 1),
                                    grid=(self.dir.size // self.nblocks + 1,
                                          1))

    def rotate_around_point(self, phi, n, point):
        """"Rotate by an angle phi around the axis `n` passing through
        the point `point`."""
        self.transform_funcs.rotate_around_point(
            np.int32(self.pos.size),
            self.pos,
            np.float32(phi),
            ga.vec.make_float3(*n),
            ga.vec.make_float3(*point),
            block=(self.nblocks, 1, 1),
            grid=(self.pos.size // self.nblocks + 1, 1))
        self.transform_funcs.rotate(np.int32(self.dir.size),
                                    self.dir,
                                    np.float32(phi),
                                    ga.vec.make_float3(*n),
                                    block=(self.nblocks, 1, 1),
                                    grid=(self.dir.size // self.nblocks + 1,
                                          1))

    def translate(self, v):
        "Translate the ray positions by the vector `v`."
        self.transform_funcs.translate(np.int32(self.pos.size),
                                       self.pos,
                                       ga.vec.make_float3(*v),
                                       block=(self.nblocks, 1, 1),
                                       grid=(self.pos.size // self.nblocks + 1,
                                             1))

    def render(self,
               gpu_geometry,
               pixels,
               alpha_depth=10,
               keep_last_render=False):
        """Render `gpu_geometry` and fill the GPU array `pixels` with pixel
        colors."""
        if not keep_last_render:
            self.dxlen.fill(0)

        if alpha_depth > self.max_alpha_depth:
            raise Exception('alpha_depth > max_alpha_depth')

        if not isinstance(pixels, ga.GPUArray):
            raise TypeError('`pixels` must be a %s instance.' % ga.GPUArray)

        if pixels.size != self.pos.size:
            raise ValueError('`pixels`.size != number of rays')

        self.render_funcs.render(np.int32(self.pos.size),
                                 self.pos,
                                 self.dir,
                                 gpu_geometry.gpudata,
                                 np.uint32(alpha_depth),
                                 pixels,
                                 self.dx,
                                 self.dxlen,
                                 self.color,
                                 block=(self.nblocks, 1, 1),
                                 grid=(self.pos.size // self.nblocks + 1, 1))

    def snapshot(self, gpu_geometry, alpha_depth=10):
        "Render `gpu_geometry` and return a numpy array of pixel colors."
        pixels = ga.empty(self.pos.size, dtype=np.uint32)
        self.render(gpu_geometry, pixels, alpha_depth)
        return pixels.get()
Example #19
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles),
                             dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices),
                            dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles),
                              Mapped(vertices),
                              world_origin,
                              world_scale,
                              nodes,
                              morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
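
The world_scale chosen above maps every vertex into a 16-bit fixed-point coordinate in [0, 2**16 - 2], which is what lets the leaf bounds fit in uint4 nodes and the per-axis Morton codes fit in a uint64. A minimal numpy sketch of the quantization the make_leaves kernel is assumed to perform on each vertex:

import numpy as np

vertices = np.array([[0., 0., 0.], [10., 5., 2.], [3., 7., 9.]])
world_origin = vertices.min(axis=0)
world_scale = np.max(vertices.max(axis=0) - world_origin) / (2**16 - 2)

# Fixed-point coordinates: every component lands in [0, 2**16 - 2]
fixed = np.round((vertices - world_origin) / world_scale).astype(np.uint16)
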
Example #20
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
Example #21
class GPUKernelPDF(object):
    def __init__(self):
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of 
        moments for future bandwidth calculation."""
        self.gpu_funcs.accumulate_moments(np.int32(self.time_only),
                                          np.int32(len(gpuchannels.t)),
                                          gpuchannels.t,
                                          gpuchannels.q,
                                          np.float32(self.trange[0]),
                                          np.float32(self.trange[1]),
                                          np.float32(self.qrange[0]),
                                          np.float32(self.qrange[1]),
                                          self.hitcount_gpu,
                                          self.tmom1_gpu,
                                          self.tmom2_gpu,
                                          self.qmom1_gpu,
                                          self.qmom2_gpu,
                                          block=(nthreads_per_block,1,1), 
                                          grid=(len(gpuchannels.t)//nthreads_per_block+1,1))
        
    def compute_bandwidth(self, event_hit, event_time, event_charge, 
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating."""

        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        tmean = tmom1 / mom0
        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0) # roundoff can go neg
        trms = tvar**0.5

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0/(d+2)) / (mom0/scale_factor))**(-1.0/(d+4))
        gaussian_density = np.minimum(1.0/trms, (1.0/np.sqrt(2.0*np.pi)) * np.exp(-0.5*((event_time - tmean)/trms))  / trms)
        time_bandwidths = dimensionality_factor / gaussian_density * rho
        inv_time_bandwidths = np.zeros_like(time_bandwidths)
        inv_time_bandwidths[time_bandwidths  > 0] = time_bandwidths[time_bandwidths > 0] ** -1

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32)
            )

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu
                )
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()

            qmean = qmom1 / mom0
            qrms = (qmom2 / mom0 - qmean**2)**0.5

            gaussian_density = np.minimum(1.0/qrms, (1.0/np.sqrt(2.0*np.pi)) * np.exp(-0.5*((event_charge - qmean)/qrms))  / qrms)

            charge_bandwidths = dimensionality_factor / gaussian_density * rho

            # precompute inverse to speed up GPU evaluation
            self.inv_charge_bandwidths_gpu = ga.to_gpu( 
                (charge_bandwidths**-1).astype(np.float32)
                )

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)

    def clear_kernel(self):
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)
            
    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(np.int32(self.time_only),
                                              np.int32(len(self.event_hit_gpu)),
                                              self.event_hit_gpu,
                                              self.event_time_gpu,
                                              self.event_charge_gpu,
                                              gpuchannels.t,
                                              gpuchannels.q,
                                              np.float32(self.trange[0]),
                                              np.float32(self.trange[1]),
                                              np.float32(self.qrange[0]),
                                              np.float32(self.qrange[1]),
                                              self.inv_time_bandwidths_gpu,
                                              self.inv_charge_bandwidths_gpu,
                                              self.hitcount_gpu,
                                              self.time_pdf_values_gpu,
                                              self.charge_pdf_values_gpu,
                                              block=(nthreads_per_block,1,1), 
                                              grid=(len(gpuchannels.t)//nthreads_per_block+1,1))


    def get_kernel_eval(self):
        hitcount = self.hitcount_gpu.get()
        hit = self.event_hit_gpu.get().astype(bool)
        time_pdf_values = self.time_pdf_values_gpu.get()
        time_pdf_values /= np.maximum(1, hitcount) # avoid divide by zero

        charge_pdf_values = self.charge_pdf_values_gpu.get()
        charge_pdf_values /= np.maximum(1, hitcount) # avoid divide by zero

        if self.time_only:
            pdf_values = time_pdf_values
        else:
            pdf_values = time_pdf_values * charge_pdf_values
        
        return hitcount, pdf_values, np.zeros_like(pdf_values)
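
The dimensionality_factor in compute_bandwidth has the ((4/(d+2))/n)**(-1/(d+4)) form of a Scott/Silverman rule-of-thumb prefactor, applied per channel with n the number of hits accumulated so far, and the per-channel bandwidth is that factor divided by an estimate of the local Gaussian density at the event time. A standalone numeric sketch of the time-only (d = 1) case, written with the standard squared exponent in the Gaussian:

import numpy as np

d = 1                               # time-only PDF
n = 500.0                           # hits accumulated in one channel
tmean, trms = 100.0, 2.5            # accumulated hit-time mean and RMS (ns)
event_time = 101.0

dimensionality_factor = ((4.0 / (d + 2)) / n) ** (-1.0 / (d + 4))
gaussian_density = min(1.0 / trms,
                       np.exp(-0.5 * ((event_time - tmean) / trms) ** 2)
                       / (np.sqrt(2.0 * np.pi) * trms))
time_bandwidth = dimensionality_factor / gaussian_density   # kernel width (ns)
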
Example #22
    def __init__(self):
        self.module = get_cu_module('pdf.cu', options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)
Example #23
    def __init__(self,
                 photons,
                 ncopies=1,
                 copy_flags=True,
                 copy_triangles=True,
                 copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                self.evidx,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
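
# A minimal CPU-side sketch (plain NumPy standing in for the GPU arrays) of
# the copy-major layout the photon_duplicate kernel above produces: copy i of
# the event occupies the contiguous slots [i*nphotons, (i+1)*nphotons), which
# is exactly what iterate_copies() later slices out.  The helper name
# replicate_layout is made up for illustration.
import numpy as np

def replicate_layout(values, ncopies):
    """Repeat ``values`` ncopies times, copy 0 first, each copy contiguous."""
    values = np.asarray(values)
    out = np.empty(len(values) * ncopies, dtype=values.dtype)
    for copy in range(ncopies):
        out[copy * len(values):(copy + 1) * len(values)] = values
    return out

# replicate_layout(np.array([400., 450., 500.], dtype=np.float32), ncopies=2)
# -> array([400., 450., 500., 400., 450., 500.], dtype=float32)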
Beispiel #24
0
class GPUKernelPDF(object):
    def __init__(self):
        self.module = get_cu_module('pdf.cu',
                                    options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            nchannels: int
              Number of channels in the detector
            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of 
        moments for future bandwidth calculation."""
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(len(gpuchannels.t)),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def compute_bandwidth(self,
                          event_hit,
                          event_time,
                          event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating."""

        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        mom0 = np.maximum(hitcount, 1)
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        tmean = tmom1 / mom0
        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go neg
        trms = tvar**0.5

        if self.time_only:
            d = 1
        else:
            d = 2
        dimensionality_factor = ((4.0 / (d + 2)) /
                                 (mom0 / scale_factor))**(-1.0 / (d + 4))
        gaussian_density = np.minimum(
            1.0 / trms, (1.0 / np.sqrt(2.0 * np.pi)) *
            np.exp(-0.5 * ((event_time - tmean) / trms)**2) / trms)
        time_bandwidths = dimensionality_factor / gaussian_density * rho
        inv_time_bandwidths = np.zeros_like(time_bandwidths)
        inv_time_bandwidths[time_bandwidths > 0] = time_bandwidths[
            time_bandwidths > 0]**-1

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()

            qmean = qmom1 / mom0
            qrms = (qmom2 / mom0 - qmean**2)**0.5

            gaussian_density = np.minimum(
                1.0 / qrms, (1.0 / np.sqrt(2.0 * np.pi)) *
                np.exp(-0.5 * ((event_charge - qmean) / qrms)**2) / qrms)

            charge_bandwidths = dimensionality_factor / gaussian_density * rho

            # precompute inverse to speed up GPU evaluation
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                (charge_bandwidths**-1).astype(np.float32))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)

    def clear_kernel(self):
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        hitcount = self.hitcount_gpu.get()
        hit = self.event_hit_gpu.get().astype(bool)
        time_pdf_values = self.time_pdf_values_gpu.get()
        time_pdf_values /= np.maximum(1, hitcount)  # avoid divide by zero

        charge_pdf_values = self.charge_pdf_values_gpu.get()
        charge_pdf_values /= np.maximum(1, hitcount)  # avoid divide by zero

        if self.time_only:
            pdf_values = time_pdf_values
        else:
            pdf_values = time_pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)
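
# A small NumPy sketch of the time-only bandwidth estimate performed by
# compute_bandwidth() above: a per-channel RMS from the accumulated first and
# second moments, a Silverman-style dimensionality factor, and an adaptive
# per-channel bandwidth whose inverse is what gets uploaded to the GPU.  The
# density uses the standard Gaussian form exp(-z**2 / 2); the function name
# and defaults are illustrative only.
import numpy as np

def time_bandwidth_sketch(hitcount, tmom1, tmom2, event_time,
                          scale_factor=1.0, rho=1.0, d=1):
    mom0 = np.maximum(hitcount, 1)
    tmean = tmom1 / mom0
    tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go negative
    trms = np.sqrt(tvar)
    dimensionality_factor = ((4.0 / (d + 2)) /
                             (mom0 / scale_factor))**(-1.0 / (d + 4))
    # clipped Gaussian density of the observed hit time under the MC moments
    gaussian_density = np.minimum(
        1.0 / trms,
        np.exp(-0.5 * ((event_time - tmean) / trms)**2) /
        (np.sqrt(2.0 * np.pi) * trms))
    bandwidths = dimensionality_factor / gaussian_density * rho
    inv_bandwidths = np.zeros_like(bandwidths)
    inv_bandwidths[bandwidths > 0] = 1.0 / bandwidths[bandwidths > 0]
    return inv_bandwidths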
Beispiel #25
0
def merge_nodes(nodes, degree, max_ratio=None):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block, 1, 1),
                               grid=(nblocks_this_iter, 1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
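
# A short sketch of how merge_nodes() above packs and unpacks the 'w' word of
# a BVH node: the child count lives in the high bits (shifted up by
# CHILD_BITS) and the index of the first child in the low bits.  The real
# CHILD_BITS/NCHILD_MASK constants are defined elsewhere in the source; the
# value 28 and the helper names used here are illustrative assumptions only.
import numpy as np

CHILD_BITS = 28                                       # assumed for illustration
NCHILD_MASK = ((1 << (32 - CHILD_BITS)) - 1) << CHILD_BITS

def pack_w(nchild, child_index):
    return np.uint32((nchild << CHILD_BITS) | child_index)

def unpack_w(w):
    nchild = int(w) >> CHILD_BITS                     # as in merge_nodes() above
    child_index = int(w) & ((1 << CHILD_BITS) - 1)    # i.e. w & ~NCHILD_MASK
    return nchild, child_index

# unpack_w(pack_w(3, 42)) -> (3, 42)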
Beispiel #26
0
class GPUPhotons(object):
    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons*ncopies, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
                                                self.pos, self.dir, self.wavelengths, self.pol, self.t, 
                                                self.flags, self.last_hit_triangles, self.weights,
                                                np.int32(ncopies-1), 
                                                np.int32(nphotons),
                                                block=(nthreads_per_block,1,1), grid=(blocks, 1))


        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        pos = self.pos.get().view(np.float32).reshape((len(self.pos),3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir),3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol),3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights)
        
    def get_hits(self, gpu_detector, target_flag=(0x1<<2), nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a map of GPUPhoton objects containing only photons that
        have a particular bit set in their history word and were detected by
        a channel.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(np.int32(start_photon+first_photon), 
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         self.flags,
                                         gpu_detector.solid_id_map,
                                         self.last_hit_triangles,
                                         gpu_detector.detector_gpu,
                                         index_counter_gpu,
                                         block=(nthreads_per_block,1,1), 
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            np.uint32(target_flag),
                                            gpu_detector.solid_id_map,
                                            gpu_detector.detector_gpu,
                                            index_counter_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, channels,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
            
        pos = pos.get().view(np.float32).reshape((len(pos),3))
        dir = dir.get().view(np.float32).reshape((len(dir),3))
        pol = pol.get().view(np.float32).reshape((len(pol),3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        channels = channels.get()
        hitmap = {}
        for chan in np.unique(channels):
            mask = (channels == chan).astype(bool)
            hitmap[chan] = event.Photons(pos[mask], dir[mask], pol[mask], wavelengths[mask], t[mask], last_hit_triangles[mask], flags[mask], weights[mask])
        return hitmap

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in range(self.ncopies):
            window = slice(self.true_nphotons*i, self.true_nphotons*(i+1))
            yield GPUPhotonsSlice(pos=self.pos[window],
                                  dir=self.dir[window],
                                  pol=self.pol[window],
                                  wavelengths=self.wavelengths[window],
                                  t=self.t[window],
                                  last_hit_triangles=self.last_hit_triangles[window],
                                  flags=self.flags[window],
                                  weights=self.weights[window])

    @profile_if_possible
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in range(self.ncopies):
            input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            if nphotons < nthreads_per_block * 16 * 8 or use_weights:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))

            step += nsteps
            scatter_first = 0 # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()


    @profile_if_possible
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a new GPUPhoton object containing only photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon+first_photon), 
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu, self.flags,
                                         block=(nthreads_per_block,1,1), 
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            np.uint32(target_flag),
                                            index_counter_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights)

    def __del__(self):
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly if now available
        gc.collect()


    def __len__(self):
        return self.pos.size
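
# A minimal CPU-side sketch of the queue bookkeeping inside propagate() above:
# slot 0 of each queue is a counter, the remaining slots hold photon indices,
# and after each step the input and output queues swap while the counter says
# how many photons are still alive.  A toy random-survival step stands in for
# the CUDA kernel; the function name and survival probability are made up.
import numpy as np

def propagate_queue_sketch(nphotons, max_steps=10, seed=0):
    rng = np.random.default_rng(seed)
    input_queue = np.empty(nphotons + 1, dtype=np.uint32)
    input_queue[0] = 0
    input_queue[1:] = np.arange(nphotons, dtype=np.uint32)
    output_queue = np.zeros(nphotons + 1, dtype=np.uint32)
    output_queue[0] = 1          # slot 0 is the "next free slot" counter

    step = 0
    alive = nphotons
    while step < max_steps and alive > 0:
        # toy stand-in for the propagate kernel: each live photon survives
        # the step with 50% probability and then re-queues itself
        for idx in input_queue[1:1 + alive]:
            if rng.random() < 0.5:
                slot = int(output_queue[0])
                output_queue[slot] = idx
                output_queue[0] = slot + 1
        step += 1
        # swap queues and reset the output counter, as propagate() does
        input_queue, output_queue = output_queue, input_queue
        alive = int(input_queue[0]) - 1
        output_queue[0] = 1
    return step, alive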
Beispiel #27
0
def merge_nodes(nodes, degree, max_ratio=None):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    
    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)#degree)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block,1,1),
                               grid=(nblocks_this_iter,1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index+nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area/parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple((degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index+nchild:] = new_parent_nodes[index+1:-nchild+1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][new_parent_index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)


            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
Beispiel #28
0
class GPUPhotons(object):
    def __init__(self,
                 photons,
                 ncopies=1,
                 copy_flags=True,
                 copy_triangles=True,
                 copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - copy_flags: bool, *optional*
                   If False, the photon flags are initialized to zero
                   instead of being copied from ``photons``.
               - copy_triangles: bool, *optional*
                   If False, last_hit_triangles is initialized to -1
                   instead of being copied from ``photons``.
               - copy_weights: bool, *optional*
                   If False, the photon weights are initialized to one
                   instead of being copied from ``photons``.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.evidx = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                self.evidx,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def get(self):
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()
        evidx = self.evidx.get()
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights, evidx)

    def get_hits(self, *args, **kwargs):
        '''Return a map of GPUPhoton objects containing only photons that
        have a particular bit set in their history word and were detected by
        a channel.'''
        flat_hits = self.get_flat_hits(*args, **kwargs)
        hitmap = {}
        for chan in np.unique(flat_hits.channel):
            mask = (flat_hits.channel == chan).astype(bool)
            hitmap[int(chan)] = flat_hits[mask]
        return hitmap

    def get_flat_hits(self,
                      gpu_detector,
                      target_flag=(0x1 << 2),
                      nthreads_per_block=64,
                      max_blocks=1024,
                      start_photon=None,
                      nphotons=None,
                      no_map=False):
        '''Return a single flat ``event.Photons`` object containing only
        photons that have a particular bit set in their history word and
        were detected by a channel.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(np.int32(start_photon +
                                                      first_photon),
                                             np.int32(photons_this_round),
                                             np.uint32(target_flag),
                                             self.flags,
                                             gpu_detector.solid_id_map,
                                             self.last_hit_triangles,
                                             gpu_detector.detector_gpu,
                                             index_counter_gpu,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    self.evidx,
                    pos,
                    dir,
                    wavelengths,
                    pol,
                    t,
                    flags,
                    last_hit_triangles,
                    weights,
                    evidx,
                    channels,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        evidx = evidx.get()
        channels = channels.get()
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights, evidx, channels)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in range(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window],
                evidx=self.evidx[window])

    @profile_if_possible
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  track=False):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in range(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        if track:
            step_photon_ids = []
            step_photons = []
            #save the first step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low and not tracking
            if not track and (nphotons < nthreads_per_block * 16 * 8
                              or use_weights):
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu,
                                         rng_states,
                                         self.pos,
                                         self.dir,
                                         self.wavelengths,
                                         self.pol,
                                         self.t,
                                         self.flags,
                                         self.last_hit_triangles,
                                         self.weights,
                                         self.evidx,
                                         np.int32(nsteps),
                                         np.int32(use_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))

            if track:  #save the next step for all photons in the input queue
                step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
                step_photons.append(
                    self.copy_queue(input_queue_gpu[1:], nphotons).get())

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1
                if nphotons == 0:
                    break

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()

        if track:
            return step_photon_ids, step_photons

    @profile_if_possible
    def copy_queue(self,
                   queue_gpu,
                   nphotons,
                   nthreads_per_block=64,
                   max_blocks=1024,
                   start_photon=0):

        # Allocate new storage space
        pos = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=nphotons, dtype=np.float32)
        t = ga.empty(shape=nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=nphotons, dtype=np.int32)
        flags = ga.empty(shape=nphotons, dtype=np.uint32)
        weights = ga.empty(shape=nphotons, dtype=np.float32)
        evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if nphotons > 0:
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_queue(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    queue_gpu,
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    self.evidx,
                    pos,
                    dir,
                    wavelengths,
                    pol,
                    t,
                    flags,
                    last_hit_triangles,
                    weights,
                    evidx,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a new GPUPhoton object containing only photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu,
                                         self.flags,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon +
                                                     first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            index_counter_gpu,
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            self.evidx,
                                            pos,
                                            dir,
                                            wavelengths,
                                            pol,
                                            t,
                                            flags,
                                            last_hit_triangles,
                                            weights,
                                            evidx,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t,
                               last_hit_triangles, flags, weights, evidx)

    def __del__(self):
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        return self.pos.size
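
# A small NumPy sketch of the two-pass compaction pattern used by select()
# and get_flat_hits() above: a first pass counts how many photons carry the
# requested history bit so the output arrays can be allocated at exactly the
# reduced size, and a second pass copies just those photons.  On the GPU the
# counter is incremented atomically; here boolean masking plays both roles.
# The helper name is illustrative.
import numpy as np

def select_sketch(flags, values, target_flag=(0x1 << 2)):
    mask = (flags & np.uint32(target_flag)) != 0
    reduced_nphotons = int(np.count_nonzero(mask))   # pass 1: count
    out = np.empty(reduced_nphotons, dtype=values.dtype)
    out[:] = values[mask]                            # pass 2: copy
    return out

# select_sketch(np.array([0x4, 0x1, 0x6], dtype=np.uint32),
#               np.array([400., 450., 500.], dtype=np.float32))
# -> array([400., 500.], dtype=float32)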
Beispiel #29
0
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin, 
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)
    
    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), 
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block, 
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block,1,1),
                              grid=(nblocks_this_iter,1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
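
# A short sketch of the fixed-point world coordinates set up at the top of
# create_leaf_nodes(): the origin is the minimum vertex and the scale maps
# the largest extent onto a 16-bit grid.  Only the origin/scale definitions
# and the final Morton-code shift are taken from the example above; the
# rounding convention of the CUDA kernel is an assumption here.
import numpy as np

def quantize_vertices(vertices, morton_bits=16):
    vertices = np.asarray(vertices, dtype=np.float64)
    world_origin = vertices.min(axis=0)
    world_scale = np.max(vertices.max(axis=0) - world_origin) / (2**16 - 2)
    # assumed rounding to the nearest fixed-point grid cell
    fixed = np.round((vertices - world_origin) / world_scale).astype(np.uint16)
    # the GPU computes Morton codes at 16 bits/axis and the host keeps only
    # the top ``morton_bits`` of each axis: codes >> (16 - morton_bits)
    return world_origin, world_scale, fixed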
Beispiel #30
0
class GPUPDF(object):
    def __init__(self):
        self.module = get_cu_module('pdf.cu',
                                    options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        self.gpu_funcs.bin_hits(
            np.int32(len(self.hitcount_gpu)),
            gpuchannels.q,
            gpuchannels.t,
            self.hitcount_gpu,
            np.int32(self.tbins),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.qbins),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.pdf_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self,
                       event_hit,
                       event_time,
                       event_charge,
                       min_twidth,
                       trange,
                       min_qwidth,
                       qrange,
                       min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.event_nhit = count_nonzero(event_hit)

        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(
            np.uint32)
        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(
            self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = np.maximum(0,
                                                       event_hit.cumsum() -
                                                       1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(
            self.map_channel_id_to_hit_offset)
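        # Illustration (not from the original source): for
        # event_hit = [0, 1, 0, 1, 1],
        #   map_hit_offset_to_channel_id -> [1, 3, 4]
        #   map_channel_id_to_hit_offset -> [0, 0, 0, 1, 2]
        # so hit-indexed GPU arrays and channel-indexed arrays can be
        # translated in both directions (unhit channels map to a dummy slot).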

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
                                       dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only  # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self,
                            gpuchannels,
                            nthreads_per_block=64,
                            max_blocks=10000):
        "Add the most recent results of run_daq() to the PDF evaluation."
        self.work_queues = ga.empty(shape=self.event_nhit *
                                    (gpuchannels.ndaq + 1),
                                    dtype=np.uint32)
        self.work_queues.fill(1)

        self.gpu_funcs.accumulate_bincount(
            np.int32(self.event_hit_gpu.size),
            np.int32(gpuchannels.ndaq),
            self.event_hit_gpu,
            self.event_time_gpu,
            gpuchannels.t,
            self.eval_hitcount_gpu,
            self.eval_bincount_gpu,
            np.float32(self.min_twidth),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.min_bin_content),
            self.map_channel_id_to_hit_offset_gpu,
            self.work_queues,
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_hit_gpu.size // nthreads_per_block + 1, 1))
        cuda.Context.get_current().synchronize()

        self.gpu_funcs.accumulate_nearest_neighbor_block(
            np.int32(self.event_nhit),
            np.int32(gpuchannels.ndaq),
            self.map_hit_offset_to_channel_id_gpu,
            self.work_queues,
            self.event_time_gpu,
            gpuchannels.t,
            self.nearest_mc_gpu,
            np.int32(self.min_bin_content),
            block=(nthreads_per_block, 1, 1),
            grid=(self.event_nhit, 1))
        cuda.Context.get_current().synchronize()

    def get_pdf_eval(self):
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = bincount[high_stats].astype(
                    float) / hitcount[high_stats] / self.min_twidth
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[high_stats] = 1.0 / np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape(
            (self.event_nhit, self.min_bin_content))
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content),
                              dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(
            0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),
                              last_valid_entry]
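        # Low-stats channels fall back to a nearest-neighbour density
        # estimate: (entries found) / hitcount / (2 * distance), the
        # analogue of bincount / hitcount / min_twidth used for the
        # high-stats channels above.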
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = (
                    last_valid_entry[low_stats] + 1).astype(float) / hitcount[
                        low_stats] / distance[low_stats] / 2.0
            else:
                raise Exception('Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[low_stats] = 1.0 / np.sqrt(
                last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        print 'high_stats:', high_stats.sum(), 'low_stats:', low_stats.sum()
        return hitcount, pdf_value, pdf_value * pdf_frac_uncert
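
A minimal usage sketch for GPUPDF (hypothetical driver code, not part of the original source; monte_carlo_events and daq.acquire stand in for whatever produces a gpuchannels object with q and t device arrays):

gpu_pdf = GPUPDF()
gpu_pdf.setup_pdf(nchannels=10000, tbins=200, trange=(0.0, 400.0),
                  qbins=50, qrange=(0.0, 10.0))
gpu_pdf.clear_pdf()
for ev in monte_carlo_events:            # hypothetical iterable of MC events
    gpuchannels = daq.acquire(ev)        # hypothetical DAQ simulation step
    gpu_pdf.add_hits_to_pdf(gpuchannels)
hitcount, pdf = gpu_pdf.get_pdfs()       # shapes (nchannels,) and (nchannels, tbins, qbins)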
Beispiel #31
0
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n/2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n/2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block,1,1),
                                    grid=(nblocks_this_iter,1))
                
            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192*50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter, skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
                       nodes, block=(1,1,1), grid=(1,1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block,1,1),
                            grid=(nblocks_this_iter,1))
        
    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
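
The swap search above tries to minimize the summed surface area of the parent boxes formed from adjacent child pairs. A plain-float CPU sketch of that cost (assumed to match the intent of the pair_area and min_distance_to kernels, which work on the quantized uint4 node format):

import numpy as np

def union_area(lower_a, upper_a, lower_b, upper_b):
    """Surface area of the axis-aligned box enclosing two child boxes."""
    lower = np.minimum(lower_a, lower_b)
    upper = np.maximum(upper_a, upper_b)
    dx, dy, dz = upper - lower
    return 2.0 * (dx * dy + dy * dz + dz * dx)

def layer_cost(lowers, uppers):
    """Total parent area when children are paired (0,1), (2,3), ..."""
    return sum(union_area(lowers[i], uppers[i], lowers[i + 1], uppers[i + 1])
               for i in range(0, len(lowers) - 1, 2))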
Beispiel #32
0
    def __init__(self):
        self.module = get_cu_module('pdf.cu',
                                    options=cuda_options,
                                    include_source_directory=True)
        self.gpu_funcs = GPUFuncs(self.module)
Beispiel #33
0
class GPUPhotonsHit(object):
    def __init__(self, photons, ncopies=1, max_time=4.):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """

        module = get_cu_module('propagate_hit.cu', options=cuda_options)
        propagate_hit_kernel = module.get_function('propagate_hit')
        propagate_hit_kernel.prepare('iiPPPPPPPPPPPiiiPPP')
        self.propagate_hit_kernel = propagate_hit_kernel
        self.gpu_funcs = GPUFuncs(module)

        self.max_time = max_time
        self.ncopies = ncopies
        self.true_nphotons = len(photons)
        self.marshall_photons(photons, ncopies)

    def marshall_photons_npl(self, npl):
        pass

    def marshall_photons(self, photons, ncopies):
        """
        Assign the provided photons to the beginning (possibly
        the entire array if ncopies is 1
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)

        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            block = (nthreads_per_block, 1, 1)
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nphotons, nthreads_per_block, max_blocks):
                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                )
                self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid)
            pass
        pass

    def get(self, npl=0, hit=0):
        log.info("get npl:%d hit:%d " % (npl, hit))
        pos = self.pos.get().view(np.float32).reshape((len(self.pos), 3))
        dir = self.dir.get().view(np.float32).reshape((len(self.dir), 3))
        pol = self.pol.get().view(np.float32).reshape((len(self.pol), 3))
        wavelengths = self.wavelengths.get()
        t = self.t.get()
        last_hit_triangles = self.last_hit_triangles.get()
        flags = self.flags.get()
        weights = self.weights.get()

        if npl:
            nall = len(pos)
            a = np.zeros((nall, 4, 4), dtype=np.float32)

            a[:, 0, :3] = pos
            a[:, 0, 3] = t

            a[:, 1, :3] = dir
            a[:, 1, 3] = wavelengths

            a[:, 2, :3] = pol
            a[:, 2, 3] = weights

            assert len(last_hit_triangles) == len(flags)
            pmtid = np.zeros(nall, dtype=np.int32)

            # a kludge setting of pmtid into lht using the map argument of propagate_hit.cu
            SURFACE_DETECT = 0x1 << 2
            detected = np.where(flags & SURFACE_DETECT)
            pmtid[detected] = last_hit_triangles[
                detected]  # sparsely populate, leaving zeros for undetected

            a[:, 3, 0] = np.arange(nall,
                                   dtype=np.int32).view(a.dtype)  # photon_id
            a[:, 3, 1] = 0  # used in comparison against vbo prop
            a[:, 3, 2] = flags.view(a.dtype)  # history flags
            a[:, 3, 3] = pmtid.view(a.dtype)  # channel_id ie PmtId

            if hit:
                return a[pmtid > 0].view(NPY)
            else:
                return a.view(NPY)
            pass
        else:  # the old way
            return event.Photons(pos, dir, pol, wavelengths, t,
                                 last_hit_triangles, flags, weights)

    def iterate_copies(self):
        '''Returns an iterator that yields GPUPhotonsSlice objects
        corresponding to the event copies stored in ``self``.'''
        for i in xrange(self.ncopies):
            window = slice(self.true_nphotons * i,
                           self.true_nphotons * (i + 1))
            yield GPUPhotonsSlice(
                pos=self.pos[window],
                dir=self.dir[window],
                pol=self.pol[window],
                wavelengths=self.wavelengths[window],
                t=self.t[window],
                last_hit_triangles=self.last_hit_triangles[window],
                flags=self.flags[window],
                weights=self.weights[window])

    def upload_queues(self, nwork):
        """
        # Order photons initially in the queue to put the clones next to each other


        #. input_queue starts as [0, 0, 1, 2, 3, ..., nwork-1]

        #. output_queue starts as [1,0,0,0,0,....] 
 
        """
        input_queue = np.empty(shape=nwork + 1, dtype=np.uint32)
        input_queue[0] = 0

        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
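        # Illustration (not from the original source): for ncopies=2 and
        # true_nphotons=3 the interleaving above gives
        #   input_queue = [0, 0, 3, 1, 4, 2, 5]
        # i.e. the two clones of photon j (j and j + true_nphotons) occupy
        # the adjacent slots 1 + 2*j and 2 + 2*j.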

        output_queue = np.zeros(shape=nwork + 1, dtype=np.uint32)
        output_queue[0] = 1

        self.input_queue_gpu = ga.to_gpu(input_queue)
        self.output_queue_gpu = ga.to_gpu(output_queue)

    def swap_queues(self):
        """ 
        Swaps queues and returns the number of photons remaining to propagate.

        #. output_queue[0] = 1 initially, this avoids enqueued photon_id 
           stomping on output_queue[0] as atomicAdd returns the non-incremented::

                230     // Not done, put photon in output queue
                231     if ((p.history & (NO_HIT | BULK_ABSORB | SURFACE_DETECT | SURFACE_ABSORB | NAN_ABORT)) == 0) 
                232     {  
                            // pulling queue ticket
                233         int out_idx = atomicAdd(output_queue, 1);  // atomic add 1 to slot zero value, returns non-incremented original value
                234         output_queue[out_idx] = photon_id;
                235     }


        #. At kernel tail non-completed photon threads enqueue their photon_id
           into a slot in the output_queue. The slot to use is obtained 
           by atomic incrementing output_queue[0], ensuring orderly queue.

        #. after kernel completes output_queue[0] contains the
           number of photon_id enqued in output_queue[1:] 

        """
        temp = self.input_queue_gpu
        self.input_queue_gpu = self.output_queue_gpu
        self.output_queue_gpu = temp
        self.output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
        # input_queue_gpu was the output_queue before the swap, so slot 0
        # holds 1 + (number of photons enqueued there)
        slot0minus1 = self.input_queue_gpu[:1].get()[0] - 1
        log.debug("swap_queues slot0minus1 %s " % slot0minus1)
        return slot0minus1

    @profile_if_possible
    def propagate_hit(self, gpu_geometry, rng_states, parameters):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.


        got one abort::

             In [1]: a = ph("hhMOCK")

             In [9]: f = a[:,3,2].view(np.uint32)

             In [12]: np.where( f & 1<<31 )
             Out[12]: (array([279]),)

        failed to just mock that one::

              RANGE=279:280 MockNuWa MOCK 


        """
        nphotons = self.pos.size
        nwork = nphotons

        nthreads_per_block = parameters['threads_per_block']
        max_blocks = parameters['max_blocks']
        max_steps = parameters['max_steps']
        use_weights = False
        scatter_first = 0

        self.upload_queues(nwork)

        solid_id_map_gpu = gpu_geometry.solid_id_map
        solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu

        small_remainder = nthreads_per_block * 16 * 8
        block = (nthreads_per_block, 1, 1)

        results = {}
        results['name'] = "propagate_hit"
        results['nphotons'] = nphotons
        results['nwork'] = nwork
        results['nsmall'] = small_remainder
        results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i"

        step = 0
        times = []

        npass = 0
        nabort = 0

        while step < max_steps:
            npass += 1
            if nwork < small_remainder or use_weights:
                nsteps = max_steps - step  # Just finish the rest of the steps if the # of photons is low
                log.debug(
                    "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s "
                    % (small_remainder, nwork, nsteps, max_steps))
            else:
                nsteps = 1
            pass
            log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " %
                     (nphotons, nwork, step, max_steps, nsteps))

            abort = False
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nwork, nthreads_per_block, max_blocks):
                if abort:
                    nabort += 1
                else:
                    grid = (blocks, 1)
                    args = (
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        self.input_queue_gpu[1:].gpudata,
                        self.output_queue_gpu.gpudata,
                        rng_states,
                        self.pos.gpudata,
                        self.dir.gpudata,
                        self.wavelengths.gpudata,
                        self.pol.gpudata,
                        self.t.gpudata,
                        self.flags.gpudata,
                        self.last_hit_triangles.gpudata,
                        self.weights.gpudata,
                        np.int32(nsteps),
                        np.int32(use_weights),
                        np.int32(scatter_first),
                        gpu_geometry.gpudata,
                        solid_id_map_gpu.gpudata,
                        solid_id_to_channel_id_gpu.gpudata,
                    )
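                    # 19 arguments: 2 ints, 11 pointers, 3 ints, 3 pointers,
                    # matching the 'iiPPPPPPPPPPPiiiPPP' prepare() string
                    # set up in __init__.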

                    log.info(
                        "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s "
                        % (repr(grid), repr(block), first_photon,
                           photons_this_round))
                    get_time = self.propagate_hit_kernel.prepared_timed_call(
                        grid, block, *args)
                    t = get_time()
                    times.append(t)
                    if t > self.max_time:
                        abort = True
                        log.warn(
                            "kernel launch time %s > max_time %s : ABORTING " %
                            (t, self.max_time))
                    pass
                pass
            pass
            log.info("step %s propagate_hit_kernel times  %s " %
                     (step, repr(times)))
            pass
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            if step < max_steps:
                nwork = self.swap_queues()
            pass
        pass

        log.info("calling max ")
        if ga.max(self.flags).get() & (1 << 31):
            log.warn("ABORTED PHOTONS")
        log.info("done calling max ")

        cuda.Context.get_current().synchronize()

        results['npass'] = npass
        results['nabort'] = nabort
        results['nlaunch'] = len(times)
        results['tottime'] = sum(times)
        results['maxtime'] = max(times)
        results['mintime'] = min(times)
        results['COLUMNS'] += \
            ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f"
        return results

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a new GPUPhotonsHitSlice object containing only photons
        that have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + first_photon),
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu,
                                         self.flags,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon +
                                                     first_photon),
                                            np.int32(photons_this_round),
                                            np.uint32(target_flag),
                                            index_counter_gpu,
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            pos,
                                            dir,
                                            wavelengths,
                                            pol,
                                            t,
                                            flags,
                                            last_hit_triangles,
                                            weights,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsHitSlice(pos, dir, pol, wavelengths, t,
                                  last_hit_triangles, flags, weights)

    def __del__(self):
        del self.pos
        del self.dir
        del self.pol
        del self.wavelengths
        del self.t
        del self.flags
        del self.last_hit_triangles
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        return self.pos.size
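
For reference, a host-side sketch (not part of the original source) of what select() computes: the count_photons and copy_photons kernels keep exactly the photons whose history word has target_flag set, which on the CPU reduces to a boolean mask over the flags array.

import numpy as np

SURFACE_DETECT = np.uint32(0x1 << 2)     # same flag bit used in get() above

def select_host(flags, target_flag, *arrays):
    """Return the subset of each array where (flags & target_flag) is set."""
    mask = (flags & np.uint32(target_flag)) != 0
    return tuple(a[mask] for a in arrays)

# e.g. detected_pos, detected_t = select_host(flags, SURFACE_DETECT, pos, t)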