Example #1
0
    def acquire(self, gpuphotons, rng_states, nthreads_per_block=64, max_blocks=1024, start_photon=None, nphotons=None, weight=1.0):
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.run_daq(rng_states, np.uint32(0x1 << 2), 
                                       np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
                                       gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.earliest_time_int_gpu, 
                                       self.channel_q_int_gpu, self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block,1,1), grid=(blocks,1))
        else:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                self.gpu_funcs.run_daq_many(rng_states, np.uint32(0x1 << 2), 
                                            np.int32(start_photon+first_photon), np.int32(photons_this_round), gpuphotons.t, 
                                            gpuphotons.flags, gpuphotons.last_hit_triangles, gpuphotons.weights,
                                            self.solid_id_map_gpu,
                                            self.detector_gpu,
                                            self.earliest_time_int_gpu, 
                                            self.channel_q_int_gpu, self.channel_history_gpu, 
                                            np.int32(self.ndaq), np.int32(self.stride),
                                            np.float32(weight),
                                            block=(nthreads_per_block,1,1), grid=(blocks,1))
        cuda.Context.get_current().synchronize()
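The launch loops in all of these examples rely on chunk_iterator to split a photon or node range into grid-sized chunks. Below is a minimal, illustrative stand-in for that helper (the real one lives in chroma's GPU tools module); it only documents the (first_index, elements_this_iter, nblocks_this_iter) contract assumed by the code above and is not the library's actual implementation.

def chunk_iterator_sketch(nelements, nthreads_per_block=64, max_blocks=1024):
    # Yield (first_index, elements_this_iter, nblocks_this_iter) tuples that
    # together cover range(nelements), never exceeding max_blocks blocks per launch.
    first = 0
    while first < nelements:
        elements_this_iter = min(nelements - first, nthreads_per_block * max_blocks)
        nblocks_this_iter = (elements_this_iter + nthreads_per_block - 1) // nthreads_per_block
        yield first, elements_this_iter, nblocks_this_iter
        first += elements_this_iter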
Example #2
0
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a new GPUPhotonsSlice containing only the photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon+first_photon), 
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu, self.flags,
                                         block=(nthreads_per_block,1,1), 
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            np.uint32(target_flag),
                                            index_counter_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, evidx,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights, evidx)
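A possible call site for select(), shown as a sketch: it assumes a live CUDA context, a populated photon container named gpu_photons exposing this method, and the SURFACE_DETECT history bit from chroma.event (the local variable names are hypothetical).

from chroma import event

# Keep only photons whose history word has the detection bit set.
detected = gpu_photons.select(event.SURFACE_DETECT)
print('kept %d of %d photons' % (detected.pos.size, gpu_photons.pos.size))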
Example #3
0
    def select(self, target_flag, nthreads_per_block=64, max_blocks=1024,
               start_photon=None, nphotons=None):
        '''Return a new GPUPhotonsSlice containing only the photons that
        have a particular bit set in their history word.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon+first_photon), 
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         index_counter_gpu, self.flags,
                                         block=(nthreads_per_block,1,1), 
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)

        # And finally copy photons, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photons(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            np.uint32(target_flag),
                                            index_counter_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights)
Example #4
0
def concatenate_layers(layers):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter, 1))
    return nodes.get(), layer_bounds
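Host-side illustration of the layer_bounds bookkeeping used above (no GPU required): layer i occupies nodes[layer_bounds[i]:layer_bounds[i+1]], and the last layer, which holds the leaves, gets child_offset 0. The layer sizes below are made up.

import numpy as np

layer_sizes = [4, 2, 1]                                  # hypothetical per-layer node counts
layer_bounds = np.insert(np.cumsum(layer_sizes), 0, 0)   # prepend 0, as in the function above
assert layer_bounds.tolist() == [0, 4, 6, 7]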
Example #5
0
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)
    gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
    gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):

        bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                        np.uint32(elements_this_iter),
                                        gpu_nodes,
                                        gpu_parent_nodes,
                                        gpu_first_child,
                                        gpu_nchild,
                                        block=(nthreads_per_block, 1, 1),
                                        grid=(nblocks_this_iter, 1))

    return gpu_parent_nodes.get()
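An illustrative call to merge_nodes_detailed(), assuming `nodes` is the uint4 node array of the layer being merged; here two parents adopt children [0:3] and [3:5] (the index arrays are made up for the sketch).

import numpy as np

first_child = np.array([0, 3])   # index of each parent's first child
nchild = np.array([3, 2])        # number of children per parent
parent_nodes = merge_nodes_detailed(nodes, first_child, nchild)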
Example #6
0
    def copy_queue(self, queue_gpu, nphotons, nthreads_per_block=64, max_blocks=1024,
                   start_photon=0):

        # Allocate new storage space
        pos = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=nphotons, dtype=np.float32)
        t = ga.empty(shape=nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=nphotons, dtype=np.int32)
        flags = ga.empty(shape=nphotons, dtype=np.uint32)
        weights = ga.empty(shape=nphotons, dtype=np.float32)
        evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # And finally copy photons, if there are any
        if nphotons > 0:
            for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_queue(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            queue_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, self.evidx,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, evidx,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
        return GPUPhotonsSlice(pos, dir, pol, wavelengths, t, last_hit_triangles, flags, weights, evidx)
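Illustrative use of copy_queue(), as a sketch: it assumes a live CUDA context and a populated photon container gpu_photons exposing this method; the index queue, which would normally be filled by an earlier kernel, is faked on the host here.

import numpy as np
import pycuda.gpuarray as ga

queue_np = np.arange(100, dtype=np.uint32)   # pretend these photon indices were selected on the GPU
queue_gpu = ga.to_gpu(queue_np)
photon_slice = gpu_photons.copy_queue(queue_gpu, len(queue_np))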
Example #7
0
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    gpu_nodes = ga.to_gpu(nodes)
    gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
    gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

    nparent = len(first_child)
    gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):

        bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                        np.uint32(elements_this_iter),
                                        gpu_nodes,
                                        gpu_parent_nodes,
                                        gpu_first_child,
                                        gpu_nchild,
                                        block=(nthreads_per_block,1,1),
                                        grid=(nblocks_this_iter,1))

    return gpu_parent_nodes.get()
Example #8
0
def concatenate_layers(layers):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)
    nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    nthreads_per_block = 256

    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:],
                                             layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,
                               max_blocks=10000):
            bvh_funcs.copy_and_offset(np.uint32(first_index),
                                      np.uint32(elements_this_iter),
                                      np.uint32(child_offset),
                                      cuda.In(layer),
                                      nodes[layer_start:],
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter,1))
    return nodes.get(), layer_bounds
Example #9
0
    def __init__(self, photons, ncopies=1):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons*ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons*ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons*ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=nphotons*ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons*ncopies, dtype=np.float32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon), np.int32(photons_this_round),
                                                self.pos, self.dir, self.wavelengths, self.pol, self.t, 
                                                self.flags, self.last_hit_triangles, self.weights,
                                                np.int32(ncopies-1), 
                                                np.int32(nphotons),
                                                block=(nthreads_per_block,1,1), grid=(blocks, 1))


        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
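A construction sketch for the class above (assumed to be chroma's GPUPhotons): it requires an active CUDA context and assumes chroma.event.Photons accepts these keyword arguments and fills t, flags and weights with sensible defaults when they are omitted.

import numpy as np
from chroma.event import Photons

N = 1000
photons = Photons(pos=np.zeros((N, 3)),
                  dir=np.tile([0., 0., 1.], (N, 1)),
                  pol=np.tile([1., 0., 0.], (N, 1)),
                  wavelengths=np.full(N, 400.0))
gpu_photons = GPUPhotons(photons, ncopies=4)   # four replicas, e.g. for a likelihood scan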
Example #10
0
    def marshall_photons(self, photons, ncopies):
        """
        Assign the provided photons to the beginning (possibly
        the entire array if ncopies is 1
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)

        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            block = (nthreads_per_block, 1, 1)
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nphotons, nthreads_per_block, max_blocks):
                grid = (blocks, 1)
                args = (
                    np.int32(first_photon),
                    np.int32(photons_this_round),
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    np.int32(ncopies - 1),
                    np.int32(nphotons),
                )
                self.gpu_funcs.photon_duplicate(*args, block=block, grid=grid)
Example #11
0
    def propagate(self, gpu_geometry, rng_states, nthreads_per_block=64,
                  max_blocks=1024, max_steps=10, use_weights=False,
                  scatter_first=0):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons+1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1+copy::self.ncopies] = np.arange(self.true_nphotons, dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons+1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        while step < max_steps:
            # Just finish the rest of the steps if the number of photons is low
            if nphotons < nthreads_per_block * 16 * 8 or use_weights:
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon), np.int32(photons_this_round), input_queue_gpu[1:], output_queue_gpu, rng_states, self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights, np.int32(nsteps), np.int32(use_weights), np.int32(scatter_first), gpu_geometry.gpudata, block=(nthreads_per_block,1,1), grid=(blocks, 1))

            step += nsteps
            scatter_first = 0 # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1

        if ga.max(self.flags).get() & (1 << 31):
            print >>sys.stderr, "WARNING: ABORTED PHOTONS"
        cuda.Context.get_current().synchronize()
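A minimal driver sketch for propagate(): it assumes gpu_geometry and rng_states were prepared elsewhere (for example with chroma's GPU geometry loader and RNG-state helper), and the step count and block sizes are illustrative.

gpu_photons.propagate(gpu_geometry, rng_states,
                      nthreads_per_block=64, max_blocks=1024,
                      max_steps=100)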
Example #12
0
    def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
                     max_blocks=1024):
        solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
        solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))

        module = get_cu_module('mesh.h', options=cuda_options)
        color_solids = module.get_function('color_solids')

        for first_triangle, triangles_this_round, blocks in \
                chunk_iterator(self.triangles.size, nblocks_per_thread,
                               max_blocks):
            color_solids(np.int32(first_triangle),
                         np.int32(triangles_this_round), self.solid_id_map,
                         solid_hit_gpu, solid_colors_gpu, self.gpudata,
                         block=(nblocks_per_thread,1,1), 
                         grid=(blocks,1))
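An illustrative call to color_solids(), assuming gpu_geometry exposes the method above; n_solids and hit_solid_ids are hypothetical placeholders describing the detector.

import numpy as np

n_solids = 1000
hit_solid_ids = [12, 57, 901]                              # hypothetical solids to highlight
solid_hit = np.zeros(n_solids, dtype=bool)
solid_hit[hit_solid_ids] = True
colors = np.full(n_solids, 0xff0000, dtype=np.uint32)      # 0xRRGGBB red
gpu_geometry.color_solids(solid_hit, colors)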
Example #13
0
    def color_solids(self, solid_hit, colors, nblocks_per_thread=64,
                     max_blocks=1024):
        solid_hit_gpu = ga.to_gpu(np.array(solid_hit, dtype=np.bool))
        solid_colors_gpu = ga.to_gpu(np.array(colors, dtype=np.uint32))

        module = get_cu_module('mesh.h', options=cuda_options)
        color_solids = module.get_function('color_solids')

        for first_triangle, triangles_this_round, blocks in \
                chunk_iterator(self.triangles.size, nblocks_per_thread,
                               max_blocks):
            color_solids(np.int32(first_triangle),
                         np.int32(triangles_this_round), self.solid_id_map,
                         solid_hit_gpu, solid_colors_gpu, self.gpudata,
                         block=(nblocks_per_thread,1,1), 
                         grid=(blocks,1))
Example #14
0
    def compare_sampling(self, hist, reps=10):
        queue = cl.CommandQueue(self.context)

        # make cdf histogram
        nbins = hist.GetNbinsX()
        xaxis = hist.GetXaxis()
        intg = hist.GetIntegral()
        cdf_y = np.empty(nbins + 1, dtype=float)
        cdf_x = np.empty_like(cdf_y)

        cdf_x[0] = xaxis.GetBinLowEdge(1)
        cdf_y[0] = 0.0
        for i in xrange(1, len(cdf_x)):
            cdf_y[i] = intg[i]
            cdf_x[i] = xaxis.GetBinUpEdge(i)

        cdf_x_gpu = cl.array.to_device(queue, cdf_x.astype(np.float32))
        cdf_y_gpu = cl.array.to_device(queue, cdf_y.astype(np.float32))
        block = (self.nthreads_per_block, 1, 1)
        grid = (1, 1)
        out_gpu = cl.array.empty(queue,
                                 shape=self.nthreads_per_block,
                                 dtype=np.float32)

        out_h = rt.TH1D('out_h', '', hist.GetNbinsX(), xaxis.GetXmin(),
                        xaxis.GetXmax())
        out_h.SetLineColor(rt.kGreen)

        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(reps, self.nthreads_per_block, max_blocks=1):
            self.funcs.test_sample_cdf(queue, (elements_this_iter, 1, 1), None,
                                       self.rng_states.data,
                                       np.int32(len(cdf_x_gpu)),
                                       cdf_x_gpu.data, cdf_y_gpu.data,
                                       out_gpu.data)
            out = out_gpu.get()
            for v in out[:elements_this_iter]:
                out_h.Fill(v)

        prob = out_h.KolmogorovTest(hist)
        out_h.Write()
        return prob, out_h
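Host-side illustration of the CDF arrays built above, using plain numpy instead of a ROOT histogram: cdf_x holds the bin edges and cdf_y the cumulative integral up to each edge, starting at zero (the sample data is made up).

import numpy as np

counts, edges = np.histogram(np.random.normal(size=10000), bins=50)
cdf_x = edges.astype(np.float32)
cdf_y = np.concatenate([[0.0], np.cumsum(counts) / float(counts.sum())]).astype(np.float32)
assert len(cdf_x) == len(cdf_y) == 51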
Example #15
0
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
Example #16
0
def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
Example #17
0
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles),
                             dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices),
                            dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles),
                              Mapped(vertices),
                              world_origin,
                              world_scale,
                              nodes,
                              morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
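Sketch of the first stage of a BVH build using the function above: box every triangle, then order the leaves by their Morton codes so that spatially nearby triangles become adjacent (assumes `mesh` is a chroma.geometry.Mesh loaded elsewhere).

import numpy as np

world_coords, leaf_nodes, morton_codes = create_leaf_nodes(mesh, morton_bits=16)
order = np.argsort(morton_codes)   # spatial sort of the leaves
leaf_nodes = leaf_nodes[order]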
Example #18
0
    def __init__(self,
                 photons,
                 ncopies=1,
                 copy_flags=True,
                 copy_triangles=True,
                 copy_weights=True):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                           dtype=np.int32)
        if not copy_triangles:
            self.last_hit_triangles.fill(-1)
        if not copy_flags:
            self.flags = ga.zeros(shape=nphotons * ncopies, dtype=np.uint32)
        else:
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
        if not copy_weights:
            self.weights = ga.ones_like(self.last_hit_triangles,
                                        dtype=np.float32)
        else:
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
        self.evidx = ga.empty(shape=nphotons, dtype=np.uint32)

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        if copy_triangles:
            self.last_hit_triangles[:nphotons].set(
                photons.last_hit_triangles.astype(np.int32))
        if copy_flags:
            self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        if copy_weights:
            self.weights[:nphotons].set(photons.weights.astype(np.float32))
        self.evidx[:nphotons].set(photons.evidx.astype(np.uint32))

        module = get_cu_module('propagate.cu', options=cuda_options)
        self.gpu_funcs = GPUFuncs(module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                self.evidx,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
Example #19
0
    def get_hits(self, gpu_detector, target_flag=(0x1<<2), nthreads_per_block=64, max_blocks=1024,
                 start_photon=None, nphotons=None):
        '''Return a map from channel index to chroma.event.Photons containing
        only the photons that have a particular bit set in their history word
        and were detected by a channel.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(np.int32(start_photon+first_photon), 
                                         np.int32(photons_this_round),
                                         np.uint32(target_flag),
                                         self.flags,
                                         gpu_detector.solid_id_map,
                                         self.last_hit_triangles,
                                         gpu_detector.detector_gpu,
                                         index_counter_gpu,
                                         block=(nthreads_per_block,1,1), 
                                         grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])
        
        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(np.int32(start_photon+first_photon), 
                                            np.int32(photons_this_round), 
                                            np.uint32(target_flag),
                                            gpu_detector.solid_id_map,
                                            gpu_detector.detector_gpu,
                                            index_counter_gpu, 
                                            self.pos, self.dir, self.wavelengths, self.pol, self.t, self.flags, self.last_hit_triangles, self.weights,
                                            pos, dir, wavelengths, pol, t, flags, last_hit_triangles, weights, channels,
                                            block=(nthreads_per_block,1,1), 
                                            grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons
            
        pos = pos.get().view(np.float32).reshape((len(pos),3))
        dir = dir.get().view(np.float32).reshape((len(dir),3))
        pol = pol.get().view(np.float32).reshape((len(pol),3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        channels = channels.get()
        hitmap = {}
        for chan in np.unique(channels):
            mask = (channels == chan).astype(bool)
            hitmap[chan] = event.Photons(pos[mask], dir[mask], pol[mask], wavelengths[mask], t[mask], last_hit_triangles[mask], flags[mask], weights[mask])
        return hitmap
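Illustrative use of get_hits() after propagation, as a sketch assuming gpu_detector is the detector object used during the simulation and gpu_photons exposes the method above.

hitmap = gpu_photons.get_hits(gpu_detector)
for channel_id in sorted(hitmap):
    print('channel %d: %d hit photons' % (channel_id, len(hitmap[channel_id].t)))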
Example #20
0
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n/2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n/float(nthreads_per_block))), dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n/2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block,1,1),
                                    grid=(nblocks_this_iter,1))
                
            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (i*2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192*50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      Mapped(flag),
                                      block=(nthreads_per_block,1,1),
                                      grid=(nblocks_this_iter, skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index+1), np.uint32(better_i),
                       nodes, block=(1,1,1), grid=(1,1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block,1,1),
                            grid=(nblocks_this_iter,1))
        
    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()
Example #21
0
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin, 
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)
    
    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), 
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block, 
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block,1,1),
                              grid=(nblocks_this_iter,1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
Example #22
0
def merge_nodes(nodes, degree, max_ratio=None):
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)
    
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  # was: round_up_to_multiple(nparent, degree)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block,1,1),
                               grid=(nblocks_this_iter,1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index+nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area/parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple((degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index+nchild:] = new_parent_nodes[index+1:-nchild+1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][new_parent_index] = tmp_nchild << CHILD_BITS | (tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)


            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
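
merge_nodes relies on the packed 'w' field of each uint4 node: the child count is stored in the bits above CHILD_BITS and the index of the first child in the bits below, which is why the code shifts by CHILD_BITS and masks with ~NCHILD_MASK. A small sketch with illustrative constants (the real values come from the chroma BVH module, so treat these numbers as assumptions):

CHILD_BITS = 28                    # assumed split point
NCHILD_MASK = 0xF << CHILD_BITS    # assumed mask covering the count bits

def pack_w(nchild, first_child_index):
    # Pack a child count and the index of the first child into one word.
    return (nchild << CHILD_BITS) | first_child_index

def unpack_w(w):
    # Recover (nchild, first_child_index), mirroring the shifts/masks above.
    return w >> CHILD_BITS, w & ~NCHILD_MASK

w = pack_w(3, 1024)
assert unpack_w(w) == (3, 1024)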
Example #23
0
    def get_flat_hits(self,
                      gpu_detector,
                      target_flag=(0x1 << 2),
                      nthreads_per_block=64,
                      max_blocks=1024,
                      start_photon=None,
                      nphotons=None,
                      no_map=False):
        '''Return an ``event.Photons`` object containing only photons that
        have a particular bit set in their history word and were detected by
        a channel.'''
        cuda.Context.get_current().synchronize()
        index_counter_gpu = ga.zeros(shape=1, dtype=np.uint32)
        cuda.Context.get_current().synchronize()
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = self.pos.size - start_photon

        # First count how much space we need
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photon_hits(np.int32(start_photon +
                                                      first_photon),
                                             np.int32(photons_this_round),
                                             np.uint32(target_flag),
                                             self.flags,
                                             gpu_detector.solid_id_map,
                                             self.last_hit_triangles,
                                             gpu_detector.detector_gpu,
                                             index_counter_gpu,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
        cuda.Context.get_current().synchronize()
        reduced_nphotons = int(index_counter_gpu.get()[0])

        # Then allocate new storage space
        pos = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        dir = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        pol = ga.empty(shape=reduced_nphotons, dtype=ga.vec.float3)
        wavelengths = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        t = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        last_hit_triangles = ga.empty(shape=reduced_nphotons, dtype=np.int32)
        flags = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        weights = ga.empty(shape=reduced_nphotons, dtype=np.float32)
        evidx = ga.empty(shape=reduced_nphotons, dtype=np.uint32)
        channels = ga.empty(shape=reduced_nphotons, dtype=np.int32)

        # And finally copy hits, if there are any
        if reduced_nphotons > 0:
            index_counter_gpu.fill(0)
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.copy_photon_hits(
                    np.int32(start_photon + first_photon),
                    np.int32(photons_this_round),
                    np.uint32(target_flag),
                    gpu_detector.solid_id_map,
                    gpu_detector.detector_gpu,
                    index_counter_gpu,
                    self.pos,
                    self.dir,
                    self.wavelengths,
                    self.pol,
                    self.t,
                    self.flags,
                    self.last_hit_triangles,
                    self.weights,
                    self.evidx,
                    pos,
                    dir,
                    wavelengths,
                    pol,
                    t,
                    flags,
                    last_hit_triangles,
                    weights,
                    evidx,
                    channels,
                    block=(nthreads_per_block, 1, 1),
                    grid=(blocks, 1))
            assert index_counter_gpu.get()[0] == reduced_nphotons

        pos = pos.get().view(np.float32).reshape((len(pos), 3))
        dir = dir.get().view(np.float32).reshape((len(dir), 3))
        pol = pol.get().view(np.float32).reshape((len(pol), 3))
        wavelengths = wavelengths.get()
        t = t.get()
        last_hit_triangles = last_hit_triangles.get()
        flags = flags.get()
        weights = weights.get()
        evidx = evidx.get()
        channels = channels.get()
        hitmap = {}
        return event.Photons(pos, dir, pol, wavelengths, t, last_hit_triangles,
                             flags, weights, evidx, channels)
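
A hypothetical call site for the method above; ``gpu_photons`` and ``gpu_detector`` are assumed to already live on the device, and the attribute names on the returned object are assumed from the ``event.Photons`` constructor call:

hits = gpu_photons.get_flat_hits(gpu_detector, target_flag=(0x1 << 2))
print(len(hits.pos), 'photons were detected by a channel')
print(hits.channels[:10])  # channel index of each detected photon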
Example #24
0
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` bits per axis;
        ``morton_bits`` must be <= 16.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory
    # unfortunately, opencl and cuda have different methods for managing memory here
    # we have to write divergent code
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print(first_index, elements_this_iter, nblocks_this_iter)
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
Example #25
0
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  track=False):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        step = 0
        input_queue = np.empty(shape=nphotons + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in range(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        input_queue_gpu = ga.to_gpu(input_queue)
        output_queue = np.zeros(shape=nphotons + 1, dtype=np.uint32)
        output_queue[0] = 1
        output_queue_gpu = ga.to_gpu(output_queue)

        if track:
            step_photon_ids = []
            step_photons = []
            #save the first step for all photons in the input queue
            step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
            step_photons.append(
                self.copy_queue(input_queue_gpu[1:], nphotons).get())

        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low and not tracking
            if not track and (nphotons < nthreads_per_block * 16 * 8
                              or use_weights):
                nsteps = max_steps - step
            else:
                nsteps = 1

            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.propagate(np.int32(first_photon),
                                         np.int32(photons_this_round),
                                         input_queue_gpu[1:],
                                         output_queue_gpu,
                                         rng_states,
                                         self.pos,
                                         self.dir,
                                         self.wavelengths,
                                         self.pol,
                                         self.t,
                                         self.flags,
                                         self.last_hit_triangles,
                                         self.weights,
                                         self.evidx,
                                         np.int32(nsteps),
                                         np.int32(use_weights),
                                         np.int32(scatter_first),
                                         gpu_geometry.gpudata,
                                         block=(nthreads_per_block, 1, 1),
                                         grid=(blocks, 1))

            if track:  #save the next step for all photons in the input queue
                step_photon_ids.append(input_queue_gpu[1:nphotons + 1].get())
                step_photons.append(
                    self.copy_queue(input_queue_gpu[1:], nphotons).get())

            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass

            if step < max_steps:
                temp = input_queue_gpu
                input_queue_gpu = output_queue_gpu
                output_queue_gpu = temp
                # Assign with a numpy array of length 1 to silence
                # warning from PyCUDA about setting array with different strides/storage orders.
                output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                nphotons = input_queue_gpu[:1].get()[0] - 1
                if nphotons == 0:
                    break

        if ga.max(self.flags).get() & (1 << 31):
            print("WARNING: ABORTED PHOTONS", file=sys.stderr)
        cuda.Context.get_current().synchronize()

        if track:
            return step_photon_ids, step_photons
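
A sketch of a call that satisfies the warning above: the RNG state array is sized to nthreads_per_block * max_blocks before propagation. ``get_rng_states`` is the helper used elsewhere in this code base; ``gpu_photons`` and ``gpu_geometry`` are assumed to already exist on the device:

nthreads_per_block, max_blocks = 64, 1024
rng_states = get_rng_states(nthreads_per_block * max_blocks, seed=12345)
gpu_photons.propagate(gpu_geometry, rng_states,
                      nthreads_per_block=nthreads_per_block,
                      max_blocks=max_blocks,
                      max_steps=100)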
Example #26
0
def merge_nodes(nodes, degree, max_ratio=None):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree)
    gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        bvh_funcs.make_parents(np.uint32(first_index),
                               np.uint32(elements_this_iter),
                               np.uint32(degree),
                               gpu_parent_nodes,
                               cuda.In(nodes),
                               np.uint32(0),
                               np.uint32(len(nodes)),
                               block=(nthreads_per_block, 1, 1),
                               grid=(nblocks_this_iter, 1))

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
Example #27
0
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Load Memory
    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run Kernel
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes,
                                            gpu_parent_nodes,
                                            gpu_first_child,
                                            gpu_nchild,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(nblocks_this_iter, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1),
                                            None, np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes.data,
                                            gpu_parent_nodes.data,
                                            gpu_first_child.data,
                                            gpu_nchild.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return gpu_parent_nodes.get()
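
Illustrative inputs for the function above (not taken from the library): eight child nodes grouped into three parents owning 3, 3 and 2 consecutive children. first_child[i] is the index of parent i's first child in ``nodes``; nchild[i] is how many children it owns:

import numpy as np
first_child = np.array([0, 3, 6], dtype=np.int32)
nchild = np.array([3, 3, 2], dtype=np.int32)
# parent_nodes = merge_nodes_detailed(nodes, first_child, nchild)  # -> 3 parents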
Example #28
0
    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        """run UBooNE DAQ acquire kernels"""
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            clmaxblocks = max_blocks

        # We loop over all photons and bin them essentially
        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq(rng_states,
                                           np.uint32(event.SURFACE_DETECT),
                                           np.int32(start_photon +
                                                    first_photon),
                                           np.int32(photons_this_round),
                                           gpuphotons.t,
                                           gpuphotons.flags,
                                           gpuphotons.last_hit_triangles,
                                           gpuphotons.weights,
                                           self.solid_id_map_gpu,
                                           self.detector_gpu,
                                           self.adc_gpu,
                                           np.int32(self.nchannels),
                                           np.int32(self.ntdcs),
                                           np.float32(self.ns_per_tdc),
                                           np.float32(100.0),
                                           self.channel_history_gpu,
                                           np.float32(weight),
                                           block=(nthreads_per_block, 1, 1),
                                           grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq(
                        comqueue,
                        (photons_this_round, 1, 1),
                        None,
                        rng_states.data,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(nphotons),
                        gpuphotons.t.data,
                        gpuphotons.pos.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu.data,
                        # -- Detector struct --
                        self.solid_id_to_channel_index_gpu.data,
                        # ---------------------
                        self.uint_adc_gpu.data,
                        np.int32(self.nchannels),
                        np.int32(self.ntdcs),
                        np.float32(self.ns_per_tdc),
                        np.float32(100.0),
                        self.channel_history_gpu.data,
                        # -- Channel transforms --
                        self.channel_inverse_rot_gpu.data,
                        self.channel_inverse_trans_gpu.data,
                        # ------------------------
                        np.float32(weight),
                        g_times_l=False).wait()
            # if opencl, need to convert ADC from uint to float
            if api.is_gpu_api_opencl():
                self.gpu_funcs.convert_adc(comqueue,
                                           (int(self.nchannels), 1, 1),
                                           None,
                                           self.uint_adc_gpu.data,
                                           self.adc_gpu.data,
                                           np.int32(self.nchannels),
                                           np.int32(self.ntdcs),
                                           g_times_l=False).wait()

        else:
            raise RuntimeError("Multi-DAQ not built")
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq_many(
                        rng_states,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t,
                        gpuphotons.flags,
                        gpuphotons.last_hit_triangles,
                        gpuphotons.weights,
                        self.solid_id_map_gpu,
                        self.detector_gpu,
                        self.earliest_time_int_gpu,
                        self.channel_q_int_gpu,
                        self.channel_history_gpu,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        block=(nthreads_per_block, 1, 1),
                        grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq_many(
                        comqueue,
                        (nthreads_per_block, 1, 1),
                        (blocks, 1),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu,
                        # -- Detector Struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        g_times_l=True).wait()
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)
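
A rough CPU sketch (an assumption about what the kernel does, not its actual source) of the per-photon binning the UBooNE run_daq kernel performs: a detected photon's hit time is quantized into a TDC bin and its weight accumulated into that channel's ADC waveform. All sizes here are illustrative:

import numpy as np
nchannels, ntdcs, ns_per_tdc = 32, 1000, 0.5
adc = np.zeros((nchannels, ntdcs), dtype=np.float32)

def bin_hit(channel, time_ns, weight=1.0):
    # Quantize the hit time and accumulate the photon weight in that bin.
    tdc = int(time_ns / ns_per_tdc)
    if 0 <= tdc < ntdcs:
        adc[channel, tdc] += weight

bin_hit(channel=3, time_ns=127.4)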
Example #29
0
def merge_nodes(nodes, degree, max_ratio=None):
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    nparent = len(nodes) // degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print('Extra slots:', extra_slots)
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in range(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print('old: %e' % node_areas(parent_nodes).max())
        print('new: %e' % node_areas(new_parent_nodes).max())
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
Example #30
0
    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            clmaxblocks = max_blocks

        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq(rng_states,
                                           np.uint32(0x1 << 2),
                                           np.int32(start_photon +
                                                    first_photon),
                                           np.int32(photons_this_round),
                                           gpuphotons.t,
                                           gpuphotons.flags,
                                           gpuphotons.last_hit_triangles,
                                           gpuphotons.weights,
                                           self.solid_id_map_gpu,
                                           self.detector_gpu,
                                           self.earliest_time_int_gpu,
                                           self.channel_q_int_gpu,
                                           self.channel_history_gpu,
                                           np.float32(weight),
                                           block=(nthreads_per_block, 1, 1),
                                           grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    #print "daq: ",start_photon,first_photon,start_photon+first_photon,(photons_this_round/nthreads_per_block,1,1), (nthreads_per_block,1,1)
                    self.gpu_funcs.run_daq(
                        comqueue,
                        (photons_this_round // nthreads_per_block, 1, 1),
                        (nthreads_per_block, 1, 1),
                        rng_states.data,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu.data,
                        # -- Detector struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.float32(weight),
                        g_times_l=True).wait()

        else:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq_many(
                        rng_states,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t,
                        gpuphotons.flags,
                        gpuphotons.last_hit_triangles,
                        gpuphotons.weights,
                        self.solid_id_map_gpu,
                        self.detector_gpu,
                        self.earliest_time_int_gpu,
                        self.channel_q_int_gpu,
                        self.channel_history_gpu,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        block=(nthreads_per_block, 1, 1),
                        grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.run_daq_many(
                        comqueue,
                        (nthreads_per_block, 1, 1),
                        (blocks, 1),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu,
                        # -- Detector Struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        g_times_l=True).wait()
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)
Example #31
0
def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n // 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n // 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n // 2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print('Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum()))
            print('Skips: %d, Swaps: %d' % (skips, swaps))

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      cutools.Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n // 2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print('Final area of parent layer: %1.12e' % areas_host.sum())
    print('Skips: %d, Swaps: %d' % (skips, swaps))

    return nodes.get()
Example #32
0
    def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1,
                 seed=None, cl_context=None):
        """
        Generates photons from information in the steps_arr
        
        Parameters
        ----------
        steps_arr : numpy.array with shape=(N,11) dtype=np.float
           contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constant, slow_time_constant ]
           per step.  In the future this could be generalized to many different
           time components.  Developed for liquid argon TPCs.
        multiple : float
           scale up the number of photons generated (not implemented yet)
        """
        self.steps_array = steps_arr
        self.nsteps = self.steps_array.shape[0]
        if multiple!=1.0:
            raise RuntimeError('Have not implemented scaling of the number of photons generated.')

        # ===========================
        # GEN PHOTONS
        tstart_genphotons =  time.time()
        # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here)
        # on the CPU, we scan the steps to determine the total number of photons
        # we assume the user has seeded the random number generator to her liking
        tstart_nphotons = time.time()
        self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 )
        #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int )
        self.nphotons_per_step = self.steps_array[:, self._nphotons].astype(np.int64)
        self.nphotons = int(self.nphotons_per_step.sum())
        print("NSTEPS: ", self.nsteps)
        print("NPHOTONS: ", self.nphotons, " (time to determine per step=", time.time() - tstart_nphotons, ")")
        # now we make an index array for which step we need to get info from
        self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 )
        current_index=0
        for n, n_per_step in enumerate( self.nphotons_per_step ):
            self.source_step_index[current_index:current_index+n_per_step] = n
            current_index += n_per_step
        # push everything to the GPU
        tstart_transfer = time.time()
        if api.is_gpu_api_cuda():
            # step info
            self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio )
            self.source_step_index_gpu = ga.to_gpu( self.source_step_index )
            # photon info
            self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) )
            self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
        elif api.is_gpu_api_opencl():
            cl_queue = cl.CommandQueue( cl_context )
            # step info
            self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu  = ga.to_device( cl_queue, self.step_fsratio )
            self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index )
            # photon info
            self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
        
        self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) )
        self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) )
        self.t.set( self.steps_array[:,3] )
        self.ncopies = ncopies
        self.true_nphotons = self.nphotons

        if self.ncopies != 1:
            raise ValueError('Multiple copies are not supported yet.')

        if api.is_gpu_api_cuda():
            self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True )
        elif api.is_gpu_api_opencl():
            self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True )
        self.gpufuncs = GPUFuncs( self.gpumod )
        print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer

        # need random numbers
        tgpu = time.time()
        if seed is None:
            seed = 5
        rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context)
        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                                    self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states,
                                                    self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights,
                                                    block=(nthreads_per_block,1,1), grid=(blocks, 1) )
            elif api.is_gpu_api_opencl():
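                # pyopencl kernel launch convention: (queue, global_size, local_size, *args).
                # local_size=None lets the runtime choose the work-group size, and with
                # g_times_l=False the global size is the total work-item count, i.e. one
                # work-item per photon in this chunk.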
                self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None,
                                                    np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                                    self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states.data,
                                                    self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, 
                                                    self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait()
                                                    
            else:
                raise RuntimeError("GPU API is neither CUDA nor OpenCL!")
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        tend_genphotons =  time.time()
        print "GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")"

        # Now load modules
        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
        elif  api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)
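
# Not from the source repository: a minimal sketch, assuming the chunk_iterator
# used above (and in propagate below) yields (first_element, elements_this_round,
# blocks) tuples that cap each kernel launch at nthreads_per_block*max_blocks
# threads.
def chunk_iterator_sketch(nelements, nthreads_per_block=64, max_blocks=1024):
    first = 0
    while first < nelements:
        elements_left = nelements - first
        blocks = (elements_left + nthreads_per_block - 1) // nthreads_per_block
        if blocks > max_blocks:
            blocks = max_blocks
        elements_this_round = min(elements_left, blocks * nthreads_per_block)
        yield first, elements_this_round, blocks
        first += elements_this_round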
Example #33
0
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        # bind node texture reference
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support binding vector types to texture references right now
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
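            # set_address() binds each flattened array to its texture reference by
            # raw device pointer and byte count, i.e. a linear texture over memory
            # already resident on the GPU (no extra copy).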
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
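        # Queue convention: slot 0 holds the fill counter, slots 1..maxqueue hold
        # indices of photons still in flight; each step the kernel appends the
        # indices of unfinished photons to the output queue.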
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize()  # ensure all threads are done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # New style: copy the output queue back to the host, read how
                    # many photons were re-queued (counter in slot 0, minus one),
                    # promote it to the new input queue, and reset the output
                    # counter to 1 for the next step.
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))

                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                if nphotons == 0:
                    break

        end_prop = time.time()
        print "propagation time: ", end_prop - start_prop, " secs"
        end_flags = self.flags.get()
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print >> sys.stderr, "WARNING: ABORTED PHOTONS"
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)
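
# Illustrative call pattern (gpu_photons, gpu_geometry and cl_context are
# assumed to exist already; they are not defined in this excerpt). The docstring
# above requires rng_states to hold at least nthreads_per_block*max_blocks
# generator states, so the two are sized together here.
nthreads_per_block = 64
max_blocks = 1024
rng_states = get_rng_states(nthreads_per_block * max_blocks, seed=5,
                            cl_context=cl_context)
gpu_photons.propagate(gpu_geometry, rng_states,
                      nthreads_per_block=nthreads_per_block,
                      max_blocks=max_blocks,
                      max_steps=10,
                      cl_context=cl_context)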
Example #34
0
    def propagate_hit(self, gpu_geometry, rng_states, parameters):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        .. warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.


        One aborted photon was observed in a run::

             In [1]: a = ph("hhMOCK")

             In [9]: f = a[:,3,2].view(np.uint32)

             In [12]: np.where( f & 1<<31 )
             Out[12]: (array([279]),)

        Attempting to mock just that one photon failed to reproduce it::

              RANGE=279:280 MockNuWa MOCK 


        """
        nphotons = self.pos.size
        nwork = nphotons

        nthreads_per_block = parameters['threads_per_block']
        max_blocks = parameters['max_blocks']
        max_steps = parameters['max_steps']
        use_weights = False
        scatter_first = 0

        self.upload_queues(nwork)

        solid_id_map_gpu = gpu_geometry.solid_id_map
        solid_id_to_channel_id_gpu = gpu_geometry.solid_id_to_channel_id_gpu

        small_remainder = nthreads_per_block * 16 * 8
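        # Once fewer than this many photons remain, the loop below stops
        # single-stepping and finishes all remaining steps in one launch.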
        block = (nthreads_per_block, 1, 1)

        results = {}
        results['name'] = "propagate_hit"
        results['nphotons'] = nphotons
        results['nwork'] = nwork
        results['nsmall'] = small_remainder
        results['COLUMNS'] = "name:s,nphotons:i,nwork:i,nsmall:i"

        step = 0
        times = []

        npass = 0
        nabort = 0

        while step < max_steps:
            npass += 1
            if nwork < small_remainder or use_weights:
                nsteps = max_steps - step  # Just finish the rest of the steps if the # of photons is low
                log.debug(
                    "increase nsteps for stragglers: small_remainder %s nwork %s nsteps %s max_steps %s "
                    % (small_remainder, nwork, nsteps, max_steps))
            else:
                nsteps = 1
            pass
            log.info("nphotons %s nwork %s step %s max_steps %s nsteps %s " %
                     (nphotons, nwork, step, max_steps, nsteps))

            abort = False
            for first_photon, photons_this_round, blocks in chunk_iterator(
                    nwork, nthreads_per_block, max_blocks):
                if abort:
                    nabort += 1
                else:
                    grid = (blocks, 1)
                    args = (
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        self.input_queue_gpu[1:].gpudata,
                        self.output_queue_gpu.gpudata,
                        rng_states,
                        self.pos.gpudata,
                        self.dir.gpudata,
                        self.wavelengths.gpudata,
                        self.pol.gpudata,
                        self.t.gpudata,
                        self.flags.gpudata,
                        self.last_hit_triangles.gpudata,
                        self.weights.gpudata,
                        np.int32(nsteps),
                        np.int32(use_weights),
                        np.int32(scatter_first),
                        gpu_geometry.gpudata,
                        solid_id_map_gpu.gpudata,
                        solid_id_to_channel_id_gpu.gpudata,
                    )

                    log.info(
                        "propagate_hit_kernel.prepared_timed_call grid %s block %s first_photon %s photons_this_round %s "
                        % (repr(grid), repr(block), first_photon,
                           photons_this_round))
                    get_time = self.propagate_hit_kernel.prepared_timed_call(
                        grid, block, *args)
                    t = get_time()
                    times.append(t)
                    if t > self.max_time:
                        abort = True
                        log.warn(
                            "kernel launch time %s > max_time %s : ABORTING " %
                            (t, self.max_time))
                    pass
                pass
            pass
            log.info("step %s propagate_hit_kernel times  %s " %
                     (step, repr(times)))
            pass
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            if step < max_steps:
                nwork = self.swap_queues()
            pass
        pass

        log.info("calling max ")
        if ga.max(self.flags).get() & (1 << 31):
            log.warn("ABORTED PHOTONS")
        log.info("done calling max ")

        cuda.Context.get_current().synchronize()

        results['npass'] = npass
        results['nabort'] = nabort
        results['nlaunch'] = len(times)
        results['tottime'] = sum(times)
        results['maxtime'] = max(times)
        results['mintime'] = min(times)
        results['COLUMNS'] += ",npass:i,nabort:i,nlaunch:i,tottime:f,maxtime:f,mintime:f"
        return results
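
# Not from the source: a minimal sketch of how the results dict and its COLUMNS
# spec ("key:typecode" pairs, s=string, i=int, f=float) could be flattened into
# a typed row for logging or tabulation.
def results_row(results):
    casts = {'s': str, 'i': int, 'f': float}
    row = []
    for column in results['COLUMNS'].split(','):
        name, code = column.split(':')
        row.append(casts[code](results[name]))
    return row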