Python GPUFuncs Examples

Programming Language: Python

Namespace/Package Name: chroma.gpu.gpufuncs

Class/Type: GPUFuncs

Examples at hotexamples.com: 25

Python GPUFuncs - 25 examples found. These are the top-rated real-world Python examples of chroma.gpu.gpufuncs.GPUFuncs, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

GPUFuncs(19)

run_daq_many(2)

run_daq(2)

checknode(2)

get_earliest_hit_time(1)

test_sample_cdf(1)

swap(1)

reset_earliest_time_int(1)

propagate(1)

photon_duplicate(1)

pair_area(1)

min_distance_to(1)

make_parents_detailed(1)

make_parents(1)

make_leaves(1)

make_geostruct(1)

gen_photon_from_step(1)

accumulate_bincount(1)

fillArray(1)

count_photons(1)

copy_photons(1)

copy_and_offset(1)

convert_sortable_int_to_float(1)

convert_charge_int_to_float(1)

convert_adc(1)

collapse_child(1)

bin_hits(1)

area_sort_child(1)

accumulate_nearest_neighbor_block(1)

accumulate_moments(1)

accumulate_kernel_eval(1)

test_texture(1)

Example #1

Show file

    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """Configure the LAr1ND DAQ and load its GPU kernel module.

        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel
            if not supplied, the class variable GPUDaqLAr1ND.NTDC is used
          ns_per_tdc: float
            nanoseconds per time bin
            if not supplied, the class variable GPUDaqLAr1ND.NS_PER_TDC
            is used
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError: when ntdcs or ns_per_tdc is still None after the
            class-level defaults have been applied
          RuntimeError: when the active GPU API is neither CUDA nor OpenCL
        """
        # Fall back to the class-level defaults only when the caller gave no
        # value.  An explicit argument must also be stored on the instance,
        # otherwise the attribute reads below fail.
        self.ntdcs = GPUDaqLAr1ND.NTDC if ntdcs is None else ntdcs
        self.ns_per_tdc = (GPUDaqLAr1ND.NS_PER_TDC
                           if ns_per_tdc is None else ns_per_tdc)
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)
        if self.ntdcs is None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc is None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        # The kernel source sits next to this file; only the extension
        # differs between the CUDA and OpenCL builds.
        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.gpu_funcs = GPUFuncs(self.module)

Example #2

Show file

 def __init__(self, cl_context=None):
     """Load the ``pdf`` kernel module for the active GPU API.

     Args:
         cl_context: passed to ``cltools.get_cl_module`` when the OpenCL
             API is active; not used in the CUDA branch.

     Raises:
         RuntimeError: if the active GPU API is neither CUDA nor OpenCL.
     """
     if api.is_gpu_api_cuda():
         self.module = cutools.get_cu_module('pdf.cu',
                                             options=api_options,
                                             include_source_directory=True)
     elif api.is_gpu_api_opencl():
         self.module = cltools.get_cl_module('pdf.cl',
                                             cl_context,
                                             options=api_options,
                                             include_source_directory=True)
     else:
         # Without this branch self.module would be left unset and the line
         # below would fail with an unrelated-looking AttributeError.
         raise RuntimeError("GPU API is neither CUDA nor OpenCL")
     self.gpu_funcs = GPUFuncs(self.module)

Example #3

Show file

 def setUp(self):
     """Build the per-test fixture.

     Stores the most recent OpenCL context, compiles
     ``test_sample_cdf.cl`` with the current directory added to the
     include path, creates one RNG state per thread of a block and opens
     the ROOT output file.
     """
     ctx = cltools.get_last_context()
     self.context = ctx

     block_size = 256
     self.nthreads_per_block = block_size

     build_options = ('-I.', ) + api_options
     self.myoptions = build_options

     compiled = get_module("test_sample_cdf.cl",
                           ctx,
                           options=build_options,
                           include_source_directory=True)
     self.mod = compiled
     self.funcs = GPUFuncs(compiled)

     self.rng_states = clrand.get_rng_states(ctx, block_size)
     self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")

Example #4

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def area_sort_nodes(gpu_geometry, layer_bounds):
    """Run the ``area_sort_child`` kernel over consecutive layer ranges.

    Args:
        gpu_geometry: geometry object passed straight to the kernel; its
            ``nodes`` array is read back at the end.
        layer_bounds: sequence of indices; each adjacent pair forms one
            ``(start, end)`` range.

    Returns:
        The result of ``gpu_geometry.nodes.get()`` after all launches.
    """
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # Pair adjacent bounds into (start, end) ranges and drop the final range.
    # list() is needed because zip() returns a non-subscriptable iterator on
    # Python 3; on Python 2 it is a harmless copy.
    bounds = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    nthreads_per_block = 256
    # Ranges are processed last-to-first.
    for start, end in reversed(bounds):
        bvh_funcs.area_sort_child(np.uint32(start),
                                  np.uint32(end),
                                  gpu_geometry,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(120, 1))
    return gpu_geometry.nodes.get()

Example #5

Show file

def fill_array(context, rng_states, size):
    """Fill a float32 device array using the ``fillArray`` kernel.

    The work is split with ``chunk_iterator`` (256 threads per block, one
    block at most per chunk) and the finished array is copied back to the
    host.

    Args:
        context: OpenCL context used for the command queue and the build.
        rng_states: object whose ``.data`` buffer is handed to the kernel.
        size: number of elements to allocate and fill.

    Returns:
        The host copy of the filled array.
    """
    threads_per_block = 256
    cmd_queue = cl.CommandQueue(context)
    device_out = cl.array.empty(cmd_queue, size, dtype=np.float32)
    kernels = GPUFuncs(
        get_cl_module("random.cl",
                      context,
                      options=cl_options,
                      include_source_directory=True))

    chunks = chunk_iterator(size, threads_per_block, max_blocks=1)
    # The block count yielded by chunk_iterator is not needed here: the
    # launch size is the element count of the chunk.
    for offset, count, _nblocks in chunks:
        global_size = (count, 1, 1)
        kernels.fillArray(cmd_queue, global_size, None, np.uint32(offset),
                          rng_states.data, device_out.data)
    return device_out.get()

Example #6

Show file

File: daq.py Project: NuTufts/ChromaUBooNE

    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        """Allocate the per-channel DAQ buffers and load the DAQ kernels.

        Every buffer holds ``gpu_detector.nchannels * ndaq`` entries.

        Args:
            gpu_detector: detector object providing ``nchannels``,
                ``solid_id_map`` and ``solid_id_to_channel_index_gpu``
                (and ``detector_gpu`` in CUDA mode).
            ndaq: number of DAQ copies to allocate space for.
            cl_context: OpenCL context used to build ``daq.cl``.
            cl_queue: OpenCL queue the buffers are allocated on.

        Raises:
            RuntimeError: if the GPU API is neither CUDA nor OpenCL.
        """
        use_cuda = api.is_gpu_api_cuda()
        if not use_cuda and not api.is_gpu_api_opencl():
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        nslots = gpu_detector.nchannels * ndaq
        if use_cuda:
            self.earliest_time_gpu = ga.empty(nslots, dtype=np.float32)
            time_int = ga.empty(nslots, dtype=np.uint32)
            self.earliest_time_int_gpu = time_int
            # The remaining buffers take their size from the integer
            # time buffer.
            self.channel_history_gpu = ga.zeros_like(time_int)
            self.channel_q_int_gpu = ga.zeros_like(time_int)
            self.channel_q_gpu = ga.zeros(len(time_int), dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        else:
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              nslots,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  nslots,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                nslots,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              nslots,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue, nslots, dtype=np.float32)
            # No device-side struct exists in OpenCL mode, so the detector
            # object itself is kept.
            self.detector_gpu = gpu_detector
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = \
            gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        self.stride = gpu_detector.nchannels

Example #7

Show file

File: photon.py Project: NuTufts/ChromaUBooNE

    def __init__(self, pos, dir, pol, wavelengths, t, last_hit_triangles,
                 flags, weights):
        """Wrap slices of the GPUArrays held by a GPUPhotons instance.

        The arguments are device arrays, NOT CPU arrays; they are stored
        as given.  The ``propagate.cu`` module is loaded so the slice has
        its own kernel accessor, and the slice is treated as a single
        copy whose photon count is ``len(pos)``.
        """
        # Geometry of each photon.
        self.pos, self.dir, self.pol = pos, dir, pol
        # Scalar per-photon properties.
        self.wavelengths, self.t = wavelengths, t
        # History and bookkeeping.
        self.last_hit_triangles = last_hit_triangles
        self.flags, self.weights = flags, weights

        self.gpu_funcs = GPUFuncs(
            get_cu_module('propagate.cu', options=cuda_options))

        self.true_nphotons = len(pos)
        self.ncopies = 1

Example #8

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def collapse_chains(nodes, layer_bounds):
    """Run the ``collapse_child`` kernel over consecutive layer ranges.

    Args:
        nodes: host array of nodes; uploaded to the device, modified by the
            kernel and read back.
        layer_bounds: sequence of indices; each adjacent pair forms one
            ``(start, end)`` range.

    Returns:
        The host copy of the node array after all launches.

    Raises:
        RuntimeError: if the GPU API is neither CUDA nor OpenCL.
    """
    # Query the API once; past the check below, "not CUDA" means OpenCL.
    use_cuda = gpuapi.is_gpu_api_cuda()
    if use_cuda:
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA or OpenCL')

    bvh_funcs = GPUFuncs(bvh_module)

    if use_cuda:
        gpu_nodes = ga.to_gpu(nodes)
    else:
        gpu_nodes = ga.to_device(queue, nodes)

    # Pair adjacent bounds into (start, end) ranges and drop the final range.
    # list() is needed because zip() returns a non-subscriptable iterator on
    # Python 3; on Python 2 it is a harmless copy.
    bounds = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]
    nthreads_per_block = 256
    # Ranges are processed last-to-first.
    for start, end in reversed(bounds):
        if use_cuda:
            bvh_funcs.collapse_child(np.uint32(start),
                                     np.uint32(end),
                                     gpu_nodes,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(120, 1))
        else:
            # OpenCL launch: one work item per element of the range; wait
            # for completion before launching the next range.
            bvh_funcs.collapse_child(queue, (end - start, 1, 1), None,
                                     np.uint32(start), np.uint32(end),
                                     gpu_nodes.data).wait()

    return gpu_nodes.get()

Example #9

Show file

File: photon.py Project: NuTufts/ChromaUBooNE

class GPUPhotons(object):
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
               - cl_context: *optional*
                   Used to create the command queue and to build the
                   kernel module when the OpenCL API is active; not
                   used in the CUDA branch.

           Raises:
               RuntimeError: if the active GPU API is neither CUDA
                   nor OpenCL.
        """
        nphotons = len(photons)
        nslots = nphotons * ncopies

        # The CUDA and OpenCL allocators differ only in the leading queue
        # argument, so capture that difference once.
        if api.is_gpu_api_cuda():
            use_cuda = True
            queue_args = ()
        elif api.is_gpu_api_opencl():
            use_cuda = False
            queue_args = (cl.CommandQueue(cl_context), )
        else:
            # Previously this case fell through and failed later with an
            # AttributeError on self.pos.
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        def _empty(dtype):
            # Uninitialised device array with one slot per photon copy.
            return ga.empty(*queue_args, shape=nslots, dtype=dtype)

        # Allocate GPU memory for photon info
        self.pos = _empty(ga.vec.float3)
        self.dir = _empty(ga.vec.float3)
        self.pol = _empty(ga.vec.float3)
        self.wavelengths = _empty(np.float32)
        self.t = _empty(np.float32)
        self.last_hit_triangles = _empty(np.int32)
        self.flags = _empty(np.uint32)
        self.weights = _empty(np.float32)
        self.current_node_index = ga.zeros(*queue_args,
                                           shape=nslots,
                                           dtype=np.uint32)  # deprecated
        self.requested_workcode = _empty(np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if use_cuda:
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        else:
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            # NOTE(review): this launch uses the CUDA-style block=/grid=
            # keywords in both API modes, whereas propagate() launches its
            # OpenCL kernel with (queue, global_size, local_size, ...).
            # Confirm that ncopies > 1 is supported under OpenCL.
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

    def define_texture_references(self, module=None):
        """Fetch and configure the texture references used by propagate.

        In CUDA mode this looks up four texture references on ``module``,
        sets each to a 4-component format, stores them on ``self`` and
        marks them as not yet bound (``node_texture_ref_bound = False``).
        In OpenCL mode it does nothing.

        Args:
            module: compiled module to query; ``self.module`` is used when
                this is None.
        """
        # unbound texture references declared for use with propagate
        # Identity test: ``== None`` would dispatch to the module object's
        # own comparison operator.
        if module is None:
            module = self.module
        if api.is_gpu_api_cuda():
            self.node_texture_ref = module.get_texref("nodevec_tex_ref")
            self.node_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32,
                                             4)

            self.extra_node_texture_ref = module.get_texref(
                "extra_node_tex_ref")
            self.extra_node_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            self.vertices_texture_ref = module.get_texref(
                "verticesvec_tex_ref")
            self.vertices_texture_ref.set_format(cuda.array_format.FLOAT, 4)

            self.triangles_texture_ref = module.get_texref(
                "trianglesvec_tex_ref")
            self.triangles_texture_ref.set_format(
                cuda.array_format.UNSIGNED_INT32, 4)

            # propagate() binds the device data on its first call and then
            # flips this flag.
            self.node_texture_ref_bound = False
        elif api.is_gpu_api_opencl():
            # texture usage not used at the moment
            pass

    def get(self):
        """Copy the photon arrays back to the host as an event.Photons.

        The three vector arrays are viewed as float32 and reshaped to one
        row per photon.  Rows have 4 columns under OpenCL (the extra one is
        padding) and 3 otherwise.
        """
        width = 4 if api.is_gpu_api_opencl() else 3  # 4 includes padding

        def as_rows(device_vec):
            # One row of ``width`` float32 values per element.
            host = device_vec.get().view(np.float32)
            return host.reshape((len(device_vec), width))

        host_pos = as_rows(self.pos)
        host_dir = as_rows(self.dir)
        host_pol = as_rows(self.pol)
        return event.Photons(host_pos, host_dir, host_pol,
                             self.wavelengths.get(),
                             self.t.get(),
                             self.last_hit_triangles.get(),
                             self.flags.get(),
                             self.weights.get())

    def iterate_copies(self):
        """Yield one GPUPhotonsSlice per event copy stored in ``self``.

        Copy ``i`` covers the index range
        ``[true_nphotons * i, true_nphotons * (i + 1))`` of every photon
        array.
        """
        field_names = ("pos", "dir", "pol", "wavelengths", "t",
                       "last_hit_triangles", "flags", "weights")
        for copy_index in xrange(self.ncopies):
            begin = self.true_nphotons * copy_index
            end = self.true_nphotons * (copy_index + 1)
            span = slice(begin, end)
            sliced = {}
            for name in field_names:
                sliced[name] = getattr(self, name)[span]
            yield GPUPhotonsSlice(**sliced)

    @profile_if_possible
    def propagate(self,
                  gpu_geometry,
                  rng_states,
                  nthreads_per_block=64,
                  max_blocks=1024,
                  max_steps=10,
                  use_weights=False,
                  scatter_first=0,
                  cl_context=None):
        """Propagate photons on GPU to termination or max_steps, whichever
        comes first.

        May be called repeatedly without reloading photon information if
        single-stepping through photon history.

        Args:
            gpu_geometry: geometry object.  In CUDA mode its ``gpudata`` is
                passed to the kernel and, on the first call only, its node,
                extra-node, triangle and vertex arrays are uploaded and
                bound to textures.  In OpenCL mode its individual arrays and
                its material/surface tables are passed as kernel arguments.
            rng_states: random-number states handed to the kernel
                (``rng_states.data`` in OpenCL mode).
            nthreads_per_block: chunk granularity given to
                ``chunk_iterator``; also the CUDA block size.
            max_blocks: upper bound on blocks per chunk given to
                ``chunk_iterator``.
            max_steps: the outer loop runs while ``step < max_steps``,
                advancing one step per pass.
            use_weights: converted to a 0/1 integer kernel argument.
            scatter_first: passed to the kernel on the first pass only; it
                is reset to 0 afterwards.
            cl_context: used to create the command queue in OpenCL mode.

        Side effects:
            Prints the texture-binding summary (first CUDA call) and the
            total propagation time to stdout, and prints a warning to
            stderr when bit 31 is set in the maximum flag value.

        ..warning::
            `rng_states` must have at least `nthreads_per_block`*`max_blocks`
            number of curandStates.
        """
        nphotons = self.pos.size
        # bind node texture reference
        # This branch runs at most once per instance: the flag tested here
        # is set to True at the end of the branch.
        if api.is_gpu_api_cuda() and not self.node_texture_ref_bound:
            # we have to unroll, as pycuda doesn't seem to support vector times right now for binding
            self.unrolled_nodes = ga.to_gpu(
                gpu_geometry.nodes.get().ravel().view(np.uint32))
            self.unrolled_extra_nodes = ga.to_gpu(
                gpu_geometry.extra_nodes.ravel().view(np.uint32))
            # unrolled_triangles / unrolled_vertices are uploaded but only
            # the *4 variants are bound to textures below (the bind calls for
            # the plain variants are commented out).
            self.unrolled_triangles = ga.to_gpu(
                gpu_geometry.triangles.get().ravel().view(np.uint32))
            self.unrolled_triangles4 = ga.to_gpu(
                gpu_geometry.triangles4.ravel().view(np.uint32))
            self.unrolled_vertices = ga.to_gpu(
                gpu_geometry.vertices.get().ravel().view(np.float32))
            self.unrolled_vertices4 = ga.to_gpu(
                gpu_geometry.vertices4.ravel().view(np.float32))
            self.node_texture_ref.set_address(self.unrolled_nodes.gpudata,
                                              self.unrolled_nodes.nbytes)
            self.extra_node_texture_ref.set_address(
                self.unrolled_extra_nodes.gpudata,
                self.unrolled_extra_nodes.nbytes)
            #self.unrolled_nodes.bind_to_texref_ext( self.node_texture_ref )
            #self.unrolled_extra_nodes.bind_to_texref_ext( self.extra_node_texture_ref )
            #self.unrolled_triangles.bind_to_texref_ext( self.triangles_texture_ref )
            self.triangles_texture_ref.set_address(
                self.unrolled_triangles4.gpudata,
                self.unrolled_triangles4.nbytes)
            #self.unrolled_vertices.bind_to_texref_ext( self.vertices_texture_ref )
            self.vertices_texture_ref.set_address(
                self.unrolled_vertices4.gpudata,
                self.unrolled_vertices4.nbytes)
            print "[BOUND TO TEXTURE MEMORY]"
            print "Nodes: ", self.unrolled_nodes.nbytes / 1.0e3, " kbytes"
            print "Extra nodes: ", self.unrolled_extra_nodes.nbytes / 1.0e3, " kbytes"
            print "Triangles: ", self.unrolled_triangles4.nbytes / 1.0e3, " kbytes"
            print "Vertices: ", self.unrolled_vertices4.nbytes / 1.0e3, " kbytes"
            print "Total: ", (self.unrolled_nodes.nbytes +
                              self.unrolled_extra_nodes.nbytes +
                              self.unrolled_triangles4.nbytes +
                              self.unrolled_vertices4.nbytes) / 1.0e3, "kbytes"
            self.node_texture_ref_bound = True

        # setup queue
        # Host-side queue layout: element 0 is a header slot, elements 1..
        # hold photon indices.
        maxqueue = nphotons
        step = 0
        input_queue = np.empty(shape=maxqueue + 1, dtype=np.uint32)
        input_queue[0] = 0
        # Order photons initially in the queue to put the clones next to each other
        # (copy c of photon i lands at queue position 1 + c + i * ncopies)
        for copy in xrange(self.ncopies):
            input_queue[1 + copy::self.ncopies] = np.arange(
                self.true_nphotons,
                dtype=np.uint32) + copy * self.true_nphotons
        if api.is_gpu_api_cuda():
            # CUDA keeps the header on the device and slices it off at
            # launch time (input_queue_gpu[1:]).
            input_queue_gpu = ga.to_gpu(input_queue)
        elif api.is_gpu_api_opencl():
            # OpenCL uploads the queue without its header slot instead.
            comqueue = cl.CommandQueue(cl_context)
            input_queue_gpu = ga.to_device(comqueue,
                                           input_queue[1:])  # why the offset?

        # presumably the kernel uses element 0 of the output queue as its
        # write cursor, hence the initial value of 1 -- TODO confirm against
        # the kernel source
        output_queue = np.zeros(shape=maxqueue + 1, dtype=np.uint32)
        output_queue[0] = 1
        if api.is_gpu_api_cuda():
            output_queue_gpu = ga.to_gpu(output_queue)
        elif api.is_gpu_api_opencl():
            output_queue_gpu = ga.to_device(comqueue, output_queue)

        # The kernel takes the weights switch as an integer.
        if use_weights:
            iuse_weights = 1
        else:
            iuse_weights = 0

        # adapt_factor stays at 1.0: the code that would shrink it is
        # commented out inside the chunk loop below.
        adapt_factor = 1.0
        start_prop = time.time()
        while step < max_steps:
            # Just finish the rest of the steps if the # of photons is low
            #if nphotons < nthreads_per_block * 16 * 8 or use_weights:
            #    nsteps = max_steps - step
            #else:
            #    nsteps = 1
            nsteps = 1

            # The per-step and per-chunk timestamps below are only consumed
            # by commented-out debug prints.
            start_step = time.time()
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max( int(adapt_factor*max_blocks), 1 )):
                #print nphotons, nthreads_per_block, max_blocks," : ",first_photon, photons_this_round, blocks, adapt_factor
                start_chunk = time.time()
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.propagate(np.int32(first_photon),
                                             np.int32(photons_this_round),
                                             input_queue_gpu[1:],
                                             output_queue_gpu,
                                             rng_states,
                                             self.pos,
                                             self.dir,
                                             self.wavelengths,
                                             self.pol,
                                             self.t,
                                             self.flags,
                                             self.last_hit_triangles,
                                             self.weights,
                                             np.int32(nsteps),
                                             np.int32(iuse_weights),
                                             np.int32(scatter_first),
                                             gpu_geometry.gpudata,
                                             block=(nthreads_per_block, 1, 1),
                                             grid=(blocks, 1))
                    #cuda.Context.get_current().synchronize()
                elif api.is_gpu_api_opencl():
                    # OpenCL launch: one work item per photon in this chunk,
                    # geometry passed as individual buffers; .wait() blocks
                    # until the chunk has finished.
                    self.gpu_funcs.propagate(
                        comqueue, (photons_this_round, 1, 1),
                        None,
                        np.int32(first_photon),
                        np.int32(photons_this_round),
                        input_queue_gpu.data,
                        output_queue_gpu.data,
                        rng_states.data,
                        self.pos.data,
                        self.dir.data,
                        self.wavelengths.data,
                        self.pol.data,
                        self.t.data,
                        self.flags.data,
                        self.last_hit_triangles.data,
                        self.weights.data,
                        np.int32(nsteps),
                        np.int32(iuse_weights),
                        np.int32(scatter_first),
                        gpu_geometry.world_scale,
                        gpu_geometry.world_origin.data,
                        np.int32(len(gpu_geometry.nodes)),
                        gpu_geometry.material_data['n'],
                        gpu_geometry.material_data['step'],
                        gpu_geometry.material_data["wavelength0"],
                        gpu_geometry.vertices.data,
                        gpu_geometry.triangles.data,
                        gpu_geometry.material_codes.data,
                        gpu_geometry.colors.data,
                        gpu_geometry.nodes.data,
                        gpu_geometry.extra_nodes.data,
                        gpu_geometry.material_data["nmaterials"],
                        gpu_geometry.material_data['refractive_index'].data,
                        gpu_geometry.material_data['absorption_length'].data,
                        gpu_geometry.material_data['scattering_length'].data,
                        gpu_geometry.material_data['reemission_prob'].data,
                        gpu_geometry.material_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['nsurfaces'],
                        gpu_geometry.surface_data['detect'].data,
                        gpu_geometry.surface_data['absorb'].data,
                        gpu_geometry.surface_data['reemit'].data,
                        gpu_geometry.surface_data['reflect_diffuse'].data,
                        gpu_geometry.surface_data['reflect_specular'].data,
                        gpu_geometry.surface_data['eta'].data,
                        gpu_geometry.surface_data['k'].data,
                        gpu_geometry.surface_data['reemission_cdf'].data,
                        gpu_geometry.surface_data['model'].data,
                        gpu_geometry.surface_data['transmissive'].data,
                        gpu_geometry.surface_data['thickness'].data,
                        gpu_geometry.surface_data['nplanes'].data,
                        gpu_geometry.surface_data['wire_diameter'].data,
                        gpu_geometry.surface_data['wire_pitch'].data,
                        g_times_l=True).wait()
                end_chunk = time.time()
                chunk_time = end_chunk - start_chunk
                #print "chunk time: ",chunk_time
                #if chunk_time>2.5:
                #    adapt_factor *= 0.5
            step += nsteps
            scatter_first = 0  # Only allow non-zero in first pass
            end_step = time.time()
            #print "step time: ",end_step-start_step

            # Between steps, turn the kernel's output queue into the input
            # queue for the next pass.  Skipped after the final step.
            if step < max_steps:
                start_requeue = time.time()
                #print "reset photon queues"
                if api.is_gpu_api_cuda():
                    cuda.Context.get_current().synchronize(
                    )  # ensure all threads done
                    #temp = input_queue_gpu
                    #input_queue_gpu = output_queue_gpu
                    #output_queue_gpu = temp
                    # Assign with a numpy array of length 1 to silence
                    # warning from PyCUDA about setting array with different strides/storage orders.
                    #output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))
                    #nphotons = input_queue_gpu[:1].get()[0] - 1
                    # new style
                    # Round-trip through the host buffer: the remaining count
                    # is the header value minus one, and the whole buffer
                    # (header included) becomes the new input queue.
                    output_queue_gpu.get(output_queue)
                    nphotons = output_queue[0] - 1
                    input_queue_gpu.set(output_queue)
                    output_queue_gpu[:1].set(np.ones(shape=1, dtype=np.uint32))

                elif api.is_gpu_api_opencl():
                    temp_out = output_queue_gpu.get()
                    # NOTE(review): the CUDA branch uses header - 1 here,
                    # while this branch uses the header value as-is even
                    # though both reset the header to 1 -- confirm this
                    # matches the OpenCL kernel's counting convention.
                    nphotons = temp_out[0]
                    input_queue_gpu.set(
                        temp_out[1:], queue=comqueue
                    )  # set the input queue to have index of photons still need to be run
                    output_queue_gpu[:1].set(
                        np.ones(shape=1, dtype=np.uint32),
                        queue=comqueue)  # reset first instance to be one
                end_requeue = time.time()
                #print "re-queue time (nphotons=",nphotons"): ",end_requeue-start_requeue
                # Nothing left in the queue: stop before max_steps.
                if nphotons == 0:
                    break

        end_prop = time.time()
        print "propagation time: ", end_prop - start_prop, " secs"
        # Bit 31 of the largest flag word is tested as the abort marker.
        end_flags = self.flags.get()
        end_flag = np.max(end_flags)
        if end_flag & (1 << 31):
            print >> sys.stderr, "WARNING: ABORTED PHOTONS"
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    @profile_if_possible
    def select(self,
               target_flag,
               nthreads_per_block=64,
               max_blocks=1024,
               start_photon=None,
               nphotons=None):
        '''Return a GPUPhotonsSlice containing only the photons that have
        a particular bit set in their history word.

        Photons ``start_photon`` .. ``start_photon + nphotons`` are scanned
        twice: the ``count_photons`` kernel sizes the output, then the
        ``copy_photons`` kernel fills newly allocated arrays.
        ``start_photon`` defaults to 0 and ``nphotons`` to everything from
        ``start_photon`` up to ``self.pos.size``.
        '''

        def wait_for_gpu():
            cuda.Context.get_current().synchronize()

        wait_for_gpu()
        selected_counter = ga.zeros(shape=1, dtype=np.uint32)
        wait_for_gpu()

        start_photon = 0 if start_photon is None else start_photon
        if nphotons is None:
            nphotons = self.pos.size - start_photon
        launch_block = (nthreads_per_block, 1, 1)

        # Pass 1: count how many photons match so the output can be sized.
        for offset, count, nblocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.count_photons(np.int32(start_photon + offset),
                                         np.int32(count),
                                         np.uint32(target_flag),
                                         selected_counter,
                                         self.flags,
                                         block=launch_block,
                                         grid=(nblocks, 1))
        wait_for_gpu()
        n_selected = int(selected_counter.get()[0])

        # Output storage, one array per photon property.
        def allocate(dtype):
            return ga.empty(shape=n_selected, dtype=dtype)

        out_pos = allocate(ga.vec.float3)
        out_dir = allocate(ga.vec.float3)
        out_pol = allocate(ga.vec.float3)
        out_wavelengths = allocate(np.float32)
        out_t = allocate(np.float32)
        out_last_hit = allocate(np.int32)
        out_flags = allocate(np.uint32)
        out_weights = allocate(np.float32)

        # Pass 2: copy the matching photons, if there are any.
        if n_selected > 0:
            selected_counter.fill(0)
            sources = (self.pos, self.dir, self.wavelengths, self.pol,
                       self.t, self.flags, self.last_hit_triangles,
                       self.weights)
            destinations = (out_pos, out_dir, out_wavelengths, out_pol,
                            out_t, out_flags, out_last_hit, out_weights)
            for offset, count, nblocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                kernel_args = (np.int32(start_photon + offset),
                               np.int32(count),
                               np.uint32(target_flag),
                               selected_counter) + sources + destinations
                self.gpu_funcs.copy_photons(*kernel_args,
                                            block=launch_block,
                                            grid=(nblocks, 1))
            # The copy pass must have found exactly what the count pass did.
            assert selected_counter.get()[0] == n_selected

        return GPUPhotonsSlice(out_pos, out_dir, out_pol, out_wavelengths,
                               out_t, out_last_hit, out_flags, out_weights)

    def __del__(self):
        '''Drop the per-photon GPU arrays held by this object, then run the
        garbage collector so the memory can be reclaimed promptly.

        ``weights`` is released together with the other arrays.  Attributes
        that were never assigned (for example when construction failed
        part-way) are skipped rather than raising inside the finalizer,
        which would also have skipped the ``gc.collect()`` call.
        '''
        for name in ('pos', 'dir', 'pol', 'wavelengths', 't', 'flags',
                     'last_hit_triangles', 'weights'):
            try:
                delattr(self, name)
            except AttributeError:
                # Never set on this instance; nothing to release.
                pass
        # Free up GPU memory quickly if now available
        gc.collect()

    def __len__(self):
        '''Number of photons, taken from the size of the position array.'''
        positions = self.pos
        return positions.size

Example #10

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def optimize_layer(orig_nodes):
    bvh_module = get_cu_module('bvh.cu',
                               options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    nodes = ga.to_gpu(orig_nodes)
    n = len(nodes)
    areas = ga.empty(shape=n / 2, dtype=np.uint64)
    nthreads_per_block = 128

    min_areas = ga.empty(shape=int(np.ceil(n / float(nthreads_per_block))),
                         dtype=np.uint64)
    min_index = ga.empty(shape=min_areas.shape, dtype=np.uint32)

    update = 10000

    skip_size = 1
    flag = cutools.mapped_empty(shape=skip_size, dtype=np.uint32)

    i = 0
    skips = 0
    swaps = 0
    while i < n / 2 - 1:
        # How are we doing?
        if i % update == 0:
            for first_index, elements_this_iter, nblocks_this_iter in \
                    chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

                bvh_funcs.pair_area(np.uint32(first_index),
                                    np.uint32(elements_this_iter),
                                    nodes,
                                    areas,
                                    block=(nthreads_per_block, 1, 1),
                                    grid=(nblocks_this_iter, 1))

            areas_host = areas.get()
            #print nodes.get(), areas_host.astype(float)
            print 'Area of parent layer so far (%d): %1.12e' % (
                i * 2, areas_host.astype(float).sum())
            print 'Skips: %d, Swaps: %d' % (skips, swaps)

        test_index = i * 2

        blocks = 0
        look_forward = min(8192 * 50, n - test_index - 2)
        skip_this_round = min(skip_size, n - test_index - 1)
        flag[:] = 0
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(look_forward, nthreads_per_block, max_blocks=10000):
            bvh_funcs.min_distance_to(np.uint32(first_index + test_index + 2),
                                      np.uint32(elements_this_iter),
                                      np.uint32(test_index),
                                      nodes,
                                      np.uint32(blocks),
                                      min_areas,
                                      min_index,
                                      cutools.Mapped(flag),
                                      block=(nthreads_per_block, 1, 1),
                                      grid=(nblocks_this_iter,
                                            skip_this_round))
            blocks += nblocks_this_iter
            #print i, first_index, nblocks_this_iter, look_forward
        cuda.Context.get_current().synchronize()

        if flag[0] == 0:
            flag_nonzero = flag.nonzero()[0]
            if len(flag_nonzero) == 0:
                no_swap_required = skip_size
            else:
                no_swap_required = flag_nonzero[0]
            i += no_swap_required
            skips += no_swap_required
            continue

        min_areas_host = min_areas[:blocks].get()
        min_index_host = min_index[:blocks].get()
        best_block = min_areas_host.argmin()
        better_i = min_index_host[best_block]

        swaps += 1
        #print 'swap', test_index+1, better_i
        assert 0 < better_i < len(nodes)
        assert 0 < test_index + 1 < len(nodes)
        bvh_funcs.swap(np.uint32(test_index + 1),
                       np.uint32(better_i),
                       nodes,
                       block=(1, 1, 1),
                       grid=(1, 1))
        cuda.Context.get_current().synchronize()
        i += 1

    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(n/2, nthreads_per_block, max_blocks=10000):

        bvh_funcs.pair_area(np.uint32(first_index),
                            np.uint32(elements_this_iter),
                            nodes,
                            areas,
                            block=(nthreads_per_block, 1, 1),
                            grid=(nblocks_this_iter, 1))

    areas_host = areas.get()

    print 'Final area of parent layer: %1.12e' % areas_host.sum()
    print 'Skips: %d, Swaps: %d' % (skips, swaps)

    return nodes.get()

Example #11

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds

Example #12

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def merge_nodes(nodes, degree, max_ratio=None):
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes

Example #13

Show file

File: geometry.py Project: NuTufts/ChromaUBooNE

    def __init__(self,
                 geometry,
                 wavelengths=None,
                 print_usage=False,
                 min_free_gpu_mem=300e6,
                 cl_context=None,
                 cl_queue=None):
        """Package a geometry for GPU use under CUDA or OpenCL.

        Uploads material and surface data, mesh vertices and triangles,
        material/colour/solid-id codes and BVH nodes, recording a metadata
        checkpoint ('a' .. 'g') after each stage.

        Args:
          geometry: object providing ``mesh.vertices``, ``mesh.triangles``,
            ``bvh`` (``nodes``, ``world_coords``), ``unique_materials``,
            ``unique_surfaces``, ``material1_index``, ``material2_index``,
            ``surface_index``, ``colors`` and ``solid_id``.
          wavelengths: wavelength grid; ``standard_wavelengths`` is used
            when None.  Must be equally spaced.
          print_usage: when true, ``print_device_usage`` is called once more
            at the end.
          min_free_gpu_mem: amount subtracted from the free GPU memory
            before deciding how many BVH nodes (and whether the triangles
            and vertices) fit on the device.
          cl_context, cl_queue: passed to the OpenCL packaging helpers and
            array constructors; unused on the CUDA path except for the
            device-usage reporting calls.

        Raises:
          ValueError: wavelengths are not equally spaced, or the number of
            unique materials or surfaces is 0xff or more.
        """
        log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)

        self.geometry = geometry
        # NOTE(review): `+=` through `self` stores the incremented value on
        # the instance; presumably instance_count starts as a class-level
        # counter - confirm whether this assert can ever fire.  The assert
        # message expression (print_stack) only runs when the check fails.
        self.instance_count += 1
        assert self.instance_count == 1, traceback.print_stack()

        self.metadata = Metadata()
        self.metadata(None, 'preinfo')
        self.metadata('a', "start")
        self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem

        if wavelengths is None:
            self.wavelengths = standard_wavelengths
        else:
            self.wavelengths = wavelengths

        # .item() raises ValueError unless the differences collapse to a
        # single unique value, i.e. the grid is uniformly spaced.
        try:
            self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
        except ValueError:
            raise ValueError('wavelengths must be equally spaced apart.')

        # this is where things get difficult.
        # pycuda and pyopencl gives us very different methods for working with structs
        #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

        # Note that the packaging helpers return different things per API:
        # three values for CUDA, (data, byte count) for OpenCL.
        if api.is_gpu_api_cuda():
            self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
            self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
                geometry, self.wavelengths, self.wavelength_step)
        elif api.is_gpu_api_opencl():
            self.material_data, materials_bytes_cl = self._package_material_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)
            self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
                cl_context, cl_queue, geometry, self.wavelengths,
                self.wavelength_step)

        self.metadata('b', "after materials,surfaces")
        if api.is_gpu_api_opencl():
            self.metadata[
                'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl  # opencl, we have to track this ourselves

        # Load Vertices and Triangles
        if api.is_gpu_api_cuda():
            # CUDA: allocate with mapped_empty and also keep 4-wide host
            # copies (last column left zero) for textures.
            self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                         dtype=ga.vec.float3,
                                         write_combined=True)
            self.vertices4 = np.zeros(shape=(len(self.vertices), 4),
                                      dtype=np.float32)
            self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                          dtype=ga.vec.uint3,
                                          write_combined=True)
            self.triangles4 = np.zeros(shape=(len(self.triangles), 4),
                                       dtype=np.uint32)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.vertices4[:, :-1] = self.vertices.ravel().view(
                np.float32).reshape(len(self.vertices), 3)  # for textures
            self.triangles[:] = to_uint3(geometry.mesh.triangles)
            self.triangles4[:, :-1] = self.triangles.ravel().view(
                np.uint32).reshape(len(self.triangles), 3)  # for textures
        elif api.is_gpu_api_opencl():
            self.vertices = ga.empty(cl_queue,
                                     len(geometry.mesh.vertices),
                                     dtype=ga.vec.float3)
            self.triangles = ga.empty(cl_queue,
                                      len(geometry.mesh.triangles),
                                      dtype=ga.vec.uint3)
            self.vertices[:] = to_float3(geometry.mesh.vertices)
            self.triangles[:] = to_uint3(geometry.mesh.triangles)

        # World coordinate origin/scale taken from the BVH; on OpenCL the
        # origin is additionally moved to the device.
        if api.is_gpu_api_cuda():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
        elif api.is_gpu_api_opencl():
            self.world_origin = ga.vec.make_float3(
                *geometry.bvh.world_coords.world_origin)
            #self.world_origin = geometry.bvh.world_coords.world_origin
            self.world_origin = ga.to_device(cl_queue, self.world_origin)
            print type(self.world_origin), self.world_origin
        self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

        # Load material and surface indices into 8-bit codes
        # check if we've reached a complexity threshold
        if len(geometry.unique_materials) >= int(0xff):
            raise ValueError(
                'Number of materials to index has hit maximum of %d' %
                (int(0xff)))
        if len(geometry.unique_surfaces) >= int(0xff):
            raise ValueError(
                'Number of surfaces to index has hit maximum of %d' %
                (int(0xff)))
        # make bit code: material1 in bits 24-31, material2 in bits 16-23,
        # surface in bits 8-15 of a uint32
        material_codes = (((geometry.material1_index & 0xff) << 24) |
                          ((geometry.material2_index & 0xff) << 16) |
                          ((geometry.surface_index & 0xff) << 8)).astype(
                              np.uint32)
        if api.is_gpu_api_cuda():
            self.material_codes = ga.to_gpu(material_codes)
        elif api.is_gpu_api_opencl():
            self.material_codes = ga.to_device(cl_queue, material_codes)

        # assign color codes
        colors = geometry.colors.astype(np.uint32)
        if api.is_gpu_api_cuda():
            self.colors = ga.to_gpu(colors)
            self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
        elif api.is_gpu_api_opencl():
            self.colors = ga.to_device(cl_queue, colors)
            self.solid_id_map = ga.to_device(
                cl_queue, geometry.solid_id.astype(np.uint32))

        # Limit memory usage by splitting BVH into on and off-GPU parts
        self.metadata('c', "after colors, idmap")
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            # OpenCL: free memory is estimated as the total recorded in the
            # metadata minus the bytes uploaded so far.
            gpu_total = self.metadata['gpu_total']
            meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
            self.metadata[
                'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
            gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl +
                                    meshdef_nbytes_cl)

        # Figure out how many elements we can fit on the GPU,
        # but no fewer than 100 elements, and no more than the number of actual nodes
        n_nodes = len(geometry.bvh.nodes)
        split_index = min(
            max(
                int((gpu_free - min_free_gpu_mem) /
                    geometry.bvh.nodes.itemsize), 100), n_nodes)
        print "split index=", split_index, " vs. total nodes=", n_nodes

        # push nodes to GPU
        if api.is_gpu_api_cuda():
            self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
        elif api.is_gpu_api_opencl():
            self.nodes = ga.to_device(cl_queue,
                                      geometry.bvh.nodes[:split_index])
        n_extra = max(1, (n_nodes - split_index))  # forbid zero size

        # left over nodes
        if api.is_gpu_api_cuda():
            self.extra_nodes = mapped_empty(shape=n_extra,
                                            dtype=geometry.bvh.nodes.dtype,
                                            write_combined=True)
        elif api.is_gpu_api_opencl():
            self.extra_nodes = ga.empty(cl_queue,
                                        shape=n_extra,
                                        dtype=geometry.bvh.nodes.dtype)

        # extra_nodes is only filled when some nodes did not fit; otherwise
        # it stays an uninitialised one-element placeholder.
        if split_index < n_nodes:
            log.info('Splitting BVH between GPU and CPU memory at node %d' %
                     split_index)
            self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
            splitting = 1
        else:
            splitting = 0

        self.metadata('d', "after nodes")
        if api.is_gpu_api_opencl():
            nodes_nbytes_cl = self.nodes.nbytes
            self.metadata[
                'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
        self.metadata.array("d_nodes", geometry.bvh.nodes)
        self.metadata['d_split_index'] = split_index
        self.metadata['d_extra_nodes_count'] = n_extra
        self.metadata['d_splitting'] = splitting
        self.print_device_usage(cl_context=cl_context)

        # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']
        self.metadata.array('e_triangles', self.triangles)
        if api.is_gpu_api_cuda():
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                self.triangles = ga.to_gpu(self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0
        elif api.is_gpu_api_opencl():
            # OpenCL: the transfer itself is commented out, so this branch
            # only logs and records the flag.
            if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
                #self.triangles = ga.to_device(cl_queue,self.triangles)
                log.info(
                    'Optimization: Sufficient memory to move triangles onto GPU'
                )
                ftriangles_gpu = 1
            else:
                log.warn('using host mapped memory triangles')
                ftriangles_gpu = 0

        self.metadata('e', "after triangles")
        self.metadata['e_triangles_gpu'] = ftriangles_gpu

        if api.is_gpu_api_cuda():
            gpu_free, gpu_total = cuda.mem_get_info()
        elif api.is_gpu_api_opencl():
            # Same 'd_gpu_used' figure as in the triangles stage above.
            gpu_total = self.metadata['gpu_total']
            gpu_free = gpu_total - self.metadata['d_gpu_used']

        self.metadata.array('f_vertices', self.vertices)

        if api.is_gpu_api_cuda():
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0
        elif api.is_gpu_api_opencl():
            # OpenCL: as with the triangles, the transfer is commented out.
            if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
                #self.vertices = ga.to_gpu(self.vertices)
                log.info(
                    'Optimization: Sufficient memory to move vertices onto GPU'
                )
                vertices_gpu = 1
            else:
                log.warn('using host mapped memory vertices')
                vertices_gpu = 0

        self.metadata('f', "after vertices")
        self.metadata['f_vertices_gpu'] = vertices_gpu

        if api.is_gpu_api_cuda():
            # CUDA: assemble the device-side Geometry struct from the
            # members uploaded above.
            geometry_source = cutools.get_cu_source('geometry_types.h')
            geometry_struct_size = characterize.sizeof('Geometry',
                                                       geometry_source)
            self.gpudata = make_gpu_struct(geometry_struct_size, [
                Mapped(self.vertices),
                Mapped(self.triangles), self.material_codes, self.colors,
                self.nodes,
                Mapped(self.extra_nodes), self.material_pointer_array,
                self.surface_pointer_array, self.world_origin,
                self.world_scale,
                np.int32(len(self.nodes))
            ])
        elif api.is_gpu_api_opencl():
            # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
            # We then build a geometry struct later in the kernel
            # provided below is example/test of passing the data
            #if True: # for debuggin
            # Disabled debug block: never executed while the condition is
            # the literal False (it would end by raising RuntimeError).
            if False:  #
                print "loading geometry_structs.cl"
                geostructsmod = cltools.get_cl_module(
                    "geometry_structs.cl",
                    cl_context,
                    options=cltools.cl_options,
                    include_source_directory=True)
                geostructsfunc = GPUFuncs(geostructsmod)
                geostructsfunc.make_geostruct(
                    cl_queue, (3, ), None, self.vertices.data,
                    self.triangles.data, self.material_codes.data,
                    self.colors.data, self.nodes.data, self.extra_nodes.data,
                    np.int32(len(geometry.unique_materials)),
                    self.material_data['refractive_index'].data,
                    self.material_data['absorption_length'].data,
                    self.material_data['scattering_length'].data,
                    self.material_data['reemission_prob'].data,
                    self.material_data['reemission_cdf'].data,
                    np.int32(len(geometry.unique_surfaces)),
                    self.surface_data['detect'].data,
                    self.surface_data['absorb'].data,
                    self.surface_data['reemit'].data,
                    self.surface_data['reflect_diffuse'].data,
                    self.surface_data['reflect_specular'].data,
                    self.surface_data['eta'].data, self.surface_data['k'].data,
                    self.surface_data['reemission_cdf'].data,
                    self.surface_data['model'].data,
                    self.surface_data['transmissive'].data,
                    self.surface_data['thickness'].data,
                    self.surface_data['nplanes'].data,
                    self.surface_data['wire_diameter'].data,
                    self.surface_data['wire_pitch'].data,
                    self.world_origin.data, self.world_scale,
                    np.int32(len(self.nodes)), self.material_data['n'],
                    self.material_data['step'],
                    self.material_data["wavelength0"])
                cl_queue.finish()
                self.material_codes.get()
                raise RuntimeError('bail')
        if print_usage:
            self.print_device_usage(cl_context=cl_context)
        log.info(self.device_usage_str(cl_context=cl_context))
        self.metadata('g', "after geometry struct")

Example #14

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16 bits.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
      ``nthreads_per_block``: int
        Threads per block (CUDA) / work-items per work-group (OpenCL)
        used for each ``make_leaves`` launch.
      ``max_blocks``: int
        Maximum number of blocks per launch handed to ``chunk_iterator``
        on the OpenCL path.  The CUDA path uses a fixed limit of 30000.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.

    Raises:
      ValueError if ``morton_bits`` is larger than 16.
      RuntimeError if the active GPU API is neither CUDA nor OpenCL.
    '''
    # The kernel output is reduced with a right shift by (16 - morton_bits);
    # a value above 16 would turn that into a negative shift count.
    if morton_bits > 16:
        raise ValueError('morton_bits must be <= 16, got %r' % (morton_bits, ))

    # Load GPU functions (and, for OpenCL, the context/queue they run on)
    queue = None
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory.
    # OpenCL and CUDA manage memory differently here, so the code diverges.
    if gpuapi.is_gpu_api_cuda():
        # CUDA: host and device memory are mapped onto one another, so no
        # explicit transfer requests are needed.
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)

        # Output arrays filled by the kernel
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        # NOTE(review): this path ignores the ``max_blocks`` argument and
        # always chunks with max_blocks=30000.
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))
    else:
        # OpenCL: allocate a buffer on the host, then push it to the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Output arrays filled by the kernel
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU; g_times_l=True makes the first size
        # tuple a count of work-groups rather than of work-items
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

    # Keep only the requested number of bits per axis
    morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host

Example #15

Show file

File: test_texture.py Project: NuTufts/ChromaUBooNE

# Script fragment: stages geometry arrays for a texture-memory test of the
# 'test_texture.cu' module.  ``geo`` and ``sim`` are created earlier in the
# script (not shown here).
origin = geo.bvh.world_coords.world_origin

# Geometry arrays held by the simulation; ``.get()`` is called on them below,
# so presumably these are device arrays -- confirm against GPUGeometry.
nodes = sim.gpu_geometry.nodes
extra_node = sim.gpu_geometry.extra_nodes
triangles = sim.gpu_geometry.triangles
vertices = sim.gpu_geometry.vertices
print vertices.shape
# Pad every vertex to four float32 components; the fourth column stays zero.
vertices4 = np.zeros((len(vertices), 4), dtype=np.float32)
print vertices.get().ravel().view(np.float32).shape
# Reinterpret the fetched vertices as a flat float32 buffer, reshape it to
# (len(vertices), 3) and copy it into the first three columns.
vertices4[:, :-1] = vertices.get().ravel().view(np.float32).reshape(
    len(vertices), 3)

# Load the test module and look up its texture references by name.
module = get_module('test_texture.cu',
                    options=api_options,
                    include_source_directory=True)
gpu_funcs = GPUFuncs(module)
node_texture_ref = module.get_texref("node_tex_ref")
extra_node_texture_ref = module.get_texref("extra_node_tex_ref")
triangles_texture_ref = module.get_texref("triangles_tex_ref")
vertices_texture_ref = module.get_texref("vertices_tex_ref")

# This reference is configured for four unsigned 32-bit components per fetch.
node_vec_texture_ref = module.get_texref("nodevec_tex_ref")
node_vec_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32, 4)

# View the node array as a flat uint32 buffer, upload it and bind it to the
# scalar node texture reference.
ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_gpu = ga.to_gpu(ur_nodes)
ur_nodes_gpu.bind_to_texref_ext(node_texture_ref)
nodes_nbytes = ur_nodes.nbytes

# NOTE(review): same expression as above, fetched a second time; the copy is
# uploaded to a separate device array (presumably for nodevec_tex_ref).
ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_vec_gpu = ga.to_gpu(ur_nodes)

Example #16

Show file

File: bvh.py Project: NuTufts/ChromaUBooNE

def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merge ``nodes`` into ``len(first_child)`` parent nodes on the GPU.

    ``first_child`` gives, for every parent, the index of its first child
    and ``nchild`` how many children it has.  Both are converted to int32
    before upload.  The parent nodes are returned as a host array.

    Raises RuntimeError when the active GPU API is neither CUDA nor OpenCL.
    '''
    threads_per_block = 256

    # Only OpenCL needs an explicit context and command queue.
    cl_ctx = None
    cl_queue = None
    if gpuapi.is_gpu_api_opencl():
        cl_ctx = cltools.get_last_context()
        cl_queue = cl.CommandQueue(cl_ctx)

    # Build the kernel wrapper for whichever API is active.
    if gpuapi.is_gpu_api_cuda():
        kernels = GPUFuncs(
            get_module('bvh.cu',
                       options=api_options,
                       include_source_directory=True))
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        kernels = GPUFuncs(
            get_module('bvh.cl',
                       cl_ctx,
                       options=api_options,
                       include_source_directory=True))
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Upload the inputs and allocate the output array on the device.
    if gpuapi.is_gpu_api_cuda():
        dev_nodes, dev_first, dev_count = (
            ga.to_gpu(nodes),
            ga.to_gpu(first_child.astype(np.int32)),
            ga.to_gpu(nchild.astype(np.int32)),
        )
        parent_count = len(first_child)
        dev_parents = ga.empty(shape=parent_count, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        dev_nodes, dev_first, dev_count = (
            ga.to_device(cl_queue, nodes),
            ga.to_device(cl_queue, first_child.astype(np.int32)),
            ga.to_device(cl_queue, nchild.astype(np.int32)),
        )
        parent_count = len(first_child)
        dev_parents = ga.to_device(
            cl_queue, np.zeros(shape=parent_count, dtype=ga.vec.uint4))
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Launch the kernel once per chunk of parents.
    chunks = chunk_iterator(parent_count, threads_per_block, max_blocks=10000)
    for start, count, block_count in chunks:
        if gpuapi.is_gpu_api_cuda():
            kernels.make_parents_detailed(
                np.uint32(start),
                np.uint32(count),
                dev_nodes,
                dev_parents,
                dev_first,
                dev_count,
                block=(threads_per_block, 1, 1),
                grid=(block_count, 1))
        elif gpuapi.is_gpu_api_opencl():
            # One work-item per parent; the work-group size is left to the
            # OpenCL runtime (local size None).
            launch = kernels.make_parents_detailed(
                cl_queue, (count, 1, 1), None,
                np.uint32(start),
                np.uint32(count),
                dev_nodes.data,
                dev_parents.data,
                dev_first.data,
                dev_count.data)
            launch.wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return dev_parents.get()

Example #17

Show file

class GPUPDF(object):
    """Accumulate hit histograms and evaluate per-channel PDFs on the GPU.

    Two independent modes are offered: a binned (channel, time, charge)
    histogram (``setup_pdf`` / ``add_hits_to_pdf`` / ``get_pdfs``) and a
    single-point PDF evaluation around one observed event
    (``setup_pdf_eval`` / ``accumulate_pdf_eval`` / ``get_pdf_eval``).
    """

    def __init__(self, cl_context=None):
        """Load the PDF kernels for the active GPU API.

        Args:
            cl_context: context handed to ``cltools.get_cl_module`` when the
                OpenCL API is active; not used for CUDA.

        Raises:
            RuntimeError: the active API is neither CUDA nor OpenCL.
        """
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module('pdf.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module('pdf.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_pdf(self, nchannels, tbins, trange, qbins, qrange):
        """Setup GPU arrays to hold PDF information.

           nchannels: int, number of channels
           tbins: number of time bins
           trange: tuple of (min, max) time in PDF
           qbins: number of charge bins
           qrange: tuple of (min, max) charge in PDF
        """
        # NOTE(review): these allocations pass no command queue, i.e. they
        # use the CUDA calling convention only.
        self.events_in_histogram = 0
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.pdf_gpu = ga.zeros(shape=(nchannels, tbins, qbins),
                                dtype=np.uint32)
        self.tbins = tbins
        self.trange = trange
        self.qbins = qbins
        self.qrange = qrange

    def clear_pdf(self):
        """Rezero the PDF counters."""
        self.hitcount_gpu.fill(0)
        self.pdf_gpu.fill(0)

    def add_hits_to_pdf(self, gpuchannels, nthreads_per_block=64):
        """Bin the hits held in ``gpuchannels`` into the histogram.

        Launches the ``bin_hits`` kernel with ``gpuchannels.q`` and
        ``gpuchannels.t`` and increments ``events_in_histogram``.
        """
        self.gpu_funcs.bin_hits(
            np.int32(len(self.hitcount_gpu)),
            gpuchannels.q,
            gpuchannels.t,
            self.hitcount_gpu,
            np.int32(self.tbins),
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.int32(self.qbins),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.pdf_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

        self.events_in_histogram += 1

    def get_pdfs(self):
        """Returns the 1D hitcount array and the 3D [channel, time, charge]
        histogram."""
        return self.hitcount_gpu.get(), self.pdf_gpu.get()

    def setup_pdf_eval(self,
                       event_hit,
                       event_time,
                       event_charge,
                       min_twidth,
                       trange,
                       min_qwidth,
                       qrange,
                       min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.event_nhit = count_nonzero(event_hit)

        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(
            np.uint32)
        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(
            self.map_hit_offset_to_channel_id)
        # Inverse mapping; clamped at 0 so channels before the first hit do
        # not wrap around when cast to unsigned.
        self.map_channel_id_to_hit_offset = np.maximum(0,
                                                       event_hit.cumsum() -
                                                       1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(
            self.map_channel_id_to_hit_offset)

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        # 1e9 marks an unused nearest-neighbour slot (see get_pdf_eval)
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content,
                                       dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)

        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only  # Only support time right now
        self.time_only = time_only

    def clear_pdf_eval(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.eval_hitcount_gpu.fill(0)
        self.eval_bincount_gpu.fill(0)
        self.nearest_mc_gpu.fill(1e9)

    @profile_if_possible
    def accumulate_pdf_eval(self,
                            gpuchannels,
                            nthreads_per_block=64,
                            max_blocks=10000,
                            cl_queue=None):
        """Add the most recent results of run_daq() to the PDF evaluation.

        Args:
            gpuchannels: object providing ``ndaq`` and the device array ``t``.
            nthreads_per_block: threads per block / work-items per work-group.
            max_blocks: accepted for interface compatibility; not used.
            cl_queue: command queue used for allocation and kernel launches
                when the OpenCL API is active.

        Raises:
            RuntimeError: the active API is neither CUDA nor OpenCL.
        """
        nwork = self.event_nhit * (gpuchannels.ndaq + 1)
        if api.is_gpu_api_cuda():
            self.work_queues = ga.empty(shape=nwork, dtype=np.uint32)
        elif api.is_gpu_api_opencl():
            self.work_queues = ga.empty(cl_queue,
                                        shape=nwork,
                                        dtype=np.uint32)
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')
        self.work_queues.fill(1)

        nchannels = self.event_hit_gpu.size
        nblocks = nchannels // nthreads_per_block + 1

        if api.is_gpu_api_cuda():
            self.gpu_funcs.accumulate_bincount(
                np.int32(nchannels),
                np.int32(gpuchannels.ndaq),
                self.event_hit_gpu,
                self.event_time_gpu,
                gpuchannels.t,
                self.eval_hitcount_gpu,
                self.eval_bincount_gpu,
                np.float32(self.min_twidth),
                np.float32(self.trange[0]),
                np.float32(self.trange[1]),
                np.int32(self.min_bin_content),
                self.map_channel_id_to_hit_offset_gpu,
                self.work_queues,
                block=(nthreads_per_block, 1, 1),
                grid=(nblocks, 1))
            cuda.Context.get_current().synchronize()

            self.gpu_funcs.accumulate_nearest_neighbor_block(
                np.int32(self.event_nhit),
                np.int32(gpuchannels.ndaq),
                self.map_hit_offset_to_channel_id_gpu,
                self.work_queues,
                self.event_time_gpu,
                gpuchannels.t,
                self.nearest_mc_gpu,
                np.int32(self.min_bin_content),
                block=(nthreads_per_block, 1, 1),
                grid=(self.event_nhit, 1))
            cuda.Context.get_current().synchronize()

        else:
            # OpenCL launch convention: (queue, global, local, *args).  With
            # g_times_l=True the global size counts work-groups, so the
            # block count goes first and the work-group size second, both
            # with the same rank.
            # NOTE(review): the leading channel-count argument mirrors the
            # CUDA call above; confirm against the pdf.cl kernel signature.
            self.gpu_funcs.accumulate_bincount(
                cl_queue, (nblocks, 1, 1), (nthreads_per_block, 1, 1),
                np.int32(nchannels),
                np.int32(gpuchannels.ndaq),
                self.event_hit_gpu.data,
                self.event_time_gpu.data,
                gpuchannels.t.data,
                self.eval_hitcount_gpu.data,
                self.eval_bincount_gpu.data,
                np.float32(self.min_twidth),
                np.float32(self.trange[0]),
                np.float32(self.trange[1]),
                np.int32(self.min_bin_content),
                self.map_channel_id_to_hit_offset_gpu.data,
                self.work_queues.data,
                g_times_l=True)
            # One work-group per hit channel, as in the CUDA launch.  No
            # explicit synchronization is done on this path.
            self.gpu_funcs.accumulate_nearest_neighbor_block(
                cl_queue, (self.event_nhit, 1, 1), (nthreads_per_block, 1, 1),
                np.int32(self.event_nhit),
                np.int32(gpuchannels.ndaq),
                self.map_hit_offset_to_channel_id_gpu.data,
                self.work_queues.data,
                self.event_time_gpu.data,
                gpuchannels.t.data,
                self.nearest_mc_gpu.data,
                np.int32(self.min_bin_content),
                g_times_l=True)

    def get_pdf_eval(self):
        """Return the PDF estimate accumulated for each channel.

        Returns:
            (hitcount, pdf_value, pdf_uncertainty): three arrays with one
            entry per channel, where ``pdf_uncertainty`` is ``pdf_value``
            times its fractional uncertainty.

        Raises:
            NotImplementedError: ``time_only`` is False; the 2D
                (time, charge) mode does not exist yet.
        """
        evhit = self.event_hit_gpu.get().astype(bool)
        hitcount = self.eval_hitcount_gpu.get()
        bincount = self.eval_bincount_gpu.get()

        pdf_value = np.zeros(len(hitcount), dtype=float)
        pdf_frac_uncert = np.zeros_like(pdf_value)

        # PDF value for high stats bins
        high_stats = (bincount >= self.min_bin_content)
        if high_stats.any():
            if self.time_only:
                pdf_value[high_stats] = bincount[high_stats].astype(
                    float) / hitcount[high_stats] / self.min_twidth
            else:
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[high_stats] = 1.0 / np.sqrt(bincount[high_stats])

        # PDF value for low stats bins
        low_stats = ~high_stats & (hitcount > 0) & evhit

        # Scatter the per-hit nearest-neighbour distances back to a
        # per-channel table; channels without a hit keep the 1e9 filler.
        nearest_mc_by_hit = self.nearest_mc_gpu.get().reshape(
            (self.event_nhit, self.min_bin_content))
        nearest_mc = np.empty(shape=(len(hitcount), self.min_bin_content),
                              dtype=np.float32)
        nearest_mc.fill(1e9)
        nearest_mc[self.map_hit_offset_to_channel_id, :] = nearest_mc_by_hit

        # Deal with the case where we did not even get min_bin_content events
        # in the PDF but also clamp the lower range to ensure we don't index
        # by a negative number in 2 lines
        last_valid_entry = np.maximum(
            0, (nearest_mc < 1e9).astype(int).sum(axis=1) - 1)
        distance = nearest_mc[np.arange(len(last_valid_entry)),
                              last_valid_entry]
        if low_stats.any():
            if self.time_only:
                pdf_value[low_stats] = (
                    last_valid_entry[low_stats] + 1).astype(float) / hitcount[
                        low_stats] / distance[low_stats] / 2.0
            else:
                raise NotImplementedError(
                    'Unimplemented 2D (time,charge) mode!')

            pdf_frac_uncert[low_stats] = 1.0 / np.sqrt(
                last_valid_entry[low_stats] + 1)

        # PDFs with no stats got zero by default during array creation

        # Single pre-formatted argument: prints the same text whether
        # ``print`` is a statement or a function.
        print('high_stats: %s low_stats %s' %
              (high_stats.sum(), low_stats.sum()))
        return hitcount, pdf_value, pdf_value * pdf_frac_uncert

Example #18

Show file

    def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                            nodes, workgroupsize, comqueue):
        module = get_module('wq_checknode.cl',
                            self.context,
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_device(
            comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_device(
            comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_device(comqueue,
                                   sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_device(
            comqueue, sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_device(comqueue,
                                    sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin_gpu
        world_scale = gpugeo.world_scale
        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)] = np.arange(0,
                                                           len(photons.pos),
                                                           dtype=np.int32)[:]
        queue_photon_index[len(photons.pos):] = (
            np.ones(len(photons.pos), dtype=np.int32) * -1)[:]
        queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos),
                                                      dtype=np.int32)[:]
        a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)
        workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)
        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(comqueue, 1, dtype=np.int32)
        node_front_end = ga.empty(comqueue, 1, dtype=np.int32)
        workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1))
        workgroup_daughter = cl.LocalMemory(c.nbytes *
                                            (max_nodes_can_store + 1))
        workgroup_sibling = cl.LocalMemory(c.nbytes *
                                           (max_nodes_can_store + 1))
        workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
        max_loops = 32

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node

        start_queue = time.time()
        gpu_funcs.checknode(
            comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1),
            np.int32(max_loops), photon_pos.data, photon_dir.data,
            photon_current_node.data,
            photon_tested_node.data, photon_last_result.data,
            np.int32(len(nodes)), nodes.data, node_parent.data,
            node_first_daughter.data, node_sibling.data, node_aunt.data,
            world_origin.data, world_scale, queue_size,
            queue_photon_index.data, queue_slot_flag.data,
            np.int32(len(photon_pos)), np.int32(workgroupsize),
            workgroup_photons, workgroup_current_node, workgroup_tested_node,
            max_nodes_can_store, workgroup_nodes, workgroup_daughter,
            workgroup_sibling, workgroup_aunt, loaded_node_start_index,
            loaded_node_end_index, node_front_start.data,
            node_front_end.data).wait()
        end_queue = time.time()

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test, result)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        print photon_last_result.get()

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 32):
            y = x + 32
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 32):
            y = x + 32
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
        return

Example #19

Show file

File: photon.py Project: NuTufts/ChromaUBooNE

    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies

Example #20

Show file

    def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1,
                 seed=None, cl_context=None):
        """
        Generates photons from information in the steps_arr
        
        Parameters
        ----------
        steps_arr : numpy.array with shape=(N,10) dtype=np.float
           contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ]
           in the future could generalize this to many different time components.
           developed for liquid argon TPCs.
        multiple : float
           scale up the number of photons generated (not implemented yet)
        """
        self.steps_array = steps_arr
        self.nsteps = self.steps_array.shape[0]
        if multiple!=1.0:
            raise RuntimeError('Have not implemented scaling of the number of photons generated.')

        # ===========================
        # GEN PHOTONS
        tstart_genphotons =  time.time()
        # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here)
        # on the CPU, we scan the steps to determine the total number of photons using poisson statistics
        # we assume the user has seeded the random number generator to her liking
        tstart_nphotons = time.time()
        self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 )
        #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int )
        self.nphotons_per_step = self.steps_array[ self._nphotons, : ]
        self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() )
        print "NSTEPS: ",self.nsteps
        print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons
        # now we make an index array for which step we need to get info from
        self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 )
        current_index=0
        for n, n_per_step in enumerate( self.nphotons_per_step ):
            self.source_step_index[current_index:current_index+n_per_step] = n
            current_index += n_per_step
        # push everything to the GPU
        tstart_transfer = time.time()
        if api.is_gpu_api_cuda():
            # step info
            self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio )
            self.source_step_index_gpu = ga.to_gpu( self.source_step_index )
            # photon info
            self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) )
            self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
        elif api.is_gpu_api_opencl():
            cl_queue = cl.CommandQueue( cl_context )
            # step info
            self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu  = ga.to_device( cl_queue, self.step_fsratio )
            self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index )
            # photon info
            self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
        
        self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) )
        self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) )
        self.t.set( self.steps_array[:,3] )
        self.ncopies = ncopies
        self.true_nphotons = self.nphotons

        if self.ncopies!=1:
            raise ValueError('support for multiple copies not supported')

        if api.is_gpu_api_cuda():
            self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True )
        elif api.is_gpu_api_opencl():
            self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True )
        self.gpufuncs = GPUFuncs( self.gpumod )
        print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer

        # need random numbers
        tgpu = time.time()
        if seed==None:
            seed = 5
        rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context)
        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                                    self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states,
                                                    self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights,
                                                    block=(nthreads_per_block,1,1), grid=(blocks, 1) )
            elif api.is_gpu_api_opencl():
                self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None,
                                                    np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                                    self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states.data,
                                                    self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, 
                                                    self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait()
                                                    
            else:
                raise RuntimeError("GPU API is neither CUDA nor OpenCL!")
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        tend_genphotons =  time.time()
        print "GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")"

        # Now load modules
        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
        elif  api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

Example #21

Show file

class TestSampling(unittest.TestCase):
    """Compares samples drawn by the test_sample_cdf OpenCL kernel with the
    ROOT histogram whose cumulative distribution was handed to the kernel."""

    def setUp(self):
        """Build the kernel module, the RNG states and the ROOT output file."""
        self.context = cltools.get_last_context()
        self.nthreads_per_block = 256
        self.myoptions = ('-I.', ) + api_options
        self.mod = get_module("test_sample_cdf.cl",
                              self.context,
                              options=self.myoptions,
                              include_source_directory=True)
        self.funcs = GPUFuncs(self.mod)
        self.rng_states = clrand.get_rng_states(self.context,
                                                self.nthreads_per_block)
        self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")

    def compare_sampling(self, hist, reps=10):
        """Sample `reps` values on the GPU from the CDF of `hist`.

        Args:
          hist: ROOT 1D histogram providing the distribution to sample
          reps: int
            number of samples to draw
        Returns:
          (prob, out_h): the value returned by out_h.KolmogorovTest(hist)
          and the histogram filled with the GPU samples
        """
        queue = cl.CommandQueue(self.context)

        # make cdf histogram: nbins + 1 points, starting at the lower edge of
        # the first bin with a cumulative value of zero
        nbins = hist.GetNbinsX()
        xaxis = hist.GetXaxis()
        intg = hist.GetIntegral()
        cdf_y = np.empty(nbins + 1, dtype=float)
        cdf_x = np.empty_like(cdf_y)

        cdf_x[0] = xaxis.GetBinLowEdge(1)
        cdf_y[0] = 0.0
        for i in xrange(1, len(cdf_x)):
            cdf_y[i] = intg[i]
            cdf_x[i] = xaxis.GetBinUpEdge(i)

        cdf_x_gpu = cl.array.to_device(queue, cdf_x.astype(np.float32))
        cdf_y_gpu = cl.array.to_device(queue, cdf_y.astype(np.float32))
        # one output slot per work item of a single launch
        out_gpu = cl.array.empty(queue,
                                 shape=self.nthreads_per_block,
                                 dtype=np.float32)

        out_h = rt.TH1D('out_h', '', hist.GetNbinsX(), xaxis.GetXmin(),
                        xaxis.GetXmax())
        out_h.SetLineColor(rt.kGreen)

        # max_blocks=1 keeps every launch within the nthreads_per_block
        # RNG states and output slots allocated above
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(reps, self.nthreads_per_block, max_blocks=1):
            self.funcs.test_sample_cdf(queue, (elements_this_iter, 1, 1), None,
                                       self.rng_states.data,
                                       np.int32(len(cdf_x_gpu)),
                                       cdf_x_gpu.data, cdf_y_gpu.data,
                                       out_gpu.data)
            out = out_gpu.get()
            # only the first elements_this_iter slots were written this round
            for v in out[:elements_this_iter]:
                out_h.Fill(v)

        prob = out_h.KolmogorovTest(hist)
        out_h.Write()
        return prob, out_h

    def test_sampling(self):
        '''Verify that the CDF-based sampler on the GPU reproduces a binned
        Gaussian distribution'''
        f = rt.TF1('f_gaussian', 'gaus(0)', -5, 5)
        f.SetParameters(1.0 / np.sqrt(np.pi * 2), 0.0, 1.0)
        gaussian = rt.TH1D('gaussian', '', 100, -5, 5)
        gaussian.Add(f)

        prob, out_h = self.compare_sampling(gaussian, reps=20000)

        self.outf.cd()
        gaussian.Write("gaussian")
        out_h.Write("out_h")
        # assertGreater is not stripped under -O and reports both values
        self.assertGreater(prob, 0.01)

    def tearDown(self):
        """Close the ROOT output file opened in setUp."""
        self.outf.Close()

Example #22

Show file

File: daq.py Project: NuTufts/ChromaUBooNE

class GPUDaq(object):
    """Runs the DAQ kernels (daq.cu / daq.cl) over propagated photons and
    keeps the per-channel earliest-time, charge and history buffers.

    Usage is begin_acquire(), one or more acquire() calls, then
    end_acquire(), which returns the buffers wrapped in a GPUChannels.
    """

    def __init__(self, gpu_detector, ndaq=1, cl_context=None, cl_queue=None):
        """Allocate the channel buffers and load the DAQ kernel module.

        Args:
          gpu_detector: detector object providing nchannels, solid_id_map,
            solid_id_to_channel_index_gpu and (CUDA only) detector_gpu
          ndaq: int
            number of DAQ copies; every buffer holds nchannels * ndaq entries
          cl_context: OpenCL context used to build daq.cl (OpenCL only)
          cl_queue: OpenCL command queue used for the allocations (OpenCL only)
        Raises:
          RuntimeError when the GPU API is neither CUDA nor OpenCL
        """
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.empty(gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros_like(
                self.earliest_time_int_gpu)
            self.channel_q_int_gpu = ga.zeros_like(self.earliest_time_int_gpu)
            self.channel_q_gpu = ga.zeros(len(self.earliest_time_int_gpu),
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector.detector_gpu
            self.module = cutools.get_cu_module('daq.cu',
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.earliest_time_gpu = ga.empty(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.float32)
            self.earliest_time_int_gpu = ga.empty(cl_queue,
                                                  gpu_detector.nchannels *
                                                  ndaq,
                                                  dtype=np.uint32)
            self.channel_history_gpu = ga.zeros(cl_queue,
                                                gpu_detector.nchannels * ndaq,
                                                dtype=np.uint32)
            self.channel_q_int_gpu = ga.zeros(cl_queue,
                                              gpu_detector.nchannels * ndaq,
                                              dtype=np.uint32)
            self.channel_q_gpu = ga.zeros(cl_queue,
                                          gpu_detector.nchannels * ndaq,
                                          dtype=np.float32)
            self.detector_gpu = gpu_detector  # struct not made in opencl mode, so we keep a copy of the class
            self.module = cltools.get_cl_module('daq.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.solid_id_map_gpu = gpu_detector.solid_id_map
        self.solid_id_to_channel_index_gpu = gpu_detector.solid_id_to_channel_index_gpu
        self.gpu_funcs = GPUFuncs(self.module)
        self.ndaq = ndaq
        # offset between the channel blocks of consecutive DAQ copies
        self.stride = gpu_detector.nchannels

    def begin_acquire(self, nthreads_per_block=64, cl_context=None):
        """Reset the earliest-time buffer and zero the charge and history
        buffers before a new acquisition.

        Args:
          nthreads_per_block: int
            threads per block for the reset kernel
          cl_context: OpenCL context used to create the command queue
            (OpenCL only)
        """
        if api.is_gpu_api_cuda():
            self.gpu_funcs.reset_earliest_time_int(
                np.float32(1e9),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                      1, 1))
            self.channel_q_int_gpu.fill(0)
            self.channel_q_gpu.fill(0)
            self.channel_history_gpu.fill(0)
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.gpu_funcs.reset_earliest_time_int(
                comqueue, (nthreads_per_block, 1, 1),
                (len(self.earliest_time_int_gpu) // nthreads_per_block + 1, 1),
                np.float32(1e9),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu.data,
                g_times_l=True).wait()
            self.channel_q_int_gpu.fill(0, queue=comqueue)
            self.channel_q_gpu.fill(0, queue=comqueue)
            self.channel_history_gpu.fill(0, queue=comqueue)
            cl.enqueue_barrier(comqueue)

    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        """Run the DAQ kernel over a range of photons.

        Uses run_daq when ndaq is 1 and run_daq_many otherwise.

        Args:
          gpuphotons: photon container exposing t, flags,
            last_hit_triangles, weights and pos device arrays
          rng_states: device random-number states handed to the kernel
          nthreads_per_block: int
            threads per block of each launch
          max_blocks: int
            maximum number of blocks per launch
          start_photon: int or None
            index of the first photon to process; 0 when None
          nphotons: int or None
            number of photons to process; everything from start_photon
            to the end of gpuphotons.pos when None
          weight: float
            weight handed to the kernel
          cl_context: OpenCL context used to create the command queue
            (OpenCL only)
        """
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)

        # NOTE(review): 0x1 << 2 is presumably the "detected" bit of the
        # photon flags -- confirm against the kernel sources.
        if self.ndaq == 1:
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq(rng_states,
                                           np.uint32(0x1 << 2),
                                           np.int32(start_photon +
                                                    first_photon),
                                           np.int32(photons_this_round),
                                           gpuphotons.t,
                                           gpuphotons.flags,
                                           gpuphotons.last_hit_triangles,
                                           gpuphotons.weights,
                                           self.solid_id_map_gpu,
                                           self.detector_gpu,
                                           self.earliest_time_int_gpu,
                                           self.channel_q_int_gpu,
                                           self.channel_history_gpu,
                                           np.float32(weight),
                                           block=(nthreads_per_block, 1, 1),
                                           grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    # With g_times_l=True the first tuple is the number of
                    # work groups.  Use the block count from chunk_iterator
                    # (same as the CUDA grid) instead of
                    # photons_this_round / nthreads_per_block, which rounds
                    # down and skips the photons of a trailing partial block.
                    # NOTE(review): relies on the kernel ignoring work items
                    # beyond photons_this_round -- confirm in daq.cl.
                    self.gpu_funcs.run_daq(
                        comqueue,
                        (blocks, 1, 1),
                        (nthreads_per_block, 1, 1),
                        rng_states.data,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu.data,
                        # -- Detector struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.float32(weight),
                        g_times_l=True).wait()

        else:
            # chunked with one photon per block; the launch itself still
            # uses nthreads_per_block threads per block
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, 1, max_blocks):
                if api.is_gpu_api_cuda():
                    self.gpu_funcs.run_daq_many(
                        rng_states,
                        np.uint32(0x1 << 2),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t,
                        gpuphotons.flags,
                        gpuphotons.last_hit_triangles,
                        gpuphotons.weights,
                        self.solid_id_map_gpu,
                        self.detector_gpu,
                        self.earliest_time_int_gpu,
                        self.channel_q_int_gpu,
                        self.channel_history_gpu,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        block=(nthreads_per_block, 1, 1),
                        grid=(blocks, 1))
                elif api.is_gpu_api_opencl():
                    # NOTE(review): unlike the CUDA call above and the
                    # OpenCL run_daq call, this launch passes neither
                    # rng_states nor the flag mask, passes solid_id_map_gpu
                    # without .data, and gives size tuples of different
                    # lengths in the opposite order.  Left unchanged because
                    # the kernel signature is not visible here -- verify
                    # against daq.cl before relying on this branch.
                    self.gpu_funcs.run_daq_many(
                        comqueue,
                        (nthreads_per_block, 1, 1),
                        (blocks, 1),
                        np.int32(start_photon + first_photon),
                        np.int32(photons_this_round),
                        gpuphotons.t.data,
                        gpuphotons.flags.data,
                        gpuphotons.last_hit_triangles.data,
                        gpuphotons.weights.data,
                        self.solid_id_map_gpu,
                        # -- Detector Struct --
                        self.solid_id_to_channel_index_gpu.data,
                        self.detector_gpu.time_cdf_x_gpu.data,
                        self.detector_gpu.time_cdf_y_gpu.data,
                        self.detector_gpu.charge_cdf_x_gpu.data,
                        self.detector_gpu.charge_cdf_y_gpu.data,
                        self.detector_gpu.nchannels,
                        self.detector_gpu.time_cdf_len,
                        self.detector_gpu.charge_cdf_len,
                        self.detector_gpu.charge_unit,
                        # ---------------------
                        self.earliest_time_int_gpu.data,
                        self.channel_q_int_gpu.data,
                        self.channel_history_gpu.data,
                        np.int32(self.ndaq),
                        np.int32(self.stride),
                        np.float32(weight),
                        g_times_l=True).wait()
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        """Convert the integer time and charge buffers to floats.

        Args:
          nthreads_per_block: int
            threads per block for the conversion kernels
          cl_context: OpenCL context used to create the command queue
            (OpenCL only)
        Returns:
          GPUChannels built from the float time buffer, the float charge
          buffer, the history buffer, ndaq and the channel stride
        """
        if api.is_gpu_api_cuda():
            self.gpu_funcs.convert_sortable_int_to_float(
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu,
                self.earliest_time_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.earliest_time_int_gpu) // nthreads_per_block +
                      1, 1))
            self.gpu_funcs.convert_charge_int_to_float(
                self.detector_gpu,
                self.channel_q_int_gpu,
                self.channel_q_gpu,
                block=(nthreads_per_block, 1, 1),
                grid=(len(self.channel_q_int_gpu) // nthreads_per_block + 1,
                      1))
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.gpu_funcs.convert_sortable_int_to_float(
                comqueue, (len(self.earliest_time_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                np.int32(len(self.earliest_time_int_gpu)),
                self.earliest_time_int_gpu.data,
                self.earliest_time_gpu.data,
                g_times_l=True).wait()
            self.gpu_funcs.convert_charge_int_to_float(
                comqueue, (len(self.channel_q_int_gpu), 1, 1),
                (nthreads_per_block, 1, 1),
                self.detector_gpu.nchannels,
                self.detector_gpu.charge_unit,
                self.channel_q_int_gpu.data,
                self.channel_q_gpu.data,
                g_times_l=True).wait()

        return GPUChannels(self.earliest_time_gpu, self.channel_q_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)

Example #23

Show file

class GPUDaqLAr1ND(GPUDAQHist):
    """DAQ that stores histogram of photon hits.

    Builds the ``daq_lar1nd`` kernel module (CUDA or OpenCL) on top of
    GPUDAQHist and exposes the acquire / end_acquire cycle.
    """
    # Class-level fallbacks used when the constructor is not given explicit
    # values.  They start out unset; the constructor raises ValueError if a
    # fallback is needed but was never assigned.
    NTDC = None
    NS_PER_TDC = None

    def __init__(self,
                 gpu_detector,
                 ntdcs=None,
                 ns_per_tdc=None,
                 adc_bits=None,
                 ndaq=1,
                 cl_context=None,
                 cl_queue=None):
        """constructor.
        
        Args:
          gpu_detector: GPUDetector
        Keywords:
          ntdcs: int
            number of time bins per channel
            if not supplied, using class variable value
          ns_per_tdc: float
            nanoseconds per time bin
            if not supplied, using class variable value
          adc_bits:  int
            number of ADC bits (not used yet)
          ndaq: int
            number of daqs
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Raises:
          ValueError when ntdcs and ns_per_tdc are found to be NoneType
          RuntimeError when the active GPU API is neither CUDA nor OpenCL
        """
        # Always assign both attributes: an explicitly supplied value wins,
        # otherwise fall back to the class-level default.  (Previously the
        # attributes were only set in the fallback case, so passing explicit
        # values failed with AttributeError in the call below.)
        self.ntdcs = GPUDaqLAr1ND.NTDC if ntdcs is None else ntdcs
        self.ns_per_tdc = (GPUDaqLAr1ND.NS_PER_TDC
                           if ns_per_tdc is None else ns_per_tdc)
        super(GPUDaqLAr1ND, self).__init__(gpu_detector,
                                           ntdcs=self.ntdcs,
                                           ns_per_tdc=self.ns_per_tdc,
                                           adc_bits=adc_bits,
                                           ndaq=ndaq,
                                           cl_context=cl_context,
                                           cl_queue=cl_queue)
        # Validated on the attributes as the base-class initializer left
        # them, same as before.
        if self.ntdcs is None:
            raise ValueError("GPUDaqLAr1ND.NTDC has not been set.")
        if self.ns_per_tdc is None:
            raise ValueError("GPUDaqLAr1ND.NS_PER_TDC has not been set.")

        # The kernel sources sit next to this module: daq_lar1nd.cu / .cl
        kernel_filepath = os.path.dirname(
            os.path.realpath(__file__)) + "/daq_lar1nd"
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module(kernel_filepath + ".cu",
                                                options=api_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module(kernel_filepath + '.cl',
                                                cl_context,
                                                options=api_options,
                                                include_source_directory=True)
        else:
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")

        self.gpu_funcs = GPUFuncs(self.module)

    def acquire(self,
                gpuphotons,
                rng_states,
                nthreads_per_block=64,
                max_blocks=1024,
                start_photon=None,
                nphotons=None,
                weight=1.0,
                cl_context=None):
        """run UBooNE DAQ acquire kernels

        Launches ``run_daq`` over the requested photon range in chunks
        produced by ``chunk_iterator``.  On OpenCL the unsigned-integer ADC
        buffer is afterwards converted to float with ``convert_adc``.

        Args:
          gpuphotons: photon arrays on the GPU (pos, t, flags,
            last_hit_triangles, weights are read)
          rng_states: random number generator states on the GPU
          nthreads_per_block: int
          max_blocks: int
          start_photon: int
            index of the first photon to process; defaults to 0
          nphotons: int
            number of photons to process; defaults to all photons from
            start_photon to the end
          weight: float
          cl_context: pyopencl.Context
            only used on the OpenCL path, to create the command queue
        Raises:
          RuntimeError when ndaq != 1 (multi-DAQ is not implemented)
        """
        if start_photon is None:
            start_photon = 0
        if nphotons is None:
            nphotons = len(gpuphotons.pos) - start_photon

        # Only the single-DAQ path exists.  This used to raise a misspelled
        # ``RunTimeError`` (i.e. a NameError) in front of unreachable
        # multi-DAQ launch code, which has been dropped.
        if self.ndaq != 1:
            raise RuntimeError("Multi-DAQ not built")

        if api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)

        # We loop over all photons and bin them essentially
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpu_funcs.run_daq(rng_states,
                                       np.uint32(event.SURFACE_DETECT),
                                       np.int32(start_photon +
                                                first_photon),
                                       np.int32(photons_this_round),
                                       gpuphotons.t,
                                       gpuphotons.flags,
                                       gpuphotons.last_hit_triangles,
                                       gpuphotons.weights,
                                       self.solid_id_map_gpu,
                                       self.detector_gpu,
                                       self.adc_gpu,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       np.float32(self.ns_per_tdc),
                                       np.float32(100.0),
                                       self.channel_history_gpu,
                                       np.float32(weight),
                                       block=(nthreads_per_block, 1, 1),
                                       grid=(blocks, 1))
            elif api.is_gpu_api_opencl():
                # NOTE(review): unlike the CUDA call this passes the literal
                # 0x1 << 2 instead of event.SURFACE_DETECT and nphotons
                # instead of photons_this_round -- confirm against the
                # OpenCL kernel signature.
                self.gpu_funcs.run_daq(
                    comqueue,
                    (photons_this_round, 1, 1),
                    None,
                    rng_states.data,
                    np.uint32(0x1 << 2),
                    np.int32(start_photon + first_photon),
                    np.int32(nphotons),
                    gpuphotons.t.data,
                    gpuphotons.pos.data,
                    gpuphotons.flags.data,
                    gpuphotons.last_hit_triangles.data,
                    gpuphotons.weights.data,
                    self.solid_id_map_gpu.data,
                    # -- Detector struct --
                    self.solid_id_to_channel_index_gpu.data,
                    # ---------------------
                    self.uint_adc_gpu.data,
                    np.int32(self.nchannels),
                    np.int32(self.ntdcs),
                    np.float32(self.ns_per_tdc),
                    np.float32(100.0),
                    self.channel_history_gpu.data,
                    # -- Channel transforms --
                    self.channel_inverse_rot_gpu.data,
                    self.channel_inverse_trans_gpu.data,
                    # ------------------------
                    np.float32(weight),
                    g_times_l=False).wait()

        # if opencl, need to convert ADC from uint to float
        if api.is_gpu_api_opencl():
            self.gpu_funcs.convert_adc(comqueue,
                                       (int(self.nchannels), 1, 1),
                                       None,
                                       self.uint_adc_gpu.data,
                                       self.adc_gpu.data,
                                       np.int32(self.nchannels),
                                       np.int32(self.ntdcs),
                                       g_times_l=False).wait()

        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        elif api.is_gpu_api_opencl():
            cl.enqueue_barrier(comqueue)

    def end_acquire(self, nthreads_per_block=64, cl_context=None):
        """collect daq info and make GPUChannels instance.
        
        Args:
          nthreads_per_block: int
            currently not used by either launch (see note in the body)
          cl_context: pyopencl.Context
        Returns:
          GPUChannels
        """
        if api.is_gpu_api_cuda():
            self.earliest_time_gpu = ga.zeros(self.nchannels, dtype=np.float32)
            # NOTE(review): the launch geometry is hard-coded to one block of
            # 1000 threads and ignores nthreads_per_block; confirm the kernel
            # still covers every channel when nchannels > 1000.
            self.gpu_funcs.get_earliest_hit_time(np.int32(self.nchannels),
                                                 np.int32(self.ntdcs),
                                                 np.float32(self.ns_per_tdc),
                                                 self.adc_gpu,
                                                 self.channel_history_gpu,
                                                 self.earliest_time_gpu,
                                                 block=(1000, 1, 1),
                                                 grid=(1, 1))
            # Result is discarded.  NOTE(review): presumably kept to force a
            # device-to-host transfer / sync -- confirm it is still needed.
            self.adc_gpu.get()
        elif api.is_gpu_api_opencl():
            comqueue = cl.CommandQueue(cl_context)
            self.earliest_time_gpu = ga.zeros(comqueue,
                                              self.nchannels,
                                              dtype=np.float32)
            self.gpu_funcs.get_earliest_hit_time(
                comqueue, (int(self.nchannels), 1, 1), None,
                np.int32(self.nchannels), np.int32(self.ntdcs),
                np.float32(self.ns_per_tdc), self.adc_gpu.data,
                self.channel_history_gpu.data,
                self.earliest_time_gpu.data).wait()
            # Result is discarded here as well (see the CUDA branch).
            self.adc_gpu.get()

        return GPUChannels(self.earliest_time_gpu, self.adc_gpu,
                           self.channel_history_gpu, self.ndaq, self.stride)

    @classmethod
    def build_daq(cls, gpu_geometry, cl_context=None, cl_queue=None):
        """factory method.

        will be called by chroma.Simulation to build DAQ instance.
        Args:
          gpu_geometry: passed to the constructor as its gpu_detector
          cl_context: pyopencl.Context
          cl_queue: pyopencl.CommandQueue
        Returns:
          GPUDaqLAr1ND instance
        """
        return GPUDaqLAr1ND(gpu_geometry,
                            cl_context=cl_context,
                            cl_queue=cl_queue)

Example #24

Show file

    def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                          nodes, workgroupsize):
        module = get_module('wq_checknode.cu',
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_gpu(
            1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_gpu(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_gpu(
            sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin
        world_scale = gpugeo.world_scale

        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)].set(
            np.arange(0, len(photons.pos), dtype=np.int32)[:])
        queue_photon_index[len(photons.pos):].set(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        queue_slot_flag[0:len(photons.pos)].set(
            np.ones(len(photons.pos), dtype=np.int32)[:])
        a = ga.zeros(1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)

        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(1, dtype=np.int32)
        node_front_end = ga.empty(1, dtype=np.int32)

        max_loops = 1000

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node
        print "STARTING QUEUE"
        print queue_photon_index

        start_queue = time.time()
        gpu_funcs.checknode(np.int32(max_loops),
                            photon_pos,
                            photon_dir,
                            photon_current_node,
                            photon_tested_node,
                            photon_last_result,
                            np.int32(len(nodes)),
                            nodes,
                            node_parent,
                            node_first_daughter,
                            node_sibling,
                            node_aunt,
                            world_origin,
                            world_scale,
                            queue_size,
                            queue_photon_index,
                            queue_slot_flag,
                            np.int32(len(photon_pos)),
                            max_nodes_can_store,
                            loaded_node_start_index,
                            loaded_node_end_index,
                            node_front_start,
                            node_front_end,
                            block=(workgroupsize, 1, 1),
                            grid=(1, 1),
                            shared=4 *
                            (7 * max_nodes_can_store + 3 * workgroupsize + 1))
        cuda.Context.get_current().synchronize()
        end_queue = time.time()

        nactive = len(np.argwhere(queue_slot_flag.get() == 1))

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        np_photon_results = photon_last_result.get()
        for x in xrange(0, len(np_photon_results), 10):
            y = x + 10
            if y > len(np_photon_results):
                y = len(np_photon_results)
            print x, ": ", np_photon_results[x:y]

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 10):
            y = x + 10
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS: ", nactive, " threads"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 10):
            y = x + 10
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()

Example #25

Show file

class GPUKernelPDF(object):
    """Kernel-estimated PDF evaluation on the GPU.

    Usage follows the methods below: accumulate moments of simulated hits
    (setup_moments / accumulate_moments), derive per-channel bandwidths
    (compute_bandwidth), then accumulate and read back the kernel
    evaluation for one event (setup_kernel / accumulate_kernel /
    get_kernel_eval).
    """

    def __init__(self, cl_context=None, cl_queue=None):
        """Build the ``pdf`` kernel module for the active GPU API.

        Args:
          cl_context: pyopencl.Context
            only used on the OpenCL path
          cl_queue: pyopencl.CommandQueue
            accepted but not used here
        Raises:
          RuntimeError when the active GPU API is neither CUDA nor OpenCL
        """
        if api.is_gpu_api_cuda():
            self.module = cutools.get_cu_module('pdf.cu',
                                                options=cutools.cuda_options,
                                                include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = cltools.get_cl_module('pdf.cl',
                                                cl_context,
                                                options=cltools.cl_options,
                                                include_source_directory=True)
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.module in the line below.
            raise RuntimeError("GPU API is neither CUDA nor OpenCL")
        self.gpu_funcs = GPUFuncs(self.module)

    def setup_moments(self, nchannels, trange, qrange, time_only=True):
        """Setup GPU arrays to accumulate moments and eventually
        compute a kernel estimate of PDF values for each hit channel.

            nchannels: int
              Number of channels; length of every accumulator array.
            trange: (float, float)
              Range of time dimension in PDF
            qrange: (float, float)
              Range of charge dimension in PDF
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.hitcount_gpu = ga.zeros(nchannels, dtype=np.uint32)
        self.tmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.tmom2_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom1_gpu = ga.zeros(nchannels, dtype=np.float32)
        self.qmom2_gpu = ga.zeros(nchannels, dtype=np.float32)

        self.trange = trange
        self.qrange = qrange
        self.time_only = time_only

    def clear_moments(self):
        "Reset PDF evaluation counters to start accumulating new Monte Carlo."
        self.hitcount_gpu.fill(0)
        self.tmom1_gpu.fill(0.0)
        self.tmom2_gpu.fill(0.0)
        self.qmom1_gpu.fill(0.0)
        self.qmom2_gpu.fill(0.0)

    def accumulate_moments(self, gpuchannels, nthreads_per_block=64):
        """Add the most recent results of run_daq() to the accumulate of 
        moments for future bandwidth calculation.

            gpuchannels: object whose ``t`` and ``q`` GPU arrays are read
            nthreads_per_block: int
              Threads per block for the kernel launch.
        """
        nchannels = len(gpuchannels.t)
        self.gpu_funcs.accumulate_moments(
            np.int32(self.time_only),
            np.int32(nchannels),
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.hitcount_gpu,
            self.tmom1_gpu,
            self.tmom2_gpu,
            self.qmom1_gpu,
            self.qmom2_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(nchannels // nthreads_per_block + 1, 1))

    def compute_bandwidth(self,
                          event_hit,
                          event_time,
                          event_charge,
                          scale_factor=1.0):
        """Use the MC information accumulated by accumulate_moments() to
        estimate the best bandwidth to use when kernel estimating.

        Stores the inverse bandwidths on the GPU as
        ``inv_time_bandwidths_gpu`` and ``inv_charge_bandwidths_gpu``.

            event_hit: not used here
            event_time: ndarray
              Per-channel time at which the density estimate is evaluated.
            event_charge: ndarray
              Per-channel charge; only read when time_only is False.
            scale_factor: float
              Divides the hit count in the dimensionality factor.
        """

        rho = 1.0

        hitcount = self.hitcount_gpu.get()
        # never divide by a zero hit count
        mom0 = np.maximum(hitcount, 1)
        tmom1 = self.tmom1_gpu.get()
        tmom2 = self.tmom2_gpu.get()

        tmean = tmom1 / mom0
        tvar = np.maximum(tmom2 / mom0 - tmean**2, 0.0)  # roundoff can go neg
        trms = tvar**0.5

        # number of observables entering the PDF
        d = 1 if self.time_only else 2
        dimensionality_factor = ((4.0 / (d + 2)) /
                                 (mom0 / scale_factor))**(-1.0 / (d + 4))
        # NOTE(review): the exponent is linear in (event_time - tmean) / trms;
        # a Gaussian density would square that term.  Left as is because a
        # change alters every computed bandwidth -- confirm the intent.
        gaussian_density = np.minimum(
            1.0 / trms, (1.0 / np.sqrt(2.0 * np.pi)) *
            np.exp(-0.5 * ((event_time - tmean) / trms)) / trms)
        time_bandwidths = dimensionality_factor / gaussian_density * rho
        # entries that are not strictly positive (including NaN) keep an
        # inverse bandwidth of zero
        inv_time_bandwidths = np.zeros_like(time_bandwidths)
        positive = time_bandwidths > 0
        inv_time_bandwidths[positive] = time_bandwidths[positive]**-1

        # precompute inverse to speed up GPU evaluation
        self.inv_time_bandwidths_gpu = ga.to_gpu(
            inv_time_bandwidths.astype(np.float32))

        # Compute charge bandwidths if needed
        if self.time_only:
            self.inv_charge_bandwidths_gpu = ga.empty_like(
                self.inv_time_bandwidths_gpu)
            self.inv_charge_bandwidths_gpu.fill(0.0)
        else:
            qmom1 = self.qmom1_gpu.get()
            qmom2 = self.qmom2_gpu.get()

            qmean = qmom1 / mom0
            # NOTE(review): unlike the time variance above, this is neither
            # clamped at zero nor is the inverse below masked -- confirm.
            qrms = (qmom2 / mom0 - qmean**2)**0.5

            # NOTE(review): same linear exponent as in the time density.
            gaussian_density = np.minimum(
                1.0 / qrms, (1.0 / np.sqrt(2.0 * np.pi)) *
                np.exp(-0.5 * ((event_charge - qmean) / qrms)) / qrms)

            charge_bandwidths = dimensionality_factor / gaussian_density * rho

            # precompute inverse to speed up GPU evaluation
            self.inv_charge_bandwidths_gpu = ga.to_gpu(
                (charge_bandwidths**-1).astype(np.float32))

    def setup_kernel(self, event_hit, event_time, event_charge):
        """Upload one event to the GPU and reset the kernel-evaluation
        accumulators (hit counts and per-channel PDF values).

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.
        """
        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)
        self.charge_pdf_values_gpu = ga.zeros(len(event_hit), dtype=np.float32)

    def clear_kernel(self):
        "Zero the hit counts and the accumulated time/charge PDF values."
        self.hitcount_gpu.fill(0)
        self.time_pdf_values_gpu.fill(0.0)
        self.charge_pdf_values_gpu.fill(0.0)

    def accumulate_kernel(self, gpuchannels, nthreads_per_block=64):
        "Add the most recent results of run_daq() to the kernel PDF evaluation."
        self.gpu_funcs.accumulate_kernel_eval(
            np.int32(self.time_only),
            np.int32(len(self.event_hit_gpu)),
            self.event_hit_gpu,
            self.event_time_gpu,
            self.event_charge_gpu,
            gpuchannels.t,
            gpuchannels.q,
            np.float32(self.trange[0]),
            np.float32(self.trange[1]),
            np.float32(self.qrange[0]),
            np.float32(self.qrange[1]),
            self.inv_time_bandwidths_gpu,
            self.inv_charge_bandwidths_gpu,
            self.hitcount_gpu,
            self.time_pdf_values_gpu,
            self.charge_pdf_values_gpu,
            block=(nthreads_per_block, 1, 1),
            grid=(len(gpuchannels.t) // nthreads_per_block + 1, 1))

    def get_kernel_eval(self):
        """Read back the accumulated kernel evaluation.

        Returns:
          (hitcount, pdf_values, zeros): the per-channel hit counts, the
          accumulated PDF values divided by the hit count (time only, or
          time times charge when time_only is False), and an array of
          zeros shaped like pdf_values.
        """
        hitcount = self.hitcount_gpu.get()
        norm = np.maximum(1, hitcount)  # avoid divide by zero

        time_pdf_values = self.time_pdf_values_gpu.get()
        time_pdf_values /= norm

        charge_pdf_values = self.charge_pdf_values_gpu.get()
        charge_pdf_values /= norm

        if self.time_only:
            pdf_values = time_pdf_values
        else:
            pdf_values = time_pdf_values * charge_pdf_values

        return hitcount, pdf_values, np.zeros_like(pdf_values)