Code example #1
    def setUp(self):
        self.context = cltools.get_last_context()
        self.nthreads_per_block = 256
        self.myoptions = ('-I.', ) + api_options
        self.mod = get_module("test_sample_cdf.cl",
                              self.context,
                              options=self.myoptions,
                              include_source_directory=True)
        self.funcs = GPUFuncs(self.mod)
        self.rng_states = clrand.get_rng_states(self.context,
                                                self.nthreads_per_block)
        self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")
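
A natural companion to this fixture is a tearDown method; below is a minimal sketch, not taken from the source, that flushes and closes the ROOT file opened in setUp:

    def tearDown(self):
        # Flush anything the test methods wrote and close the ROOT output file.
        self.outf.Write()
        self.outf.Close()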
Code example #2
File: bvh.py  Project: NuTufts/ChromaUBooNE
def collapse_chains(nodes, layer_bounds):
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL')

    bvh_funcs = GPUFuncs(bvh_module)

    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)

    bounds = zip(layer_bounds[:-1], layer_bounds[1:])[:-1]
    bounds.reverse()
    nthreads_per_block = 256
    for start, end in bounds:
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.collapse_child(np.uint32(start),
                                     np.uint32(end),
                                     gpu_nodes,
                                     block=(nthreads_per_block, 1, 1),
                                     grid=(120, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.collapse_child(queue, (end - start, 1, 1), None,
                                     np.uint32(start), np.uint32(end),
                                     gpu_nodes.data).wait()

    return gpu_nodes.get()
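
A hedged usage sketch for collapse_chains follows. The layer sizes are toy values, ga stands for the gpuarray module used above (pycuda.gpuarray or pyopencl.array), and a GPU context is assumed to have been set up already via cutools/cltools:

# Illustrative only: collapse single-child chains in a flat BVH node array.
import numpy as np
layer_bounds = np.array([0, 1, 4, 13])            # toy cumulative layer offsets
nodes = np.zeros(int(layer_bounds[-1]), dtype=ga.vec.uint4)
collapsed = collapse_chains(nodes, layer_bounds)  # returns the nodes pulled back to host
assert collapsed.shape == nodes.shape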
Code example #3
File: photon.py  Project: NuTufts/ChromaUBooNE
    def __init__(self, photons, ncopies=1, cl_context=None):
        """Load ``photons`` onto the GPU, replicating as requested.

           Args:
               - photons: chroma.Event.Photons
                   Photon state information to load onto GPU
               - ncopies: int, *optional*
                   Number of times to replicate the photons
                   on the GPU.  This is used if you want
                   to propagate the same event many times,
                   for example in a likelihood calculation.

                   The amount of GPU storage will be proportionally
                   larger if ncopies > 1, so be careful.
        """
        nphotons = len(photons)
        # Allocate GPU memory for photon info and push to device
        if api.is_gpu_api_cuda():
            self.pos = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.dir = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.pol = ga.empty(shape=nphotons * ncopies, dtype=ga.vec.float3)
            self.wavelengths = ga.empty(shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(shape=nphotons * ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=nphotons * ncopies, dtype=np.float32)
            self.current_node_index = ga.zeros(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
        elif api.is_gpu_api_opencl():
            queue = cl.CommandQueue(cl_context)
            self.pos = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.dir = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.pol = ga.empty(queue,
                                shape=nphotons * ncopies,
                                dtype=ga.vec.float3)
            self.wavelengths = ga.empty(queue,
                                        shape=nphotons * ncopies,
                                        dtype=np.float32)
            self.t = ga.empty(queue,
                              shape=nphotons * ncopies,
                              dtype=np.float32)
            self.last_hit_triangles = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.int32)
            self.flags = ga.empty(queue,
                                  shape=nphotons * ncopies,
                                  dtype=np.uint32)
            self.weights = ga.empty(queue,
                                    shape=nphotons * ncopies,
                                    dtype=np.float32)
            self.current_node_index = ga.zeros(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated
            self.requested_workcode = ga.empty(queue,
                                               shape=nphotons * ncopies,
                                               dtype=np.uint32)  # deprecated

        # Assign the provided photons to the beginning (possibly
        # the entire array if ncopies is 1)
        self.pos[:nphotons].set(to_float3(photons.pos))
        self.dir[:nphotons].set(to_float3(photons.dir))
        self.pol[:nphotons].set(to_float3(photons.pol))
        self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
        self.t[:nphotons].set(photons.t.astype(np.float32))
        self.last_hit_triangles[:nphotons].set(
            photons.last_hit_triangles.astype(np.int32))
        self.flags[:nphotons].set(photons.flags.astype(np.uint32))
        self.weights[:nphotons].set(photons.weights.astype(np.float32))

        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu',
                                     options=api_options,
                                     include_source_directory=True)
        elif api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl',
                                     cl_context,
                                     options=api_options,
                                     include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)

        # Replicate the photons to the rest of the slots if needed
        if ncopies > 1:
            max_blocks = 1024
            nthreads_per_block = 64
            for first_photon, photons_this_round, blocks in \
                    chunk_iterator(nphotons, nthreads_per_block, max_blocks):
                self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                                np.int32(photons_this_round),
                                                self.pos,
                                                self.dir,
                                                self.wavelengths,
                                                self.pol,
                                                self.t,
                                                self.flags,
                                                self.last_hit_triangles,
                                                self.weights,
                                                np.int32(ncopies - 1),
                                                np.int32(nphotons),
                                                block=(nthreads_per_block, 1,
                                                       1),
                                                grid=(blocks, 1))

        # Save the duplication information for the iterate_copies() method
        self.true_nphotons = nphotons
        self.ncopies = ncopies
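
A hedged construction sketch: the class name GPUPhotons is assumed from context, photons stands for a chroma.event.Photons instance carrying the fields read above, and ctx is a previously created OpenCL context.

# Illustrative only.
gpu_photons = GPUPhotons(photons)                                # CUDA backend, single copy
# gpu_photons = GPUPhotons(photons, ncopies=10, cl_context=ctx)  # OpenCL backend,
#                                                                # 10 replicas for a likelihood scan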
Code example #4
File: bvh.py  Project: NuTufts/ChromaUBooNE
def concatenate_layers(layers):
    nthreads_per_block = 1024
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Put 0 at beginning of list
    layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0)

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        totsize = 0
        layer_pos = []
        print layer_bounds[-1]
        for n, layer in enumerate(layers):
            layer_pos.append(totsize)
            print "LAYER ", n, " size=", len(layer), "start=", totsize
            totsize += len(layer)
        print "totsize: ", totsize
        nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4)
        nodes_iter_gpu = ga.to_device(queue, nodes_iter_np)
        nodeset_np = []
    else:
        raise RuntimeError('API neither CUDA nor OpenCL?!')

    ilayer = 0
    for layer_start, layer_end, layer in zip(layer_bounds[:-1],
                                             layer_bounds[1:], layers):
        if layer_end == layer_bounds[-1]:
            # leaf nodes need no offset
            child_offset = 0
        else:
            child_offset = layer_end

        #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset
        nmax_blocks = 10000
        if gpuapi.is_gpu_api_opencl():
            nthreads_per_block = 256
            nmax_blocks = 1
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks):
            #print "   ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start
            if gpuapi.is_gpu_api_cuda():
                bvh_funcs.copy_and_offset(np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          cuda.In(layer),
                                          nodes[layer_start:],
                                          block=(nthreads_per_block, 1, 1),
                                          grid=(nblocks_this_iter, 1))
            elif gpuapi.is_gpu_api_opencl():
                layer_gpu = ga.to_device(queue, layer)
                bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1),
                                          (1, 1, 1),
                                          np.uint32(first_index),
                                          np.uint32(elements_this_iter),
                                          np.uint32(child_offset),
                                          np.uint32(layer_start),
                                          layer_gpu.data,
                                          nodes_iter_gpu.data,
                                          g_times_l=True).wait()
            else:
                raise RuntimeError('API neither CUDA nor OpenCL?!')
        ilayer += 1

    if gpuapi.is_gpu_api_cuda():
        return nodes.get(), layer_bounds
    elif gpuapi.is_gpu_api_opencl():
        return nodes_iter_gpu.get(), layer_bounds
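
A hedged call sketch for concatenate_layers; the two layers are toy uint4 arrays and ga again stands for the gpuarray module used above:

# Illustrative only: flatten a two-layer tree (1 root node + 8 leaves).
import numpy as np
root_layer = np.zeros(1, dtype=ga.vec.uint4)
leaf_layer = np.zeros(8, dtype=ga.vec.uint4)
flat_nodes, layer_bounds = concatenate_layers([root_layer, leaf_layer])
# layer_bounds == [0, 1, 9]: cumulative start offset of each layer in flat_nodes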
Code example #5
File: bvh.py  Project: NuTufts/ChromaUBooNE
def merge_nodes(nodes, degree, max_ratio=None):
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # determine number of parents
    nparent = len(nodes) / degree
    if len(nodes) % degree != 0:
        nparent += 1

    if nparent == 1:
        nparent_pad = nparent
    else:
        nparent_pad = round_up_to_multiple(nparent, 1)  #degree

    # allocate memory
    if gpuapi.is_gpu_api_cuda():
        gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
        gpu_nodes = ga.to_device(queue, nodes)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # run kernel
    if gpuapi.is_gpu_api_cuda():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
            bvh_funcs.make_parents(np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree),
                                   gpu_parent_nodes,
                                   cuda.In(nodes),
                                   np.uint32(0),
                                   np.uint32(len(nodes)),
                                   block=(nthreads_per_block, 1, 1),
                                   grid=(nblocks_this_iter, 1))
    elif gpuapi.is_gpu_api_opencl():
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(nparent, nthreads_per_block, max_blocks=1):
            bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None,
                                   np.uint32(first_index),
                                   np.uint32(elements_this_iter),
                                   np.uint32(degree), gpu_parent_nodes.data,
                                   gpu_nodes.data, np.uint32(0),
                                   np.uint32(len(nodes))).wait()
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    parent_nodes = gpu_parent_nodes.get()

    if max_ratio is not None:
        areas = node_areas(parent_nodes)
        child_areas = node_areas(nodes)

        excessive_area = np.zeros(shape=len(areas), dtype=bool)
        for i, parent_area in enumerate(areas):
            nchild = parent_nodes['w'][i] >> CHILD_BITS
            child_index = parent_nodes['w'][i] & ~NCHILD_MASK
            child_area = child_areas[child_index:child_index + nchild].sum()
            #if parent_area > 1e9:
            #    print i, 'Children: %e, Parent: %e' % (child_area, parent_area)
            if child_area / parent_area < 0.3:
                excessive_area[i] = True
                #print i, 'Children: %e, Parent: %e' % (child_area, parent_area)

        extra_slots = round_up_to_multiple(
            (degree - 1) * np.count_nonzero(excessive_area), 1)
        print 'Extra slots:', extra_slots
        new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots,
                                    dtype=parent_nodes.dtype)
        new_parent_nodes[:len(parent_nodes)] = parent_nodes

        offset = 0
        for count, index in enumerate(np.argwhere(excessive_area)):
            index = index[0] + offset
            nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes[index] = nodes[child_index]
            #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index
            tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS
            tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK
            new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | (
                tmp_child_index + len(nodes))

            if nchild == 1:
                continue

            # slide everyone over
            #print index, nchild, len(new_parent_nodes)
            new_parent_nodes[index + nchild:] = new_parent_nodes[index +
                                                                 1:-nchild + 1]
            offset += nchild - 1
            for sibling in xrange(nchild - 1):
                new_parent_index = index + 1 + sibling
                new_parent_nodes[new_parent_index] = nodes[child_index +
                                                           sibling + 1]
                if new_parent_nodes['x'][new_parent_index] != 0:
                    tmp_nchild = new_parent_nodes['w'][
                        new_parent_index] >> CHILD_BITS
                    tmp_child_index = new_parent_nodes['w'][
                        new_parent_index] & ~NCHILD_MASK
                    new_parent_nodes['w'][
                        new_parent_index] = tmp_nchild << CHILD_BITS | (
                            tmp_child_index + len(nodes))

                    #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1)

            #print 'intermediate: %e' % node_areas(new_parent_nodes).max()
        print 'old: %e' % node_areas(parent_nodes).max()
        print 'new: %e' % node_areas(new_parent_nodes).max()
        if len(new_parent_nodes) < len(nodes):
            # Only adopt new set of parent nodes if it actually reduces the
            # total number of nodes at this level by 1.
            parent_nodes = new_parent_nodes

    return parent_nodes
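
A hedged call sketch for merge_nodes; child_nodes stands for a flat uint4 node array such as the one returned by concatenate_layers above:

# Illustrative only.
parent_nodes = merge_nodes(child_nodes, degree=4)
# Any non-None max_ratio enables the area check; note that the threshold itself
# is hard-coded to 0.3 in the body above, not taken from max_ratio.
parent_nodes = merge_nodes(child_nodes, degree=4, max_ratio=0.3)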
Code example #6
File: bvh.py  Project: NuTufts/ChromaUBooNE
def create_leaf_nodes(mesh,
                      morton_bits=16,
                      round_to_multiple=1,
                      nthreads_per_block=32,
                      max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to a multiple of this number.
        Extra nodes will be all zero.
        
    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                cltools.get_last_context(),
                                options=api_options,
                                include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory
    # unfortunately, opencl and cuda have different methods for managing memory here
    # we have to write divergent code
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)
        #print triangles[0:10]
        #print vertices[0:10]

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block,
                               max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles),
                                  cutools.Mapped(vertices),
                                  world_origin,
                                  world_scale,
                                  nodes,
                                  morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)
        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue,
                         shape=round_up_to_multiple(len(triangles),
                                                    round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        #world_origin = np.array(world_origin_np,dtype=np.float32)
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)
        #print world_origin, world_scale

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            print first_index, elements_this_iter, nblocks_this_iter
            bvh_funcs.make_leaves(
                queue,
                (nblocks_this_iter, 1, 1),
                (nthreads_per_block, 1, 1),
                #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None,
                np.uint32(first_index),
                np.uint32(elements_this_iter),
                triangles_dev.data,
                vertices_dev.data,
                world_origin,
                world_scale,
                nodes.data,
                morton_codes.data,
                g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
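
A hedged usage sketch: mesh stands for a chroma.geometry.Mesh as described in the docstring, and the Morton-curve sort is shown only as the typical next step in BVH construction, not something create_leaf_nodes does itself.

# Illustrative only.
import numpy as np
world_coords, leaf_nodes, morton_codes = create_leaf_nodes(mesh, morton_bits=16)
order = np.argsort(morton_codes)        # order the leaves along the Morton curve
leaf_nodes = leaf_nodes[order]
morton_codes = morton_codes[order]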
Code example #7
File: bvh.py  Project: NuTufts/ChromaUBooNE
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Merges nodes into len(first_child) parent nodes, using
    the provided arrays to determine the index of the first
    child of each parent, and how many children there are.'''
    nthreads_per_block = 256
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        #print context
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu',
                                options=api_options,
                                include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl',
                                context,
                                options=api_options,
                                include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    bvh_funcs = GPUFuncs(bvh_module)

    # Load Memory
    if gpuapi.is_gpu_api_cuda():
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))

        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    elif gpuapi.is_gpu_api_opencl():
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4)
        gpu_parent_nodes = ga.to_device(queue, parent_nodes_np)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Run Kernel
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(nparent, nthreads_per_block, max_blocks=10000):
        if gpuapi.is_gpu_api_cuda():
            bvh_funcs.make_parents_detailed(np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes,
                                            gpu_parent_nodes,
                                            gpu_first_child,
                                            gpu_nchild,
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(nblocks_this_iter, 1))
        elif gpuapi.is_gpu_api_opencl():
            bvh_funcs.make_parents_detailed(queue, (elements_this_iter, 1, 1),
                                            None, np.uint32(first_index),
                                            np.uint32(elements_this_iter),
                                            gpu_nodes.data,
                                            gpu_parent_nodes.data,
                                            gpu_first_child.data,
                                            gpu_nchild.data).wait()
        else:
            raise RuntimeError('API is neither CUDA nor OpenCL?!')

    return gpu_parent_nodes.get()
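
A hedged call sketch; first_child/nchild below describe two parents with four children each, and child_nodes stands for the flat child node array:

# Illustrative only.
import numpy as np
first_child = np.array([0, 4])          # index of each parent's first child
nchild = np.array([4, 4])               # number of children per parent
parent_nodes = merge_nodes_detailed(child_nodes, first_child, nchild)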
Code example #8
File: test_texture.py  Project: NuTufts/ChromaUBooNE
                dump_node_info=True)
sim = Simulation(geo, geant4_processes=0)
origin = geo.bvh.world_coords.world_origin

nodes = sim.gpu_geometry.nodes
extra_node = sim.gpu_geometry.extra_nodes
triangles = sim.gpu_geometry.triangles
vertices = sim.gpu_geometry.vertices
print vertices.shape
vertices4 = np.zeros((len(vertices), 4), dtype=np.float32)
print vertices.get().ravel().view(np.float32).shape
vertices4[:, :-1] = vertices.get().ravel().view(np.float32).reshape(
    len(vertices), 3)

module = get_module('test_texture.cu',
                    options=api_options,
                    include_source_directory=True)
gpu_funcs = GPUFuncs(module)
node_texture_ref = module.get_texref("node_tex_ref")
extra_node_texture_ref = module.get_texref("extra_node_tex_ref")
triangles_texture_ref = module.get_texref("triangles_tex_ref")
vertices_texture_ref = module.get_texref("vertices_tex_ref")

node_vec_texture_ref = module.get_texref("nodevec_tex_ref")
node_vec_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32, 4)

ur_nodes = nodes.get().ravel().view(np.uint32)
ur_nodes_gpu = ga.to_gpu(ur_nodes)
ur_nodes_gpu.bind_to_texref_ext(node_texture_ref)
nodes_nbytes = ur_nodes.nbytes
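
The remaining texture references are presumably bound the same way; a hedged continuation using the arrays defined above (the actual script may differ):

# Illustrative only: repeat the bind pattern for the other texture references.
ur_triangles = triangles.get().ravel().view(np.uint32)
ur_triangles_gpu = ga.to_gpu(ur_triangles)
ur_triangles_gpu.bind_to_texref_ext(triangles_texture_ref)

vertices4_gpu = ga.to_gpu(vertices4.ravel())
vertices4_gpu.bind_to_texref_ext(vertices_texture_ref)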
Code example #9
    def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                            nodes, workgroupsize, comqueue):
        module = get_module('wq_checknode.cl',
                            self.context,
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_device(
            comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_device(
            comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_device(comqueue,
                                   sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_device(
            comqueue, sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_device(comqueue,
                                    sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin_gpu
        world_scale = gpugeo.world_scale
        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)] = np.arange(0,
                                                           len(photons.pos),
                                                           dtype=np.int32)[:]
        queue_photon_index[len(photons.pos):] = (
            np.ones(len(photons.pos), dtype=np.int32) * -1)[:]
        queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos),
                                                      dtype=np.int32)[:]
        a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)
        workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)
        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(comqueue, 1, dtype=np.int32)
        node_front_end = ga.empty(comqueue, 1, dtype=np.int32)
        workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1))
        workgroup_daughter = cl.LocalMemory(c.nbytes *
                                            (max_nodes_can_store + 1))
        workgroup_sibling = cl.LocalMemory(c.nbytes *
                                           (max_nodes_can_store + 1))
        workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
        max_loops = 32

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node

        start_queue = time.time()
        gpu_funcs.checknode(
            comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1),
            np.int32(max_loops), photon_pos.data, photon_dir.data,
            photon_current_node.data,
            photon_tested_node.data, photon_last_result.data,
            np.int32(len(nodes)), nodes.data, node_parent.data,
            node_first_daughter.data, node_sibling.data, node_aunt.data,
            world_origin.data, world_scale, queue_size,
            queue_photon_index.data, queue_slot_flag.data,
            np.int32(len(photon_pos)), np.int32(workgroupsize),
            workgroup_photons, workgroup_current_node, workgroup_tested_node,
            max_nodes_can_store, workgroup_nodes, workgroup_daughter,
            workgroup_sibling, workgroup_aunt, loaded_node_start_index,
            loaded_node_end_index, node_front_start.data,
            node_front_end.data).wait()
        end_queue = time.time()

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test, result)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        print photon_last_result.get()

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 32):
            y = x + 32
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 32):
            y = x + 32
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
        return
Code example #10
    def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                          nodes, workgroupsize):
        module = get_module('wq_checknode.cu',
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_gpu(
            1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_gpu(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_gpu(
            sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin
        world_scale = gpugeo.world_scale

        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)].set(
            np.arange(0, len(photons.pos), dtype=np.int32)[:])
        queue_photon_index[len(photons.pos):].set(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        queue_slot_flag[0:len(photons.pos)].set(
            np.ones(len(photons.pos), dtype=np.int32)[:])
        a = ga.zeros(1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)

        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(1, dtype=np.int32)
        node_front_end = ga.empty(1, dtype=np.int32)

        max_loops = 1000

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node
        print "STARTING QUEUE"
        print queue_photon_index

        start_queue = time.time()
        gpu_funcs.checknode(np.int32(max_loops),
                            photon_pos,
                            photon_dir,
                            photon_current_node,
                            photon_tested_node,
                            photon_last_result,
                            np.int32(len(nodes)),
                            nodes,
                            node_parent,
                            node_first_daughter,
                            node_sibling,
                            node_aunt,
                            world_origin,
                            world_scale,
                            queue_size,
                            queue_photon_index,
                            queue_slot_flag,
                            np.int32(len(photon_pos)),
                            max_nodes_can_store,
                            loaded_node_start_index,
                            loaded_node_end_index,
                            node_front_start,
                            node_front_end,
                            block=(workgroupsize, 1, 1),
                            grid=(1, 1),
                            shared=4 *
                            (7 * max_nodes_can_store + 3 * workgroupsize + 1))
        cuda.Context.get_current().synchronize()
        end_queue = time.time()

        nactive = len(np.argwhere(queue_slot_flag.get() == 1))

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        np_photon_results = photon_last_result.get()
        for x in xrange(0, len(np_photon_results), 10):
            y = x + 10
            if y > len(np_photon_results):
                y = len(np_photon_results)
            print x, ": ", np_photon_results[x:y]

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 10):
            y = x + 10
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS: ", nactive, " threads"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 10):
            y = x + 10
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
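
A hedged dispatch sketch showing how a caller might route between the two backend-specific methods above; everything except the two method names is an assumption (including the api alias and the wrapper's signature):

    def check_node(self, sim, photons, ourphotons, max_shared_nodes, nodes,
                   workgroupsize, comqueue=None):
        # Illustrative only: pick the backend-specific implementation.
        if api.is_gpu_api_cuda():
            return self._call_cuda_kernel(sim, photons, ourphotons,
                                          max_shared_nodes, nodes,
                                          workgroupsize)
        elif api.is_gpu_api_opencl():
            return self._call_opencl_kernel(sim, photons, ourphotons,
                                            max_shared_nodes, nodes,
                                            workgroupsize, comqueue)
        raise RuntimeError('API is neither CUDA nor OpenCL')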
Code example #11
    def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1,
                 seed=None, cl_context=None):
        """
        Generates photons from information in the steps_arr
        
        Parameters
        ----------
        steps_arr : numpy.array with shape=(N,10) dtype=np.float
           contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constant, slow_time_constant ]
           in the future this could be generalized to many different time components.
           developed for liquid argon TPCs.
        multiple : float
           scale up the number of photons generated (not implemented yet)
        """
        self.steps_array = steps_arr
        self.nsteps = self.steps_array.shape[0]
        if multiple!=1.0:
            raise RuntimeError('Have not implemented scaling of the number of photons generated.')

        # ===========================
        # GEN PHOTONS
        tstart_genphotons =  time.time()
        # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here)
        # on the CPU, we scan the steps to determine the total number of photons using poisson statistics
        # we assume the user has seeded the random number generator to her liking
        tstart_nphotons = time.time()
        self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 )
        #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int )
        self.nphotons_per_step = self.steps_array[ :, self._nphotons ]
        self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() )
        print "NSTEPS: ",self.nsteps
        print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons
        # now we make an index array for which step we need to get info from
        self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 )
        current_index=0
        for n, n_per_step in enumerate( self.nphotons_per_step ):
            self.source_step_index[current_index:current_index+n_per_step] = n
            current_index += n_per_step
        # push everything to the GPU
        tstart_transfer = time.time()
        if api.is_gpu_api_cuda():
            # step info
            self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio )
            self.source_step_index_gpu = ga.to_gpu( self.source_step_index )
            # photon info
            self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) )
            self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32)
        elif api.is_gpu_api_opencl():
            cl_queue = cl.CommandQueue( cl_context )
            # step info
            self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
            self.step_fsratio_gpu  = ga.to_device( cl_queue, self.step_fsratio )
            self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index )
            # photon info
            self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 )
            self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32)
            self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32)
            self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32)
            self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32)
        
        self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) )
        self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) )
        self.t.set( self.steps_array[:,3] )
        self.ncopies = ncopies
        self.true_nphotons = self.nphotons

        if self.ncopies!=1:
            raise ValueError('support for multiple copies not supported')

        if api.is_gpu_api_cuda():
            self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True )
        elif api.is_gpu_api_opencl():
            self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True )
        self.gpufuncs = GPUFuncs( self.gpumod )
        print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer

        # need random numbers
        tgpu = time.time()
        if seed is None:
            seed = 5
        rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context)
        for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
            if api.is_gpu_api_cuda():
                self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                                    self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states,
                                                    self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights,
                                                    block=(nthreads_per_block,1,1), grid=(blocks, 1) )
            elif api.is_gpu_api_opencl():
                self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None,
                                                    np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                                    self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                                    np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst]  ), np.float32( 128.0 ),
                                                    rng_states.data,
                                                    self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, 
                                                    self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait()
                                                    
            else:
                raise RuntimeError("GPU API is neither CUDA nor OpenCL!")
        if api.is_gpu_api_cuda():
            cuda.Context.get_current().synchronize()
        tend_genphotons =  time.time()
        print "GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")"

        # Now load modules
        if api.is_gpu_api_cuda():
            self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
        elif  api.is_gpu_api_opencl():
            self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
        # define the texture references
        self.define_texture_references()
        # get kernel functions
        self.gpu_funcs = GPUFuncs(self.module)
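
A hedged construction sketch; the class name GPUPhotonFromSteps is taken from the timing message printed above, steps_arr stands for an array laid out as in the docstring, and ctx is only needed on the OpenCL backend:

# Illustrative only.
gen = GPUPhotonFromSteps(steps_arr, nthreads_per_block=64, max_blocks=1024,
                         seed=12345, cl_context=ctx)
pos_gpu = gen.pos        # generated photon positions, still on the device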
Code example #12
File: linalg_test.py  Project: NuTufts/ChromaUBooNE
import os, sys
os.environ["PYOPENCL_CTX"] ='1'

import numpy as np
import pyopencl as cl
import pyopencl.array as clarray
import chroma.gpu.tools as tools

float3 = clarray.vec.float3
print "float3 type: ",float3
ctx = tools.get_context()
queue = cl.CommandQueue(ctx)
dev = ctx.get_info( cl.context_info.DEVICES )[0]
print 'device %s' % dev.get_info( cl.device_info.NAME )

mod = tools.get_module( 'linalg_test.cl', ctx, include_source_directory=False )

size = {'block': (256,), 'grid': (1,)}
a_np = np.zeros((size['block'][0],3), dtype=np.float32)
b_np = np.zeros((size['block'][0],3), dtype=np.float32)
c_np = np.float32(np.random.random_sample())
mf = cl.mem_flags

a_vec_np = np.zeros(size['block'][0], dtype=float3)
b_vec_np = np.zeros(size['block'][0], dtype=float3)
d_vec_np = np.zeros(size['block'][0], dtype=float3)
#c_vec_np = np.float32(np.random.random_sample())

#float3add = mod.get_function('float3add')
#float3addequal = mod.get_function('float3addequal')
#float3sub = mod.get_function('float3sub')