def __init__(self, photons, ncopies=1):
    """Copy ``photons`` into GPU arrays, with room for ``ncopies`` replicas.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons on the GPU.
            Useful when the same event is propagated many times,
            for example in a likelihood calculation.

            GPU storage grows proportionally with ``ncopies``,
            so be careful.
    """
    nphotons = len(photons)
    total_slots = nphotons * ncopies

    # One uninitialised device array per photon attribute, each long
    # enough for every copy.  Allocation order: pos, dir, pol,
    # wavelengths, t, last_hit_triangles, flags, weights.
    vec3 = ga.vec.float3
    layout = (
        ('pos', vec3),
        ('dir', vec3),
        ('pol', vec3),
        ('wavelengths', np.float32),
        ('t', np.float32),
        ('last_hit_triangles', np.int32),
        ('flags', np.uint32),
        ('weights', np.float32),
    )
    for attr, dtype in layout:
        setattr(self, attr, ga.empty(shape=total_slots, dtype=dtype))

    # The supplied photons occupy the first ``nphotons`` slots (which is
    # the whole array when ncopies is 1).
    head = slice(None, nphotons)
    self.pos[head].set(to_float3(photons.pos))
    self.dir[head].set(to_float3(photons.dir))
    self.pol[head].set(to_float3(photons.pol))
    self.wavelengths[head].set(photons.wavelengths.astype(np.float32))
    self.t[head].set(photons.t.astype(np.float32))
    self.last_hit_triangles[head].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[head].set(photons.flags.astype(np.uint32))
    self.weights[head].set(photons.weights.astype(np.float32))

    self.gpu_funcs = GPUFuncs(
        get_cu_module('propagate.cu', options=cuda_options))

    # Fill the remaining slots with replicas, one kernel launch per chunk.
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        launch_block = (nthreads_per_block, 1, 1)
        chunks = chunk_iterator(nphotons, nthreads_per_block, max_blocks)
        for start, count, nblocks in chunks:
            self.gpu_funcs.photon_duplicate(
                np.int32(start), np.int32(count),
                self.pos, self.dir, self.wavelengths, self.pol,
                self.t, self.flags, self.last_hit_triangles,
                self.weights,
                np.int32(ncopies - 1), np.int32(nphotons),
                block=launch_block, grid=(nblocks, 1))

    # Remembered for the iterate_copies() method.
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def marshall_photons(self, photons, ncopies):
    """Allocate the photon arrays on the GPU and load ``photons`` into them.

    The provided photons are assigned to the beginning of each array
    (possibly the entire array if ``ncopies`` is 1).  When ``ncopies`` is
    greater than 1 the ``photon_duplicate`` kernel is launched once per
    chunk to fill the remaining slots.

    Args:
        - photons: object with ``pos``, ``dir``, ``pol``, ``wavelengths``,
            ``t``, ``last_hit_triangles``, ``flags`` and ``weights``
            arrays, and a length.
        - ncopies: int
            Number of copies of each photon the arrays must hold.

    Requires ``self.gpu_funcs`` to be set already when ``ncopies > 1``.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies

    self.pos = ga.empty(shape=nslots, dtype=ga.vec.float3)
    self.dir = ga.empty(shape=nslots, dtype=ga.vec.float3)
    self.pol = ga.empty(shape=nslots, dtype=ga.vec.float3)
    self.wavelengths = ga.empty(shape=nslots, dtype=np.float32)
    self.t = ga.empty(shape=nslots, dtype=np.float32)
    self.last_hit_triangles = ga.empty(shape=nslots, dtype=np.int32)
    self.flags = ga.empty(shape=nslots, dtype=np.uint32)
    self.weights = ga.empty(shape=nslots, dtype=np.float32)

    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        block = (nthreads_per_block, 1, 1)
        # The launch must happen inside the loop: each iteration covers a
        # different range of source photons, so launching only after the
        # loop would duplicate just the final chunk.
        for first_photon, photons_this_round, blocks in chunk_iterator(
                nphotons, nthreads_per_block, max_blocks):
            args = (
                np.int32(first_photon),
                np.int32(photons_this_round),
                self.pos,
                self.dir,
                self.wavelengths,
                self.pol,
                self.t,
                self.flags,
                self.last_hit_triangles,
                self.weights,
                np.int32(ncopies - 1),
                np.int32(nphotons),
            )
            self.gpu_funcs.photon_duplicate(
                *args, block=block, grid=(blocks, 1))
def test_rotate():
    """Compare the GPU ``rotate_gpu`` kernel against the host ``rotate``.

    Random vectors are rotated by random angles (scaled into [0, 2*pi))
    about one random normalised axis; the device result must agree with
    the host result to within ``atol=1e-5``.  The elapsed wall time of the
    kernel launch plus context synchronisation is printed.
    """
    count = nthreads_per_block * blocks

    # Host inputs.  The order of the random draws is kept fixed so that a
    # seeded run stays reproducible.
    vectors = np.random.rand(count, 3).astype(np.float32)
    angles = np.random.rand(count).astype(np.float32) * 2 * np.pi
    axis = normalize(np.random.rand(3))

    # Device inputs and output buffer.
    vectors_dev = ga.to_gpu(to_float3(vectors))
    angles_dev = ga.to_gpu(angles)
    result_dev = ga.empty(count, dtype=ga.vec.float3)

    started = time.time()
    rotate_gpu(
        vectors_dev,
        angles_dev,
        ga.vec.make_float3(*axis),
        result_dev,
        block=(nthreads_per_block, 1, 1),
        grid=(blocks, 1),
    )
    autoinit.context.synchronize()
    elapsed = time.time() - started

    print('elapsed %f sec' % elapsed)

    expected = rotate(vectors, angles, axis)
    # Reinterpret the float3 records as rows of three float32 values.
    actual = result_dev.get().view(np.float32).reshape((-1, 3))

    assert np.allclose(expected, actual, atol=1e-5)
def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
    """Upload ray origins and directions and set up the render buffers.

    Args:
        - pos: ray positions, converted with ``to_float3`` before upload.
        - dir: ray directions, converted with ``to_float3`` before upload.
        - max_alpha_depth: int, *optional*
            Number of ``dx`` / ``color`` entries reserved per ray.
        - nblocks: int, *optional*
            Stored on the instance as ``self.nblocks``.
    """
    self.max_alpha_depth = max_alpha_depth
    self.nblocks = nblocks

    self.pos = ga.to_gpu(to_float3(pos))
    self.dir = ga.to_gpu(to_float3(dir))

    # Kernel modules used for transforming and rendering the rays.
    self.transform_funcs = GPUFuncs(
        get_cu_module('transform.cu', options=cuda_options))
    self.render_funcs = GPUFuncs(
        get_cu_module('render.cu', options=cuda_options))

    # max_alpha_depth entries per ray for dx and color; one (zeroed)
    # counter per ray in dxlen.
    nrays = self.pos.size
    self.dx = ga.empty(max_alpha_depth * nrays, dtype=np.float32)
    self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
    self.dxlen = ga.zeros(nrays, dtype=np.uint32)
def __init__(self, pos, dir, max_alpha_depth=10, nblocks=64):
    """Move rays to the GPU and allocate per-ray rendering storage.

    Args:
        - pos: ray positions, passed through ``to_float3``.
        - dir: ray directions, passed through ``to_float3``.
        - max_alpha_depth: int, *optional*
            How many ``dx`` / ``color`` slots each ray gets.
        - nblocks: int, *optional*
            Kept as ``self.nblocks``.
    """
    device_pos = ga.to_gpu(to_float3(pos))
    device_dir = ga.to_gpu(to_float3(dir))
    self.pos, self.dir = device_pos, device_dir
    self.max_alpha_depth, self.nblocks = max_alpha_depth, nblocks

    # Compile/load the two CUDA modules and wrap their kernels.
    module_for_transform = get_cu_module('transform.cu', options=cuda_options)
    self.transform_funcs = GPUFuncs(module_for_transform)
    module_for_render = get_cu_module('render.cu', options=cuda_options)
    self.render_funcs = GPUFuncs(module_for_render)

    # dx and color hold max_alpha_depth entries for every ray; dxlen is a
    # zero-initialised per-ray counter.
    ray_count = device_pos.size
    self.dx = ga.empty(max_alpha_depth * ray_count, dtype=np.float32)
    self.color = ga.empty(self.dx.size, dtype=ga.vec.float4)
    self.dxlen = ga.zeros(ray_count, dtype=np.uint32)
def __init__(self, photons, ncopies=1, cl_context=None):
    """Copy ``photons`` to the device (CUDA or OpenCL), with replicas.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons on the GPU.
            Useful when the same event is propagated many times,
            for example in a likelihood calculation.

            GPU storage grows proportionally with ``ncopies``,
            so be careful.
        - cl_context: *optional*
            Handed to ``cl.CommandQueue`` and ``get_module`` on the
            OpenCL path; not used on the CUDA path.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies

    # Attributes allocated uninitialised, in allocation order.
    vec3 = ga.vec.float3
    photon_fields = (
        ('pos', vec3),
        ('dir', vec3),
        ('pol', vec3),
        ('wavelengths', np.float32),
        ('t', np.float32),
        ('last_hit_triangles', np.int32),
        ('flags', np.uint32),
        ('weights', np.float32),
    )

    # Allocate GPU memory for photon info; the two back ends differ only
    # in the leading queue argument.
    if api.is_gpu_api_cuda():
        for attr, dtype in photon_fields:
            setattr(self, attr, ga.empty(shape=nslots, dtype=dtype))
        self.current_node_index = ga.zeros(shape=nslots, dtype=np.uint32)  # deprecated
        self.requested_workcode = ga.empty(shape=nslots, dtype=np.uint32)  # deprecated
    elif api.is_gpu_api_opencl():
        queue = cl.CommandQueue(cl_context)
        for attr, dtype in photon_fields:
            setattr(self, attr, ga.empty(queue, shape=nslots, dtype=dtype))
        self.current_node_index = ga.zeros(queue, shape=nslots, dtype=np.uint32)  # deprecated
        self.requested_workcode = ga.empty(queue, shape=nslots, dtype=np.uint32)  # deprecated

    # The supplied photons go into the first ``nphotons`` slots (the whole
    # array when ncopies is 1).
    head = slice(None, nphotons)
    self.pos[head].set(to_float3(photons.pos))
    self.dir[head].set(to_float3(photons.dir))
    self.pol[head].set(to_float3(photons.pol))
    self.wavelengths[head].set(photons.wavelengths.astype(np.float32))
    self.t[head].set(photons.t.astype(np.float32))
    self.last_hit_triangles[head].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[head].set(photons.flags.astype(np.uint32))
    self.weights[head].set(photons.weights.astype(np.float32))

    # Load the propagation module for whichever back end is active.
    if api.is_gpu_api_cuda():
        self.module = get_module(
            'propagate.cu',
            options=api_options,
            include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = get_module(
            'propagate.cl',
            cl_context,
            options=api_options,
            include_source_directory=True)

    # define the texture references
    self.define_texture_references()

    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)

    # Fill the remaining slots with replicas, one launch per chunk.
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        launch_block = (nthreads_per_block, 1, 1)
        chunks = chunk_iterator(nphotons, nthreads_per_block, max_blocks)
        for start, count, nblocks in chunks:
            self.gpu_funcs.photon_duplicate(
                np.int32(start), np.int32(count),
                self.pos, self.dir, self.wavelengths, self.pol,
                self.t, self.flags, self.last_hit_triangles,
                self.weights,
                np.int32(ncopies - 1), np.int32(nphotons),
                block=launch_block, grid=(nblocks, 1))

    # Remembered for the iterate_copies() method.
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.
      ``nthreads_per_block``: int
        Threads per block used for the ``make_leaves`` launches and for
        chunking the triangles.
      ``max_blocks``: int
        Maximum number of blocks per launch on the OpenCL path.
        NOTE(review): the CUDA path ignores this and always chunks with
        ``max_blocks=30000`` -- confirm whether that is intended.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.
        Must be <= 16 bits.
    '''
    # it would be nice not to duplicate code, make functions transparent...
    context = None
    queue = None
    if gpuapi.is_gpu_api_opencl():
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    if gpuapi.is_gpu_api_cuda():
        bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True)
    elif gpuapi.is_gpu_api_opencl():
        # don't like the last context method. trouble. trouble.
        bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates
    world_origin_np = mesh.vertices.min(axis=0)
    world_scale = np.max(
        (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2)

    world_coords = WorldCoords(world_origin=world_origin_np,
                               world_scale=world_scale)

    # Put triangles and vertices into host and device memory.
    # Unfortunately, opencl and cuda have different methods for managing
    # memory here, so we have to write divergent code.
    if gpuapi.is_gpu_api_cuda():
        # here cuda supports a nice feature where we allocate host and
        # device memory that are mapped onto one another.
        # no explicit requests for transfers here
        triangles = cutools.mapped_empty(shape=len(mesh.triangles),
                                         dtype=ga.vec.uint3,
                                         write_combined=True)
        triangles[:] = to_uint3(mesh.triangles)
        vertices = cutools.mapped_empty(shape=len(mesh.vertices),
                                        dtype=ga.vec.float3,
                                        write_combined=True)
        vertices[:] = to_float3(mesh.vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types
        world_origin = ga.vec.make_float3(*world_origin_np)
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks=30000):
            bvh_funcs.make_leaves(np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  cutools.Mapped(triangles), cutools.Mapped(vertices),
                                  world_origin, world_scale,
                                  nodes, morton_codes,
                                  block=(nthreads_per_block, 1, 1),
                                  grid=(nblocks_this_iter, 1))

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    elif gpuapi.is_gpu_api_opencl():
        # here we need to allocate a buffer on the host and on the device
        triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3)
        copy_to_uint3(mesh.triangles, triangles)
        vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3)
        copy_to_float3(mesh.vertices, vertices)

        # now create a buffer object on the device and push data to it
        triangles_dev = ga.to_device(queue, triangles)
        vertices_dev = ga.to_device(queue, vertices)

        # Call GPU to compute nodes
        nodes = ga.zeros(queue, shape=round_up_to_multiple(len(triangles), round_to_multiple),
                         dtype=ga.vec.uint4)
        morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64)

        # Convert world coords to GPU-friendly types: a one-element float3
        # record array filled component by component.
        world_origin = np.empty(1, dtype=ga.vec.float3)
        world_origin['x'] = world_origin_np[0]
        world_origin['y'] = world_origin_np[1]
        world_origin['z'] = world_origin_np[2]
        world_scale = np.float32(world_scale)

        # generate morton codes on GPU
        for first_index, elements_this_iter, nblocks_this_iter in \
                chunk_iterator(len(triangles), nthreads_per_block, max_blocks):
            # Progress output.  Written as a formatted string so it is
            # valid on both Python 2 and Python 3 and prints the same
            # text as the former ``print a, b, c`` statement.
            print('%s %s %s' % (first_index, elements_this_iter, nblocks_this_iter))
            bvh_funcs.make_leaves(queue, (nblocks_this_iter, 1, 1), (nthreads_per_block, 1, 1),
                                  np.uint32(first_index),
                                  np.uint32(elements_this_iter),
                                  triangles_dev.data, vertices_dev.data,
                                  world_origin, world_scale,
                                  nodes.data, morton_codes.data,
                                  g_times_l=True).wait()

        morton_codes_host = morton_codes.get() >> (16 - morton_bits)

    return world_coords, nodes.get(), morton_codes_host
def __init__(self, geometry, wavelengths=None, times=None, print_usage=False, min_free_gpu_mem=300e6):
    """Upload ``geometry`` (materials, surfaces, mesh and BVH) to the GPU.

    Args:
        - geometry: object providing ``unique_materials``,
            ``unique_surfaces``, ``mesh``, ``bvh``, the material/surface
            index arrays, ``colors`` and ``solid_id``.
        - wavelengths: array, *optional*
            Wavelength grid every spectral property is linearly
            interpolated onto.  Defaults to ``standard_wavelengths``.
            Must be equally spaced.
        - times: array, *optional*
            Time grid for the re-emission time CDFs.  Defaults to
            ``np.arange(0, 1000, 0.05)``.  Must be equally spaced when
            given.
        - print_usage: bool, *optional*
            If true, call ``self.print_device_usage()`` at the end.
        - min_free_gpu_mem: float, *optional*
            Amount of free device memory (compared against the values
            from ``cuda.mem_get_info()``) to leave untouched when
            deciding how much of the BVH, triangles and vertices to
            place on the GPU.

    Raises:
        ValueError: if ``wavelengths`` or ``times`` is not equally spaced.
        Exception: if any entry of ``geometry.unique_materials`` is None.
    """
    if wavelengths is None:
        wavelengths = standard_wavelengths

    try:
        # .item() raises ValueError unless there is exactly one distinct step
        wavelength_step = np.unique(np.diff(wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    if times is None:
        time_step = 0.05
        times = np.arange(0,1000,time_step)
    else:
        try:
            time_step = np.unique(np.diff(times)).item()
        except ValueError:
            raise ValueError('times must be equally spaced apart.')

    geometry_source = get_cu_source('geometry_types.h')
    material_struct_size = characterize.sizeof('Material', geometry_source)
    surface_struct_size = characterize.sizeof('Surface', geometry_source)
    dichroicprops_struct_size = characterize.sizeof('DichroicProps', geometry_source)
    geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    # material_data / surface_data hold Python references to everything
    # whose device address is written into a struct below.
    # NOTE(review): presumably make_gpu_struct stores only raw device
    # pointers, so dropping a reference here would let the allocation be
    # freed while still in use -- confirm against make_gpu_struct.
    self.material_data = []
    self.material_ptrs = []

    def interp_material_property(wavelengths, property):
        # note that it is essential that the material properties be
        # interpolated linearly. this fact is used in the propagation
        # code to guarantee that probabilities still sum to one.
        return np.interp(wavelengths, property[:,0], property[:,1]).astype(np.float32)

    def upload_components(grid, components):
        # Upload one array per component, keep the arrays alive in
        # material_data, and return a device array of pointers to them
        # (8 bytes per entry), or a null pointer if there are none.
        arrays_gpu = [ga.to_gpu(interp_material_property(grid, component))
                      for component in components]
        self.material_data.append(arrays_gpu)
        if len(arrays_gpu) == 0:
            return np.uint64(0)
        return make_gpu_struct(8*len(arrays_gpu), arrays_gpu)

    for material in geometry.unique_materials:
        if material is None:
            raise Exception('one or more triangles is missing a material.')

        refractive_index = interp_material_property(wavelengths, material.refractive_index)
        refractive_index_gpu = ga.to_gpu(refractive_index)
        absorption_length = interp_material_property(wavelengths, material.absorption_length)
        absorption_length_gpu = ga.to_gpu(absorption_length)
        scattering_length = interp_material_property(wavelengths, material.scattering_length)
        scattering_length_gpu = ga.to_gpu(scattering_length)

        num_comp = len(material.comp_reemission_prob)
        comp_reemission_prob_gpu = upload_components(wavelengths, material.comp_reemission_prob)

        assert num_comp == len(material.comp_reemission_wvl_cdf), 'component arrays must be same length'
        comp_reemission_wvl_cdf_gpu = upload_components(wavelengths, material.comp_reemission_wvl_cdf)

        # the time CDFs are sampled on the time grid, not the wavelength grid
        assert num_comp == len(material.comp_reemission_time_cdf), 'component arrays must be same length'
        comp_reemission_time_cdf_gpu = upload_components(times, material.comp_reemission_time_cdf)

        assert num_comp == len(material.comp_absorption_length), 'component arrays must be same length'
        comp_absorption_length_gpu = upload_components(wavelengths, material.comp_absorption_length)

        self.material_data.append(refractive_index_gpu)
        self.material_data.append(absorption_length_gpu)
        self.material_data.append(scattering_length_gpu)
        self.material_data.append(comp_reemission_prob_gpu)
        self.material_data.append(comp_reemission_wvl_cdf_gpu)
        self.material_data.append(comp_reemission_time_cdf_gpu)
        self.material_data.append(comp_absorption_length_gpu)

        material_gpu = \
            make_gpu_struct(material_struct_size,
                            [refractive_index_gpu, absorption_length_gpu,
                             scattering_length_gpu,
                             comp_reemission_prob_gpu,
                             comp_reemission_wvl_cdf_gpu,
                             comp_reemission_time_cdf_gpu,
                             comp_absorption_length_gpu,
                             np.uint32(num_comp),
                             np.uint32(len(wavelengths)),
                             np.float32(wavelength_step),
                             np.float32(wavelengths[0]),
                             np.uint32(len(times)),
                             np.float32(time_step),
                             np.float32(times[0])])

        self.material_ptrs.append(material_gpu)

    self.material_pointer_array = \
        make_gpu_struct(8*len(self.material_ptrs), self.material_ptrs)

    self.surface_data = []
    self.surface_ptrs = []

    for surface in geometry.unique_surfaces:
        if surface is None:
            # need something to copy to the surface array struct
            # that is the same size as a 64-bit pointer.
            # this pointer will never be used by the simulation.
            self.surface_ptrs.append(np.uint64(0))
            continue

        detect = interp_material_property(wavelengths, surface.detect)
        detect_gpu = ga.to_gpu(detect)
        absorb = interp_material_property(wavelengths, surface.absorb)
        absorb_gpu = ga.to_gpu(absorb)
        reemit = interp_material_property(wavelengths, surface.reemit)
        reemit_gpu = ga.to_gpu(reemit)
        reflect_diffuse = interp_material_property(wavelengths, surface.reflect_diffuse)
        reflect_diffuse_gpu = ga.to_gpu(reflect_diffuse)
        reflect_specular = interp_material_property(wavelengths, surface.reflect_specular)
        reflect_specular_gpu = ga.to_gpu(reflect_specular)
        eta = interp_material_property(wavelengths, surface.eta)
        eta_gpu = ga.to_gpu(eta)
        k = interp_material_property(wavelengths, surface.k)
        k_gpu = ga.to_gpu(k)
        reemission_cdf = interp_material_property(wavelengths, surface.reemission_cdf)
        reemission_cdf_gpu = ga.to_gpu(reemission_cdf)

        if surface.dichroic_props:
            props = surface.dichroic_props
            transmit_pointers = []
            reflect_pointers = []
            angles_gpu = ga.to_gpu(np.asarray(props.angles,dtype=np.float32))
            self.surface_data.append(angles_gpu)

            # one reflect and one transmit table per tabulated angle
            for angle_index in range(len(props.angles)):
                dichroic_reflect = interp_material_property(wavelengths, props.dichroic_reflect[angle_index])
                dichroic_reflect_gpu = ga.to_gpu(dichroic_reflect)
                self.surface_data.append(dichroic_reflect_gpu)
                reflect_pointers.append(dichroic_reflect_gpu)

                dichroic_transmit = interp_material_property(wavelengths, props.dichroic_transmit[angle_index])
                dichroic_transmit_gpu = ga.to_gpu(dichroic_transmit)
                self.surface_data.append(dichroic_transmit_gpu)
                transmit_pointers.append(dichroic_transmit_gpu)

            reflect_arr_gpu = make_gpu_struct(8*len(reflect_pointers),reflect_pointers)
            self.surface_data.append(reflect_arr_gpu)
            transmit_arr_gpu = make_gpu_struct(8*len(transmit_pointers), transmit_pointers)
            self.surface_data.append(transmit_arr_gpu)
            dichroic_props = make_gpu_struct(dichroicprops_struct_size,[angles_gpu,reflect_arr_gpu,transmit_arr_gpu,np.uint32(len(props.angles))])
        else:
            dichroic_props = np.uint64(0) #NULL

        self.surface_data.append(detect_gpu)
        self.surface_data.append(absorb_gpu)
        self.surface_data.append(reemit_gpu)
        self.surface_data.append(reflect_diffuse_gpu)
        self.surface_data.append(reflect_specular_gpu)
        self.surface_data.append(eta_gpu)
        self.surface_data.append(k_gpu)
        # reemission_cdf_gpu is referenced by the Surface struct below, so
        # it must be kept alive like every other table.
        self.surface_data.append(reemission_cdf_gpu)
        self.surface_data.append(dichroic_props)

        surface_gpu = \
            make_gpu_struct(surface_struct_size,
                            [detect_gpu, absorb_gpu, reemit_gpu,
                             reflect_diffuse_gpu,reflect_specular_gpu,
                             eta_gpu, k_gpu, reemission_cdf_gpu,
                             dichroic_props,
                             np.uint32(surface.model),
                             np.uint32(len(wavelengths)),
                             np.uint32(surface.transmissive),
                             np.float32(wavelength_step),
                             np.float32(wavelengths[0]),
                             np.float32(surface.thickness)])

        self.surface_ptrs.append(surface_gpu)

    self.surface_pointer_array = \
        make_gpu_struct(8*len(self.surface_ptrs), self.surface_ptrs)

    self.vertices = mapped_empty(shape=len(geometry.mesh.vertices),
                                 dtype=ga.vec.float3,
                                 write_combined=True)
    self.triangles = mapped_empty(shape=len(geometry.mesh.triangles),
                                  dtype=ga.vec.uint3,
                                  write_combined=True)
    self.vertices[:] = to_float3(geometry.mesh.vertices)
    self.triangles[:] = to_uint3(geometry.mesh.triangles)

    self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

    # Pack the low 8 bits of each index into one 32-bit code per triangle:
    # material1 in bits 24-31, material2 in bits 16-23, surface in bits 8-15.
    material_codes = (((geometry.material1_index & 0xff) << 24) |
                      ((geometry.material2_index & 0xff) << 16) |
                      ((geometry.surface_index & 0xff) << 8)).astype(np.uint32)
    self.material_codes = ga.to_gpu(material_codes)
    colors = geometry.colors.astype(np.uint32)
    self.colors = ga.to_gpu(colors)
    self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

    # Limit memory usage by splitting BVH into on and off-GPU parts
    gpu_free, gpu_total = cuda.mem_get_info()

    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(geometry.bvh.nodes)
    split_index = min(
        max(int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize),100),
        n_nodes
        )

    self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index)) # forbid zero size
    self.extra_nodes = mapped_empty(shape=n_extra,
                                    dtype=geometry.bvh.nodes.dtype,
                                    write_combined=True)
    if split_index < n_nodes:
        logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

    # See if there is enough memory to put the vertices and/or triangles
    # back on the GPU
    gpu_free, gpu_total = cuda.mem_get_info()
    if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
        self.triangles = ga.to_gpu(self.triangles)
        logger.info('Optimization: Sufficient memory to move triangles onto GPU')

    gpu_free, gpu_total = cuda.mem_get_info()
    if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
        self.vertices = ga.to_gpu(self.vertices)
        logger.info('Optimization: Sufficient memory to move vertices onto GPU')

    self.gpudata = make_gpu_struct(geometry_struct_size,
                                   [Mapped(self.vertices),
                                    Mapped(self.triangles),
                                    self.material_codes,
                                    self.colors, self.nodes,
                                    Mapped(self.extra_nodes),
                                    self.material_pointer_array,
                                    self.surface_pointer_array,
                                    self.world_origin,
                                    self.world_scale,
                                    np.int32(len(self.nodes))])

    self.geometry = geometry

    if print_usage:
        self.print_device_usage()
    logger.info(self.device_usage_str())
def __init__(self, photons, ncopies=1, copy_flags=True, copy_triangles=True, copy_weights=True):
    """Copy ``photons`` into GPU arrays, with room for ``ncopies`` replicas.

    Args:
        - photons: chroma.Event.Photons
            Photon state information to load onto GPU
        - ncopies: int, *optional*
            Number of times to replicate the photons on the GPU.
            Useful when the same event is propagated many times,
            for example in a likelihood calculation.

            GPU storage grows proportionally with ``ncopies``,
            so be careful.
        - copy_flags: bool, *optional*
            If false, ``photons.flags`` is not uploaded and the device
            flags are all zero.
        - copy_triangles: bool, *optional*
            If false, ``photons.last_hit_triangles`` is not uploaded and
            the device array is filled with -1.
        - copy_weights: bool, *optional*
            If false, ``photons.weights`` is not uploaded and the device
            weights are all one.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies
    vec3 = ga.vec.float3

    self.pos = ga.empty(shape=nslots, dtype=vec3)
    self.dir = ga.empty(shape=nslots, dtype=vec3)
    self.pol = ga.empty(shape=nslots, dtype=vec3)
    self.wavelengths = ga.empty(shape=nslots, dtype=np.float32)
    self.t = ga.empty(shape=nslots, dtype=np.float32)

    # Attributes that are not copied from ``photons`` get a constant
    # fill instead of being left uninitialised.
    self.last_hit_triangles = ga.empty(shape=nslots, dtype=np.int32)
    if not copy_triangles:
        self.last_hit_triangles.fill(-1)

    if copy_flags:
        self.flags = ga.empty(shape=nslots, dtype=np.uint32)
    else:
        self.flags = ga.zeros(shape=nslots, dtype=np.uint32)

    if copy_weights:
        self.weights = ga.empty(shape=nslots, dtype=np.float32)
    else:
        self.weights = ga.ones_like(self.last_hit_triangles, dtype=np.float32)

    # evidx has one entry per input photon; it is not scaled by ncopies.
    self.evidx = ga.empty(shape=nphotons, dtype=np.uint32)

    # The supplied photons go into the first ``nphotons`` slots (the whole
    # array when ncopies is 1).
    head = slice(None, nphotons)
    self.pos[head].set(to_float3(photons.pos))
    self.dir[head].set(to_float3(photons.dir))
    self.pol[head].set(to_float3(photons.pol))
    self.wavelengths[head].set(photons.wavelengths.astype(np.float32))
    self.t[head].set(photons.t.astype(np.float32))
    if copy_triangles:
        self.last_hit_triangles[head].set(
            photons.last_hit_triangles.astype(np.int32))
    if copy_flags:
        self.flags[head].set(photons.flags.astype(np.uint32))
    if copy_weights:
        self.weights[head].set(photons.weights.astype(np.float32))
    self.evidx[head].set(photons.evidx.astype(np.uint32))

    self.gpu_funcs = GPUFuncs(
        get_cu_module('propagate.cu', options=cuda_options))

    # Fill the remaining slots with replicas, one launch per chunk.
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        launch_block = (nthreads_per_block, 1, 1)
        chunks = chunk_iterator(nphotons, nthreads_per_block, max_blocks)
        for start, count, nblocks in chunks:
            self.gpu_funcs.photon_duplicate(
                np.int32(start), np.int32(count),
                self.pos, self.dir, self.wavelengths, self.pol,
                self.t, self.flags, self.last_hit_triangles,
                self.weights, self.evidx,
                np.int32(ncopies - 1), np.int32(nphotons),
                block=launch_block, grid=(nblocks, 1))

    # Remembered for the iterate_copies() method.
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6):
    """Upload ``geometry`` (materials, surfaces, mesh and BVH) to the GPU.

    Args:
        - geometry: object providing ``unique_materials``,
            ``unique_surfaces``, ``mesh``, ``bvh``, the material/surface
            index arrays, ``colors`` and ``solid_id``.
        - wavelengths: array, *optional*
            Grid every spectral property is linearly interpolated onto.
            Defaults to ``standard_wavelengths``.  Must be equally spaced.
        - print_usage: bool, *optional*
            If true, call ``self.print_device_usage()`` at the end.
        - min_free_gpu_mem: float, *optional*
            Free device memory (compared against the values from
            ``cuda.mem_get_info()``) to leave untouched when deciding
            what to place on the GPU.

    Raises:
        ValueError: if ``wavelengths`` is not equally spaced.
        Exception: if any entry of ``geometry.unique_materials`` is None.
    """
    if wavelengths is None:
        wavelengths = standard_wavelengths

    try:
        # .item() raises ValueError unless there is exactly one distinct step
        wavelength_step = np.unique(np.diff(wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')

    geometry_source = get_cu_source('geometry_types.h')
    material_struct_size = characterize.sizeof('Material', geometry_source)
    surface_struct_size = characterize.sizeof('Surface', geometry_source)
    geometry_struct_size = characterize.sizeof('Geometry', geometry_source)

    nwavelengths = len(wavelengths)

    def resample(grid, table):
        # Linear interpolation is essential here: the propagation code
        # relies on it to guarantee that probabilities still sum to one.
        xs, ys = table[:,0], table[:,1]
        return np.interp(grid, xs, ys).astype(np.float32)

    def upload(table):
        # Interpolate onto the wavelength grid, then copy to the device.
        return ga.to_gpu(resample(wavelengths, table))

    # ---- materials -------------------------------------------------
    self.material_data = []
    self.material_ptrs = []
    material_fields = ('refractive_index', 'absorption_length',
                       'scattering_length', 'reemission_prob',
                       'reemission_cdf')

    for material in geometry.unique_materials:
        if material is None:
            raise Exception('one or more triangles is missing a material.')

        tables_gpu = [upload(getattr(material, field))
                      for field in material_fields]
        # keep references to the device arrays on the instance
        self.material_data.extend(tables_gpu)

        scalars = [np.uint32(nwavelengths),
                   np.float32(wavelength_step),
                   np.float32(wavelengths[0])]
        self.material_ptrs.append(
            make_gpu_struct(material_struct_size, tables_gpu + scalars))

    self.material_pointer_array = make_gpu_struct(
        8*len(self.material_ptrs), self.material_ptrs)

    # ---- surfaces --------------------------------------------------
    self.surface_data = []
    self.surface_ptrs = []
    surface_fields = ('detect', 'absorb', 'reemit', 'reflect_diffuse',
                      'reflect_specular', 'eta', 'k', 'reemission_cdf')

    for surface in geometry.unique_surfaces:
        if surface is None:
            # A placeholder the same size as a 64-bit pointer is needed in
            # the surface array; the simulation never dereferences it.
            self.surface_ptrs.append(np.uint64(0))
            continue

        tables_gpu = [upload(getattr(surface, field))
                      for field in surface_fields]
        self.surface_data.extend(tables_gpu)

        scalars = [np.uint32(surface.model),
                   np.uint32(nwavelengths),
                   np.uint32(surface.transmissive),
                   np.float32(wavelength_step),
                   np.float32(wavelengths[0]),
                   np.float32(surface.thickness)]
        self.surface_ptrs.append(
            make_gpu_struct(surface_struct_size, tables_gpu + scalars))

    self.surface_pointer_array = make_gpu_struct(
        8*len(self.surface_ptrs), self.surface_ptrs)

    # ---- mesh ------------------------------------------------------
    mesh = geometry.mesh
    self.vertices = mapped_empty(shape=len(mesh.vertices),
                                 dtype=ga.vec.float3,
                                 write_combined=True)
    self.triangles = mapped_empty(shape=len(mesh.triangles),
                                  dtype=ga.vec.uint3,
                                  write_combined=True)
    self.vertices[:] = to_float3(mesh.vertices)
    self.triangles[:] = to_uint3(mesh.triangles)

    self.world_origin = ga.vec.make_float3(*geometry.bvh.world_coords.world_origin)
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)

    # One 32-bit code per triangle built from the low 8 bits of each
    # index: material1 -> bits 24-31, material2 -> bits 16-23,
    # surface -> bits 8-15.
    inner_bits = (geometry.material1_index & 0xff) << 24
    outer_bits = (geometry.material2_index & 0xff) << 16
    surface_bits = (geometry.surface_index & 0xff) << 8
    packed_codes = (inner_bits | outer_bits | surface_bits).astype(np.uint32)
    self.material_codes = ga.to_gpu(packed_codes)
    self.colors = ga.to_gpu(geometry.colors.astype(np.uint32))
    self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))

    # ---- BVH: split into on-GPU and off-GPU parts to limit memory ----
    gpu_free, gpu_total = cuda.mem_get_info()
    node_array_usage = geometry.bvh.nodes.nbytes  # not used further in this method

    # As many nodes as fit on the GPU, but at least 100 and at most the
    # number of nodes that actually exist.
    bvh_nodes = geometry.bvh.nodes
    n_nodes = len(bvh_nodes)
    n_fit = int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize)
    split_index = min(max(n_fit, 100), n_nodes)

    self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index))  # forbid zero size
    self.extra_nodes = mapped_empty(shape=n_extra,
                                    dtype=geometry.bvh.nodes.dtype,
                                    write_combined=True)
    if split_index < n_nodes:
        logger.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]

    # Move triangles, then vertices, into device memory when enough
    # free memory remains; free memory is re-queried before each check.
    gpu_free, gpu_total = cuda.mem_get_info()
    if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
        self.triangles = ga.to_gpu(self.triangles)
        logger.info('Optimization: Sufficient memory to move triangles onto GPU')

    gpu_free, gpu_total = cuda.mem_get_info()
    if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
        self.vertices = ga.to_gpu(self.vertices)
        logger.info('Optimization: Sufficient memory to move vertices onto GPU')

    geometry_members = [
        Mapped(self.vertices),
        Mapped(self.triangles),
        self.material_codes,
        self.colors,
        self.nodes,
        Mapped(self.extra_nodes),
        self.material_pointer_array,
        self.surface_pointer_array,
        self.world_origin,
        self.world_scale,
        np.int32(len(self.nodes)),
    ]
    self.gpudata = make_gpu_struct(geometry_struct_size, geometry_members)

    self.geometry = geometry

    if print_usage:
        self.print_device_usage()
    logger.info(self.device_usage_str())
def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1, seed=None, cl_context=None): """ Generates photons from information in the steps_arr Parameters ---------- steps_arr : numpy.array with shape=(N,10) dtype=np.float contains [ x1, y1, z1, t1, x2, y2, z2, nphotons, fast_to_slow_ratio, fast_time_constatn, slow_time_constatn ] in the future could generalize this to many different time components. developed for liquid argon TPCs. multiple : float scale up the number of photons generated (not implemented yet) """ self.steps_array = steps_arr self.nsteps = self.steps_array.shape[0] if multiple!=1.0: raise RuntimeError('Have not implemented scaling of the number of photons generated.') # =========================== # GEN PHOTONS tstart_genphotons = time.time() # we do the dumbest thing first (i.e., no attempt to do fancy GPU manipulations here) # on the CPU, we scan the steps to determine the total number of photons using poisson statistics # we assume the user has seeded the random number generator to her liking tstart_nphotons = time.time() self.step_fsratio = np.array( self.steps_array[:,self._fsratio], dtype=np.float32 ) #self.nphotons_per_step = np.array( [ np.random.poisson( z ) for z in self.steps_array[:,self._nphotons].ravel() ], dtype=np.int ) self.nphotons_per_step = self.steps_array[ self._nphotons, : ] self.nphotons = reduce( lambda x, y : x + y, self.nphotons_per_step.ravel() ) print "NSTEPS: ",self.nsteps print "NPHOTONS: ",self.nphotons," (time to determine per step=",time.time()-tstart_nphotons # now we make an index array for which step we need to get info from self.source_step_index = np.zeros( self.nphotons, dtype=np.int32 ) current_index=0 for n, n_per_step in enumerate( self.nphotons_per_step ): self.source_step_index[current_index:current_index+n_per_step] = n current_index += n_per_step # push everything to the GPU tstart_transfer = time.time() if api.is_gpu_api_cuda(): # step info self.step_pos1_gpu = 
ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_gpu( self.step_fsratio ) self.source_step_index_gpu = ga.to_gpu( self.source_step_index ) # photon info self.pos = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( shape=self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) self.t = ga.to_gpu( np.zeros(self.nphotons*ncopies, dtype=np.float32) ) self.last_hit_triangles = ga.empty(shape=self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty(shape=self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty(shape=self.nphotons*ncopies, dtype=np.float32) elif api.is_gpu_api_opencl(): cl_queue = cl.CommandQueue( cl_context ) # step info self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3) self.step_fsratio_gpu = ga.to_device( cl_queue, self.step_fsratio ) self.source_step_index_gpu = ga.to_device( cl_queue, self.source_step_index ) # photon info self.pos = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.dir = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.pol = ga.empty( cl_queue, self.nphotons, dtype=ga.vec.float3 ) self.wavelengths = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.t = ga.zeros( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.last_hit_triangles = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.int32) self.flags = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.uint32) self.weights = ga.empty( cl_queue, self.nphotons*ncopies, dtype=np.float32) self.step_pos1_gpu.set( to_float3( self.steps_array[:,0:3] ) ) self.step_pos2_gpu.set( to_float3( self.steps_array[:,4:7] ) ) self.t.set( self.steps_array[:,3] ) self.ncopies = ncopies self.true_nphotons = 
self.nphotons if self.ncopies!=1: raise ValueError('support for multiple copies not supported') if api.is_gpu_api_cuda(): self.gpumod = get_module( "gen_photon_from_step.cu", options=api_options, include_source_directory=True ) elif api.is_gpu_api_opencl(): self.gpumod = get_module( "gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True ) self.gpufuncs = GPUFuncs( self.gpumod ) print "gen photon mem alloc/transfer time=",time.time()-tstart_transfer # need random numbers tgpu = time.time() if seed==None: seed = 5 rng_states = get_rng_states(nthreads_per_block*max_blocks, seed=seed, cl_context=cl_context) for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks): if api.is_gpu_api_cuda(): self.gpufuncs.gen_photon_from_step( np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu, self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states, self.pos, self.dir, self.pol, self.t, self.wavelengths, self.last_hit_triangles, self.flags, self.weights, block=(nthreads_per_block,1,1), grid=(blocks, 1) ) elif api.is_gpu_api_opencl(): self.gpufuncs.gen_photon_from_step( cl_queue, ( photons_this_round, 1, 1), None, np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data, self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data, np.float32( self.steps_array[0,self._fconst] ), np.float32( self.steps_array[0,self._sconst] ), np.float32( 128.0 ), rng_states.data, self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data, self.last_hit_triangles.data, self.flags.data, self.weights.data, g_times_l=False ).wait() else: raise RuntimeError("GPU API is neither CUDA nor OpenCL!") if api.is_gpu_api_cuda(): cuda.Context.get_current().synchronize() tend_genphotons = time.time() print 
"GPUPhotonFromSteps: time to gen photons ",tend_genphotons-tstart_genphotons," secs (gpu time=",time.time()-tgpu,")" # Now load modules if api.is_gpu_api_cuda(): self.module = get_module('propagate.cu', options=api_options, include_source_directory=True) elif api.is_gpu_api_opencl(): self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True) # define the texture references self.define_texture_references() # get kernel functions self.gpu_funcs = GPUFuncs(self.module)
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
        The length is ``len(mesh.triangles)`` passed through
        ``round_up_to_multiple`` with ``round_to_multiple``.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.

    Raises ValueError if ``morton_bits`` is larger than 16.
    '''
    # The codes come back from the GPU and are right-shifted by
    # (16 - morton_bits); a value above 16 would make that shift count
    # negative, so reject it before doing any GPU work.
    if morton_bits > 16:
        raise ValueError('morton_bits must be <= 16, got %r' % (morton_bits,))

    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates: the origin is the per-axis minimum and the
    # scale maps the largest extent onto (2**16 - 2) steps
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block,1,1),
                              grid=(nblocks_this_iter,1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host
def __init__(self, geometry, wavelengths=None, print_usage=False, min_free_gpu_mem=300e6, cl_context=None, cl_queue=None):
    """Package ``geometry`` for the GPU (CUDA or OpenCL path).

    Args:
        - geometry:
            Geometry object; this method reads its ``mesh.vertices``,
            ``mesh.triangles``, ``bvh.nodes``, ``bvh.world_coords``,
            ``material1_index``, ``material2_index``, ``surface_index``,
            ``colors``, ``solid_id``, ``unique_materials`` and
            ``unique_surfaces``.
        - wavelengths: *optional*
            Wavelength grid; ``standard_wavelengths`` is used when None.
            Must be equally spaced.
        - print_usage: bool
            If true, ``print_device_usage`` is called once more at the end.
        - min_free_gpu_mem: number
            Amount subtracted from the free-memory figure before deciding
            how many BVH nodes (and, on CUDA, whether triangles/vertices)
            are placed on the GPU.
        - cl_context, cl_queue:
            Only used on the OpenCL path.

    Raises:
        ValueError if the wavelengths are not equally spaced, or if the
        number of unique materials or surfaces is >= 0xff.
        AssertionError if ``self.instance_count`` is not 1 after the
        increment below.
    """
    log.info("GPUGeometry.__init__ min_free_gpu_mem %s ", min_free_gpu_mem)
    self.geometry = geometry
    # NOTE(review): augmented assignment through ``self`` stores the result
    # on the instance; if instance_count is a class attribute it is not
    # updated, so the assert below may never trip -- confirm intent.
    self.instance_count += 1
    # The assert message expression (print_stack) is only evaluated on failure.
    assert self.instance_count == 1, traceback.print_stack()
    # Metadata checkpoints ('a' .. 'g') are recorded as construction proceeds.
    self.metadata = Metadata()
    self.metadata(None, 'preinfo')
    self.metadata('a', "start")
    self.metadata['a_min_free_gpu_mem'] = min_free_gpu_mem
    if wavelengths is None:
        self.wavelengths = standard_wavelengths
    else:
        self.wavelengths = wavelengths
    try:
        # .item() raises ValueError unless there is exactly one unique spacing
        self.wavelength_step = np.unique(np.diff(self.wavelengths)).item()
    except ValueError:
        raise ValueError('wavelengths must be equally spaced apart.')
    # this is where things get difficult.
    # pycuda and pyopencl gives us very different methods for working with structs
    #geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
    # Note: the two packaging helpers return differently shaped results
    # (CUDA: data, pointers, pointer array; OpenCL: data, byte count).
    if api.is_gpu_api_cuda():
        self.material_data, self.material_ptrs, self.material_pointer_array = self._package_material_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
        self.surface_data, self.surface_ptrs, self.surface_pointer_array = self._package_surface_data_cuda(
            geometry, self.wavelengths, self.wavelength_step)
    elif api.is_gpu_api_opencl():
        self.material_data, materials_bytes_cl = self._package_material_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths, self.wavelength_step)
        self.surface_data, surfaces_bytes_cl = self._package_surface_data_cl(
            cl_context, cl_queue, geometry, self.wavelengths, self.wavelength_step)
    self.metadata('b', "after materials,surfaces")
    if api.is_gpu_api_opencl():
        self.metadata[ 'b_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl # opencl, we have to track this ourselves
    # Load Vertices and Triangles
    if api.is_gpu_api_cuda():
        # CUDA: arrays from mapped_empty plus 4-component host copies
        # (last component left at zero) kept "for textures".
        self.vertices = mapped_empty(shape=len(geometry.mesh.vertices), dtype=ga.vec.float3, write_combined=True)
        self.vertices4 = np.zeros(shape=(len(self.vertices), 4), dtype=np.float32)
        self.triangles = mapped_empty(shape=len(geometry.mesh.triangles), dtype=ga.vec.uint3, write_combined=True)
        self.triangles4 = np.zeros(shape=(len(self.triangles), 4), dtype=np.uint32)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.vertices4[:, :-1] = self.vertices.ravel().view( np.float32).reshape(len(self.vertices), 3) # for textures
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
        self.triangles4[:, :-1] = self.triangles.ravel().view( np.uint32).reshape(len(self.triangles), 3) # for textures
    elif api.is_gpu_api_opencl():
        self.vertices = ga.empty(cl_queue, len(geometry.mesh.vertices), dtype=ga.vec.float3)
        self.triangles = ga.empty(cl_queue, len(geometry.mesh.triangles), dtype=ga.vec.uint3)
        self.vertices[:] = to_float3(geometry.mesh.vertices)
        self.triangles[:] = to_uint3(geometry.mesh.triangles)
    if api.is_gpu_api_cuda():
        self.world_origin = ga.vec.make_float3( *geometry.bvh.world_coords.world_origin)
    elif api.is_gpu_api_opencl():
        self.world_origin = ga.vec.make_float3( *geometry.bvh.world_coords.world_origin)
        #self.world_origin = geometry.bvh.world_coords.world_origin
        # OpenCL: the origin is additionally copied into a device array
        self.world_origin = ga.to_device(cl_queue, self.world_origin)
        print type(self.world_origin), self.world_origin
    self.world_scale = np.float32(geometry.bvh.world_coords.world_scale)
    # Load material and surface indices into 8-bit codes
    # check if we've reached a complexity threshold
    if len(geometry.unique_materials) >= int(0xff):
        raise ValueError( 'Number of materials to index has hit maximum of %d' % (int(0xff)))
    if len(geometry.unique_surfaces) >= int(0xff):
        raise ValueError( 'Number of surfaces to index has hit maximum of %d' % (int(0xff)))
    # make bit code: material1 in bits 24-31, material2 in bits 16-23,
    # surface in bits 8-15 of one uint32 per entry
    material_codes = (((geometry.material1_index & 0xff) << 24) | ((geometry.material2_index & 0xff) << 16) | ((geometry.surface_index & 0xff) << 8)).astype( np.uint32)
    if api.is_gpu_api_cuda():
        self.material_codes = ga.to_gpu(material_codes)
    elif api.is_gpu_api_opencl():
        self.material_codes = ga.to_device(cl_queue, material_codes)
    # assign color codes
    colors = geometry.colors.astype(np.uint32)
    if api.is_gpu_api_cuda():
        self.colors = ga.to_gpu(colors)
        self.solid_id_map = ga.to_gpu(geometry.solid_id.astype(np.uint32))
    elif api.is_gpu_api_opencl():
        self.colors = ga.to_device(cl_queue, colors)
        self.solid_id_map = ga.to_device( cl_queue, geometry.solid_id.astype(np.uint32))
    # Limit memory usage by splitting BVH into on and off-GPU parts
    self.metadata('c', "after colors, idmap")
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        # OpenCL: free memory is estimated from the byte counts tracked so far
        gpu_total = self.metadata['gpu_total']
        meshdef_nbytes_cl = self.vertices.nbytes + self.triangles.nbytes + self.world_origin.nbytes + self.world_scale.nbytes + self.material_codes.nbytes + self.colors.nbytes + self.solid_id_map.nbytes
        self.metadata[ 'c_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl
        gpu_free = gpu_total - (materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl)
    # Figure out how many elements we can fit on the GPU,
    # but no fewer than 100 elements, and no more than the number of actual nodes
    n_nodes = len(geometry.bvh.nodes)
    split_index = min( max( int((gpu_free - min_free_gpu_mem) / geometry.bvh.nodes.itemsize), 100), n_nodes)
    print "split index=", split_index, " vs. total nodes=", n_nodes
    # push nodes to GPU
    if api.is_gpu_api_cuda():
        self.nodes = ga.to_gpu(geometry.bvh.nodes[:split_index])
    elif api.is_gpu_api_opencl():
        self.nodes = ga.to_device(cl_queue, geometry.bvh.nodes[:split_index])
    n_extra = max(1, (n_nodes - split_index)) # forbid zero size
    # left over nodes
    if api.is_gpu_api_cuda():
        self.extra_nodes = mapped_empty(shape=n_extra, dtype=geometry.bvh.nodes.dtype, write_combined=True)
    elif api.is_gpu_api_opencl():
        self.extra_nodes = ga.empty(cl_queue, shape=n_extra, dtype=geometry.bvh.nodes.dtype)
    if split_index < n_nodes:
        log.info('Splitting BVH between GPU and CPU memory at node %d' % split_index)
        self.extra_nodes[:] = geometry.bvh.nodes[split_index:]
        splitting = 1
    else:
        splitting = 0
    self.metadata('d', "after nodes")
    if api.is_gpu_api_opencl():
        nodes_nbytes_cl = self.nodes.nbytes
        self.metadata[ 'd_gpu_used'] = materials_bytes_cl + surfaces_bytes_cl + meshdef_nbytes_cl + nodes_nbytes_cl
    self.metadata.array("d_nodes", geometry.bvh.nodes)
    self.metadata['d_split_index'] = split_index
    self.metadata['d_extra_nodes_count'] = n_extra
    self.metadata['d_splitting'] = splitting
    self.print_device_usage(cl_context=cl_context)
    # CUDA See if there is enough memory to put the vertices and/or triangles back on the GPU
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']
    self.metadata.array('e_triangles', self.triangles)
    if api.is_gpu_api_cuda():
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            self.triangles = ga.to_gpu(self.triangles)
            log.info( 'Optimization: Sufficient memory to move triangles onto GPU' )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0
    elif api.is_gpu_api_opencl():
        # OpenCL: only the flag and log message are produced here; the
        # transfer itself is commented out.
        if self.triangles.nbytes < (gpu_free - min_free_gpu_mem):
            #self.triangles = ga.to_device(cl_queue,self.triangles)
            log.info( 'Optimization: Sufficient memory to move triangles onto GPU' )
            ftriangles_gpu = 1
        else:
            log.warn('using host mapped memory triangles')
            ftriangles_gpu = 0
    self.metadata('e', "after triangles")
    self.metadata['e_triangles_gpu'] = ftriangles_gpu
    if api.is_gpu_api_cuda():
        # re-query: the triangles may just have been moved onto the GPU
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        gpu_total = self.metadata['gpu_total']
        gpu_free = gpu_total - self.metadata['d_gpu_used']
    self.metadata.array('f_vertices', self.vertices)
    if api.is_gpu_api_cuda():
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            self.vertices = ga.to_gpu(self.vertices)
            log.info( 'Optimization: Sufficient memory to move vertices onto GPU' )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0
    elif api.is_gpu_api_opencl():
        # OpenCL: as for the triangles, only the flag/log are produced
        if self.vertices.nbytes < (gpu_free - min_free_gpu_mem):
            #self.vertices = ga.to_gpu(self.vertices)
            log.info( 'Optimization: Sufficient memory to move vertices onto GPU' )
            vertices_gpu = 1
        else:
            log.warn('using host mapped memory vertices')
            vertices_gpu = 0
    self.metadata('f', "after vertices")
    self.metadata['f_vertices_gpu'] = vertices_gpu
    if api.is_gpu_api_cuda():
        # CUDA: build the Geometry struct; its size is taken from the
        # declaration in geometry_types.h
        geometry_source = cutools.get_cu_source('geometry_types.h')
        geometry_struct_size = characterize.sizeof('Geometry', geometry_source)
        self.gpudata = make_gpu_struct(geometry_struct_size, [
            Mapped(self.vertices), Mapped(self.triangles),
            self.material_codes, self.colors,
            self.nodes, Mapped(self.extra_nodes),
            self.material_pointer_array, self.surface_pointer_array,
            self.world_origin, self.world_scale,
            np.int32(len(self.nodes)) ])
    elif api.is_gpu_api_opencl():
        # No relevant way to pass struct into OpenCL kernel. We have to pass everything by arrays
        # We then build a geometry struct later in the kernel
        # provided below is example/test of passing the data
        #if True: # for debuggin
        # The block below is disabled (``if False``) and never runs; when
        # enabled it ends by raising RuntimeError('bail').
        if False: # print "loading geometry_structs.cl"
            geostructsmod = cltools.get_cl_module( "geometry_structs.cl", cl_context, options=cltools.cl_options, include_source_directory=True)
            geostructsfunc = GPUFuncs(geostructsmod)
            geostructsfunc.make_geostruct(
                cl_queue, (3, ), None,
                self.vertices.data, self.triangles.data,
                self.material_codes.data, self.colors.data,
                self.nodes.data, self.extra_nodes.data,
                np.int32(len(geometry.unique_materials)),
                self.material_data['refractive_index'].data,
                self.material_data['absorption_length'].data,
                self.material_data['scattering_length'].data,
                self.material_data['reemission_prob'].data,
                self.material_data['reemission_cdf'].data,
                np.int32(len(geometry.unique_surfaces)),
                self.surface_data['detect'].data,
                self.surface_data['absorb'].data,
                self.surface_data['reemit'].data,
                self.surface_data['reflect_diffuse'].data,
                self.surface_data['reflect_specular'].data,
                self.surface_data['eta'].data,
                self.surface_data['k'].data,
                self.surface_data['reemission_cdf'].data,
                self.surface_data['model'].data,
                self.surface_data['transmissive'].data,
                self.surface_data['thickness'].data,
                self.surface_data['nplanes'].data,
                self.surface_data['wire_diameter'].data,
                self.surface_data['wire_pitch'].data,
                self.world_origin.data, self.world_scale,
                np.int32(len(self.nodes)),
                self.material_data['n'],
                self.material_data['step'],
                self.material_data["wavelength0"])
            cl_queue.finish()
            self.material_codes.get()
            raise RuntimeError('bail')
    if print_usage:
        self.print_device_usage(cl_context=cl_context)
    log.info(self.device_usage_str(cl_context=cl_context))
    self.metadata('g', "after geometry struct")
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1):
    '''Compute the leaf nodes surrounding a triangle mesh.

      ``mesh``: chroma.geometry.Mesh
        Triangles to box
      ``morton_bits``: int
        Number of bits to use per dimension when computing Morton code.
        Must be <= 16.
      ``round_to_multiple``: int
        Round the number of nodes created up to multiple of this number
        Extra nodes will be all zero.

    Returns (world_coords, nodes, morton_codes), where
      ``world_coords``: chroma.bvh.WorldCoords
        Defines the fixed point coordinate system
      ``nodes``: ndarray(dtype=uint4)
        List of leaf nodes.  Child IDs will be set to triangle offsets.
        The length is ``len(mesh.triangles)`` passed through
        ``round_up_to_multiple`` with ``round_to_multiple``.
      ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64)
        Morton codes for each triangle, using ``morton_bits`` per axis.

    Raises ValueError if ``morton_bits`` is larger than 16.
    '''
    # The final right shift is by (16 - morton_bits); anything above 16
    # would turn that into a negative shift count, so fail early and
    # clearly instead of after the GPU pass.
    if morton_bits > 16:
        raise ValueError('morton_bits must be <= 16, got %r' % (morton_bits,))

    # Load GPU functions
    bvh_module = get_cu_module('bvh.cu', options=cuda_options,
                               include_source_directory=True)
    bvh_funcs = GPUFuncs(bvh_module)

    # compute world coordinates: origin at the per-axis minimum, scale chosen
    # so that the largest extent spans (2**16 - 2) steps
    world_origin = mesh.vertices.min(axis=0)
    world_scale = np.max((mesh.vertices.max(axis=0) - world_origin)) \
        / (2**16 - 2)
    world_coords = WorldCoords(world_origin=world_origin,
                               world_scale=world_scale)

    # Put triangles and vertices in mapped host memory
    triangles = mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3,
                             write_combined=True)
    triangles[:] = to_uint3(mesh.triangles)
    vertices = mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3,
                            write_combined=True)
    vertices[:] = to_float3(mesh.vertices)

    # Call GPU to compute nodes
    nodes = ga.zeros(shape=round_up_to_multiple(len(triangles),
                                                round_to_multiple),
                     dtype=ga.vec.uint4)
    morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64)

    # Convert world coords to GPU-friendly types
    world_origin = ga.vec.make_float3(*world_origin)
    world_scale = np.float32(world_scale)

    nthreads_per_block = 256
    for first_index, elements_this_iter, nblocks_this_iter in \
            chunk_iterator(len(triangles), nthreads_per_block,
                           max_blocks=30000):
        bvh_funcs.make_leaves(np.uint32(first_index),
                              np.uint32(elements_this_iter),
                              Mapped(triangles), Mapped(vertices),
                              world_origin, world_scale,
                              nodes, morton_codes,
                              block=(nthreads_per_block, 1, 1),
                              grid=(nblocks_this_iter, 1))

    morton_codes_host = morton_codes.get() >> (16 - morton_bits)
    return world_coords, nodes.get(), morton_codes_host