def setUp(self):
    """Build the GPU fixture used by the tests of this case.

    Stores the context returned by ``cltools.get_last_context()``,
    compiles ``test_sample_cdf.cl`` with ``-I.`` prepended to the
    project-wide ``api_options``, wraps the compiled module in
    ``GPUFuncs``, asks ``clrand`` for RNG states sized by
    ``nthreads_per_block`` (256) and opens ``output_sample_cdf.root``
    in ``RECREATE`` mode.
    """
    ctx = cltools.get_last_context()
    self.context = ctx

    threads = 256
    self.nthreads_per_block = threads

    # The kernel source includes headers relative to the working directory.
    build_options = ('-I.', ) + api_options
    self.myoptions = build_options

    compiled = get_module("test_sample_cdf.cl",
                          ctx,
                          options=build_options,
                          include_source_directory=True)
    self.mod = compiled
    self.funcs = GPUFuncs(compiled)

    self.rng_states = clrand.get_rng_states(ctx, threads)
    self.outf = rt.TFile("output_sample_cdf.root", "RECREATE")
def collapse_chains(nodes, layer_bounds):
    """Run the ``collapse_child`` kernel over every non-leaf layer.

    ``nodes`` is uploaded to the device once.  Consecutive entries of
    ``layer_bounds`` delimit the layers; the final (start, end) pair is
    dropped and the remaining layers are visited from last to first.
    The modified node array is copied back to the host and returned.

    Raises ``RuntimeError`` when the active GPU API is neither CUDA nor
    OpenCL.
    """
    use_cuda = gpuapi.is_gpu_api_cuda()
    use_opencl = (not use_cuda) and gpuapi.is_gpu_api_opencl()
    if not (use_cuda or use_opencl):
        raise RuntimeError('API neither CUDA or OpenCL')

    # Compile the kernels and push the node array to the device.
    if use_cuda:
        kernels = GPUFuncs(get_module('bvh.cu',
                                      options=api_options,
                                      include_source_directory=True))
        device_nodes = ga.to_gpu(nodes)
    else:
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)
        kernels = GPUFuncs(get_module('bvh.cl',
                                      context,
                                      options=api_options,
                                      include_source_directory=True))
        device_nodes = ga.to_device(queue, nodes)

    # All (start, end) layer ranges except the last one, deepest first.
    layer_ranges = list(zip(layer_bounds[:-1], layer_bounds[1:]))[:-1]

    threads_per_block = 256
    for start, end in reversed(layer_ranges):
        if use_cuda:
            kernels.collapse_child(np.uint32(start),
                                   np.uint32(end),
                                   device_nodes,
                                   block=(threads_per_block, 1, 1),
                                   grid=(120, 1))
        else:
            # One work item per node of the layer; wait before the next layer.
            launch = kernels.collapse_child(queue, (end - start, 1, 1), None,
                                            np.uint32(start),
                                            np.uint32(end),
                                            device_nodes.data)
            launch.wait()

    return device_nodes.get()
def __init__(self, photons, ncopies=1, cl_context=None):
    """Load ``photons`` onto the GPU, replicating as requested.

    Args:
        - photons: chroma.Event.Photons
           Photon state information to load onto GPU
        - ncopies: int, *optional*
           Number of times to replicate the photons
           on the GPU.  This is used if you want
           to propagate the same event many times,
           for example in a likelihood calculation.

           The amount of GPU storage will be proportionally
           larger if ncopies > 1, so be careful.
        - cl_context: *optional*
           Context used to create the command queue and to build
           ``propagate.cl`` on the OpenCL path.  Not read on the
           CUDA path.

    Raises:
        RuntimeError: the active GPU API is neither CUDA nor OpenCL.
    """
    nphotons = len(photons)
    nslots = nphotons * ncopies

    # The two APIs differ only in whether the array factories take a
    # command queue as first argument, so bind that once here.
    if api.is_gpu_api_cuda():

        def alloc(factory, dtype):
            return factory(shape=nslots, dtype=dtype)
    elif api.is_gpu_api_opencl():
        queue = cl.CommandQueue(cl_context)

        def alloc(factory, dtype):
            return factory(queue, shape=nslots, dtype=dtype)
    else:
        # Without this the failure would surface later as an
        # AttributeError on self.pos.
        raise RuntimeError('API is neither CUDA nor OpenCL?!')

    # Allocate GPU memory for photon info
    self.pos = alloc(ga.empty, ga.vec.float3)
    self.dir = alloc(ga.empty, ga.vec.float3)
    self.pol = alloc(ga.empty, ga.vec.float3)
    self.wavelengths = alloc(ga.empty, np.float32)
    self.t = alloc(ga.empty, np.float32)
    self.last_hit_triangles = alloc(ga.empty, np.int32)
    self.flags = alloc(ga.empty, np.uint32)
    self.weights = alloc(ga.empty, np.float32)
    self.current_node_index = alloc(ga.zeros, np.uint32)  # deprecated
    self.requested_workcode = alloc(ga.empty, np.uint32)  # deprecated

    # Assign the provided photons to the beginning (possibly
    # the entire array if ncopies is 1)
    self.pos[:nphotons].set(to_float3(photons.pos))
    self.dir[:nphotons].set(to_float3(photons.dir))
    self.pol[:nphotons].set(to_float3(photons.pol))
    self.wavelengths[:nphotons].set(photons.wavelengths.astype(np.float32))
    self.t[:nphotons].set(photons.t.astype(np.float32))
    self.last_hit_triangles[:nphotons].set(
        photons.last_hit_triangles.astype(np.int32))
    self.flags[:nphotons].set(photons.flags.astype(np.uint32))
    self.weights[:nphotons].set(photons.weights.astype(np.float32))

    if api.is_gpu_api_cuda():
        self.module = get_module('propagate.cu',
                                 options=api_options,
                                 include_source_directory=True)
    elif api.is_gpu_api_opencl():
        self.module = get_module('propagate.cl',
                                 cl_context,
                                 options=api_options,
                                 include_source_directory=True)

    # define the texture references
    self.define_texture_references()

    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)

    # Replicate the photons to the rest of the slots if needed
    if ncopies > 1:
        max_blocks = 1024
        nthreads_per_block = 64
        # NOTE(review): this launch passes CUDA-style block=/grid= keywords
        # on both API paths -- confirm the OpenCL wrapper accepts them.
        for first_photon, photons_this_round, blocks in \
                chunk_iterator(nphotons, nthreads_per_block, max_blocks):
            self.gpu_funcs.photon_duplicate(np.int32(first_photon),
                                            np.int32(photons_this_round),
                                            self.pos,
                                            self.dir,
                                            self.wavelengths,
                                            self.pol,
                                            self.t,
                                            self.flags,
                                            self.last_hit_triangles,
                                            self.weights,
                                            np.int32(ncopies - 1),
                                            np.int32(nphotons),
                                            block=(nthreads_per_block, 1, 1),
                                            grid=(blocks, 1))

    # Save the duplication information for the iterate_copies() method
    self.true_nphotons = nphotons
    self.ncopies = ncopies
def concatenate_layers(layers): nthreads_per_block = 1024 context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) else: raise RuntimeError('API neither CUDA nor OpenCL?!') bvh_funcs = GPUFuncs(bvh_module) # Put 0 at beginning of list layer_bounds = np.insert(np.cumsum(map(len, layers)), 0, 0) # allocate memory if gpuapi.is_gpu_api_cuda(): nodes = ga.empty(shape=int(layer_bounds[-1]), dtype=ga.vec.uint4) elif gpuapi.is_gpu_api_opencl(): totsize = 0 layer_pos = [] print layer_bounds[-1] for n, layer in enumerate(layers): layer_pos.append(totsize) print "LAYER ", n, " size=", len(layer), "start=", totsize totsize += len(layer) print "totsize: ", totsize nodes_iter_np = np.empty(totsize, dtype=ga.vec.uint4) nodes_iter_gpu = ga.to_device(queue, nodes_iter_np) nodeset_np = [] else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer = 0 for layer_start, layer_end, layer in zip(layer_bounds[:-1], layer_bounds[1:], layers): if layer_end == layer_bounds[-1]: # leaf nodes need no offset child_offset = 0 else: child_offset = layer_end #print "ilayer,start,end,child_offset: ",ilayer,layer_start, layer_end, child_offset nmax_blocks = 10000 if gpuapi.is_gpu_api_opencl(): nthreads_per_block = 256 nmax_blocks = 1 for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(layer_end-layer_start, nthreads_per_block,max_blocks=nmax_blocks): #print " ",ilayer,first_index, elements_this_iter, nblocks_this_iter, layer_start if gpuapi.is_gpu_api_cuda(): bvh_funcs.copy_and_offset(np.uint32(first_index), np.uint32(elements_this_iter), 
np.uint32(child_offset), cuda.In(layer), nodes[layer_start:], block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) elif gpuapi.is_gpu_api_opencl(): layer_gpu = ga.to_device(queue, layer) bvh_funcs.copy_and_offset(queue, (elements_this_iter, 1, 1), (1, 1, 1), np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(child_offset), np.uint32(layer_start), layer_gpu.data, nodes_iter_gpu.data, g_times_l=True).wait() else: raise RuntimeError('API neither CUDA nor OpenCL?!') ilayer += 1 if gpuapi.is_gpu_api_cuda(): return nodes.get(), layer_bounds elif gpuapi.is_gpu_api_opencl(): return nodes_iter_gpu.get(), layer_bounds
def merge_nodes(nodes, degree, max_ratio=None): nthreads_per_block = 256 context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', context, options=api_options, include_source_directory=True) else: raise RuntimeError('API is neither CUDA nor OpenCL?!') bvh_funcs = GPUFuncs(bvh_module) # determine number of parents nparent = len(nodes) / degree if len(nodes) % degree != 0: nparent += 1 if nparent == 1: nparent_pad = nparent else: nparent_pad = round_up_to_multiple(nparent, 1) #degree # allocate memory if gpuapi.is_gpu_api_cuda(): gpu_parent_nodes = ga.zeros(shape=nparent_pad, dtype=ga.vec.uint4) elif gpuapi.is_gpu_api_opencl(): parent_nodes_np = np.zeros(shape=nparent, dtype=ga.vec.uint4) gpu_parent_nodes = ga.to_device(queue, parent_nodes_np) gpu_nodes = ga.to_device(queue, nodes) else: raise RuntimeError('API is neither CUDA nor OpenCL?!') # run kernel if gpuapi.is_gpu_api_cuda(): for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(nparent, nthreads_per_block, max_blocks=10000): bvh_funcs.make_parents(np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(degree), gpu_parent_nodes, cuda.In(nodes), np.uint32(0), np.uint32(len(nodes)), block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) elif gpuapi.is_gpu_api_opencl(): for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(nparent, nthreads_per_block, max_blocks=1): bvh_funcs.make_parents(queue, (elements_this_iter, 1, 1), None, np.uint32(first_index), np.uint32(elements_this_iter), np.uint32(degree), gpu_parent_nodes.data, gpu_nodes.data, np.uint32(0), np.uint32(len(nodes))).wait() else: raise RuntimeError('API is neither CUDA 
nor OpenCL?!') parent_nodes = gpu_parent_nodes.get() if max_ratio is not None: areas = node_areas(parent_nodes) child_areas = node_areas(nodes) excessive_area = np.zeros(shape=len(areas), dtype=bool) for i, parent_area in enumerate(areas): nchild = parent_nodes['w'][i] >> CHILD_BITS child_index = parent_nodes['w'][i] & ~NCHILD_MASK child_area = child_areas[child_index:child_index + nchild].sum() #if parent_area > 1e9: # print i, 'Children: %e, Parent: %e' % (child_area, parent_area) if child_area / parent_area < 0.3: excessive_area[i] = True #print i, 'Children: %e, Parent: %e' % (child_area, parent_area) extra_slots = round_up_to_multiple( (degree - 1) * np.count_nonzero(excessive_area), 1) print 'Extra slots:', extra_slots new_parent_nodes = np.zeros(shape=len(parent_nodes) + extra_slots, dtype=parent_nodes.dtype) new_parent_nodes[:len(parent_nodes)] = parent_nodes offset = 0 for count, index in enumerate(np.argwhere(excessive_area)): index = index[0] + offset nchild = new_parent_nodes['w'][index] >> CHILD_BITS child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK new_parent_nodes[index] = nodes[child_index] #new_parent_nodes['w'][index] = 1 << CHILD_BITS | child_index tmp_nchild = new_parent_nodes['w'][index] >> CHILD_BITS tmp_child_index = new_parent_nodes['w'][index] & ~NCHILD_MASK new_parent_nodes['w'][index] = tmp_nchild << CHILD_BITS | ( tmp_child_index + len(nodes)) if nchild == 1: continue # slide everyone over #print index, nchild, len(new_parent_nodes) new_parent_nodes[index + nchild:] = new_parent_nodes[index + 1:-nchild + 1] offset += nchild - 1 for sibling in xrange(nchild - 1): new_parent_index = index + 1 + sibling new_parent_nodes[new_parent_index] = nodes[child_index + sibling + 1] if new_parent_nodes['x'][new_parent_index] != 0: tmp_nchild = new_parent_nodes['w'][ new_parent_index] >> CHILD_BITS tmp_child_index = new_parent_nodes['w'][ new_parent_index] & ~NCHILD_MASK new_parent_nodes['w'][ new_parent_index] = tmp_nchild << CHILD_BITS | ( 
tmp_child_index + len(nodes)) #new_parent_nodes['w'][new_parent_index] = 1 << CHILD_BITS | (child_index + sibling + 1) #print 'intermediate: %e' % node_areas(new_parent_nodes).max() print 'old: %e' % node_areas(parent_nodes).max() print 'new: %e' % node_areas(new_parent_nodes).max() if len(new_parent_nodes) < len(nodes): # Only adopt new set of parent nodes if it actually reduces the # total number of nodes at this level by 1. parent_nodes = new_parent_nodes return parent_nodes
def create_leaf_nodes(mesh, morton_bits=16, round_to_multiple=1, nthreads_per_block=32, max_blocks=16): '''Compute the leaf nodes surrounding a triangle mesh. ``mesh``: chroma.geometry.Mesh Triangles to box ``morton_bits``: int Number of bits to use per dimension when computing Morton code. ``round_to_multiple``: int Round the number of nodes created up to multiple of this number Extra nodes will be all zero. Returns (world_coords, nodes, morton_codes), where ``world_coords``: chroma.bvh.WorldCoords Defines the fixed point coordinate system ``nodes``: ndarray(shape=len(mesh.triangles), dtype=uint4) List of leaf nodes. Child IDs will be set to triangle offsets. ``morton_codes``: ndarray(shape=len(mesh.triangles), dtype=np.uint64) Morton codes for each triangle, using ``morton_bits`` per axis. Must be <= 16 bits. ''' # it would be nice not to duplicate code, make functions transparent... context = None queue = None if gpuapi.is_gpu_api_opencl(): context = cltools.get_last_context() #print context queue = cl.CommandQueue(context) # Load GPU functions if gpuapi.is_gpu_api_cuda(): bvh_module = get_module('bvh.cu', options=api_options, include_source_directory=True) elif gpuapi.is_gpu_api_opencl(): # don't like the last context method. trouble. trouble. bvh_module = get_module('bvh.cl', cltools.get_last_context(), options=api_options, include_source_directory=True) bvh_funcs = GPUFuncs(bvh_module) # compute world coordinates world_origin_np = mesh.vertices.min(axis=0) world_scale = np.max( (mesh.vertices.max(axis=0) - world_origin_np)) / (2**16 - 2) world_coords = WorldCoords(world_origin=world_origin_np, world_scale=world_scale) # Put triangles and vertices into host and device memory # unfortunately, opencl and cuda has different methods for managing memory here # we have to write divergent code if gpuapi.is_gpu_api_cuda(): # here cuda supports a nice feature where we allocate host and device memory that are mapped onto one another. 
# no explicit requests for transfers here triangles = cutools.mapped_empty(shape=len(mesh.triangles), dtype=ga.vec.uint3, write_combined=True) triangles[:] = to_uint3(mesh.triangles) vertices = cutools.mapped_empty(shape=len(mesh.vertices), dtype=ga.vec.float3, write_combined=True) vertices[:] = to_float3(mesh.vertices) #print triangles[0:10] #print vertices[0:10] # Call GPU to compute nodes nodes = ga.zeros(shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types world_origin = ga.vec.make_float3(*world_origin_np) world_scale = np.float32(world_scale) # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks=30000): bvh_funcs.make_leaves(np.uint32(first_index), np.uint32(elements_this_iter), cutools.Mapped(triangles), cutools.Mapped(vertices), world_origin, world_scale, nodes, morton_codes, block=(nthreads_per_block, 1, 1), grid=(nblocks_this_iter, 1)) morton_codes_host = morton_codes.get() >> (16 - morton_bits) elif gpuapi.is_gpu_api_opencl(): # here we need to allocate a buffer on the host and on the device triangles = np.empty(len(mesh.triangles), dtype=ga.vec.uint3) copy_to_uint3(mesh.triangles, triangles) vertices = np.empty(len(mesh.vertices), dtype=ga.vec.float3) copy_to_float3(mesh.vertices, vertices) # now create a buffer object on the device and push data to it triangles_dev = ga.to_device(queue, triangles) vertices_dev = ga.to_device(queue, vertices) # Call GPU to compute nodes nodes = ga.zeros(queue, shape=round_up_to_multiple(len(triangles), round_to_multiple), dtype=ga.vec.uint4) morton_codes = ga.empty(queue, shape=len(triangles), dtype=np.uint64) # Convert world coords to GPU-friendly types #world_origin = np.array(world_origin_np,dtype=np.float32) world_origin = np.empty(1, dtype=ga.vec.float3) world_origin['x'] = 
world_origin_np[0] world_origin['y'] = world_origin_np[1] world_origin['z'] = world_origin_np[2] world_scale = np.float32(world_scale) #print world_origin, world_scale # generate morton codes on GPU for first_index, elements_this_iter, nblocks_this_iter in \ chunk_iterator(len(triangles), nthreads_per_block, max_blocks): print first_index, elements_this_iter, nblocks_this_iter bvh_funcs.make_leaves( queue, (nblocks_this_iter, 1, 1), (nthreads_per_block, 1, 1), #bvh_funcs.make_leaves( queue, (elements_this_iter,1,1), None, np.uint32(first_index), np.uint32(elements_this_iter), triangles_dev.data, vertices_dev.data, world_origin, world_scale, nodes.data, morton_codes.data, g_times_l=True).wait() morton_codes_host = morton_codes.get() >> (16 - morton_bits) return world_coords, nodes.get(), morton_codes_host
def merge_nodes_detailed(nodes, first_child, nchild):
    '''Build ``len(first_child)`` parent nodes from ``nodes``.

    ``first_child`` holds, for each parent, the index of its first child
    and ``nchild`` the number of children; both are converted to int32
    before being uploaded.  The ``make_parents_detailed`` kernel fills
    the parent array, which is copied back to the host and returned.

    Raises RuntimeError when the active GPU API is neither CUDA nor
    OpenCL.'''
    threads = 256

    on_opencl = gpuapi.is_gpu_api_opencl()
    context = None
    queue = None
    if on_opencl:
        context = cltools.get_last_context()
        queue = cl.CommandQueue(context)

    # Load GPU functions
    on_cuda = gpuapi.is_gpu_api_cuda()
    if on_cuda:
        compiled = get_module('bvh.cu',
                              options=api_options,
                              include_source_directory=True)
    elif on_opencl:
        # don't like the last context method. trouble. trouble.
        compiled = get_module('bvh.cl',
                              context,
                              options=api_options,
                              include_source_directory=True)
    else:
        raise RuntimeError('API is neither CUDA nor OpenCL?!')
    kernels = GPUFuncs(compiled)

    # Load Memory
    if on_cuda:
        gpu_nodes = ga.to_gpu(nodes)
        gpu_first_child = ga.to_gpu(first_child.astype(np.int32))
        gpu_nchild = ga.to_gpu(nchild.astype(np.int32))
        nparent = len(first_child)
        gpu_parent_nodes = ga.empty(shape=nparent, dtype=ga.vec.uint4)
    else:
        gpu_nodes = ga.to_device(queue, nodes)
        gpu_first_child = ga.to_device(queue, first_child.astype(np.int32))
        gpu_nchild = ga.to_device(queue, nchild.astype(np.int32))
        nparent = len(first_child)
        # The OpenCL path starts from a zero-filled host array.
        gpu_parent_nodes = ga.to_device(
            queue, np.zeros(shape=nparent, dtype=ga.vec.uint4))

    # Run Kernel
    chunks = chunk_iterator(nparent, threads, max_blocks=10000)
    for first, count, nblocks in chunks:
        if on_cuda:
            kernels.make_parents_detailed(np.uint32(first),
                                          np.uint32(count),
                                          gpu_nodes,
                                          gpu_parent_nodes,
                                          gpu_first_child,
                                          gpu_nchild,
                                          block=(threads, 1, 1),
                                          grid=(nblocks, 1))
        else:
            launch = kernels.make_parents_detailed(queue, (count, 1, 1), None,
                                                   np.uint32(first),
                                                   np.uint32(count),
                                                   gpu_nodes.data,
                                                   gpu_parent_nodes.data,
                                                   gpu_first_child.data,
                                                   gpu_nchild.data)
            launch.wait()

    return gpu_parent_nodes.get()
dump_node_info=True) sim = Simulation(geo, geant4_processes=0) origin = geo.bvh.world_coords.world_origin nodes = sim.gpu_geometry.nodes extra_node = sim.gpu_geometry.extra_nodes triangles = sim.gpu_geometry.triangles vertices = sim.gpu_geometry.vertices print vertices.shape vertices4 = np.zeros((len(vertices), 4), dtype=np.float32) print vertices.get().ravel().view(np.float32).shape vertices4[:, :-1] = vertices.get().ravel().view(np.float32).reshape( len(vertices), 3) module = get_module('test_texture.cu', options=api_options, include_source_directory=True) gpu_funcs = GPUFuncs(module) node_texture_ref = module.get_texref("node_tex_ref") extra_node_texture_ref = module.get_texref("extra_node_tex_ref") triangles_texture_ref = module.get_texref("triangles_tex_ref") vertices_texture_ref = module.get_texref("vertices_tex_ref") node_vec_texture_ref = module.get_texref("nodevec_tex_ref") node_vec_texture_ref.set_format(cuda.array_format.UNSIGNED_INT32, 4) ur_nodes = nodes.get().ravel().view(np.uint32) ur_nodes_gpu = ga.to_gpu(ur_nodes) ur_nodes_gpu.bind_to_texref_ext(node_texture_ref) nodes_nbytes = ur_nodes.nbytes
def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize, comqueue): module = get_module('wq_checknode.cl', self.context, options=api_options, include_source_directory=True) gpu_funcs = GPUFuncs(module) # gather variables for kernel call gpugeo = sim.gpu_geometry photon_pos = photons.pos photon_dir = photons.dir photon_current_node = photons.current_node_index photon_tested_node = ga.to_device( comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32)) photon_last_result = ga.to_device( comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32)) nodes = gpugeo.nodes node_parent = ga.to_device(comqueue, sim.detector.node_dsar_tree.parent) node_first_daughter = ga.to_device( comqueue, sim.detector.node_dsar_tree.first_daughter) node_sibling = ga.to_device(comqueue, sim.detector.node_dsar_tree.sibling) node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt) world_origin = gpugeo.world_origin_gpu world_scale = gpugeo.world_scale # make queue related variables queue_size = np.int32(len(photons.pos) * 2) queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32) queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32) queue_photon_index[0:len(photons.pos)] = np.arange(0, len(photons.pos), dtype=np.int32)[:] queue_photon_index[len(photons.pos):] = ( np.ones(len(photons.pos), dtype=np.int32) * -1)[:] queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos), dtype=np.int32)[:] a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4) b = np.array(1, dtype=np.int32) c = np.array(1, dtype=np.uint32) workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize) workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize) workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize) max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize) max_nodes_can_store -= max_nodes_can_store % 32 max_nodes_can_store = np.int32(max_nodes_can_store) loaded_node_start_index = np.int32(0) loaded_node_end_index = 
np.int32(1) node_front_start = ga.empty(comqueue, 1, dtype=np.int32) node_front_end = ga.empty(comqueue, 1, dtype=np.int32) workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1)) workgroup_daughter = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1)) workgroup_sibling = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1)) workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1)) max_loops = 32 if len(gpugeo.extra_nodes) > 1: raise RuntimeError('did not plan for there to be a node split.') print photon_current_node print photon_tested_node print queue_photon_index print queue_slot_flag print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index print "Max nodes in shared: ", max_nodes_can_store print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")" print "Available local memsize: ", self.shared_mem_size print "Total number of nodes: ", len( nodes), " (", nodes.nbytes, " bytes)" print "Stored node size: ", max_nodes_can_store * a.nbytes print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize print sim.detector.bvh.layer_bounds print "PRESUB CURRENT NODES" print photon_current_node print "PRESUB TESTED NODES" print photon_tested_node start_queue = time.time() gpu_funcs.checknode( comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1), np.int32(max_loops), photon_pos.data, photon_dir.data, photon_current_node.data, photon_tested_node.data, photon_last_result.data, np.int32(len(nodes)), nodes.data, node_parent.data, node_first_daughter.data, node_sibling.data, node_aunt.data, world_origin.data, world_scale, queue_size, queue_photon_index.data, queue_slot_flag.data, np.int32(len(photon_pos)), np.int32(workgroupsize), workgroup_photons, workgroup_current_node, workgroup_tested_node, max_nodes_can_store, workgroup_nodes, workgroup_daughter, workgroup_sibling, workgroup_aunt, loaded_node_start_index, loaded_node_end_index, 
node_front_start.data, node_front_end.data).wait() end_queue = time.time() print "CheckNode Queue returns. ", end_queue - start_queue, " seconds" print "(Current node, To Test, result)" node_states = zip(photon_current_node.get(), photon_tested_node.get(), photon_last_result.get()) for x in xrange(0, len(node_states), 10): y = x + 10 if y > len(node_states): y = len(node_states) print x, ": ", node_states[x:y] print "LAST RESULT:" print photon_last_result.get() print "PHOTON QUEUE" photon_queue = queue_photon_index.get() for x in xrange(0, len(photon_queue), 32): y = x + 32 if y > len(photon_queue): y = len(photon_queue) print x, ": ", photon_queue[x:y] print "QUEUE SLOT FLAGS" slot_flags = queue_slot_flag.get() for x in xrange(0, len(slot_flags), 32): y = x + 32 if y > len(slot_flags): y = len(slot_flags) print x, ": ", slot_flags[x:y] print "NODE FRONT: ", node_front_start.get( ), " to ", node_front_end.get( ), node_front_end.get() - node_front_start.get() return
def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize): module = get_module('wq_checknode.cu', options=api_options, include_source_directory=True) gpu_funcs = GPUFuncs(module) # gather variables for kernel call gpugeo = sim.gpu_geometry photon_pos = photons.pos photon_dir = photons.dir photon_current_node = photons.current_node_index photon_tested_node = ga.to_gpu( 1 * np.ones(len(photons.pos), dtype=np.uint32)) photon_last_result = ga.to_gpu( -1 * np.ones(len(photons.pos), dtype=np.int32)) nodes = gpugeo.nodes node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent) node_first_daughter = ga.to_gpu( sim.detector.node_dsar_tree.first_daughter) node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling) node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt) world_origin = gpugeo.world_origin world_scale = gpugeo.world_scale # make queue related variables queue_size = np.int32(len(photons.pos) * 2) queue_photon_index = ga.empty(queue_size, dtype=np.int32) queue_slot_flag = ga.zeros(queue_size, dtype=np.int32) queue_photon_index[0:len(photons.pos)].set( np.arange(0, len(photons.pos), dtype=np.int32)[:]) queue_photon_index[len(photons.pos):].set( -1 * np.ones(len(photons.pos), dtype=np.int32)) queue_slot_flag[0:len(photons.pos)].set( np.ones(len(photons.pos), dtype=np.int32)[:]) a = ga.zeros(1, dtype=ga.vec.uint4) b = np.array(1, dtype=np.int32) c = np.array(1, dtype=np.uint32) max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize) max_nodes_can_store -= max_nodes_can_store % 32 max_nodes_can_store = np.int32(max_nodes_can_store) loaded_node_start_index = np.int32(0) loaded_node_end_index = np.int32(1) node_front_start = ga.empty(1, dtype=np.int32) node_front_end = ga.empty(1, dtype=np.int32) max_loops = 1000 if len(gpugeo.extra_nodes) > 1: raise RuntimeError('did not plan for there to be a node split.') print photon_current_node print photon_tested_node print queue_photon_index print queue_slot_flag print 
"Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index print "Max nodes in shared: ", max_nodes_can_store print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")" print "Available local memsize: ", self.shared_mem_size print "Total number of nodes: ", len( nodes), " (", nodes.nbytes, " bytes)" print "Stored node size: ", max_nodes_can_store * a.nbytes print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize print sim.detector.bvh.layer_bounds print "PRESUB CURRENT NODES" print photon_current_node print "PRESUB TESTED NODES" print photon_tested_node print "STARTING QUEUE" print queue_photon_index start_queue = time.time() gpu_funcs.checknode(np.int32(max_loops), photon_pos, photon_dir, photon_current_node, photon_tested_node, photon_last_result, np.int32(len(nodes)), nodes, node_parent, node_first_daughter, node_sibling, node_aunt, world_origin, world_scale, queue_size, queue_photon_index, queue_slot_flag, np.int32(len(photon_pos)), max_nodes_can_store, loaded_node_start_index, loaded_node_end_index, node_front_start, node_front_end, block=(workgroupsize, 1, 1), grid=(1, 1), shared=4 * (7 * max_nodes_can_store + 3 * workgroupsize + 1)) cuda.Context.get_current().synchronize() end_queue = time.time() nactive = len(np.argwhere(queue_slot_flag.get() == 1)) print "CheckNode Queue returns. 
", end_queue - start_queue, " seconds" print "(Current node, To Test)" node_states = zip(photon_current_node.get(), photon_tested_node.get(), photon_last_result.get()) for x in xrange(0, len(node_states), 10): y = x + 10 if y > len(node_states): y = len(node_states) print x, ": ", node_states[x:y] print "LAST RESULT:" np_photon_results = photon_last_result.get() for x in xrange(0, len(np_photon_results), 10): y = x + 10 if y > len(np_photon_results): y = len(np_photon_results) print x, ": ", np_photon_results[x:y] print "PHOTON QUEUE" photon_queue = queue_photon_index.get() for x in xrange(0, len(photon_queue), 10): y = x + 10 if y > len(photon_queue): y = len(photon_queue) print x, ": ", photon_queue[x:y] print "QUEUE SLOT FLAGS: ", nactive, " threads" slot_flags = queue_slot_flag.get() for x in xrange(0, len(slot_flags), 10): y = x + 10 if y > len(slot_flags): y = len(slot_flags) print x, ": ", slot_flags[x:y] print "NODE FRONT: ", node_front_start.get( ), " to ", node_front_end.get( ), node_front_end.get() - node_front_start.get()
def __init__(self, steps_arr, multiple=1.0, nthreads_per_block=64, max_blocks=1024, ncopies=1, seed=None, cl_context=None):
    """Generate photons on the GPU from the step information in ``steps_arr``.

    Parameters
    ----------
    steps_arr : numpy.ndarray, one row per step
        Columns are [ x1, y1, z1, t1, x2, y2, z2, nphotons,
        fast_to_slow_ratio, fast_time_constant, slow_time_constant ].
        NOTE(review): the earlier docstring gave shape=(N,10) while listing
        eleven columns; confirm against the class column-index constants
        (``_nphotons``, ``_fsratio``, ``_fconst``, ``_sconst``).
        In the future this could be generalized to many different time
        components.  Developed for liquid argon TPCs.
    multiple : float
        Scale factor for the number of photons generated.  Not implemented:
        any value other than 1.0 raises RuntimeError.
    nthreads_per_block : int
        Threads per block for the generation kernel; together with
        ``max_blocks`` it also sizes the RNG state array.
    max_blocks : int
        Maximum number of blocks handed to ``chunk_iterator`` per launch.
    ncopies : int
        Number of photon copies.  Only 1 is supported.
    seed : int or None
        Seed for the GPU RNG states; ``None`` falls back to 5.
    cl_context : OpenCL context, optional
        Used to build the command queue and modules on the OpenCL path.

    Raises
    ------
    RuntimeError
        If ``multiple != 1.0`` or the GPU API is neither CUDA nor OpenCL.
    ValueError
        If ``ncopies != 1``.
    """
    self.steps_array = steps_arr
    self.nsteps = self.steps_array.shape[0]
    if multiple != 1.0:
        raise RuntimeError('Have not implemented scaling of the number of photons generated.')

    # Reject unsupported configurations before any GPU memory is allocated.
    self.ncopies = ncopies
    if self.ncopies != 1:
        raise ValueError('support for multiple copies not supported')
    use_cuda = api.is_gpu_api_cuda()
    use_opencl = (not use_cuda) and api.is_gpu_api_opencl()
    if not (use_cuda or use_opencl):
        raise RuntimeError("GPU API is neither CUDA nor OpenCL!")

    # ===========================
    # GEN PHOTONS
    tstart_genphotons = time.time()

    # We do the dumbest thing first (no fancy GPU manipulation): on the CPU,
    # scan the steps to determine the total number of photons.
    tstart_nphotons = time.time()
    self.step_fsratio = np.array(self.steps_array[:, self._fsratio], dtype=np.float32)
    # Photon counts live in the ``_nphotons`` *column* (one entry per step).
    # They are cast to integers (truncating) because they are used below as
    # repeat counts and as an array size.
    self.nphotons_per_step = np.asarray(self.steps_array[:, self._nphotons]).ravel().astype(np.int64)
    self.nphotons = int(self.nphotons_per_step.sum())
    # Single-argument print calls parse as both the Python 2 print statement
    # and the Python 3 function; the emitted text is unchanged.
    print("NSTEPS:  %s" % (self.nsteps,))
    print("NPHOTONS:  %s  (time to determine per step= %s" % (self.nphotons, time.time() - tstart_nphotons))

    # Index array telling each photon which step to take its info from:
    # step n is repeated nphotons_per_step[n] times.
    self.source_step_index = np.repeat(np.arange(self.nsteps, dtype=np.int32), self.nphotons_per_step)

    # Push everything to the GPU.
    tstart_transfer = time.time()
    if use_cuda:
        # step info
        self.step_pos1_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
        self.step_pos2_gpu = ga.empty(shape=self.nsteps, dtype=ga.vec.float3)
        self.step_fsratio_gpu = ga.to_gpu(self.step_fsratio)
        self.source_step_index_gpu = ga.to_gpu(self.source_step_index)
        # photon info
        self.pos = ga.empty(shape=self.nphotons, dtype=ga.vec.float3)
        self.dir = ga.empty(shape=self.nphotons, dtype=ga.vec.float3)
        self.pol = ga.empty(shape=self.nphotons, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(shape=self.nphotons * ncopies, dtype=np.float32)
        self.t = ga.to_gpu(np.zeros(self.nphotons * ncopies, dtype=np.float32))
        self.last_hit_triangles = ga.empty(shape=self.nphotons * ncopies, dtype=np.int32)
        self.flags = ga.empty(shape=self.nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(shape=self.nphotons * ncopies, dtype=np.float32)
    else:
        cl_queue = cl.CommandQueue(cl_context)
        # step info
        self.step_pos1_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
        self.step_pos2_gpu = ga.empty(cl_queue, self.nsteps, dtype=ga.vec.float3)
        self.step_fsratio_gpu = ga.to_device(cl_queue, self.step_fsratio)
        self.source_step_index_gpu = ga.to_device(cl_queue, self.source_step_index)
        # photon info
        self.pos = ga.empty(cl_queue, self.nphotons, dtype=ga.vec.float3)
        self.dir = ga.empty(cl_queue, self.nphotons, dtype=ga.vec.float3)
        self.pol = ga.empty(cl_queue, self.nphotons, dtype=ga.vec.float3)
        self.wavelengths = ga.empty(cl_queue, self.nphotons * ncopies, dtype=np.float32)
        self.t = ga.zeros(cl_queue, self.nphotons * ncopies, dtype=np.float32)
        self.last_hit_triangles = ga.empty(cl_queue, self.nphotons * ncopies, dtype=np.int32)
        self.flags = ga.empty(cl_queue, self.nphotons * ncopies, dtype=np.uint32)
        self.weights = ga.empty(cl_queue, self.nphotons * ncopies, dtype=np.float32)

    self.step_pos1_gpu.set(to_float3(self.steps_array[:, 0:3]))
    self.step_pos2_gpu.set(to_float3(self.steps_array[:, 4:7]))
    # ``self.t`` holds one float32 per photon, so upload each photon's parent
    # step time (column 3) gathered through source_step_index; an array of
    # nsteps values in the steps dtype matches neither the buffer's size nor
    # its dtype.
    # NOTE(review): assumes the kernel treats t as the per-photon start time
    # -- confirm against gen_photon_from_step.
    photon_start_times = np.ascontiguousarray(self.steps_array[self.source_step_index, 3], dtype=np.float32)
    self.t.set(photon_start_times)
    self.true_nphotons = self.nphotons

    if use_cuda:
        self.gpumod = get_module("gen_photon_from_step.cu", options=api_options, include_source_directory=True)
    else:
        self.gpumod = get_module("gen_photon_from_step.cl", cl_context, options=api_options, include_source_directory=True)
    self.gpufuncs = GPUFuncs(self.gpumod)
    print("gen photon mem alloc/transfer time= %s" % (time.time() - tstart_transfer,))

    # Need random numbers: one RNG state per thread of the largest launch.
    tgpu = time.time()
    if seed is None:
        seed = 5
    rng_states = get_rng_states(nthreads_per_block * max_blocks, seed=seed, cl_context=cl_context)
    for first_photon, photons_this_round, blocks in chunk_iterator(self.nphotons, nthreads_per_block, max_blocks):
        # Time constants are read from the first step only.  The literal
        # 128.0 is presumably the emission wavelength -- TODO confirm against
        # the kernel signature.
        if use_cuda:
            self.gpufuncs.gen_photon_from_step(np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu,
                                               self.step_pos1_gpu, self.step_pos2_gpu, self.step_fsratio_gpu,
                                               np.float32(self.steps_array[0, self._fconst]), np.float32(self.steps_array[0, self._sconst]),
                                               np.float32(128.0),
                                               rng_states,
                                               self.pos, self.dir, self.pol, self.t, self.wavelengths,
                                               self.last_hit_triangles, self.flags, self.weights,
                                               block=(nthreads_per_block, 1, 1), grid=(blocks, 1))
        else:
            self.gpufuncs.gen_photon_from_step(cl_queue, (photons_this_round, 1, 1), None,
                                               np.int32(first_photon), np.int32(self.nphotons), self.source_step_index_gpu.data,
                                               self.step_pos1_gpu.data, self.step_pos2_gpu.data, self.step_fsratio_gpu.data,
                                               np.float32(self.steps_array[0, self._fconst]), np.float32(self.steps_array[0, self._sconst]),
                                               np.float32(128.0),
                                               rng_states.data,
                                               self.pos.data, self.dir.data, self.pol.data, self.t.data, self.wavelengths.data,
                                               self.last_hit_triangles.data, self.flags.data, self.weights.data,
                                               g_times_l=False).wait()
    if use_cuda:
        # Make sure the kernels have finished before the timing is reported.
        cuda.Context.get_current().synchronize()
    tend_genphotons = time.time()
    print("GPUPhotonFromSteps: time to gen photons  %s  secs (gpu time= %s )" % (tend_genphotons - tstart_genphotons, time.time() - tgpu))

    # Now load the propagation module.
    if use_cuda:
        self.module = get_module('propagate.cu', options=api_options, include_source_directory=True)
    else:
        self.module = get_module('propagate.cl', cl_context, options=api_options, include_source_directory=True)
    # define the texture references
    self.define_texture_references()
    # get kernel functions
    self.gpu_funcs = GPUFuncs(self.module)
# Stand-alone OpenCL script: builds the 'linalg_test.cl' module and prepares
# host-side input arrays for exercising it.
import os, sys
# Set before any OpenCL context is created.
# NOTE(review): presumably selects which platform/device pyopencl picks
# non-interactively -- confirm how tools.get_context() creates the context.
os.environ["PYOPENCL_CTX"] ='1'
import numpy as np
import pyopencl as cl
import pyopencl.array as clarray
import chroma.gpu.tools as tools
# Host-side numpy dtype matching the OpenCL float3 vector type.
float3 = clarray.vec.float3
print "float3 type: ",float3
ctx = tools.get_context()
queue = cl.CommandQueue(ctx)
# Report the first device attached to the context.
dev = ctx.get_info( cl.context_info.DEVICES )[0]
print 'device %s' % dev.get_info( cl.device_info.NAME )
mod = tools.get_module( 'linalg_test.cl', ctx, include_source_directory=False )
# Launch geometry; the block size also fixes the length of every test array below.
size = {'block': (256,), 'grid': (1,)}
# Plain (N, 3) float32 arrays and a random scalar.
a_np = np.zeros((size['block'][0],3), dtype=np.float32)
b_np = np.zeros((size['block'][0],3), dtype=np.float32)
c_np = np.float32(np.random.random_sample())
mf = cl.mem_flags
# Same-length arrays using the float3 vector dtype.
a_vec_np = np.zeros(size['block'][0], dtype=float3)
b_vec_np = np.zeros(size['block'][0], dtype=float3)
d_vec_np = np.zeros(size['block'][0], dtype=float3)
#c_vec_np = np.float32(np.random.random_sample())
#float3add = mod.get_function('float3add')
#float3addequal = mod.get_function('float3addequal')
#float3sub = mod.get_function('float3sub')