def to_device(array, backend='cython'):
    """Move ``array`` to the device used by ``backend`` and wrap the result.

    Parameters
    ----------
    array
        Host-side array to transfer.
    backend : str
        One of ``'cython'``, ``'opencl'`` or ``'cuda'``.

        * ``'cython'``: ``array`` is passed through untouched.
        * ``'opencl'``: ``array`` is copied with
          ``pyopencl.array.to_device`` on the queue from ``get_queue()``.
        * ``'cuda'``: ``array`` is copied with ``pycuda.gpuarray.to_gpu``.

    Returns
    -------
    Whatever ``wrap_array(out, backend)`` returns for the transferred array.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names.  Previously this
        case fell through every branch and failed with an
        ``UnboundLocalError`` on ``out``.
    """
    if backend == 'cython':
        out = array
    elif backend == 'opencl':
        # Imported lazily so that the OpenCL stack is only required when
        # this backend is actually requested.
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        out = gpuarray.to_device(get_queue(), array)
    elif backend == 'cuda':
        # Imported lazily for the same reason as the OpenCL branch.
        import pycuda.gpuarray as gpuarray
        out = gpuarray.to_gpu(array)
    else:
        raise ValueError(
            "Unsupported backend %r; expected 'cython', 'opencl' or 'cuda'."
            % (backend,)
        )
    return wrap_array(out, backend)
def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes, nodes, workgroupsize): module = get_module('wq_checknode.cu', options=api_options, include_source_directory=True) gpu_funcs = GPUFuncs(module) # gather variables for kernel call gpugeo = sim.gpu_geometry photon_pos = photons.pos photon_dir = photons.dir photon_current_node = photons.current_node_index photon_tested_node = ga.to_gpu( 1 * np.ones(len(photons.pos), dtype=np.uint32)) photon_last_result = ga.to_gpu( -1 * np.ones(len(photons.pos), dtype=np.int32)) nodes = gpugeo.nodes node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent) node_first_daughter = ga.to_gpu( sim.detector.node_dsar_tree.first_daughter) node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling) node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt) world_origin = gpugeo.world_origin world_scale = gpugeo.world_scale # make queue related variables queue_size = np.int32(len(photons.pos) * 2) queue_photon_index = ga.empty(queue_size, dtype=np.int32) queue_slot_flag = ga.zeros(queue_size, dtype=np.int32) queue_photon_index[0:len(photons.pos)].set( np.arange(0, len(photons.pos), dtype=np.int32)[:]) queue_photon_index[len(photons.pos):].set( -1 * np.ones(len(photons.pos), dtype=np.int32)) queue_slot_flag[0:len(photons.pos)].set( np.ones(len(photons.pos), dtype=np.int32)[:]) a = ga.zeros(1, dtype=ga.vec.uint4) b = np.array(1, dtype=np.int32) c = np.array(1, dtype=np.uint32) max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize) max_nodes_can_store -= max_nodes_can_store % 32 max_nodes_can_store = np.int32(max_nodes_can_store) loaded_node_start_index = np.int32(0) loaded_node_end_index = np.int32(1) node_front_start = ga.empty(1, dtype=np.int32) node_front_end = ga.empty(1, dtype=np.int32) max_loops = 1000 if len(gpugeo.extra_nodes) > 1: raise RuntimeError('did not plan for there to be a node split.') print photon_current_node print photon_tested_node print queue_photon_index print queue_slot_flag print 
"Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index print "Max nodes in shared: ", max_nodes_can_store print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")" print "Available local memsize: ", self.shared_mem_size print "Total number of nodes: ", len( nodes), " (", nodes.nbytes, " bytes)" print "Stored node size: ", max_nodes_can_store * a.nbytes print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize print sim.detector.bvh.layer_bounds print "PRESUB CURRENT NODES" print photon_current_node print "PRESUB TESTED NODES" print photon_tested_node print "STARTING QUEUE" print queue_photon_index start_queue = time.time() gpu_funcs.checknode(np.int32(max_loops), photon_pos, photon_dir, photon_current_node, photon_tested_node, photon_last_result, np.int32(len(nodes)), nodes, node_parent, node_first_daughter, node_sibling, node_aunt, world_origin, world_scale, queue_size, queue_photon_index, queue_slot_flag, np.int32(len(photon_pos)), max_nodes_can_store, loaded_node_start_index, loaded_node_end_index, node_front_start, node_front_end, block=(workgroupsize, 1, 1), grid=(1, 1), shared=4 * (7 * max_nodes_can_store + 3 * workgroupsize + 1)) cuda.Context.get_current().synchronize() end_queue = time.time() nactive = len(np.argwhere(queue_slot_flag.get() == 1)) print "CheckNode Queue returns. 
", end_queue - start_queue, " seconds" print "(Current node, To Test)" node_states = zip(photon_current_node.get(), photon_tested_node.get(), photon_last_result.get()) for x in xrange(0, len(node_states), 10): y = x + 10 if y > len(node_states): y = len(node_states) print x, ": ", node_states[x:y] print "LAST RESULT:" np_photon_results = photon_last_result.get() for x in xrange(0, len(np_photon_results), 10): y = x + 10 if y > len(np_photon_results): y = len(np_photon_results) print x, ": ", np_photon_results[x:y] print "PHOTON QUEUE" photon_queue = queue_photon_index.get() for x in xrange(0, len(photon_queue), 10): y = x + 10 if y > len(photon_queue): y = len(photon_queue) print x, ": ", photon_queue[x:y] print "QUEUE SLOT FLAGS: ", nactive, " threads" slot_flags = queue_slot_flag.get() for x in xrange(0, len(slot_flags), 10): y = x + 10 if y > len(slot_flags): y = len(slot_flags) print x, ": ", slot_flags[x:y] print "NODE FRONT: ", node_front_start.get( ), " to ", node_front_end.get( ), node_front_end.get() - node_front_start.get()