Ejemplo n.º 1
0
def to_device(array, backend='cython'):
    """Transfer ``array`` to the device of ``backend`` and wrap the result.

    Parameters
    ----------
    array
        The host-side array to transfer.
    backend : str
        One of ``'cython'``, ``'opencl'`` or ``'cuda'``.  For ``'cython'``
        the array is used as-is; for ``'opencl'`` it is uploaded with
        ``pyopencl.array.to_device`` on the queue returned by ``get_queue()``;
        for ``'cuda'`` it is uploaded with ``pycuda.gpuarray.to_gpu``.

    Returns
    -------
    The value returned by ``wrap_array(out, backend)``.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names.
    """
    if backend == 'cython':
        out = array
    elif backend == 'opencl':
        # Imported lazily so the OpenCL stack is only required when used.
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        out = gpuarray.to_device(get_queue(), array)
    elif backend == 'cuda':
        # Imported lazily so the CUDA stack is only required when used.
        import pycuda.gpuarray as gpuarray
        out = gpuarray.to_gpu(array)
    else:
        # Without this branch ``out`` would be unbound and the caller would
        # see an opaque UnboundLocalError instead of the real problem.
        raise ValueError(
            "Unsupported backend %r; expected 'cython', 'opencl' or 'cuda'."
            % (backend,))
    return wrap_array(out, backend)
Ejemplo n.º 2
0
    def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                          nodes, workgroupsize):
        module = get_module('wq_checknode.cu',
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_gpu(
            1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_gpu(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_gpu(
            sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin
        world_scale = gpugeo.world_scale

        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)].set(
            np.arange(0, len(photons.pos), dtype=np.int32)[:])
        queue_photon_index[len(photons.pos):].set(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        queue_slot_flag[0:len(photons.pos)].set(
            np.ones(len(photons.pos), dtype=np.int32)[:])
        a = ga.zeros(1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)

        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(1, dtype=np.int32)
        node_front_end = ga.empty(1, dtype=np.int32)

        max_loops = 1000

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node
        print "STARTING QUEUE"
        print queue_photon_index

        start_queue = time.time()
        gpu_funcs.checknode(np.int32(max_loops),
                            photon_pos,
                            photon_dir,
                            photon_current_node,
                            photon_tested_node,
                            photon_last_result,
                            np.int32(len(nodes)),
                            nodes,
                            node_parent,
                            node_first_daughter,
                            node_sibling,
                            node_aunt,
                            world_origin,
                            world_scale,
                            queue_size,
                            queue_photon_index,
                            queue_slot_flag,
                            np.int32(len(photon_pos)),
                            max_nodes_can_store,
                            loaded_node_start_index,
                            loaded_node_end_index,
                            node_front_start,
                            node_front_end,
                            block=(workgroupsize, 1, 1),
                            grid=(1, 1),
                            shared=4 *
                            (7 * max_nodes_can_store + 3 * workgroupsize + 1))
        cuda.Context.get_current().synchronize()
        end_queue = time.time()

        nactive = len(np.argwhere(queue_slot_flag.get() == 1))

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        np_photon_results = photon_last_result.get()
        for x in xrange(0, len(np_photon_results), 10):
            y = x + 10
            if y > len(np_photon_results):
                y = len(np_photon_results)
            print x, ": ", np_photon_results[x:y]

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 10):
            y = x + 10
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS: ", nactive, " threads"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 10):
            y = x + 10
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()