Beispiel #1
0
    def _call_opencl_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                            nodes, workgroupsize, comqueue):
        module = get_module('wq_checknode.cl',
                            self.context,
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_device(
            comqueue, 1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_device(
            comqueue, -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_device(comqueue,
                                   sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_device(
            comqueue, sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_device(comqueue,
                                    sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_device(comqueue, sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin_gpu
        world_scale = gpugeo.world_scale
        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(comqueue, queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(comqueue, queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)] = np.arange(0,
                                                           len(photons.pos),
                                                           dtype=np.int32)[:]
        queue_photon_index[len(photons.pos):] = (
            np.ones(len(photons.pos), dtype=np.int32) * -1)[:]
        queue_slot_flag[0:len(photons.pos)] = np.ones(len(photons.pos),
                                                      dtype=np.int32)[:]
        a = ga.zeros(comqueue, 1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)
        workgroup_photons = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_current_node = cl.LocalMemory(b.nbytes * workgroupsize)
        workgroup_tested_node = cl.LocalMemory(b.nbytes * workgroupsize)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)
        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(comqueue, 1, dtype=np.int32)
        node_front_end = ga.empty(comqueue, 1, dtype=np.int32)
        workgroup_nodes = cl.LocalMemory(a.nbytes * (max_nodes_can_store + 1))
        workgroup_daughter = cl.LocalMemory(c.nbytes *
                                            (max_nodes_can_store + 1))
        workgroup_sibling = cl.LocalMemory(c.nbytes *
                                           (max_nodes_can_store + 1))
        workgroup_aunt = cl.LocalMemory(c.nbytes * (max_nodes_can_store + 1))
        max_loops = 32

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node

        start_queue = time.time()
        gpu_funcs.checknode(
            comqueue, (workgroupsize, 1, 1), (workgroupsize, 1, 1),
            np.int32(max_loops), photon_pos.data, photon_dir.data,
            photon_current_node.data,
            photon_tested_node.data, photon_last_result.data,
            np.int32(len(nodes)), nodes.data, node_parent.data,
            node_first_daughter.data, node_sibling.data, node_aunt.data,
            world_origin.data, world_scale, queue_size,
            queue_photon_index.data, queue_slot_flag.data,
            np.int32(len(photon_pos)), np.int32(workgroupsize),
            workgroup_photons, workgroup_current_node, workgroup_tested_node,
            max_nodes_can_store, workgroup_nodes, workgroup_daughter,
            workgroup_sibling, workgroup_aunt, loaded_node_start_index,
            loaded_node_end_index, node_front_start.data,
            node_front_end.data).wait()
        end_queue = time.time()

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test, result)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        print photon_last_result.get()

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 32):
            y = x + 32
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 32):
            y = x + 32
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()
        return
Beispiel #2
0
    def _call_cuda_kernel(self, sim, photons, ourphotons, max_shared_nodes,
                          nodes, workgroupsize):
        module = get_module('wq_checknode.cu',
                            options=api_options,
                            include_source_directory=True)
        gpu_funcs = GPUFuncs(module)

        # gather variables for kernel call
        gpugeo = sim.gpu_geometry
        photon_pos = photons.pos
        photon_dir = photons.dir
        photon_current_node = photons.current_node_index
        photon_tested_node = ga.to_gpu(
            1 * np.ones(len(photons.pos), dtype=np.uint32))
        photon_last_result = ga.to_gpu(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        nodes = gpugeo.nodes
        node_parent = ga.to_gpu(sim.detector.node_dsar_tree.parent)
        node_first_daughter = ga.to_gpu(
            sim.detector.node_dsar_tree.first_daughter)
        node_sibling = ga.to_gpu(sim.detector.node_dsar_tree.sibling)
        node_aunt = ga.to_gpu(sim.detector.node_dsar_tree.aunt)
        world_origin = gpugeo.world_origin
        world_scale = gpugeo.world_scale

        # make queue related variables
        queue_size = np.int32(len(photons.pos) * 2)
        queue_photon_index = ga.empty(queue_size, dtype=np.int32)
        queue_slot_flag = ga.zeros(queue_size, dtype=np.int32)
        queue_photon_index[0:len(photons.pos)].set(
            np.arange(0, len(photons.pos), dtype=np.int32)[:])
        queue_photon_index[len(photons.pos):].set(
            -1 * np.ones(len(photons.pos), dtype=np.int32))
        queue_slot_flag[0:len(photons.pos)].set(
            np.ones(len(photons.pos), dtype=np.int32)[:])
        a = ga.zeros(1, dtype=ga.vec.uint4)
        b = np.array(1, dtype=np.int32)
        c = np.array(1, dtype=np.uint32)

        max_nodes_can_store = (max_shared_nodes - 20 - 3 * workgroupsize)
        max_nodes_can_store -= max_nodes_can_store % 32
        max_nodes_can_store = np.int32(max_nodes_can_store)

        loaded_node_start_index = np.int32(0)
        loaded_node_end_index = np.int32(1)
        node_front_start = ga.empty(1, dtype=np.int32)
        node_front_end = ga.empty(1, dtype=np.int32)

        max_loops = 1000

        if len(gpugeo.extra_nodes) > 1:
            raise RuntimeError('did not plan for there to be a node split.')

        print photon_current_node
        print photon_tested_node
        print queue_photon_index
        print queue_slot_flag

        print "Starting node range: ", loaded_node_start_index, " to ", loaded_node_end_index
        print "Max nodes in shared: ", max_nodes_can_store
        print "Work group nodes size: ", a.nbytes * workgroupsize, " bytes = (", a.nbytes, "*", workgroupsize, ")"
        print "Available local memsize: ", self.shared_mem_size
        print "Total number of nodes: ", len(
            nodes), " (", nodes.nbytes, " bytes)"
        print "Stored node size: ", max_nodes_can_store * a.nbytes
        print "Left over: ", self.shared_mem_size - max_nodes_can_store * a.nbytes - a.nbytes * workgroupsize
        print sim.detector.bvh.layer_bounds

        print "PRESUB CURRENT NODES"
        print photon_current_node
        print "PRESUB TESTED NODES"
        print photon_tested_node
        print "STARTING QUEUE"
        print queue_photon_index

        start_queue = time.time()
        gpu_funcs.checknode(np.int32(max_loops),
                            photon_pos,
                            photon_dir,
                            photon_current_node,
                            photon_tested_node,
                            photon_last_result,
                            np.int32(len(nodes)),
                            nodes,
                            node_parent,
                            node_first_daughter,
                            node_sibling,
                            node_aunt,
                            world_origin,
                            world_scale,
                            queue_size,
                            queue_photon_index,
                            queue_slot_flag,
                            np.int32(len(photon_pos)),
                            max_nodes_can_store,
                            loaded_node_start_index,
                            loaded_node_end_index,
                            node_front_start,
                            node_front_end,
                            block=(workgroupsize, 1, 1),
                            grid=(1, 1),
                            shared=4 *
                            (7 * max_nodes_can_store + 3 * workgroupsize + 1))
        cuda.Context.get_current().synchronize()
        end_queue = time.time()

        nactive = len(np.argwhere(queue_slot_flag.get() == 1))

        print "CheckNode Queue returns. ", end_queue - start_queue, " seconds"
        print "(Current node, To Test)"
        node_states = zip(photon_current_node.get(), photon_tested_node.get(),
                          photon_last_result.get())
        for x in xrange(0, len(node_states), 10):
            y = x + 10
            if y > len(node_states):
                y = len(node_states)
            print x, ": ", node_states[x:y]

        print "LAST RESULT:"
        np_photon_results = photon_last_result.get()
        for x in xrange(0, len(np_photon_results), 10):
            y = x + 10
            if y > len(np_photon_results):
                y = len(np_photon_results)
            print x, ": ", np_photon_results[x:y]

        print "PHOTON QUEUE"
        photon_queue = queue_photon_index.get()
        for x in xrange(0, len(photon_queue), 10):
            y = x + 10
            if y > len(photon_queue):
                y = len(photon_queue)
            print x, ": ", photon_queue[x:y]

        print "QUEUE SLOT FLAGS: ", nactive, " threads"
        slot_flags = queue_slot_flag.get()
        for x in xrange(0, len(slot_flags), 10):
            y = x + 10
            if y > len(slot_flags):
                y = len(slot_flags)
            print x, ": ", slot_flags[x:y]

        print "NODE FRONT: ", node_front_start.get(
        ), " to ", node_front_end.get(
        ), node_front_end.get() - node_front_start.get()