def check_element_prop_threshold(self, element_property, threshold,
        refine_flags, debug, wait_for=None):
    """Compare *element_property* against *threshold* on the device and
    update *refine_flags* via the threshold-checker kernel.

    :returns: the kernel's ``refine_flags_updated`` output compared
        against 1, i.e. whether any flag was updated.
    """
    actx = self.array_context

    def count_flags():
        # Device-side sum of the refine flags, pulled back to the host.
        return actx.to_numpy(actx.np.sum(refine_flags)).item()

    checker = self.code_container.element_prop_threshold_checker()

    prev_count = count_flags() if debug else None

    event, result = checker(
            actx.queue,
            element_property=flatten(element_property, actx),
            refine_flags=refine_flags,
            refine_flags_updated=np.array(0),
            threshold=np.array(threshold),
            wait_for=wait_for)

    import pyopencl as cl
    cl.wait_for_events([event])

    if debug:
        new_count = count_flags()
        if new_count > prev_count:
            logger.debug("refiner: found %d element(s) to refine",
                    new_count - prev_count)

    return result["refine_flags_updated"] == 1
def check_expansion_disks_undisturbed_by_sources(self,
        stage1_density_discr, tree, peer_lists,
        expansion_disturbance_tolerance,
        refine_flags, debug, wait_for=None):
    """Run the expansion-disk-undisturbed-by-sources checker kernel over
    all QBX centers and update *refine_flags* in place.

    :returns: ``True`` if the kernel flagged at least one panel for
        refinement.
    """
    # Avoid generating too many kernels.
    from pytools import div_ceil
    max_levels = MAX_LEVELS_INCREMENT * div_ceil(
            tree.nlevels, MAX_LEVELS_INCREMENT)

    knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        npanels_to_refine_prev = cl.array.sum(refine_flags).get()

    # One-element device flag; set by the kernel when any panel is marked.
    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    from pytential import bind, sym
    center_danger_zone_radii = flatten(
            bind(stage1_density_discr,
                sym.expansion_radii(stage1_density_discr.ambient_dim,
                    granularity=sym.GRANULARITY_CENTER))(self.array_context))

    # NOTE: the argument order below must match the kernel template.
    evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = cl.array.sum(refine_flags).get()
        if npanels_to_refine > npanels_to_refine_prev:
            logger.debug("refiner: found {} panel(s) to refine".format(
                npanels_to_refine - npanels_to_refine_prev))

    return found_panel_to_refine.get()[0] == 1
def _write_time_shift(self, queues):
    """Estimate the time shift between devices with respect to a global
    clock.

    This is important for evaluating relative device runtimes with respect
    to each other.

    :param queues: iterable of command queues; only the first queue seen
        per device is used.
    """
    # Get only a single command queue for a device on which we will
    # determine the zero time of a device.
    unique_queues = []
    devices = []
    for queue in queues:
        if queue.device not in devices:
            unique_queues.append(queue)
            devices.append(queue.device)

    starts = {}
    start = time.time()
    # Enqueue one marker per device; its "queued" profiling timestamp
    # serves as that device's zero time.
    for device, queue in zip(devices, unique_queues):
        starts[device] = cl.enqueue_marker(queue)
    # Host-side duration of the enqueue loop, reported as the relative
    # timing error between devices.
    d_t = (time.time() - start) * q.s
    cl.wait_for_events(list(starts.values()))
    for device in starts:
        starts[device] = starts[device].profile.queued

    # Write the zero time for every device into the profiling file.
    self._profile_file.write("# device\tinitial_time\n")
    for device in starts:
        self._cldevices[device] = self._cldevice_next()
        self._profile_file.write("%d\t%d\n" % (self._cldevices[device],
                                               starts[device]))
    self._profile_file.write("# END_INIT_T0\n")
    self._profile_file.write("# Relative device timing error\n%g\n"
                             % d_t.rescale(q.ns))
    self._profile_file.write("# END_INIT\n")
def check_element_prop_threshold(self, element_property, threshold,
        refine_flags, debug, wait_for=None):
    """Compare *element_property* against *threshold* on the device and
    update *refine_flags* via the threshold-checker kernel.

    :returns: ``True`` if every entry of the kernel's
        ``refine_flags_updated`` output equals 1.
    """
    checker = self.code_container.element_prop_threshold_checker()

    previous_count = None
    if debug:
        previous_count = cl.array.sum(refine_flags).get()

    from pytential.utils import flatten_if_needed
    flat_property = flatten_if_needed(self.array_context, element_property)

    event, result = checker(self.queue,
            element_property=flat_property,
            refine_flags=refine_flags,
            refine_flags_updated=np.array(0),
            threshold=np.array(threshold),
            wait_for=wait_for)
    cl.wait_for_events([event])

    if debug:
        current_count = cl.array.sum(refine_flags).get()
        if current_count > previous_count:
            logger.debug("refiner: found {} panel(s) to refine".format(
                current_count - previous_count))

    return (result["refine_flags_updated"] == 1).all()
def _write_time_shift(self, queues):
    """Estimate the time shift between devices with respect to a global
    clock.

    This is important for evaluating relative device runtimes with respect
    to each other.

    :param queues: iterable of command queues; only the first queue seen
        per device is used.
    """
    # Get only a single command queue for a device on which we will
    # determine the zero time of a device.
    unique_queues = []
    devices = []
    for queue in queues:
        if queue.device not in devices:
            unique_queues.append(queue)
            devices.append(queue.device)

    starts = {}
    start = time.time()
    # Enqueue one marker per device; its "queued" profiling timestamp
    # serves as that device's zero time.
    for device, queue in zip(devices, unique_queues):
        starts[device] = cl.enqueue_marker(queue)
    # Host-side duration of the enqueue loop, reported as the relative
    # timing error between devices.
    d_t = (time.time() - start) * q.s
    # BUGFIX: wait_for_events needs a list, not a dict view (the sibling
    # implementation already passed list(...)).
    cl.wait_for_events(list(starts.values()))
    for device in starts:
        starts[device] = starts[device].profile.queued

    # Write the zero time for every device into the profiling file.
    self._profile_file.write("# device\tinitial_time\n")
    for device in starts:
        self._cldevices[device] = self._cldevice_next()
        self._profile_file.write("%d\t%d\n" % (self._cldevices[device],
                                               starts[device]))
    self._profile_file.write("# END_INIT_T0\n")
    self._profile_file.write("# Relative device timing error\n%g\n"
                             % d_t.rescale(q.ns))
    self._profile_file.write("# END_INIT\n")
def test_area_query_elwise(ctx_getter, dims, do_plot=False):
    """Exercise :class:`AreaQueryElementwiseTemplate`: build a tree of
    random particles, generate an element-wise area-query kernel with an
    empty ``leaf_found_op``, and run it over random ball centers.
    Completion without error is the (implicit) success criterion.
    """
    ctx = ctx_getter()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    particles = make_normal_particle_array(queue, nparticles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)

    nballs = 10**4
    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)

    from boxtree.area_query import (
            AreaQueryElementwiseTemplate, PeerListFinder)

    # Template that reads each query ball's center and radius from flat
    # per-axis arrays passed as extra kernel arguments.
    template = AreaQueryElementwiseTemplate(
        extra_args="""
            coord_t *ball_radii,
            %for ax in AXIS_NAMES[:dimensions]:
                coord_t *ball_${ax},
            %endfor
        """,
        ball_center_and_radius_expr="""
            %for ax in AXIS_NAMES[:dimensions]:
                ${ball_center}.${ax} = ball_${ax}[${i}];
            %endfor
            ${ball_radius} = ball_radii[${i}];
        """,
        leaf_found_op="")

    peer_lists, evt = PeerListFinder(ctx)(queue, tree)

    kernel = template.generate(
        ctx,
        dims,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        tree.nlevels)

    evt = kernel(
        *template.unwrap_args(
            tree, peer_lists, ball_radii, *ball_centers),
        queue=queue,
        wait_for=[evt],
        range=slice(len(ball_radii)))

    cl.wait_for_events([evt])
def find_peer_lists(self, tree):
    """Build and return the peer lists for *tree*, blocking until the
    peer-list finder kernel has completed."""
    finder = self.code_container.peer_list_finder()
    peer_lists, event = finder(self.queue, tree)

    import pyopencl as cl
    cl.wait_for_events([event])

    return peer_lists
def check_sufficient_source_quadrature_resolution(self, stage2_density_discr, tree, peer_lists, refine_flags, debug, wait_for=None): actx = self.array_context # Avoid generating too many kernels. from pytools import div_ceil max_levels = MAX_LEVELS_INCREMENT * div_ceil(tree.nlevels, MAX_LEVELS_INCREMENT) knl = self.code_container.sufficient_source_quadrature_resolution_checker( tree.dimensions, tree.coord_dtype, tree.box_id_dtype, peer_lists.peer_list_starts.dtype, tree.particle_id_dtype, max_levels) if debug: nelements_to_refine_prev = actx.to_numpy( actx.np.sum(refine_flags)).item() found_element_to_refine = actx.zeros(1, dtype=np.int32) found_element_to_refine.finish() from pytential import bind, sym dd = sym.as_dofdesc(sym.GRANULARITY_ELEMENT).to_stage2() source_danger_zone_radii_by_element = flatten( bind( stage2_density_discr, sym._source_danger_zone_radii(stage2_density_discr.ambient_dim, dofdesc=dd))(self.array_context), self.array_context) unwrap_args = AreaQueryElementwiseTemplate.unwrap_args evt = knl(*unwrap_args( tree, peer_lists, tree.box_to_qbx_center_starts, tree.box_to_qbx_center_lists, tree.qbx_element_to_source_starts, tree.qbx_user_source_slice.start, tree.qbx_user_center_slice.start, tree.sorted_target_ids, source_danger_zone_radii_by_element, tree.nqbxelements, refine_flags, found_element_to_refine, *tree.sources), range=slice(tree.nqbxsources), queue=actx.queue, wait_for=wait_for) import pyopencl as cl cl.wait_for_events([evt]) if debug: nelements_to_refine = actx.to_numpy( actx.np.sum(refine_flags)).item() if nelements_to_refine > nelements_to_refine_prev: logger.debug("refiner: found %d element(s) to refine", nelements_to_refine - nelements_to_refine_prev) return actx.to_numpy(found_element_to_refine)[0] == 1
def check_expansion_disks_undisturbed_by_sources(self,
        lpot_source, tree, peer_lists, expansion_disturbance_tolerance,
        refine_flags, debug, wait_for=None):
    """Run the expansion-disk-undisturbed-by-sources checker kernel over
    all QBX centers and update *refine_flags* in place.

    :returns: ``True`` if the kernel flagged at least one panel.
    """
    # Avoid generating too many kernels.
    from pytools import div_ceil
    max_levels = MAX_LEVELS_INCREMENT * div_ceil(
            tree.nlevels, MAX_LEVELS_INCREMENT)

    knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        npanels_to_refine_prev = cl.array.sum(refine_flags).get()

    # One-element device flag; set by the kernel when any panel is marked.
    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    center_danger_zone_radii = lpot_source._expansion_radii("ncenters")

    # NOTE: the argument order below must match the kernel template.
    evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_source_starts,
                tree.box_to_qbx_source_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_panel_to_center_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                center_danger_zone_radii,
                expansion_disturbance_tolerance,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxcenters),
            queue=self.queue,
            wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = cl.array.sum(refine_flags).get()
        if npanels_to_refine > npanels_to_refine_prev:
            logger.debug("refiner: found {} panel(s) to refine".format(
                npanels_to_refine - npanels_to_refine_prev))

    return found_panel_to_refine.get()[0] == 1
def check_sufficient_source_quadrature_resolution(
        self, lpot_source, tree, peer_lists, refine_flags, debug,
        wait_for=None):
    """Run the source-quadrature-resolution checker kernel over all QBX
    sources and update *refine_flags* in place.

    :returns: ``True`` if the kernel flagged at least one panel.
    """
    # Avoid generating too many kernels.
    from pytools import div_ceil
    max_levels = MAX_LEVELS_INCREMENT * div_ceil(
            tree.nlevels, MAX_LEVELS_INCREMENT)

    knl = self.code_container.sufficient_source_quadrature_resolution_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)
    if debug:
        npanels_to_refine_prev = cl.array.sum(refine_flags).get()

    # One-element device flag; set by the kernel when any panel is marked.
    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    from pytential import bind, sym
    source_danger_zone_radii_by_panel = bind(lpot_source,
            sym._source_danger_zone_radii(
                lpot_source.ambient_dim,
                dofdesc=sym.GRANULARITY_ELEMENT))(self.queue)
    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    # NOTE: the argument order below must match the kernel template.
    evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_panel_to_source_starts,
                tree.qbx_user_source_slice.start,
                tree.qbx_user_center_slice.start,
                tree.sorted_target_ids,
                source_danger_zone_radii_by_panel,
                tree.nqbxpanels,
                refine_flags,
                found_panel_to_refine,
                *tree.sources),
            range=slice(tree.nqbxsources),
            queue=self.queue,
            wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = cl.array.sum(refine_flags).get()
        if npanels_to_refine > npanels_to_refine_prev:
            logger.debug("refiner: found {} panel(s) to refine".format(
                npanels_to_refine - npanels_to_refine_prev))

    return found_panel_to_refine.get()[0] == 1
def readReturn(self, idx):
    """Copy the result buffers for slot *idx* back to the host and return
    ``(retMaps_, retNonces_)`` once both transfers have completed."""
    transfers = [
        cl.enqueue_copy(self.queue_, src=self.ret_[idx],
                        dest=self.retNonces_, is_blocking=False),
        cl.enqueue_copy(self.queue_, src=self.retMap_[idx],
                        dest=self.retMaps_, is_blocking=False),
    ]
    # Both copies were enqueued non-blocking; wait for them together.
    cl.wait_for_events(transfers)
    return self.retMaps_, self.retNonces_
def test_area_query_elwise(ctx_factory, dims, do_plot=False):
    """Exercise :class:`AreaQueryElementwiseTemplate`: build a tree of
    random particles, generate an element-wise area-query kernel with an
    empty ``leaf_found_op``, and run it over random ball centers.
    Completion without error is the (implicit) success criterion.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    particles = make_normal_particle_array(queue, nparticles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue, particles, max_particles_in_box=30, debug=True)

    nballs = 10**4
    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)

    from boxtree.area_query import (AreaQueryElementwiseTemplate,
                                    PeerListFinder)

    # Template that reads each query ball's center and radius from flat
    # per-axis arrays passed as extra kernel arguments.
    template = AreaQueryElementwiseTemplate(extra_args="""
            coord_t *ball_radii,
            %for ax in AXIS_NAMES[:dimensions]:
                coord_t *ball_${ax},
            %endfor
        """,
        ball_center_and_radius_expr="""
            %for ax in AXIS_NAMES[:dimensions]:
                ${ball_center}.${ax} = ball_${ax}[${i}];
            %endfor
            ${ball_radius} = ball_radii[${i}];
        """,
        leaf_found_op="")

    peer_lists, evt = PeerListFinder(ctx)(queue, tree)

    kernel = template.generate(ctx, dims, tree.coord_dtype,
                               tree.box_id_dtype,
                               peer_lists.peer_list_starts.dtype,
                               tree.nlevels)

    evt = kernel(*template.unwrap_args(tree, peer_lists, ball_radii,
                                       *ball_centers),
                 queue=queue,
                 wait_for=[evt],
                 range=slice(len(ball_radii)))

    cl.wait_for_events([evt])
def run(self):
    """Run in a separate thread and serve the incoming events."""
    # Keep draining the queue until shutdown is requested *and* no work
    # remains, so no pending event is dropped.
    while not self.finish or not self._events.empty():
        try:
            item = self._events.get(timeout=0.1)
            cl.wait_for_events([item[0]])
            self._process(*item)
            self._events.task_done()
        except Empty:
            # Nothing arrived within the timeout; re-check the exit
            # condition and try again.
            pass
        except Exception as exc:
            LOG.error(exc)
    self._profile_file.close()
def inspect_geo_data(insn, bound_expr, geo_data):
    """Run an area query around every QBX center (radius scaled by the
    enclosing scope's *radius*) and record the neighborhood sizes into
    the nonlocal *sizes* array.

    :returns: ``False`` so the caller skips the actual FMM.
    """
    nonlocal sizes, nsources, ncenters

    tree = geo_data.tree().with_queue(queue)

    from boxtree.area_query import PeerListFinder
    plf = PeerListFinder(queue.context)
    pl, evt = plf(queue, tree)

    # Perform an area query around each QBX center, counting the
    # neighborhood sizes.
    knl = NeighborhoodCounter.generate(
            queue.context,
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            tree.box_id_dtype,
            tree.nlevels,
            extra_type_aliases=(('particle_id_t', tree.particle_id_dtype),))

    centers = geo_data.centers()
    search_radii = radius * geo_data.expansion_radii().with_queue(queue)
    ncenters = len(search_radii)
    nsources = tree.nsources
    sizes = cl.array.zeros(queue, ncenters, np.int32)

    # Sanity check: the tree's sources are the stage-2 quadrature nodes.
    assert nsources == lpot_source.quad_stage2_density_discr.nnodes

    # Coordinate arrays: tree sources followed by the query centers.
    coords = []
    coords.extend(tree.sources)
    coords.extend(centers)

    evt = knl(
            *NeighborhoodCounter.unwrap_args(
                tree, pl,
                tree.box_source_starts,
                tree.box_source_counts_cumul,
                search_radii,
                sizes,
                *coords),
            range=slice(ncenters),
            queue=queue,
            wait_for=[evt])

    cl.wait_for_events([evt])

    return False  # no need to do the actual FMM
def _finish_profile_events(self) -> None:
    """Wait for all outstanding profiling events and fold their timings
    and kernel statistics into :attr:`profile_results`, then clear the
    pending-event list."""
    # Block until every recorded kernel launch has completed so the
    # profiling timestamps read below are valid.
    if self.profile_events:
        cl.wait_for_events(
            [entry.cl_event for entry in self.profile_events])

    for entry in self.profile_events:
        prg = entry.program
        stats = self._get_kernel_stats(prg, entry.args_tuple)
        elapsed = (entry.cl_event.profile.end
                   - entry.cl_event.profile.start)
        record = ProfileResult(elapsed, stats.flops, stats.bytes_accessed,
                               stats.footprint_bytes)
        self.profile_results.setdefault(prg, []).append(record)

    self.profile_events = []
def get_interleaved_centers(queue, lpot_source):
    """
    Return an array of shape (dim, ncenters) in which interior centers are
    placed next to corresponding exterior centers.
    """
    interleaver = get_interleaver_kernel(
            lpot_source.density_discr.real_dtype)
    interior = get_centers_on_side(lpot_source, -1)
    exterior = get_centers_on_side(lpot_source, +1)

    interleaved = []
    events = []
    # Interleave each coordinate axis separately into a double-length
    # array holding interior/exterior pairs side by side.
    for axis_int, axis_ext in zip(interior, exterior):
        merged = cl.array.empty(queue, len(axis_int) * 2, axis_int.dtype)
        event, _ = interleaver(queue, src1=axis_int, src2=axis_ext,
                               dst=merged)
        interleaved.append(merged)
        events.append(event)

    cl.wait_for_events(events)

    return interleaved
def _wait_and_transfer_profile_events(self) -> None:
    """Wait for all outstanding profiling events and fold their timings
    and kernel statistics into :attr:`profile_results` (keyed by kernel
    name), then clear the pending-event list."""
    # Block until every recorded launch has completed so the profiling
    # timestamps read below are valid.
    if self.profile_events:
        cl.wait_for_events(
            [entry.cl_event for entry in self.profile_events])

    for entry in self.profile_events:
        t_unit = entry.translation_unit
        # Profiled objects may be loopy translation units or plain
        # cl.Kernels; their name is accessed differently.
        if isinstance(t_unit, lp.TranslationUnit):
            name = t_unit.default_entrypoint.name
        else:
            # It's actually a cl.Kernel
            name = t_unit.function_name

        stats = self._get_kernel_stats(t_unit, entry.args_tuple)
        elapsed = (entry.cl_event.profile.end
                   - entry.cl_event.profile.start)
        record = SingleCallKernelProfile(elapsed, stats.flops,
                                         stats.bytes_accessed,
                                         stats.footprint_bytes)
        self.profile_results.setdefault(name, []).append(record)

    self.profile_events = []
def copy_outputs(self, exec_event, output_defs, output_buffers):
    """Copies outputs of a kernel execution to host, returning a list of
    `numpy.ndarray`s.

    You can obtain the first two arguments, `exec_event` and
    `output_buffers`, by calling the `run_kernel` method.

    Args:
        exec_event (pyopencl.Event): Kernel execution event
        output_defs (list): See documentation for `run_kernel`
        output_buffers (list): List of `pyopencl.Buffer`s containing
            kernel outputs.
    """
    host_arrays = []
    transfers = []
    for (length, dtype), device_buf in zip(output_defs, output_buffers):
        host_buf = np.zeros(length, dtype=dtype)
        host_arrays.append(host_buf)
        # Each copy waits on the kernel before reading its output buffer.
        transfers.append(
            cl.enqueue_copy(self.cmd_queue, host_buf, device_buf,
                            wait_for=[exec_event]))
    cl.wait_for_events(transfers)
    return host_arrays
def check_element_prop_threshold(self, element_property, threshold,
        refine_flags, debug, wait_for=None):
    """Compare *element_property* against *threshold* on the device and
    update *refine_flags* via the threshold-checker kernel.

    :returns: ``True`` if every entry of the kernel's
        ``refine_flags_updated`` output equals 1.
    """
    knl = self.code_container.element_prop_threshold_checker()

    if debug:
        npanels_to_refine_prev = cl.array.sum(refine_flags).get()

    # Removed stray commented-out code that was left inside the argument
    # list of this call.
    evt, out = knl(self.queue,
            element_property=element_property,
            refine_flags=refine_flags,
            refine_flags_updated=np.array(0),
            threshold=np.array(threshold),
            wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = cl.array.sum(refine_flags).get()
        if npanels_to_refine > npanels_to_refine_prev:
            # Lazy %-args: the message is only formatted if DEBUG is on.
            logger.debug("refiner: found %d panel(s) to refine",
                    npanels_to_refine - npanels_to_refine_prev)

    return (out["refine_flags_updated"].get() == 1).all()
def __forward_ocl_vec4_interleaved(self, x, h0, c0, sm=False):
    """Run the LSTM forward pass on the first OpenCL device using the
    float4 gate-interleaved kernel (``lstm_vec4_interleaved.cl``).

    :returns: ``((hy[1:], hy[-1:], cy[-1:]), execution_time)`` where
        *execution_time* is in milliseconds, measured around the wait
        for the enqueued kernels.

    NOTE(review): the *sm* parameter is unused in this path — confirm
    whether it was meant to select a kernel variant.
    """
    def interleave(matrix):
        # Rearrange the four column blocks of *matrix* into interleaved
        # groups of 4 consecutive columns (presumably one value per gate,
        # matching the float4 kernel's memory layout — verify against the
        # .cl source).
        if len(matrix.shape) == 1:
            return np.squeeze(interleave(np.expand_dims(matrix, axis=0)),
                              axis=0).copy()
        new = np.zeros_like(matrix)
        a, b, c, d = np.hsplit(matrix, 4)
        simd = 4
        rng = np.arange(0, matrix.shape[1], simd)
        new[:, rng] = a
        new[:, rng + 1] = b
        new[:, rng + 2] = c
        new[:, rng + 3] = d
        return new.copy()

    seq_len = x.shape[0]
    batch_size = x.shape[1]
    # Input and hidden weight matrices are fused into one operand, then
    # interleaved for the vectorized kernel.
    weights = interleave(
        np.concatenate((np.transpose(self.Wi), np.transpose(self.Wh)),
                       0)).T.astype(np.float32)
    ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
    hy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    cy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    hy[0] = h0
    cy[0] = c0

    platform = cl.get_platforms()[0]  # Select the first platform [0]
    device = platform.get_devices()[
        0]  # Select the first device on this platform [0]
    context = cl.Context([device])  # Create a context with your device
    queue = cl.CommandQueue(
        context)  # Create a command queue with your context

    # Allocate on device
    x_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=x.copy('C'))  # cuda_alloc(x)
    weights_gpu = cl.Buffer(
        context, cl.mem_flags.COPY_HOST_PTR,
        hostbuf=weights.copy('C'))  # cuda_alloc(weights)
    bias_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=interleave(
                             self.B).copy('C'))  # cuda_alloc(self.B)
    ifcos_gpu = cl.Buffer(context,
                          cl.mem_flags.READ_WRITE
                          | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
    hy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=hy.copy('C'))  # cuda_alloc(hy)
    cy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=cy.copy('C'))  # cuda_alloc(cy)

    kernelSource = ''
    kernelFilename = 'lstm_vec4_interleaved.cl'
    with open(kernelFilename, 'r') as file:
        kernelSource = file.read()
    program = cl.Program(context, kernelSource).build()

    M = np.int32(batch_size)
    K = np.int32(self.input_size + self.hidden_size)
    N = np.int32(4 * self.hidden_size)
    gemm_lws = (8, 8, 1)
    gemm_gws = int(M), int(N), gemm_lws[2]
    eltwise_lws = (8, 8, 1)
    eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
    events = []
    # One GEMM plus one element-wise kernel per timestep; barriers keep
    # the steps ordered since each step reads the previous hy/cy.
    for i in range(0, seq_len):
        gemm_kernel = program.lstm_gemm
        gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                             ifcos_gpu, M, K, N,
                             np.int32(self.input_size),
                             np.int32(self.hidden_size), np.int32(i))
        ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                         gemm_lws)
        events.append(ev1)
        cl.enqueue_barrier(queue)
        eltwise_kernel = program.lstm_eltwise
        eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                np.int32(self.hidden_size),
                                np.int32(batch_size), np.int32(i))
        ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                         eltwise_gws, eltwise_lws)
        events.append(ev2)
        cl.enqueue_barrier(queue)

    timer_start = datetime.datetime.now()
    cl.wait_for_events(events)
    execution_time = (datetime.datetime.now()
                      - timer_start).total_seconds() * 1000

    cl.enqueue_copy(queue, ifcos, ifcos_gpu)
    cl.enqueue_copy(queue, hy, hy_gpu)
    cl.enqueue_copy(queue, cy, cy_gpu)
    queue.finish()

    # Copy the data for array c back to the host
    results = hy[1:], hy[-1:], cy[-1:]
    return results, execution_time
def __forward_ocl_naive(self, x, h0, c0, acc):
    """Run the LSTM forward pass on the first OpenCL device using the
    naive kernel, or its accumulator variant when *acc* is true.

    :returns: ``((hy[1:], hy[-1:], cy[-1:]), execution_time)`` where
        *execution_time* is in milliseconds, measured around the wait
        for the enqueued kernels.
    """
    seq_len = x.shape[0]
    batch_size = x.shape[1]
    # Input and hidden weight matrices are fused into one operand.
    weights = np.concatenate(
        (np.transpose(self.Wi), np.transpose(self.Wh)),
        0).astype(np.float32)
    ifcos = np.zeros((batch_size, 4 * self.hidden_size)).astype(np.float32)
    hy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    cy = np.zeros(
        (seq_len + 1, batch_size, self.hidden_size)).astype(np.float32)
    hy[0] = h0
    cy[0] = c0

    platform = cl.get_platforms()[0]  # Select the first platform [0]
    device = platform.get_devices()[
        0]  # Select the first device on this platform [0]
    context = cl.Context([device])  # Create a context with your device
    queue = cl.CommandQueue(
        context)  # Create a command queue with your context

    # Allocate on device
    x_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=x.copy('C'))  # cuda_alloc(x)
    weights_gpu = cl.Buffer(
        context, cl.mem_flags.COPY_HOST_PTR,
        hostbuf=weights.copy('C'))  # cuda_alloc(weights)
    bias_gpu = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=self.B.copy('C'))  # cuda_alloc(self.B)
    ifcos_gpu = cl.Buffer(context,
                          cl.mem_flags.READ_WRITE
                          | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=ifcos.copy('C'))  # cuda_alloc(ifcos)
    hy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=hy.copy('C'))  # cuda_alloc(hy)
    cy_gpu = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=cy.copy('C'))  # cuda_alloc(cy)

    kernelSource = ''
    # *acc* selects the accumulator kernel variant.
    kernelFilename = 'lstm_naive.cl' if acc is False else 'lstm_naive_acc.cl'
    with open(kernelFilename, 'r') as file:
        kernelSource = file.read()
    program = cl.Program(context, kernelSource).build()

    M = np.int32(batch_size)
    K = np.int32(self.input_size + self.hidden_size)
    N = np.int32(4 * self.hidden_size)
    gemm_lws = (1, 1, 1)
    gemm_gws = int(M), int(N), gemm_lws[2]
    eltwise_lws = (1, 1, 1)
    eltwise_gws = int(M), int(self.hidden_size), eltwise_lws[2]
    events = []
    # One GEMM plus one element-wise kernel per timestep; barriers keep
    # the steps ordered since each step reads the previous hy/cy.
    for i in range(0, seq_len):
        gemm_kernel = program.lstm_gemm
        gemm_kernel.set_args(x_gpu, hy_gpu, weights_gpu, bias_gpu,
                             ifcos_gpu, M, K, N,
                             np.int32(self.input_size),
                             np.int32(self.hidden_size), np.int32(i))
        ev1 = cl.enqueue_nd_range_kernel(queue, gemm_kernel, gemm_gws,
                                         gemm_lws)
        events.append(ev1)
        cl.enqueue_barrier(queue)
        eltwise_kernel = program.lstm_eltwise
        eltwise_kernel.set_args(cy_gpu, ifcos_gpu, hy_gpu,
                                np.int32(self.hidden_size),
                                np.int32(batch_size), np.int32(i))
        ev2 = cl.enqueue_nd_range_kernel(queue, eltwise_kernel,
                                         eltwise_gws, eltwise_lws)
        events.append(ev2)
        cl.enqueue_barrier(queue)

    timer_start = datetime.datetime.now()
    cl.wait_for_events(events)
    execution_time = (datetime.datetime.now()
                      - timer_start).total_seconds() * 1000

    cl.enqueue_copy(queue, ifcos, ifcos_gpu)
    cl.enqueue_copy(queue, hy, hy_gpu)
    cl.enqueue_copy(queue, cy, cy_gpu)
    queue.finish()

    results = hy[1:], hy[-1:], cy[-1:]
    return results, execution_time
def find_peer_lists(self, tree):
    """Compute and return the peer lists for *tree*, blocking until the
    peer-list finder kernel has completed."""
    peer_list_finder = self.code_container.peer_list_finder()
    lists, event = peer_list_finder(self.queue, tree)
    cl.wait_for_events([event])
    return lists
def test_speed(ctx, rng):
    """Benchmark the ragged gather GEMV plans against clBLAS (when
    ``pyopencl_blas`` is importable) on *k* randomly sized matrix-vector
    products, printing wall-clock timings for each backend.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    # Number of independent GEMV problems; alternative sizes kept below
    # for manual experimentation.
    k = 300
    # k = 100
    # k = 32
    # k = 16
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]
    # ms = [4096 for i in range(k)]
    # ns = [4096 for i in range(k)]

    aa = [
        rng.uniform(-1, 1, size=(m, n)).astype('float32')
        for m, n in zip(ms, ns)
    ]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]
    # ajs = [rng.randint(k, size=p) for i in range(k)]
    # xjs = [rng.randint(k, size=p) for i in range(k)]

    # alpha = 0.5
    # beta = 0.1
    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def array(a):
            # Upload a host array into a new device array.
            cla = cl.array.Array(queue, a.shape, a.dtype)
            cla.set(a)
            return cla

        clAs = [array(a) for a in aa]
        clXs = [array(x.ravel()) for x in xx]
        clYs = [array(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        queue.finish()
        with Timer() as timer:
            if 0:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    q = queues[i % len(queues)]
                    e = pyopencl_blas.gemv(q, A, X, Y)
                    events.append(e)
                for q in queues:
                    q.flush()
                cl.wait_for_events(events)

        print("clBLAS: %0.3f" % timer.duration)
def prefix_sum(cq, scanner, values_buf):
    """Run *scanner*'s prefix-sum kernel on *values_buf* using command
    queue *cq* and block until it has finished."""
    event = scanner.prefix_sum(cq, values_buf)
    cl.wait_for_events([event])
def mark_targets(self, places, dofdesc, tree, peer_lists, target_status,
        debug, wait_for=None):
    """Mark (in *target_status*) targets that lie close to source panels.

    :returns: a device-derived truth value: ``True`` if any target was
        marked close to a panel.
    """
    from pytential import bind, sym
    ambient_dim = places.ambient_dim

    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.target_marker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    # One-element device flag; set by the kernel when any target is marked.
    found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
    found_target_close_to_panel.finish()

    # Perform a space invader query over the sources.
    source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
    sources = [
        axis.with_queue(self.queue)[source_slice]
        for axis in tree.sources
    ]
    tunnel_radius_by_source = flatten(
            bind(places,
                sym._close_target_tunnel_radii(ambient_dim, dofdesc=dofdesc))(
                    self.array_context))

    # Target-marking algorithm (TGTMARK):
    #
    # (1) Use a space invader query to tag each leaf box that intersects with the
    # "near-source-detection tunnel" with the distance to the closest source.
    #
    # (2) Do an area query around all targets with the radius resulting
    # from the space invader query, enumerate sources in that vicinity.
    # If a source is found whose distance to the target is less than the
    # source's tunnel radius, mark that target as pending.
    # (or below: mark the source for refinement)

    # Note that this comment is referred to below by "TGTMARK". If you
    # remove this comment or change the algorithm here, make sure that
    # the reference below is still accurate.

    # Trade off for space-invaders vs directly tagging targets in
    # endangered boxes:
    #
    # (-) More complicated
    # (-) More actual work
    # (+) Taking the point of view of the targets could potentially lead to
    # more parallelism, if you think of the targets as unbounded while the
    # sources are fixed (which sort of makes sense, given that the number
    # of targets per box is not bounded).

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    # NOTE: the argument order below must match the kernel template.
    evt = knl(*unwrap_args(tree, peer_lists,
            tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_target_slice.start,
            tree.sorted_target_ids,
            tunnel_radius_by_source,
            box_to_search_dist,
            target_status,
            found_target_close_to_panel,
            *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

    if debug:
        target_status.finish()
        # Marked target = 1, 0 otherwise
        marked_target_count = cl.array.sum(target_status).get()
        logger.debug(
                "target association: {}/{} targets marked close to panels".
                format(marked_target_count, tree.nqbxtargets))

    cl.wait_for_events([evt])

    return (found_target_close_to_panel == 1).all().get()
def try_find_centers(self, tree, peer_lists, lpot_source,
        target_status, target_flags, target_assoc,
        target_association_tolerance, debug, wait_for=None):
    """Attempt to associate marked targets with eligible QBX centers,
    writing the chosen center into ``target_assoc.target_to_center`` and
    updating *target_status* in place.
    """
    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.center_finder(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        target_status.finish()
        marked_target_count = int(cl.array.sum(target_status).get())

    # Perform a space invader query over the centers.
    center_slice = (
            tree.sorted_target_ids[tree.qbx_user_center_slice]
            .with_queue(self.queue))
    centers = [
            axis.with_queue(self.queue)[center_slice]
            for axis in tree.sources]
    expansion_radii_by_center = \
            lpot_source._expansion_radii("ncenters").with_queue(self.queue)
    expansion_radii_by_center_with_tolerance = \
            expansion_radii_by_center * (1 + target_association_tolerance)

    # Idea:
    #
    # (1) Tag leaf boxes around centers with max distance to usable center.
    # (2) Area query from targets with those radii to find closest eligible
    # center.

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            centers,
            expansion_radii_by_center_with_tolerance,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    # Per-target running minimum distance, initialized to infinity so any
    # eligible center improves on it.
    min_dist_to_center = cl.array.empty(
            self.queue, tree.nqbxtargets, tree.coord_dtype)
    min_dist_to_center.fill(np.inf)

    wait_for.extend(min_dist_to_center.events)

    # NOTE: the argument order below must match the kernel template.
    evt = knl(
            *unwrap_args(
                tree, peer_lists,
                tree.box_to_qbx_center_starts,
                tree.box_to_qbx_center_lists,
                tree.qbx_user_center_slice.start,
                tree.qbx_user_target_slice.start,
                tree.sorted_target_ids,
                expansion_radii_by_center_with_tolerance,
                box_to_search_dist,
                target_flags,
                target_status,
                target_assoc.target_to_center,
                min_dist_to_center,
                *tree.sources),
            range=slice(tree.nqbxtargets),
            queue=self.queue,
            wait_for=wait_for)

    if debug:
        target_status.finish()
        # Associated target = 2, marked target = 1
        ntargets_associated = (
                int(cl.array.sum(target_status).get())
                - marked_target_count)
        assert ntargets_associated >= 0
        logger.debug("target association: {} targets were assigned centers"
                .format(ntargets_associated))

    cl.wait_for_events([evt])
def event_waiter2(e, key):
    """Block until event *e* completes, then record completion under *key*
    in the module-level ``status`` mapping."""
    pending = [e]
    cl.wait_for_events(pending)
    status[key] = True
def mark_panels_for_refinement(self, places, dofdesc, tree, peer_lists,
        target_status, refine_flags, debug, wait_for=None):
    """Flag source panels for refinement where target association failed.

    Sets entries of *refine_flags* for panels needing refinement and
    returns *True* if at least one panel was flagged.

    :arg wait_for: optional list of events to wait on before launching.
    """
    from pytential import bind, sym
    ambient_dim = places.ambient_dim

    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.refiner_for_failed_target_association(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    # One-element flag the kernel sets if any panel was marked.
    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    # Perform a space invader query over the sources.
    source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
    sources = [
            axis.with_queue(self.queue)[source_slice]
            for axis in tree.sources]
    tunnel_radius_by_source = flatten(
            bind(places, sym._close_target_tunnel_radii(
                ambient_dim, dofdesc=dofdesc))(self.array_context))

    # see (TGTMARK) above for algorithm.

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    # NOTE(review): *unwrap_args* is not defined locally; presumably
    # AreaQueryElementwiseTemplate.unwrap_args from an enclosing/module
    # scope -- confirm.
    evt = knl(*unwrap_args(
        tree, peer_lists,
        tree.box_to_qbx_source_starts,
        tree.box_to_qbx_source_lists,
        tree.qbx_panel_to_source_starts,
        tree.qbx_user_source_slice.start,
        tree.qbx_user_target_slice.start,
        tree.nqbxpanels,
        tree.sorted_target_ids,
        tunnel_radius_by_source,
        target_status,
        box_to_search_dist,
        refine_flags,
        found_panel_to_refine,
        *tree.sources),
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=wait_for)

    if debug:
        refine_flags.finish()
        # Marked panel = 1, 0 otherwise
        marked_panel_count = cl.array.sum(refine_flags).get()
        logger.debug(
                "target association: {} panels flagged for refinement".format(
                    marked_panel_count))

    cl.wait_for_events([evt])

    return (found_panel_to_refine == 1).all().get()
def gpu_generate_next_flock(step, queue, intermediary_events, kernels,
                            gpu_params, buffers, flocks, global_map):
    """
    Does one iteration of the computation. To be called in the increasing
    continuous order of the integer "step" argument.

    :param step: 0-based iteration number; also used as an offset into the
        accumulated device-side result buffers.
    :param queue: pyopencl CommandQueue for all enqueues.
    :param intermediary_events: events the first kernels wait on; the
        host->device copies enqueued here are appended to it.
    :param kernels: mapping of kernel name -> cl.Kernel.
    :param gpu_params: mapping with "preferred_multiple" and
        "max_work_group_size" work-group sizing hints.
    :param buffers: mapping of buffer name -> cl.Buffer.
    :param flocks: host-side list; the newly generated flock is appended.
    :param global_map: per-step host arrays receiving the map snapshot when
        cfg.track_map_changes is enabled.
    """
    # Prepare memory for the generated flock
    new_flock = agents.Flock(cfg.NumberOfBoids)
    new_flock.init_empty_array()

    events = {}

    # Transfer the iteration number
    iteration = np.uint16(step)
    intermediary_events.append(
        cl.enqueue_copy(queue, buffers["global_iteration"], iteration))

    # Transfer the random number
    random = np.float32(np.random.rand(1) * 2 * np.pi)
    intermediary_events.append(
        cl.enqueue_copy(queue, buffers["global_random"], random))

    # -------------------------------------------------------------------------
    # Example workgroup/workitem pair: 7 groups of 64 items (400 boids)
    events["k_agent_reynolds_rules13_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_reynolds_rules13_preprocess"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"])
             * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],),
        global_work_offset=None, wait_for=intermediary_events)

    # Example workgroup/workitem pair: 625 groups of 256 items (400x400 boids)
    events["k_agent_reynolds_rule2_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_reynolds_rule2_preprocess"],
        (int(np.ceil(np.square(cfg.NumberOfBoids)
                     / gpu_params["max_work_group_size"])
             * gpu_params["max_work_group_size"]),),
        (gpu_params["max_work_group_size"],),
        global_work_offset=None, wait_for=intermediary_events)

    # Example workgroup/workitem pair: 7 groups of 64 items (400 boids)
    events["k_agent_ai_and_sim"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai_and_sim"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"])
             * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],),
        global_work_offset=None,
        wait_for=[events["k_agent_reynolds_rules13_preprocess"],
                  events["k_agent_reynolds_rule2_preprocess"]])

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_flocks"] = cl.enqueue_copy(
        queue, new_flock.np_arrays, buffers["global_generated_flocks"],
        device_offset=step * cfg.NumberOfBoids * agents.Boid.arraySize,
        wait_for=[events["k_agent_ai_and_sim"]])

    if cfg.track_map_changes:
        events["transfer_map"] = cl.enqueue_copy(
            queue, global_map[step], buffers["global_map"],
            wait_for=[events["k_agent_ai_and_sim"]])
    else:
        events["transfer_map"] = None

    # BUGFIX: when map tracking is disabled, events["transfer_map"] is None
    # and cl.wait_for_events() rejects non-event entries. Wait only on the
    # events that were actually enqueued.
    cl.wait_for_events([e for e in (events["transfer_flocks"],
                                    events["transfer_map"])
                        if e is not None])

    # print(new_flock.np_arrays)
    flocks.append(new_flock)

    # TEST
    if cfg.debug_on:
        global_test = np.zeros(10).astype(np.float32)
        events["transfer_test"] = cl.enqueue_copy(queue, global_test,
                                                  buffers["global_test"])
        events["transfer_test"].wait()
        s = "%3d " % step
        for value in global_test:
            s += "%9.3f " % value
        print(s)
def main(mname):
    """Distribute a blocked sparse matrix-vector product across MPI ranks,
    one GPU per rank, using two command queues to overlap host->device
    copies with kernel execution.

    :param mname: path of an .npz file containing CSR-style arrays
        'indptr', 'indices' and block data 'M'.
    """
    # Readin the matrix
    data = np.load(mname)
    indptr = data['indptr']
    indices = data['indices']
    M = data['M']
    m = len(indptr) - 1
    nSmp = 1000
    # Replicate the matrix blocks across nSmp samples.
    M = np.tile(M, (1, nSmp, 1, 1))
    v = np.ones((m * 3, nSmp))
    y = np.zeros((m * 3, nSmp))

    # Get the ranks.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank)

    # Slice the data to be processed.
    srow = rank * int(m / size)
    erow = (rank + 1) * int(m / size)
    if rank == size - 1:
        # The last rank picks up the remainder rows.
        erow = m
    lm = erow - srow
    tM = M[indptr[srow]:indptr[erow]]
    ty = y[srow * 3:erow * 3]
    # Local index arrays, rebased so the first local row starts at 0.
    tIndptr = indptr[srow:erow + 1] - indptr[srow]
    tIndices = indices[indptr[srow]:indptr[erow]]

    platforms = cl.get_platforms()
    devices = platforms[0].get_devices(cl.device_type.GPU)
    ndevices = len(devices)
    if ndevices < size:
        print('GPUs is not enough! Actural size: {}, need: {}'.format(
            ndevices, size))
        return
    # One GPU per MPI rank; two queues so copies and kernels can overlap.
    device = devices[rank]
    context = cl.Context([device])
    queues = [cl.CommandQueue(context) for i in range(2)]

    # Create the buffers.
    mem_flags = cl.mem_flags
    indptr_buf = cl.Buffer(
        context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
        hostbuf=tIndptr)
    indices_buf = cl.Buffer(
        context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,
        hostbuf=tIndices)
    # Allocate the OpenCL source and result buffer memory objects on GPU
    # device GMEM.
    matrix_buf = cl.Buffer(context, mem_flags.READ_ONLY, tM.nbytes)
    vector_buf = cl.Buffer(context, mem_flags.READ_ONLY, v.nbytes)
    destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, ty.nbytes)

    # Allocate pinned source and result host buffers:
    # Note: Pinned (Page Locked) memory is needed for async host<->GPU
    # memory copy operations ***
    pinnedM = cl.Buffer(
        context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR, tM.nbytes)
    pinnedV = cl.Buffer(
        context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR, v.nbytes)
    pinnedRes = cl.Buffer(
        context, mem_flags.READ_WRITE | mem_flags.ALLOC_HOST_PTR, ty.nbytes)

    # Get mapped pointers to pinned input host buffers.
    # Note: This allows general (non-OpenCL) host functions to access
    # pinned buffers using standard pointers
    map_flags = cl.map_flags
    srcM, _eventSrcM = cl.enqueue_map_buffer(
        queues[0], pinnedM, map_flags.WRITE, 0, tM.shape, tM.dtype)
    srcV, _eventSrcV = cl.enqueue_map_buffer(
        queues[0], pinnedV, map_flags.WRITE, 0, v.shape, v.dtype)
    srcRes, _eventSrcRes = cl.enqueue_map_buffer(
        queues[0], pinnedRes, map_flags.READ, 0, ty.shape, ty.dtype)
    srcM[:, :, :, :] = tM
    srcV[:, :] = v

    # Split the local rows in two halves, one per queue.
    halfSize = int(lm / 2)

    localWorkSize = 64
    # assumes all the devices have same number of computes unit.
    num_compute_units = device.max_compute_units
    globalWorkSize = 8 * num_compute_units * localWorkSize
    print('gpu {} num of computing unites {}'.format(rank, num_compute_units))

    # Read and build the kernel.
    kernelsource = open("multiGPUsTest.cl").read()
    program = cl.Program(context, kernelsource).build()

    # NOTE(review): `timer` and `LOOP_COUNT` are module-level names not
    # visible here -- presumably timeit.default_timer and a constant.
    start = timer()
    for iloop in range(LOOP_COUNT):
        eventV = cl.enqueue_copy(queues[0], vector_buf, srcV)
        eventM0 = cl.enqueue_copy(queues[0], matrix_buf,
                                  srcM[:tIndptr[halfSize]])
        # Kernel.
        matrix_dot_vector_kernel_event0 = \
            program.matrix_dot_vector(
                queues[0], (globalWorkSize,), (localWorkSize,),
                np.int64(halfSize), np.int64(nSmp), np.int64(0),
                indptr_buf, indices_buf, matrix_buf, vector_buf,
                destination_buf, wait_for=[eventV, eventM0])

        # Second half of the matrix on queue 1, offset past the first half.
        # NOTE(review): offset factor 9 * 8 presumably means 3x3 blocks of
        # 8-byte doubles -- confirm against the .cl kernel.
        eventM1 = cl.enqueue_copy(
            queues[1], matrix_buf, srcM[tIndptr[halfSize]:],
            device_offset=tIndptr[halfSize] * nSmp * 9 * 8)
        # Kernel.
        matrix_dot_vector_kernel_event1 = \
            program.matrix_dot_vector(
                queues[1], (globalWorkSize,), (localWorkSize,),
                np.int64(lm), np.int64(nSmp), np.int64(halfSize),
                indptr_buf, indices_buf, matrix_buf, vector_buf,
                destination_buf, wait_for=[eventV, eventM1])

        ## Step #11. Move the kernel's output data to host memory.
        matrix_dot_vector_copy_event0 = \
            cl.enqueue_copy(
                queues[0], srcRes[:halfSize*3], destination_buf,
                is_blocking=False,
                wait_for=[matrix_dot_vector_kernel_event0])
        matrix_dot_vector_copy_event1 = \
            cl.enqueue_copy(
                queues[1], srcRes[halfSize*3:], destination_buf,
                is_blocking=False,
                wait_for=[matrix_dot_vector_kernel_event1],
                device_offset=halfSize*3*nSmp*8)
        # matrix_dot_vector_copy_event0.wait()
        cl.wait_for_events(
            [matrix_dot_vector_copy_event0, matrix_dot_vector_copy_event1])
    end = timer()
    print('OK, \t\t\t time: {:10.5f} ms'.format(
        (end - start) / float(LOOP_COUNT) * 1000.0))
def wait():
    """Block until every event collected in the module-level ``_events``
    list has completed."""
    pending = _events
    cl.wait_for_events(pending)
def pairGroup(group, dirtyElements, pairDistanceLimit=None):
    """Greedily pair the closest elements of *group* via an OpenCL distance
    matrix, repeatedly merging the closest valid pair into a nested
    two-element list until no pairing remains.

    :param group: sequence of elements to pair; merged pairs become nested
        lists ``[a, b]``.
    :param dirtyElements: per-element truthy flags; elements that start out
        dirty are excluded from the distance matrix and from the result.
    :param pairDistanceLimit: optional cutoff forwarded to
        calcDistanceMatrix.
    :returns: list of (possibly nested) groups for elements that did not
        start out dirty.
    """
    group = list(group)
    startedDirty = list(dirtyElements)
    isDirty = array(dirtyElements, dtype=uint8)

    ## Create a distance matrix (woohoo, will fit into memory now :D)
    distanceMatrix, clDistanceMatrix = calcDistanceMatrix(
        group,
        doIncludeFunc=lambda i: not isDirty[i],
        pairDistanceLimit=pairDistanceLimit)
    # Stride stays at the original size; the logical size shrinks as
    # rows/cols are deleted.
    distanceMatrixStride = len(group)
    distanceMatrixSize = len(group)

    # Got all data... pair!
    doPairing = True

    # Initialization of OpenCL buffers
    minPosition = ndarray((2), dtype=int32)
    minPosition[:] = 0
    clMinPosition = cl.Buffer(
        clContext,
        cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=minPosition)
    clColBuffer = cl.Buffer(
        clContext, cl.mem_flags.READ_WRITE,
        size=distanceMatrixStride * float32().nbytes)
    clRowBuffer = cl.Buffer(
        clContext, cl.mem_flags.READ_WRITE,
        size=distanceMatrixStride * float32().nbytes)
    clDirtVector = cl.Buffer(
        clContext,
        cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
        hostbuf=isDirty)

    # prevs carries the event(s) the next kernel launch must wait on.
    prevs = ()
    while doPairing:
        ## Find minimum in distance matrix
        # Python way
        #i_py, j_py = unravel_index(distanceMatrix.argmin(), distanceMatrix.shape)
        # OpenCL way
        # Create minimum vectors
        if len(prevs) > 0:
            cl.wait_for_events(prevs)
        prevs = \
            pairerProg.minCompressCol(
                clQueue, [distanceMatrixSize], None,
                int32(distanceMatrixSize), int32(distanceMatrixStride),
                clDistanceMatrix, clColBuffer),
        # Compress these vectors to matrix offsets
        prevs = \
            pairerProg.reduceMinIndex(
                clQueue, [1], None,
                int32(distanceMatrixSize), clColBuffer, int32(0),
                clMinPosition, wait_for=prevs),
        prevs = \
            pairerProg.reduceMinRowIndex(
                clQueue, [1], None,
                int32(distanceMatrixSize), int32(distanceMatrixStride),
                clDistanceMatrix, clMinPosition, wait_for=prevs),
        # Load indexes into Python memory
        cl.enqueue_copy(clQueue, minPosition, clMinPosition,
                        is_blocking=True, wait_for=prevs)
        i, j = minPosition

        ## Do pairing
        i, j = min(i, j), max(i, j)
        # Track if this is a valid pairing with other bins
        doPairing = (i >= 0) and (not any([isDirty[x] for x in (i, j)]))
        # !!!Can perhaps merge more points by marking points as dirty if they cannot be matched!!! <- testing right now
        if doPairing:
            ## Adapt group to do the paring
            group[i] = [group[i], group[j]]
            group.pop(j)
            isDirty[i] = False
            isDirty = delete(isDirty, j)
            startedDirty.pop(j)

            ## Matrix update
            # Py way
            #minCol = hstack((distanceMatrix[[i],:].T, distanceMatrix[:,[i]], distanceMatrix[[j],:].T, distanceMatrix[:,[j]])).min(1)
            #distanceMatrix[[i],:] = array([hstack((array([inf] * (i + 1)), minCol[i + 1:]))])
            #distanceMatrix[:,[i]] = array([hstack((minCol[:i], array([inf] * (distanceMatrix.shape[0] - i))))]).T
            #distanceMatrix = delete(delete(distanceMatrix, (j,), 1), (j,), 0)
            # Cl way
            prevs = (pairerProg.minUnifyDistances(
                clQueue, [distanceMatrixSize], None,
                int32(i), int32(j), int32(distanceMatrixStride),
                clDistanceMatrix),)
            prevs = (pairerProg.deleteRowAndCol(
                         clQueue, [distanceMatrixSize], None,
                         int32(j), int32(distanceMatrixSize),
                         int32(distanceMatrixStride), clDistanceMatrix,
                         wait_for=prevs),
                     pairerProg.deleteBoolVectorElement(
                         clQueue, [1], None,
                         int32(j), int32(distanceMatrixSize), clDirtVector,
                         wait_for=prevs))
            distanceMatrixSize -= 1
            doPairing = distanceMatrixSize > 1
        elif i < 0:
            pass  # Nothing to do... nothing to match... quit!
        else:
            ## Mark pair as dirty
            isDirty[i] = True
            isDirty[j] = True
            ## Delete low value from matrix
            # Py way
            # - no py way yet - need to cross out a cross shape in matrix
            # Cl way
            prevs = (pairerProg.crossDirty(
                clQueue, [2], None, int32(i),
                int32(distanceMatrixStride), int32(distanceMatrixSize),
                clDistanceMatrix, clDirtVector, wait_for=prevs),)
            prevs = (pairerProg.crossDirty(
                clQueue, [2], None, int32(j),
                int32(distanceMatrixStride), int32(distanceMatrixSize),
                clDistanceMatrix, clDirtVector, wait_for=prevs),)
            # Keep going while at least two clean elements remain.
            doPairing = sum([1 for d in isDirty if not d]) > 1

        # Debug stuff
        #print "cl:",i, j, " py:",i_py, j_py
        #print distanceMatrix[i,j], distanceMatrix[i_py,j_py]
        #print distanceMatrix.argmin()
        #print "py:"
        #print distanceMatrix
        #distanceMatrixDeb = ndarray((distanceMatrixStride, distanceMatrixStride), dtype=float32)
        #cl.enqueue_copy(clQueue, distanceMatrixDeb, clDistanceMatrix, is_blocking=True, wait_for=prevs)
        #print "cl:"
        #print distanceMatrixDeb[:distanceMatrixSize,:distanceMatrixSize]
        #cl.enqueue_copy(clQueue, isDirty, clDirtVector, is_blocking=True, wait_for=prevs)
        #print "CopDirt",isDirty

    cl.wait_for_events(prevs)

    return [g for i, g in enumerate(group) if not startedDirty[i]]
def find_offsets(cq, finder, *args):
    """Run the offset finder on command queue *cq* and block until the
    resulting event completes."""
    evt = finder.find_offsets(cq, *args)
    cl.wait_for_events([evt])
def test_wait_for_events(ctx_factory):
    """Smoke test: waiting on a list of marker events completes."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)
    markers = [cl.enqueue_marker(queue) for _ in range(2)]
    cl.wait_for_events(markers)
def mark_targets(self, tree, peer_lists, lpot_source, target_status,
        debug, wait_for=None):
    """Mark targets that lie inside a source's near-detection tunnel.

    Sets entries of *target_status* to 1 for such targets; returns *True*
    if at least one target was marked.

    :arg wait_for: optional list of events to wait on before launching.
    """
    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.target_marker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    # One-element flag the kernel sets if any target was marked.
    found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
    found_target_close_to_panel.finish()

    # Perform a space invader query over the sources.
    source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
    sources = [
            axis.with_queue(self.queue)[source_slice]
            for axis in tree.sources]
    tunnel_radius_by_source = (
            lpot_source._close_target_tunnel_radius("nsources")
            .with_queue(self.queue))

    # Target-marking algorithm (TGTMARK):
    #
    # (1) Use a space invader query to tag each leaf box that intersects with the
    # "near-source-detection tunnel" with the distance to the closest source.
    #
    # (2) Do an area query around all targets with the radius resulting
    # from the space invader query, enumerate sources in that vicinity.
    # If a source is found whose distance to the target is less than the
    # source's tunnel radius, mark that target as pending.
    # (or below: mark the source for refinement)

    # Note that this comment is referred to below by "TGTMARK". If you
    # remove this comment or change the algorithm here, make sure that
    # the reference below is still accurate.

    # Trade off for space-invaders vs directly tagging targets in
    # endangered boxes:
    #
    # (-) More complicated
    # (-) More actual work
    # (+) Taking the point of view of the targets could potentially lead to
    # more parallelism, if you think of the targets as unbounded while the
    # sources are fixed (which sort of makes sense, given that the number
    # of targets per box is not bounded).

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    # NOTE(review): this re-fetch discards the .with_queue(...) version
    # computed above -- presumably a redundant call; confirm whether
    # passing the queue-less array to the kernel is intentional.
    tunnel_radius_by_source = lpot_source._close_target_tunnel_radius("nsources")

    # NOTE(review): *unwrap_args* is not defined locally; presumably
    # AreaQueryElementwiseTemplate.unwrap_args from an enclosing/module
    # scope -- confirm.
    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_target_slice.start,
            tree.sorted_target_ids,
            tunnel_radius_by_source,
            box_to_search_dist,
            target_status,
            found_target_close_to_panel,
            *tree.sources),
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=wait_for)

    if debug:
        target_status.finish()
        # Marked target = 1, 0 otherwise
        marked_target_count = cl.array.sum(target_status).get()
        logger.debug("target association: {}/{} targets marked close to panels"
                .format(marked_target_count, tree.nqbxtargets))

    cl.wait_for_events([evt])

    return (found_target_close_to_panel == 1).all().get()
def find_centers(self, places, dofdesc, tree, peer_lists, target_status,
        target_flags, target_assoc, target_association_tolerance,
        debug, wait_for=None):
    """Assign marked targets to the closest eligible expansion center,
    tracked separately for centers on the plus/minus side.

    Results are written into *target_assoc.target_to_center* and
    *target_status* by the kernel.

    :arg wait_for: optional list of events to wait on before launching.
    """
    from pytential import bind, sym
    ambient_dim = places.ambient_dim

    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.center_finder(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        target_status.finish()
        # Snapshot so the post-kernel delta gives the newly associated count.
        marked_target_count = int(cl.array.sum(target_status).get())

    # Perform a space invader query over the centers.
    center_slice = (
            tree.sorted_target_ids[tree.qbx_user_center_slice].with_queue(
                self.queue))
    centers = [
            axis.with_queue(self.queue)[center_slice]
            for axis in tree.sources]
    expansion_radii_by_center = bind(places, sym.expansion_radii(
        ambient_dim,
        granularity=sym.GRANULARITY_CENTER,
        dofdesc=dofdesc))(self.array_context)
    expansion_radii_by_center_with_tolerance = flatten(
            expansion_radii_by_center * (1 + target_association_tolerance))

    # Idea:
    #
    # (1) Tag leaf boxes around centers with max distance to usable center.
    # (2) Area query from targets with those radii to find closest eligible
    # center in terms of relative distance.

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            centers,
            expansion_radii_by_center_with_tolerance,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    def make_target_field(fill_val, dtype=tree.coord_dtype):
        # Per-target scratch array initialized to fill_val; its fill
        # event is chained into wait_for.
        arr = cl.array.empty(self.queue, tree.nqbxtargets, dtype)
        arr.fill(fill_val)
        wait_for.extend(arr.events)
        return arr

    target_to_center_plus = make_target_field(-1, np.int32)
    target_to_center_minus = make_target_field(-1, np.int32)
    min_dist_to_center_plus = make_target_field(np.inf)
    min_dist_to_center_minus = make_target_field(np.inf)
    min_rel_dist_to_center_plus = make_target_field(np.inf)
    min_rel_dist_to_center_minus = make_target_field(np.inf)

    # NOTE(review): *unwrap_args* is not defined locally; presumably
    # AreaQueryElementwiseTemplate.unwrap_args from an enclosing/module
    # scope -- confirm.
    evt = knl(*unwrap_args(
        tree, peer_lists,
        tree.box_to_qbx_center_starts,
        tree.box_to_qbx_center_lists,
        tree.qbx_user_center_slice.start,
        tree.qbx_user_target_slice.start,
        tree.sorted_target_ids,
        expansion_radii_by_center_with_tolerance,
        box_to_search_dist,
        target_flags,
        target_status,
        target_assoc.target_to_center,
        target_to_center_plus,
        target_to_center_minus,
        min_dist_to_center_plus,
        min_dist_to_center_minus,
        min_rel_dist_to_center_plus,
        min_rel_dist_to_center_minus,
        *tree.sources),
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=wait_for)

    if debug:
        target_status.finish()
        # Associated target = 2, marked target = 1
        ntargets_associated = (int(cl.array.sum(target_status).get())
                - marked_target_count)
        assert ntargets_associated >= 0
        logger.debug(
                "target association: {} targets were assigned centers".format(
                    ntargets_associated))

    cl.wait_for_events([evt])
def mark_panels_for_refinement(self, tree, peer_lists, lpot_source,
        target_status, refine_flags, debug, wait_for=None):
    """Flag source panels for refinement where target association failed.

    Sets entries of *refine_flags* for panels needing refinement and
    returns *True* if at least one panel was flagged.

    :arg wait_for: optional list of events to wait on before launching.
    """
    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    from pytools import div_ceil
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    knl = self.code_container.refiner_for_failed_target_association(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    # One-element flag the kernel sets if any panel was marked.
    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    # Perform a space invader query over the sources.
    source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
    sources = [
            axis.with_queue(self.queue)[source_slice]
            for axis in tree.sources]
    tunnel_radius_by_source = (
            lpot_source._close_target_tunnel_radius("nsources")
            .with_queue(self.queue))

    # See (TGTMARK) above for algorithm.

    box_to_search_dist, evt = self.code_container.space_invader_query()(
            self.queue,
            tree,
            sources,
            tunnel_radius_by_source,
            peer_lists,
            wait_for=wait_for)
    wait_for = [evt]

    # NOTE(review): *unwrap_args* is not defined locally (presumably an
    # enclosing-scope binding of AreaQueryElementwiseTemplate.unwrap_args),
    # and the tunnel radii are fetched a second time below instead of
    # reusing tunnel_radius_by_source -- presumably redundant; confirm.
    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists,
            tree.qbx_panel_to_source_starts,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_target_slice.start,
            tree.nqbxpanels,
            tree.sorted_target_ids,
            lpot_source._close_target_tunnel_radius("nsources"),
            target_status,
            box_to_search_dist,
            refine_flags,
            found_panel_to_refine,
            *tree.sources),
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=wait_for)

    if debug:
        refine_flags.finish()
        # Marked panel = 1, 0 otherwise
        marked_panel_count = cl.array.sum(refine_flags).get()
        logger.debug("target association: {} panels flagged for refinement"
                .format(marked_panel_count))

    cl.wait_for_events([evt])

    return (found_panel_to_refine == 1).all().get()
def gpu_generate_next_flock(step, queue, intermediary_events, kernels,
                            gpu_params, buffers, flocks):
    """
    Does one iteration of the computation. To be called in the increasing
    continuous order of the integer "step" argument.

    Runs the predator AI kernels, then the boid AI kernels, and copies
    the per-step results for both back to the host before appending them
    to *flocks*.
    """
    # Prepare memory for the generated flock
    new_flock = {}
    new_flock["flock"] = agents.Flock(cfg.NumberOfBoids)
    new_flock["flock"].np_arrays = np.zeros(
        (cfg.NumberOfBoids, cfg.ARRDIM, cfg.Dimensions), dtype=np.float32)
    new_flock["predators"] = agents.Flock(cfg.NumberOfPredators)
    new_flock["predators"].np_arrays = np.zeros(
        (cfg.NumberOfPredators, cfg.ARRDIM, cfg.Dimensions),
        dtype=np.float32)

    events = {}

    # Transfer the iteration number
    iteration = np.uint16(step)
    intermediary_events.append(
        cl.enqueue_copy(queue, buffers["global_iteration"], iteration))

    # 7 groups of 64 items (400 boids)
    events["k_predator_ai_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_predator_ai_preprocess"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"])
             * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],),
        global_work_offset=None, wait_for=intermediary_events)

    # 1 group of 5 items (5 predators)
    events["k_predator_ai"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_predator_ai"],
        (cfg.NumberOfPredators,), (cfg.NumberOfPredators,),
        global_work_offset=None,
        wait_for=(events["k_predator_ai_preprocess"],))

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_predators"] = cl.enqueue_copy(
        queue, new_flock["predators"].np_arrays,
        buffers["generated_predators"],
        device_offset=step * cfg.NumberOfPredators * cfg.ARRDIM
        * cfg.Dimensions * np.dtype(np.float32).itemsize,
        wait_for=(events["k_predator_ai"],))

    # -------------------------------------------------------------------------
    # 625 groups of 256 items (400x400 boids)
    events["k_agent_ai_preprocess"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai_preprocess"],
        (int(np.ceil(np.square(cfg.NumberOfBoids)
                     / gpu_params["max_work_group_size"])
             * gpu_params["max_work_group_size"]),),
        (gpu_params["max_work_group_size"],),
        global_work_offset=None, wait_for=(events["k_predator_ai"],))

    # 7 groups of 64 items (400 boids)
    events["k_agent_ai"] = cl.enqueue_nd_range_kernel(
        queue, kernels["k_agent_ai"],
        (int(np.ceil(cfg.NumberOfBoids / gpu_params["preferred_multiple"])
             * gpu_params["preferred_multiple"]),),
        (gpu_params["preferred_multiple"],),
        global_work_offset=None,
        wait_for=(events["k_agent_ai_preprocess"],))

    # transfer device -> host -------------------------------------------------
    # Second parameter size defines transfer size
    events["transfer_flocks"] = cl.enqueue_copy(
        queue, new_flock["flock"].np_arrays, buffers["generated_flocks"],
        device_offset=step * cfg.NumberOfBoids * cfg.ARRDIM
        * cfg.Dimensions * np.dtype(np.float32).itemsize,
        wait_for=(events["k_agent_ai"],))

    # Block until both result transfers have landed on the host.
    cl.wait_for_events([events["transfer_predators"],
                        events["transfer_flocks"]])

    flocks.append(new_flock)
def finish(self):  # undoc
    """Wait on all recorded events, then empty the list in place."""
    pending = self.events
    cl.wait_for_events(pending)
    del pending[:]
# Out-of-order queue: independently enqueued commands may overlap; the
# explicit wait_for chain below reimposes ordering between the launches.
queue = cl.CommandQueue(
    context,
    dev,
    properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

source = '''
__kernel void hello_world(const int id) { printf(\"Hello world from id %d \\n", id); }
'''

program = cl.Program(context, source)
program.build()
kernel = cl.Kernel(program, "hello_world")

events = []
# NOTE(review): num_kernel, global_work_size, local_work_size, context and
# dev are defined outside this snippet.
for i in range(num_kernel):
    kernel.set_args(np.int32(i))
    if i == 0:
        event = cl.enqueue_nd_range_kernel(
            queue, kernel, [global_work_size], [local_work_size])
    else:
        # Serialize each launch behind the previous one despite the
        # out-of-order queue.
        event = cl.enqueue_nd_range_kernel(
            queue, kernel, [global_work_size], [local_work_size],
            wait_for=[events[-1]])
    events.append(event)

cl.wait_for_events(events)
def test_speed(rng):
    """Benchmark the ragged gather-GEMV plan and, when available, compare
    against pyopencl_blas gemv over one queue vs. many parallel queues.

    :param rng: numpy-style random generator supplying randint/uniform.
    """
    try:
        import pyopencl_blas
    except ImportError:
        pyopencl_blas = None

    # enable_out_of_order = (
    #     cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    # Number of (matrix, vector) problems.
    k = 300
    # k = 100
    # k = 32
    # k = 16
    ms = [rng.randint(100, 1000) for i in range(k)]
    ns = [rng.randint(100, 1000) for i in range(k)]
    # ms = [4096 for i in range(k)]
    # ns = [4096 for i in range(k)]
    aa = [rng.uniform(-1, 1, size=(m, n)).astype('float32')
          for m, n in zip(ms, ns)]
    xx = [rng.uniform(-1, 1, size=n).astype('float32') for n in ns]
    yy = [rng.uniform(-1, 1, size=m).astype('float32') for m in ms]
    ajs = [np.int32(i) for i in range(k)]
    xjs = [np.int32(i) for i in range(k)]
    # ajs = [rng.randint(k, size=p) for i in range(k)]
    # xjs = [rng.randint(k, size=p) for i in range(k)]

    # alpha = 0.5
    # beta = 0.1
    alpha = 1.0
    beta = 1.0

    # -- prepare initial conditions on device
    queue = cl.CommandQueue(ctx)
    # queue = cl.CommandQueue(ctx, properties=enable_out_of_order)
    clA = CLRA.from_arrays(queue, aa)
    clX = CLRA.from_arrays(queue, xx)
    clY = CLRA.from_arrays(queue, yy)
    A_js = RA(ajs, dtype=np.int32)
    X_js = RA(xjs, dtype=np.int32)

    # -- run cl computation
    prog = plan_ragged_gather_gemv(
        queue, alpha, clA, A_js, clX, X_js, beta, clY)
    plans = prog.choose_plans()

    print('')
    print('-' * 5 + ' Plans ' + '-' * 45)
    for plan in plans:
        print(plan)

    with Timer() as timer:
        for plan in plans:
            plan()
    print("nengo_ocl: %0.3f" % timer.duration)

    # -- speed test in ocl blas
    if pyopencl_blas:
        pyopencl_blas.setup()

        def array(a):
            # Device copy of the host array a.
            cla = cl.array.Array(queue, a.shape, a.dtype)
            cla.set(a)
            return cla

        clAs = [array(a) for a in aa]
        clXs = [array(x.ravel()) for x in xx]
        clYs = [array(y.ravel()) for y in yy]

        queues = [cl.CommandQueue(ctx) for _ in range(k)]
        # queues = [cl.CommandQueue(ctx, properties=enable_out_of_order)
        #           for _ in range(k)]

        queue.finish()
        with Timer() as timer:
            if 0:
                # use a single queue
                for A, X, Y in zip(clAs, clXs, clYs):
                    pyopencl_blas.gemv(queue, A, X, Y)
                queue.finish()
            else:
                # use multiple parallel queues
                events = []
                for i, [A, X, Y] in enumerate(zip(clAs, clXs, clYs)):
                    q = queues[i % len(queues)]
                    e = pyopencl_blas.gemv(q, A, X, Y)
                    events.append(e)
                for q in queues:
                    q.flush()
                cl.wait_for_events(events)
        print("clBLAS: %0.3f" % timer.duration)
# spMVOverlapping matrix_dot_vector_kernel_event0 = \ program.matrix_dot_vector(queues[0], (globalWorkSize,), (localWorkSize,), np.int64(halfSize), np.int64(nSmp), np.int64(0), indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf, wait_for=[eventM0]) eventM1 = cl.enqueue_copy(queues[1], matrix_buf, srcM[indptr[halfSize]:], is_blocking=False, device_offset=indptr[halfSize] * nSmp * 9 * 8) # spMVOverlapping matrix_dot_vector_kernel_event1 = \ program.matrix_dot_vector(queues[1], (globalWorkSize,), (localWorkSize,), np.int64(m), np.int64(nSmp), np.int64(halfSize), indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf, wait_for=[eventM1]) ## Step #11. Move the kernel's output data to host memory. matrix_dot_vector_copy_event0 = \ cl.enqueue_copy(queues[0], srcRes[:halfSize*3], destination_buf, is_blocking=False, wait_for=[matrix_dot_vector_kernel_event0]) matrix_dot_vector_copy_event1 = \ cl.enqueue_copy(queues[1], srcRes[halfSize*3:], destination_buf, is_blocking=False, wait_for=[matrix_dot_vector_kernel_event1], device_offset=halfSize*3*nSmp*8) cl.wait_for_events( [matrix_dot_vector_copy_event0, matrix_dot_vector_copy_event1]) end = timer() print('OK, \t\t\t time: {:10.5f} ms'.format( (end - start) / float(LOOP_COUNT) * 1000.0)) # print(srcRes) # print(srcM[2:])
def collide(cq, collider, *args):
    """Run the collision search on command queue *cq* and block until the
    resulting event completes."""
    evt = collider.get_collisions(cq, *args)
    cl.wait_for_events([evt])
def elapsed(self):
    """Return the time between the two recorded profiling events, in
    seconds.

    NOTE(review): both endpoints read ``profile.end``, i.e. this measures
    from the completion of the start event to the completion of the stop
    event -- confirm that ``start_event.profile.start`` was not intended.
    """
    # Both events must be complete before their profiling info is valid.
    cl.wait_for_events([self.start_event, self.stop_event])
    return (
        self.stop_event.profile.end
        - self.start_event.profile.end) / SECONDS_PER_NANOSECOND
def nn3d(pts, values,
         minima=np.zeros((3,), dtype=np.float32),
         maxima=np.ones((3,), dtype=np.float32),
         res=np.ones((3,), dtype=np.int32) * 32,
         ):
    """Natural-neighbor-style 3D gridding on the GPU (Python 2 code).

    Builds a KD tree over *pts* with *values* appended as a fourth
    component, flattens it into a device buffer, and for every grid cell
    finds the nearest data point, then splats that point's value into all
    cells within the nearest-neighbor radius via atomic float adds.

    :param pts: point coordinates; stacked with *values* into 4-float
        records.
    :param values: per-point scalar values.
    :param minima, maxima: grid bounds per axis.
    :param res: grid resolution per axis.
    :returns: tuple ``(accum, cnt)`` of value-sum and hit-count grids;
        the caller presumably divides accum/cnt to get averages.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    pts2 = np.hstack((pts, np.reshape(values, (len(values), 1))))
    print 'pts2:', pts2
    node = buildKDTree(pts2)
    flat = flattenKDTree(node)

    flat_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = flat)
    minima_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = minima)
    maxima_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = maxima)
    res_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = res)
    out = np.zeros(res, dtype=np.float32)
    out_g = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf = out)
    # values_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = values.astype(np.float32))
    accum = np.zeros(res, dtype=np.float32)
    cnt = np.zeros(res, dtype=np.float32)
    accum_g = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=accum)
    cnt_g = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=cnt)

    # OpenCL C source: CAS-loop float atomics, a brute-force leaf search
    # (closestPoint2), a recursive KD-tree descent (closestPoint), and the
    # per-cell splatting kernel (nn3d).
    prg = cl.Program(ctx, """
void atomic_add_global(volatile __global float *source, const float operand) {
    // *source += operand;
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;
    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}

__global const float* closestPoint2(__global const float *flat_g, int cnt, float *pt) {
    float minDistSq = 1.0 / 0.0;
    int minIdx = -1;
    __global const float *p = flat_g;
    for (int i = 0; i < cnt; i++) {
        float x = *p++;
        float y = *p++;
        float z = *p++;
        float val = *p++; // skip
        float distSq = pow(x - pt[0], 2) + pow(y - pt[1], 2) + pow(z - pt[2], 2);
        if (distSq < minDistSq) {
            minDistSq = distSq;
            minIdx = i;
        }
    }
    return flat_g + minIdx * 4;
}

__global const float* closestPoint(__global const float *flat_g, float *pt) {
    __global const float *p = flat_g;
    int ax = (int) p[0];
    float split = p[1];
    int n = (int) p[2];
    int childCnt = (int) p[3];
    int nL = (int) p[4];
    int nR = (int) p[5];
    __global const float *current_best = 0;
    __global const float *new_best = 0;
    float best_dist;
    float dist;
    int crossBorder = 0;
    p += 6;
    for (;;) {
        if (pt[ax] <= split || crossBorder == -1) {
            // left
            if (nL >= 0) {
                new_best = closestPoint2(p, nL, pt);
            } else {
                new_best = closestPoint(p, pt);
            }
            dist = sqrt(pow(new_best[0] - pt[0], 2) + pow(new_best[1] - pt[1], 2) + pow(new_best[2] - pt[2], 2));
            if (current_best == 0 || dist < best_dist) {
                current_best = new_best;
                best_dist = dist;
            }
            if (dist + pt[ax] > split && crossBorder == 0) {
                crossBorder = 1;
            } else {
                return current_best;
            }
        }
        if (pt[ax] > split || crossBorder == 1) {
            // right
            if (nL >= 0) {
                p += nL * 4;
            } else {
                nL = (int) p[2];
                int childCntL = (int) p[3];
                p += childCntL * 6 + nL * 4;
            }
            if (nR >= 0) {
                new_best = closestPoint2(p, nR, pt);
            } else {
                new_best = closestPoint(p, pt);
            }
            dist = sqrt(pow(new_best[0] - pt[0], 2) + pow(new_best[1] - pt[1], 2) + pow(new_best[2] - pt[2], 2));
            if (current_best == 0 || dist < best_dist) {
                current_best = new_best;
                best_dist = dist;
            }
            if (pt[ax] - dist <= split && crossBorder == 0) {
                crossBorder = -1;
                p = flat_g + 6;
            } else {
                return current_best;
            }
        }
    }
}

__kernel void nn3d(__global const float *flat_g, __global const float *minima_g, __global const float *maxima_g, __global const int *res, __global float *out_g, __global float *accum_g, __global float *cnt_g, int ofs_x, int ofs_y) {
    float span[3] = {maxima_g[0] - minima_g[0], maxima_g[1] - minima_g[1], maxima_g[2] - minima_g[2]};
    int x = ofs_x + get_global_id(0);
    int y = ofs_y + get_global_id(1);
    int z = get_global_id(2);
    int ofs = (z * res[1] + y) * res[0] + x;
    // *(out_g + ofs) = (float)(x + y + z);
    // return;
    float pt[3];
    pt[0] = minima_g[0] + (maxima_g[0] - minima_g[0]) * x / ((float) res[0] - 1.0f);
    pt[1] = minima_g[1] + (maxima_g[1] - minima_g[1]) * y / ((float) res[1] - 1.0f);
    pt[2] = minima_g[2] + (maxima_g[2] - minima_g[2]) * z / ((float) res[2] - 1.0f);
    __global const float *closest = closestPoint(flat_g, pt);
    // *(out_g + ofs) = closest[3];
    // return;
    float radiusSq = pow(closest[0] - pt[0], 2) + pow(closest[1] - pt[1], 2) + pow(closest[2] - pt[2], 2);
    float radius = sqrt(radiusSq);
    float val = closest[3];
    int radius_x = ceil(radius * res[0] / (maxima_g[0] - minima_g[0]));
    int radius_y = ceil(radius * res[1] / (maxima_g[1] - minima_g[1]));
    int radius_z = ceil(radius * res[2] / (maxima_g[2] - minima_g[2]));
    for (int x2 = max(0, x - radius_x); x2 <= min(res[0] - 1, x + radius_x); x2++) {
        float x3 = minima_g[0] + span[0] * x2 / ((float) res[0] - 1.0f);
        float dxSq = pow(x3 - pt[0], 2);
        for (int y2 = max(0, y - radius_y); y2 <= min(res[1] - 1, y + radius_y); y2++) {
            float y3 = minima_g[1] + span[1] * y2 / ((float) res[1] - 1.0f);
            float dySq = pow(y3 - pt[1], 2);
            for (int z2 = max(0, z - radius_z); z2 <= min(res[2] - 1, z + radius_z); z2++) {
                float z3 = minima_g[2] + span[2] * z2 / ((float) res[2] - 1.0f);
                float dzSq = pow(z3 - pt[2], 2);
                float distSq = dxSq + dySq + dzSq;
                if (distSq <= radiusSq) {
                    int ofs = (z2 * res[1] + y2) * res[0] + x2;
                    atomic_add_global(accum_g + ofs, val);
                    atomic_add_global(cnt_g + ofs, 1.0f);
                }
            }
        }
    }
}
""").build()

    # Launch one z-column at a time per (x, y); each launch is waited on
    # before the next is enqueued.
    for x in xrange(res[0]):
        for y in xrange(res[1]):
            print 'x:', x, 'y:', y
            ev = prg.nn3d(queue, [1, 1, out.shape[2]], None, flat_g,
                          minima_g, maxima_g, res_g, out_g, accum_g, cnt_g,
                          np.int32(x), np.int32(y))
            print 'ev:', ev
            # cl.enqueue_barrier(queue, wait_for=[ev])
            # ev.wait()
            cl.wait_for_events([ev])

    cl.enqueue_copy(queue, accum, accum_g)
    cl.enqueue_copy(queue, cnt, cnt_g)
    return (accum, cnt)
    # return out
def radix_sort(cq, sorter, *args):
    """Launch the radix sorter on command queue *cq* and block until the
    resulting event completes."""
    evt = sorter.sort(cq, *args)
    cl.wait_for_events([evt])