def __call__(self, input_ary, output_ary=None, allocator=None, stream=None):
    """Scan *input_ary* on the device and return the array holding the result.

    :arg input_ary: array to scan. Its ``shape`` is unpacked into a single
        length, so it has to be one-dimensional; the method also reads its
        ``allocator``, ``flags.forc`` and ``gpudata`` attributes.
    :arg output_ary: where the result goes. ``None`` reuses *input_ary*
        (the result overwrites the input), the string ``"new"`` allocates
        an array shaped like *input_ary*, and any other value is used
        as-is and must have the same shape as *input_ary*.
    :arg allocator: callable invoked with a size in bytes to obtain the
        temporary buffers; falls back to ``input_ary.allocator``.
    :arg stream: forwarded unchanged to every kernel launch.
    :returns: the output array.
    :raises ValueError: if input and output shapes differ.
    :raises RuntimeError: if ``input_ary.flags.forc`` is false.
    """
    if not allocator:
        allocator = input_ary.allocator

    if output_ary is None:
        # No destination supplied: write the result over the input.
        output_ary = input_ary

    if isinstance(output_ary, (str, unicode)) and output_ary == "new":
        # NOTE(review): the rest of this method goes through the CUDA
        # ``driver`` API -- confirm ``cl_array`` is the intended module.
        output_ary = cl_array.empty_like(input_ary, allocator=allocator)

    if input_ary.shape != output_ary.shape:
        raise ValueError("input and output must have the same shape")

    if not input_ary.flags.forc:
        raise RuntimeError(
            "ScanKernel cannot deal with non-contiguous arrays")

    (length,) = input_ary.shape
    if length == 0:
        # Nothing to scan, so no kernel is launched.
        return output_ary

    # Number of elements a single work group handles per interval unit.
    elements_per_unit = self.scan_wg_size * self.scan_wg_seq_batches

    device = driver.Context.get_device()
    multiprocessor_count = device.get_attribute(
        driver.device_attribute.MULTIPROCESSOR_COUNT)
    group_limit = 3 * multiprocessor_count

    from pytools import uniform_interval_splitting
    interval_size, num_groups = uniform_interval_splitting(
        length, elements_per_unit, group_limit)

    itemsize = self.dtype.itemsize
    # One entry per group for the first pass's per-block results.
    group_totals = allocator(itemsize * num_groups)
    # Single-entry buffer handed to the second pass in the argument slot
    # where the first pass receives ``group_totals``.
    scratch = allocator(itemsize)

    scan_block = (self.scan_wg_size, 1, 1)

    # Pass 1: scan each interval separately (one interval per block).
    self.scan_intervals_knl.prepared_async_call(
        (num_groups, 1), scan_block, stream,
        input_ary.gpudata, length, interval_size,
        output_ary.gpudata, group_totals)

    # Pass 2: inclusive scan, in place, over the per-block results.
    self.scan_intervals_knl.prepared_async_call(
        (1, 1), scan_block, stream,
        group_totals, num_groups, interval_size,
        group_totals, scratch)

    # Pass 3: update every interval with the outcome of pass 2.
    self.final_update_knl.prepared_async_call(
        (num_groups, 1), (self.update_wg_size, 1, 1), stream,
        output_ary.gpudata, length, interval_size, group_totals)

    return output_ary
def __call__(self, input_ary, output_ary=None, allocator=None, stream=None):
    """Run the three-launch scan over *input_ary* and return the output.

    :arg input_ary: array to scan; must have exactly one axis (its
        ``shape`` is unpacked into one value). ``allocator``,
        ``flags.forc`` and ``gpudata`` are read from it.
    :arg output_ary: ``None`` to overwrite *input_ary* with the result,
        the string ``"new"`` to allocate an array like *input_ary*, or an
        existing array whose shape equals that of *input_ary*.
    :arg allocator: callable taking a byte count, used for the two
        temporary buffers; defaults to ``input_ary.allocator``.
    :arg stream: passed through to each ``prepared_async_call``.
    :returns: the array that received the result.
    :raises ValueError: when the two shapes do not match.
    :raises RuntimeError: when ``input_ary.flags.forc`` is false.
    """
    allocator = allocator or input_ary.allocator

    # Resolve the destination: in place by default, or freshly allocated
    # when the caller asked for "new".
    if output_ary is None:
        output_ary = input_ary
    wants_new = (
        isinstance(output_ary, (str, unicode)) and output_ary == "new")
    if wants_new:
        # NOTE(review): this method otherwise uses the CUDA ``driver``
        # API -- confirm ``cl_array`` is the module meant here.
        output_ary = cl_array.empty_like(input_ary, allocator=allocator)

    # Validate before touching the device.
    if input_ary.shape != output_ary.shape:
        raise ValueError("input and output must have the same shape")
    if not input_ary.flags.forc:
        raise RuntimeError(
            "ScanKernel cannot deal with non-contiguous arrays")

    (count,) = input_ary.shape
    if count == 0:
        # An empty input needs no kernel launches.
        return output_ary

    wg_size = self.scan_wg_size
    unit = wg_size * self.scan_wg_seq_batches

    # Cap the number of groups at three per multiprocessor.
    current_device = driver.Context.get_device()
    cap = 3 * current_device.get_attribute(
        driver.device_attribute.MULTIPROCESSOR_COUNT)

    from pytools import uniform_interval_splitting
    interval_size, num_groups = uniform_interval_splitting(count, unit, cap)

    # Per-block results (one slot per group) plus a one-slot buffer that
    # the second launch receives in place of a per-block result array.
    per_block = allocator(self.dtype.itemsize * num_groups)
    throwaway = allocator(self.dtype.itemsize)

    scan = self.scan_intervals_knl.prepared_async_call
    finish = self.final_update_knl.prepared_async_call

    # Launch 1: scan of each interval, one interval per block.
    scan((num_groups, 1), (wg_size, 1, 1), stream,
         input_ary.gpudata, count, interval_size,
         output_ary.gpudata, per_block)

    # Launch 2: inclusive scan of the per-block results, done in place.
    scan((1, 1), (wg_size, 1, 1), stream,
         per_block, num_groups, interval_size,
         per_block, throwaway)

    # Launch 3: update the intervals with the result of launch 2.
    finish((num_groups, 1), (self.update_wg_size, 1, 1), stream,
           output_ary.gpudata, count, interval_size, per_block)

    return output_ary
def __call__(self, input_ary, output_ary=None, allocator=None, queue=None):
    """Scan *input_ary* on the device and return the array holding the result.

    :arg input_ary: array to scan. Its ``shape`` is unpacked into a single
        length, so it has to be one-dimensional; the method also reads its
        ``allocator``, ``queue``, ``flags.forc`` and ``data`` attributes.
    :arg output_ary: where the result goes. ``None`` reuses *input_ary*
        (the result overwrites the input), the string ``"new"`` allocates
        an array shaped like *input_ary*, and any other value is used
        as-is and must have the same shape as *input_ary*.
    :arg allocator: callable invoked with a size in bytes to obtain the
        temporary buffers; falls back to ``input_ary.allocator``.
    :arg queue: queue the kernels are enqueued on. When omitted, the
        queue of *input_ary* is used, then that of the output array.
    :returns: the output array.
    :raises ValueError: if input and output shapes differ.
    :raises RuntimeError: if ``input_ary.flags.forc`` is false.
    """
    allocator = allocator or input_ary.allocator

    if output_ary is None:
        # No destination supplied: write the result over the input.
        output_ary = input_ary

    if isinstance(output_ary, (str, unicode)) and output_ary == "new":
        output_ary = cl_array.empty_like(input_ary, allocator=allocator)

    # Choose the queue only once output_ary is a real array. Doing it
    # earlier meant looking up ``.queue`` on the ``None`` / ``"new"``
    # placeholders, which raised AttributeError whenever neither *queue*
    # nor ``input_ary.queue`` was set.
    queue = queue or input_ary.queue or output_ary.queue

    if input_ary.shape != output_ary.shape:
        raise ValueError("input and output must have the same shape")

    if not input_ary.flags.forc:
        raise RuntimeError(
            "ScanKernel cannot deal with non-contiguous arrays")

    n, = input_ary.shape

    if not n:
        # Nothing to scan, so no kernel is enqueued.
        return output_ary

    # Number of elements a single work group handles per interval unit.
    unit_size = self.scan_wg_size * self.scan_wg_seq_batches
    # Allow up to three groups per compute unit of the largest device.
    max_groups = 3 * max(dev.max_compute_units for dev in self.devices)

    from pytools import uniform_interval_splitting
    interval_size, num_groups = uniform_interval_splitting(
        n, unit_size, max_groups)

    # One entry per group for the first pass's per-block results.
    block_results = allocator(self.dtype.itemsize * num_groups)
    # Single-entry buffer handed to the second pass in the argument slot
    # where the first pass receives ``block_results``.
    dummy_results = allocator(self.dtype.itemsize)

    # first level scan of interval (one interval per block)
    self.scan_intervals_knl(
        queue, (num_groups * self.scan_wg_size,), (self.scan_wg_size,),
        input_ary.data, n, interval_size,
        output_ary.data, block_results)

    # second level inclusive scan of per-block results
    self.scan_intervals_knl(
        queue, (self.scan_wg_size,), (self.scan_wg_size,),
        block_results, num_groups, interval_size,
        block_results, dummy_results)

    # update intervals with result of second level scan
    self.final_update_knl(
        queue, (num_groups * self.update_wg_size,), (self.update_wg_size,),
        output_ary.data, n, interval_size, block_results)

    return output_ary
def __call__(self, input_ary, output_ary=None, allocator=None, queue=None):
    """Run the three-launch scan over *input_ary* and return the output.

    :arg input_ary: array to scan; must have exactly one axis (its
        ``shape`` is unpacked into one value). ``allocator``, ``queue``,
        ``flags.forc`` and ``data`` are read from it.
    :arg output_ary: ``None`` to overwrite *input_ary* with the result,
        the string ``"new"`` to allocate an array like *input_ary*, or an
        existing array whose shape equals that of *input_ary*.
    :arg allocator: callable taking a byte count, used for the two
        temporary buffers; defaults to ``input_ary.allocator``.
    :arg queue: queue passed to each kernel call. If not given, falls
        back to ``input_ary.queue`` and then to the output array's queue.
    :returns: the array that received the result.
    :raises ValueError: when the two shapes do not match.
    :raises RuntimeError: when ``input_ary.flags.forc`` is false.
    """
    allocator = allocator or input_ary.allocator

    # Resolve the destination first: in place by default, or freshly
    # allocated when the caller asked for "new".
    if output_ary is None:
        output_ary = input_ary
    if isinstance(output_ary, (str, unicode)) and output_ary == "new":
        output_ary = cl_array.empty_like(input_ary, allocator=allocator)

    # The queue fallback has to come after the destination is resolved:
    # evaluated earlier, ``output_ary.queue`` was read from ``None`` or
    # the string "new" and raised AttributeError when neither *queue* nor
    # ``input_ary.queue`` was available.
    queue = queue or input_ary.queue or output_ary.queue

    # Validate before enqueuing anything.
    if input_ary.shape != output_ary.shape:
        raise ValueError("input and output must have the same shape")
    if not input_ary.flags.forc:
        raise RuntimeError(
            "ScanKernel cannot deal with non-contiguous arrays")

    n, = input_ary.shape
    if not n:
        # An empty input needs no kernel launches.
        return output_ary

    scan_wg_size = self.scan_wg_size
    update_wg_size = self.update_wg_size

    unit_size = scan_wg_size * self.scan_wg_seq_batches
    # Cap the group count at three per compute unit of the largest device.
    max_groups = 3 * max(dev.max_compute_units for dev in self.devices)

    from pytools import uniform_interval_splitting
    interval_size, num_groups = uniform_interval_splitting(
        n, unit_size, max_groups)

    # Per-block results (one slot per group) plus a one-slot buffer that
    # the second launch receives in place of a per-block result array.
    block_results = allocator(self.dtype.itemsize * num_groups)
    dummy_results = allocator(self.dtype.itemsize)

    # first level scan of interval (one interval per block)
    self.scan_intervals_knl(
        queue, (num_groups * scan_wg_size,), (scan_wg_size,),
        input_ary.data, n, interval_size,
        output_ary.data, block_results)

    # second level inclusive scan of per-block results
    self.scan_intervals_knl(
        queue, (scan_wg_size,), (scan_wg_size,),
        block_results, num_groups, interval_size,
        block_results, dummy_results)

    # update intervals with result of second level scan
    self.final_update_knl(
        queue, (num_groups * update_wg_size,), (update_wg_size,),
        output_ary.data, n, interval_size, block_results)

    return output_ary