def get_kernels(self, key_dtype, value_dtype, starts_dtype):
    """Build the device kernels used to group values by key.

    :arg key_dtype: element dtype of the key array.
    :arg value_dtype: element dtype of the value array.
    :arg starts_dtype: element dtype of the per-key start-index array.
    :returns: a ``_KernelInfo`` holding

        * ``by_target_sorter``: a radix sort that reorders ``values`` and
          ``keys`` together, using ``keys[i]`` as the sort key,
        * ``start_finder``: an elementwise kernel that, for every position
          whose key differs from its predecessor (or that is position 0),
          stores that position in ``key_group_starts[key]``,
        * ``bound_propagation_scan``: a ``min`` scan over ``starts`` that
          reads and writes the array back-to-front (``starts[nkeys-i]``).
    """
    from pyopencl.algorithm import RadixSort
    from pyopencl.tools import VectorArg, ScalarArg

    by_target_sorter = RadixSort(
        self.context, [
            VectorArg(value_dtype, "values"),
            VectorArg(key_dtype, "keys"),
        ],
        key_expr="keys[i]",
        sort_arg_names=["values", "keys"])

    from pyopencl.elementwise import ElementwiseTemplate
    start_finder = ElementwiseTemplate(
        arguments="""//CL//
        starts_t *key_group_starts,
        key_t *keys_sorted_by_key,
        """,

        operation=r"""//CL//
        key_t my_key = keys_sorted_by_key[i];

        if (i == 0 || my_key != keys_sorted_by_key[i-1])
            key_group_starts[my_key] = i;
        """,
        name="find_starts").build(
            self.context,
            type_aliases=(
                # keys_sorted_by_key is the "keys" output of the sorter
                # above, so its element type is key_dtype. Aliasing key_t
                # to starts_dtype would misread the buffer whenever the
                # two dtypes differ.
                ("key_t", key_dtype),
                ("starts_t", starts_dtype),
            ),
            var_values=())

    from pyopencl.scan import GenericScanKernel
    bound_propagation_scan = GenericScanKernel(
        self.context, starts_dtype,
        arguments=[
            VectorArg(starts_dtype, "starts"),
            # starts has length n+1
            ScalarArg(key_dtype, "nkeys"),
        ],
        input_expr="starts[nkeys-i]",
        scan_expr="min(a, b)",
        # Largest representable starts_dtype value: the identity for min().
        neutral=_make_cl_int_literal(
            np.iinfo(starts_dtype).max, starts_dtype),
        output_statement="starts[nkeys-i] = item;")

    return _KernelInfo(
        by_target_sorter=by_target_sorter,
        start_finder=start_finder,
        bound_propagation_scan=bound_propagation_scan)
def test_sort(ctx_factory, scan_kernel):
    """Check ``RadixSort`` against ``np.sort`` and print throughput figures.

    :arg ctx_factory: callable returning the OpenCL context to test on.
    :arg scan_kernel: scan kernel implementation handed to ``RadixSort``.
        When this is ``GenericDebugScanKernel`` (as a class, a subclass, or
        an instance), problem sizes of 2000 and above are skipped.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
                     sort_arg_names=["ary"], scan_kernel=scan_kernel)

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=15)

    from time import time

    # RadixSort receives the scan kernel *class*, so a plain isinstance()
    # check against GenericDebugScanKernel is never true for it. Accept
    # both a class and an instance so the size limit below takes effect.
    uses_debug_scan = isinstance(scan_kernel, GenericDebugScanKernel) or (
        isinstance(scan_kernel, type)
        and issubclass(scan_kernel, GenericDebugScanKernel))

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        if n >= 2000 and uses_debug_scan:
            continue

        print(n)

        print(" rng")
        a_dev = rng.uniform(queue, (n, ), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        dev_start = time()
        print(" device")
        (a_dev_sorted, ), _ = sort(a_dev, key_bits=16)
        # Wait for the device so dev_end covers the whole sort.
        queue.finish()
        dev_end = time()

        print(" numpy")
        a_sorted = np.sort(a)
        numpy_end = time()

        numpy_elapsed = numpy_end - dev_end
        dev_elapsed = dev_end - dev_start
        print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx"
              % (1e-6 * n / dev_elapsed,
                 1e-6 * n / numpy_elapsed,
                 numpy_elapsed / dev_elapsed))

        assert (a_dev_sorted.get() == a_sorted).all()
def test_sort(ctx_factory):
    """Sort random 16-bit keys on the device and compare with ``np.sort``.

    For every size in ``scan_test_counts`` the device and host sorts are
    timed, a throughput line is printed, and the results must match
    element for element.
    """
    from time import time
    from pyopencl.algorithm import RadixSort
    from pyopencl.clrandom import RanluxGenerator

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dtype = np.int32

    sort = RadixSort(
        context, "int *ary",
        key_expr="ary[i]",
        sort_arg_names=["ary"])
    rng = RanluxGenerator(queue, seed=15)

    for n in scan_test_counts:
        print(n)

        print(" rng")
        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
        host_input = a_dev.get()

        t_begin = time()
        print(" device")
        (a_dev_sorted,) = sort(a_dev, key_bits=16)
        # Block until the device is done so the timestamp is meaningful.
        queue.finish()
        t_device_done = time()

        print(" numpy")
        expected = np.sort(host_input)
        t_numpy_done = time()

        numpy_elapsed = t_numpy_done - t_device_done
        dev_elapsed = t_device_done - t_begin

        dev_rate = 1e-6 * n / dev_elapsed
        numpy_rate = 1e-6 * n / numpy_elapsed
        ratio = numpy_elapsed / dev_elapsed
        print(" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx"
              % (dev_rate, numpy_rate, ratio))

        assert (a_dev_sorted.get() == expected).all()
@timeit_repeat(reps)
def test_numpy_speed(buff):
    """Sort *buff* on the host with NumPy.

    The ``timeit_repeat`` decorator presumably does the timing: callers
    below read ``.microseconds`` from the decorated call's return value.
    """
    np.sort(buff)


from collections import defaultdict

# Algorithm name -> list of timings in seconds, one entry per size in ``xs``.
times = defaultdict(list)

dtype = np.int32
# Problem sizes are 2**size for each exponent in this range.
xs = range(5, 32)
bs = BitonicSort(ctx)
rs = RadixSort(ctx, "int *ary", key_expr="ary[i]",
               sort_arg_names=["ary"], scan_kernel=GenericScanKernel)

for size in xs:
    print("running size=2^{} = {}".format(size, 2**size))
    s = clrandom.rand(queue, (2**size, ), dtype, a=0, b=2**16)
    # NOTE(review): if the timers return datetime.timedelta, `.microseconds`
    # drops whole seconds and `.total_seconds()` would be needed -- confirm
    # against timeit_repeat.
    times['bitonic'].append(test_bitonic_speed(s, bs).microseconds / 1000000.)
    times['radix'].append(test_radix_speed(s, rs).microseconds / 1000000.)
    times['numpy'].append(
        test_numpy_speed(s.get().copy()).microseconds / 1000000.)

# dict.keys() is a view on Python 3 and cannot be concatenated to a list;
# materialise the column order once and reuse it for header and rows.
algorithms = list(times)

print("\t".join(["Size"] + algorithms))
for idx, s in enumerate(xs):
    print("\t".join(["2^" + str(s)]
                    + [str(times[k][idx]) for k in algorithms]))
def initialize(cls):
    """Build the OpenCL program and the two radix-sort kernels on ``cls``.

    Sets ``cls.program``, ``cls.longitudinal_sort_kernel`` (keyed on
    ``floor(theta * inv_slice_len)``) and
    ``cls.longitudinal_traverse_sort_kernel`` (keyed on ``sort_fun``).
    """
    cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

    # Arrays that both sorters permute together.
    particle_fields = ["x", "px", "y", "py", "theta", "gamma"]

    def particle_args():
        # Fresh argument list per sorter, one VectorArg per field.
        return [VectorArg(cl_ftype, field) for field in particle_fields]

    cls.longitudinal_sort_kernel = RadixSort(
        cl_ctx,
        particle_args() + [ScalarArg(cl_ftype, "inv_slice_len")],
        key_expr="(int) floor(theta[i]*inv_slice_len)",
        sort_arg_names=list(particle_fields),
        key_dtype=np.int32)

    class LongitudinalTraverseScanKernel(GenericScanKernel):
        """Scan kernel whose preamble also defines ``sort_fun``."""

        def __init__(self, *argl, **argd):
            """Prepend ``sort_fun`` to ``argd['preamble']`` and delegate."""
            # NOTE(review): xbin/ybin are floor(norm * inv_traverse_len),
            # not floor(norm * bins) -- confirm this is the intended binning.
            sort_fun = '''
            int sort_fun(FLOAT_TYPE x, FLOAT_TYPE y, FLOAT_TYPE theta,
                         FLOAT_TYPE inv_slice_len, FLOAT_TYPE inv_traverse_len,
                         int bins)
            {
                FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                int xbin = (int) floor(xnorm * inv_traverse_len);
                int ybin = (int) floor(ynorm * inv_traverse_len);
                int zbin = (int) floor(theta*inv_slice_len);
                if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins))
                {
                    xbin = 0;
                    ybin = 0;
                }
                return xbin+bins*(ybin+bins*zbin);
            }
            '''
            # Work on a copy so the caller's keyword dict is left untouched.
            patched = {**argd, 'preamble': F(sort_fun + argd['preamble'])}
            super().__init__(*argl, **patched)

    traverse_args = particle_args() + [
        ScalarArg(cl_ftype, "inv_slice_len"),
        ScalarArg(cl_ftype, "inv_traverse_len"),
        ScalarArg(np.int32, "bins"),
    ]
    cls.longitudinal_traverse_sort_kernel = RadixSort(
        cl_ctx,
        traverse_args,
        key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
        sort_arg_names=list(particle_fields),
        scan_kernel=LongitudinalTraverseScanKernel,
        key_dtype=np.int32)