コード例 #1
0
    def get_kernels(self, key_dtype, value_dtype, starts_dtype):
        """Build the device kernels used by this object.

        Three kernels are constructed on ``self.context``:

        * ``by_target_sorter``: a radix sort that permutes the ``values``
          and ``keys`` arrays together, using ``keys[i]`` as the sort key.
        * ``start_finder``: an elementwise kernel that, for every index
          ``i`` at which the sorted key differs from its predecessor (or
          ``i == 0``), stores ``i`` into ``key_group_starts[key]``.
        * ``bound_propagation_scan``: a ``min`` scan that reads and writes
          ``starts`` in reverse order (index ``nkeys - i``).

        :arg key_dtype: dtype of the ``keys`` array and of the ``nkeys``
            scalar.
        :arg value_dtype: dtype of the ``values`` array.
        :arg starts_dtype: dtype of the ``starts`` array.
        :returns: a ``_KernelInfo`` holding the three kernels.
        """
        from pyopencl.algorithm import RadixSort
        from pyopencl.tools import VectorArg, ScalarArg

        by_target_sorter = RadixSort(
                self.context, [
                    VectorArg(value_dtype, "values"),
                    VectorArg(key_dtype, "keys"),
                    ],
                key_expr="keys[i]",
                sort_arg_names=["values", "keys"])

        from pyopencl.elementwise import ElementwiseTemplate
        start_finder = ElementwiseTemplate(
                arguments="""//CL//
                starts_t *key_group_starts,
                key_t *keys_sorted_by_key,
                """,

                operation=r"""//CL//
                key_t my_key = keys_sorted_by_key[i];

                if (i == 0 || my_key != keys_sorted_by_key[i-1])
                    key_group_starts[my_key] = i;
                """,
                name="find_starts").build(self.context,
                        type_aliases=(
                            # The sorted keys come out of the radix sort
                            # above, which declares them with key_dtype, so
                            # key_t has to be key_dtype as well. Aliasing it
                            # to starts_dtype misreads the key buffer
                            # whenever the two dtypes differ.
                            ("key_t", key_dtype),
                            ("starts_t", starts_dtype),
                            ),
                        var_values=())

        from pyopencl.scan import GenericScanKernel
        bound_propagation_scan = GenericScanKernel(
                self.context, starts_dtype,
                arguments=[
                    VectorArg(starts_dtype, "starts"),
                    # starts has length n+1
                    ScalarArg(key_dtype, "nkeys"),
                    ],
                # Reading and writing at nkeys-i makes the scan run over
                # starts from its last entry towards its first.
                input_expr="starts[nkeys-i]",
                scan_expr="min(a, b)",
                # The largest representable value is the identity for min.
                neutral=_make_cl_int_literal(
                    np.iinfo(starts_dtype).max, starts_dtype),
                output_statement="starts[nkeys-i] = item;")

        return _KernelInfo(
                by_target_sorter=by_target_sorter,
                start_finder=start_finder,
                bound_propagation_scan=bound_propagation_scan)
コード例 #2
0
def test_sort(ctx_factory, scan_kernel):
    """Check ``RadixSort`` against ``np.sort`` and print throughput figures.

    For each problem size, random 16-bit keys are generated on the device,
    sorted there, sorted again on the host with numpy, and the two results
    are compared element by element.

    :arg ctx_factory: zero-argument callable returning the OpenCL context.
    :arg scan_kernel: scan kernel class forwarded to ``RadixSort``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(
        context,
        "int *ary",
        key_expr="ary[i]",
        sort_arg_names=["ary"],
        scan_kernel=scan_kernel,
    )

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=15)

    from time import time

    uses_debug_scan = isinstance(scan_kernel, GenericDebugScanKernel)

    # intermediate arrays for largest size cause out-of-memory on low-end GPUs
    for n in scan_test_counts[:-1]:
        if uses_debug_scan and n >= 2000:
            continue

        print(n)

        print("  rng")
        a_dev = rng.uniform(queue, (n, ), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        t_begin = time()
        print("  device")
        sorted_arrays, evt = sort(a_dev, key_bits=16)
        (a_dev_sorted, ) = sorted_arrays
        queue.finish()
        t_device_done = time()
        print("  numpy")
        a_sorted = np.sort(a)
        t_numpy_done = time()

        numpy_elapsed = t_numpy_done - t_device_done
        dev_elapsed = t_device_done - t_begin

        dev_rate = 1e-6 * n / dev_elapsed
        numpy_rate = 1e-6 * n / numpy_elapsed
        speedup = numpy_elapsed / dev_elapsed
        print("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" %
              (dev_rate, numpy_rate, speedup))

        matches = a_dev_sorted.get() == a_sorted
        assert matches.all()
コード例 #3
0
ファイル: test_array.py プロジェクト: stephenbalaban/pyopencl
def test_sort(ctx_factory):
    """Check ``RadixSort`` against ``np.sort`` and print throughput figures.

    For every size in ``scan_test_counts``, random 16-bit keys are generated
    on the device, sorted there, sorted again on the host with numpy, and
    the two results are compared element by element.

    :arg ctx_factory: zero-argument callable returning the OpenCL context.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    dtype = np.int32

    from pyopencl.algorithm import RadixSort
    sort = RadixSort(
        context,
        "int *ary",
        key_expr="ary[i]",
        sort_arg_names=["ary"],
    )

    from pyopencl.clrandom import RanluxGenerator
    rng = RanluxGenerator(queue, seed=15)

    from time import time

    for n in scan_test_counts:
        print(n)

        print("  rng")
        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
        a = a_dev.get()

        t_begin = time()
        print("  device")
        # The sorter is expected to hand back exactly one array here.
        (a_dev_sorted,) = sort(a_dev, key_bits=16)
        queue.finish()
        t_device_done = time()
        print("  numpy")
        a_sorted = np.sort(a)
        t_numpy_done = time()

        numpy_elapsed = t_numpy_done-t_device_done
        dev_elapsed = t_device_done-t_begin

        dev_rate = 1e-6*n/dev_elapsed
        numpy_rate = 1e-6*n/numpy_elapsed
        speedup = numpy_elapsed/dev_elapsed
        print("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
                dev_rate, numpy_rate, speedup))

        matches = a_dev_sorted.get() == a_sorted
        assert matches.all()
コード例 #4
0
ファイル: sort_test.py プロジェクト: JonnoFTW/htm-cl

@timeit_repeat(reps)
def test_numpy_speed(buff):
    """Sort *buff* with ``np.sort`` and discard the sorted copy.

    The timing itself is presumably performed by the ``timeit_repeat``
    decorator, which is defined outside this part of the file -- TODO
    confirm.
    """
    np.sort(buff)


from collections import defaultdict

# Benchmark driver: time bitonic sort, radix sort and numpy's sort on random
# int32 arrays of 2^5 .. 2^31 elements and print the results as a
# tab-separated table (one row per size, one column per algorithm).
times = defaultdict(list)
dtype = np.int32
xs = range(5, 32)
bs = BitonicSort(ctx)
rs = RadixSort(ctx,
               "int *ary",
               key_expr="ary[i]",
               sort_arg_names=["ary"],
               scan_kernel=GenericScanKernel)
for size in xs:
    print("running size=2^{} = {}".format(size, 2**size))
    s = clrandom.rand(queue, (2**size, ), dtype, a=0, b=2**16)
    # NOTE(review): if the timing helpers return datetime.timedelta,
    # `.microseconds` is only the sub-second component, so any run of one
    # second or longer is under-reported; `.total_seconds()` would be the
    # fix. Their return type is not visible here -- confirm before changing.
    times['bitonic'].append(test_bitonic_speed(s, bs).microseconds / 1000000.)
    times['radix'].append(test_radix_speed(s, rs).microseconds / 1000000.)
    times['numpy'].append(
        test_numpy_speed(s.get().copy()).microseconds / 1000000.)

# On Python 3, dict.keys() is a view and cannot be concatenated to a list.
# Materialise the column names once so the header and every row agree on
# the column order.
columns = list(times)
print("\t".join(["Size"] + columns))
for idx, s in enumerate(xs):
    print("\t".join(["2^" + str(s)] +
                    [str(times[k][idx]) for k in columns]))
コード例 #5
0
ファイル: beam.py プロジェクト: bellaz89/pyFEL
    def initialize(cls):
        '''
            Compile the OpenCL program and build the radix-sort kernels.

            Sets three class attributes:

            * ``program``: ``cls.KERNEL`` passed through ``F`` and built
              on ``cl_ctx``.
            * ``longitudinal_sort_kernel``: sorts the six particle arrays
              by ``floor(theta[i]*inv_slice_len)``.
            * ``longitudinal_traverse_sort_kernel``: sorts the same arrays
              by the key returned by the OpenCL helper ``sort_fun``.
        '''
        cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()
        cls.longitudinal_sort_kernel = RadixSort(cl_ctx,
                                                 [VectorArg(cl_ftype, "x"),
                                                  VectorArg(cl_ftype, "px"),
                                                  VectorArg(cl_ftype, "y"),
                                                  VectorArg(cl_ftype, "py"),
                                                  VectorArg(cl_ftype, "theta"),
                                                  VectorArg(cl_ftype, "gamma"),
                                                  ScalarArg(cl_ftype, "inv_slice_len")],
                                                 key_expr="(int) floor(theta[i]*inv_slice_len)",
                                                 sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
                                                 key_dtype=np.int32)

        class LongitudinalTraverseScanKernel(GenericScanKernel):
            '''
                Scan kernel that prepends the ``sort_fun`` OpenCL helper
                to the preamble it is constructed with, so that the key
                expression of the traverse sort can call it.
            '''
            def __init__(self, *argl, **argd):
                '''
                    Forward all arguments to ``GenericScanKernel`` with
                    ``argd['preamble']`` replaced by ``sort_fun`` followed
                    by the caller's preamble (empty if none was given).
                '''

                # NOTE(review): xbin/ybin multiply the already normalised
                # coordinate by inv_traverse_len a second time, while the
                # range check below compares them against `bins`. This
                # looks like it was meant to be `xnorm * bins` -- confirm
                # against the physics before changing the kernel source.
                sort_fun = '''
                            int sort_fun(FLOAT_TYPE x, 
                                         FLOAT_TYPE y, 
                                         FLOAT_TYPE theta, 
                                         FLOAT_TYPE inv_slice_len, 
                                         FLOAT_TYPE inv_traverse_len,
                                         int bins) {
                                         
                                         FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                                         FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                                         int xbin = (int) floor(xnorm * inv_traverse_len);
                                         int ybin = (int) floor(ynorm * inv_traverse_len);
                                         int zbin = (int) floor(theta*inv_slice_len);

                                         if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins)) {
                                            xbin = 0;
                                            ybin = 0;

                                         }

                                         return xbin+bins*(ybin+bins*zbin);
                            }
                           '''

                # Copy so the caller's keyword dict is never mutated.
                new_argd = dict(argd)
                # The preamble keyword may be absent; fall back to an empty
                # preamble instead of raising KeyError.
                new_argd['preamble'] = F(sort_fun + new_argd.get('preamble', ''))
                super().__init__(*argl, **new_argd)

        cls.longitudinal_traverse_sort_kernel = RadixSort(cl_ctx,
                                                          [VectorArg(cl_ftype, "x"),
                                                           VectorArg(cl_ftype, "px"),
                                                           VectorArg(cl_ftype, "y"),
                                                           VectorArg(cl_ftype, "py"),
                                                           VectorArg(cl_ftype, "theta"),
                                                           VectorArg(cl_ftype, "gamma"),
                                                           ScalarArg(cl_ftype, "inv_slice_len"),
                                                           ScalarArg(cl_ftype, "inv_traverse_len"),
                                                           ScalarArg(np.int32, "bins")],
                                                           key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
                                                           sort_arg_names=["x", "px", "y", "py", "theta", "gamma"],
                                                           scan_kernel = LongitudinalTraverseScanKernel,
                                                           key_dtype=np.int32)