コード例 #1
0
    def get_compress_kernel(self, index_dtype):
        """Build the scan kernel that compacts the non-empty entries of ``count``.

        The kernel runs a prefix sum over the flag ``count[i] != 0``. For
        every ``i`` whose flag is set, ``i`` and ``count[i]`` are stored at the
        compacted position ``item - 1``. The running sum is additionally
        written to ``compressed_indices[i + 1]`` and the final total to
        ``*num_non_empty_list``.

        :arg index_dtype: dtype used as the scan type and, through
            ``dtype_to_ctype``, as the C element type of every argument.
        :returns: the constructed ``GenericScanKernel``.
        """
        from sys import version_info

        # ``disable_unicode`` is only passed when not running on Python 3.
        template_options = {}
        if not version_info > (3, 0):
            template_options["disable_unicode"] = True

        argument_template = Template("""
            __global ${index_t} *count,
            __global ${index_t} *compressed_counts,
            __global ${index_t} *nonempty_indices,
            __global ${index_t} *compressed_indices,
            __global ${index_t} *num_non_empty_list
        """, **template_options)

        compaction_statement = """
                    if (i + 1 < N) compressed_indices[i + 1] = item;
                    if (prev_item != item) {
                        nonempty_indices[item - 1] = i;
                        compressed_counts[item - 1] = count[i];
                    }
                    if (i + 1 == N) *num_non_empty_list = item;
                    """

        from pyopencl.scan import GenericScanKernel
        return GenericScanKernel(
                self.context, index_dtype,
                arguments=argument_template.render(
                    index_t=dtype_to_ctype(index_dtype)),
                input_expr="count[i] == 0 ? 0 : 1",
                scan_expr="a+b", neutral="0",
                output_statement=compaction_statement,
                devices=self.devices)
コード例 #2
0
ファイル: parallel.py プロジェクト: sankasuraj/pysph
    def _generate(self):
        """Build the backend-specific scan kernel and store it on ``self.c_func``.

        Only the ``'opencl'`` branch is visible in this excerpt. It wraps the
        input, output and segment functions, builds a
        ``pyopencl.scan.GenericScanKernel`` from the resulting expressions and
        stores the command queue on ``self.queue``.
        """
        if self.backend == 'opencl':
            # Each wrapper call returns an (expression, arguments) pair.
            input_expr, input_args = self._wrap_ocl_function(self.input_func)
            output_expr, output_args = self._wrap_ocl_function(
                self.output_func
            )
            segment_expr, segment_args = self._wrap_ocl_function(
                self.is_segment_func
            )

            # Helper code emitted ahead of the kernel body.
            preamble = convert_to_float_if_needed(self.tp.get_code())

            from .opencl import get_context, get_queue
            from pyopencl.scan import GenericScanKernel
            ctx = get_context()
            self.queue = get_queue()
            # NOTE(review): only ``input_args`` is passed as the kernel's
            # argument list; ``output_args`` and ``segment_args`` are computed
            # above but unused here -- confirm that is intended.
            knl = GenericScanKernel(
                ctx,
                dtype=self.dtype,
                arguments=input_args,
                input_expr=input_expr,
                scan_expr=self.scan_expr,
                neutral=self.neutral,
                output_statement=output_expr,
                is_segment_start_expr=segment_expr,
                preamble=preamble
            )
            self.c_func = knl
コード例 #3
0
    def remove(self, indices, input_sorted=False):
        """Remove the elements at the given positions and compact the array.

        A flag array marks every position listed in ``indices``. A prefix sum
        over the flags gives, for each kept element ``i``, the number of
        removed elements up to ``i`` (``item``), so the element is copied to
        ``i - item``. The trailing ``len(indices)`` slots are then dropped.

        :arg indices: positions to remove; passed to a kernel declared with
            ``int* indices``. NOTE(review): positions are assumed unique --
            duplicates would drop more trailing slots than were removed.
        :arg input_sorted: accepted for interface compatibility; not used by
            this implementation.
        :raises ValueError: if more indices are given than the array holds.
        """
        num_removed = len(indices)
        if num_removed > self.length:
            raise ValueError(
                'Number of indices to be removed is greater than '
                'number of indices in array'
            )

        # Nothing to remove. Returning early also avoids the final slice
        # ``[:-0]``, which would select an empty array and wipe all data.
        if num_removed == 0:
            return

        if_remove = DeviceArray(np.int32, n=self.length)
        if_remove.fill(0)
        new_array = self.copy()

        # Scatter a 1 into the flag array at every position to be removed.
        fill_if_remove_knl = get_elwise_kernel(
            "fill_if_remove_knl",
            "int* indices, int* if_remove",
            "if_remove[indices[i]] = 1;"
        )

        fill_if_remove_knl(indices, if_remove.array)

        # ``item`` is the running count of removed elements up to ``i``.
        remove_knl = GenericScanKernel(
            self.ctx, np.int32,
            arguments="__global int *if_remove,\
            __global %(dtype)s *array,\
            __global %(dtype)s *new_array" %
            {"dtype": cl.tools.dtype_to_ctype(self.dtype)},
            input_expr="if_remove[i]",
            scan_expr="a+b", neutral="0",
            output_statement="""
            if(!if_remove[i]) new_array[i - item] = array[i];
            """)

        remove_knl(if_remove.array, self.array, new_array.array)

        self.set_data(new_array.array[:-num_removed])
コード例 #4
0
def sim_health_index(n_runs):
    """Simulate ``n_runs`` health-index walks on an OpenCL device and time them.

    Each walk has 4160 steps and follows the recurrence implemented by the
    scan kernel: the first step of a walk is ``eps + mu`` and every later
    step is ``rho * previous + eps + (1 - rho) * mu``, where ``eps`` is a
    standard-normal draw. A segmented scan keeps the walks independent.

    :arg n_runs: number of independent walks to simulate.
    :returns: ``None``; the elapsed time is printed.
    """
    # Set up OpenCL context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    t0 = time.time()

    rho = 0.5
    mu = 3.0
    n_steps = 4160

    # One standard-normal draw per (walk, step), generated on the device.
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps),
                          np.float32,
                          mu=0,
                          sigma=1.0)

    # Flag the first step of every walk so the scan restarts there instead of
    # carrying the previous walk's value across the boundary.
    seg_boundaries = np.array([1] + [0] * (n_steps - 1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(seg_boundaries, int(n_runs)))

    # Segmented scan kernel: within a walk combine as rho*a + b, at a walk
    # boundary just take b.
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    # The kernel overwrites every entry, so an uninitialised buffer suffices;
    # filling it with arange() first would only launch an extra kernel.
    dev_result = cl_array.empty(queue,
                                len(ran),
                                dtype=np.float32,
                                allocator=mem_pool)

    # Enqueue and run the scan kernel
    prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)

    # Copy the result back to the host (one column per walk); the transfer is
    # part of the timed region.
    health_index_all = (dev_result.get().reshape(n_runs, n_steps).transpose())

    time_elapsed = time.time() - t0

    print("Simulated %d Health Index in: %f seconds" % (n_runs, time_elapsed))
コード例 #5
0
    def _init_double_scan(self):
        """Generate a double scan on indexes and values in one operation.

        The scan type is ``int2``: wherever ``index[i] > 0`` the input is
        ``(0, 0)``, otherwise ``(value[i], 1)``. The first component of the
        running sum is written back to ``value[i]`` and the second to
        ``index[i+1]``.

        :returns: a ``GenericScanKernel`` when ``self.block_size > 256``,
            otherwise a ``GenericDebugScanKernel``.
        """
        int2 = pyopencl.tools.get_or_register_dtype("int2")

        # The debug implementation is the fallback for small block sizes
        # (the original code notes this happens on MacOS CPU devices).
        if self.block_size > 256:
            kernel_class = GenericScanKernel
        else:  # MacOS on CPU
            kernel_class = GenericDebugScanKernel

        return kernel_class(
            self.ctx,
            dtype=int2,
            arguments=("__global int *value", "__global int *index"),
            input_expr="index[i]>0 ? (int2)(0, 0) : (int2)(value[i], 1)",
            scan_expr="a+b",
            neutral="(int2)(0,0)",
            output_statement="value[i] = item.s0; index[i+1] = item.s1;")
コード例 #6
0
ファイル: parallel.py プロジェクト: manish364824/compyle
    def _generate_opencl_kernel(self, declarations=None):
        """Create the OpenCL scan kernel and record its generated sources.

        Sets ``self.queue``, ``self.source`` (the preamble) and
        ``self.all_source`` (the three generated program sources joined
        together, or just the preamble when the first-level source is empty).

        :arg declarations: forwarded to ``self._get_opencl_cuda_code``.
        :returns: the constructed ``GenericScanKernel``.
        """
        (scan_expr, arg_defn, input_expr, output_expr,
         segment_expr, preamble) = self._get_opencl_cuda_code(
            declarations=declarations
        )

        from .opencl import get_context, get_queue
        from pyopencl.scan import GenericScanKernel
        ctx = get_context()
        self.queue = get_queue()
        knl = GenericScanKernel(
            ctx,
            dtype=self.dtype,
            arguments=arg_defn,
            input_expr=input_expr,
            scan_expr=scan_expr,
            neutral=self.neutral,
            output_statement=output_expr,
            is_segment_start_expr=segment_expr,
            preamble=preamble,
        )

        self.source = preamble
        first_level_source = knl.first_level_scan_info.kernel.program.source
        if not first_level_source:
            self.all_source = self.source
            return knl

        # Concatenate the three generated programs with section markers.
        sections = (
            ('// ----- Level 1 ------', first_level_source),
            ('// ----- Level 2 ------',
             knl.second_level_scan_info.kernel.program.source),
            ('// ----- Final output ------',
             knl.final_update_info.kernel.program.source),
        )
        self.all_source = '\n'.join(
            part for section in sections for part in section
        )
        return knl
コード例 #7
0
ファイル: geometry.py プロジェクト: nchristensen/pytential
    def filter_center_and_target_ids(self, particle_id_dtype):
        """Build a scan kernel that keeps only targets with a center.

        A prefix sum over the flag ``target_to_center[i] >= 0`` assigns each
        flagged target the slot ``item - 1``. Its center and its own index are
        stored in that slot, and the total is written to ``*count``.

        :arg particle_id_dtype: dtype of the scan and of all four arrays.
        :returns: the constructed ``GenericScanKernel``.
        """
        from pyopencl.scan import GenericScanKernel
        from pyopencl.tools import VectorArg

        array_names = (
            "target_to_center",
            "filtered_target_to_center",
            "filtered_target_id",
            "count",
        )
        kernel_arguments = [
            VectorArg(particle_id_dtype, array_name)
            for array_name in array_names
        ]

        write_filtered = """
                    if (prev_item != item)
                    {
                        filtered_target_to_center[item-1] = target_to_center[i];
                        filtered_target_id[item-1] = i;
                    }
                    if (i+1 == N) *count = item;
                    """

        return GenericScanKernel(
            self.cl_context,
            particle_id_dtype,
            arguments=kernel_arguments,
            # "Does this target have a QBX center?"
            input_expr="(target_to_center[i] >= 0) ? 1 : 0",
            scan_expr="a+b",
            neutral="0",
            output_statement=write_filtered)
コード例 #8
0
ファイル: point_tree.py プロジェクト: yang7857854/pysph
def _get_neighbor_count_prefix_sum_kernel(ctx):
    """Return a kernel that replaces ``ary`` by its exclusive prefix sum.

    Each ``ary[i]`` is overwritten with ``prev_item``, i.e. the sum of all
    entries before ``i``.

    :arg ctx: OpenCL context the kernel is built for.
    """
    scan_spec = {
        "arguments": "__global int *ary",
        "input_expr": "ary[i]",
        "scan_expr": "a+b",
        "neutral": "0",
        "output_statement": "ary[i] = prev_item",
    }
    return GenericScanKernel(ctx, np.int32, **scan_spec)
コード例 #9
0
 def get_scan_kernel(self, index_dtype):
     """Build a prefix-sum kernel that stores each running sum one slot later.

     The sum up to and including ``ary[i]`` is written to ``ary[i+1]``.
     NOTE(review): this presumably requires ``ary`` to have one more entry
     than the scanned length -- confirm against the caller.

     :arg index_dtype: dtype of the scan and of ``ary``.
     :returns: the constructed ``GenericScanKernel``.
     """
     from pyopencl.scan import GenericScanKernel

     element_ctype = dtype_to_ctype(index_dtype)
     return GenericScanKernel(
             self.context,
             index_dtype,
             arguments="__global %s *ary" % element_ctype,
             input_expr="ary[i]",
             scan_expr="a+b",
             neutral="0",
             output_statement="ary[i+1] = item;",
             devices=self.devices)
コード例 #10
0
def get_prefix_sum_knl():
    """Return a kernel that replaces an ``int`` array by its exclusive prefix sum.

    Each ``ary[i]`` is overwritten with ``prev_item``, i.e. the sum of all
    entries before ``i``. The kernel is built for the context returned by
    ``get_context()``.
    """
    from ..opencl import get_queue, get_context
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    # The queue is still requested, as before, although only the context is
    # needed to construct the kernel.
    get_queue()

    scan_spec = {
        "arguments": "__global int *ary",
        "input_expr": "ary[i]",
        "scan_expr": "a+b",
        "neutral": "0",
        "output_statement": "ary[i] = prev_item",
    }
    return GenericScanKernel(ctx, np.int32, **scan_spec)
コード例 #11
0
def _get_particle_kernel(ctx, k, args, index_code):
    """Build a segmented prefix-sum kernel over ``uint<k>`` vectors.

    The scanned value for element ``i`` is ``M[<index_code>]``, where ``M``
    comes from the preamble produced by ``get_M_array_initialization(k)``.
    Sums restart wherever ``seg_flag[i]`` is set and the result is stored in
    ``prefix_sum_vector[i]``.

    :arg ctx: OpenCL context the kernel is built for.
    :arg k: vector width used for the ``uint<k>`` scan type.
    :arg args: extra C argument declarations appended to the fixed ones.
    :arg index_code: C expression used to index ``M``.
    """
    fixed_arguments = r"""__global char *seg_flag,
                    __global uint%(k)s *prefix_sum_vector,
                    """ % {"k": k}
    scanned_value = "M[%(index_code)s]" % {"index_code": index_code}

    return GenericScanKernel(
        ctx,
        get_vector_dtype('uint', k),
        neutral="0",
        arguments=fixed_arguments + args,
        input_expr=scanned_value,
        scan_expr="(across_seg_boundary ? b : a + b)",
        is_segment_start_expr="seg_flag[i]",
        output_statement=r"""prefix_sum_vector[i]=item;""",
        preamble=get_M_array_initialization(k))
コード例 #12
0
    def get_kernels(self, key_dtype, value_dtype, starts_dtype):
        """Build the three kernels used for sorting values by key.

        * ``by_target_sorter``: a ``RadixSort`` that sorts ``values`` and
          ``keys`` by ``keys[i]``.
        * ``start_finder``: an elementwise kernel that, for sorted keys,
          records in ``key_group_starts[key]`` the index at which each key's
          run begins.
        * ``bound_propagation_scan``: a ``min`` scan over ``starts`` read and
          written in reverse order (index ``nkeys-i``).

        :arg key_dtype: dtype of the key array.
        :arg value_dtype: dtype of the value array.
        :arg starts_dtype: dtype of the group-start array.
        :returns: a ``_KernelInfo`` holding the three kernels.
        """
        from pyopencl.algorithm import RadixSort
        from pyopencl.tools import VectorArg, ScalarArg

        by_target_sorter = RadixSort(
                self.context, [
                    VectorArg(value_dtype, "values"),
                    VectorArg(key_dtype, "keys"),
                    ],
                key_expr="keys[i]",
                sort_arg_names=["values", "keys"])

        from pyopencl.elementwise import ElementwiseTemplate
        start_finder = ElementwiseTemplate(
                arguments="""//CL//
                starts_t *key_group_starts,
                key_t *keys_sorted_by_key,
                """,

                operation=r"""//CL//
                key_t my_key = keys_sorted_by_key[i];

                if (i == 0 || my_key != keys_sorted_by_key[i-1])
                    key_group_starts[my_key] = i;
                """,
                name="find_starts").build(self.context,
                        type_aliases=(
                            # The sorted keys keep the dtype they were sorted
                            # with, so key_t must follow key_dtype; aliasing
                            # it to starts_dtype misreads the keys whenever
                            # the two dtypes differ.
                            ("key_t", key_dtype),
                            ("starts_t", starts_dtype),
                            ),
                        var_values=())

        from pyopencl.scan import GenericScanKernel
        bound_propagation_scan = GenericScanKernel(
                self.context, starts_dtype,
                arguments=[
                    VectorArg(starts_dtype, "starts"),
                    # starts has length n+1
                    ScalarArg(key_dtype, "nkeys"),
                    ],
                input_expr="starts[nkeys-i]",
                scan_expr="min(a, b)",
                # The largest representable value is the identity for min().
                neutral=_make_cl_int_literal(
                    np.iinfo(starts_dtype).max, starts_dtype),
                output_statement="starts[nkeys-i] = item;")

        return _KernelInfo(
                by_target_sorter=by_target_sorter,
                start_finder=start_finder,
                bound_propagation_scan=bound_propagation_scan)
コード例 #13
0
def _get_unique_cids_kernel(ctx):
    """Build a scan kernel that extracts the distinct runs of ``cids``.

    The scanned flag is 1 where ``cids[i]`` differs from its predecessor (or
    ``i == 0``). Each run's value is stored in ``unique_cids``, every element
    gets its run number in ``unique_cids_map`` and the number of runs is
    written to ``*unique_cids_count``.

    :arg ctx: OpenCL context the kernel is built for.
    """
    kernel_arguments = r"""int *cids, int *unique_cids_map,
                int *unique_cids, int *unique_cids_count"""
    store_unique = r"""
            if (item != prev_item) {
                unique_cids[item - 1] = cids[i];
            }
            unique_cids_map[i] = item - 1;
            if (i == N - 1) *unique_cids_count = item;
        """
    return GenericScanKernel(
        ctx,
        np.int32,
        neutral="0",
        arguments=kernel_arguments,
        input_expr="(i == 0 || cids[i] != cids[i-1])",
        scan_expr="a + b",
        output_statement=store_unique)
コード例 #14
0
ファイル: __init__.py プロジェクト: choward1491/pytential
    def get_qbx_target_numberer(self, dtype):
        """Build a scan kernel that lists the targets having a QBX center.

        A prefix sum over the flag ``tgt_to_qbx_center[i] >= 0`` gives each
        flagged target the slot ``item - 1`` in ``qbx_tgt_number``, where its
        index ``i`` is stored. The total is written to ``*count``.

        :arg dtype: must equal ``np.int32`` (checked with an assertion).
        :returns: the constructed ``GenericScanKernel``.
        """
        assert dtype == np.int32
        from pyopencl.scan import GenericScanKernel

        number_targets = """
                    if (item != prev_item)
                        qbx_tgt_number[item-1] = i;

                    if (i+1 == N)
                        *count = item;
                    """
        return GenericScanKernel(
                self.cl_context,
                np.int32,
                arguments="int *tgt_to_qbx_center, int *qbx_tgt_number, int *count",
                input_expr="tgt_to_qbx_center[i] >= 0 ? 1 : 0",
                scan_expr="a+b",
                neutral="0",
                output_statement=number_targets)
コード例 #15
0
    def _generate_cuda_kernel(self):
        """Create the CUDA scan kernel from the generated code pieces.

        The expressions, argument definition and preamble come from
        ``self._get_opencl_cuda_code()``. ``set_context()`` is called before
        the kernel is constructed.

        :returns: the constructed ``GenericScanKernel``.
        """
        (scan_expr, arg_defn, input_expr, output_expr,
         segment_expr, preamble) = self._get_opencl_cuda_code()

        from .cuda import set_context, GenericScanKernel
        set_context()

        kernel_spec = dict(
            dtype=self.dtype,
            arguments=arg_defn,
            input_expr=input_expr,
            scan_expr=scan_expr,
            neutral=self.neutral,
            output_statement=output_expr,
            is_segment_start_expr=segment_expr,
            preamble=preamble,
        )
        return GenericScanKernel(**kernel_spec)
コード例 #16
0
def sim_lifetime(S, T):
    """Scan 200 values of ``r`` and report the one ``find_best_rho`` picks.

    For each ``r`` in ``np.linspace(-0.95, 0.95, 200)`` the segmented scan
    kernel simulates ``S`` walks of ``T`` steps from the same random draws,
    and ``avg_first_negative`` is evaluated on the ``(S, T)`` result. The
    collected ``[r, value]`` pairs are passed to ``find_best_rho``.

    :arg S: number of walks.
    :arg T: number of steps per walk.
    :returns: ``None``; timing and the selected result are printed.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    started_at = time.time()

    # Same draws (fixed seed) are reused for every value of r.
    eps_mat = clrand.PhiloxGenerator(ctx, seed=25).normal(
        queue, (S*T), np.float32, mu=0, sigma=1)

    # Per-walk template: 3 in the first slot, zeros afterwards.
    walk_template = np.array(([3] + [0] * (T-1)), dtype=np.float32)
    z_mat = cl_array.to_device(queue, np.tile(walk_template, int(S)))

    # A 1 marks the first step of every walk so the scan restarts there.
    walk_start = np.array([1] + [0]*(T-1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(walk_start, int(S)))

    kernel_arguments = (
        "__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out, __global float r"
    )
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments=kernel_arguments,
        input_expr="ary[i] + eps[i] + 3*(1-r)",
        scan_expr="across_seg_boundary ? b : (r*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    rho_neg_tracker = []
    for r in np.linspace(-0.95, 0.95, 200):
        dev_result = cl_array.empty_like(eps_mat)
        prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result, r)

        simulation_all = dev_result.get().reshape(S, T)
        rho_neg_tracker.append([r, avg_first_negative(simulation_all)])

    best_rho = find_best_rho(rho_neg_tracker)

    time_elapsed = time.time() - started_at
    print('Time taken to run: {}'.format(time_elapsed))

    print('Best Rho Value: {}'.format(best_rho[0]))
    print('Max period: {}'.format(best_rho[1]))

    return
コード例 #17
0
def _get_leaves_kernel(ctx, leaf_size):
    """Build a scan kernel that collects the indices of leaf cells.

    A cell ``i`` counts as a leaf when
    ``pbounds[i].s1 - pbounds[i].s0 <= leaf_size``. A prefix sum over that
    flag gives each leaf the slot ``item - 1`` in ``leaf_cids``, where its
    index is stored. The number of leaves is written to ``*num_leaves``.

    :arg ctx: OpenCL context the kernel is built for.
    :arg leaf_size: threshold substituted into the kernel source.
    """
    return GenericScanKernel(
        ctx,
        np.int32,
        neutral="0",
        # ``pbounds`` is indexed per element in ``input_expr``, so it has to
        # be declared as a pointer (array) argument rather than a by-value
        # ``uint2``.
        arguments="int *offsets, uint2 *pbounds, int *leaf_cids, "
        "int *num_leaves",
        input_expr="(pbounds[i].s1 - pbounds[i].s0 <= %(leaf_size)s)" %
        dict(leaf_size=leaf_size),
        scan_expr="a+b",
        output_statement=r"""
            if (item != prev_item) {
                leaf_cids[item - 1] = i;
            }
            if (i == N - 1) *num_leaves = item;
        """)
コード例 #18
0
def sim_lifetime(S, T):
    """Simulate ``S`` walks of ``T`` steps on an OpenCL device and summarise them.

    The segmented scan combines values within a walk as ``0.5*a + b`` and
    restarts at every walk boundary. The mean and standard deviation of the
    last step across all walks are printed together with the elapsed time.

    :arg S: number of walks.
    :arg T: number of steps per walk.
    :returns: ``None``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    started_at = time.time()

    eps_mat = clrand.PhiloxGenerator(ctx, seed=25).normal(
        queue, (S * T), np.float32, mu=0, sigma=1)

    # Per-walk template: 3 in the first slot, zeros afterwards.
    walk_template = np.array(([3] + [0] * (T - 1)), dtype=np.float32)
    z_mat = cl_array.to_device(queue, np.tile(walk_template, int(S)))

    # A 1 marks the first step of every walk so the scan restarts there.
    walk_start = np.array([1] + [0] * (T - 1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(walk_start, int(S)))

    kernel_arguments = (
        "__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out"
    )
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments=kernel_arguments,
        input_expr="ary[i] + eps[i] + 3*(1-0.5)",
        scan_expr="across_seg_boundary ? b : (0.5*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    dev_result = cl_array.empty_like(eps_mat)
    prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result)

    # After the transpose each row is one time step, so row -1 holds the
    # final step of every walk.
    simulation_all = (dev_result.get().reshape(S, T).transpose())
    final_step = simulation_all[-1]

    average_finish = np.mean(final_step)
    std_finish = np.std(final_step)
    time_elapsed = time.time() - started_at

    print("Simulated %d lifetimes in: %f seconds" % (S, time_elapsed))
    print("Average final health score: %f, Standard Deviation: %f" %
          (average_finish, std_finish))

    return
コード例 #19
0
def _get_set_offset_kernel(ctx, k, leaf_size):
    """Build a scan kernel that assigns child offsets to non-leaf cells.

    A cell is flagged when ``pbounds[i].s1 - pbounds[i].s0 > leaf_size``.
    Flagged cells get ``csum_nodes_next + k * (item - 1)`` in ``offsets``,
    all others get ``-1``. ``*leaf_count`` receives ``N - item`` at the last
    element.

    :arg ctx: OpenCL context the kernel is built for.
    :arg k: multiplier substituted into the offset expression.
    :arg leaf_size: threshold substituted into the kernel source.
    """
    substitutions = {'leaf_size': leaf_size, 'k': k}

    kernel_arguments = r"""__global uint2 *pbounds, __global uint *offsets,
                      __global int *leaf_count, int csum_nodes_next"""
    is_internal = (
        "(pbounds[i].s1 - pbounds[i].s0 > %(leaf_size)s)" % substitutions
    )
    set_offsets = r"""{
            offsets[i] = ((pbounds[i].s1 - pbounds[i].s0 > %(leaf_size)s) ?
                           csum_nodes_next + (%(k)s * (item - 1)) : -1);
            if (i == N - 1) { *leaf_count = (N - item); }
        }""" % substitutions

    return GenericScanKernel(
        ctx,
        np.int32,
        neutral="0",
        arguments=kernel_arguments,
        input_expr=is_internal,
        scan_expr="a + b",
        output_statement=set_offsets)
コード例 #20
0
    def _generate_opencl_kernel(self):
        """Create the OpenCL scan kernel from the generated code pieces.

        The expressions, argument definition and preamble come from
        ``self._get_opencl_cuda_code()``. The command queue is stored on
        ``self.queue`` as a side effect.

        :returns: the constructed ``GenericScanKernel``.
        """
        (scan_expr, arg_defn, input_expr, output_expr,
         segment_expr, preamble) = self._get_opencl_cuda_code()

        from .opencl import get_context, get_queue
        from pyopencl.scan import GenericScanKernel
        ctx = get_context()
        self.queue = get_queue()

        kernel_spec = dict(
            dtype=self.dtype,
            arguments=arg_defn,
            input_expr=input_expr,
            scan_expr=scan_expr,
            neutral=self.neutral,
            output_statement=output_expr,
            is_segment_start_expr=segment_expr,
            preamble=preamble,
        )
        return GenericScanKernel(ctx, **kernel_spec)
コード例 #21
0
def sim_lifetime(rho):
    """Objective function: negated ``avg_first_negative`` for a given ``rho``.

    Simulates 1000 walks of 4160 steps with a segmented scan that combines
    values within a walk as ``r*a + b``, then evaluates
    ``avg_first_negative`` on the result (one column per walk).

    :arg rho: coefficient passed to the kernel as ``r``.
    :returns: ``-avg_first_negative(...)``, negated so a minimiser can be used.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    t0 = time.time()

    S = 1000
    T = int(4160)

    eps_mat = clrand.PhiloxGenerator(ctx, seed=25).normal(
        queue, (S * T), np.float32, mu=0, sigma=1)

    # Per-walk template: 3 in the first slot, zeros afterwards.
    walk_template = np.array(([3] + [0] * (T - 1)), dtype=np.float32)
    z_mat = cl_array.to_device(queue, np.tile(walk_template, int(S)))

    # A 1 marks the first step of every walk so the scan restarts there.
    walk_start = np.array([1] + [0] * (T - 1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(walk_start, int(S)))

    kernel_arguments = (
        "__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out, __global float r"
    )
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments=kernel_arguments,
        input_expr="ary[i] + eps[i] + 3*(1-r)",
        scan_expr="across_seg_boundary ? b : (r*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    dev_result = cl_array.empty_like(eps_mat)
    prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result, rho)

    simulation_all = (dev_result.get().reshape(S, T).transpose())

    # turned negative for minimization
    return -avg_first_negative(simulation_all)
コード例 #22
0
ファイル: parallel.py プロジェクト: manish364824/compyle
    def _generate_cuda_kernel(self, declarations=None):
        """Create the CUDA scan kernel and record its preamble as the source.

        Sets both ``self.source`` and ``self.all_source`` to the preamble.

        :arg declarations: forwarded to ``self._get_opencl_cuda_code``.
        :returns: the constructed ``GenericScanKernel``.
        """
        (scan_expr, arg_defn, input_expr, output_expr,
         segment_expr, preamble) = self._get_opencl_cuda_code(
            declarations=declarations
        )

        from .cuda import set_context, GenericScanKernel
        set_context()

        kernel_spec = dict(
            dtype=self.dtype,
            arguments=arg_defn,
            input_expr=input_expr,
            scan_expr=scan_expr,
            neutral=self.neutral,
            output_statement=output_expr,
            is_segment_start_expr=segment_expr,
            preamble=preamble,
        )
        knl = GenericScanKernel(**kernel_spec)

        # FIXME: Difficult to get the pycuda sources
        self.source = self.all_source = preamble
        return knl
コード例 #23
0
def _get_cid_groups_kernel(ctx):
    """Build a scan kernel that selects cells whose size lies in a range.

    A cell ``unique_cids[i]`` passes when its size
    ``pbounds.s1 - pbounds.s0`` satisfies ``gmin < size <= gmax``. Passing
    cells are written compactly to ``group_cids`` and their number to
    ``*group_count``.

    :arg ctx: OpenCL context the kernel is built for.
    """
    kernel_arguments = """int *unique_cids, uint2 *pbounds,
            int *group_cids, int *group_count, int gmin, int gmax"""
    collect_group = r"""
        if (item != prev_item) {
            group_cids[item - 1] = unique_cids[i];
        }
        if (i == N - 1) *group_count = item;
        """
    # C helper implementing the size test used as the scan input.
    size_filter = """
        char pass(uint2 pbound, int gmin, int gmax) {
            int leaf_size = pbound.s1 - pbound.s0;
            return (leaf_size > gmin && leaf_size <= gmax);
        }
        """
    return GenericScanKernel(
        ctx,
        np.uint32,
        neutral="0",
        arguments=kernel_arguments,
        input_expr="pass(pbounds[unique_cids[i]], gmin, gmax)",
        scan_expr="(a + b)",
        output_statement=collect_group,
        preamble=size_filter)
コード例 #24
0
ファイル: parallel.py プロジェクト: MyEvan415210/pysph
    def _generate_opencl_code(self):
        """Generate the OpenCL scan kernel and store it on ``self.c_func``.

        The input, output and segment functions are wrapped individually.
        Their argument declarations are merged in the order input, segment,
        output, with duplicates dropped. The merged C argument names are
        kept on ``self.arg_keys`` and the command queue on ``self.queue``.
        """
        wrapped_input = self._wrap_ocl_function(
            self.input_func, func_type='input')
        wrapped_output = self._wrap_ocl_function(self.output_func)
        wrapped_segment = self._wrap_ocl_function(self.is_segment_func)

        input_expr, input_args, input_c_args = wrapped_input
        output_expr, output_args, output_c_args = wrapped_output
        segment_expr, segment_args, segment_c_args = wrapped_segment

        scan_expr = self._get_scan_expr_opencl()
        preamble = convert_to_float_if_needed(self.tp.get_code())

        merged_args = drop_duplicates(
            input_args + segment_args + output_args)
        arg_defn = convert_to_float_if_needed(','.join(merged_args))

        self.arg_keys = drop_duplicates(
            input_c_args + segment_c_args + output_c_args)

        from .opencl import get_context, get_queue
        from pyopencl.scan import GenericScanKernel
        ctx = get_context()
        self.queue = get_queue()
        self.c_func = GenericScanKernel(
            ctx,
            dtype=self.dtype,
            arguments=arg_defn,
            input_expr=input_expr,
            scan_expr=scan_expr,
            neutral=self.neutral,
            output_statement=output_expr,
            is_segment_start_expr=segment_expr,
            preamble=preamble
        )
コード例 #25
0
ファイル: sparse.py プロジェクト: eresh124/Advance-Blog
 def _setup_compaction_kernel(self):
     """Create ``self.scan_kernel``, which compacts the non-zero floats.

     A prefix sum over ``fabs(data[i]) > 0`` gives each non-zero value the
     slot ``item - 1``, where the value and its column ``i % IMAGE_WIDTH``
     are stored. At the end of every row of ``IMAGE_WIDTH`` elements the
     running count is written to ``indptr[row + 1]``. ``IMAGE_WIDTH`` is
     defined at build time from ``self.shape[1]``.
     """
     kernel_arguments = (
         "__global float* data, "
         "__global float *data_compacted, "
         "__global int *indices, "
         "__global int* indptr"
     )
     compact_row = """
             // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
             if (prev_item != item) {
                 data_compacted[item-1] = data[i];
                 indices[item-1] = GET_INDEX(i);
             }
             // The last cumsum element of each line of "nonzero" goes to inptr[i]
             if ((i+1) % IMAGE_WIDTH == 0) {
                 indptr[(i/IMAGE_WIDTH)+1] = item;
             }
             """
     image_width_define = "-DIMAGE_WIDTH=%d" % self.shape[1]

     self.scan_kernel = GenericScanKernel(
         self.ctx,
         self.indice_dtype,
         arguments=kernel_arguments,
         input_expr="(fabs(data[i]) > 0.0f) ? 1 : 0",
         scan_expr="a+b",
         neutral="0",
         output_statement=compact_row,
         options=image_width_define,
         preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
     )
コード例 #26
0
    def _setup_compaction_kernel(self):
        """Create ``self.scan_kernel``, which compacts the non-zero entries.

        A prefix sum over the "is non-zero" flag gives each non-zero value
        the slot ``item - 1``, where the value and its column
        ``i % IMAGE_WIDTH`` are stored. At the end of every row of
        ``IMAGE_WIDTH`` elements the running count is written to
        ``indptr[row + 1]``. ``IMAGE_WIDTH`` is defined at build time from
        ``self.shape[1]``.

        :raises ValueError: if ``self.dtype.kind`` is not float (``"f"``) or
            integer (``"u"``/``"i"``).
        """
        # C signature: data arrays use ``self.c_dtype``, index arrays use
        # ``self.idx_c_dtype``. The backslash continuations keep the whole
        # signature in a single string literal.
        kernel_signature = str(
            "__global %s *data, \
            __global %s *data_compacted, \
            __global %s *indices, \
            __global %s* indptr \
            "
            "" %
            (self.c_dtype, self.c_dtype, self.idx_c_dtype, self.idx_c_dtype))
        # Choose the non-zero test according to the data type's kind.
        if self.dtype.kind == "f":
            map_nonzero_expr = "(fabs(data[i]) > %s) ? 1 : 0" % self._c_zero_str
        elif self.dtype.kind in ["u", "i"]:
            map_nonzero_expr = "(data[i] != %s) ? 1 : 0" % self._c_zero_str
        else:
            raise ValueError("Unknown data type")

        self.scan_kernel = GenericScanKernel(
            self.ctx,
            self.indice_dtype,
            arguments=kernel_signature,
            input_expr=map_nonzero_expr,
            scan_expr="a+b",
            neutral="0",
            output_statement="""
                // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
                if (prev_item != item) {
                    data_compacted[item-1] = data[i];
                    indices[item-1] = GET_INDEX(i);
                }
                // The last cumsum element of each line of "nonzero" goes to inptr[i]
                if ((i+1) % IMAGE_WIDTH == 0) {
                    indptr[(i/IMAGE_WIDTH)+1] = item;
                }
                """,
            options=["-DIMAGE_WIDTH=%d" % self.shape[1]],
            preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
        )
    def __init__(self,
                 context,
                 arguments,
                 key_expr,
                 sort_arg_names,
                 bits_at_a_time=2,
                 index_dtype=np.int32,
                 key_dtype=np.uint32,
                 options=None):
        """
        :arg context: the OpenCL context; its first device is used to derive
            the scan type.
        :arg arguments: the kernel argument declarations, handed to
            :func:`pyopencl.tools.parse_arg_list`. All types used here must
            be known to PyOpenCL.
            (see :func:`pyopencl.tools.get_or_register_dtype`).
        :arg key_expr: An integer-valued C expression returning the
            key based on which the sort is performed. The array index
            for which the key is to be computed is available as `i`.
            The expression may refer to any of the *arguments*.
        :arg sort_arg_names: A list of argument names whose corresponding
            array arguments will be sorted according to *key_expr*.
        :arg bits_at_a_time: number of key bits handled per scan; stored as
            ``self.bits``.
        :arg index_dtype: dtype used for indices in the generated code.
        :arg key_dtype: dtype of the values produced by *key_expr*.
        :arg options: build options forwarded to the scan kernel. ``None``
            (the default) means no options.
        """

        # {{{ arg processing

        from pyopencl.tools import parse_arg_list
        self.arguments = parse_arg_list(arguments)
        del arguments

        self.sort_arg_names = sort_arg_names
        self.bits = int(bits_at_a_time)
        self.index_dtype = np.dtype(index_dtype)
        self.key_dtype = np.dtype(key_dtype)

        # A fresh list per instance: a mutable default would be shared by
        # every sorter created without explicit options.
        self.options = [] if options is None else options

        # }}}

        # {{{ kernel creation

        scan_ctype, scan_dtype, scan_t_cdecl = \
                _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype)

        # The scan kernel takes the user's arguments, a "sorted_<name>" output
        # array for every argument being sorted, and the current bit offset.
        from pyopencl.tools import VectorArg, ScalarArg
        scan_arguments = (list(self.arguments) + [
            VectorArg(arg.dtype, "sorted_" + arg.name)
            for arg in self.arguments if arg.name in sort_arg_names
        ] + [ScalarArg(np.int32, "base_bit")])

        def get_count_branch(known_bits):
            # Recursively emit a nested C conditional that selects the counter
            # ``s.c<bits>`` matching ``mnr``, deciding one bit per level.
            if len(known_bits) == self.bits:
                return "s.c%s" % known_bits

            boundary_mnr = known_bits + "1" + (self.bits - len(known_bits) -
                                               1) * "0"

            return ("((mnr < %s) ? %s : %s)" %
                    (int(boundary_mnr, 2), get_count_branch(known_bits + "0"),
                     get_count_branch(known_bits + "1")))

        codegen_args = dict(
            bits=self.bits,
            key_ctype=dtype_to_ctype(self.key_dtype),
            key_expr=key_expr,
            index_ctype=dtype_to_ctype(self.index_dtype),
            index_type_max=np.iinfo(self.index_dtype).max,
            padded_bin=_padded_bin,
            scan_ctype=scan_ctype,
            sort_arg_names=sort_arg_names,
            get_count_branch=get_count_branch,
        )

        preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(
            **codegen_args)
        scan_preamble = preamble \
                + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args)

        from pyopencl.scan import GenericScanKernel
        self.scan_kernel = GenericScanKernel(
            context,
            scan_dtype,
            arguments=scan_arguments,
            input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
            scan_expr="scan_t_add(a, b, across_seg_boundary)",
            neutral="scan_t_neutral()",
            output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
            preamble=scan_preamble,
            options=self.options)

        # Remember the position of the first array argument; stop at the
        # first match so the attribute really holds the first index, not the
        # last one.
        for i, arg in enumerate(self.arguments):
            if isinstance(arg, VectorArg):
                self.first_array_arg_idx = i
                break
コード例 #28
0
    def _init_compression_scan(self):
        """Initialize CBF compression scan kernels.

        The scanned value is the encoded size of each difference
        ``data[i] - data[i - 1]`` (``data[0]`` for the first element):
        1, 3 or 7 bytes. ``prev_item`` is then the byte offset at which
        element ``i`` is written and the total size goes to ``size[0]``.

        :returns: a ``GenericScanKernel`` when ``self.block_size >= 64``,
            otherwise a ``GenericDebugScanKernel``.
        """
        # C helpers: size of one encoded difference, and the encoder itself.
        preamble = """
        int compressed_size(int diff) {
            int abs_diff = abs(diff);

            if (abs_diff < 128) {
                return 1;
            }
            else if (abs_diff < 32768) {
                return 3;
            }
            else {
                return 7;
            }
        }

        void write(const int index,
                   const int diff,
                   global char *output) {
            int abs_diff = abs(diff);

            if (abs_diff < 128) {
                output[index] = (char) diff;
            }
            else if (abs_diff < 32768) {
                output[index] = -128;
                output[index + 1] = (char) (diff >> 0);
                output[index + 2] = (char) (diff >> 8);
            }
            else {
                output[index] = -128;
                output[index + 1] = 0;
                output[index + 2] = -128;
                output[index + 3] = (char) (diff >> 0);
                output[index + 4] = (char) (diff >> 8);
                output[index + 5] = (char) (diff >> 16);
                output[index + 6] = (char) (diff >> 24);
            }
        }
        """
        output_statement = """
        if (prev_item == 0) { // 1st thread store compressed data size
            size[0] = last_item;
        }
        write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed);
        """

        # The debug implementation is the fallback for small block sizes
        # (the original code notes this happens on MacOS CPU devices).
        if self.block_size >= 64:
            kernel_class = GenericScanKernel
        else:  # MacOS on CPU
            kernel_class = GenericDebugScanKernel

        return kernel_class(
            self.ctx,
            dtype=numpy.int32,
            preamble=preamble,
            arguments="__global const int *data, __global char *compressed, __global int *size",
            input_expr="compressed_size((i == 0) ? data[0] : (data[i] - data[i - 1]))",
            scan_expr="a+b",
            neutral="0",
            output_statement=output_statement)
コード例 #29
0
import pyopencl as cl
import pyopencl.algorithm
import pyopencl.array  # imported explicitly because cl.array is used below
from pyopencl.scan import GenericScanKernel
import numpy as np

# Demo: compact the entries of an array that are smaller than 104 with a
# prefix-sum kernel.

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# The scan counts the entries passing the filter; each passing entry is
# written to the slot given by its running count.
knl = GenericScanKernel(
    ctx,
    np.int32,
    arguments="__global int *ary, __global int *out",
    input_expr="(ary[i] < 104) ? 1 : 0",
    scan_expr="a+b",
    neutral="0",
    output_statement="""if (prev_item != item) out[item-1] = ary[i];""")

# int32 matches the "int *" declarations of the kernel arguments.
ary = cl.array.arange(queue, 10000, dtype=np.int32)
print(ary)
out = ary.copy()
knl(ary, out)

# Host-side reference: the values the kernel is expected to place at the
# front of ``out``.
a_host = ary.get()
out_host = a_host[a_host < 104]

print(out)
コード例 #30
0
import numpy as np
import pyopencl as cl
import pyopencl.array  # imported explicitly because cl.array is used below
import pyopencl.clrandom as clrand
from pyopencl.scan import GenericScanKernel

# Demo: cumulative sum on the device, i.e. the equivalent of
# np.cumsum([1, 2, 3]) == np.array([1, 3, 6])

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
print("queue: ", queue)
print()

# Prefix sum: y[i] receives the sum of x[0..i].
sknl = GenericScanKernel(ctx,
                         np.float64,
                         arguments="double *y, double *x",
                         input_expr="x[i]",
                         scan_expr="a+b",
                         neutral="0",
                         output_statement="y[i] = item;")

n = 10**7
x = clrand.rand(queue, n, np.float64)
print("x:", x)
print()

# Output buffer; every entry is overwritten by the kernel.
result = cl.array.empty_like(x)
sknl(result, x, queue=queue)
print("result", result)
print()