def get_compress_kernel(self, index_dtype):
    """Build the scan kernel that squeezes empty entries out of a count list.

    The scan sums a 0/1 flag ("is ``count[i]`` nonzero?"), so ``item`` is
    the number of nonempty entries up to and including ``i``. The output
    statement uses that running total to

    * store it in ``compressed_indices[i + 1]`` (for every ``i`` but the
      last),
    * record, for each nonempty entry, its index in ``nonempty_indices``
      and its count in ``compressed_counts``, and
    * write the grand total to ``*num_non_empty_list``.

    :arg index_dtype: dtype used for the scan and for every kernel argument.
    :returns: a :class:`pyopencl.scan.GenericScanKernel`.
    """
    from sys import version_info

    signature_src = """
        __global ${index_t} *count,
        __global ${index_t} *compressed_counts,
        __global ${index_t} *nonempty_indices,
        __global ${index_t} *compressed_indices,
        __global ${index_t} *num_non_empty_list
        """

    # disable_unicode is only handed to Template on the non-Python-3 path.
    template_options = {}
    if not version_info > (3, 0):
        template_options["disable_unicode"] = True
    signature = Template(signature_src, **template_options)

    from pyopencl.scan import GenericScanKernel

    write_compressed = """
        if (i + 1 < N) compressed_indices[i + 1] = item;
        if (prev_item != item) {
            nonempty_indices[item - 1] = i;
            compressed_counts[item - 1] = count[i];
        }
        if (i + 1 == N) *num_non_empty_list = item;
        """

    return GenericScanKernel(
        self.context,
        index_dtype,
        arguments=signature.render(index_t=dtype_to_ctype(index_dtype)),
        input_expr="count[i] == 0 ? 0 : 1",
        scan_expr="a+b",
        neutral="0",
        output_statement=write_compressed,
        devices=self.devices)
def _generate(self):
    """Create the OpenCL scan kernel for this object, if applicable.

    Nothing happens unless ``self.backend`` is ``'opencl'``. Otherwise the
    input, output and segment-start functions are wrapped into OpenCL
    expressions, a :class:`pyopencl.scan.GenericScanKernel` is built from
    them, ``self.queue`` is set and the kernel is stored in ``self.c_func``.
    """
    if self.backend != 'opencl':
        return

    # Every wrapper call yields an (expression, arguments) pair; only the
    # input function's arguments are handed to the kernel.
    input_expr, input_args = self._wrap_ocl_function(self.input_func)
    output_expr, _ = self._wrap_ocl_function(self.output_func)
    segment_expr, _ = self._wrap_ocl_function(self.is_segment_func)
    preamble = convert_to_float_if_needed(self.tp.get_code())

    from .opencl import get_context, get_queue
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    self.queue = get_queue()
    self.c_func = GenericScanKernel(
        ctx,
        dtype=self.dtype,
        arguments=input_args,
        input_expr=input_expr,
        scan_expr=self.scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )
def remove(self, indices, input_sorted=False):
    """Remove the elements at the given positions from this array.

    A 0/1 flag array marks the positions listed in ``indices``. A prefix
    sum over that flag tells every surviving element how many removed
    elements precede it, so it can be copied ``item`` slots to the left.
    The trailing ``len(indices)`` slots are then cut off and the result is
    installed with ``self.set_data``.

    :arg indices: positions to remove; passed straight to an elementwise
        kernel declared with ``int* indices``.
        NOTE(review): the final truncation drops ``len(indices)`` slots, so
        duplicate positions would presumably drop too much -- confirm that
        callers pass unique indices.
    :arg input_sorted: accepted but not used by this implementation.
    :raises ValueError: if more indices are given than the array has
        elements.
    """
    n_remove = len(indices)
    if n_remove > self.length:
        # The original message was missing the space between its two halves.
        raise ValueError(
            'Number of indices to be removed is greater than '
            'number of indices in array'
        )
    if n_remove == 0:
        # Nothing to do. Falling through would slice with [:-0], which is
        # [:0], and wipe the whole array.
        return

    if_remove = DeviceArray(np.int32, n=self.length)
    if_remove.fill(0)
    new_array = self.copy()

    fill_if_remove_knl = get_elwise_kernel(
        "fill_if_remove_knl",
        "int* indices, int* if_remove",
        "if_remove[indices[i]] = 1;"
    )
    fill_if_remove_knl(indices, if_remove.array)

    kernel_args = (
        "__global int *if_remove, "
        "__global %(dtype)s *array, "
        "__global %(dtype)s *new_array"
    ) % {"dtype": cl.tools.dtype_to_ctype(self.dtype)}

    remove_knl = GenericScanKernel(
        self.ctx, np.int32,
        arguments=kernel_args,
        input_expr="if_remove[i]",
        scan_expr="a+b",
        neutral="0",
        # item == number of flagged slots in [0, i]; survivors shift left
        # by that amount.
        output_statement="""
        if(!if_remove[i]) new_array[i - item] = array[i];
        """)
    remove_knl(if_remove.array, self.array, new_array.array)

    self.set_data(new_array.array[:-n_remove])
def sim_health_index(n_runs):
    """Simulate ``n_runs`` health-index walks of 4160 steps on an OpenCL device.

    Standard-normal noise is drawn on the device, and a segmented scan
    combines it per walk: the first element of a walk contributes
    ``eps + mu``, later ones ``eps + (1 - rho) * mu``, and elements inside
    a walk are combined as ``rho * a + b``. The result is pulled back to
    the host and the elapsed wall-clock time is printed.

    :arg n_runs: number of independent walks to simulate.
    :returns: ``None``; the simulated matrix is computed but not returned.
    """
    # OpenCL context, queue and a pooled allocator for the result buffer.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    t0 = time.time()

    rho = 0.5
    mu = 3.0
    sigma = 1.0
    n_steps = int(4160)

    # One flat device array of noise covering every step of every walk.
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps), np.float32,
                          mu=0, sigma=sigma)

    # Flag the first step of each walk so the scan restarts there instead
    # of carrying values over from the previous walk.
    one_walk_flags = np.array([1] + [0] * (n_steps - 1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(one_walk_flags, int(n_runs)))

    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    # Output buffer; every entry is overwritten by the scan.
    dev_result = cl_array.arange(queue, len(ran), dtype=np.float32,
                                 allocator=mem_pool)

    prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)

    # Host copy shaped (n_steps, n_runs): one column per walk.
    health_index_all = dev_result.get().reshape(n_runs, n_steps).transpose()

    time_elapsed = time.time() - t0
    print("Simulated %d Health Index in: %f seconds"
          % (n_runs, time_elapsed))
    return
def _init_double_scan(self):
    """Generate a double scan on indexes and values in one operation.

    The scan runs over ``int2`` pairs: where ``index[i] > 0`` the pair is
    ``(0, 0)``, otherwise ``(value[i], 1)``. The summed pair is written
    back as ``value[i] = item.s0`` and ``index[i+1] = item.s1``.

    :returns: a ``GenericScanKernel`` when ``self.block_size > 256``,
        otherwise a ``GenericDebugScanKernel`` (the original code labels
        that branch "MacOS on CPU").
    """
    int2 = pyopencl.tools.get_or_register_dtype("int2")

    scan_spec = dict(
        dtype=int2,
        arguments=("__global int *value", "__global int *index"),
        input_expr="index[i]>0 ? (int2)(0, 0) : (int2)(value[i], 1)",
        scan_expr="a+b",
        neutral="(int2)(0,0)",
        output_statement="value[i] = item.s0; index[i+1] = item.s1;",
    )

    # Small work-group sizes get the debug implementation (MacOS on CPU).
    if self.block_size > 256:
        return GenericScanKernel(self.ctx, **scan_spec)
    return GenericDebugScanKernel(self.ctx, **scan_spec)
def _generate_opencl_kernel(self, declarations=None):
    """Build the OpenCL scan kernel and capture its generated source.

    Sets ``self.queue``, ``self.source`` (the preamble) and
    ``self.all_source``. The latter is the concatenated source of the
    three scan stages when the first stage exposes a non-empty source,
    and falls back to ``self.source`` otherwise.

    :arg declarations: forwarded to ``self._get_opencl_cuda_code``.
    :returns: the :class:`pyopencl.scan.GenericScanKernel`.
    """
    (scan_expr, arg_defn, input_expr, output_expr,
     segment_expr, preamble) = self._get_opencl_cuda_code(
        declarations=declarations
    )

    from .opencl import get_context, get_queue
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    self.queue = get_queue()
    knl = GenericScanKernel(
        ctx,
        dtype=self.dtype,
        arguments=arg_defn,
        input_expr=input_expr,
        scan_expr=scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )

    self.source = preamble
    if not knl.first_level_scan_info.kernel.program.source:
        self.all_source = self.source
        return knl

    # Label each stage's program source and join them into one listing.
    stages = (
        ('// ----- Level 1 ------', knl.first_level_scan_info),
        ('// ----- Level 2 ------', knl.second_level_scan_info),
        ('// ----- Final output ------', knl.final_update_info),
    )
    pieces = []
    for banner, info in stages:
        pieces.append(banner)
        pieces.append(info.kernel.program.source)
    self.all_source = '\n'.join(pieces)
    return knl
def filter_center_and_target_ids(self, particle_id_dtype):
    """Build a scan kernel that keeps only targets with a nonnegative center.

    The scan counts entries with ``target_to_center[i] >= 0``. Each such
    entry is written, densely packed, to ``filtered_target_to_center``
    (its center) and ``filtered_target_id`` (its index ``i``); the total
    number kept is stored in ``*count``.

    :arg particle_id_dtype: dtype of the scan and of all four arrays.
    :returns: a :class:`pyopencl.scan.GenericScanKernel`.
    """
    from pyopencl.scan import GenericScanKernel
    from pyopencl.tools import VectorArg

    array_names = (
        "target_to_center",
        "filtered_target_to_center",
        "filtered_target_id",
        "count",
    )
    kernel_args = [VectorArg(particle_id_dtype, name) for name in array_names]

    # prev_item != item exactly where the current entry was counted.
    pack_statement = """
        if (prev_item != item)
        {
            filtered_target_to_center[item-1] = target_to_center[i];
            filtered_target_id[item-1] = i;
        }
        if (i+1 == N) *count = item;
        """

    return GenericScanKernel(
        self.cl_context,
        particle_id_dtype,
        arguments=kernel_args,
        # "Does this target have a QBX center?"
        input_expr="(target_to_center[i] >= 0) ? 1 : 0",
        scan_expr="a+b",
        neutral="0",
        output_statement=pack_statement)
def _get_neighbor_count_prefix_sum_kernel(ctx):
    """Return a kernel that turns an int array into its exclusive prefix sum.

    The result is written in place: ``ary[i]`` becomes the sum of all
    entries before ``i`` (``prev_item``).

    :arg ctx: OpenCL context the kernel is built for.
    """
    scan_spec = {
        "arguments": "__global int *ary",
        "input_expr": "ary[i]",
        "scan_expr": "a+b",
        "neutral": "0",
        "output_statement": "ary[i] = prev_item",
    }
    return GenericScanKernel(ctx, np.int32, **scan_spec)
def get_scan_kernel(self, index_dtype):
    """Build an in-place sum scan that stores each total one slot later.

    For every ``i`` the running sum up to and including ``ary[i]`` is
    written to ``ary[i+1]``.
    NOTE(review): this writes one element past index ``i`` -- presumably
    the kernel is launched over fewer elements than the buffer holds;
    confirm at the call site.

    :arg index_dtype: dtype of the scan and of ``ary``.
    :returns: a :class:`pyopencl.scan.GenericScanKernel`.
    """
    from pyopencl.scan import GenericScanKernel

    index_ctype = dtype_to_ctype(index_dtype)
    return GenericScanKernel(
        self.context,
        index_dtype,
        arguments="__global %s *ary" % index_ctype,
        input_expr="ary[i]",
        scan_expr="a+b",
        neutral="0",
        output_statement="ary[i+1] = item;",
        devices=self.devices)
def get_prefix_sum_knl():
    """Return an in-place exclusive prefix-sum kernel for int arrays.

    The kernel is built for the context returned by ``get_context()`` and
    rewrites ``ary[i]`` to the sum of all entries before ``i``.
    """
    from ..opencl import get_queue, get_context
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    # The queue itself is not needed here; the call is kept so that any
    # side effect of get_queue() still happens.
    get_queue()

    scan_spec = dict(
        arguments="__global int *ary",
        input_expr="ary[i]",
        scan_expr="a+b",
        neutral="0",
        output_statement="ary[i] = prev_item",
    )
    return GenericScanKernel(ctx, np.int32, **scan_spec)
def _get_particle_kernel(ctx, k, args, index_code):
    """Build a segmented sum scan over ``uint<k>`` vectors looked up in ``M``.

    Each input element is ``M[<index_code>]``; sums restart wherever
    ``seg_flag[i]`` is nonzero and the running total is stored in
    ``prefix_sum_vector[i]``.

    :arg ctx: OpenCL context the kernel is built for.
    :arg k: vector width; selects the ``uint<k>`` scan type and is passed
        to ``get_M_array_initialization`` to produce the preamble.
    :arg args: extra C argument declarations appended to the fixed ones.
    :arg index_code: C expression used as the index into ``M``.
    """
    fixed_args = r"""__global char *seg_flag,
                __global uint%(k)s *prefix_sum_vector,
                """ % dict(k=k)
    lookup_expr = "M[%(index_code)s]" % dict(index_code=index_code)

    return GenericScanKernel(
        ctx,
        get_vector_dtype('uint', k),
        neutral="0",
        arguments=fixed_args + args,
        input_expr=lookup_expr,
        scan_expr="(across_seg_boundary ? b : a + b)",
        is_segment_start_expr="seg_flag[i]",
        output_statement=r"""prefix_sum_vector[i]=item;""",
        preamble=get_M_array_initialization(k))
def get_kernels(self, key_dtype, value_dtype, starts_dtype):
    """Build the three kernels used to group values by key.

    * ``by_target_sorter``: a radix sort of ``values`` and ``keys`` by
      ``keys[i]``.
    * ``start_finder``: for the sorted keys, writes ``i`` into
      ``key_group_starts[key]`` at every position where the key changes.
    * ``bound_propagation_scan``: a ``min`` scan running over
      ``starts[nkeys-i]``, i.e. from the end of ``starts`` toward the
      front, storing the running minimum back in place.

    :arg key_dtype: dtype of the key array (and of the ``nkeys`` scalar).
    :arg value_dtype: dtype of the value array.
    :arg starts_dtype: dtype of the group-start array.
    :returns: a ``_KernelInfo`` holding the three kernels.
    """
    from pyopencl.algorithm import RadixSort
    from pyopencl.tools import VectorArg, ScalarArg

    by_target_sorter = RadixSort(
        self.context, [
            VectorArg(value_dtype, "values"),
            VectorArg(key_dtype, "keys"),
            ],
        key_expr="keys[i]",
        sort_arg_names=["values", "keys"])

    from pyopencl.elementwise import ElementwiseTemplate
    start_finder = ElementwiseTemplate(
        arguments="""//CL//
        starts_t *key_group_starts,
        key_t *keys_sorted_by_key,
        """,

        operation=r"""//CL//
        key_t my_key = keys_sorted_by_key[i];

        if (i == 0 || my_key != keys_sorted_by_key[i-1])
            key_group_starts[my_key] = i;
        """,
        name="find_starts").build(
            self.context,
            type_aliases=(
                # key_t types the key array, so it must follow key_dtype.
                # It was previously aliased to starts_dtype, which only
                # works when both dtypes happen to be the same.
                ("key_t", key_dtype),
                ("starts_t", starts_dtype),
                ),
            var_values=())

    from pyopencl.scan import GenericScanKernel
    bound_propagation_scan = GenericScanKernel(
        self.context, starts_dtype,
        arguments=[
            VectorArg(starts_dtype, "starts"),
            # starts has length n+1
            ScalarArg(key_dtype, "nkeys"),
            ],
        input_expr="starts[nkeys-i]",
        scan_expr="min(a, b)",
        # Largest representable value is the identity for min().
        neutral=_make_cl_int_literal(
            np.iinfo(starts_dtype).max, starts_dtype),
        output_statement="starts[nkeys-i] = item;")

    return _KernelInfo(
        by_target_sorter=by_target_sorter,
        start_finder=start_finder,
        bound_propagation_scan=bound_propagation_scan)
def _get_unique_cids_kernel(ctx):
    """Build a scan kernel that collapses runs of equal ``cids``.

    An element counts as new when it is the first one or differs from its
    predecessor. For each ``i`` the kernel stores the zero-based run number
    in ``unique_cids_map[i]``, writes the first value of every run to
    ``unique_cids``, and puts the number of runs in ``*unique_cids_count``.

    :arg ctx: OpenCL context the kernel is built for.
    """
    kernel_args = r"""int *cids, int *unique_cids_map,
                int *unique_cids, int *unique_cids_count"""
    starts_new_run = "(i == 0 || cids[i] != cids[i-1])"
    store_runs = r"""
        if (item != prev_item) {
            unique_cids[item - 1] = cids[i];
        }
        unique_cids_map[i] = item - 1;
        if (i == N - 1) *unique_cids_count = item;
        """
    return GenericScanKernel(
        ctx, np.int32,
        neutral="0",
        arguments=kernel_args,
        input_expr=starts_new_run,
        scan_expr="a + b",
        output_statement=store_runs)
def get_qbx_target_numberer(self, dtype):
    """Build a scan kernel that lists the targets having a nonnegative center.

    Targets with ``tgt_to_qbx_center[i] >= 0`` are counted; the index of
    the n-th such target is stored in ``qbx_tgt_number[n-1]`` and the
    total is written to ``*count``.

    :arg dtype: must equal ``np.int32`` (enforced with an ``assert``).
    :returns: a :class:`pyopencl.scan.GenericScanKernel`.
    """
    assert dtype == np.int32

    from pyopencl.scan import GenericScanKernel

    number_targets = """
        if (item != prev_item)
            qbx_tgt_number[item-1] = i;

        if (i+1 == N)
            *count = item;
        """
    return GenericScanKernel(
        self.cl_context,
        np.int32,
        arguments="int *tgt_to_qbx_center, int *qbx_tgt_number, int *count",
        input_expr="tgt_to_qbx_center[i] >= 0 ? 1 : 0",
        scan_expr="a+b",
        neutral="0",
        output_statement=number_targets)
def _generate_cuda_kernel(self):
    """Build and return the CUDA scan kernel for this object.

    The code fragments come from ``self._get_opencl_cuda_code()``; the
    CUDA context is activated with ``set_context()`` before the kernel is
    constructed.
    """
    (scan_expr, arg_defn, input_expr, output_expr,
     segment_expr, preamble) = self._get_opencl_cuda_code()

    from .cuda import set_context, GenericScanKernel

    set_context()
    return GenericScanKernel(
        dtype=self.dtype,
        arguments=arg_defn,
        input_expr=input_expr,
        scan_expr=scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )
def sim_lifetime(S, T):
    """Scan 200 persistence values and report the best one.

    For each ``r`` in ``np.linspace(-0.95, 0.95, 200)`` a segmented scan
    simulates ``S`` series of ``T`` steps from a fixed set of seeded
    normal shocks. Each series starts from 3; the per-step input is
    ``ary[i] + eps[i] + 3*(1-r)`` and steps inside a series are combined as
    ``r*a + b``. ``avg_first_negative`` scores every simulation and
    ``find_best_rho`` picks the winner, which is printed together with
    the elapsed time.

    :arg S: number of simulated series.
    :arg T: number of steps per series.
    :returns: ``None``; results are only printed.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    t0 = time.time()

    # Same shocks for every r, so only r differs between runs.
    rand_gen = clrand.PhiloxGenerator(ctx, seed=25)
    eps_mat = rand_gen.normal(queue, (S*T), np.float32, mu=0, sigma=1)

    # Starting value 3 at the first step of every series, 0 elsewhere.
    z_row = np.array(([3] + [0] * (T-1)), dtype=np.float32)
    z_mat = np.tile(z_row, int(S))
    z_mat = cl_array.to_device(queue, z_mat)

    # Flag the first step of every series so the scan restarts there.
    seg_boundaries = [1] + [0]*(T-1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(S))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        # r is a scalar: it was declared "__global float r", but the
        # __global address space only applies to pointer arguments.
        arguments="__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out, float r",
        input_expr="ary[i] + eps[i] + 3*(1-r)",
        scan_expr="across_seg_boundary ? b : (r*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    # One output buffer is enough: every launch overwrites all of it, and
    # .get() copies it to the host before the next launch.
    dev_result = cl_array.empty_like(eps_mat)

    rho_neg_tracker = []
    for r in np.linspace(-0.95, 0.95, 200):
        prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result, r)
        simulation_all = dev_result.get().reshape(S, T)
        neg_mean = avg_first_negative(simulation_all)
        rho_neg_tracker.append([r, neg_mean])

    best_rho = find_best_rho(rho_neg_tracker)
    time_elapsed = time.time() - t0

    print('Time taken to run: {}'.format(time_elapsed))
    print('Best Rho Value: {}'.format(best_rho[0]))
    print('Max period: {}'.format(best_rho[1]))
    return
def _get_leaves_kernel(ctx, leaf_size):
    """Build a scan kernel that collects the indices of all leaf nodes.

    A node counts as a leaf when its bounds span at most ``leaf_size``
    entries (``pbounds[i].s1 - pbounds[i].s0 <= leaf_size``). The index of
    the n-th leaf is stored in ``leaf_cids[n-1]`` and the number of leaves
    in ``*num_leaves``.

    :arg ctx: OpenCL context the kernel is built for.
    :arg leaf_size: threshold substituted into the kernel source.
    """
    return GenericScanKernel(
        ctx, np.int32, neutral="0",
        # pbounds is indexed per node below, so it has to be a pointer; it
        # was declared as a scalar "uint2 pbounds".
        arguments="int *offsets, uint2 *pbounds, int *leaf_cids, "
        "int *num_leaves",
        input_expr="(pbounds[i].s1 - pbounds[i].s0 <= %(leaf_size)s)" % dict(
            leaf_size=leaf_size),
        scan_expr="a+b",
        output_statement=r"""
        if (item != prev_item) {
            leaf_cids[item - 1] = i;
        }
        if (i == N - 1) *num_leaves = item;
        """)
def sim_lifetime(S, T):
    """Simulate ``S`` series of ``T`` steps on an OpenCL device and print stats.

    Seeded normal shocks are combined per series by a segmented scan with
    a hard-coded coefficient of 0.5: the per-step input is
    ``ary[i] + eps[i] + 3*(1-0.5)`` and steps inside a series are combined
    as ``0.5*a + b``. The mean and standard deviation of the final step
    across series are printed along with the elapsed time.

    :arg S: number of simulated series.
    :arg T: number of steps per series.
    :returns: ``None``; results are only printed.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    t0 = time.time()

    rand_gen = clrand.PhiloxGenerator(ctx, seed=25)
    eps_mat = rand_gen.normal(queue, (S * T), np.float32, mu=0, sigma=1)

    n_series = int(S)

    # Starting value 3 at the first step of each series, 0 afterwards.
    start_row = np.array([3] + [0] * (T - 1), dtype=np.float32)
    z_mat = cl_array.to_device(queue, np.tile(start_row, n_series))

    # 1 marks the first step of each series so the scan restarts there.
    flag_row = np.array([1] + [0] * (T - 1), dtype=np.uint8)
    seg_boundary_flags = cl_array.to_device(
        queue, np.tile(flag_row, n_series))

    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out",
        input_expr="ary[i] + eps[i] + 3*(1-0.5)",
        scan_expr="across_seg_boundary ? b : (0.5*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    dev_result = cl_array.empty_like(eps_mat)
    prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result)

    # Shape (T, S): the last row holds the final step of every series.
    simulation_all = dev_result.get().reshape(S, T).transpose()
    final_step = simulation_all[-1]
    average_finish = np.mean(final_step)
    std_finish = np.std(final_step)

    time_elapsed = time.time() - t0

    print("Simulated %d lifetimes in: %f seconds"
          % (S, time_elapsed))
    print("Average final health score: %f, Standard Deviation: %f"
          % (average_finish, std_finish))
    return
def _get_set_offset_kernel(ctx, k, leaf_size):
    """Build a scan kernel that assigns child offsets to oversized nodes.

    Nodes whose bounds span more than ``leaf_size`` entries are counted.
    Such a node gets ``offsets[i] = csum_nodes_next + k * (item - 1)``;
    every other node gets ``-1``. ``*leaf_count`` receives ``N`` minus the
    number of oversized nodes.

    :arg ctx: OpenCL context the kernel is built for.
    :arg k: multiplier substituted into the offset formula.
    :arg leaf_size: size threshold substituted into the kernel source.
    """
    substitutions = {'leaf_size': leaf_size, 'k': k}

    kernel_args = r"""__global uint2 *pbounds, __global uint *offsets,
                __global int *leaf_count, int csum_nodes_next"""
    oversized_expr = (
        "(pbounds[i].s1 - pbounds[i].s0 > %(leaf_size)s)" % substitutions)
    assign_offsets = r"""{
        offsets[i] = ((pbounds[i].s1 - pbounds[i].s0 > %(leaf_size)s) ?
                       csum_nodes_next + (%(k)s * (item - 1)) : -1);
        if (i == N - 1) { *leaf_count = (N - item); }
    }""" % substitutions

    return GenericScanKernel(
        ctx, np.int32,
        neutral="0",
        arguments=kernel_args,
        input_expr=oversized_expr,
        scan_expr="a + b",
        output_statement=assign_offsets)
def _generate_opencl_kernel(self):
    """Build and return the OpenCL scan kernel for this object.

    The code fragments come from ``self._get_opencl_cuda_code()``. As a
    side effect ``self.queue`` is set to the queue from ``get_queue()``.
    """
    (scan_expr, arg_defn, input_expr, output_expr,
     segment_expr, preamble) = self._get_opencl_cuda_code()

    from .opencl import get_context, get_queue
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    self.queue = get_queue()
    return GenericScanKernel(
        ctx,
        dtype=self.dtype,
        arguments=arg_defn,
        input_expr=input_expr,
        scan_expr=scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )
def sim_lifetime(rho):
    """Score one persistence value; shaped as a minimization objective.

    Simulates 1000 series of 4160 steps from seeded normal shocks with a
    segmented scan (per-step input ``ary[i] + eps[i] + 3*(1-r)``, steps
    inside a series combined as ``r*a + b``), then scores the result with
    ``avg_first_negative``.

    :arg rho: coefficient passed to the kernel as the scalar ``r``.
    :returns: the negated ``avg_first_negative`` score, so that minimizing
        this function maximizes the score.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    t0 = time.time()

    S = 1000
    T = int(4160)

    rand_gen = clrand.PhiloxGenerator(ctx, seed=25)
    eps_mat = rand_gen.normal(queue, (S * T), np.float32, mu=0, sigma=1)

    # Starting value 3 at the first step of every series, 0 elsewhere.
    z_row = np.array(([3] + [0] * (T - 1)), dtype=np.float32)
    z_mat = np.tile(z_row, int(S))
    z_mat = cl_array.to_device(queue, z_mat)

    # Flag the first step of every series so the scan restarts there.
    seg_boundaries = [1] + [0] * (T - 1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(S))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    prefix_sum = GenericScanKernel(
        ctx, np.float32,
        # r is a scalar: it was declared "__global float r", but the
        # __global address space only applies to pointer arguments.
        arguments="__global float *ary, __global char *segflags, "
        "__global float *eps, __global float *out, float r",
        input_expr="ary[i] + eps[i] + 3*(1-r)",
        scan_expr="across_seg_boundary ? b : (r*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    dev_result = cl_array.empty_like(eps_mat)
    prefix_sum(z_mat, seg_boundary_flags, eps_mat, dev_result, rho)

    # Shape (T, S): one column per series.
    simulation_all = (dev_result.get().reshape(S, T).transpose())
    avg_first_neg = avg_first_negative(simulation_all)

    return -avg_first_neg  # turned negative for minimization
def _generate_cuda_kernel(self, declarations=None):
    """Build the CUDA scan kernel and record its preamble as the source.

    Both ``self.source`` and ``self.all_source`` are set to the preamble;
    the full generated CUDA source is not captured here.

    :arg declarations: forwarded to ``self._get_opencl_cuda_code``.
    :returns: the CUDA ``GenericScanKernel``.
    """
    (scan_expr, arg_defn, input_expr, output_expr,
     segment_expr, preamble) = self._get_opencl_cuda_code(
        declarations=declarations
    )

    from .cuda import set_context, GenericScanKernel

    set_context()
    knl = GenericScanKernel(
        dtype=self.dtype,
        arguments=arg_defn,
        input_expr=input_expr,
        scan_expr=scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )

    # FIXME: Difficult to get the pycuda sources
    self.source = preamble
    self.all_source = self.source
    return knl
def _get_cid_groups_kernel(ctx):
    """Build a scan kernel that selects cids whose size lies in a range.

    For each ``unique_cids[i]`` the helper ``pass`` (defined in the
    preamble) tests ``gmin < size <= gmax``, where ``size`` is
    ``pbound.s1 - pbound.s0``. Accepted cids are packed into
    ``group_cids`` and their number is written to ``*group_count``.

    :arg ctx: OpenCL context the kernel is built for.
    """
    size_filter_src = """
        char pass(uint2 pbound, int gmin, int gmax) {
            int leaf_size = pbound.s1 - pbound.s0;
            return (leaf_size > gmin && leaf_size <= gmax);
        }
        """
    kernel_args = """int *unique_cids, uint2 *pbounds,
            int *group_cids, int *group_count, int gmin, int gmax"""
    pack_groups = r"""
        if (item != prev_item) {
            group_cids[item - 1] = unique_cids[i];
        }
        if (i == N - 1) *group_count = item;
        """
    return GenericScanKernel(
        ctx, np.uint32,
        neutral="0",
        arguments=kernel_args,
        input_expr="pass(pbounds[unique_cids[i]], gmin, gmax)",
        scan_expr="(a + b)",
        output_statement=pack_groups,
        preamble=size_filter_src)
def _generate_opencl_code(self):
    """Assemble the OpenCL scan kernel and store it in ``self.c_func``.

    The input, output and segment-start functions are wrapped into OpenCL
    expressions. Their kernel argument declarations and their call-side
    argument keys are each merged (input, segment, output order) and
    de-duplicated; the keys end up in ``self.arg_keys``. ``self.queue`` is
    set as a side effect.
    """
    wrap = self._wrap_ocl_function
    input_expr, input_args, input_c_args = wrap(
        self.input_func, func_type='input')
    output_expr, output_args, output_c_args = wrap(self.output_func)
    segment_expr, segment_args, segment_c_args = wrap(self.is_segment_func)

    scan_expr = self._get_scan_expr_opencl()
    preamble = convert_to_float_if_needed(self.tp.get_code())

    # Kernel-side declarations, joined into one comma-separated string.
    merged_args = drop_duplicates(input_args + segment_args + output_args)
    arg_defn = convert_to_float_if_needed(','.join(merged_args))

    # Matching call-side keys, in the same order.
    self.arg_keys = drop_duplicates(
        input_c_args + segment_c_args + output_c_args)

    from .opencl import get_context, get_queue
    from pyopencl.scan import GenericScanKernel

    ctx = get_context()
    self.queue = get_queue()
    self.c_func = GenericScanKernel(
        ctx,
        dtype=self.dtype,
        arguments=arg_defn,
        input_expr=input_expr,
        scan_expr=scan_expr,
        neutral=self.neutral,
        output_statement=output_expr,
        is_segment_start_expr=segment_expr,
        preamble=preamble,
    )
def _setup_compaction_kernel(self):
    """Create ``self.scan_kernel``, which compacts a dense float image.

    The scan counts the nonzero entries of ``data``. Each nonzero value is
    packed into ``data_compacted`` and its column (``i % IMAGE_WIDTH``)
    into ``indices``; at the end of every image row the running count is
    stored in ``indptr[row + 1]``. ``IMAGE_WIDTH`` is set at build time
    from ``self.shape[1]``.
    """
    self.scan_kernel = GenericScanKernel(
        self.ctx, self.indice_dtype,
        arguments="__global float* data, __global float *data_compacted, "
        "__global int *indices, __global int* indptr",
        input_expr="(fabs(data[i]) > 0.0f) ? 1 : 0",
        scan_expr="a+b",
        neutral="0",
        output_statement="""
            // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
            if (prev_item != item) {
                data_compacted[item-1] = data[i];
                indices[item-1] = GET_INDEX(i);
            }
            // The last cumsum element of each line of "nonzero" goes to inptr[i]
            if ((i+1) % IMAGE_WIDTH == 0) {
                indptr[(i/IMAGE_WIDTH)+1] = item;
            }
            """,
        # Build options are given as a list of flags rather than one bare
        # string.
        options=["-DIMAGE_WIDTH=%d" % self.shape[1]],
        preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
    )
def _setup_compaction_kernel(self):
    """Create ``self.scan_kernel``, which compacts a dense image.

    The scan counts the nonzero entries of ``data`` (a ``fabs`` threshold
    test for float data, an inequality test for integer data, both against
    ``self._c_zero_str``). Each nonzero value is packed into
    ``data_compacted`` and its column (``i % IMAGE_WIDTH``) into
    ``indices``; at the end of every image row the running count is stored
    in ``indptr[row + 1]``.

    :raises ValueError: if ``self.dtype.kind`` is not ``"f"``, ``"u"`` or
        ``"i"``.
    """
    kernel_signature = ", ".join([
        "__global %s *data" % self.c_dtype,
        "__global %s *data_compacted" % self.c_dtype,
        "__global %s *indices" % self.idx_c_dtype,
        "__global %s* indptr" % self.idx_c_dtype,
    ])

    # Choose the C test that decides whether an entry counts as nonzero.
    kind = self.dtype.kind
    if kind == "f":
        nonzero_template = "(fabs(data[i]) > %s) ? 1 : 0"
    elif kind in ("u", "i"):
        nonzero_template = "(data[i] != %s) ? 1 : 0"
    else:
        raise ValueError("Unknown data type")
    map_nonzero_expr = nonzero_template % self._c_zero_str

    compact_statement = """
        // item is the running sum of input_expr(i), i.e the cumsum of "nonzero"
        if (prev_item != item) {
            data_compacted[item-1] = data[i];
            indices[item-1] = GET_INDEX(i);
        }
        // The last cumsum element of each line of "nonzero" goes to inptr[i]
        if ((i+1) % IMAGE_WIDTH == 0) {
            indptr[(i/IMAGE_WIDTH)+1] = item;
        }
        """

    self.scan_kernel = GenericScanKernel(
        self.ctx,
        self.indice_dtype,
        arguments=kernel_signature,
        input_expr=map_nonzero_expr,
        scan_expr="a+b",
        neutral="0",
        output_statement=compact_statement,
        options=["-DIMAGE_WIDTH=%d" % self.shape[1]],
        preamble="#define GET_INDEX(i) (i % IMAGE_WIDTH)",
    )
def __init__(self, context, arguments, key_expr, sort_arg_names,
        bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
        options=[]):
    """
    :arg arguments: A string of comma-separated C argument declarations.
        All types used here must be known to PyOpenCL.
        (see :func:`pyopencl.tools.get_or_register_dtype`).
    :arg key_expr: An integer-valued C expression returning the
        key based on which the sort is performed. The array index
        for which the key is to be computed is available as `i`.
        The expression may refer to any of the *arguments*.
    :arg sort_arg_names: A list of argument names whose corresponding
        array arguments will be sorted according to *key_expr*.
    :arg bits_at_a_time: Converted to ``int`` and stored as ``self.bits``;
        it sizes the scan type and the generated count lookup.
    :arg index_dtype: dtype stored as ``self.index_dtype`` and used to
        build the scan type.
    :arg key_dtype: dtype stored as ``self.key_dtype``; its C name is
        handed to the code templates.
    :arg options: Stored as ``self.options`` and forwarded unchanged to
        the scan kernel.
    """

    # {{{ arg processing

    from pyopencl.tools import parse_arg_list
    self.arguments = parse_arg_list(arguments)
    del arguments

    self.sort_arg_names = sort_arg_names
    self.bits = int(bits_at_a_time)
    self.index_dtype = np.dtype(index_dtype)
    self.key_dtype = np.dtype(key_dtype)

    self.options = options

    # }}}

    # {{{ kernel creation

    scan_ctype, scan_dtype, scan_t_cdecl = _make_sort_scan_type(
        context.devices[0], self.bits, self.index_dtype)

    from pyopencl.tools import VectorArg, ScalarArg

    # The scan kernel takes the user's arguments, one "sorted_" output
    # array per sorted argument, and the bit position being processed.
    sorted_outputs = [
        VectorArg(arg.dtype, "sorted_" + arg.name)
        for arg in self.arguments
        if arg.name in sort_arg_names
    ]
    scan_arguments = (
        list(self.arguments)
        + sorted_outputs
        + [ScalarArg(np.int32, "base_bit")])

    def get_count_branch(known_bits):
        # Recursively emits a nested C conditional on `mnr`, fixing one
        # more bit per level until all self.bits bits are known.
        remaining = self.bits - len(known_bits)
        if remaining == 0:
            return "s.c%s" % known_bits

        boundary_mnr = known_bits + "1" + (remaining - 1) * "0"
        lower = get_count_branch(known_bits + "0")
        upper = get_count_branch(known_bits + "1")
        return "((mnr < %s) ? %s : %s)" % (
            int(boundary_mnr, 2), lower, upper)

    codegen_args = dict(
        bits=self.bits,
        key_ctype=dtype_to_ctype(self.key_dtype),
        key_expr=key_expr,
        index_ctype=dtype_to_ctype(self.index_dtype),
        index_type_max=np.iinfo(self.index_dtype).max,
        padded_bin=_padded_bin,
        scan_ctype=scan_ctype,
        sort_arg_names=sort_arg_names,
        get_count_branch=get_count_branch,
    )

    preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(**codegen_args)
    scan_preamble = (
        preamble + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args))

    from pyopencl.scan import GenericScanKernel
    self.scan_kernel = GenericScanKernel(
        context, scan_dtype,
        arguments=scan_arguments,
        input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
        scan_expr="scan_t_add(a, b, across_seg_boundary)",
        neutral="scan_t_neutral()",
        output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
        preamble=scan_preamble,
        options=self.options)

    # NOTE(review): there is no early exit, so despite the attribute's
    # name this ends up holding the index of the LAST VectorArg -- confirm
    # whether that is intended.
    for position, argument in enumerate(self.arguments):
        if isinstance(argument, VectorArg):
            self.first_array_arg_idx = position

    # }}}
def _init_compression_scan(self):
    """Initialize the CBF compression scan kernel.

    Each element is encoded as the difference to its predecessor (the
    first element as itself) in 1, 3 or 7 bytes depending on the
    difference's magnitude. The scan sums those sizes, so ``prev_item`` is
    the byte offset at which element ``i`` is written into ``compressed``;
    the total size is stored in ``size[0]``.

    :returns: a ``GenericScanKernel`` when ``self.block_size >= 64``,
        otherwise a ``GenericDebugScanKernel`` (the original code labels
        that branch "MacOS on CPU").
    """
    delta_expr = "(i == 0) ? data[0] : (data[i] - data[i - 1])"

    codec_src = """
    int compressed_size(int diff) {
        int abs_diff = abs(diff);

        if (abs_diff < 128) {
            return 1;
        }
        else if (abs_diff < 32768) {
            return 3;
        }
        else {
            return 7;
        }
    }

    void write(const int index,
               const int diff,
               global char *output) {
        int abs_diff = abs(diff);

        if (abs_diff < 128) {
            output[index] = (char) diff;
        }
        else if (abs_diff < 32768) {
            output[index] = -128;
            output[index + 1] = (char) (diff >> 0);
            output[index + 2] = (char) (diff >> 8);
        }
        else {
            output[index] = -128;
            output[index + 1] = 0;
            output[index + 2] = -128;
            output[index + 3] = (char) (diff >> 0);
            output[index + 4] = (char) (diff >> 8);
            output[index + 5] = (char) (diff >> 16);
            output[index + 6] = (char) (diff >> 24);
        }
    }
    """

    store_statement = """
    if (prev_item == 0) { // 1st thread store compressed data size
        size[0] = last_item;
    }
    write(prev_item, (i == 0) ? data[0] : (data[i] - data[i - 1]), compressed);
    """

    scan_spec = dict(
        dtype=numpy.int32,
        preamble=codec_src,
        arguments="__global const int *data, __global char *compressed, __global int *size",
        input_expr="compressed_size(%s)" % delta_expr,
        scan_expr="a+b",
        neutral="0",
        output_statement=store_statement,
    )

    # Small work-group sizes get the debug implementation (MacOS on CPU).
    if self.block_size >= 64:
        return GenericScanKernel(self.ctx, **scan_spec)
    return GenericDebugScanKernel(self.ctx, **scan_spec)
# Demo: use a scan kernel to copy the entries of an array that are below
# 104 into the front of an output array (stream compaction).
import pyopencl as cl
import pyopencl.algorithm
import pyopencl.array  # makes cl.array available explicitly
from pyopencl.scan import GenericScanKernel
import numpy as np

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# The scan counts entries < 104; prev_item != item exactly at those
# entries, and item - 1 is the slot the entry is packed into.
knl = GenericScanKernel(
    ctx, np.int32,
    arguments="__global int *ary, __global int *out",
    input_expr="(ary[i] < 104) ? 1 : 0",
    scan_expr="a+b", neutral="0",
    output_statement="""if (prev_item != item) out[item-1] = ary[i];""")

# randint excludes its upper bound, hence the + 1 to keep the inclusive
# range of the deprecated np.random.random_integers(0, 2**10).
rand = np.random.randint(0, 2**10 + 1, size=(2**10) * 8).astype(np.uint32)

ary = cl.array.arange(queue, 10000, dtype=np.uint32)
print(ary)

# out starts as a copy of ary; only its leading slots are overwritten.
out = ary.copy()
knl(ary, out)

# Host-side reference: the values the kernel is expected to pack.
a_host = ary.get()
out_host = a_host[a_host < 104]

print(out)
# Demo: cumulative sum of 10**7 random doubles on an OpenCL device.
import numpy as np  # was missing although np.float64 is used below
import pyopencl as cl
import pyopencl.array  # makes cl.array available explicitly
import pyopencl.clrandom as clrand
from pyopencl.scan import GenericScanKernel

# For reference, the host equivalent:
# np.cumsum([1, 2, 3])
# np.array([1, 3, 6])

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
print("queue: ", queue)
print()

# y[i] receives the sum of x[0..i].
sknl = GenericScanKernel(ctx, np.float64,
                         arguments="double *y, double *x",
                         input_expr="x[i]",
                         scan_expr="a+b", neutral="0",
                         output_statement="y[i] = item;")

n = 10**7
x = clrand.rand(queue, n, np.float64)
print("x:", x)
print()

result = cl.array.empty_like(x)
sknl(result, x, queue=queue)
print("result", result)
print()