def _struct_alignment(alignments):
    """
    Returns the minimum alignment for a structure given alignments for its fields.

    According to the C standard, it is the lowest common multiple of the alignments
    of all of the members of the struct, rounded up to the nearest power of two.
    """
    common_multiple = _lcm(*alignments)
    return bounding_power_of_2(common_multiple)
def get_fft_kernels(input_shape, dtype, axes, device_params, local_kernel_limit):
    """
    Collects the 1D FFT kernels needed to transform ``input_shape`` along ``axes``.

    Axes are visited in reverse order of ``axes``; axes of length 1 are skipped.
    If the axis length equals ``helpers.bounding_power_of_2`` of itself, the kernels
    come from a single ``get_fft_1d_kernels`` call.  Otherwise the axis is padded to
    twice the bounding power of 2 and two kernel sequences are generated (the second
    one with ``reverse_direction=True``), both receiving the real axis length
    as ``fft_size_real``.

    Returns a flat list with the kernels for all processed axes.
    """
    collected = []

    # Going from the last listed axis to the first one; this is only for neatness,
    # the order does not really matter.
    for axis in reversed(axes):
        before = input_shape[:axis]
        length = input_shape[axis]
        after = input_shape[axis + 1:]

        # Nothing to transform along a unit axis.
        if length == 1:
            continue

        power_of_2 = helpers.bounding_power_of_2(length)

        if power_of_2 != length:
            # Padded FFT for the chirp-z transform.
            padded_length = 2 * power_of_2
            common = (
                dtype, device_params, before, padded_length, after, local_kernel_limit)

            chirp_kernels = list(get_fft_1d_kernels(*common, fft_size_real=length))
            chirp_kernels += get_fft_1d_kernels(
                *common, reverse_direction=True, fft_size_real=length)

            # During pad-in or pad-out the input and output blocks are no longer
            # aligned, so the first and the last kernels cannot work in place.
            chirp_kernels[0].inplace_possible = False
            chirp_kernels[-1].inplace_possible = False

            collected.extend(chirp_kernels)
        else:
            collected.extend(get_fft_1d_kernels(
                dtype, device_params, before, length, after, local_kernel_limit))

    return collected
def get_fft_kernels(input_shape, dtype, axes, device_params, local_kernel_limit):
    """
    Builds the list of 1D FFT kernels that transform ``input_shape`` over ``axes``.

    The axes are processed in reverse order of ``axes``, and axes whose length is 1
    produce no kernels.  An axis whose length is its own bounding power of 2
    (as reported by ``helpers.bounding_power_of_2``) is handled by one
    ``get_fft_1d_kernels`` call.  Any other axis is handled by two calls on a padded
    length of twice the bounding power of 2; the second call passes
    ``reverse_direction=True``, and both get the unpadded length as ``fft_size_real``.

    Returns the kernels of all axes concatenated into one list.
    """
    all_kernels = []

    # Most local transformation first; done for neatness, it does not really matter.
    for axis in reversed(axes):
        leading_dims = input_shape[:axis]
        axis_len = input_shape[axis]
        trailing_dims = input_shape[axis + 1:]

        if axis_len == 1:
            continue

        is_power_of_2 = helpers.bounding_power_of_2(axis_len) == axis_len
        if is_power_of_2:
            axis_kernels = get_fft_1d_kernels(
                dtype, device_params, leading_dims, axis_len, trailing_dims,
                local_kernel_limit)
            all_kernels.extend(axis_kernels)
            continue

        # Not a power of 2: use a padded FFT for the chirp-z transform.
        padded_len = 2 * helpers.bounding_power_of_2(axis_len)
        positional = (
            dtype, device_params, leading_dims, padded_len, trailing_dims,
            local_kernel_limit)

        padded_kernels = []
        padded_kernels.extend(
            get_fft_1d_kernels(*positional, fft_size_real=axis_len))
        padded_kernels.extend(
            get_fft_1d_kernels(
                *positional, reverse_direction=True, fft_size_real=axis_len))

        # Pad-in and pad-out break the alignment of input and output blocks,
        # so the kernels at both ends lose their inplace_possible property.
        for edge in (0, -1):
            padded_kernels[edge].inplace_possible = False

        all_kernels.extend(padded_kernels)

    return all_kernels
def _build_plan_for_wg_size(self, plan_factory, warp_size, max_wg_size, output, input_):
    """
    Builds a reduction plan for the given maximum workgroup size.

    If ``self._transpose_axes`` is set, the input is first transposed into a temporary
    array.  After that 'reduce' kernels are added in a loop: while the amount of
    elements left per output element is too large for a single pass, a pass writes
    into a temporary ``(final_size, blocks_per_part)`` array; the last pass writes
    into ``output``.  If there is nothing to reduce (the part size is not greater
    than 1), no kernel is added.

    Returns the created plan.
    """
    plan = plan_factory()

    # Algorithm cascading: a sequential reduction followed by the parallel one.
    # According to Brent's theorem, the optimal sequential size is O(log(n)).
    # It is set to the nearest power of 2 to simplify integer operations.
    seq_limit = helpers.bounding_power_of_2(helpers.log2(max_wg_size))
    reduce_limit = max_wg_size * seq_limit

    if self._transpose_axes is not None:
        transpose = Transpose(input_, axes=self._transpose_axes)
        source = plan.temp_array_like(transpose.parameter.output)
        plan.computation_call(transpose, source, input_)
    else:
        # normal reduction
        source = input_

    kept_ndim = len(output.shape)
    last_axis = len(input_.shape) - 1
    source_slices = (kept_ndim, last_axis - kept_ndim + 1)

    remaining = helpers.product(source.shape[kept_ndim:])
    final_size = helpers.product(source.shape[:kept_ndim])

    while remaining > 1:
        if remaining > reduce_limit:
            # Too much for one pass: reduce into an intermediate array.
            seq_size, block_size = seq_limit, max_wg_size
            blocks_per_part = helpers.min_blocks(remaining, block_size * seq_size)
            target = plan.temp_array((final_size, blocks_per_part), input_.dtype)
            target_slices = (1, 1)
        else:
            # Final pass: a single block per part, writing to the real output.
            if remaining > max_wg_size:
                seq_size = helpers.min_blocks(remaining, max_wg_size)
                block_size = max_wg_size
            else:
                seq_size = 1
                block_size = helpers.bounding_power_of_2(remaining)
            blocks_per_part = 1
            target = output
            target_slices = (len(target.shape), 0)

        # The last block of a part may be incomplete.
        elems_per_block = block_size * seq_size
        last_block_size = remaining % elems_per_block or elems_per_block

        render_kwds = {
            'seq_size': seq_size,
            'blocks_per_part': blocks_per_part,
            'last_block_size': last_block_size,
            'log2': helpers.log2,
            'block_size': block_size,
            'warp_size': warp_size,
            'empty': self._empty,
            'operation': self._operation,
            'input_slices': source_slices,
            'output_slices': target_slices,
        }

        plan.kernel_call(
            TEMPLATE.get_def('reduce'),
            [target, source],
            global_size=(final_size, blocks_per_part * block_size),
            local_size=(1, block_size),
            render_kwds=render_kwds)

        # The output of this pass is the input of the next one.
        remaining = blocks_per_part
        source = target
        source_slices = target_slices

    return plan
def _build_plan_for_wg_size(self, plan_factory, warp_size, max_wg_size, output, input_):
    """
    Creates a plan that reduces ``input_`` into ``output`` with workgroups
    of at most ``max_wg_size`` threads.

    An optional transposition (when ``self._transpose_axes`` is not ``None``) is
    followed by a chain of 'reduce' kernel calls.  Every call shrinks the number of
    elements per output element down to ``blocks_per_part``; intermediate results go
    to temporary arrays, and the call that leaves one block per part writes to
    ``output``.  The loop is skipped entirely if the part size is 1 or less.

    Returns the plan object obtained from ``plan_factory``.
    """
    plan = plan_factory()

    # Cascading: sequential reduction first, then the parallel one.
    # Brent's theorem gives O(log(n)) as the optimal sequential size;
    # a power of 2 is used to keep the integer arithmetic simple.
    max_seq_size = helpers.bounding_power_of_2(helpers.log2(max_wg_size))
    single_pass_capacity = max_wg_size * max_seq_size

    stage_input = input_  # normal reduction
    if self._transpose_axes is not None:
        transpose = Transpose(input_, axes=self._transpose_axes)
        transposed = plan.temp_array_like(transpose.parameter.output)
        plan.computation_call(transpose, transposed, input_)
        stage_input = transposed

    batch_ndim = len(output.shape)
    reduced_ndim = (len(input_.shape) - 1) - batch_ndim + 1
    stage_input_slices = (batch_ndim, reduced_ndim)

    elems_left = helpers.product(stage_input.shape[batch_ndim:])
    final_size = helpers.product(stage_input.shape[:batch_ndim])

    while elems_left > 1:
        is_last_stage = not (elems_left > single_pass_capacity)

        if not is_last_stage:
            seq_size = max_seq_size
            block_size = max_wg_size
            blocks_per_part = helpers.min_blocks(elems_left, block_size * seq_size)
            stage_output = plan.temp_array(
                (final_size, blocks_per_part), input_.dtype)
            stage_output_slices = (1, 1)
        else:
            if elems_left > max_wg_size:
                seq_size = helpers.min_blocks(elems_left, max_wg_size)
                block_size = max_wg_size
            else:
                seq_size = 1
                block_size = helpers.bounding_power_of_2(elems_left)
            blocks_per_part = 1
            stage_output = output
            stage_output_slices = (len(stage_output.shape), 0)

        # Size of the trailing (possibly partial) block of every part.
        full_block = block_size * seq_size
        leftover = elems_left % full_block
        last_block_size = leftover if leftover != 0 else full_block

        render_kwds = dict(
            seq_size=seq_size,
            blocks_per_part=blocks_per_part,
            last_block_size=last_block_size,
            log2=helpers.log2,
            block_size=block_size,
            warp_size=warp_size,
            empty=self._empty,
            operation=self._operation,
            input_slices=stage_input_slices,
            output_slices=stage_output_slices,
        )

        plan.kernel_call(
            TEMPLATE.get_def('reduce'),
            [stage_output, stage_input],
            global_size=(final_size, blocks_per_part * block_size),
            local_size=(1, block_size),
            render_kwds=render_kwds)

        # Chain the stages: what was written now is read by the next stage.
        elems_left = blocks_per_part
        stage_input, stage_input_slices = stage_output, stage_output_slices

    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Builds the plan that scans ``input_`` into ``output``.

    If ``self._transpose_to`` is set, the work is delegated: the input is transposed,
    a nested ``Scan`` over ``self._axes`` processes the transposed array, and the
    result is transposed back into ``output``.

    Otherwise the scanned axes are assumed to be the innermost ones.  The 'scan'
    kernel is called with a ``wg_totals`` temporary array of shape
    ``(batch_size, wg_totals_size)``; if ``wg_totals_size`` is greater than 1, that
    array is processed by a nested exclusive ``Scan`` and the 'add_wg_totals' kernel
    combines it with the intermediate output.

    Raises ``ValueError`` if ``self._seq_size`` is set and the workgroup size it
    requires is greater than the maximum workgroup size.

    Returns the created plan.
    """
    plan = plan_factory()

    if self._transpose_to is not None:
        transpose_to = Transpose(input_, axes=self._transpose_to)
        transposed = plan.temp_array_like(transpose_to.parameter.output)

        sub_scan = Scan(
            transposed, self._predicate, axes=self._axes, exclusive=self._exclusive,
            max_work_group_size=self._max_work_group_size)
        transposed_scanned = plan.temp_array_like(sub_scan.parameter.output)

        transpose_from = Transpose(
            transposed_scanned, axes=self._transpose_from, output_arr_t=output)

        plan.computation_call(transpose_to, transposed, input_)
        plan.computation_call(sub_scan, transposed_scanned, transposed)
        plan.computation_call(transpose_from, output, transposed_scanned)

    else:
        # assuming that at this point axes are inner and sorted
        scan_ndim = len(self._axes)
        batch_shape = output.shape[:-scan_ndim]
        batch_size = helpers.product(batch_shape)
        scan_shape = output.shape[-scan_ndim:]
        scan_size = helpers.product(scan_shape)

        if self._max_work_group_size is None:
            max_wg_size = device_params.max_work_group_size
        else:
            max_wg_size = self._max_work_group_size

        # The current algorithm requires workgroup size to be a power of 2.
        assert max_wg_size == 2**helpers.log2(max_wg_size)

        # Using algorithm cascading: sequential reduction, and then the parallel one.
        # According to Brent's theorem, the optimal sequential size is O(log(n)).
        # So, ideally we want the minimum `wg_size` for which
        # `wg_size * log2(wg_size) >= scan_size`.
        if self._seq_size is None:
            # NOTE(review): if max_wg_size <= 2 the loop body never runs and
            # `seq_size` stays unassigned; confirm such limits are never passed.
            wg_size = 2
            while wg_size < max_wg_size:
                seq_size = helpers.bounding_power_of_2(helpers.log2(wg_size) - 1)
                if wg_size * seq_size >= scan_size:
                    break
                wg_size *= 2
        else:
            seq_size = self._seq_size
            wg_size = helpers.bounding_power_of_2(helpers.min_blocks(scan_size, seq_size))
            if wg_size > max_wg_size:
                # `max_wg_size` is an integer and must be converted explicitly:
                # concatenating it as is would raise TypeError instead of ValueError.
                raise ValueError(
                    "Sequential size " + str(seq_size)
                    + " cannot be set because of the maximum workgroup size "
                    + str(max_wg_size))

        wg_totals_size = helpers.min_blocks(scan_size, wg_size * seq_size)
        wg_totals = plan.temp_array((batch_size, wg_totals_size,), output.dtype)

        # With a single part the 'scan' kernel can write straight to the output;
        # otherwise its result goes to a temporary array first.
        if wg_totals_size > 1:
            temp_output = plan.temp_array_like(output)
        else:
            temp_output = output

        # The last part may be incomplete.
        last_part_size = scan_size % (wg_size * seq_size)
        if last_part_size == 0:
            last_part_size = wg_size * seq_size

        plan.kernel_call(
            TEMPLATE.get_def('scan'),
            [temp_output, input_, wg_totals],
            kernel_name="kernel_scan_wg",
            global_size=(batch_size, wg_size * wg_totals_size),
            local_size=(1, wg_size),
            render_kwds=dict(
                slices=(len(batch_shape), len(scan_shape)),
                log_num_banks=helpers.log2(device_params.local_mem_banks),
                exclusive=self._exclusive,
                wg_size=wg_size,
                seq_size=seq_size,
                scan_size=scan_size,
                last_part_size=last_part_size,
                wg_totals_size=wg_totals_size,
                log_wg_size=helpers.log2(wg_size),
                predicate=self._predicate
                ))

        if wg_totals_size > 1:
            sub_scan = Scan(
                wg_totals, self._predicate, axes=(1,), exclusive=True,
                max_work_group_size=self._max_work_group_size)
            scanned_wg_totals = plan.temp_array_like(wg_totals)
            plan.computation_call(sub_scan, scanned_wg_totals, wg_totals)

            plan.kernel_call(
                TEMPLATE.get_def('add_wg_totals'),
                [output, temp_output, scanned_wg_totals],
                kernel_name="kernel_scan_add_wg_totals",
                global_size=(batch_size, scan_size,),
                render_kwds=dict(
                    slices=(len(batch_shape), len(scan_shape),),
                    wg_size=wg_size,
                    seq_size=seq_size,
                    ))

    return plan
def _build_plan(self, plan_factory, device_params, output, input_):
    """
    Creates the computation plan for a scan of ``input_`` written to ``output``.

    With ``self._transpose_to`` set, the plan consists of three nested computations:
    a ``Transpose`` of the input, a ``Scan`` over ``self._axes`` of the transposed
    array, and a ``Transpose`` of the scanned array into ``output``.

    Without it, the scanned axes are assumed to be the innermost ones.  The plan
    calls the 'scan' kernel, which also receives a ``wg_totals`` temporary array of
    shape ``(batch_size, wg_totals_size)``.  When ``wg_totals_size`` exceeds 1, a nested
    exclusive ``Scan`` is applied to that array and the 'add_wg_totals' kernel
    produces ``output`` from the intermediate result and the scanned totals.

    Raises ``ValueError`` when ``self._seq_size`` is given but needs a workgroup
    larger than the maximum workgroup size.

    Returns the created plan.
    """
    plan = plan_factory()

    if self._transpose_to is not None:
        transpose_to = Transpose(input_, axes=self._transpose_to)
        transposed = plan.temp_array_like(transpose_to.parameter.output)

        sub_scan = Scan(transposed,
                        self._predicate,
                        axes=self._axes,
                        exclusive=self._exclusive,
                        max_work_group_size=self._max_work_group_size)
        transposed_scanned = plan.temp_array_like(sub_scan.parameter.output)

        transpose_from = Transpose(transposed_scanned,
                                   axes=self._transpose_from,
                                   output_arr_t=output)

        plan.computation_call(transpose_to, transposed, input_)
        plan.computation_call(sub_scan, transposed_scanned, transposed)
        plan.computation_call(transpose_from, output, transposed_scanned)

    else:
        # assuming that at this point axes are inner and sorted
        scan_ndim = len(self._axes)
        batch_shape = output.shape[:-scan_ndim]
        batch_size = helpers.product(batch_shape)
        scan_shape = output.shape[-scan_ndim:]
        scan_size = helpers.product(scan_shape)

        if self._max_work_group_size is None:
            max_wg_size = device_params.max_work_group_size
        else:
            max_wg_size = self._max_work_group_size

        # The current algorithm requires workgroup size to be a power of 2.
        assert max_wg_size == 2**helpers.log2(max_wg_size)

        # Using algorithm cascading: sequential reduction, and then the parallel one.
        # According to Brent's theorem, the optimal sequential size is O(log(n)).
        # So, ideally we want the minimum `wg_size` for which
        # `wg_size * log2(wg_size) >= scan_size`.
        if self._seq_size is None:
            # NOTE(review): with max_wg_size <= 2 this loop is never entered and
            # `seq_size` is left unassigned; verify that this cannot happen.
            wg_size = 2
            while wg_size < max_wg_size:
                seq_size = helpers.bounding_power_of_2(
                    helpers.log2(wg_size) - 1)
                if wg_size * seq_size >= scan_size:
                    break
                wg_size *= 2
        else:
            seq_size = self._seq_size
            wg_size = helpers.bounding_power_of_2(
                helpers.min_blocks(scan_size, seq_size))
            if wg_size > max_wg_size:
                # Both numbers are converted with str(): appending the integer
                # `max_wg_size` directly would fail with TypeError and hide
                # the intended ValueError.
                raise ValueError(
                    "Sequential size " + str(seq_size) +
                    " cannot be set because of the maximum workgroup size " +
                    str(max_wg_size))

        wg_totals_size = helpers.min_blocks(scan_size, wg_size * seq_size)
        wg_totals = plan.temp_array((
            batch_size,
            wg_totals_size,
        ), output.dtype)

        # An intermediate array is only needed if there is more than one part.
        if wg_totals_size > 1:
            temp_output = plan.temp_array_like(output)
        else:
            temp_output = output

        # Size of the trailing part, which may be smaller than the rest.
        last_part_size = scan_size % (wg_size * seq_size)
        if last_part_size == 0:
            last_part_size = wg_size * seq_size

        plan.kernel_call(
            TEMPLATE.get_def('scan'), [temp_output, input_, wg_totals],
            kernel_name="kernel_scan_wg",
            global_size=(batch_size, wg_size * wg_totals_size),
            local_size=(1, wg_size),
            render_kwds=dict(slices=(len(batch_shape), len(scan_shape)),
                             log_num_banks=helpers.log2(
                                 device_params.local_mem_banks),
                             exclusive=self._exclusive,
                             wg_size=wg_size,
                             seq_size=seq_size,
                             scan_size=scan_size,
                             last_part_size=last_part_size,
                             wg_totals_size=wg_totals_size,
                             log_wg_size=helpers.log2(wg_size),
                             predicate=self._predicate))

        if wg_totals_size > 1:
            sub_scan = Scan(wg_totals,
                            self._predicate,
                            axes=(1, ),
                            exclusive=True,
                            max_work_group_size=self._max_work_group_size)
            scanned_wg_totals = plan.temp_array_like(wg_totals)
            plan.computation_call(sub_scan, scanned_wg_totals, wg_totals)

            plan.kernel_call(TEMPLATE.get_def('add_wg_totals'),
                             [output, temp_output, scanned_wg_totals],
                             kernel_name="kernel_scan_add_wg_totals",
                             global_size=(
                                 batch_size,
                                 scan_size,
                             ),
                             render_kwds=dict(
                                 slices=(
                                     len(batch_shape),
                                     len(scan_shape),
                                 ),
                                 wg_size=wg_size,
                                 seq_size=seq_size,
                             ))

    return plan
def _build_plan(self, plan_factory, device_params, output, alpha, beta):
    """
    Builds the plan that computes ``output`` from ``alpha`` and ``beta``.

    The plan consists of the following stages:

    1. the "compound_click_probability_prepare" kernel fills a temporary array
       shaped like ``alpha`` from ``alpha`` and ``beta``;
    2. the "compound_click_probability_aggregate" kernel turns it into a
       ``(samples, self._max_total_clicks + 1)`` array of products;
    3. a ``Reduce`` with a sum predicate collapses axis 0 of the products;
    4. an ``FFT`` is applied to the reduced array, with an output transformation
       attached that stores only the ``.x`` component of every value.

    The local size of the aggregate kernel starts at the largest allowed value and
    is halved every time the kernel call raises ``OutOfResourcesError``.

    Returns the created plan.
    """
    plan = plan_factory()

    samples, modes = alpha.shape

    for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

    prepared_state = plan.temp_array_like(alpha)

    plan.kernel_call(
        TEMPLATE.get_def("compound_click_probability_prepare"),
        [prepared_state, alpha, beta],
        kernel_name="compound_click_probability_prepare",
        global_size=alpha.shape,
        render_kwds=dict(
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            exp_c=functions.exp(alpha.dtype),
            ))

    # Block size is limited by the amount of available local memory.
    # In some OpenCL implementations the number reported cannot actually be fully used
    # (because it's used by kernel arguments), so we're padding it a little.
    local_mem_size = device_params.local_mem_size
    max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
    block_size = 2**helpers.log2(max_elems)

    # No reason to have block size larger than the number of modes
    block_size = min(block_size, helpers.bounding_power_of_2(modes))

    products_gsize = (
        samples,
        helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
    products = plan.temp_array_like(for_reduction)

    # These depend only on `block_size`, so they are the same for every attempt.
    full_steps = modes // block_size
    remainder_size = modes % block_size

    read_size = min(block_size, device_params.max_work_group_size)
    while read_size > 1:
        render_kwds = dict(
            block_size=block_size,
            read_size=read_size,
            full_steps=full_steps,
            remainder_size=remainder_size,
            output_size=self._max_total_clicks + 1,
            mul_cc=functions.mul(alpha.dtype, alpha.dtype),
            add_cc=functions.add(alpha.dtype, alpha.dtype),
            polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
            modes=self._system.modes,
            max_total_clicks=self._max_total_clicks,
            )

        try:
            plan.kernel_call(
                TEMPLATE.get_def("compound_click_probability_aggregate"),
                [products, prepared_state],
                kernel_name="compound_click_probability_aggregate",
                global_size=products_gsize,
                local_size=(1, read_size,),
                render_kwds=render_kwds)
        except OutOfResourcesError:
            # This local size does not fit; retry with half of it.
            read_size //= 2
        else:
            # The kernel was added to the plan, no more attempts are needed.
            break

    # NOTE(review): if `read_size` is 1 from the start, or every attempt above fails,
    # the aggregate kernel is never added and `products` is passed to the reduction
    # as is. This fall-through is inherited from the previous version; confirm
    # whether an error should be raised here instead.

    reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

    temp = plan.temp_array_like(reduction.parameter.output)

    plan.computation_call(reduction, temp, products)

    fft = FFT(temp)
    # Output transformation: loads a value and stores its `.x` component only.
    real_trf = Transformation(
        [
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
        ],
        """ ${input.ctype} val = ${input.load_same}; ${output.store_same}(val.x); """)
    fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

    # NOTE(review): the trailing positional `True` is presumably the FFT
    # direction flag; confirm against the FFT signature.
    plan.computation_call(fft, output, temp, True)

    return plan