Example #1
def _struct_alignment(alignments):
    """
    Returns the minimum alignment for a structure given alignments for its fields.
    According to the C standard, it is the lowest common multiple of the alignments
    of all of the members of the struct rounded up to the nearest power of two.
    """
    return bounding_power_of_2(_lcm(*alignments))
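Every example on this page ultimately relies on bounding_power_of_2, which is not shown here. The following is a minimal illustrative sketch of what it (and the _lcm used above) presumably compute, not the library's actual implementation:

from functools import reduce
from math import gcd

def bounding_power_of_2(num):
    # Illustrative sketch: smallest power of 2 that is >= num.
    return 1 << max(num - 1, 0).bit_length()

def _lcm(*nums):
    # Illustrative sketch: least common multiple of the arguments.
    return reduce(lambda a, b: a * b // gcd(a, b), nums, 1)

# With field alignments 4 and 12: lcm(4, 12) = 12, rounded up to 16.
assert bounding_power_of_2(_lcm(4, 12)) == 16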
Example #2
def get_fft_kernels(input_shape, dtype, axes, device_params,
                    local_kernel_limit):
    kernels = []

    # Starting from the innermost (most local) transformation, for the sake
    # of neatness; the order does not actually matter.
    for axis in reversed(axes):
        outer_shape = input_shape[:axis]
        fft_size = input_shape[axis]
        inner_shape = input_shape[axis + 1:]

        if fft_size == 1:
            continue

        bounding_size = helpers.bounding_power_of_2(fft_size)

        if bounding_size == fft_size:
            kernels.extend(
                get_fft_1d_kernels(dtype, device_params, outer_shape, fft_size,
                                   inner_shape, local_kernel_limit))
        else:
            # Padded FFT for the chirp-z (Bluestein) transform
            fft_size_padded = 2 * bounding_size
            args = (dtype, device_params, outer_shape, fft_size_padded,
                    inner_shape, local_kernel_limit)

            new_kernels = []
            new_kernels.extend(
                get_fft_1d_kernels(*args, fft_size_real=fft_size))
            new_kernels.extend(
                get_fft_1d_kernels(*args,
                                   reverse_direction=True,
                                   fft_size_real=fft_size))

            # Since the input and output blocks are no longer aligned during
            # pad-in/pad-out, these kernels lose their inplace_possible property.
            new_kernels[0].inplace_possible = False
            new_kernels[-1].inplace_possible = False

            kernels.extend(new_kernels)

    return kernels
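When fft_size is not a power of 2, the branch above pads the transform for the chirp-z (Bluestein) algorithm, whose convolution needs a length of at least 2*n - 1; padding to twice the bounding power of 2 always satisfies this. A purely illustrative check of the size arithmetic, using the bounding_power_of_2 sketch from Example #1:

def bounding_power_of_2(num):  # same sketch as in Example #1
    return 1 << max(num - 1, 0).bit_length()

fft_size = 12                                  # not a power of 2
bounding_size = bounding_power_of_2(fft_size)  # 16 != 12, so the padded branch is taken
fft_size_padded = 2 * bounding_size            # 32
assert fft_size_padded >= 2 * fft_size - 1     # length required by the chirp-z convolution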
Example #3
    def _build_plan_for_wg_size(self, plan_factory, warp_size, max_wg_size,
                                output, input_):

        plan = plan_factory()

        # Using algorithm cascading: sequential reduction, and then the parallel one.
        # According to Brent's theorem, the optimal sequential size is O(log(n)).
        # Setting it to the nearest power of 2 to simplify integer operations.
        max_seq_size = helpers.bounding_power_of_2(helpers.log2(max_wg_size))
        max_reduce_power = max_wg_size * max_seq_size

        if self._transpose_axes is None:
            # normal reduction
            cur_input = input_
        else:
            transpose = Transpose(input_, axes=self._transpose_axes)
            tr_output = plan.temp_array_like(transpose.parameter.output)
            plan.computation_call(transpose, tr_output, input_)

            cur_input = tr_output

        axis_start = len(output.shape)
        axis_end = len(input_.shape) - 1

        input_slices = (axis_start, axis_end - axis_start + 1)

        part_size = helpers.product(cur_input.shape[axis_start:])
        final_size = helpers.product(cur_input.shape[:axis_start])

        while part_size > 1:

            if part_size > max_reduce_power:
                seq_size = max_seq_size
                block_size = max_wg_size
                blocks_per_part = helpers.min_blocks(part_size,
                                                     block_size * seq_size)
                cur_output = plan.temp_array((final_size, blocks_per_part),
                                             input_.dtype)
                output_slices = (1, 1)
            else:
                if part_size > max_wg_size:
                    seq_size = helpers.min_blocks(part_size, max_wg_size)
                    block_size = max_wg_size
                else:
                    seq_size = 1
                    block_size = helpers.bounding_power_of_2(part_size)
                blocks_per_part = 1
                cur_output = output
                output_slices = (len(cur_output.shape), 0)

            if part_size % (block_size * seq_size) != 0:
                last_block_size = part_size % (block_size * seq_size)
            else:
                last_block_size = block_size * seq_size

            render_kwds = dict(seq_size=seq_size,
                               blocks_per_part=blocks_per_part,
                               last_block_size=last_block_size,
                               log2=helpers.log2,
                               block_size=block_size,
                               warp_size=warp_size,
                               empty=self._empty,
                               operation=self._operation,
                               input_slices=input_slices,
                               output_slices=output_slices)

            plan.kernel_call(TEMPLATE.get_def('reduce'),
                             [cur_output, cur_input],
                             global_size=(final_size,
                                          blocks_per_part * block_size),
                             local_size=(1, block_size),
                             render_kwds=render_kwds)

            part_size = blocks_per_part
            cur_input = cur_output
            input_slices = output_slices

        return plan
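The cascading parameters at the top of this example are easy to verify by hand. An illustrative trace for a typical workgroup size, assuming helpers.log2 is a floor logarithm and reusing the bounding_power_of_2 sketch from Example #1:

def log2(num):
    # Illustrative sketch of helpers.log2: floor of the base-2 logarithm.
    return num.bit_length() - 1

def bounding_power_of_2(num):  # same sketch as in Example #1
    return 1 << max(num - 1, 0).bit_length()

max_wg_size = 256
max_seq_size = bounding_power_of_2(log2(max_wg_size))  # log2(256) = 8, already a power of 2
max_reduce_power = max_wg_size * max_seq_size           # one pass reduces up to 2048 elements
assert (max_seq_size, max_reduce_power) == (8, 2048)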
Example #4
    def _build_plan(self, plan_factory, device_params, output, input_):
        plan = plan_factory()

        if self._transpose_to is not None:

            transpose_to = Transpose(input_, axes=self._transpose_to)
            transposed = plan.temp_array_like(transpose_to.parameter.output)

            sub_scan = Scan(
                transposed, self._predicate, axes=self._axes, exclusive=self._exclusive,
                max_work_group_size=self._max_work_group_size)
            transposed_scanned = plan.temp_array_like(sub_scan.parameter.output)

            transpose_from = Transpose(
                transposed_scanned, axes=self._transpose_from, output_arr_t=output)

            plan.computation_call(transpose_to, transposed, input_)
            plan.computation_call(sub_scan, transposed_scanned, transposed)
            plan.computation_call(transpose_from, output, transposed_scanned)

        else:

            scan_ndim = len(self._axes) # assuming that at this point axes are inner and sorted
            batch_shape = output.shape[:-scan_ndim]
            batch_size = helpers.product(batch_shape)
            scan_shape = output.shape[-scan_ndim:]
            scan_size = helpers.product(scan_shape)

            if self._max_work_group_size is None:
                max_wg_size = device_params.max_work_group_size
            else:
                max_wg_size = self._max_work_group_size

            # The current algorithm requires workgroup size to be a power of 2.
            assert max_wg_size == 2**helpers.log2(max_wg_size)

            # Using algorithm cascading: sequential reduction, and then the parallel one.
            # According to Brent's theorem, the optimal sequential size is O(log(n)).
            # So, ideally we want the minimum `wg_size` for which
            # `wg_size * log2(wg_size) >= scan_size`.
            if self._seq_size is None:
                wg_size = 2
                while wg_size < max_wg_size:
                    seq_size = helpers.bounding_power_of_2(helpers.log2(wg_size) - 1)
                    if wg_size * seq_size >= scan_size:
                        break
                    wg_size *= 2
            else:
                seq_size = self._seq_size
                wg_size = helpers.bounding_power_of_2(helpers.min_blocks(scan_size, seq_size))
                if wg_size > max_wg_size:
                    raise ValueError(
                        "Sequential size " + str(seq_size)
                        + " cannot be set because of the maximum workgroup size "
                        + str(max_wg_size))

            wg_totals_size = helpers.min_blocks(scan_size, wg_size * seq_size)
            wg_totals = plan.temp_array((batch_size, wg_totals_size,), output.dtype)

            if wg_totals_size > 1:
                temp_output = plan.temp_array_like(output)
            else:
                temp_output = output

            last_part_size = scan_size % (wg_size * seq_size)
            if last_part_size == 0:
                last_part_size = wg_size * seq_size

            plan.kernel_call(
                TEMPLATE.get_def('scan'),
                [temp_output, input_, wg_totals],
                kernel_name="kernel_scan_wg",
                global_size=(batch_size, wg_size * wg_totals_size),
                local_size=(1, wg_size),
                render_kwds=dict(
                    slices=(len(batch_shape), len(scan_shape)),
                    log_num_banks=helpers.log2(device_params.local_mem_banks),
                    exclusive=self._exclusive,
                    wg_size=wg_size,
                    seq_size=seq_size,
                    scan_size=scan_size,
                    last_part_size=last_part_size,
                    wg_totals_size=wg_totals_size,
                    log_wg_size=helpers.log2(wg_size),
                    predicate=self._predicate))

            if wg_totals_size > 1:
                sub_scan = Scan(
                    wg_totals, self._predicate, axes=(1,), exclusive=True,
                    max_work_group_size=self._max_work_group_size)
                scanned_wg_totals = plan.temp_array_like(wg_totals)
                plan.computation_call(sub_scan, scanned_wg_totals, wg_totals)

                plan.kernel_call(
                    TEMPLATE.get_def('add_wg_totals'),
                    [output, temp_output, scanned_wg_totals],
                    kernel_name="kernel_scan_add_wg_totals",
                    global_size=(batch_size, scan_size),
                    render_kwds=dict(
                        slices=(len(batch_shape), len(scan_shape)),
                        wg_size=wg_size,
                        seq_size=seq_size))

        return plan
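The wg_size search in the self._seq_size is None branch above looks for the smallest workgroup size whose cascaded capacity wg_size * seq_size covers scan_size. An illustrative standalone trace with hypothetical sizes, reusing the helper sketches from the earlier examples:

def log2(num):  # floor log2, as sketched in Example #3
    return num.bit_length() - 1

def bounding_power_of_2(num):  # same sketch as in Example #1
    return 1 << max(num - 1, 0).bit_length()

scan_size, max_wg_size = 1000, 1024
wg_size = 2
while wg_size < max_wg_size:
    seq_size = bounding_power_of_2(log2(wg_size) - 1)
    if wg_size * seq_size >= scan_size:
        break
    wg_size *= 2

# Stops at wg_size = 128: seq_size = bounding_power_of_2(6) = 8, and 128 * 8 = 1024 >= 1000.
assert (wg_size, seq_size) == (128, 8)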
Example #5
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        samples, modes = alpha.shape

        for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # The block size is limited by the amount of available local memory.
        # In some OpenCL implementations the reported amount cannot actually be
        # fully used (some of it is taken up by kernel arguments), so we leave
        # a small margin.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (
            samples,
            helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        while read_size > 1:

            full_steps = modes // block_size
            remainder_size = modes % block_size

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=self._max_total_clicks + 1,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))

            except OutOfResourcesError:
                # The kernel did not fit into the device's resources;
                # retry with a smaller workgroup.
                read_size //= 2
                continue

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation(
            [
                Parameter('output', Annotation(output, 'o')),
                Parameter('input', Annotation(temp, 'i')),
            ],
            """
            ${input.ctype} val = ${input.load_same};
            ${output.store_same}(val.x);
            """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        plan.computation_call(fft, output, temp, True)

        return plan
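Note the two different power-of-2 roundings in the block size computation above: 2**helpers.log2(max_elems) rounds down, so the block still fits into local memory, while bounding_power_of_2(modes) rounds up. An illustrative run with hypothetical device and problem sizes, reusing the helper sketches from the earlier examples:

def log2(num):  # floor log2, as sketched in Example #3
    return num.bit_length() - 1

def bounding_power_of_2(num):  # same sketch as in Example #1
    return 1 << max(num - 1, 0).bit_length()

local_mem_size, itemsize, modes = 48 * 1024, 16, 100      # hypothetical values
max_elems = (local_mem_size - 256) // itemsize            # 3056 items fit after the margin
block_size = 2 ** log2(max_elems)                         # rounded down: 2048
block_size = min(block_size, bounding_power_of_2(modes))  # rounded up: 128
assert block_size == 128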