Example #1
    def _add_transpose(self, plan, device_params,
            mem_out, mem_in, batch_shape, height_shape, width_shape):

        bso = self._block_width_override
        block_width = device_params.local_mem_banks if bso is None else bso

        if block_width ** 2 > device_params.max_work_group_size:
            # On non-CPU devices this fallback block width may affect performance
            block_width = int(numpy.sqrt(device_params.max_work_group_size))

        input_height = helpers.product(height_shape)
        input_width = helpers.product(width_shape)
        batch = helpers.product(batch_shape)

        blocks_per_matrix = helpers.min_blocks(input_height, block_width)
        grid_width = helpers.min_blocks(input_width, block_width)

        render_kwds = dict(
            input_width=input_width, input_height=input_height, batch=batch,
            block_width=block_width,
            grid_width=grid_width,
            blocks_per_matrix=blocks_per_matrix,
            input_slices=[len(batch_shape), len(height_shape), len(width_shape)],
            output_slices=[len(batch_shape), len(width_shape), len(height_shape)])

        plan.kernel_call(
            TEMPLATE.get_def('transpose'), [mem_out, mem_in],
            kernel_name="kernel_transpose",
            global_size=(batch, blocks_per_matrix * block_width, grid_width * block_width),
            local_size=(1, block_width, block_width),
            render_kwds=render_kwds)
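All of the snippets in this collection lean on two small helpers from reikna.helpers. A minimal sketch of their behavior, inferred from how they are used here (the real implementations may differ in details):

    def min_blocks(length, block):
        # The minimal number of blocks of size `block` needed to cover
        # `length` elements, i.e. ceiling division.
        return (length - 1) // block + 1

    def product(seq):
        # The product of all elements of an iterable (1 for an empty one).
        result = 1
        for elem in seq:
            result *= elem
        return result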
Example #2
    def _add_transpose(self, plan, device_params,
            mem_out, mem_in, batch_shape, height_shape, width_shape):

        bso = self._block_width_override
        block_width = device_params.local_mem_banks if bso is None else bso

        if block_width ** 2 > device_params.max_work_group_size:
            # On non-CPU devices this fallback block width may affect performance
            block_width = int(numpy.sqrt(device_params.max_work_group_size))

        input_height = helpers.product(height_shape)
        input_width = helpers.product(width_shape)
        batch = helpers.product(batch_shape)

        blocks_per_matrix = helpers.min_blocks(input_height, block_width)
        grid_width = helpers.min_blocks(input_width, block_width)

        render_kwds = dict(
            input_width=input_width, input_height=input_height, batch=batch,
            block_width=block_width,
            grid_width=grid_width,
            blocks_per_matrix=blocks_per_matrix,
            input_slices=[len(batch_shape), len(height_shape), len(width_shape)],
            output_slices=[len(batch_shape), len(width_shape), len(height_shape)])

        plan.kernel_call(
            TEMPLATE.get_def('transpose'), [mem_out, mem_in],
            global_size=(batch, blocks_per_matrix * block_width, grid_width * block_width),
            local_size=(1, block_width, block_width),
            render_kwds=render_kwds)
Example #3
    def _build_plan(self, plan_factory, device_params, output, matrix_a,
                    matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2**n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width**2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(batched_a=(a_batch != 1),
                               batched_b=(b_batch != 1),
                               transposed_a=self._transposed_a,
                               transposed_b=self._transposed_b,
                               num_steps=num_steps,
                               a_slices=(len(matrix_a.shape) - 2, 1, 1),
                               b_slices=(len(matrix_b.shape) - 2, 1, 1),
                               output_slices=(len(output.shape) - 2, 1, 1),
                               block_width=block_width,
                               mul=functions.mul(matrix_a.dtype,
                                                 matrix_b.dtype,
                                                 out_dtype=output.dtype))

            try:
                plan.kernel_call(TEMPLATE.get_def('matrixmul'),
                                 [output, matrix_a, matrix_b],
                                 kernel_name="kernel_matrixmul",
                                 global_size=(batch, a_blocks * block_width,
                                              b_blocks * block_width),
                                 local_size=(1, block_width, block_width),
                                 render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError(
            "Could not find suitable call parameters for the kernel")
Example #4
    def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
        bwo = self._block_width_override

        if bwo is not None:
            block_widths = [bwo]
        else:
            nbanks = device_params.local_mem_banks
            block_widths = [2 ** n for n in range(helpers.log2(nbanks), -1, -1)]

        a_batch = helpers.product(matrix_a.shape[:-2])
        b_batch = helpers.product(matrix_b.shape[:-2])
        batch = max(a_batch, b_batch)

        for block_width in block_widths:

            plan = plan_factory()

            if block_width ** 2 > device_params.max_work_group_size:
                continue

            num_steps = helpers.min_blocks(self._convolution_size, block_width)
            a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
            b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

            render_kwds = dict(
                batched_a=(a_batch != 1),
                batched_b=(b_batch != 1),
                transposed_a=self._transposed_a,
                transposed_b=self._transposed_b,
                num_steps=num_steps,
                a_slices=(len(matrix_a.shape) - 2, 1, 1),
                b_slices=(len(matrix_b.shape) - 2, 1, 1),
                output_slices=(len(output.shape) - 2, 1, 1),
                block_width=block_width,
                mul=functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype))

            try:
                plan.kernel_call(
                    TEMPLATE.get_def('matrixmul'),
                    [output, matrix_a, matrix_b],
                    kernel_name="kernel_matrixmul",
                    global_size=(
                        batch,
                        a_blocks * block_width,
                        b_blocks * block_width),
                    local_size=(1, block_width, block_width),
                    render_kwds=render_kwds)
            except OutOfResourcesError:
                continue

            return plan

        raise ValueError("Could not find suitable call parameters for the kernel")
Example #5
def find_bounding_shape(virtual_size, available_shape):
    """
    Finds a tuple of the same length as ``available_shape``, with every element
    not greater than the corresponding element of ``available_shape``,
    and product not lower than ``virtual_size``.
    """
    assert virtual_size <= product(available_shape)

    free_size = virtual_size
    free_dims = set(range(len(available_shape)))
    bounding_shape = [None] * len(available_shape)

    while len(free_dims) > 0:
        guess = ceiling_root(free_size, len(free_dims))
        for fdim in free_dims:
            bounding_shape[fdim] = guess

        for fdim in free_dims:
            if bounding_shape[fdim] > available_shape[fdim]:
                bounding_shape[fdim] = available_shape[fdim]
                free_dims.remove(fdim)
                free_size = min_blocks(free_size, bounding_shape[fdim])
                break
        else:
            return tuple(bounding_shape)

    return tuple(available_shape)
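A worked run with made-up numbers, assuming ceiling_root(n, k) returns the smallest integer x with x**k >= n:

    # find_bounding_shape(100, (4, 8, 8)):
    #   guess = ceiling_root(100, 3) = 5   -> [5, 5, 5]
    #   dim 0 exceeds its limit; clamp it to 4, free_size = min_blocks(100, 4) = 25
    #   guess = ceiling_root(25, 2) = 5    -> [4, 5, 5], all within bounds
    # returns (4, 5, 5), whose product 100 covers virtual_size = 100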
Example #6
    def predict_group_ids(self, dim):
        global_len = self.global_size[dim]
        local_len = self.local_size[dim]
        repetitions = min_blocks(global_len, local_len)

        pattern = numpy.repeat(numpy.arange(repetitions), local_len)[:global_len]
        return self._tile_pattern(pattern, dim, self.global_size)
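For instance, along a dimension with global size 10 and local size 4, repetitions = min_blocks(10, 4) = 3, and the predicted group ids follow this pattern:

    import numpy
    numpy.repeat(numpy.arange(3), 4)[:10]
    # -> array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2])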
Example #7
    def try_create(cls, global_size, local_size, max_num_groups, max_work_item_sizes):
        """
        This method is used to filter working combinations of parameters
        from the cartesian product of all possible ones.
        Returns ``None`` if the parameters are not compatible.
        """
        if len(max_num_groups) != len(max_work_item_sizes):
            return None

        if local_size is not None:
            if len(local_size) > len(global_size):
                return None
            else:
                # we need local size and global size of the same length
                local_size = local_size + (1,) * (len(global_size) - len(local_size))

            if product(local_size) > product(max_work_item_sizes):
                return None

            bounding_global_size = [
                ls * min_blocks(gs, ls) for gs, ls
                in zip(global_size, local_size)]

            if product(bounding_global_size) > product(max_num_groups):
                return None

        else:
            if product(global_size) > product(max_num_groups):
                return None

        return cls(global_size, local_size, max_num_groups, max_work_item_sizes)
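A small illustration of the checks above with made-up numbers:

    # global_size = (100, 100), local_size = (16,)
    # local_size is padded to (16, 1) to match the length of global_size;
    # bounding_global_size = [16 * min_blocks(100, 16), 1 * min_blocks(100, 1)]
    #                      = [16 * 7, 1 * 100] = [112, 100]
    # The combination is accepted only if 112 * 100 <= product(max_num_groups).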
Example #8
File: fft.py Project: ringw/reikna
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)
        fft_size = self._fft_size

        radix_array = get_radix_array(fft_size)
        if fft_size // radix_array[0] > max_local_size:
            radix_array = get_radix_array(fft_size, use_max_radix=True)

        threads_per_xform = fft_size // radix_array[0]
        local_size = max(64, threads_per_xform)
        if local_size > max_local_size:
            raise OutOfResourcesError
        xforms_per_workgroup = local_size // threads_per_xform
        workgroups_num = helpers.min_blocks(self._outer_batch,
                                            xforms_per_workgroup)

        lmem_size = get_local_memory_size(fft_size, radix_array,
                                          threads_per_xform,
                                          xforms_per_workgroup,
                                          kwds['local_mem_banks'],
                                          kwds['min_mem_coalesce_width'])

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(
            dict(fft_size=fft_size,
                 fft_size_real=self._fft_size_real,
                 radix_arr=radix_array,
                 lmem_size=lmem_size,
                 threads_per_xform=threads_per_xform,
                 xforms_per_workgroup=xforms_per_workgroup,
                 outer_batch=self._outer_batch))

        return local_size * workgroups_num, local_size, kwds
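Plugging in concrete numbers (see get_radix_array below, which maps 1024 to [16, 16, 4]):

    # fft_size = 1024:
    #   threads_per_xform    = 1024 // 16 = 64
    #   local_size           = max(64, 64) = 64
    #   xforms_per_workgroup = 64 // 64 = 1
    #   workgroups_num       = min_blocks(outer_batch, 1) = outer_batch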
Example #9
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        batch_size = helpers.product(output.shape[:-1])
        blocks_num = helpers.min_blocks(batch_size, self._transforms_per_block)

        cdata_arr = self._transform.cdata_inv if self._inverse else self._transform.cdata_fw
        if self._transform.use_constant_memory:
            cdata = plan.constant_array(cdata_arr)
        else:
            cdata = plan.persistent_array(cdata_arr)

        plan.kernel_call(
            TEMPLATE.get_def('standalone_transform'), [output, input_, cdata],
            global_size=(blocks_num, self._transform.threads_per_transform *
                         self._transforms_per_block),
            local_size=(1, self._transform.threads_per_transform *
                        self._transforms_per_block),
            render_kwds=dict(inverse=self._inverse,
                             i32_conversion=self._i32_conversion,
                             kernel_repetitions=self._kernel_repetitions,
                             transform=self._transform,
                             transforms_per_block=self._transforms_per_block,
                             batch_size=batch_size,
                             blocks_num=blocks_num,
                             slices=(len(output.shape) - 1, 1)))

        return plan
Example #10
File: fft.py Project: ringw/reikna
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)

        radix_arr, radix1_arr, radix2_arr = get_global_radix_info(
            self._fft_size)

        radix = radix_arr[self._pass_num]
        radix1 = radix1_arr[self._pass_num]
        radix2 = radix2_arr[self._pass_num]

        stride_out = self._inner_batch * helpers.product(
            radix_arr[:self._pass_num])
        stride = stride_out * radix
        stride_in = stride_out * helpers.product(
            radix_arr[self._pass_num + 1:])

        threads_per_xform = radix2

        coalesce_width = kwds['min_mem_coalesce_width']
        local_batch = max_local_size if radix2 == 1 else coalesce_width
        local_batch = min(local_batch, stride_in)
        local_size = min(local_batch * threads_per_xform, max_local_size)
        local_batch = local_size // threads_per_xform

        workgroups_num = helpers.min_blocks(stride_in,
                                            local_batch) * self._outer_batch

        if radix2 == 1:
            lmem_size = 0
        else:
            if stride_out == 1:
                lmem_size = (radix + 1) * local_batch
            else:
                lmem_size = local_size * radix1

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(self._constant_kwds)
        kwds.update(
            dict(fft_size=self._fft_size,
                 curr_size=self._curr_size,
                 fft_size_real=self._fft_size_real,
                 pass_num=self._pass_num,
                 lmem_size=lmem_size,
                 local_batch=local_batch,
                 local_size=local_size,
                 inner_batch=self._inner_batch,
                 radix_arr=radix_arr,
                 radix1_arr=radix1_arr,
                 radix2_arr=radix2_arr,
                 radix1=radix1,
                 radix2=radix2,
                 radix=radix,
                 stride_in=stride_in,
                 stride_out=stride_out,
                 stride=stride,
                 last_pass=self._last_pass))

        return workgroups_num * local_size, local_size, kwds
Example #11
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)
        fft_size = self._fft_size

        radix_array = get_radix_array(fft_size)
        if fft_size // radix_array[0] > max_local_size:
            radix_array = get_radix_array(fft_size, use_max_radix=True)

        threads_per_xform = fft_size // radix_array[0]
        local_size = max(64, threads_per_xform)
        if local_size > max_local_size:
            raise OutOfResourcesError
        xforms_per_workgroup = local_size // threads_per_xform
        workgroups_num = helpers.min_blocks(self._outer_batch, xforms_per_workgroup)

        lmem_size = get_local_memory_size(
            fft_size, radix_array, threads_per_xform, xforms_per_workgroup,
            kwds['local_mem_banks'], kwds['min_mem_coalesce_width'])

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(dict(
            fft_size=fft_size, fft_size_real=self._fft_size_real, radix_arr=radix_array,
            lmem_size=lmem_size, threads_per_xform=threads_per_xform,
            xforms_per_workgroup=xforms_per_workgroup,
            outer_batch=self._outer_batch))

        return local_size * workgroups_num, local_size, kwds
Example #12
    def predict_local_ids(self, dim):
        global_len = self.global_size[dim]
        local_len = self.local_size[dim]
        repetitions = min_blocks(global_len, local_len)

        pattern = numpy.tile(numpy.arange(local_len), repetitions)[:global_len]
        return self._tile_pattern(pattern, dim, self.global_size)
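This is the complement of predict_group_ids above; for global size 10 and local size 4 the local ids cycle:

    import numpy
    numpy.tile(numpy.arange(4), 3)[:10]
    # -> array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1])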
Example #13
def find_local_size(global_size, flat_local_size, threshold=0.05):
    """
    Returns a tuple of the same length as ``global_size``,
    with the product equal to ``flat_local_size``,
    and with a minimal difference between ``product(global_size)``
    and ``product(ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))``
    (i.e. tries to minimize the number of empty threads).
    """
    flat_global_size = product(global_size)
    if flat_local_size >= flat_global_size:
        return global_size

    threads_num = flat_global_size

    best_ratio = None
    best_local_size = None

    for local_size in get_decompositions(flat_local_size, len(global_size)):
        bounding_global_size = tuple(
            ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
        empty_threads = product(bounding_global_size) - threads_num
        ratio = float(empty_threads) / threads_num

        # Stopping iteration early, because there may be a lot of elements to iterate over,
        # and we do not need the perfect solution.
        if ratio < threshold:
            return local_size

        if best_ratio is None or ratio < best_ratio:
            best_ratio = ratio
            best_local_size = local_size

    return best_local_size
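A sketch of the metric with made-up numbers:

    # global_size = (13, 15), flat_local_size = 8, candidate local_size = (2, 4):
    #   bounding_global_size = (2 * min_blocks(13, 2), 4 * min_blocks(15, 4))
    #                        = (14, 16)
    #   empty_threads = 14 * 16 - 13 * 15 = 29
    #   ratio = 29 / 195 ~ 0.149, above the default threshold of 0.05,
    #   so the search continues to other decompositions.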
Example #14
    def try_create(cls, global_size, local_size, max_num_groups,
                   max_work_item_sizes):
        """
        This method is used to filter working combinations of parameters
        from the cartesian product of all possible ones.
        Returns ``None`` if the parameters are not compatible.
        """
        if len(max_num_groups) != len(max_work_item_sizes):
            return None

        if local_size is not None:
            if len(local_size) > len(global_size):
                return None
            else:
                # we need local size and global size of the same length
                local_size = local_size + (1, ) * (len(global_size) -
                                                   len(local_size))

            if product(local_size) > product(max_work_item_sizes):
                return None

            bounding_global_size = [
                ls * min_blocks(gs, ls)
                for gs, ls in zip(global_size, local_size)
            ]

            if product(bounding_global_size) > product(max_num_groups):
                return None

        else:
            if product(global_size) > product(max_num_groups):
                return None

        return cls(global_size, local_size, max_num_groups,
                   max_work_item_sizes)
Example #15
    def __init__(self, global_size, local_size, max_num_groups, max_work_item_sizes):
        self.global_size = global_size
        self.local_size = local_size
        if local_size is not None:
            self.grid_size = tuple(min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))

        self.max_num_groups = max_num_groups
        self.max_work_item_sizes = max_work_item_sizes
Example #16
def get_radix_array(size, use_max_radix=False):
    """
    For any ``size``, this function decomposes ``size`` into factors for loacal memory tranpose
    based fft. Factors (radices) are sorted such that the first one (radix_array[0])
    is the largest. This base radix determines the number of registers used by each
    work item and product of remaining radices determine the size of work group needed.
    To make things concrete with and example, suppose ``size`` = 1024. It is decomposed into
    1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
    needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
    16 ffts (64 work items working in parallel) following by transpose using local
    memory followed by again 64 length 16 ffts followed by transpose using local memory
    followed by 256 length 4 ffts. For the last step since with size of work group is
    64 and each work item can array for 16 values, 64 work items can compute 256 length
    4 ffts by each work item computing 4 length 4 ffts.
    Similarly for ``size`` = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
    iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
    by transpose using local memory, followed by 256 length 8 in-register ffts, followed
    by transpose using local memory, followed by 256 length 8 in-register ffts, followed
    by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
    for the last step, each work item computes two length 4 in-register ffts and thus
    256 work items are needed to compute all 512 ffts.
    For ``size`` = 32 = 8 x 4, 4 work items first compute 4 in-register
    lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
    length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
    can compute 8 length 4 ffts. However if work group size of say 64 is choosen,
    each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
    Users can play with these parameters to figure what gives best performance on
    their particular device i.e. some device have less register space thus using
    smaller base radix can avoid spilling ... some has small local memory thus
    using smaller work group size may be required etc
    """
    assert size == 2 ** helpers.log2(size)

    if use_max_radix:
        radix = min(size, MAX_RADIX)
        radix_array = []
        while size > radix:
            radix_array.append(radix)
            size //= radix
        radix_array.append(size)
        return radix_array
    else:
        arrays = {
            2: [2], 4: [4], 8: [8],
            16: [8, 2], 32: [8, 4], 64: [8, 8],
            128: [8, 4, 4],
            256: [4, 4, 4, 4],
            512: [8, 8, 8],
            1024: [16, 16, 4],
            2048: [8, 8, 8, 4]
        }
        if size in arrays:
            return arrays[size]
        else:
            # Naive algorithm, can be improved.
            lsize = helpers.log2(size)
            num_elems = helpers.min_blocks(lsize, 4)
            return [16] * (num_elems - 1) + [16 if lsize % 4 == 0 else 2 ** (lsize % 4)]
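Sample outputs (assuming MAX_RADIX == 16, which matches the lookup table above):

    get_radix_array(1024)   # [16, 16, 4], from the lookup table
    get_radix_array(8192)   # lsize = 13, min_blocks(13, 4) = 4
                            # -> [16, 16, 16, 2], whose product is 8192
    get_radix_array(8192, use_max_radix=True)   # [16, 16, 16, 2] as well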
Example #17
def align(dtype):
    """
    Returns a new struct dtype with the field offsets changed to the ones a compiler would use
    (without being given any explicit alignment qualifiers).
    Ignores all existing explicit itemsizes and offsets.
    """
    dtype = normalize_type(dtype)

    if len(dtype.shape) > 0:
        return numpy.dtype((align(dtype.base), dtype.shape))

    if dtype.names is None:
        return dtype

    # Align the nested fields
    adjusted_fields = [align(dtype.fields[name][0]) for name in dtype.names]

    # Get base alignments for the nested fields
    alignments = [
        _find_alignments(field_dtype)[0] for field_dtype in adjusted_fields
    ]

    # Build offsets for the structure using a procedure
    # similar to the one a compiler would use
    offsets = [0]
    for name, prev_field_dtype, alignment in zip(dtype.names[1:],
                                                 adjusted_fields[:-1],
                                                 alignments[1:]):
        prev_end = offsets[-1] + prev_field_dtype.itemsize
        offsets.append(min_blocks(prev_end, alignment) * alignment)

    # Find the total itemsize.
    # According to the standard, it must be a multiple of the minimal alignment.
    struct_alignment = _struct_alignment(alignments)
    min_itemsize = offsets[-1] + adjusted_fields[-1].itemsize
    itemsize = min_blocks(min_itemsize, struct_alignment) * struct_alignment

    return numpy.dtype(
        dict(names=dtype.names,
             formats=adjusted_fields,
             offsets=offsets,
             itemsize=itemsize,
             aligned=True))
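As an illustration, for a hypothetical struct of an int8 followed by an int32:

    import numpy

    dtype = numpy.dtype([('a', numpy.int8), ('b', numpy.int32)])
    # align(dtype) places 'b' at offset min_blocks(1, 4) * 4 = 4 and pads
    # the itemsize to min_blocks(5, 4) * 4 = 8, matching the C layout of
    # struct { char a; int b; }.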
Example #18
def align(dtype):
    """
    Returns a new struct dtype with the field offsets changed to the ones a compiler would use
    (without being given any explicit alignment qualifiers).
    Ignores all existing explicit itemsizes and offsets.
    """
    dtype = normalize_type(dtype)

    if len(dtype.shape) > 0:
        return numpy.dtype((align(dtype.base), dtype.shape))

    if dtype.names is None:
        return dtype

    # Align the nested fields
    adjusted_fields = [
        align(dtype.fields[name][0])
        for name in dtype.names]

    # Get base alignments for the nested fields
    alignments = [_find_alignments(field_dtype)[0] for field_dtype in adjusted_fields]

    # Build offsets for the structure using a procedure
    # similar to the one a compiler would use
    offsets = [0]
    for name, prev_field_dtype, alignment in zip(
            dtype.names[1:], adjusted_fields[:-1], alignments[1:]):
        prev_end = offsets[-1] + prev_field_dtype.itemsize
        offsets.append(min_blocks(prev_end, alignment) * alignment)

    # Find the total itemsize.
    # According to the standard, it must be a multiple of the minimal alignment.
    struct_alignment = _struct_alignment(alignments)
    min_itemsize = offsets[-1] + adjusted_fields[-1].itemsize
    itemsize = min_blocks(min_itemsize, struct_alignment) * struct_alignment

    return numpy.dtype(dict(
        names=dtype.names,
        formats=adjusted_fields,
        offsets=offsets,
        itemsize=itemsize,
        aligned=True))
Example #19
    def prepare_for(self, max_local_size):
        kwds = dict(self._constant_kwds)

        radix_arr, radix1_arr, radix2_arr = get_global_radix_info(self._fft_size)

        radix = radix_arr[self._pass_num]
        radix1 = radix1_arr[self._pass_num]
        radix2 = radix2_arr[self._pass_num]

        stride_out = self._inner_batch * helpers.product(radix_arr[:self._pass_num])
        stride = stride_out * radix
        stride_in = stride_out * helpers.product(radix_arr[self._pass_num+1:])

        threads_per_xform = radix2

        coalesce_width = kwds['min_mem_coalesce_width']
        local_batch = max_local_size if radix2 == 1 else coalesce_width
        local_batch = min(local_batch, stride_in)
        local_size = min(local_batch * threads_per_xform, max_local_size)
        local_batch = local_size // threads_per_xform

        workgroups_num = helpers.min_blocks(stride_in, local_batch) * self._outer_batch

        if radix2 == 1:
            lmem_size = 0
        else:
            if stride_out == 1:
                lmem_size = (radix + 1) * local_batch
            else:
                lmem_size = local_size * radix1

        if lmem_size * self._itemsize // 2 > self._local_mem_size:
            raise OutOfResourcesError

        kwds.update(self._constant_kwds)
        kwds.update(dict(
            fft_size=self._fft_size, curr_size=self._curr_size, fft_size_real=self._fft_size_real,
            pass_num=self._pass_num,
            lmem_size=lmem_size, local_batch=local_batch, local_size=local_size,
            inner_batch=self._inner_batch,
            radix_arr=radix_arr, radix1_arr=radix1_arr, radix2_arr=radix2_arr,
            radix1=radix1, radix2=radix2, radix=radix,
            stride_in=stride_in, stride_out=stride_out, stride=stride,
            last_pass=self._last_pass))

        return workgroups_num * local_size, local_size, kwds
Example #20
    def _build_plan(self, plan_factory, device_params, output, input_):

        plan = plan_factory()

        batch_size = helpers.product(output.shape[:-1])

        cdata_arr = self._transform.cdata_inv if self._inverse else self._transform.cdata_fw
        if self._transform.use_constant_memory:
            cdata = plan.constant_array(cdata_arr)
        else:
            cdata = plan.persistent_array(cdata_arr)

        tpb = self._transforms_per_block
        while tpb >= 1:
            blocks_num = helpers.min_blocks(batch_size, tpb)
            try:
                plan.kernel_call(
                    TEMPLATE.get_def('standalone_transform'),
                        [output, input_, cdata],
                        kernel_name="standalone_transform",
                        global_size=(
                            blocks_num,
                            self._transform.threads_per_transform * tpb),
                        local_size=(
                            1,
                            self._transform.threads_per_transform * tpb),
                        render_kwds=dict(
                            inverse=self._inverse,
                            i32_conversion=self._i32_conversion,
                            kernel_repetitions=self._kernel_repetitions,
                            transform=self._transform,
                            transforms_per_block=tpb,
                            batch_size=batch_size,
                            blocks_num=blocks_num,
                            slices=(len(output.shape) - 1, 1)))
                break
            except OutOfResourcesError:
                tpb -= 1
        else:
            raise Exception(
                "The selected device does not have enough resources for the selected transform")

        return plan
Example #21
    def __init__(self, device_params, virtual_global_size,
            virtual_local_size=None, max_local_size=None):

        virtual_global_size = wrap_in_tuple(virtual_global_size)
        if virtual_local_size is not None:
            virtual_local_size = wrap_in_tuple(virtual_local_size)
            if len(virtual_local_size) != len(virtual_global_size):
                raise ValueError("Global size and local size must have the same dimensions")

        # Since the device uses column-major ordering of sizes, while we get
        # row-major ordered shapes, we temporarily invert our shapes
        # to facilitate internal handling.
        virtual_global_size = tuple(reversed(virtual_global_size))
        if virtual_local_size is not None:
            virtual_local_size = tuple(reversed(virtual_local_size))

        # Restrict local sizes using the provided explicit limit
        if max_local_size is not None:
            max_work_group_size = min(
                max_local_size,
                device_params.max_work_group_size,
                product(device_params.max_work_item_sizes))
            max_work_item_sizes = [
                min(max_local_size, mwis) for mwis in device_params.max_work_item_sizes]
        else:
            # Assuming:
            # 1) max_work_group_size <= product(max_work_item_sizes)
            # 2) max(max_work_item_sizes) <= max_work_group_size
            max_work_group_size = device_params.max_work_group_size
            max_work_item_sizes = device_params.max_work_item_sizes

        if virtual_local_size is None:
            # FIXME: we can obtain better results by taking occupancy into account here,
            # but for now we will assume that the more threads, the better.
            flat_global_size = product(virtual_global_size)
            multiple = device_params.warp_size

            if flat_global_size < max_work_group_size:
                flat_local_size = flat_global_size
            elif max_work_group_size < multiple:
                flat_local_size = 1
            else:
                flat_local_size = multiple * (max_work_group_size // multiple)

            # product(virtual_local_size) == flat_local_size <= max_work_group_size
            virtual_local_size = find_local_size(virtual_global_size, flat_local_size)
        else:
            if product(virtual_local_size) > max_work_group_size:
                raise OutOfResourcesError(
                    "Requested local size is greater than the maximum " + str(max_work_group_size))

        # CUDA and OpenCL support global and local sizes with only a restricted
        # number of dimensions, each of which has a limited size, so we need to
        # pack our multidimensional sizes into them.

        virtual_grid_size = tuple(
            min_blocks(gs, ls) for gs, ls in zip(virtual_global_size, virtual_local_size))
        bounding_global_size = tuple(
            grs * ls for grs, ls in zip(virtual_grid_size, virtual_local_size))

        if product(virtual_grid_size) > product(device_params.max_num_groups):
            raise OutOfResourcesError(
                "Bounding global size " + repr(bounding_global_size) + " is too large")

        local_groups = ShapeGroups(virtual_local_size, max_work_item_sizes)
        grid_groups = ShapeGroups(virtual_grid_size, device_params.max_num_groups)

        # Returning back to the row-major ordering
        self.virtual_local_size = tuple(reversed(virtual_local_size))
        self.virtual_global_size = tuple(reversed(virtual_global_size))

        # These can have different lengths because of the expansion into multiple
        # dimensions that find_bounding_shape() performs.
        real_local_size = tuple(local_groups.bounding_shape)
        real_grid_size = tuple(grid_groups.bounding_shape)

        diff = len(real_local_size) - len(real_grid_size)
        if diff > 0:
            self.real_local_size = real_local_size
            self.real_grid_size = real_grid_size + (1,) * abs(diff)
        else:
            self.real_local_size = real_local_size + (1,) * abs(diff)
            self.real_grid_size = real_grid_size

        self.real_global_size = tuple(
            gs * ls for gs, ls
            in zip(self.real_grid_size, self.real_local_size))

        # This function will be used to translate between internal column-major vdims
        # and user-supplied row-major vdims.
        vdim_inverse = lambda dim: len(self.virtual_local_size) - dim - 1

        self.vsize_functions = render_template(
            TEMPLATE,
            virtual_local_size=virtual_local_size,
            virtual_global_size=virtual_global_size,
            bounding_global_size=bounding_global_size,
            virtual_grid_size=virtual_grid_size,
            local_groups=local_groups,
            grid_groups=grid_groups,
            product=product,
            vdim_inverse=vdim_inverse)
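A short illustration of the bookkeeping above with made-up sizes:

    # Row-major virtual_global_size (128, 1000) and virtual_local_size (8, 32)
    # are handled internally as (1000, 128) and (32, 8);
    # virtual_grid_size    = (min_blocks(1000, 32), min_blocks(128, 8)) = (32, 16)
    # bounding_global_size = (32 * 32, 16 * 8) = (1024, 128)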
Example #22
    def __init__(self, global_size, local_size):
        self.global_size = global_size
        if local_size is not None:
            self.local_size = local_size
            self.grid_size = tuple(
                min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
Example #23
    def uniform_bool(self, shape):
        length = numpy.prod(shape)
        nbytes = min_blocks(length, 8)
        random_bytes = numpy.frombuffer(urandom(nbytes), numpy.uint8)
        random_bits = numpy.unpackbits(random_bytes)[:length]
        return random_bits.reshape(shape).astype(Int32)
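Here min_blocks(length, 8) is simply the number of bytes needed to hold ``length`` random bits:

    # shape = (3, 5): length = 15, nbytes = min_blocks(15, 8) = 2;
    # two random bytes unpack into 16 bits, the first 15 of which are kept
    # and reshaped into a (3, 5) array of 0/1 values.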
Example #24
    def _build_plan(self, plan_factory, device_params, output, alpha, beta):

        plan = plan_factory()

        samples, modes = alpha.shape

        for_reduction = Type(alpha.dtype, (samples, self._max_total_clicks + 1))

        prepared_state = plan.temp_array_like(alpha)

        plan.kernel_call(
            TEMPLATE.get_def("compound_click_probability_prepare"),
            [prepared_state, alpha, beta],
            kernel_name="compound_click_probability_prepare",
            global_size=alpha.shape,
            render_kwds=dict(
                mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                exp_c=functions.exp(alpha.dtype),
                ))

        # Block size is limited by the amount of available local memory.
        # In some OpenCL implementations the number reported cannot actually be fully used
        # (because it's used by kernel arguments), so we're padding it a little.
        local_mem_size = device_params.local_mem_size
        max_elems = (local_mem_size - 256) // alpha.dtype.itemsize
        block_size = 2**helpers.log2(max_elems)

        # No reason to have block size larger than the number of modes
        block_size = min(block_size, helpers.bounding_power_of_2(modes))

        products_gsize = (samples, helpers.min_blocks(self._max_total_clicks + 1, block_size) * block_size)
        products = plan.temp_array_like(for_reduction)

        read_size = min(block_size, device_params.max_work_group_size)

        while read_size > 1:

            full_steps = modes // block_size
            remainder_size = modes % block_size

            try:
                plan.kernel_call(
                    TEMPLATE.get_def("compound_click_probability_aggregate"),
                    [products, prepared_state],
                    kernel_name="compound_click_probability_aggregate",
                    global_size=products_gsize,
                    local_size=(1, read_size,),
                    render_kwds=dict(
                        block_size=block_size,
                        read_size=read_size,
                        full_steps=full_steps,
                        remainder_size=remainder_size,
                        output_size=self._max_total_clicks + 1,
                        mul_cc=functions.mul(alpha.dtype, alpha.dtype),
                        add_cc=functions.add(alpha.dtype, alpha.dtype),
                        polar_unit=functions.polar_unit(dtypes.real_for(alpha.dtype)),
                        modes=self._system.modes,
                        max_total_clicks=self._max_total_clicks,
                        ))

            except OutOfResourcesError:
                # Halve the read size and retry with a smaller work group.
                read_size //= 2
                continue

            break

        reduction = Reduce(for_reduction, predicate_sum(alpha.dtype), axes=(0,))

        temp = plan.temp_array_like(reduction.parameter.output)

        plan.computation_call(reduction, temp, products)

        fft = FFT(temp)
        real_trf = Transformation([
            Parameter('output', Annotation(output, 'o')),
            Parameter('input', Annotation(temp, 'i')),
            ],
            """
                ${input.ctype} val = ${input.load_same};
                ${output.store_same}(val.x);
                """)
        fft.parameter.output.connect(real_trf, real_trf.input, output_p=real_trf.output)

        plan.computation_call(fft, output, temp, True)

        return plan
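Plugging plausible numbers into the block size choice above (48 KiB of local memory, complex128 data with itemsize 16):

    # max_elems  = (49152 - 256) // 16 = 3056
    # block_size = 2**log2(3056) = 2048
    # (then possibly reduced to the bounding power of 2 of the number of modes)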
Example #25
    def __init__(self, global_size, local_size):
        self.global_size = global_size
        if local_size is not None:
            self.local_size = local_size
            self.grid_size = tuple(min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
Example #26
    def _build_plan_for_wg_size(self, plan_factory, warp_size, max_wg_size, output, input_):

        plan = plan_factory()

        # Using algorithm cascading: sequential reduction, and then the parallel one.
        # According to Brent's theorem, the optimal sequential size is O(log(n)).
        # Setting it to the nearest power of 2 to simplify integer operations.
        max_seq_size = helpers.bounding_power_of_2(helpers.log2(max_wg_size))
        max_reduce_power = max_wg_size * max_seq_size

        if self._transpose_axes is None:
            # normal reduction
            cur_input = input_
        else:
            transpose = Transpose(input_, axes=self._transpose_axes)
            tr_output = plan.temp_array_like(transpose.parameter.output)
            plan.computation_call(transpose, tr_output, input_)

            cur_input = tr_output

        axis_start = len(output.shape)
        axis_end = len(input_.shape) - 1

        input_slices = (axis_start, axis_end - axis_start + 1)

        part_size = helpers.product(cur_input.shape[axis_start:])
        final_size = helpers.product(cur_input.shape[:axis_start])

        while part_size > 1:

            if part_size > max_reduce_power:
                seq_size = max_seq_size
                block_size = max_wg_size
                blocks_per_part = helpers.min_blocks(part_size, block_size * seq_size)
                cur_output = plan.temp_array(
                    (final_size, blocks_per_part), input_.dtype)
                output_slices = (1, 1)
            else:
                if part_size > max_wg_size:
                    seq_size = helpers.min_blocks(part_size, max_wg_size)
                    block_size = max_wg_size
                else:
                    seq_size = 1
                    block_size = helpers.bounding_power_of_2(part_size)
                blocks_per_part = 1
                cur_output = output
                output_slices = (len(cur_output.shape), 0)

            if part_size % (block_size * seq_size) != 0:
                last_block_size = part_size % (block_size * seq_size)
            else:
                last_block_size = block_size * seq_size

            render_kwds = dict(
                seq_size=seq_size,
                blocks_per_part=blocks_per_part,
                last_block_size=last_block_size,
                log2=helpers.log2, block_size=block_size,
                warp_size=warp_size,
                empty=self._empty,
                operation=self._operation,
                input_slices=input_slices,
                output_slices=output_slices)

            plan.kernel_call(
                TEMPLATE.get_def('reduce'),
                [cur_output, cur_input],
                global_size=(final_size, blocks_per_part * block_size),
                local_size=(1, block_size),
                render_kwds=render_kwds)

            part_size = blocks_per_part
            cur_input = cur_output
            input_slices = output_slices

        return plan
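Worked numbers for the cascade, assuming max_wg_size = 256:

    # max_seq_size     = bounding_power_of_2(log2(256)) = 8
    # max_reduce_power = 256 * 8 = 2048
    # part_size = 10000 > 2048:
    #   blocks_per_part = min_blocks(10000, 2048) = 5
    #   -> one kernel pass reduces each part to 5 partial results
    # next pass: part_size = 5 <= 256:
    #   seq_size = 1, block_size = bounding_power_of_2(5) = 8, blocks_per_part = 1
    #   -> a single small kernel finishes the reduction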
Example #27
    def _build_plan(self, plan_factory, device_params, output, input_):
        plan = plan_factory()

        if self._transpose_to is not None:

            transpose_to = Transpose(input_, axes=self._transpose_to)
            transposed = plan.temp_array_like(transpose_to.parameter.output)

            sub_scan = Scan(transposed,
                            self._predicate,
                            axes=self._axes,
                            exclusive=self._exclusive,
                            max_work_group_size=self._max_work_group_size)
            transposed_scanned = plan.temp_array_like(
                sub_scan.parameter.output)

            transpose_from = Transpose(transposed_scanned,
                                       axes=self._transpose_from,
                                       output_arr_t=output)

            plan.computation_call(transpose_to, transposed, input_)
            plan.computation_call(sub_scan, transposed_scanned, transposed)
            plan.computation_call(transpose_from, output, transposed_scanned)

        else:

            scan_ndim = len(
                self._axes
            )  # assuming that at this point axes are inner and sorted
            batch_shape = output.shape[:-scan_ndim]
            batch_size = helpers.product(batch_shape)
            scan_shape = output.shape[-scan_ndim:]
            scan_size = helpers.product(scan_shape)

            if self._max_work_group_size is None:
                max_wg_size = device_params.max_work_group_size
            else:
                max_wg_size = self._max_work_group_size

            # The current algorithm requires workgroup size to be a power of 2.
            assert max_wg_size == 2**helpers.log2(max_wg_size)

            # Using algorithm cascading: sequential reduction, and then the parallel one.
            # According to Brent's theorem, the optimal sequential size is O(log(n)).
            # So, ideally we want the minimum `wg_size` for which
            # `wg_size * log2(wg_size) >= scan_size`.
            if self._seq_size is None:
                wg_size = 2
                while wg_size < max_wg_size:
                    seq_size = helpers.bounding_power_of_2(
                        helpers.log2(wg_size) - 1)
                    if wg_size * seq_size >= scan_size:
                        break
                    wg_size *= 2
            else:
                seq_size = self._seq_size
                wg_size = helpers.bounding_power_of_2(
                    helpers.min_blocks(scan_size, seq_size))
                if wg_size > max_wg_size:
                    raise ValueError(
                        "Sequential size " + str(seq_size) +
                        " cannot be set because of the maximum workgroup size "
                        + str(max_wg_size))

            wg_totals_size = helpers.min_blocks(scan_size, wg_size * seq_size)
            wg_totals = plan.temp_array((
                batch_size,
                wg_totals_size,
            ), output.dtype)

            if wg_totals_size > 1:
                temp_output = plan.temp_array_like(output)
            else:
                temp_output = output

            last_part_size = scan_size % (wg_size * seq_size)
            if last_part_size == 0:
                last_part_size = wg_size * seq_size

            plan.kernel_call(
                TEMPLATE.get_def('scan'), [temp_output, input_, wg_totals],
                kernel_name="kernel_scan_wg",
                global_size=(batch_size, wg_size * wg_totals_size),
                local_size=(1, wg_size),
                render_kwds=dict(slices=(len(batch_shape), len(scan_shape)),
                                 log_num_banks=helpers.log2(
                                     device_params.local_mem_banks),
                                 exclusive=self._exclusive,
                                 wg_size=wg_size,
                                 seq_size=seq_size,
                                 scan_size=scan_size,
                                 last_part_size=last_part_size,
                                 wg_totals_size=wg_totals_size,
                                 log_wg_size=helpers.log2(wg_size),
                                 predicate=self._predicate))

            if wg_totals_size > 1:
                sub_scan = Scan(wg_totals,
                                self._predicate,
                                axes=(1, ),
                                exclusive=True,
                                max_work_group_size=self._max_work_group_size)
                scanned_wg_totals = plan.temp_array_like(wg_totals)
                plan.computation_call(sub_scan, scanned_wg_totals, wg_totals)

                plan.kernel_call(TEMPLATE.get_def('add_wg_totals'),
                                 [output, temp_output, scanned_wg_totals],
                                 kernel_name="kernel_scan_add_wg_totals",
                                 global_size=(
                                     batch_size,
                                     scan_size,
                                 ),
                                 render_kwds=dict(
                                     slices=(
                                         len(batch_shape),
                                         len(scan_shape),
                                     ),
                                     wg_size=wg_size,
                                     seq_size=seq_size,
                                 ))

        return plan
Example #28
    def _build_plan(self, plan_factory, device_params, output, input_):
        plan = plan_factory()

        if self._transpose_to is not None:

            transpose_to = Transpose(input_, axes=self._transpose_to)
            transposed = plan.temp_array_like(transpose_to.parameter.output)

            sub_scan = Scan(
                transposed, self._predicate, axes=self._axes, exclusive=self._exclusive,
                max_work_group_size=self._max_work_group_size)
            transposed_scanned = plan.temp_array_like(sub_scan.parameter.output)

            transpose_from = Transpose(
                transposed_scanned, axes=self._transpose_from, output_arr_t=output)

            plan.computation_call(transpose_to, transposed, input_)
            plan.computation_call(sub_scan, transposed_scanned, transposed)
            plan.computation_call(transpose_from, output, transposed_scanned)

        else:

            scan_ndim = len(self._axes) # assuming that at this point axes are inner and sorted
            batch_shape = output.shape[:-scan_ndim]
            batch_size = helpers.product(batch_shape)
            scan_shape = output.shape[-scan_ndim:]
            scan_size = helpers.product(scan_shape)

            if self._max_work_group_size is None:
                max_wg_size = device_params.max_work_group_size
            else:
                max_wg_size = self._max_work_group_size

            # The current algorithm requires workgroup size to be a power of 2.
            assert max_wg_size == 2**helpers.log2(max_wg_size)

            # Using algorithm cascading: sequential reduction, and then the parallel one.
            # According to Brent's theorem, the optimal sequential size is O(log(n)).
            # So, ideally we want the minimum `wg_size` for which
            # `wg_size * log2(wg_size) >= scan_size`.
            if self._seq_size is None:
                wg_size = 2
                while wg_size < max_wg_size:
                    seq_size = helpers.bounding_power_of_2(helpers.log2(wg_size) - 1)
                    if wg_size * seq_size >= scan_size:
                        break
                    wg_size *= 2
            else:
                seq_size = self._seq_size
                wg_size = helpers.bounding_power_of_2(helpers.min_blocks(scan_size, seq_size))
                if wg_size > max_wg_size:
                    raise ValueError(
                        "Sequential size " + str(seq_size)
                        + " cannot be set because of the maximum workgroup size " + max_wg_size)

            wg_totals_size = helpers.min_blocks(scan_size, wg_size * seq_size)
            wg_totals = plan.temp_array((batch_size, wg_totals_size,), output.dtype)

            if wg_totals_size > 1:
                temp_output = plan.temp_array_like(output)
            else:
                temp_output = output

            last_part_size = scan_size % (wg_size * seq_size)
            if last_part_size == 0:
                last_part_size = wg_size * seq_size

            plan.kernel_call(
                TEMPLATE.get_def('scan'),
                    [temp_output, input_, wg_totals],
                    kernel_name="kernel_scan_wg",
                    global_size=(batch_size, wg_size * wg_totals_size),
                    local_size=(1, wg_size),
                    render_kwds=dict(
                        slices=(len(batch_shape), len(scan_shape)),
                        log_num_banks=helpers.log2(device_params.local_mem_banks),
                        exclusive=self._exclusive,
                        wg_size=wg_size,
                        seq_size=seq_size,
                        scan_size=scan_size,
                        last_part_size=last_part_size,
                        wg_totals_size=wg_totals_size,
                        log_wg_size=helpers.log2(wg_size),
                        predicate=self._predicate
                        ))

            if wg_totals_size > 1:
                sub_scan = Scan(
                    wg_totals, self._predicate, axes=(1,), exclusive=True,
                    max_work_group_size=self._max_work_group_size)
                scanned_wg_totals = plan.temp_array_like(wg_totals)
                plan.computation_call(sub_scan, scanned_wg_totals, wg_totals)

                plan.kernel_call(
                    TEMPLATE.get_def('add_wg_totals'),
                        [output, temp_output, scanned_wg_totals],
                        kernel_name="kernel_scan_add_wg_totals",
                        global_size=(batch_size, scan_size,),
                        render_kwds=dict(
                            slices=(len(batch_shape), len(scan_shape),),
                            wg_size=wg_size,
                            seq_size=seq_size,
                            ))

        return plan
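For example, with max_wg_size = 1024 and scan_size = 4000 the workgroup size search above settles quickly:

    # wg_size = 512: seq_size = bounding_power_of_2(log2(512) - 1) = 8,
    #                and 512 * 8 = 4096 >= 4000, so the loop stops
    # wg_totals_size = min_blocks(4000, 4096) = 1
    # -> a single 'scan' kernel suffices, no recursive scan of wg_totals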
Example #29
    def _build_plan_for_wg_size(self, plan_factory, warp_size, max_wg_size,
                                output, input_):

        plan = plan_factory()

        # Using algorithm cascading: sequential reduction, and then the parallel one.
        # According to Brent's theorem, the optimal sequential size is O(log(n)).
        # Setting it to the nearest power of 2 to simplify integer operations.
        max_seq_size = helpers.bounding_power_of_2(helpers.log2(max_wg_size))
        max_reduce_power = max_wg_size * max_seq_size

        if self._transpose_axes is None:
            # normal reduction
            cur_input = input_
        else:
            transpose = Transpose(input_, axes=self._transpose_axes)
            tr_output = plan.temp_array_like(transpose.parameter.output)
            plan.computation_call(transpose, tr_output, input_)

            cur_input = tr_output

        axis_start = len(output.shape)
        axis_end = len(input_.shape) - 1

        input_slices = (axis_start, axis_end - axis_start + 1)

        part_size = helpers.product(cur_input.shape[axis_start:])
        final_size = helpers.product(cur_input.shape[:axis_start])

        while part_size > 1:

            if part_size > max_reduce_power:
                seq_size = max_seq_size
                block_size = max_wg_size
                blocks_per_part = helpers.min_blocks(part_size,
                                                     block_size * seq_size)
                cur_output = plan.temp_array((final_size, blocks_per_part),
                                             input_.dtype)
                output_slices = (1, 1)
            else:
                if part_size > max_wg_size:
                    seq_size = helpers.min_blocks(part_size, max_wg_size)
                    block_size = max_wg_size
                else:
                    seq_size = 1
                    block_size = helpers.bounding_power_of_2(part_size)
                blocks_per_part = 1
                cur_output = output
                output_slices = (len(cur_output.shape), 0)

            if part_size % (block_size * seq_size) != 0:
                last_block_size = part_size % (block_size * seq_size)
            else:
                last_block_size = block_size * seq_size

            render_kwds = dict(seq_size=seq_size,
                               blocks_per_part=blocks_per_part,
                               last_block_size=last_block_size,
                               log2=helpers.log2,
                               block_size=block_size,
                               warp_size=warp_size,
                               empty=self._empty,
                               operation=self._operation,
                               input_slices=input_slices,
                               output_slices=output_slices)

            plan.kernel_call(TEMPLATE.get_def('reduce'),
                             [cur_output, cur_input],
                             global_size=(final_size,
                                          blocks_per_part * block_size),
                             local_size=(1, block_size),
                             render_kwds=render_kwds)

            part_size = blocks_per_part
            cur_input = cur_output
            input_slices = output_slices

        return plan
Example #30
def max_supported_transforms_per_block(device_params, transform_type):
    reqs = get_transform(transform_type).transform_module_requirements()
    return min_blocks(device_params.max_work_group_size,
                      reqs['threads_per_transform'])
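Assuming threads_per_transform evenly divides the maximum work group size (both are typically powers of two), min_blocks here amounts to exact division:

    # max_work_group_size = 512, threads_per_transform = 128:
    # min_blocks(512, 128) = 4 transforms per block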