Example #1
0
    def _construct_operations(self, basis, device_params):
        """Record the operations that carry out the transform described by ``basis``.

        If the product of the sizes of the transformed axes (``basis.axes``) is 1,
        a single elementwise copy from ``input`` to ``output`` is recorded, so that
        attached transformations still get executed.  Otherwise the kernels
        returned by ``get_fft_kernels()`` are chained: the first one reads
        ``input``, the last one writes ``output``, and every intermediate result
        goes into a newly added allocation.

        :param basis: object providing ``shape``, ``axes`` and ``dtype``.
        :param device_params: object providing ``max_work_group_size``, which is
            the starting value both for the local kernel limit and for the
            per-kernel local size search.
        :returns: the recorder obtained from ``self._get_operation_recorder()``
            with all operations added.
        :raises ValueError: if no suitable call parameters could be found
            for one of the kernels.
        """

        if product([basis.shape[i] for i in basis.axes]) == 1:
            # Trivial problem. Need to add a dummy kernel
            # because we still have to run transformations.
            operations = self._get_operation_recorder()

            identity = self.get_nested_computation(
                specialize_elementwise('output', 'input', 'direction',
                    dict(kernel="${output.store}(idx, ${input.load}(idx));")))
            operations.add_computation(identity, 'output', 'input', 'direction')
            return operations

        # While resource consumption of GlobalFFTKernel can be made lower by passing
        # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
        # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
        # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
        # does that, we have to recreate the whole chain.
        local_kernel_limit = device_params.max_work_group_size

        while local_kernel_limit >= 1:
            # Starting from scratch.
            operations = self._get_operation_recorder()
            kernels = get_fft_kernels(basis, device_params, local_kernel_limit)

            for i, kernel in enumerate(kernels):

                # Each kernel reads what the previous one wrote
                # (``mem_out`` still holds the previous iteration's value here).
                mem_in = 'input' if i == 0 else mem_out
                if i == len(kernels) - 1:
                    mem_out = 'output'
                else:
                    mem_out = operations.add_allocation(kernel.output_shape, basis.dtype)

                if kernel.kweights is not None:
                    kweights = operations.add_const_allocation(
                        kernel.kweights.astype(basis.dtype))
                    kweights_arg = [kweights]
                else:
                    kweights_arg = []

                argnames = [mem_out, mem_in] + kweights_arg + ['direction']

                # Try to find local size for each of the kernels
                local_size = device_params.max_work_group_size
                local_kernel_fail = False # marks the event when LocalFFTKernel is out of resources
                while local_size >= 1:
                    try:
                        gs, ls, kwds = kernel.prepare_for(local_size)
                        operations.add_kernel(
                            TEMPLATE, kernel.name, argnames,
                            global_size=gs, local_size=ls, render_kwds=kwds,
                            inplace=([(mem_out, mem_in)] if kernel.inplace_possible else None))
                    except OutOfResourcesError:
                        if isinstance(kernel, GlobalFFTKernel):
                            # A global kernel can simply be retried with a smaller local size.
                            local_size //= 2
                            continue
                        # Any other kernel: the whole chain has to be rebuilt.
                        local_kernel_fail = True
                    # Either the kernel was added, or the chain must be rebuilt.
                    break
                else:
                    # Only reached when local_size dropped below 1 without a success.
                    raise ValueError(
                        "Could not find suitable call parameters for one of the global kernels")

                if local_kernel_fail:
                    break
            else:
                # everything went well, returning the recorded operations
                return operations

            # The cycle above received 'break', meaning that LocalFFTKernel was out of resources.
            # Reduce the limit and try to create operations from scratch again.
            local_kernel_limit //= 2

        # The limit dropped below 1 and no attempt produced a complete chain.
        raise ValueError("Could not find suitable call parameters for one of the local kernels")
import tigger.transformations as tr

from helpers import *


def pytest_generate_tests(metafunc):
    """Parametrize the ``any_dtype`` argument for every test that requests it.

    Tests taking ``any_dtype`` are run once per dtype in the list built below
    (two integer, one float and one complex dtype), with the dtype's string
    form used as the test id.  Tests that do not request it are left untouched.

    :param metafunc: the pytest ``Metafunc`` object of the test being collected.
    """
    # ``funcargnames`` was a deprecated alias of ``fixturenames`` and has been
    # removed from pytest; fall back to it only for versions predating
    # ``fixturenames``.
    requested = getattr(metafunc, 'fixturenames', None)
    if requested is None:
        requested = metafunc.funcargnames

    if 'any_dtype' not in requested:
        return

    int_dtypes = [numpy.dtype('int32'), numpy.dtype('int64')]
    float_dtypes = [numpy.dtype('float32')]
    complex_dtypes = [numpy.dtype('complex64')]

    dtypes = int_dtypes + float_dtypes + complex_dtypes
    metafunc.parametrize('any_dtype', dtypes, ids=[str(x) for x in dtypes])


# Elementwise computation used by the tests below: its kernel stores
# ``input.load(idx)`` into ``output`` at the same index, i.e. a plain copy.
# NOTE(review): the third argument is None here where other callers pass a
# parameter name -- presumably "no extra parameter"; confirm against
# specialize_elementwise().
TestComputation = specialize_elementwise(
    'output',
    'input',
    None,
    {'kernel': "${output.store}(idx, ${input.load}(idx));"})


def test_identity(some_ctx, any_dtype):
    """Run the copy computation with identity transformations on both ends.

    A 1000-element test array of ``any_dtype`` is sent to the device, passed
    through ``TestComputation`` with ``tr.identity()`` connected to its
    ``input`` and ``output``, and the result is compared with the original.
    """
    host_data = get_test_array((1000,), any_dtype)
    source_dev = some_ctx.to_device(host_data)
    target_dev = some_ctx.empty_like(source_dev)

    computation = TestComputation(some_ctx)
    # Attach an identity transformation to each endpoint, input first.
    for endpoint, new_name in (('input', 'input_prime'), ('output', 'output_prime')):
        computation.connect(tr.identity(), endpoint, [new_name])
    computation.prepare_for(target_dev, source_dev)

    computation(target_dev, source_dev)
    assert diff_is_negligible(target_dev.get(), host_data)
from helpers import *


def pytest_generate_tests(metafunc):
    """Parametrize the ``any_dtype`` argument for every test that requests it.

    Tests taking ``any_dtype`` are run once per dtype in the list built below
    (two integer, one float and one complex dtype), with the dtype's string
    form used as the test id.  Tests that do not request it are left untouched.

    :param metafunc: the pytest ``Metafunc`` object of the test being collected.
    """
    # ``funcargnames`` was a deprecated alias of ``fixturenames`` and has been
    # removed from pytest; fall back to it only for versions predating
    # ``fixturenames``.
    requested = getattr(metafunc, 'fixturenames', None)
    if requested is None:
        requested = metafunc.funcargnames

    if 'any_dtype' not in requested:
        return

    int_dtypes = [numpy.dtype('int32'), numpy.dtype('int64')]
    float_dtypes = [numpy.dtype('float32')]
    complex_dtypes = [numpy.dtype('complex64')]

    dtypes = int_dtypes + float_dtypes + complex_dtypes
    metafunc.parametrize('any_dtype', dtypes, ids=[str(x) for x in dtypes])


# Elementwise computation used by the tests below: its kernel stores
# ``input.load(idx)`` into ``output`` at the same index, i.e. a plain copy.
# NOTE(review): the third argument is None here where other callers pass a
# parameter name -- presumably "no extra parameter"; confirm against
# specialize_elementwise().
TestComputation = specialize_elementwise(
    'output',
    'input',
    None,
    {'kernel': "${output.store}(idx, ${input.load}(idx));"},
)


def test_identity(some_ctx, any_dtype):
    """Run the copy computation with identity transformations on both ends.

    A 1000-element test array of ``any_dtype`` is sent to the device, passed
    through ``TestComputation`` with ``tr.identity()`` connected to its
    ``input`` and ``output``, and the result is compared with the original.
    """

    # Named ``input_host`` rather than ``input`` to avoid shadowing the builtin.
    input_host = get_test_array((1000, ), any_dtype)
    input_dev = some_ctx.to_device(input_host)
    output_dev = some_ctx.empty_like(input_dev)

    test = TestComputation(some_ctx)
    test.connect(tr.identity(), 'input', ['input_prime'])
    test.connect(tr.identity(), 'output', ['output_prime'])
    test.prepare_for(output_dev, input_dev)

    test(output_dev, input_dev)
    # Without this check the test would pass no matter what the kernel wrote.
    assert diff_is_negligible(output_dev.get(), input_host)
Example #4
0
    def _construct_operations(self, basis, device_params):
        """Record the operations that carry out the transform described by ``basis``.

        If the product of the sizes of the transformed axes (``basis.axes``) is 1,
        a single elementwise copy from ``input`` to ``output`` is recorded, so that
        attached transformations still get executed.  Otherwise the kernels
        returned by ``get_fft_kernels()`` are chained: the first one reads
        ``input``, the last one writes ``output``, and every intermediate result
        goes into a newly added allocation.

        :param basis: object providing ``shape``, ``axes`` and ``dtype``.
        :param device_params: object providing ``max_work_group_size``, which is
            the starting value both for the local kernel limit and for the
            per-kernel local size search.
        :returns: the recorder obtained from ``self._get_operation_recorder()``
            with all operations added.
        :raises ValueError: if no suitable call parameters could be found
            for one of the kernels.
        """

        if product([basis.shape[i] for i in basis.axes]) == 1:
            # Trivial problem. Need to add a dummy kernel
            # because we still have to run transformations.
            operations = self._get_operation_recorder()

            identity = self.get_nested_computation(
                specialize_elementwise(
                    'output', 'input', 'direction',
                    dict(kernel="${output.store}(idx, ${input.load}(idx));")))
            operations.add_computation(identity, 'output', 'input',
                                       'direction')
            return operations

        # While resource consumption of GlobalFFTKernel can be made lower by passing
        # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
        # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
        # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
        # does that, we have to recreate the whole chain.
        local_kernel_limit = device_params.max_work_group_size

        while local_kernel_limit >= 1:
            # Starting from scratch.
            operations = self._get_operation_recorder()
            kernels = get_fft_kernels(basis, device_params, local_kernel_limit)

            for i, kernel in enumerate(kernels):

                # Each kernel reads what the previous one wrote
                # (``mem_out`` still holds the previous iteration's value here).
                mem_in = 'input' if i == 0 else mem_out
                if i == len(kernels) - 1:
                    mem_out = 'output'
                else:
                    mem_out = operations.add_allocation(
                        kernel.output_shape, basis.dtype)

                if kernel.kweights is not None:
                    kweights = operations.add_const_allocation(
                        kernel.kweights.astype(basis.dtype))
                    kweights_arg = [kweights]
                else:
                    kweights_arg = []

                argnames = [mem_out, mem_in] + kweights_arg + ['direction']

                # Try to find local size for each of the kernels
                local_size = device_params.max_work_group_size
                local_kernel_fail = False  # marks the event when LocalFFTKernel is out of resources
                while local_size >= 1:
                    try:
                        gs, ls, kwds = kernel.prepare_for(local_size)
                        operations.add_kernel(
                            TEMPLATE,
                            kernel.name,
                            argnames,
                            global_size=gs,
                            local_size=ls,
                            render_kwds=kwds,
                            inplace=([(mem_out, mem_in)]
                                     if kernel.inplace_possible else None))
                    except OutOfResourcesError:
                        if isinstance(kernel, GlobalFFTKernel):
                            # A global kernel can simply be retried with a smaller local size.
                            local_size //= 2
                            continue
                        # Any other kernel: the whole chain has to be rebuilt.
                        local_kernel_fail = True
                    # Either the kernel was added, or the chain must be rebuilt.
                    break
                else:
                    # Only reached when local_size dropped below 1 without a success.
                    raise ValueError(
                        "Could not find suitable call parameters for one of the global kernels"
                    )

                if local_kernel_fail:
                    break
            else:
                # everything went well, returning the recorded operations
                return operations

            # The cycle above received 'break', meaning that LocalFFTKernel was out of resources.
            # Reduce the limit and try to create operations from scratch again.
            local_kernel_limit //= 2

        # The limit dropped below 1 and no attempt produced a complete chain.
        raise ValueError(
            "Could not find suitable call parameters for one of the local kernels"
        )