Example #1
0
import numpy as np
import pytest
from numpy import linalg

from conftest import skipif
from devito import TimeFunction
from devito.logger import log
from examples.seismic import Model, AcquisitionGeometry
from examples.seismic.acoustic import AcousticWaveSolver
from examples.seismic.tti import AnisotropicWaveSolver

pytestmark = skipif(['yask', 'ops'])


@pytest.mark.parametrize('shape', [(120, 140), (120, 140, 150)])
@pytest.mark.parametrize('space_order', [4, 8])
@pytest.mark.parametrize('kernel', ['centered'])
def test_tti(shape, space_order, kernel):
    """
    This first test compares the solution of the acoustic wave equation and the
    TTI wave equation with all anisotropy parameters set to 0. The two solutions
    should be the same.
    """
    if kernel == 'shifted':
        space_order *= 2
    to = 2
    so = space_order
    nbl = 10
    origin = [0. for _ in shape]
    spacing = [10. for _ in shape]
    vp = 1.5 * np.ones(shape)
Example #2
0
import itertools
import pytest
import numpy as np

from conftest import skipif
from sympy import Integer
from sympy.core.numbers import Zero, One  # noqa

pytestmark = skipif('noops', whole_module=True)

# All ops-specific imports *must* be avoided if `backend != ops`, otherwise
# a backend reinitialization would be triggered via `devito/ops/__init__.py`,
# thus invalidating all of the future tests. This is guaranteed by the
# `pytestmark` above
from devito import Eq, Function, Grid, Operator, TimeFunction, configuration  # noqa
from devito.ops.node_factory import OPSNodeFactory  # noqa
from devito.ops.transformer import create_ops_arg, create_ops_dat, make_ops_ast, to_ops_stencil  # noqa
from devito.ops.types import OpsAccessible, OpsDat, OpsStencil, OpsBlock  # noqa
from devito.ops.utils import namespace, AccessibleInfo  # noqa
from devito.symbolics import Byref, Literal, indexify  # noqa
from devito.tools import dtype_to_cstr  # noqa
from devito.types import Buffer, Constant, Symbol  # noqa


class TestOPSExpression(object):
    @pytest.mark.parametrize('equation, expected', [
        ('Eq(u,3*a - 4**a)', 'void OPS_Kernel_0(ACC<float> & ut0)\n'
         '{\n  ut0(0) = -2.97015324253729F;\n}'),
        ('Eq(u, u.dxl)',
         'void OPS_Kernel_0(ACC<float> & ut0, const float *h_x)\n'
         '{\n  r0 = 1.0/*h_x;\n  '
Example #3
0
from operator import mul

import numpy as np
import pytest

from conftest import EVAL, skipif
from devito import Grid, Function, TimeFunction, SparseTimeFunction, Eq, Operator, solve
from devito.dle import NThreads, transform
from devito.dle.parallelizer import nhyperthreads
from devito.ir.equations import DummyEq
from devito.ir.iet import (Call, Expression, Iteration, Conditional, FindNodes,
                           iet_analyze, retrieve_iteration_tree)
from devito.tools import as_tuple
from unittest.mock import patch

pytestmark = skipif(['yask', 'ops'])


def get_blocksizes(op, dle, grid, blockshape):
    blocksizes = {'%s0_blk_size' % d: v for d, v in zip(grid.dimensions, blockshape)}
    blocksizes = {k: v for k, v in blocksizes.items() if k in op._known_arguments}
    # Sanity check
    if grid.dim == 1 or len(blockshape) == 0:
        assert len(blocksizes) == 0
        return {}
    try:
        if dle[1].get('blockinner'):
            assert len(blocksizes) >= 1
            if grid.dim == len(blockshape):
                assert len(blocksizes) == len(blockshape)
            else:
Example #4
0
from sympy import cos
import numpy as np
from cached_property import cached_property

import pytest  # noqa

pexpect = pytest.importorskip('yask')  # Run only if YASK is available

from conftest import skipif  # noqa
from devito import (Eq, Grid, Dimension, ConditionalDimension, Operator, Constant,
                    Function, TimeFunction,  SparseTimeFunction, configuration, clear_cache)  # noqa
from devito.ir.iet import FindNodes, ForeignExpression, retrieve_iteration_tree  # noqa
from examples.seismic.acoustic import iso_stencil  # noqa
from examples.seismic import demo_model, TimeAxis, RickerSource, Receiver  # noqa

pytestmark = skipif('noyask')


def setup_module(module):
    """
    Dump the YASK contexts, getting rid of any YASK modules generated and
    JIT-compiled in previous runs.

    The tests do not strictly need this, but it keeps the lib directory
    clean, which may help offline analysis. The ``module`` argument is unused.
    """
    import devito.yask.wrappers as yask_wrappers  # noqa

    yask_wrappers.contexts.dump()


@pytest.fixture(autouse=True)
def reset_isa():
    """Force back to NO-SIMD after each test, as some tests may optionally
    switch on SIMD.
Example #5
0
import numpy as np
import pytest

from conftest import skipif, opts_device_tiling
from devito import (Grid, Dimension, Function, TimeFunction, Eq, Inc, solve,
                    Operator, norm, cos)
from devito.exceptions import InvalidOperator
from devito.ir.iet import retrieve_iteration_tree
from examples.seismic import TimeAxis, RickerSource, Receiver

pytestmark = skipif(['nodevice'], whole_module=True)


class TestCodeGeneration(object):
    def test_init_omp_env(self):
        """
        The first statement in the init section of an OpenMP Operator must
        set the default device, guarded by a check on `deviceid`.
        """
        u = TimeFunction(name='u', grid=Grid(shape=(3, 3, 3)))

        op = Operator(Eq(u.forward, u.dx + 1), language='openmp')

        expected = ('if (deviceid != -1)\n{\n'
                    '  omp_set_default_device(deviceid);\n}')
        first_init = op.body.init[0].body[0]
        assert str(first_init) == expected

    @skipif('device-aomp')
    @pytest.mark.parallel(mode=1)
    def test_init_omp_env_w_mpi(self):
        grid = Grid(shape=(3, 3, 3))

        u = TimeFunction(name='u', grid=grid)
Example #6
0
import pytest
import numpy as np

from conftest import skipif
from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Dimension, # noqa
                    Eq, Operator, ALLOC_GUARD, ALLOC_FLAT)
from devito.data import LEFT, RIGHT, Decomposition

pytestmark = skipif('ops')


class TestDataBasic(object):

    def test_simple_indexing(self):
        """Test data packing/unpacking via basic indexing."""
        grid = Grid(shape=(16, 16, 16))
        u = Function(name='yu3D', grid=grid, space_order=0)

        # Test simple insertion and extraction
        u.data[0, 1, 1] = 1.
        assert u.data[0, 0, 0] == 0.
        assert u.data[0, 1, 1] == 1.
        assert np.all(u.data == u.data[:, :, :])
        assert 1. in u.data[0]
        assert 1. in u.data[0, 1]

        # Test negative indices
        assert u.data[0, -15, -15] == 1.
        u.data[6, 0, 0] = 1.
        assert u.data[-10, :, :].sum() == 1.
import pytest
import numpy as np
from unittest.mock import patch

from conftest import skipif
from devito import Grid, TimeFunction, Eq, Operator, configuration, switchconfig
from devito.data import LEFT

pytestmark = skipif(['yask', 'ops'], whole_module=True)

# All core-specific imports *must* be avoided if `backend != core`, otherwise
# a backend reinitialization would be triggered via `devito/core/__init__.py`,
# thus invalidating all of the future tests. This is guaranteed by the
# `pytestmark` above
from devito.core.autotuning import options  # noqa


@switchconfig(log_level='DEBUG')
@pytest.mark.parametrize("shape,expected", [
    ((30, 30), 13),
    ((30, 30, 30), 17)
])
def test_at_is_actually_working(shape, expected):
    """
    Check that autotuning is actually running when switched on,
    in both 2D and 3D operators.
    """
    grid = Grid(shape=shape)
    f = TimeFunction(name='f', grid=grid)

    eqn = Eq(f.forward, f + 1)
Example #8
0
import numpy as np
from cached_property import cached_property

import pytest  # noqa

pexpect = pytest.importorskip('yask')  # Run only if YASK is available

from conftest import skipif  # noqa
from devito import (Eq, Grid, Dimension, ConditionalDimension, Operator, Constant,
                    Function, TimeFunction, SparseTimeFunction, configuration,
                    clear_cache, switchconfig)  # noqa
from devito.ir.iet import FindNodes, ForeignExpression, retrieve_iteration_tree  # noqa
from examples.seismic.acoustic import iso_stencil  # noqa
from examples.seismic import demo_model, TimeAxis, RickerSource, Receiver  # noqa

pytestmark = skipif('noyask')


def setup_module(module):
    """
    Dump the YASK contexts, getting rid of any YASK modules generated and
    JIT-compiled in previous runs.

    The tests do not strictly need this, but it keeps the lib directory
    clean, which may help offline analysis. The ``module`` argument is unused.
    """
    import devito.yask.wrappers as yask_wrappers  # noqa

    yask_wrappers.contexts.dump()


@pytest.fixture(autouse=True)
def reset_isa():
    """Force back to NO-SIMD after each test, as some tests may optionally
    switch on SIMD.
Example #9
0
import pytest
import numpy as np

from conftest import skipif
from devito import (Grid, Function, TimeFunction, SparseTimeFunction, Dimension, # noqa
                    Eq, Operator, ALLOC_GUARD, ALLOC_FLAT)
from devito.data import LEFT, RIGHT, Decomposition

pytestmark = skipif('ops')


class TestDataBasic(object):

    def test_simple_indexing(self):
        """Test data packing/unpacking via basic indexing."""
        grid = Grid(shape=(16, 16, 16))
        u = Function(name='yu3D', grid=grid, space_order=0)

        # Test simple insertion and extraction
        u.data[0, 1, 1] = 1.
        assert u.data[0, 0, 0] == 0.
        assert u.data[0, 1, 1] == 1.
        assert np.all(u.data == u.data[:, :, :])
        assert 1. in u.data[0]
        assert 1. in u.data[0, 1]

        # Test negative indices
        assert u.data[0, -15, -15] == 1.
        u.data[6, 0, 0] = 1.
        assert u.data[-10, :, :].sum() == 1.
Example #10
0
import pytest

from conftest import skipif
from devito import Function, Grid, NODE
from devito.tools import powerset

pytestmark = skipif(['yask'])


@pytest.mark.parametrize('ndim', [1, 2, 3])
def test_indices(ndim):
    """
    Test that the reference indices of a staggered Function are shifted by
    half a grid spacing along each staggered Dimension.
    """
    grid = Grid(tuple([10] * ndim))
    # The first entry produced by `powerset` is dropped -- presumably the
    # empty subset, i.e. no staggering at all (confirm against
    # `devito.tools.powerset`)
    staggerings = list(powerset(grid.dimensions))[1:]
    for staggering in staggerings:
        f = Function(name="f", grid=grid, staggered=staggering)
        for dim in staggering:
            shifted = dim + dim.spacing / 2
            assert f.indices_ref[dim] == shifted


@pytest.mark.parametrize('ndim', [1, 2, 3])
def test_avg(ndim):
    """
    Test automatic averaging of Function at undefined grid points
    """
    grid = Grid(tuple([10] * ndim))
    dims = list(powerset(grid.dimensions))[1:]
    for d in dims:
        f = Function(name="f", grid=grid, staggered=d)
Example #11
0
import pytest

from conftest import skipif
from devito import Eq, Grid, Operator, TimeFunction, configuration  # noqa
from devito.symbolics import indexify

pytestmark = skipif('noops', whole_module=True)

# All ops-specific imports *must* be avoided if `backend != ops`, otherwise
# a backend reinitialization would be triggered via `devito/ops/__init__.py`,
# thus invalidating all of the future tests. This is guaranteed by the
# `pytestmark` above
from devito.ops.node_factory import OPSNodeFactory  # noqa
from devito.ops.transformer import make_ops_ast  # noqa


class TestOPSExpression(object):

    @pytest.mark.parametrize('equation, expected', [
        ('Eq(u,3*a - 4**a)', 'Eq(ut0[OPS_ACC0(0)], -2.97015324253729)'),
        ('Eq(u, u.dxl)',
         'Eq(ut0[OPS_ACC0(0)], -2.0*ut0[OPS_ACC0(-1)]/h_x + '
            '0.5*ut0[OPS_ACC0(-2)]/h_x + 1.5*ut0[OPS_ACC0(0)]/h_x)'),
        ('Eq(v,1)', 'Eq(vt0[OPS_ACC0(0,0)], 1)'),
        ('Eq(v,v.dxl + v.dxr - v.dyr - v.dyl)',
         'Eq(vt0[OPS_ACC0(0,0)], 2.0*vt0[OPS_ACC0(0,-1)]/h_y - '
            '0.5*vt0[OPS_ACC0(0,-2)]/h_y - 2.0*vt0[OPS_ACC0(0,1)]/h_y + '
            '0.5*vt0[OPS_ACC0(0,2)]/h_y - 2.0*vt0[OPS_ACC0(-1,0)]/h_x + '
            '0.5*vt0[OPS_ACC0(-2,0)]/h_x + 2.0*vt0[OPS_ACC0(1,0)]/h_x - '
            '0.5*vt0[OPS_ACC0(2,0)]/h_x)'),
        ('Eq(v,v**2 - 3*v)',
Example #12
0
import numpy as np
import pytest

from conftest import skipif
from devito import (Grid, Constant, Function, TimeFunction, SparseFunction,
                    SparseTimeFunction, Dimension, ConditionalDimension,
                    SubDimension, Eq, Inc, Operator, norm, inner)
from devito.data import LEFT, RIGHT
from devito.ir.iet import Call, Conditional, Iteration, FindNodes
from devito.mpi import MPI, HaloExchangeBuilder, HaloSchemeEntry
from examples.seismic.acoustic import acoustic_setup

pytestmark = skipif(['yask', 'ops', 'nompi'])


class TestDistributor(object):

    @pytest.mark.parallel(mode=[2, 4])
    def test_partitioning(self):
        """
        Check the local shape of a Function defined on a 15x15 Grid against
        the shape expected on the calling rank, for 2 and 4 processes.
        """
        grid = Grid(shape=(15, 15))
        f = Function(name='f', grid=grid)

        # Expected local shapes: nprocs -> [(rank0 shape), (rank1 shape), ...]
        shapes_by_nprocs = {
            2: [(15, 8), (15, 7)],
            4: [(8, 8), (8, 7), (7, 8), (7, 7)],
        }
        dist = grid.distributor
        per_rank = shapes_by_nprocs[dist.nprocs]
        assert f.shape == per_rank[dist.myrank]

    @pytest.mark.parallel(mode=[2, 4])
    def test_partitioning_fewer_dims(self):
Example #13
0
import numpy as np
import pytest

from devito import Operator, norm, Function, Grid, SparseFunction
from devito.logger import info
from examples.seismic import demo_model, Receiver
from examples.seismic.acoustic import acoustic_setup
from examples.seismic.tti import tti_setup
from examples.seismic.viscoacoustic import viscoacoustic_setup
from conftest import skipif

pytestmark = skipif('device-openmp', whole_module=True)

# Keyword arguments describing each model preset, indexed by a short label.
# Every entry names a 'preset'; the layered ones also fix 'nlayers' to 2.
presets = {
    'constant': {'preset': 'constant-isotropic'},
    'layers': {'preset': 'layers-isotropic', 'nlayers': 2},
    'layers-tti': {'preset': 'layers-tti', 'nlayers': 2},
    'layers-viscoacoustic': {'preset': 'layers-viscoacoustic', 'nlayers': 2},
}
Example #14
0
class TestStreaming(object):
    @pytest.mark.parametrize('opt', [
        ('tasking', 'orchestrate'),
        ('tasking', 'orchestrate', {'linearize': True}),
    ])
    def test_tasking_in_isolation(self, opt):
        """
        Tasking with one saved TimeFunction (`u`) fed by the temporary `tmp`.

        The generated code must contain five iteration trees, one Lock and
        three Sections; after running the Operator, the last time slice of
        `u` must be 8 everywhere.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp = Function(name='tmp', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid)

        op = Operator([Eq(tmp, v),
                       Eq(v.forward, v + 1),
                       Eq(u.forward, tmp, subdomain=bundle0)],
                      opt=opt)

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 5
        nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
        assert nlocks == 1
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 3
        waitlock = sections[0].body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[0] == 0);'
        task = sections[2].body[0].body[0].body
        assert (str(task[1].condition) == 'Ne(lock0[0], 2) | '
                'Ne(FieldFromComposite(flag, sdata0[wi0], ()), 1)')
        # Statements expected at positions 2, 3 and 4 of the task body
        expected = ['sdata0[wi0].time = time;',
                    'lock0[0] = 0;',
                    'sdata0[wi0].flag = 2;']
        for pos, text in enumerate(expected, start=2):
            assert str(task[pos]) == text

        op.apply(time_M=nt - 2)

        assert np.all(u.data[nt - 1] == 8)

    def test_tasking_fused(self):
        """
        Tasking plus the `fuse` pass, with `u` and `v` both saved from the
        same temporary `tmp`.

        A single Lock is expected, and the writes to `u` and `v` must both
        show up in the `copy_device_to_host0` entry of the function table.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp = Function(name='tmp', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        w = TimeFunction(name='w', grid=grid)

        op = Operator([Eq(w.forward, w + 1),
                       Eq(tmp, w.forward),
                       Eq(u.forward, tmp, subdomain=bundle0),
                       Eq(v.forward, tmp, subdomain=bundle0)],
                      opt=('tasking', 'fuse', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 5
        # Only 1 Lock because it's only `tmp` that needs protection
        nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
        assert nlocks == 1
        assert len(op._func_table) == 2
        callee_root = op._func_table['copy_device_to_host0'].root
        exprs = FindNodes(Expression).visit(callee_root)
        assert len(exprs) == 20
        # Statements expected at positions 12..15
        head = ['int id = sdata0->id;',
                'int deviceid = sdata0->deviceid;',
                'const int time = sdata0->time;',
                'lock0[0] = 1;']
        for pos, text in enumerate(head, start=12):
            assert str(exprs[pos]) == text
        assert exprs[16].write is u
        assert exprs[17].write is v
        # Statements expected at positions 18 and 19
        tail = ['lock0[0] = 2;', 'sdata0->flag = 1;']
        for pos, text in enumerate(tail, start=18):
            assert str(exprs[pos]) == text

        op.apply(time_M=nt - 2)

        for saved in (u, v):
            assert np.all(saved.data[nt - 1] == 9)

    def test_tasking_unfused_two_locks(self):
        """
        Tasking with two distinct temporaries, `tmp0` and `tmp1`, saved into
        `u` and `v` respectively.

        Two Locks are expected, each handled by its own task Section and its
        own `copy_device_to_host` entry in the function table.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp0 = Function(name='tmp0', grid=grid)
        tmp1 = Function(name='tmp1', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        w = TimeFunction(name='w', grid=grid)

        op = Operator([Eq(w.forward, w + 1),
                       Eq(tmp0, w.forward),
                       Eq(tmp1, w.forward),
                       Eq(u.forward, tmp0, subdomain=bundle0),
                       Eq(v.forward, tmp1, subdomain=bundle0)],
                      opt=('tasking', 'fuse', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 7
        nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
        assert nlocks == 2
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 4

        # Wait-lock
        waitlock = sections[1].body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[0] == 0 || lock1[0] == 0);'

        # Task bound to `lock0`
        task = sections[2].body[0].body[0].body
        # Wait-thread
        assert (str(task[1].condition) == 'Ne(lock0[0], 2) | '
                'Ne(FieldFromComposite(flag, sdata0[wi0], ()), 1)')
        assert str(task[1].body[0]) == 'wi0 = (wi0 + 1)%(npthreads0);'
        assert str(task[2]) == 'sdata0[wi0].time = time;'
        # Set-lock
        assert str(task[3]) == 'lock0[0] = 0;'
        assert str(task[4]) == 'sdata0[wi0].flag = 2;'

        # Task bound to `lock1`
        task = sections[3].body[0].body[0].body
        # Wait-thread
        assert (str(task[1].condition) == 'Ne(lock1[0], 2) | '
                'Ne(FieldFromComposite(flag, sdata1[wi1], ()), 1)')
        assert str(task[1].body[0]) == 'wi1 = (wi1 + 1)%(npthreads1);'
        assert str(task[2]) == 'sdata1[wi1].time = time;'
        # Set-lock
        assert str(task[3]) == 'lock1[0] = 0;'
        assert str(task[4]) == 'sdata1[wi1].flag = 2;'

        assert len(op._func_table) == 4
        callee_root = op._func_table['copy_device_to_host0'].root
        exprs = FindNodes(Expression).visit(callee_root)
        assert len(exprs) == 19
        assert str(exprs[15]) == 'lock0[0] = 1;'
        assert exprs[16].write is u
        callee_root = op._func_table['copy_device_to_host1'].root
        exprs = FindNodes(Expression).visit(callee_root)
        assert str(exprs[15]) == 'lock1[0] = 1;'
        assert exprs[16].write is v

        op.apply(time_M=nt - 2)

        for saved in (u, v):
            assert np.all(saved.data[nt - 1] == 9)

    def test_tasking_forcefuse(self):
        """
        Same equations as the two-temporaries case, but built with the
        `'fuse-tasks': True` option.

        Two Locks are still expected, but only three Sections and two
        function-table entries: a single task sets both locks and performs
        the writes to both `u` and `v`.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp0 = Function(name='tmp0', grid=grid)
        tmp1 = Function(name='tmp1', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        w = TimeFunction(name='w', grid=grid)

        opt = ('tasking', 'fuse', 'orchestrate', {'fuse-tasks': True})
        op = Operator([Eq(w.forward, w + 1),
                       Eq(tmp0, w.forward),
                       Eq(tmp1, w.forward),
                       Eq(u.forward, tmp0, subdomain=bundle0),
                       Eq(v.forward, tmp1, subdomain=bundle0)],
                      opt=opt)

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 5
        nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
        assert nlocks == 2
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 3

        # Wait-lock
        waitlock = sections[1].body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[0] == 0 || lock1[0] == 0);'

        task = sections[2].body[0].body[0].body
        # Wait-thread
        assert (str(task[1].condition) == 'Ne(lock0[0], 2) | '
                'Ne(lock1[0], 2) | '
                'Ne(FieldFromComposite(flag, sdata0[wi0], ()), 1)')
        assert str(task[1].body[0]) == 'wi0 = (wi0 + 1)%(npthreads0);'
        assert str(task[2]) == 'sdata0[wi0].time = time;'
        # Set-lock (one statement per lock)
        assert str(task[3]) == 'lock0[0] = 0;'
        assert str(task[4]) == 'lock1[0] = 0;'
        assert str(task[5]) == 'sdata0[wi0].flag = 2;'

        assert len(op._func_table) == 2
        callee_root = op._func_table['copy_device_to_host0'].root
        exprs = FindNodes(Expression).visit(callee_root)
        assert len(exprs) == 22
        assert str(exprs[15]) == 'lock0[0] = 1;'
        assert str(exprs[16]) == 'lock1[0] = 1;'
        assert exprs[17].write is u
        assert exprs[18].write is v

        op.apply(time_M=nt - 2)

        for saved in (u, v):
            assert np.all(saved.data[nt - 1] == 9)

    @pytest.mark.parametrize('opt', [
        ('tasking', 'orchestrate'),
        ('tasking', 'streaming', 'orchestrate'),
    ])
    def test_attempt_tasking_but_no_temporaries(self, opt):
        """
        Request tasking for an Operator whose only equation writes the saved
        TimeFunction `u`, with no temporaries involved.

        With `opt` alone, no OmpIteration is expected in the generated code.
        Once `'par-disabled': False` is added, exactly one is expected, and
        its type must be exactly OmpIteration.
        """
        grid = Grid(shape=(10, 10, 10))

        u = TimeFunction(name='u', grid=grid, save=10)

        op = Operator(Eq(u.forward, u + 1), opt=opt)

        piters = FindNodes(OmpIteration).visit(op)
        assert len(piters) == 0

        # NOTE(review): `opt` is itself a tuple, so a nested tuple is passed
        # here -- confirm that Operator accepts/flattens this form
        op = Operator(Eq(u.forward, u + 1), opt=(opt, {'par-disabled': False}))

        # Degenerates to host execution with no data movement, since `u` is
        # a host Function
        piters = FindNodes(OmpIteration).visit(op)
        assert len(piters) == 1
        # Exact-type check (an instance of a subclass would not pass); types
        # are compared by identity rather than with `==`
        assert type(piters.pop()) is OmpIteration

    def test_tasking_multi_output(self):
        """
        Compare an Operator built with `('noop', {'gpu-fit': usave})` against
        one built with tasking, on an equation that saves a combination of
        several time slices of `u` into `usave`.

        Besides the checks on the generated code of the tasking Operator,
        both Operators must produce identical `u` and `usave` data.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)
        t = grid.stepping_dim
        x, y, z = grid.dimensions

        u = TimeFunction(name='u', grid=grid, time_order=2)
        u1 = TimeFunction(name='u', grid=grid, time_order=2)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        usave1 = TimeFunction(name='usave', grid=grid, save=nt)

        saved_expr = u.forward + u + u.backward + u[t, x - 1, y, z]
        eqns = [Eq(u.forward, u + 1),
                Eq(usave, saved_expr, subdomain=bundle0)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=('tasking', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op1)) == 4
        nlocks = sum(1 for s in FindSymbols().visit(op1)
                     if isinstance(s, Lock))
        assert nlocks == 1
        sections = FindNodes(Section).visit(op1)
        assert len(sections) == 2
        waitlock = sections[0].body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[t2] == 0);'
        # Set-lock: three consecutive statements, at positions 6, 7 and 8
        for pos in (6, 7, 8):
            assert 'lock0[t' in str(sections[1].body[0].body[0].body[pos])
        setflag = sections[1].body[0].body[0].body[9]
        assert str(setflag) == 'sdata0[wi0].flag = 2;'
        assert len(op1._func_table) == 2
        callee_root = op1._func_table['copy_device_to_host0'].root
        exprs = FindNodes(Expression).visit(callee_root)
        assert len(exprs) == 26
        for pos in (18, 19, 20):
            assert 'lock0[t' in str(exprs[pos])
        assert exprs[21].write is usave

        op0.apply(time_M=nt - 2)
        op1.apply(time_M=nt - 2, u=u1, usave=usave1)

        assert np.all(u.data[:] == u1.data[:])
        assert np.all(usave.data[:] == usave1.data[:])

    def test_tasking_lock_placement(self):
        """
        Check where the wait-lock lands in the generated code: the first
        Section must start with an Iteration, while the wait-lock is expected
        in the second Section.
        """
        grid = Grid(shape=(10, 10, 10))

        f = Function(name='f', grid=grid, space_order=2)
        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=10)

        op = Operator([Eq(f, u + 1),
                       Eq(u.forward, f.dx + u + 1),
                       Eq(usave, u)],
                      opt=('tasking', 'orchestrate'))

        # Check generated code -- the wait-lock is expected in section1
        assert len(retrieve_iteration_tree(op)) == 5
        nlocks = sum(1 for s in FindSymbols().visit(op) if isinstance(s, Lock))
        assert nlocks == 1
        sections = FindNodes(Section).visit(op)
        assert len(sections) == 3
        section0, section1 = sections[0], sections[1]
        assert section0.body[0].body[0].body[0].is_Iteration
        waitlock = section1.body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[t1] == 0);'

    @pytest.mark.parametrize('opt,ntmps', [
        pytest.param(('streaming', 'orchestrate'), 0,
                     marks=skipif('device-openmp')),
        (('buffering', 'streaming', 'orchestrate'), 1),
        (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 1),
    ])
    def test_streaming_basic(self, opt, ntmps):
        """
        Accumulate the saved TimeFunction `usave` into `u` under the given
        streaming `opt`.

        `ntmps` is the expected number of Array symbols in the generated
        code. Time slice `i` of `usave` is filled with the value `i`.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=nt)

        for step in range(nt):
            usave.data[step, :] = step

        op = Operator(Eq(u.forward, u + usave), opt=opt)

        # Check generated code
        assert len(op._func_table) == 3
        narrays = sum(1 for s in FindSymbols().visit(op) if s.is_Array)
        assert narrays == ntmps

        op.apply(time_M=nt - 2)

        # Partial sums of 0, 1, ..., 8: the two buffers of `u` hold
        # 0+...+7 = 28 and 0+...+8 = 36
        assert np.all(u.data[0] == 28)
        assert np.all(u.data[1] == 36)

    @pytest.mark.parametrize('opt,ntmps,nfuncs', [
        pytest.param(('streaming', 'orchestrate'), 0, 3,
                     marks=skipif('device-openmp')),
        (('buffering', 'streaming', 'orchestrate'), 2, 6),
        (('buffering', 'streaming', 'fuse', 'orchestrate'), 2, 3),
    ])
    def test_streaming_two_buffers(self, opt, ntmps, nfuncs):
        """
        Accumulate two saved TimeFunctions, `usave` and `vsave`, into `u`
        under the given streaming `opt`.

        `ntmps` is the expected number of Array symbols and `nfuncs` the
        expected size of the function table. Time slice `i` of both saved
        TimeFunctions is filled with the value `i`.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        vsave = TimeFunction(name='vsave', grid=grid, save=nt)

        for step in range(nt):
            usave.data[step, :] = step
            vsave.data[step, :] = step

        op = Operator(Eq(u.forward, u + usave + vsave), opt=opt)

        # Check generated code
        assert len(op._func_table) == nfuncs
        narrays = sum(1 for s in FindSymbols().visit(op) if s.is_Array)
        assert narrays == ntmps

        op.apply(time_M=nt - 2)

        # Twice the partial sums of 0, 1, ..., 8
        assert np.all(u.data[0] == 56)
        assert np.all(u.data[1] == 72)

    @pytest.mark.parametrize('opt', [
        pytest.param(('streaming', 'orchestrate'),
                     marks=skipif('device-openmp')),
        ('buffering', 'streaming', 'orchestrate'),
    ])
    def test_streaming_conddim_forward(self, opt):
        """
        Forward-in-time streaming of `usave`, which is defined over a
        ConditionalDimension with factor 2 on top of the time Dimension.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        factor = Constant(name='factor', value=2, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=grid.time_dim,
                                        factor=factor)

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)

        for step in range(usave.save):
            usave.data[step, :] = step

        op = Operator(Eq(u.forward, u.forward + u + usave), opt=opt)

        # TODO: we are *not* using the last entry of usave, so we gotta ensure
        # it is *not* streamed on to the device (thus avoiding dangerous leaks).
        # But how can we explicitly check this?
        op.apply(time_M=6)

        # We entered the eq four times (at time=0,2,4,6)
        # Since factor=2, we *only* write to u.data[(time+1)%2]=u.data[1]
        assert np.all(u.data[0] == 0)
        # 1st time u[1] = u[0]+u[1]+usave[0] = 0+0+0 = 0
        # 2nd time u[1] = u[0]+u[1]+usave[1] = 0+0+1 = 1
        # 3rd time u[1] = u[0]+u[1]+usave[2] = 0+1+2 = 3
        # 4th time u[1] = u[0]+u[1]+usave[3] = 0+3+3 = 6
        assert np.all(u.data[1] == 6)

    @pytest.mark.parametrize('opt', [
        pytest.param(('streaming', 'orchestrate'),
                     marks=skipif('device-openmp')),
        ('buffering', 'streaming', 'orchestrate'),
    ])
    def test_streaming_conddim_backward(self, opt):
        """
        Backward-in-time streaming of `usave`, which is defined over a
        ConditionalDimension with factor 2 on top of the time Dimension.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        factor = Constant(name='factor', value=2, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=grid.time_dim,
                                        factor=factor)

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)

        for step in range(usave.save):
            usave.data[step, :] = step

        op = Operator(Eq(u.backward, u.backward + u + usave), opt=opt)

        # TODO: we are *not* using the first two entries of usave, so we gotta
        # ensure they are *not* streamed on to the device (thus avoiding
        # dangerous leaks). But how can we explicitly check this?
        op.apply(time_m=4, time_M=nt - 2)

        # We entered the eq three times (at time=8,6,4)
        # Since factor=2, we *only* write to u.data[(time-1)%2]=u.data[1]
        assert np.all(u.data[0] == 0)
        # 1st time u[1] = u[0]+u[1]+usave[4] = 0+0+4 = 4
        # 2nd time u[1] = u[0]+u[1]+usave[3] = 0+4+3 = 7
        # 3rd time u[1] = u[0]+u[1]+usave[2] = 0+7+2 = 9
        assert np.all(u.data[1] == 9)

    @pytest.mark.parametrize('opt,ntmps', [
        pytest.param(('streaming', 'orchestrate'), 0,
                     marks=skipif('device-openmp')),
        (('buffering', 'streaming', 'orchestrate'), 1),
    ])
    def test_streaming_multi_input(self, opt, ntmps):
        """
        Stream a saved TimeFunction that is read through its second time
        derivative (`u.dt2`).

        The result in `grad` is compared against an Operator built with
        `('noop', {'gpu-fit': u})`. `ntmps` is the expected number of Array
        symbols in the generated code of the streaming Operator.
        """
        nt = 100
        grid = Grid(shape=(10, 10))

        u = TimeFunction(name='u', grid=grid, save=nt,
                         time_order=2, space_order=2)
        v = TimeFunction(name='v', grid=grid, save=None,
                         time_order=2, space_order=2)
        grad = Function(name='grad', grid=grid)
        grad1 = Function(name='grad', grid=grid)

        v.data[:] = 0.02
        for step in range(nt):
            u.data[step, :] = step + 0.1

        eqn = Eq(grad, grad - u.dt2 * v)

        op0 = Operator(eqn, opt=('noop', {'gpu-fit': u}))
        op1 = Operator(eqn, opt=opt)

        # Check generated code
        assert len(op1._func_table) == 3
        narrays = sum(1 for s in FindSymbols().visit(op1) if s.is_Array)
        assert narrays == ntmps

        op0.apply(time_M=nt - 2, dt=0.1)
        op1.apply(time_M=nt - 2, dt=0.1, grad=grad1)

        assert np.all(grad.data == grad1.data)

    @pytest.mark.parametrize('opt,ntmps', [
        pytest.param(('streaming', 'orchestrate'), 0,
                     marks=skipif('device-openmp')),
        (('buffering', 'streaming', 'orchestrate'), 1),
    ])
    def test_streaming_postponed_deletion(self, opt, ntmps):
        """
        Stream `usave` into two consecutive equations, the second of which
        also reads a derivative of the output of the first.

        The results in `u` and `v` are compared against an Operator built
        with `('noop', {'gpu-fit': usave})`. `ntmps` is the expected number
        of Array symbols in the generated code of the streaming Operator.
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))

        u = TimeFunction(name='u', grid=grid)
        v = TimeFunction(name='v', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid)
        v1 = TimeFunction(name='v', grid=grid)

        for step in range(nt):
            usave.data[step, :] = step

        eqns = [Eq(u.forward, u + usave),
                Eq(v.forward, v + u.forward.dx + usave)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=opt)

        # Check generated code
        assert len(op1._func_table) == 3
        narrays = sum(1 for s in FindSymbols().visit(op1) if s.is_Array)
        assert narrays == ntmps

        op0.apply(time_M=nt - 1)
        op1.apply(time_M=nt - 1, u=u1, v=v1)

        assert np.all(u.data == u1.data)
        assert np.all(v.data == v1.data)

    def test_streaming_with_host_loop(self):
        """
        Build (without running) a streaming Operator over `f = u` and
        `u.forward = f + 1`, and check which entries end up in its
        function table.
        """
        grid = Grid(shape=(10, 10, 10))

        f = Function(name='f', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=10)

        op = Operator([Eq(f, u), Eq(u.forward, f + 1)],
                      opt=('streaming', 'orchestrate'))

        efuncs = op._func_table
        assert len(efuncs) == 3
        for name in ('init_device0', 'prefetch_host_to_device0'):
            assert name in efuncs

    # TODO: Still unsupported with OpenMP, but soon will be
    @skipif('device-openmp')
    def test_composite_streaming_tasking(self):
        """
        One saved field (`fsave`) is read and another (`usave`) is written
        within the same Operator, built with 'tasking' and 'streaming'.
        The generated code structure is inspected, then results are compared
        against a 'noop' Operator.
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))
        saved = {'grid': grid, 'save': nt}

        u = TimeFunction(name='u', grid=grid)
        u1 = TimeFunction(name='u', grid=grid)
        fsave = TimeFunction(name='fsave', **saved)
        usave = TimeFunction(name='usave', **saved)
        usave1 = TimeFunction(name='usave', **saved)

        for step in range(nt):
            fsave.data[step, :] = step

        eqns = [Eq(u.forward, u + fsave + 1),
                Eq(usave, u)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': (fsave, usave)}))
        op1 = Operator(eqns, opt=('tasking', 'streaming', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op0)) == 1
        assert len(retrieve_iteration_tree(op1)) == 4
        symbols = FindSymbols().visit(op1)
        locks = [s for s in symbols if isinstance(s, Lock)]
        assert len(locks) == 1
        threads = [s for s in symbols if isinstance(s, PThreadArray)]
        assert len(threads) == 2
        first, second = threads
        assert first.size == 1
        assert second.size.size == 2

        op0.apply(time_M=nt - 1)
        op1.apply(time_M=nt - 1, u=u1, usave=usave1)

        for expected, found in [(u, u1), (usave, usave1)]:
            assert np.all(expected.data == found.data)

    def test_composite_buffering_tasking(self):
        """
        Save `u.dt2` into `usave` over the `bundle0` subdomain with an
        Operator built with 'buffering' and 'tasking'; inspect the generated
        code, then compare the results against a 'noop' Operator.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

        u, u1 = [TimeFunction(name='u', grid=grid, time_order=2)
                 for _ in range(2)]
        usave, usave1 = [TimeFunction(name='usave', grid=grid, save=nt)
                         for _ in range(2)]

        eqns = [Eq(u.forward, u * 1.1 + 1),
                Eq(usave, u.dt2, subdomain=bundle0)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=('buffering', 'tasking', 'orchestrate'))

        # Check generated code -- thanks to buffering only expect 1 lock!
        assert len(retrieve_iteration_tree(op0)) == 2
        assert len(retrieve_iteration_tree(op1)) == 5
        symbols = FindSymbols().visit(op1)
        locks = [s for s in symbols if isinstance(s, Lock)]
        assert len(locks) == 1
        threads = [s for s in symbols if isinstance(s, PThreadArray)]
        assert len(threads) == 1
        assert threads[0].size.size == 1

        run_args = {'time_M': nt - 1, 'dt': 0.1}
        op0.apply(**run_args)
        op1.apply(u=u1, usave=usave1, **run_args)

        for expected, found in [(u, u1), (usave, usave1)]:
            assert np.all(expected.data == found.data)

    def test_composite_buffering_tasking_multi_output(self):
        """
        Same idea as the single-output buffering+tasking test, but with two
        saved outputs (`usave`, `vsave`) written over the `bundle0` subdomain.
        The generated code structure is inspected and the results are
        compared against a 'noop' Operator.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

        def make_fields():
            # Fresh (u, v, usave, vsave), created in this exact order
            return (TimeFunction(name='u', grid=grid, time_order=2),
                    TimeFunction(name='v', grid=grid, time_order=2),
                    TimeFunction(name='usave', grid=grid, save=nt),
                    TimeFunction(name='vsave', grid=grid, save=nt))

        u, v, usave, vsave = make_fields()
        u1, v1, usave1, vsave1 = make_fields()

        eqns = [Eq(u.forward, u + 1),
                Eq(v.forward, v + 1),
                Eq(usave, u, subdomain=bundle0),
                Eq(vsave, v, subdomain=bundle0)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': (usave, vsave)}))
        op1 = Operator(eqns,
                       opt=('buffering', 'tasking', 'topofuse', 'orchestrate'))

        # Check generated code -- one lock and one thread array per output
        assert len(retrieve_iteration_tree(op0)) == 2
        assert len(retrieve_iteration_tree(op1)) == 7
        symbols = FindSymbols().visit(op1)
        locks = [s for s in symbols if isinstance(s, Lock)]
        assert len(locks) == 2
        threads = [s for s in symbols if isinstance(s, PThreadArray)]
        assert len(threads) == 2
        for pthreads in threads:
            assert pthreads.size.size == 1
        # usave and vsave eqns are in two diff efuncs
        assert len(op1._func_table) == 4

        op0.apply(time_M=nt - 1)
        op1.apply(time_M=nt - 1, u=u1, v=v1, usave=usave1, vsave=vsave1)

        pairs = [(u, u1), (v, v1), (usave, usave1), (vsave, vsave1)]
        for expected, found in pairs:
            assert np.all(expected.data == found.data)

    @pytest.mark.parametrize('opt', [
        ('buffering', 'tasking', 'streaming', 'orchestrate'),
        ('buffering', 'tasking', 'streaming', 'orchestrate',
         {'linearize': True}),
    ])
    def test_composite_full(self, opt):
        """
        Both `u` and `v` are saved TimeFunctions that are read and written by
        the same pair of equations. The Operator built with `opt` is checked
        structurally and then against a 'noop' Operator.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u, v, u1, v1 = [TimeFunction(name=n, grid=grid, save=nt)
                        for n in ('u', 'v', 'u', 'v')]

        # Both `u` and its twin `u1` start from the same data
        for step in range(nt):
            for func in (u, u1):
                func.data[step, :] = step

        eqns = [Eq(u.forward, u + v + 1),
                Eq(v.forward, u + v + v.backward)]

        op0 = Operator(eqns, opt=('noop', {'gpu-fit': (u, v)}))
        op1 = Operator(eqns, opt=opt)

        # Check generated code
        assert len(retrieve_iteration_tree(op1)) == 7
        locks = [s for s in FindSymbols().visit(op1) if isinstance(s, Lock)]
        assert len(locks) == 2

        op0.apply(time_M=nt - 2)
        op1.apply(time_M=nt - 2, u=u1, v=v1)

        for expected, found in [(u, u1), (v, v1)]:
            assert np.all(expected.data == found.data)

    def test_tasking_over_compiler_generated(self):
        """
        Combine 'cire-sops' with 'tasking', in both orders, and check that
        the two Operators have the same generated code structure. Only the
        first of them (`op1`) is run and compared against the reference.
        """
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(4, 4, 4), subdomains=bundle0)

        u, u1 = [TimeFunction(name='u', grid=grid, space_order=4)
                 for _ in range(2)]
        usave, usave1 = [TimeFunction(name='usave', grid=grid, save=nt)
                         for _ in range(2)]

        eqns = [Eq(u.forward, u.dx.dx * 0.042 + 1),
                Eq(usave, u, subdomain=bundle0)]

        op0 = Operator(eqns, opt=('cire-sops', {'gpu-fit': usave}))
        op1 = Operator(eqns, opt=('cire-sops', 'tasking', 'orchestrate'))
        op2 = Operator(eqns, opt=('tasking', 'cire-sops', 'orchestrate'))

        # Check generated code
        for op in (op1, op2):
            assert len(retrieve_iteration_tree(op)) == 5
            locks = [s for s in FindSymbols().visit(op)
                     if isinstance(s, Lock)]
            assert len(locks) == 1
            sections = FindNodes(Section).visit(op)
            assert len(sections) == 3
            innermost = sections[1].body[0].body[0].body[0]
            assert 'while(lock0[t1] == 0)' in str(innermost)

        op0.apply(time_M=nt - 1)
        op1.apply(time_M=nt - 1, u=u1, usave=usave1)

        for expected, found in [(u, u1), (usave, usave1)]:
            assert np.all(expected.data == found.data)

    @pytest.mark.parametrize('opt,gpu_fit,async_degree,linearize', [
        (('tasking', 'orchestrate'), True, None, False),
        (('buffering', 'tasking', 'orchestrate'), True, None, False),
        (('buffering', 'tasking', 'orchestrate'), False, None, False),
        (('buffering', 'tasking', 'orchestrate'), False, 3, False),
        (('buffering', 'tasking', 'orchestrate'), False, 3, True),
    ])
    def test_save(self, opt, gpu_fit, async_degree, linearize):
        """
        Save `u.forward` into `usave` every `factor` time steps and check the
        saved values.

        `gpu_fit` selects whether `usave` is passed as the 'gpu-fit' option;
        `async_degree` and `linearize` are forwarded as the
        'buf-async-degree' and 'linearize' options respectively.
        """
        nt = 10
        grid = Grid(shape=(300, 300, 300))

        factor = Constant(name='factor', value=2, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=grid.time_dim,
                                        factor=factor)

        u = TimeFunction(name='u', grid=grid)
        # For the given `nt` and grid shape, `usave` is roughly
        # 4*5*300**3=~ .5GB of data
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)

        eqns = [Eq(u.forward, u + 1), Eq(usave, u.forward)]
        options = {
            'gpu-fit': usave if gpu_fit else None,
            'buf-async-degree': async_degree,
            'linearize': linearize,
        }
        op = Operator(eqns, opt=(opt, options))

        op.apply(time_M=nt - 1)

        # The i-th saved slice must hold the constant value 2*i + 1
        for i in range(usave.save):
            assert np.all(usave.data[i] == 2 * i + 1)

    def test_save_multi_output(self):
        """
        Save `u.forward` into two subsampled fields (`usave`, `vsave`) within
        the same Operator and check the function table size as well as the
        saved values.
        """
        nt = 10
        grid = Grid(shape=(150, 150, 150))

        factor = Constant(name='factor', value=2, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=grid.time_dim,
                                        factor=factor)

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)
        vsave = TimeFunction(name='vsave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)

        eqns = [Eq(u.forward, u + 1),
                Eq(usave, u.forward),
                Eq(vsave, u.forward)]

        op = Operator(eqns,
                      opt=('buffering', 'tasking', 'topofuse', 'orchestrate'))

        # Check generated code
        # usave and vsave eqns are in separate tasks
        assert len(op._func_table) == 4

        op.apply(time_M=nt - 1)

        # The i-th saved slice must hold the constant value 2*i + 1
        for i in range(usave.save):
            assert np.all(usave.data[i] == 2 * i + 1)
        for i in range(vsave.save):
            assert np.all(vsave.data[i] == 2 * i + 1)

    @pytest.mark.parametrize('opt', [
        ('buffering', 'tasking', 'orchestrate'),
        ('buffering', 'tasking', 'orchestrate', {'linearize': True}),
    ])
    def test_save_w_shifting(self, opt):
        """
        Save `u` into a two-slot `usave` whose subsampled time index is
        shifted by the runtime constant `save_shift`.
        """
        factor = 4
        nt = 19
        grid = Grid(shape=(11, 11))

        time_subsampled = ConditionalDimension('t_sub',
                                               parent=grid.time_dim,
                                               factor=factor)

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=2,
                             time_dim=time_subsampled)

        save_shift = Constant(name='save_shift', dtype=np.int32)

        shifted = usave.subs(time_subsampled, time_subsampled - save_shift)
        eqns = [Eq(u.forward, u + 1.),
                Eq(shifted, u)]

        op = Operator(eqns, opt=opt)

        # Starting at time_m=10, so time_subsampled - save_shift is in range
        op.apply(time_m=10, time_M=nt - 2, save_shift=3)

        assert np.allclose(u.data[0], 8)
        for i in range(2):
            assert np.allclose(usave.data[i], 2 + i * factor)

    def test_save_w_nonaffine_time(self):
        """
        Like the shifted-save test, but `u` is also read through the indirect
        indices `f[x, x]` and `f[y, y]`. The Operator is only built, not run.
        """
        factor = 4
        grid = Grid(shape=(11, 11))
        x, y = grid.dimensions
        t = grid.stepping_dim

        time_subsampled = ConditionalDimension('t_sub',
                                               parent=grid.time_dim,
                                               factor=factor)

        f = Function(name='f', grid=grid, dtype=np.int32)
        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=2,
                             time_dim=time_subsampled)

        save_shift = Constant(name='save_shift', dtype=np.int32)

        shifted = usave.subs(time_subsampled, time_subsampled - save_shift)
        eqns = [Eq(u.forward, u[t, f[x, x], f[y, y]] + 1.),
                Eq(shifted, u)]

        op = Operator(eqns, opt=('buffering', 'tasking', 'orchestrate'))

        # We just check the generated code here
        locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
        assert len(locks) == 1
        assert len(op._func_table) == 2

    def test_save_w_subdims(self):
        """
        Save `u.forward` into `usave` over the middle SubDimensions only
        (thickness 3 on every side); everything outside that region in
        `usave` must remain zero.
        """
        nt = 10
        thickness = {'thickness_left': 3, 'thickness_right': 3}

        grid = Grid(shape=(10, 10))
        x, y = grid.dimensions
        xi = SubDimension.middle(name='xi', parent=x, **thickness)
        yi = SubDimension.middle(name='yi', parent=y, **thickness)

        factor = Constant(name='factor', value=2, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=grid.time_dim,
                                        factor=factor)

        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=int(nt // factor.data), time_dim=time_sub)

        # Restrict both equations to the (xi, yi) region
        to_subdims = {x: xi, y: yi}
        eqns = [eq.xreplace(to_subdims)
                for eq in (Eq(u.forward, u + 1), Eq(usave, u.forward))]

        op = Operator(eqns, opt=('buffering', 'tasking', 'orchestrate'))

        op.apply(time_M=nt - 1)

        interior = np.s_[3:-3, 3:-3]
        borders = [np.s_[:3, :], np.s_[-3:, :], np.s_[:, :3], np.s_[:, -3:]]
        for i in range(usave.save):
            assert np.all(usave.data[(i, *interior)] == 2 * i + 1)
            for border in borders:
                assert np.all(usave.data[(i, *border)] == 0)

    @pytest.mark.parametrize('opt,ntmps', [
        pytest.param(('streaming', 'orchestrate'), 0,
                     marks=skipif('device-openmp')),
        pytest.param(('streaming', 'orchestrate', {'linearize': True}), 0,
                     marks=skipif('device-openmp')),
        (('buffering', 'streaming', 'orchestrate'), 1),
        (('buffering', 'streaming', 'orchestrate', {'linearize': True}), 1),
    ])
    def test_streaming_w_shifting(self, opt, ntmps):
        """
        Accumulate into `u` the slices of a subsampled `usave`, read at a
        time index shifted by the runtime constant `save_shift`.

        `ntmps` is the number of Array symbols expected in the generated code.
        """
        nt = 50
        grid = Grid(shape=(5, 5))

        factor = Constant(name='factor', value=5, dtype=np.int32)
        t_sub = ConditionalDimension('t_sub', parent=grid.time_dim,
                                     factor=factor)
        save_shift = Constant(name='save_shift', dtype=np.int32)

        u = TimeFunction(name='u', grid=grid, time_order=0)
        usave = TimeFunction(name='usave', grid=grid, time_order=0,
                             save=(int(nt // factor.data)), time_dim=t_sub)

        # The i-th slice of `usave` holds the constant value `i`
        for step in range(usave.save):
            usave.data[step, :] = step

        eqns = Eq(u.forward, u + usave.subs(t_sub, t_sub - save_shift))

        op = Operator(eqns, opt=opt)

        # Check generated code
        assert len(op._func_table) == 3
        arrays = [s for s in FindSymbols().visit(op) if s.is_Array]
        assert len(arrays) == ntmps

        # From time_m=15 to time_M=35 with a factor=5 -- it means that, thanks
        # to t_sub, we enter the Eq exactly (35-15)/5 + 1 = 5 times. We set
        # save_shift=1 so instead of accessing the range usave[15/5:35/5+1],
        # we rather access the range usave[15/5-1:35/5], which means accessing
        # the usave values 2, 3, 4, 5, 6 (whose sum is 20).
        window = {'time_m': 15, 'time_M': 35}
        op.apply(save_shift=1, **window)
        assert np.allclose(u.data, 20)

        # Again, but with a different shift: the values 5, 6, 7, 8, 9 (sum 35)
        # are added on top of what is already in `u`
        op.apply(save_shift=-2, **window)
        assert np.allclose(u.data, 20 + 35)

    def test_streaming_complete(self):
        """
        Stream two subsampled fields (`va`, read at three shifted time
        indices, and `vb`) into an update of `u` restricted to the middle
        SubDimensions. Two optimized Operators (without and with 'fuse') are
        checked structurally and against a 'noop' Operator.
        """
        nt = 50
        thickness = {'thickness_left': 2, 'thickness_right': 2}

        grid = Grid(shape=(6, 6))
        x, y = grid.dimensions
        xi = SubDimension.middle(name='xi', parent=x, **thickness)
        yi = SubDimension.middle(name='yi', parent=y, **thickness)

        factor = Constant(name='factor', value=5, dtype=np.int32)
        t_sub = ConditionalDimension('t_sub', parent=grid.time_dim,
                                     factor=factor)
        save_shift = Constant(name='save_shift', dtype=np.int32)

        u, u1, u2 = [TimeFunction(name='u', grid=grid, time_order=0)
                     for _ in range(3)]
        va = TimeFunction(name='va', grid=grid, time_order=0,
                          save=(int(nt // factor.data)), time_dim=t_sub)
        vb = TimeFunction(name='vb', grid=grid, time_order=0,
                          save=(int(nt // factor.data)), time_dim=t_sub)

        for step in range(va.save):
            va.data[step, :] = step
            vb.data[step, :] = step * 2 - 1

        # `va` at the current, previous and next (shifted) subsampled index
        vas = va.subs(t_sub, t_sub - save_shift)
        vasb = va.subs(t_sub, t_sub - 1 - save_shift)
        vasf = va.subs(t_sub, t_sub + 1 - save_shift)

        update = Eq(u.forward, u + (vasb + vas + vasf) * 2. + vb)
        eqns = [update.xreplace({x: xi, y: yi})]

        op0 = Operator(eqns, opt='noop')
        op1 = Operator(eqns, opt=('buffering', 'streaming', 'orchestrate'))
        op2 = Operator(eqns,
                       opt=('buffering', 'streaming', 'fuse', 'orchestrate'))

        # Check generated code
        assert len(op1._func_table) == 6
        arrays1 = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays1) == 2
        assert len(op2._func_table) == 4
        arrays2 = [s for s in FindSymbols().visit(op2) if s.is_Array]
        assert len(arrays2) == 2

        run_args = {'time_m': 15, 'time_M': 35, 'save_shift': 0}
        op0.apply(**run_args)
        op1.apply(u=u1, **run_args)
        op2.apply(u=u2, **run_args)

        for found in (u1, u2):
            assert np.all(u.data == found.data)

    def test_streaming_split_noleak(self):
        """
        Make sure the helper pthreads leak no memory in the target language
        runtime.
        """
        nt = 1000
        grid = Grid(shape=(20, 20, 20))

        u = TimeFunction(name='u', grid=grid)
        u1 = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=nt)

        for step in range(nt):
            usave.data[step, :] = step

        eqn = Eq(u.forward, u + usave + usave.backward)

        op0 = Operator(eqn, opt='noop')
        op1 = Operator(eqn, opt=('buffering', 'streaming', 'orchestrate'))

        op0.apply(time_M=nt - 2)

        # We'll call `op1` once per time window, and each call will create
        # and destroy its own pthreads. With at least O(10) calls, this test
        # would be enough to uncover outrageous memory leaks due to leaking
        # resources in the runtime (in the past, we've seen leaks due to
        # pthreads-local pinned memory used for the data transfers)
        first = 1
        chunk = 20
        nfull, rest = divmod(nt, chunk)
        nwindows = nfull + (1 if rest > 0 else 0)
        windows = [(first + k * chunk, min((k + 1) * chunk, nt - 2))
                   for k in range(nwindows)]
        for start, stop in windows:
            op1.apply(time_m=start, time_M=stop, u=u1)

        for slot in (0, 1):
            assert np.all(u.data[slot] == u1.data[slot])

    @pytest.mark.parametrize(
        'opt,opt_options,gpu_fit',
        [(('streaming', 'orchestrate'), {}, True),
         pytest.param(('streaming', 'orchestrate'), {},
                      False,
                      marks=skipif('device-openmp')),
         (('buffering', 'streaming', 'orchestrate'), {}, False),
         (('buffering', 'streaming', 'orchestrate'), {
             'linearize': True
         }, False)])
    def test_xcor_from_saved(self, opt, opt_options, gpu_fit):
        """
        Cross-correlate a backward-propagated field `v` with the subsampled
        saved field `usave`, accumulating the result into `g`.

        `opt` is the tuple of optimization passes, `opt_options` carries
        extra Operator options, and `gpu_fit` selects whether `usave` is
        passed as the 'gpu-fit' option.
        """
        nt = 10
        grid = Grid(shape=(300, 300, 300))
        time_dim = grid.time_dim

        period = 2
        factor = Constant(name='factor', value=period, dtype=np.int32)
        time_sub = ConditionalDimension(name="time_sub",
                                        parent=time_dim,
                                        factor=factor)

        g = Function(name='g', grid=grid)
        v = TimeFunction(name='v', grid=grid)
        usave = TimeFunction(name='usave',
                             grid=grid,
                             time_order=0,
                             save=int(nt // factor.data),
                             time_dim=time_sub)
        # For the given `nt` and grid shape, `usave` is roughly 4*5*300**3=~ .5GB of data

        # The i-th saved slice holds the constant value `i`
        nsaved = int(nt // period)
        for i in range(nsaved):
            usave.data[i, :] = i
        # `v` starts from 2*last + 1, with `last` the index of the last saved
        # slice. This is computed explicitly rather than by reusing the loop
        # variable after the loop has ended
        v.data[:] = (nsaved - 1) * 2 + 1

        opt_options = {'gpu-fit': usave if gpu_fit else None, **opt_options}

        # Assuming nt//period=5, we are computing, over 5 iterations:
        # g = 4*4  [time=8] + 3*3 [time=6] + 2*2 [time=4] + 1*1 [time=2]
        op = Operator([Eq(v.backward, v - 1),
                       Inc(g, usave * (v / 2))],
                      opt=(opt, opt_options))

        op.apply(time_M=nt - 1)

        # 16 + 9 + 4 + 1
        assert np.all(g.data == 30)
Example #15
0
from functools import reduce
from operator import mul

import pytest
import numpy as np
from unittest.mock import patch

from conftest import skipif
from devito import (Grid, Function, TimeFunction, Eq, Operator, configuration,
                    switchconfig)
from devito.data import LEFT

pytestmark = skipif(['yask', 'ops'], whole_module=True)

# All core-specific imports *must* be avoided if `backend != core`, otherwise
# a backend reinitialization would be triggered via `devito/core/.__init__.py`,
# thus invalidating all of the future tests. This is guaranteed by the
# `pytestmark` above
from devito.core.autotuning import options  # noqa


@switchconfig(log_level='DEBUG')
@pytest.mark.parametrize("shape,expected", [
    ((30, 30), 13),
    ((30, 30, 30), 17)
])
def test_at_is_actually_working(shape, expected):
    """
    Check that autotuning is actually running when switched on,
    in both 2D and 3D operators.
    """