Example #1
def exprs(a, b):
    return [
        Expression(DummyEq(a, a + b + 5.)),
        Expression(DummyEq(a, b - a)),
        Expression(DummyEq(a, 4 * (b * a))),
        Expression(DummyEq(a, (6. / b) + (8. * a)))
    ]
Example #2
def exprs(dims):
    a = Array(name='a', shape=(3,), dimensions=(dims["i"],)).indexify()
    b = Array(name='b', shape=(3,), dimensions=(dims["i"],)).indexify()
    return [Expression(DummyEq(a, a + b + 5.)),
            Expression(DummyEq(a, b - a)),
            Expression(DummyEq(a, 4 * (b * a))),
            Expression(DummyEq(a, (6. / b) + (8. * a)))]
Example #3
    def _initialize(iet):
        comm = None

        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                comm = i
                break

        if comm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = Function('omp_get_num_devices')()
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            set_device_num = Call('omp_set_default_device', [rank % ngpus])

            body = [rank_decl, rank_init, ngpus_init, set_device_num]

            init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                        body=body,
                        footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))

            iet = iet._rebuild(body=(init,) + iet.body)

        return iet
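For context, a minimal plain-Python sketch of the round-robin device assignment the setup above encodes (illustrative only, not part of the generated code):

# Each MPI rank binds to one of the node's GPUs in round-robin fashion,
# mirroring the `omp_set_default_device(rank % ngpus)` call built above.
def pick_device(rank, ngpus):
    return rank % ngpus

assert [pick_device(r, 4) for r in range(6)] == [0, 1, 2, 3, 0, 1]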
Example #4
    def _make_copy(self, f, hse, key, swap=False):
        buf_dims = []
        buf_indices = []
        for d in f.dimensions:
            if d not in hse.loc_indices:
                buf_dims.append(Dimension(name='buf_%s' % d.root))
                buf_indices.append(d.root)
        buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype, padding=0)

        f_offsets = []
        f_indices = []
        for d in f.dimensions:
            offset = Symbol(name='o%s' % d.root)
            f_offsets.append(offset)
            f_indices.append(offset +
                             (d.root if d not in hse.loc_indices else 0))

        if swap is False:
            eq = DummyEq(buf[buf_indices], f[f_indices])
            name = 'gather_%s' % key
        else:
            eq = DummyEq(f[f_indices], buf[buf_indices])
            name = 'scatter_%s' % key

        iet = Expression(eq)
        for i, d in reversed(list(zip(buf_indices, buf_dims))):
            # The -1 below is because an Iteration, by default, generates <=
            iet = Iteration(iet,
                            i,
                            d.symbolic_size - 1,
                            properties=(PARALLEL, AFFINE))

        parameters = [buf] + list(buf.shape) + [f] + f_offsets
        return Callable(name, iet, 'void', parameters, ('static', ))
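A NumPy sketch of the gather/scatter semantics the generated Callable is meant to implement (illustrative only; the `loc_indices` special-casing is glossed over and the names are hypothetical):

import numpy as np

def gather(f, offsets, shape):
    # Copy the convex region f[o0:o0+n0, o1:o1+n1, ...] into a contiguous buffer
    slices = tuple(slice(o, o + n) for o, n in zip(offsets, shape))
    return f[slices].copy()

def scatter(buf, f, offsets):
    # Inverse of gather: write the contiguous buffer back into the region of f
    slices = tuple(slice(o, o + n) for o, n in zip(offsets, buf.shape))
    f[slices] = buf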
Example #5
    def _make_poke(self, hs, key, msgs):
        lflag = Symbol(name='lflag')
        gflag = Symbol(name='gflag')

        # Init flags
        body = [Expression(DummyEq(lflag, 0)), Expression(DummyEq(gflag, 1))]

        # For each msg, build an Iteration calling MPI_Test on all peers
        for msg in msgs:
            dim = Dimension(name='i')
            msgi = IndexedPointer(msg, dim)

            rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
            testrecv = Call(
                'MPI_Test',
                [rrecv, Byref(lflag),
                 Macro('MPI_STATUS_IGNORE')])

            rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
            testsend = Call(
                'MPI_Test',
                [rsend, Byref(lflag),
                 Macro('MPI_STATUS_IGNORE')])

            update = AugmentedExpression(DummyEq(gflag, lflag), '&')

            body.append(
                Iteration([testsend, update, testrecv, update], dim,
                          msg.npeers - 1))

        body.append(Return(gflag))

        return make_efunc('pokempi%d' % key, List(body=body), retval='int')
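A plain-Python sketch of the polling logic generated above (illustrative only): `gflag` stays 1 only if MPI_Test reports completion for every outstanding send and receive.

def poke(requests_done):
    gflag = 1
    for done in requests_done:      # one entry per peer and direction
        lflag = 1 if done else 0    # what MPI_Test would write into lflag
        gflag &= lflag              # the AugmentedExpression(..., '&') above
    return gflag

assert poke([True, True, False]) == 0
assert poke([True, True, True]) == 1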
Example #6
    def _(iet):
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)

        objcomm = None
        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                objcomm = i
                break

        deviceid = DeviceID()
        device_nvidia = Macro('acc_device_nvidia')
        if objcomm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = DefFunction('acc_get_num_devices', device_nvidia)
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
            asdn_else = Call('acc_set_device_num',
                             [rank % ngpus, device_nvidia])

            body = [
                Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1), asdn_then,
                    List(body=[rank_decl, rank_init, ngpus_init, asdn_else]))
            ]
        else:
            body = [
                Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    Call('acc_set_device_num', [deviceid, device_nvidia]))
            ]

        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet, {'args': deviceid}
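A plain-Python sketch of the device-selection policy encoded above (illustrative only): an explicitly requested device id wins, otherwise ranks are bound to GPUs round-robin.

def select_device(deviceid, rank, ngpus):
    return deviceid if deviceid != -1 else rank % ngpus

assert select_device(2, 5, 4) == 2   # user-provided device id takes precedence
assert select_device(-1, 5, 4) == 1  # fall back to rank % ngpus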
Example #7
def exprs(a, b, c, d, a_dense, b_dense):
    return [Expression(DummyEq(a, a + b + 5.)),
            Expression(DummyEq(a, b*d - a*c)),
            Expression(DummyEq(b, a + b*b + 3)),
            Expression(DummyEq(a, a*b*d*c)),
            Expression(DummyEq(a, 4 * ((b + d) * (a + c)))),
            Expression(DummyEq(a, (6. / b) + (8. * a))),
            Expression(DummyEq(a_dense, a_dense + b_dense + 5.))]
Example #8
    def _make_copy(self, f, fixed, swap=False):
        """
        Construct a Callable performing a copy of:

            * an arbitrary convex region of ``f`` into a contiguous Array, OR
            * if ``swap=True``, a contiguous Array into an arbitrary convex
              region of ``f``.
        """
        buf_dims = []
        buf_indices = []
        for d in f.dimensions:
            if d not in fixed:
                buf_dims.append(Dimension(name='buf_%s' % d.root))
                buf_indices.append(d.root)
        buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

        f_offsets = []
        f_indices = []
        for d in f.dimensions:
            offset = Symbol(name='o%s' % d.root)
            f_offsets.append(offset)
            f_indices.append(offset + (d.root if d not in fixed else 0))

        if swap is False:
            eq = DummyEq(buf[buf_indices], f[f_indices])
            name = 'gather%dd' % f.ndim
        else:
            eq = DummyEq(f[f_indices], buf[buf_indices])
            name = 'scatter%dd' % f.ndim

        iet = Expression(eq)
        for i, d in reversed(list(zip(buf_indices, buf_dims))):
            # The -1 below is because an Iteration, by default, generates <=
            iet = Iteration(iet, i, d.symbolic_size - 1, properties=PARALLEL)
        iet = List(body=[ArrayCast(f), ArrayCast(buf), iet])

        # Optimize the memory copy with the DLE
        from devito.dle import transform
        state = transform(iet, 'simd', {'openmp': self._threaded})

        parameters = [buf] + list(buf.shape) + [f] + f_offsets + state.input
        return Callable(name, state.nodes, 'void', parameters,
                        ('static', )), state.input
Example #9
    def _make_withlock(self, iet, sync_ops, pieces, root):
        # Sorting for deterministic code gen
        locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

        # The `min` is used to pick the maximum possible degree of parallelism.
        # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
        # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
        # `u`, while `lock1` protects 2 entries of the Function `v`, then there
        # will never be more than 2 threads in flight concurrently
        npthreads = min(i.size for i in locks)

        preactions = []
        postactions = []
        for s in sync_ops:
            imask = [
                s.handle.indices[d]
                if d.root in s.lock.locked_dimensions else FULL
                for d in s.target.dimensions
            ]
            update = List(header=self.lang._map_update_wait_host(
                s.target, imask, SharedData._field_id))
            preactions.append(
                List(body=[BlankLine, update,
                           DummyExpr(s.handle, 1)]))
            postactions.append(DummyExpr(s.handle, 2))
        preactions.append(BlankLine)
        postactions.insert(0, BlankLine)

        # Turn `iet` into a ThreadFunction so that it can be executed
        # asynchronously by a pthread in the `npthreads` pool
        name = self.sregistry.make_name(prefix='copy_device_to_host')
        body = List(body=tuple(preactions) + iet.body + tuple(postactions))
        tctx = make_thread_ctx(name, body, root, npthreads, sync_ops,
                               self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # Schedule computation to the first available thread
        iet = tctx.activate

        # Initialize the locks
        for i in locks:
            values = np.full(i.shape, 2, dtype=np.int32).tolist()
            pieces.init.append(
                LocalExpression(DummyEq(i, ListInitializer(values))))

        # Fire up the threads
        pieces.init.append(tctx.init)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        # Keep track of created objects
        pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

        return iet
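A tiny sketch instantiating the reasoning in the comment at the top of `_make_withlock` (illustrative only): with one lock protecting 3 entries and another protecting 2, at most min(3, 2) = 2 asynchronous threads can ever be in flight.

lock_sizes = [3, 2]          # entries protected by lock0 and lock1, respectively
npthreads = min(lock_sizes)  # maximum degree of parallelism actually achievable
assert npthreads == 2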
Example #10
    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` for thread-level parallelism.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iterations go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([
                    i for i in FindSymbols('symbolics').visit(root)
                    if i.is_Array and i._mem_stack
                ])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private),
                               body=rebuilt)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region
        processed = Transformer(mapper).visit(iet)

        # Hack/workaround for the fact that the OpenMP pragmas are not true
        # IET nodes, so the `nthreads` variable won't be detected as a
        # Callable parameter unless inserted in a mock Expression
        if mapper:
            nt = NThreads()
            eq = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32),
                                         nt))
            return List(body=[eq, processed]), {'input': [nt]}
        else:
            return List(body=processed), {}
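A plain-Python sketch of the grouping arithmetic in `make_parallel` (illustrative only): when the current iteration tree is tagged and the previous one was too, `is_tagged & was_tagged` evaluates to 1, so the key collapses onto the previous group instead of opening a new one.

groups = {}
was_tagged = False
for name, is_tagged in [('tree0', True), ('tree1', True), ('tree2', False)]:
    key = len(groups) - (is_tagged & was_tagged)
    groups.setdefault(key, []).append(name)
    was_tagged = is_tagged

assert groups == {0: ['tree0', 'tree1'], 1: ['tree2']}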
Example #11
def copy(f, fixed, swap=False):
    """
    Construct a :class:`Callable` capable of copying: ::

        * an arbitrary convex region of ``f`` into a contiguous :class:`Array`, OR
        * if ``swap=True``, a contiguous :class:`Array` into an arbitrary convex
          region of ``f``.
    """
    buf_dims = []
    buf_indices = []
    for d in f.dimensions:
        if d not in fixed:
            buf_dims.append(Dimension(name='buf_%s' % d.root))
            buf_indices.append(d.root)
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    dat_dims = []
    dat_offsets = []
    dat_indices = []
    for d in f.dimensions:
        dat_dims.append(Dimension(name='dat_%s' % d.root))
        offset = Symbol(name='o%s' % d.root)
        dat_offsets.append(offset)
        dat_indices.append(offset + (d.root if d not in fixed else 0))
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype)

    if swap is False:
        eq = DummyEq(buf[buf_indices], dat[dat_indices])
        name = 'gather_%s' % f.name
    else:
        eq = DummyEq(dat[dat_indices], buf[buf_indices])
        name = 'scatter_%s' % f.name

    iet = Expression(eq)
    for i, d in reversed(list(zip(buf_indices, buf_dims))):
        iet = Iteration(iet, i,
                        d.symbolic_size - 1)  # -1 as Iteration generates <=
    iet = List(body=[ArrayCast(dat), ArrayCast(buf), iet])
    parameters = [buf] + list(buf.shape) + [dat] + list(
        dat.shape) + dat_offsets
    return Callable(name, iet, 'void', parameters, ('static', ))
Example #12
    def _initialize(iet):
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)
        comm = None
        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                comm = i
                break

        device_nvidia = Macro('acc_device_nvidia')
        body = Call('acc_init', [device_nvidia])

        if comm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = DefFunction('acc_get_num_devices', device_nvidia)
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            devicenum = Symbol(name='devicenum')
            devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

            set_device_num = Call('acc_set_device_num',
                                  [devicenum, device_nvidia])

            body = [
                rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body
            ]

        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

        iet = iet._rebuild(body=(init, ) + iet.body)

        return iet
Example #13
    def _make_withlock(self, iet, sync_ops, pieces, root):
        locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

        threads = self.__make_threads(value=min(i.size for i in locks))

        preactions = []
        postactions = []
        for s in sync_ops:
            imask = [
                s.handle.indices[d]
                if d.root in s.lock.locked_dimensions else FULL
                for d in s.target.dimensions
            ]

            preactions.append(
                List(body=[
                    BlankLine,
                    List(header=self._P._map_update_wait_host(
                        s.target, imask, SharedData._field_id)),
                    DummyExpr(s.handle, 1)
                ]))
            postactions.append(DummyExpr(s.handle, 2))
        preactions.append(BlankLine)
        postactions.insert(0, BlankLine)

        # Turn `iet` into an ElementalFunction so that it can be
        # executed asynchronously by `threadhost`
        name = self.sregistry.make_name(prefix='copy_device_to_host')
        body = List(body=tuple(preactions) + iet.body + tuple(postactions))
        tfunc, sdata = self.__make_tfunc(name, body, root, threads)
        pieces.funcs.append(tfunc)

        # Schedule computation to the first available thread
        iet = self.__make_activate_thread(threads, sdata, sync_ops)

        # Initialize the locks
        for i in locks:
            values = np.full(i.shape, 2, dtype=np.int32).tolist()
            pieces.init.append(
                LocalExpression(DummyEq(i, ListInitializer(values))))

        # Fire up the threads
        pieces.init.append(
            self.__make_init_threads(threads, sdata, tfunc, pieces))
        pieces.threads.append(threads)

        # Final wait before jumping back to Python land
        pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

        return iet
Example #14
def test_loops_collapsed(fe, t0, t1, t2, t3, exprs, expected, iters):
    scope = [fe, t0, t1, t2, t3]
    node_exprs = [Expression(DummyEq(EVAL(i, *scope))) for i in exprs]
    ast = iters[6](iters[7](iters[8](node_exprs)))

    ast = iet_analyze(ast)

    nodes = transform(ast, mode='openmp').nodes
    iterations = FindNodes(Iteration).visit(nodes)
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for i, j in zip(iterations, expected):
        pragmas = i.pragmas
        if j is True:
            assert len(pragmas) == 1
            pragma = pragmas[0]
            assert 'omp for collapse' in pragma.value
        else:
            for k in pragmas:
                assert 'omp for collapse' not in k.value
Example #15
    def test_iterations_ompized(self, fa, fb, fc, fd, t0, t1, t2, t3, exprs,
                                expected, iters):
        scope = [fa, fb, fc, fd, t0, t1, t2, t3]
        node_exprs = [Expression(DummyEq(EVAL(i, *scope))) for i in exprs]
        ast = iters[6](iters[7](node_exprs))

        ast = iet_analyze(ast)

        iet, _ = transform(ast, mode='openmp')
        iterations = FindNodes(Iteration).visit(iet)
        assert len(iterations) == len(expected)

        # Check for presence of pragma omp
        for i, j in zip(iterations, expected):
            pragmas = i.pragmas
            if j is True:
                assert len(pragmas) == 1
                pragma = pragmas[0]
                assert 'omp for' in pragma.value
            else:
                for k in pragmas:
                    assert 'omp for' not in k.value
Example #16
    def _make_poke(self, hs, key, msgs):
        flag = Symbol(name='flag')
        initflag = LocalExpression(DummyEq(flag, 0))

        body = [initflag]
        for msg in msgs:
            dim = Dimension(name='i')
            msgi = IndexedPointer(msg, dim)

            rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
            rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
            testrecv = Call(
                'MPI_Test',
                [rrecv, Byref(flag),
                 Macro('MPI_STATUS_IGNORE')])
            testsend = Call(
                'MPI_Test',
                [rsend, Byref(flag),
                 Macro('MPI_STATUS_IGNORE')])

            body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))

        return make_efunc('pokempi%d' % key, body)
Example #17
    def test_nodes_conditional(self, fc):
        then_body = Expression(DummyEq(fc[x, y], fc[x, y] + 1))
        else_body = Expression(DummyEq(fc[x, y], fc[x, y] + 2))
        conditional = Conditional(x < 3, then_body, else_body)
        assert str(conditional) == """\
Example #18
def DummyExpr(*args, init=False):
    return Expression(DummyEq(*args), init=init)
Example #19
def DummyExpr(*args):
    return Expression(DummyEq(*args))
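A hypothetical usage sketch for the two `DummyExpr` helpers above (import path assumed; `flag` is a made-up symbol): both variants simply wrap their arguments into a `DummyEq` inside an `Expression` IET node.

from devito.types import Symbol  # assumed import path

flag = Symbol(name='flag')
stmt = DummyExpr(flag, 0)        # equivalent to Expression(DummyEq(flag, 0))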
Example #20
def test_make_cpp_parfor():
    """
    Test construction of a C++ parallel-for. This exercises the IET construction
    machinery in several ways, in particular by using Lambda nodes (to generate
    C++ lambda functions) and nested Calls.
    """
    class STDVectorThreads(LocalObject):
        dtype = type('std::vector<std::thread>', (c_void_p, ), {})

        def __init__(self):
            self.name = 'threads'

    class STDThread(LocalObject):
        dtype = type('std::thread&', (c_void_p, ), {})

        def __init__(self, name):
            self.name = name

    class FunctionType(LocalObject):
        dtype = type('FuncType&&', (c_void_p, ), {})

        def __init__(self, name):
            self.name = name

    # Basic symbols
    nthreads = Symbol(name='nthreads', is_const=True)
    threshold = Symbol(name='threshold', is_const=True)
    last = Symbol(name='last', is_const=True)
    first = Symbol(name='first', is_const=True)
    portion = Symbol(name='portion', is_const=True)

    # Composite symbols
    threads = STDVectorThreads()

    # Iteration helper symbols
    begin = Symbol(name='begin')
    l = Symbol(name='l')
    end = Symbol(name='end')

    # Functions
    stdmax = sympy.Function('std::max')

    # Construct the parallel-for body
    func = FunctionType('func')
    i = Dimension(name='i')
    threadobj = Call(
        'std::thread',
        Lambda(
            Iteration(Call(func.name, i), i, (begin, end - 1, 1)),
            ['=', Byref(func.name)],
        ))
    threadpush = Call(FieldFromComposite('push_back', threads), threadobj)
    it = Dimension(name='it')
    iteration = Iteration([
        LocalExpression(DummyEq(begin, it)),
        LocalExpression(DummyEq(l, it + portion)),
        LocalExpression(DummyEq(end, InlineIf(l > last, last, l))), threadpush
    ], it, (first, last, portion))
    thread = STDThread('x')
    waitcall = Call('std::for_each', [
        Call(FieldFromComposite('begin', threads)),
        Call(FieldFromComposite('end', threads)),
        Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread])
    ])
    body = [
        LocalExpression(DummyEq(threshold, 1)),
        LocalExpression(
            DummyEq(portion, stdmax(threshold, (last - first) / nthreads))),
        Call(FieldFromComposite('reserve', threads), nthreads), iteration,
        waitcall
    ]

    parfor = ElementalFunction('parallel_for', body, 'void',
                               [first, last, func, nthreads])

    assert str(parfor) == """\
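A rough plain-Python rendition of the chunking scheme the parallel-for above encodes (illustrative only; the inclusive-bound details of Devito Iterations are glossed over): the range is split into portion-sized chunks, with one std::thread launched per chunk.

def chunks(first, last, nthreads, threshold=1):
    portion = max(threshold, (last - first) // nthreads)
    begin = first
    while begin < last:
        end = min(begin + portion, last)
        yield begin, end
        begin = end

assert list(chunks(0, 10, 4)) == [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]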
Example #21
    def test_conditional(self, fc, grid):
        x, y, _ = grid.dimensions
        then_body = Expression(DummyEq(fc[x, y], fc[x, y] + 1))
        else_body = Expression(DummyEq(fc[x, y], fc[x, y] + 2))
        conditional = Conditional(x < 3, then_body, else_body)
        assert str(conditional) == """\