Example 1
    def __make_activate_thread(self, threads, sdata, sync_ops):
        if threads.size == 1:
            d = threads.index
        else:
            d = Symbol(name=self.sregistry.make_name(
                prefix=threads.index.name))

        sync_locks = [s for s in sync_ops if s.is_SyncLock]
        condition = Or(
            *([CondNe(s.handle, 2) for s in sync_locks] +
              [CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1)]))

        if threads.size == 1:
            activation = [BusyWait(condition)]
        else:
            activation = [
                DummyExpr(d, 0),
                BusyWait(condition, DummyExpr(d, (d + 1) % threads.size))
            ]

        activation.extend([
            DummyExpr(FieldFromComposite(i.name, sdata[d]), i)
            for i in sdata.fields
        ])
        activation.extend([DummyExpr(s.handle, 0) for s in sync_locks])
        activation.append(
            DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 2))
        activation = List(
            header=[c.Line(),
                    c.Comment("Activate `%s`" % threads.name)],
            body=activation,
            footer=c.Line())

        return activation
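
A plain-Python sketch of the activation handshake generated above may help read it. The `flags` and `locks` lists below are hypothetical stand-ins for the `sdata[d]` flag fields and the lock handles (this is not Devito IET): scan the pool round-robin until every lock is released and a thread is idle, then take the locks back and raise that thread's flag. The copy of the dynamic `sdata.fields` arguments is omitted for brevity.

def activate(flags, locks, npthreads):
    d = 0
    # BusyWait analogue: spin, rotating `d`, while any lock is still held (!= 2)
    # or thread `d` is not idle (flag != 1)
    while any(lock != 2 for lock in locks) or flags[d] != 1:
        d = (d + 1) % npthreads
    locks[:] = [0] * len(locks)  # DummyExpr(s.handle, 0): take the locks for the new task
    flags[d] = 2                 # DummyExpr(flag, 2): tell thread `d` there is work to do
    return d
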
Example 2
    def _make_withlock(self, iet, sync_ops, pieces, root):
        # Sorting for deterministic code gen
        locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

        # The `min` is used to pick the maximum possible degree of parallelism.
        # For example, assume there are two locks in the given `sync_ops`, `lock0(i)`
        # and `lock1(j)`. If, say, `lock0` protects 3 entries of a certain Function
        # `u`, while `lock1` protects 2 entries of the Function `v`, then there
        # will never be more than 2 threads in flight concurrently (a worked
        # check of this arithmetic follows this example).
        npthreads = min(i.size for i in locks)

        preactions = []
        postactions = []
        for s in sync_ops:
            imask = [
                s.handle.indices[d]
                if d.root in s.lock.locked_dimensions else FULL
                for d in s.target.dimensions
            ]
            update = List(header=self.lang._map_update_wait_host(
                s.target, imask, SharedData._field_id))
            preactions.append(
                List(body=[BlankLine, update,
                           DummyExpr(s.handle, 1)]))
            postactions.append(DummyExpr(s.handle, 2))
        preactions.append(BlankLine)
        postactions.insert(0, BlankLine)

        # Turn `iet` into a ThreadFunction so that it can be executed
        # asynchronously by a pthread in the `npthreads` pool
        name = self.sregistry.make_name(prefix='copy_device_to_host')
        body = List(body=tuple(preactions) + iet.body + tuple(postactions))
        tctx = make_thread_ctx(name, body, root, npthreads, sync_ops,
                               self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # Schedule computation to the first available thread
        iet = tctx.activate

        # Initialize the locks
        for i in locks:
            values = np.full(i.shape, 2, dtype=np.int32).tolist()
            pieces.init.append(
                LocalExpression(DummyEq(i, ListInitializer(values))))

        # Fire up the threads
        pieces.init.append(tctx.init)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        # Keep track of created objects
        pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

        return iet
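
The `min` comment above can be checked with a one-liner; the lock sizes below are the hypothetical ones from that comment, not taken from any real Function.

lock_sizes = [3, 2]          # lock0 protects 3 entries of `u`, lock1 protects 2 of `v`
npthreads = min(lock_sizes)  # at most 2 copies can ever be in flight concurrently
assert npthreads == 2
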
Example 3
    def _make_withlock(self, iet, sync_ops, pieces, root):
        locks = sorted({s.lock for s in sync_ops}, key=lambda i: i.name)

        threads = self.__make_threads(value=min(i.size for i in locks))

        preactions = []
        postactions = []
        for s in sync_ops:
            imask = [
                s.handle.indices[d]
                if d.root in s.lock.locked_dimensions else FULL
                for d in s.target.dimensions
            ]

            preactions.append(
                List(body=[
                    BlankLine,
                    List(header=self._P._map_update_wait_host(
                        s.target, imask, SharedData._field_id)),
                    DummyExpr(s.handle, 1)
                ]))
            postactions.append(DummyExpr(s.handle, 2))
        preactions.append(BlankLine)
        postactions.insert(0, BlankLine)

        # Turn `iet` into an ElementalFunction so that it can be
        # executed asynchronously by one of the pthreads in `threads`
        name = self.sregistry.make_name(prefix='copy_device_to_host')
        body = List(body=tuple(preactions) + iet.body + tuple(postactions))
        tfunc, sdata = self.__make_tfunc(name, body, root, threads)
        pieces.funcs.append(tfunc)

        # Schedule computation to the first available thread
        iet = self.__make_activate_thread(threads, sdata, sync_ops)

        # Initialize the locks
        for i in locks:
            values = np.full(i.shape, 2, dtype=np.int32).tolist()
            pieces.init.append(
                LocalExpression(DummyEq(i, ListInitializer(values))))

        # Fire up the threads
        pieces.init.append(
            self.__make_init_threads(threads, sdata, tfunc, pieces))
        pieces.threads.append(threads)

        # Final wait before jumping back to Python land
        pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

        return iet
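
Note that each lock is initialized to all 2s, i.e. "released", so the very first activation is not blocked by the BusyWait condition. A minimal standalone check of the initializer values produced by `np.full(...).tolist()`, using a hypothetical shape:

import numpy as np

values = np.full((3,), 2, dtype=np.int32).tolist()  # one entry per protected slot
assert values == [2, 2, 2]                          # every slot starts out released
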
Example 4
    def __make_tfunc(self, name, iet, root, threads):
        # Create the SharedData
        required = derive_parameters(iet)
        known = (root.parameters +
                 tuple(i for i in required if i.is_Array and i._mem_shared))
        parameters, dynamic_parameters = split(required, lambda i: i in known)

        sdata = SharedData(name=self.sregistry.make_name(prefix='sdata'),
                           nthreads_std=threads.size,
                           fields=dynamic_parameters)
        parameters.append(sdata)

        # Prepend the unwound SharedData fields, available upon thread activation
        preactions = [
            DummyExpr(i, FieldFromPointer(i.name, sdata.symbolic_base))
            for i in dynamic_parameters
        ]
        preactions.append(
            DummyExpr(sdata.symbolic_id,
                      FieldFromPointer(sdata._field_id, sdata.symbolic_base)))

        # Append the flag reset
        postactions = [
            List(body=[
                BlankLine,
                DummyExpr(
                    FieldFromPointer(sdata._field_flag, sdata.symbolic_base),
                    1)
            ])
        ]

        iet = List(body=preactions + [iet] + postactions)

        # The thread has work to do when it receives the signal that all locks have
        # been set to 0 by the main thread
        iet = Conditional(
            CondEq(FieldFromPointer(sdata._field_flag, sdata.symbolic_base),
                   2), iet)

        # The thread keeps spinning until the alive flag is set to 0 by the main thread
        iet = While(
            CondNe(FieldFromPointer(sdata._field_flag, sdata.symbolic_base),
                   0), iet)

        return Callable(name, iet, 'void', parameters, 'static'), sdata
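
The Callable returned above encodes a spin-then-work loop for each pthread. A plain-Python analogue of that control flow, with a hypothetical `sdata` dict and `work` callable (the real code is C generated from the IET):

def thread_body(sdata, work):
    while sdata['flag'] != 0:   # While(CondNe(flag, 0)): spin until asked to shut down
        if sdata['flag'] == 2:  # Conditional(CondEq(flag, 2)): the host posted work
            work(sdata)         # the wrapped `iet`, fed the unwound SharedData fields
            sdata['flag'] = 1   # postaction: mark the thread as idle again
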
Example 5
def test_null_init():
    grid = Grid(shape=(10, 10))

    u = Function(name='u', grid=grid)

    expr = DummyExpr(u.indexed, Macro('NULL'), init=True)

    assert str(expr) == "float * u = NULL;"
    assert expr.defines == (u.indexed, )
Example 6
def test_call_indexed():
    grid = Grid(shape=(10, 10))

    u = Function(name='u', grid=grid)

    foo = Callable('foo', DummyExpr(u, 1), 'void', parameters=[u, u.indexed])
    call = Call(foo.name, [u, u.indexed])

    assert str(call) == "foo(u_vec,u);"
    assert str(foo) == """\
Example 7
    def __make_finalize_threads(self, threads, sdata):
        d = threads.index
        if threads.size == 1:
            callback = lambda body: body
        else:
            callback = lambda body: Iteration(body, d, threads.size - 1)

        threadswait = List(
            header=c.Comment("Wait for completion of `%s`" % threads.name),
            body=callback([
                While(
                    CondEq(FieldFromComposite(sdata._field_flag, sdata[d]),
                           2)),
                DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 0),
                Call(FieldFromComposite('join', threads[d]))
            ]))

        return threadswait
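
The generated wait-and-join sequence maps onto the following plain-Python analogue, with hypothetical `flags` and `threads` lists standing in for the `sdata[d]` flags and the joinable thread handles:

def finalize(flags, threads):
    for d in range(len(threads)):
        while flags[d] == 2:  # work still in flight: wait for thread `d` to finish it
            pass
        flags[d] = 0          # make the thread body's spin loop exit
        threads[d].join()     # Call(FieldFromComposite('join', threads[d]))
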
Example 8
    def __make_init_threads(self, threads, sdata, tfunc, pieces):
        d = threads.index
        if threads.size == 1:
            callback = lambda body: body
        else:
            callback = lambda body: Iteration(body, d, threads.size - 1)

        idinit = DummyExpr(FieldFromComposite(sdata._field_id, sdata[d]),
                           1 + sum(i.size for i in pieces.threads) + d)
        arguments = list(tfunc.parameters)
        arguments[-1] = sdata.symbolic_base + d
        call = Call('std::thread',
                    Call(tfunc.name, arguments, is_indirect=True),
                    retobj=threads[d])
        threadsinit = List(header=c.Comment("Fire up and initialize `%s`" %
                                            threads.name),
                           body=callback([idinit, call]))

        return threadsinit
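
The expression `1 + sum(i.size for i in pieces.threads) + d` assigns each pthread a unique id across all pools created so far, starting from 1. A worked check with hypothetical pool sizes:

previous_pool_sizes = [4, 2]  # pools already recorded in `pieces.threads`
d = 0                         # index of the first thread in the new pool
assert 1 + sum(previous_pool_sizes) + d == 7
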
Example 9
def test_conditional(fc, grid):
    x, y, _ = grid.dimensions
    then_body = DummyExpr(fc[x, y], fc[x, y] + 1)
    else_body = DummyExpr(fc[x, y], fc[x, y] + 2)
    conditional = Conditional(x < 3, then_body, else_body)
    assert str(conditional) == """\
Example 10
def test_make_cpp_parfor():
    """
    Test construction of a CPP parallel for. This excites the IET construction
    machinery in several ways, in particular by using Lambda nodes (to generate
    C++ lambda functions) and nested Calls.
    """
    class STDVectorThreads(LocalObject):
        dtype = type('std::vector<std::thread>', (c_void_p, ), {})

        def __init__(self):
            self.name = 'threads'

    class STDThread(LocalObject):
        dtype = type('std::thread&', (c_void_p, ), {})

        def __init__(self, name):
            self.name = name

    class FunctionType(LocalObject):
        dtype = type('FuncType&&', (c_void_p, ), {})

        def __init__(self, name):
            self.name = name

    # Basic symbols
    nthreads = Symbol(name='nthreads', is_const=True)
    threshold = Symbol(name='threshold', is_const=True)
    last = Symbol(name='last', is_const=True)
    first = Symbol(name='first', is_const=True)
    portion = Symbol(name='portion', is_const=True)

    # Composite symbols
    threads = STDVectorThreads()

    # Iteration helper symbols
    begin = Symbol(name='begin')
    l = Symbol(name='l')
    end = Symbol(name='end')

    # Functions
    stdmax = sympy.Function('std::max')

    # Construct the parallel-for body
    func = FunctionType('func')
    i = Dimension(name='i')
    threadobj = Call(
        'std::thread',
        Lambda(
            Iteration(Call(func.name, i), i, (begin, end - 1, 1)),
            ['=', Byref(func.name)],
        ))
    threadpush = Call(FieldFromComposite('push_back', threads), threadobj)
    it = Dimension(name='it')
    iteration = Iteration([
        DummyExpr(begin, it, init=True),
        DummyExpr(l, it + portion, init=True),
        DummyExpr(end, InlineIf(l > last, last, l), init=True), threadpush
    ], it, (first, last, portion))
    thread = STDThread('x')
    waitcall = Call('std::for_each', [
        Call(FieldFromComposite('begin', threads)),
        Call(FieldFromComposite('end', threads)),
        Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread])
    ])
    body = [
        DummyExpr(threshold, 1, init=True),
        DummyExpr(portion,
                  stdmax(threshold, (last - first) / nthreads),
                  init=True),
        Call(FieldFromComposite('reserve', threads), nthreads), iteration,
        waitcall
    ]

    parfor = ElementalFunction('parallel_for', body, 'void',
                               [first, last, func, nthreads])

    assert str(parfor) == """\