def __make_activate_thread(self, threads, sdata, sync_ops):
    """
    Build the IET that hands a unit of work over to a thread in `threads`.

    The generated code busy-waits on a condition that is the disjunction of
    "a SyncLock handle differs from 2" (one term per SyncLock in `sync_ops`)
    and "the flag field of `sdata[d]` differs from 1". After the wait, the
    `sdata` fields are populated, the SyncLock handles are set to 0, and the
    flag field is set to 2.

    Returns a List wrapping these statements, with an "Activate" comment
    in the header.
    """
    single = threads.size == 1

    # With one thread the pool index is used as is; otherwise a fresh symbol
    # is created so that the busy-wait can step through the pool
    if single:
        d = threads.index
    else:
        d = Symbol(name=self.sregistry.make_name(prefix=threads.index.name))

    sync_locks = [op for op in sync_ops if op.is_SyncLock]

    # One term per SyncLock handle, plus one for the thread flag
    terms = [CondNe(lock.handle, 2) for lock in sync_locks]
    terms.append(CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1))
    condition = Or(*terms)

    if single:
        steps = [BusyWait(condition)]
    else:
        # Start from thread 0 and advance `d` modulo the pool size at each
        # spin of the busy-wait
        steps = [
            DummyExpr(d, 0),
            BusyWait(condition, DummyExpr(d, (d + 1) % threads.size)),
        ]

    # Fill in the shared data of the selected thread
    steps += [DummyExpr(FieldFromComposite(f.name, sdata[d]), f)
              for f in sdata.fields]

    # Set the lock handles to 0, then the thread flag to 2
    steps += [DummyExpr(lock.handle, 0) for lock in sync_locks]
    steps.append(
        DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 2))

    return List(
        header=[c.Line(), c.Comment("Activate `%s`" % threads.name)],
        body=steps,
        footer=c.Line())
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Offload `iet`, guarded by the locks in `sync_ops`, to a pool of threads.

    The body of `iet` is wrapped between per-sync-op host updates (each
    followed by setting the handle to 1) and handle resets to 2, then turned
    into a thread context via `make_thread_ctx`. The functions, lock
    initializations, thread start-up, finalization and created objects are
    recorded into `pieces`.

    Returns the activation IET of the thread context, which replaces `iet`.
    """
    # Sort by name so that code generation is deterministic
    unique_locks = {op.lock for op in sync_ops}
    locks = sorted(unique_locks, key=lambda lock: lock.name)

    # Taking the `min` selects the maximum possible degree of parallelism.
    # For example, with two locks `lock0(i)` and `lock1(j)`, if `lock0`
    # protects 3 entries of a Function `u` while `lock1` protects 2 entries
    # of a Function `v`, there can never be more than 2 threads in flight
    # concurrently
    npthreads = min(lock.size for lock in locks)

    before = []
    after = []
    for op in sync_ops:
        # Locked dimensions use the handle's index; all others are FULL
        imask = []
        for dim in op.target.dimensions:
            if dim.root in op.lock.locked_dimensions:
                imask.append(op.handle.indices[dim])
            else:
                imask.append(FULL)

        update = List(header=self.lang._map_update_wait_host(
            op.target, imask, SharedData._field_id))
        before.append(List(body=[BlankLine, update, DummyExpr(op.handle, 1)]))
        after.append(DummyExpr(op.handle, 2))

    # Turn `iet` into a ThreadFunction so that it can be executed
    # asynchronously by a pthread in the `npthreads` pool
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    statements = (*before, BlankLine) + iet.body + (BlankLine, *after)
    body = List(body=statements)
    tctx = make_thread_ctx(name, body, root, npthreads, sync_ops,
                           self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Every lock entry starts off with the value 2
    for lock in locks:
        initial = np.full(lock.shape, 2, dtype=np.int32).tolist()
        pieces.init.append(
            LocalExpression(DummyEq(lock, ListInitializer(initial))))

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, tctx.sdata, tctx.threads)

    # The computation gets scheduled to the first available thread
    return tctx.activate
def _make_withlock(self, iet, sync_ops, pieces, root):
    """
    Offload `iet`, guarded by the locks in `sync_ops`, to a pool of threads.

    The body of `iet` is wrapped between per-sync-op host updates (each
    followed by setting the handle to 1) and handle resets to 2, then turned
    into a thread function. The thread function, lock initializations,
    thread start-up and finalization are recorded into `pieces`.

    Returns the IET activating a thread, which replaces `iet`.
    """
    unique_locks = {op.lock for op in sync_ops}
    locks = sorted(unique_locks, key=lambda lock: lock.name)

    # The pool is as large as the smallest lock
    threads = self.__make_threads(value=min(lock.size for lock in locks))

    before = []
    after = []
    for op in sync_ops:
        # Locked dimensions use the handle's index; all others are FULL
        imask = []
        for dim in op.target.dimensions:
            if dim.root in op.lock.locked_dimensions:
                imask.append(op.handle.indices[dim])
            else:
                imask.append(FULL)

        update = List(header=self._P._map_update_wait_host(
            op.target, imask, SharedData._field_id))
        before.append(List(body=[BlankLine, update, DummyExpr(op.handle, 1)]))
        after.append(DummyExpr(op.handle, 2))

    # Turn `iet` into an ElementalFunction so that it can be
    # executed asynchronously by `threadhost`
    name = self.sregistry.make_name(prefix='copy_device_to_host')
    statements = (*before, BlankLine) + iet.body + (BlankLine, *after)
    body = List(body=statements)
    tfunc, sdata = self.__make_tfunc(name, body, root, threads)
    pieces.funcs.append(tfunc)

    # Schedule computation to the first available thread
    activation = self.__make_activate_thread(threads, sdata, sync_ops)

    # Every lock entry starts off with the value 2
    for lock in locks:
        initial = np.full(lock.shape, 2, dtype=np.int32).tolist()
        pieces.init.append(
            LocalExpression(DummyEq(lock, ListInitializer(initial))))

    # Fire up the threads; `threads` is registered only afterwards, so it
    # does not contribute to the `pieces.threads` seen by the initializer
    startup = self.__make_init_threads(threads, sdata, tfunc, pieces)
    pieces.init.append(startup)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return activation
def __make_tfunc(self, name, iet, root, threads):
    """
    Wrap `iet` into a static Callable named `name`, to be run by a thread.

    The parameters required by `iet` are split into those that are known
    (the parameters of `root` plus any shared-memory Array) and the dynamic
    ones, which become the fields of a new SharedData object. The SharedData
    is appended to the Callable parameters.

    Returns the 2-tuple `(Callable, SharedData)`.
    """

    def flag():
        # Accessor for the flag field, through the SharedData pointer
        return FieldFromPointer(sdata._field_flag, sdata.symbolic_base)

    # Create the SharedData
    required = derive_parameters(iet)
    shared_arrays = tuple(p for p in required
                          if p.is_Array and p._mem_shared)
    known = root.parameters + shared_arrays
    parameters, dynamic_parameters = split(required, lambda p: p in known)

    sdata_name = self.sregistry.make_name(prefix='sdata')
    sdata = SharedData(name=sdata_name, nthreads_std=threads.size,
                       fields=dynamic_parameters)
    parameters.append(sdata)

    # Prepend the unwinded SharedData fields, available upon thread
    # activation, followed by the thread id
    unpack = [DummyExpr(p, FieldFromPointer(p.name, sdata.symbolic_base))
              for p in dynamic_parameters]
    unpack_id = DummyExpr(
        sdata.symbolic_id,
        FieldFromPointer(sdata._field_id, sdata.symbolic_base))

    # Append the flag reset
    reset = List(body=[BlankLine, DummyExpr(flag(), 1)])

    work = List(body=unpack + [unpack_id] + [iet] + [reset])

    # The thread has work to do when it receives the signal that all locks
    # have been set to 0 by the main thread
    guarded = Conditional(CondEq(flag(), 2), work)

    # The thread keeps spinning until the alive flag is set to 0 by the
    # main thread
    spin = While(CondNe(flag(), 0), guarded)

    return Callable(name, spin, 'void', parameters, 'static'), sdata
def test_null_init():
    """
    Check that a DummyExpr initializing a Function pointer with the `NULL`
    macro renders as a C declaration and defines the indexed object.
    """
    func = Function(name='u', grid=Grid(shape=(10, 10)))
    pointer = func.indexed

    null_init = DummyExpr(pointer, Macro('NULL'), init=True)

    rendered = str(null_init)
    assert rendered == "float * u = NULL;"

    expected_defines = (pointer, )
    assert null_init.defines == expected_defines
def test_call_indexed(): grid = Grid(shape=(10, 10)) u = Function(name='u', grid=grid) foo = Callable('foo', DummyExpr(u, 1), 'void', parameters=[u, u.indexed]) call = Call(foo.name, [u, u.indexed]) assert str(call) == "foo(u_vec,u);" assert str(foo) == """\
def __make_finalize_threads(self, threads, sdata):
    """
    Build the IET that waits for, stops and joins the threads in `threads`.

    For each thread, the generated code spins on an empty While for as long
    as the flag field of `sdata[d]` equals 2, then sets the flag to 0 and
    calls `join` on the thread. With more than one thread, these statements
    are wrapped in an Iteration over the thread index.
    """
    d = threads.index

    teardown = [
        # Spin while the flag still reads 2
        While(CondEq(FieldFromComposite(sdata._field_flag, sdata[d]), 2)),
        # Setting the flag to 0 makes the thread leave its spinning loop
        # NOTE(review): relies on the thread function looping on flag != 0
        DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 0),
        Call(FieldFromComposite('join', threads[d])),
    ]

    # A single thread needs no loop over the pool
    if threads.size != 1:
        teardown = Iteration(teardown, d, threads.size - 1)

    return List(
        header=c.Comment("Wait for completion of `%s`" % threads.name),
        body=teardown)
def __make_init_threads(self, threads, sdata, tfunc, pieces):
    """
    Build the IET that assigns an id to, and launches, the threads in
    `threads`.

    The id written into `sdata[d]` is `1 + N + d`, where `N` is the total
    size of the thread pools already recorded in `pieces.threads`. Each
    thread is created via `std::thread`, running `tfunc` with its last
    argument replaced by a pointer to the d-th SharedData entry. With more
    than one thread, the statements are wrapped in an Iteration over the
    thread index.
    """
    d = threads.index

    # Ids start after those of all previously created pools
    offset = 1 + sum(pool.size for pool in pieces.threads)
    idinit = DummyExpr(FieldFromComposite(sdata._field_id, sdata[d]),
                       offset + d)

    # The last parameter of `tfunc` is replaced by the address of the
    # d-th SharedData entry
    arguments = list(tfunc.parameters)
    arguments[-1] = sdata.symbolic_base + d

    entry_point = Call(tfunc.name, arguments, is_indirect=True)
    launch = Call('std::thread', entry_point, retobj=threads[d])

    startup = [idinit, launch]

    # A single thread needs no loop over the pool
    if threads.size != 1:
        startup = Iteration(startup, d, threads.size - 1)

    return List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=startup)
def test_conditional(fc, grid): x, y, _ = grid.dimensions then_body = DummyExpr(fc[x, y], fc[x, y] + 1) else_body = DummyExpr(fc[x, y], fc[x, y] + 2) conditional = Conditional(x < 3, then_body, else_body) assert str(conditional) == """\
def test_make_cpp_parfor(): """ Test construction of a CPP parallel for. This excites the IET construction machinery in several ways, in particular by using Lambda nodes (to generate C++ lambda functions) and nested Calls. """ class STDVectorThreads(LocalObject): dtype = type('std::vector<std::thread>', (c_void_p, ), {}) def __init__(self): self.name = 'threads' class STDThread(LocalObject): dtype = type('std::thread&', (c_void_p, ), {}) def __init__(self, name): self.name = name class FunctionType(LocalObject): dtype = type('FuncType&&', (c_void_p, ), {}) def __init__(self, name): self.name = name # Basic symbols nthreads = Symbol(name='nthreads', is_const=True) threshold = Symbol(name='threshold', is_const=True) last = Symbol(name='last', is_const=True) first = Symbol(name='first', is_const=True) portion = Symbol(name='portion', is_const=True) # Composite symbols threads = STDVectorThreads() # Iteration helper symbols begin = Symbol(name='begin') l = Symbol(name='l') end = Symbol(name='end') # Functions stdmax = sympy.Function('std::max') # Construct the parallel-for body func = FunctionType('func') i = Dimension(name='i') threadobj = Call( 'std::thread', Lambda( Iteration(Call(func.name, i), i, (begin, end - 1, 1)), ['=', Byref(func.name)], )) threadpush = Call(FieldFromComposite('push_back', threads), threadobj) it = Dimension(name='it') iteration = Iteration([ DummyExpr(begin, it, init=True), DummyExpr(l, it + portion, init=True), DummyExpr(end, InlineIf(l > last, last, l), init=True), threadpush ], it, (first, last, portion)) thread = STDThread('x') waitcall = Call('std::for_each', [ Call(FieldFromComposite('begin', threads)), Call(FieldFromComposite('end', threads)), Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread]) ]) body = [ DummyExpr(threshold, 1, init=True), DummyExpr(portion, stdmax(threshold, (last - first) / nthreads), init=True), Call(FieldFromComposite('reserve', threads), nthreads), iteration, waitcall ] parfor = 
ElementalFunction('parallel_for', body, 'void', [first, last, func, nthreads]) assert str(parfor) == """\