def _make_thread_activate(threads, sdata, sync_ops, sregistry):
    """
    Build the IET snippet that activates one thread of `threads`.

    The emitted code is, in order:

    * a `While` that keeps spinning as long as any SyncLock handle differs
      from 2 or the flag field of the selected `sdata` entry differs from 1;
      with more than one thread, the loop body advances the thread index
      modulo `threads.size`;
    * one assignment per entry of `sdata.dynamic_fields`, copying the value
      into the selected `sdata` entry;
    * the reset of every SyncLock handle to 0;
    * the assignment of 2 to the flag field of the selected `sdata` entry.

    Parameters
    ----------
    threads
        Thread pool object; `size`, `index` and `name` are read.
    sdata
        Shared-data object, indexable by the thread index.
    sync_ops
        Iterable of sync operations; only those with `is_SyncLock` are used.
    sregistry
        Registry used to create a unique name for the scanning index.

    Returns
    -------
    List
        The activation code, wrapped between blank lines and a comment.
    """
    single = threads.size == 1

    # A lone thread is addressed through the pool's own index; otherwise a
    # fresh symbol is needed to scan the pool looking for an available thread
    if single:
        tid = threads.index
    else:
        tid = Symbol(name=sregistry.make_name(prefix=threads.index.name))

    locks = [op for op in sync_ops if op.is_SyncLock]
    flag = FieldFromComposite(sdata._field_flag, sdata[tid])

    # Keep waiting while any lock handle is != 2 or the thread flag is != 1
    busy = [CondNe(lock.handle, 2) for lock in locks]
    busy.append(CondNe(flag, 1))
    condition = Or(*busy)

    if single:
        steps = [While(condition)]
    else:
        advance = DummyExpr(tid, (tid + 1) % threads.size)
        steps = [DummyExpr(tid, 0), While(condition, advance)]

    # Hand the dynamic fields over to the selected thread
    steps += [DummyExpr(FieldFromComposite(field.name, sdata[tid]), field)
              for field in sdata.dynamic_fields]
    # Reset the locks, then flip the thread flag to 2
    steps += [DummyExpr(lock.handle, 0) for lock in locks]
    steps.append(DummyExpr(flag, 2))

    return List(
        header=[c.Line(), c.Comment("Activate `%s`" % threads.name)],
        body=steps,
        footer=c.Line()
    )
def _make_poke(self, hs, key, msgs):
    """
    Build the `pokempi<key>` elemental function.

    The generated function initialises a local flag (`lflag`) to 0 and a
    global flag (`gflag`) to 1. Then, for each message in `msgs`, it loops
    over the message entries (`0 .. npeers - 1`) calling `MPI_Test` on the
    send and on the receive request, and-ing `lflag` into `gflag` after each
    test. Finally `gflag` is returned.

    Parameters
    ----------
    hs
        Not used by this implementation.
    key : int
        Suffix making the function name unique.
    msgs
        Iterable of message objects exposing `_C_field_rrecv`,
        `_C_field_rsend` and `npeers`.

    Returns
    -------
    The object produced by `make_efunc`, with return value type `int`.
    """
    lflag = Symbol(name='lflag')
    gflag = Symbol(name='gflag')

    def mpi_test(request_field, msgi):
        # MPI_Test(&msg[i].<request_field>, &lflag, MPI_STATUS_IGNORE)
        request = Byref(FieldFromComposite(request_field, msgi))
        return Call('MPI_Test',
                    [request, Byref(lflag), Macro('MPI_STATUS_IGNORE')])

    # Init flags
    body = [Expression(DummyEq(lflag, 0)),
            Expression(DummyEq(gflag, 1))]

    # For each msg, build an Iteration calling MPI_Test on all peers
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        testrecv = mpi_test(msg._C_field_rrecv, msgi)
        testsend = mpi_test(msg._C_field_rsend, msgi)

        # gflag &= lflag, emitted after each of the two tests
        update = AugmentedExpression(DummyEq(gflag, lflag), '&')

        # The -1 is because the Iteration upper bound is inclusive
        loop = Iteration([testsend, update, testrecv, update],
                         dim, msg.npeers - 1)
        body.append(loop)

    body.append(Return(gflag))

    return make_efunc('pokempi%d' % key, List(body=body), retval='int')
def _make_thread_finalize(threads, sdata):
    """
    Build the IET snippet that waits for, and joins, the threads in `threads`.

    For each thread the emitted code spins while the thread's flag field
    equals 2, then sets the flag to 0 and calls `pthread_join`. With more
    than one thread, these steps are wrapped in an Iteration over the thread
    index.

    Parameters
    ----------
    threads
        Thread pool object; `size`, `index` and `name` are read, and it is
        indexed by the thread index.
    sdata
        Shared-data object, indexable by the thread index.

    Returns
    -------
    List
        The finalization code, preceded by a comment.
    """
    tid = threads.index
    flag = FieldFromComposite(sdata._field_flag, sdata[tid])

    steps = [
        While(CondEq(flag, 2)),
        DummyExpr(flag, 0),
        Call('pthread_join', (threads[tid], Macro('NULL')))
    ]

    if threads.size == 1:
        body = steps
    else:
        # The -1 is because the Iteration upper bound is inclusive
        body = Iteration(steps, tid, threads.size - 1)

    return List(
        header=c.Comment("Wait for completion of `%s`" % threads.name),
        body=body
    )
def _make_remainder(self, hs, key, callcompute, region):
    """
    Build the `remainder<key>` elemental function.

    `callcompute` is rebuilt so that each argument listed in `hs.arguments`
    is read from the `i`-th entry of `region`: Dimensions map to the pair of
    fields named after their `min_name`/`max_name`, any other argument maps
    to the single field carrying its `name`. The rebuilt call is then wrapped
    in an Iteration over all regions.

    Parameters
    ----------
    hs
        Object exposing `arguments`, the arguments to be made dynamic.
    key : int
        Suffix making the function name unique.
    callcompute
        The Call to be rebuilt; must satisfy `is_Call`.
    region
        Object exposing `nregions` and usable as the base of an
        IndexedPointer.

    Returns
    -------
    The object produced by `make_efunc`.
    """
    assert callcompute.is_Call

    dim = Dimension(name='i')
    region_i = IndexedPointer(region, dim)

    def region_fields(arg):
        # Dimensions need both extremes, anything else a single value
        if arg.is_Dimension:
            names = (arg.min_name, arg.max_name)
        else:
            names = (arg.name,)
        return tuple(FieldFromComposite(name, region_i) for name in names)

    dynamic_args_mapper = {arg: region_fields(arg) for arg in hs.arguments}
    call = callcompute._rebuild(dynamic_args_mapper=dynamic_args_mapper)

    # The -1 below is because an Iteration, by default, generates <=
    loop = Iteration(call, dim, region.nregions - 1)

    return make_efunc('remainder%d' % key, loop)
def _make_haloupdate(self, f, hse, key, msg=None):
    """
    Build the `haloupdate<key>` Callable.

    The generated function iterates over `ncomms` message entries; for each
    entry it posts an `MPI_Irecv`, gathers the data to be sent (skipped when
    the destination rank is `MPI_PROC_NULL`), and posts an `MPI_Isend`.

    Parameters
    ----------
    f
        The function whose halo is exchanged; `grid`, `dtype`, `dimensions`
        and `_dist_dimensions` are read.
    hse
        Object exposing `loc_indices`; each of them becomes an `o<root>`
        Symbol, used as offset and appended to the Callable parameters.
    key
        Suffix for the generated function name and for the gather callee.
    msg
        Message object providing the `_C_field_*` names. Although it
        defaults to None, it is dereferenced unconditionally.

    Returns
    -------
    Callable
        A `static void` function with parameters
        `[f, comm, msg, ncomms] + <fixed offsets>`.
    """
    comm = f.grid.distributor._obj_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)

    def field(name):
        # Access `msg[i].<name>`
        return FieldFromComposite(name, msgi)

    bufg = field(msg._C_field_bufg)
    bufs = field(msg._C_field_bufs)

    fromrank = field(msg._C_field_from)
    torank = field(msg._C_field_to)

    ndist = len(f._dist_dimensions)
    sizes = [field('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]
    pending = [field('%s[%d]' % (msg._C_field_ofsg, i)) for i in range(ndist)]

    # A fixed offset, when available, takes precedence; all other Dimensions
    # consume, in order, the next offset stored in the message
    ofsg = []
    for d in f.dimensions:
        offset = fixed.get(d)
        if not offset:
            offset = pending.pop(0)
        ofsg.append(offset)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    # NOTE(review): the callee is named `gather_<key>` (with underscore) --
    # confirm this matches the name the gather routine is generated with
    gather = Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg)
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # Make Irecv/Isend
    count = reduce(mul, sizes, 1)
    mpitype = Macro(dtype_to_mpitype(f.dtype))
    tag = Integer(13)
    rrecv = Byref(field(msg._C_field_rrecv))
    rsend = Byref(field(msg._C_field_rsend))
    recv = Call('MPI_Irecv',
                [bufs, count, mpitype, fromrank, tag, comm, rrecv])
    send = Call('MPI_Isend',
                [bufg, count, mpitype, torank, tag, comm, rsend])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([recv, gather, send], dim, ncomms - 1)

    parameters = [f, comm, msg, ncomms, *fixed.values()]

    return Callable('haloupdate%d' % key, iet, 'void', parameters, ('static', ))
def _make_halowait(self, f, hse, key, msg=None):
    """
    Build the `halowait<key>` Callable.

    The generated function iterates over `ncomms` message entries; for each
    entry it waits (`MPI_Wait`) on the send request, then on the receive
    request, and finally scatters the received buffer into `f` (skipped when
    the source rank is `MPI_PROC_NULL`).

    Parameters
    ----------
    f
        The function whose halo is exchanged; `dtype`, `dimensions` and
        `_dist_dimensions` are read.
    hse
        Object exposing `loc_indices`; each of them becomes an `o<root>`
        Symbol, used as offset and added to the Callable parameters.
    key
        Suffix for the generated function name and for the scatter callee.
    msg
        Message object providing the `_C_field_*` names. Although it
        defaults to None, it is dereferenced unconditionally.

    Returns
    -------
    Callable
        A `static void` function with parameters
        `[f] + <fixed offsets> + [msg, ncomms]`.
    """
    cast = cast_mapper[(f.dtype, '*')]

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)

    def field(name):
        # Access `msg[i].<name>`
        return FieldFromComposite(name, msgi)

    bufs = field(msg._C_field_bufs)
    fromrank = field(msg._C_field_from)

    ndist = len(f._dist_dimensions)
    sizes = [field('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]
    pending = [field('%s[%d]' % (msg._C_field_ofss, i)) for i in range(ndist)]

    # A fixed offset, when available, takes precedence; all other Dimensions
    # consume, in order, the next offset stored in the message
    ofss = []
    for d in f.dimensions:
        offset = fixed.get(d)
        if not offset:
            offset = pending.pop(0)
        ofss.append(offset)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter%s' % key, [cast(bufs)] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(field(msg._C_field_rrecv))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])

    rsend = Byref(field(msg._C_field_rsend))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([waitsend, waitrecv, scatter], dim, ncomms - 1)

    parameters = [f, *fixed.values(), msg, ncomms]

    return Callable('halowait%d' % key, iet, 'void', parameters, ('static',))
def _make_waitprefetch(self, iet, sync_ops, pieces, *args):
    """
    Prepend to `iet` the busy-waits for the prefetches tied to `sync_ops`.

    For each distinct `(sdata, threads)` pair that `pieces.objs` associates
    with the given sync operations, a `BusyWait` is emitted that spins while
    the flag field of `sdata[threads.index]` differs from 1.

    Parameters
    ----------
    iet
        The IET to be executed once the data has arrived.
    sync_ops
        Iterable of sync operations, used as keys into `pieces.objs`.
    pieces
        Container exposing `objs`.
    *args
        Ignored.

    Returns
    -------
    List
        The busy-waits, a blank line, and then the input `iet`.
    """
    ff = SharedData._field_flag

    objs = filter_ordered(pieces.objs.get(s) for s in sync_ops)
    waits = [BusyWait(CondNe(FieldFromComposite(ff, sdata[threads.index]), 1))
             for sdata, threads in objs]

    return List(
        header=c.Comment("Wait for the arrival of prefetched data"),
        body=waits + [BlankLine, iet]
    )
def _make_poke(self, hs, key, msgs):
    """
    Build the `pokempi<key>` elemental function.

    The generated function declares a local `flag` initialised to 0 and, for
    each message in `msgs`, loops over the message entries
    (`0 .. npeers - 1`) calling `MPI_Test` first on the send request and
    then on the receive request.

    Parameters
    ----------
    hs
        Not used by this implementation.
    key : int
        Suffix making the function name unique.
    msgs
        Iterable of message objects exposing `_C_field_rrecv`,
        `_C_field_rsend` and `npeers`.

    Returns
    -------
    The object produced by `make_efunc`.
    """
    flag = Symbol(name='flag')

    def mpi_test(request_field, msgi):
        # MPI_Test(&msg[i].<request_field>, &flag, MPI_STATUS_IGNORE)
        request = Byref(FieldFromComposite(request_field, msgi))
        return Call('MPI_Test',
                    [request, Byref(flag), Macro('MPI_STATUS_IGNORE')])

    body = [LocalExpression(DummyEq(flag, 0))]

    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        testrecv = mpi_test(msg._C_field_rrecv, msgi)
        testsend = mpi_test(msg._C_field_rsend, msgi)

        # The -1 is because the Iteration upper bound is inclusive
        body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))

    return make_efunc('pokempi%d' % key, body)
def __make_init_threads(self, threads, sdata, tfunc, pieces):
    """
    Build the IET snippet that initialises and launches the threads.

    For each thread the emitted code assigns the id field of the thread's
    `sdata` entry and constructs a `std::thread` running `tfunc`. The id is
    `1 + <total size of the pools already in pieces.threads> + <index>`.
    With more than one thread, these steps are wrapped in an Iteration over
    the thread index.

    Parameters
    ----------
    threads
        Thread pool object; `size`, `index` and `name` are read, and it is
        indexed by the thread index.
    sdata
        Shared-data object; `_field_id` and `symbolic_base` are read.
    tfunc
        The function each thread runs; `name` and `parameters` are read.
    pieces
        Container exposing `threads`, the pools registered so far.

    Returns
    -------
    List
        The initialization code, preceded by a comment.
    """
    tid = threads.index

    # Ids are offset by the number of threads in the pools registered so far
    offset = 1 + sum(pool.size for pool in pieces.threads)
    idinit = DummyExpr(FieldFromComposite(sdata._field_id, sdata[tid]),
                       offset + tid)

    # The last parameter is replaced with a pointer to this thread's sdata
    arguments = list(tfunc.parameters)
    arguments[-1] = sdata.symbolic_base + tid
    launch = Call('std::thread',
                  Call(tfunc.name, arguments, is_indirect=True),
                  retobj=threads[tid])

    steps = [idinit, launch]
    if threads.size == 1:
        body = steps
    else:
        # The -1 is because the Iteration upper bound is inclusive
        body = Iteration(steps, tid, threads.size - 1)

    return List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=body
    )
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap `iet` with the logic to fetch, wait for, and prefetch streamed data.

    For each sync operation three pieces are built: an initial fetch
    (anchored at the first point along the streaming Dimension -- its
    symbolic minimum when moving Forward, its symbolic maximum otherwise),
    the "present" clauses for the current fetch point, and a prefetch of the
    next point (`fetch + 1` or `fetch - 1`). Fetches and prefetches are
    guarded by the conditions returned by `s.next_cbk`.

    Side effects on `pieces`:

    * `funcs`: gains the `init_device*` Callable and the functions of the
      thread context built for the prefetches;
    * `init`: gains the call performing the initial fetch, then the thread
      initialization;
    * `threads`: gains the new thread pool;
    * `finalize`: gains the thread finalization.

    Parameters
    ----------
    iet
        The IET consuming the streamed data.
    sync_ops
        The sync operations to be lowered; iterated more than once.
    pieces
        Container collecting the generated objects (see above).
    root
        Forwarded to `make_thread_ctx`.

    Returns
    -------
    List
        A busy-wait on the thread flag, the present clauses, the input
        `iet`, and the thread activation, in this order.
    """
    def imask_at(op, anchor):
        # `(anchor, size)` along the streaming Dimension, FULL elsewhere
        return [(anchor, op.size) if d.root is op.dim.root else FULL
                for d in op.dimensions]

    fetches, prefetches, presents = [], [], []
    for s in sync_ops:
        # The direction decides the starting edge and where the next point is
        if s.direction is Forward:
            edge = s.dim.symbolic_min
            pfc = s.fetch + 1
            upcoming = s.dim + 1
        else:
            edge = s.dim.symbolic_max
            pfc = s.fetch - 1
            upcoming = s.dim - 1
        fc = s.fetch.subs(s.dim, edge)
        fc_cond = s.next_cbk(edge)
        pfc_cond = s.next_cbk(upcoming)

        # Construct init IET
        fetch = PragmaList(self.lang._map_to(s.function, imask_at(s, fc)),
                           {s.function} | fc.free_symbols)
        fetches.append(Conditional(fc_cond, fetch))

        # Construct present clauses
        present = self.lang._map_present(s.function, imask_at(s, s.fetch))
        presents.extend(as_list(present))

        # Construct prefetch IET
        prefetch = PragmaList(
            self.lang._map_to_wait(s.function, imask_at(s, pfc),
                                   SharedData._field_id),
            {s.function} | pfc.free_symbols
        )
        prefetches.append(Conditional(pfc_cond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(init_body))
    pieces.funcs.append(
        Callable(init_name, init_body, 'void', parameters, 'static')
    )

    # Perform initial fetch by the main thread
    pieces.init.append(List(
        header=c.Comment("Initialize data stream"),
        body=[Call(init_name, parameters), BlankLine]
    ))

    # Turn prefetch IET into a ThreadFunction
    tname = self.sregistry.make_name(prefix='prefetch_host_to_device')
    tbody = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(tname, tbody, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    glued = List(body=[
        BlankLine,
        BusyWait(CondNe(flag, 1)),
        List(header=presents),
        iet,
        tctx.activate
    ])

    # Fire up the threads
    pieces.init.append(tctx.init)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    return glued
def test_make_cpp_parfor(): """ Test construction of a CPP parallel for. This excites the IET construction machinery in several ways, in particular by using Lambda nodes (to generate C++ lambda functions) and nested Calls. """ class STDVectorThreads(LocalObject): dtype = type('std::vector<std::thread>', (c_void_p, ), {}) def __init__(self): self.name = 'threads' class STDThread(LocalObject): dtype = type('std::thread&', (c_void_p, ), {}) def __init__(self, name): self.name = name class FunctionType(LocalObject): dtype = type('FuncType&&', (c_void_p, ), {}) def __init__(self, name): self.name = name # Basic symbols nthreads = Symbol(name='nthreads', is_const=True) threshold = Symbol(name='threshold', is_const=True) last = Symbol(name='last', is_const=True) first = Symbol(name='first', is_const=True) portion = Symbol(name='portion', is_const=True) # Composite symbols threads = STDVectorThreads() # Iteration helper symbols begin = Symbol(name='begin') l = Symbol(name='l') end = Symbol(name='end') # Functions stdmax = sympy.Function('std::max') # Construct the parallel-for body func = FunctionType('func') i = Dimension(name='i') threadobj = Call( 'std::thread', Lambda( Iteration(Call(func.name, i), i, (begin, end - 1, 1)), ['=', Byref(func.name)], )) threadpush = Call(FieldFromComposite('push_back', threads), threadobj) it = Dimension(name='it') iteration = Iteration([ LocalExpression(DummyEq(begin, it)), LocalExpression(DummyEq(l, it + portion)), LocalExpression(DummyEq(end, InlineIf(l > last, last, l))), threadpush ], it, (first, last, portion)) thread = STDThread('x') waitcall = Call('std::for_each', [ Call(FieldFromComposite('begin', threads)), Call(FieldFromComposite('end', threads)), Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread]) ]) body = [ LocalExpression(DummyEq(threshold, 1)), LocalExpression( DummyEq(portion, stdmax(threshold, (last - first) / nthreads))), Call(FieldFromComposite('reserve', threads), nthreads), iteration, waitcall ] 
parfor = ElementalFunction('parallel_for', body, 'void', [first, last, func, nthreads]) assert str(parfor) == """\
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap `iet` with the logic to fetch, wait for, and prefetch streamed data.

    For each sync operation three pieces are built: an initial fetch
    (anchored at the symbolic minimum of the streaming Dimension when moving
    Forward, at its symbolic maximum otherwise), the "present" clauses for
    the current fetch point, and a prefetch of the next point (`fetch + 1`
    or `fetch - 1`). Fetches and prefetches are guarded so that the accessed
    window stays in bounds: below the size of the function along the
    streaming Dimension when moving Forward, non-negative otherwise.

    Side effects on `pieces`:

    * `funcs`: gains the `init_device*` Callable and the threaded Callable
      performing the prefetches;
    * `init`: gains the call performing the initial fetch, then the thread
      initialization;
    * `threads`: gains the new thread pool;
    * `finalize`: gains the thread finalization.

    Parameters
    ----------
    iet
        The IET consuming the streamed data.
    sync_ops
        The sync operations to be lowered; iterated more than once.
    pieces
        Container collecting the generated objects (see above).
    root
        Forwarded to the builder of the threaded Callable.

    Returns
    -------
    List
        A busy-wait on the thread flag, the present clauses, the input
        `iet`, and the thread activation, in this order.
    """
    def imask_at(op, anchor):
        # `(anchor, size)` along the streaming Dimension, FULL elsewhere
        return [(anchor, op.size) if d.root is op.dim.root else FULL
                for d in op.dimensions]

    threads = self.__make_threads()

    fetches, prefetches, presents = [], [], []
    for s in sync_ops:
        if s.direction is Forward:
            fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
            pfc = s.fetch + 1
            # The last accessed point must lie within the function size
            fsize = s.function._C_get_field(FULL, s.dim).size
            span = s.size - 1
            fc_cond = fc + span < fsize
            pfc_cond = pfc + span < fsize
        else:
            fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
            pfc = s.fetch - 1
            # The first accessed point must not be negative
            fc_cond = fc >= 0
            pfc_cond = pfc >= 0

        # Construct fetch IET
        fetch = List(header=self._P._map_to(s.function, imask_at(s, fc)))
        fetches.append(Conditional(fc_cond, fetch))

        # Construct present clauses
        present = self._P._map_present(s.function, imask_at(s, s.fetch))
        presents.extend(as_list(present))

        # Construct prefetch IET
        prefetch = List(header=self._P._map_to_wait(
            s.function, imask_at(s, pfc), SharedData._field_id
        ))
        prefetches.append(Conditional(pfc_cond, prefetch))

    functions = filter_ordered(s.function for s in sync_ops)
    casts = [PointerCast(f) for f in functions]

    # Turn init IET into a Callable
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=casts + fetches)
    parameters = filter_sorted(functions + derive_parameters(init_body))
    func = Callable(init_name, init_body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    pieces.init.append(List(
        header=c.Comment("Initialize data stream for `%s`" % threads.name),
        body=[Call(init_name, func.parameters), BlankLine]
    ))

    # Turn prefetch IET into a threaded Callable
    tname = self.sregistry.make_name(prefix='prefetch_host_to_device')
    tbody = List(header=c.Line(), body=casts + prefetches)
    tfunc, sdata = self.__make_tfunc(tname, tbody, root, threads)
    pieces.funcs.append(tfunc)

    # Glue together all the IET pieces, including the activation bits
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    glued = List(body=[
        BlankLine,
        BusyWait(CondNe(flag, 1)),
        List(header=presents),
        iet,
        self.__make_activate_thread(threads, sdata, sync_ops)
    ])

    # Fire up the threads. The initialization is built *before* `threads` is
    # registered in `pieces.threads`, and it is handed `pieces`; keep this
    # order (NOTE(review): presumably thread ids are derived from the pools
    # registered so far -- confirm against `__make_init_threads`)
    threadsinit = self.__make_init_threads(threads, sdata, tfunc, pieces)
    pieces.init.append(threadsinit)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return glued
def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap `iet` with the logic to fetch and prefetch streamed data.

    For each sync operation three pieces are built: an initial fetch at
    `s.ifetch` guarded by `s.fcond`, a "present" transfer at `s.fetch`, and
    a prefetch at `s.pfetch` guarded by `s.pcond`; the prefetch is issued
    with `queueid` set to `SharedData._field_id`.

    Side effects on `pieces`:

    * `funcs`: gains the `init_device*` Callable and the functions of the
      thread context built for the prefetches;
    * `init`: gains the call performing the initial fetch, then the thread
      initialization;
    * `finalize`: gains the thread finalization;
    * `objs`: records `(sdata, threads)` for the given `sync_ops`.

    Parameters
    ----------
    iet
        The IET consuming the streamed data.
    sync_ops
        The sync operations to be lowered; iterated more than once.
    pieces
        Container collecting the generated objects (see above).
    root
        Forwarded to `make_thread_ctx`.

    Returns
    -------
    List
        A busy-wait on the thread flag, the present transfers, the input
        `iet`, and the thread activation, in this order.
    """
    def imask_at(op, anchor):
        # `(anchor, size)` along the streaming Dimension, FULL elsewhere
        return [(anchor, op.size) if d.root is op.dim.root else FULL
                for d in op.dimensions]

    fid = SharedData._field_id

    fetches, prefetches, presents = [], [], []
    for s in sync_ops:
        f = s.function

        # Construct init IET
        fetch = PragmaTransfer(self.lang._map_to, f,
                               imask=imask_at(s, s.ifetch))
        fetches.append(Conditional(s.fcond, fetch))

        # Construct present clauses
        presents.append(PragmaTransfer(self.lang._map_present, f,
                                       imask=imask_at(s, s.fetch)))

        # Construct prefetch IET
        prefetch = PragmaTransfer(self.lang._map_to_wait, f,
                                  imask=imask_at(s, s.pfetch), queueid=fid)
        prefetches.append(Conditional(s.pcond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(init_body))
    pieces.funcs.append(
        Callable(init_name, init_body, 'void', parameters, 'static')
    )

    # Perform initial fetch by the main thread
    pieces.init.append(List(
        header=c.Comment("Initialize data stream"),
        body=[Call(init_name, parameters), BlankLine]
    ))

    # Turn prefetch IET into a ThreadFunction
    tname = self.sregistry.make_name(prefix='prefetch_host_to_device')
    tbody = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(tname, tbody, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    prologue = [BlankLine, BusyWait(CondNe(flag, 1))]
    glued = List(body=prologue + presents + [iet, tctx.activate])

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, sdata, threads)

    return glued