class Circle:
    def __init__(self, name="Circle", split="equal", k=2, dbname=None, resume=False):
        random.seed()  # use system time to seed
        self.comm = MPI.COMM_WORLD
        self.comm.Set_name(name)
        self.size = self.comm.Get_size()
        self.rank = self.comm.Get_rank()
        self.d = {"rank": "rank %s" % self.rank}
        self.logger = getLogger(__name__)
        self.useStore = G.use_store
        self.split = split
        self.dbname = dbname
        self.resume = resume
        self.reduce_time_interval = G.reduce_interval
        self.task = None
        self.abort = False
        self.requestors = []

        # workq buffer
        if self.useStore:
            self.workq_buf = deque()

        # counters
        self.work_requested = 0
        self.work_processed = 0
        self.work_request_received = 0
        self.workreq_outstanding = False
        self.workreq_rank = None

        # reduction
        self.reduce_enabled = True
        self.reduce_time_last = MPI.Wtime()
        self.reduce_outstanding = False
        self.reduce_replies = 0
        self.reduce_buf = {}
        self.reduce_status = None

        # barriers
        self.barrier_started = False
        self.barrier_up = False  # flag to indicate barrier sent to parent
        self.barrier_replies = 0

        self.workdir = os.getcwd()
        self.tempdir = os.path.join(self.workdir, ".pcircle")
        if not os.path.exists(self.tempdir):
            try:
                os.mkdir(self.tempdir)
            except OSError:
                pass

        # token
        self.token = Token(self)

        # tree init
        self.k = k
        self.parent_rank = MPI.PROC_NULL
        self.child_ranks = []  # [MPI.PROC_NULL] * k is too much C
        self.children = 0

        # compute rank of parent if we have one
        if self.rank > 0:
            self.parent_rank = (self.rank - 1) // k

        # identify ranks of what would be leftmost and rightmost children
        left = self.rank * k + 1
        right = self.rank * k + k

        # if we have at least one child,
        # compute the number of children and the list of child ranks
        if left < self.size:
            # adjust right child in case we don't have a full set of k
            if right >= self.size:
                right = self.size - 1
            self.children = right - left + 1
            for i in range(self.children):
                self.child_ranks.append(left + i)

        self.logger.debug("parent: %s, children: %s" % (self.parent_rank, self.child_ranks),
                          extra=self.d)

        # workq init
        # TODO: compare list vs. deque
        if G.use_store:
            self.workq_init(dbname, resume)
        else:
            self.workq = []

        self.logger.debug("Circle initialized", extra=self.d)

    def finalize(self, cleanup=True):
        if cleanup and G.use_store:
            self.workq.cleanup()

    def workq_init(self, dbname=None, resume=False):
        # NOTE: the db filename and the rank are separated with "-"; we rely on
        # that to split them apart, so the filename itself (within our control)
        # should not use a dash -- the default separator is ".".
        # Yes, this is fragile; hopefully we will fix it later.
        if dbname is None:
            self.dbname = os.path.join(self.tempdir, "workq-%s" % self.rank)
        else:
            self.dbname = os.path.join(self.tempdir, "%s-%s" % (dbname, self.rank))
        self.workq = DbStore(self.dbname, resume=resume)

    def next_proc(self):
        """ Note: next proc could return the rank of itself """
        if self.size == 1:
            return MPI.PROC_NULL
        else:
            return random.randint(0, self.size - 1)

    def workq_info(self):
        s = "has %s items in work queue\n" % len(self.workq)
        return s

    def qsize(self):
        return len(self.workq)

    def begin(self, task):
        """ entry point to work """
        self.task = task
        self.task.create()
        self.comm.barrier()
        self.loop()
        if self.rank == 0:
            self.logger.debug("Loop finished, cleaning up ...", extra=self.d)
        self.cleanup()
        self.comm.barrier()

        if len(self.workq) != 0:
            pprint("Rank %s workq.len = %s" % (self.rank, self.qsize()))
            pprint(self.__dict__)
            sys.stdout.flush()
            self.comm.Abort(1)

    def loop(self):
        """ central loop to finish the work """
        while True:
            # check for and service requests
            self.workreq_check()

            if self.reduce_enabled:
                self.reduce_check()

            if len(self.workq) == 0:
                self.request_work()

            # if I have work, and no abort signal, process one
            if len(self.workq) > 0 and not self.abort:
                self.task.process()
                self.work_processed += 1
            else:
                status = self.token.check_for_term()
                if status == G.TERMINATE:
                    break

    def enq(self, work):
        if work is None:
            self.logger.warn("enq work item is None", extra=self.d)
            return
        self.workq.append(work)

    def preq(self, work):
        self.workq.insert(0, work)

    def setq(self, q):
        self.workq = q

    def deq(self):
        if len(self.workq) > 0:
            return self.workq.pop()
        else:
            return None

    def barrier_start(self):
        self.barrier_started = True

    def barrier_test(self):
        if not self.barrier_started:
            return False

        # check if we have received a message from all children
        if self.barrier_replies < self.children:
            # still waiting for barriers from children
            st = MPI.Status()
            flag = self.comm.Iprobe(MPI.ANY_SOURCE, T.BARRIER, st)
            if flag:
                child = st.Get_source()
                self.comm.recv(source=child, tag=T.BARRIER)
                self.barrier_replies += 1

        # if we have not sent a message to our parent, and we have received a
        # message from all of our children (or we have no children),
        # send a message to our parent
        if not self.barrier_up and self.barrier_replies == self.children:
            if self.parent_rank != MPI.PROC_NULL:
                self.comm.send(None, self.parent_rank, T.BARRIER)
            # transition to the state where we're waiting for the parent
            # to notify us that the barrier is complete
            self.barrier_up = True

        # wait for a message to come back down from the parent
        # to mark the end of the barrier
        complete = False
        if self.barrier_up:
            if self.parent_rank != MPI.PROC_NULL:
                # check for message from parent
                flag = self.comm.Iprobe(self.parent_rank, T.BARRIER)
                if flag:
                    self.comm.recv(source=self.parent_rank, tag=T.BARRIER)
                    # mark barrier as complete
                    complete = True
            else:
                # we have no parent, we must be root,
                # so mark the barrier complete
                complete = True

        # barrier is complete: send messages to children, if any, and return True
        if complete:
            for child in self.child_ranks:
                self.comm.send(None, dest=child, tag=T.BARRIER)
            # reset state for another barrier
            self.barrier_started = False
            self.barrier_up = False
            self.barrier_replies = 0
            return True

        # barrier still not complete
        return False

    def bcast_abort(self):
        self.abort = True
        buf = G.ABORT
        for i in range(self.size):
            if i != self.rank:
                self.comm.send(buf, dest=i, tag=T.WORK_REQUEST)
                self.logger.warn("abort message sent to %s" % i, extra=self.d)

    def cleanup(self):
        while True:
            # start non-blocking barrier if we have no outstanding items
            if not self.reduce_outstanding and \
                    not self.workreq_outstanding and \
                    self.token.send_req == MPI.REQUEST_NULL:
                self.barrier_start()

            # break the loop when the non-blocking barrier completes
            if self.barrier_test():
                break

            # send a no-work message for any work request that comes in
            self.workreq_check(cleanup=True)

            # clean up any outstanding reduction
            if self.reduce_enabled:
                self.reduce_check(cleanup=True)

            # recv any incoming work reply messages
            self.request_work(cleanup=True)

            # check and recv any incoming token
            self.token.check_and_recv()

            # if we have an outstanding token, check if it has been recv'ed
            # (this may not be needed, as there seems to be no side effect)
            if self.token.send_req != MPI.REQUEST_NULL:
                if self.token.send_req.Test():
                    self.token.send_req = MPI.REQUEST_NULL

    def workreq_check(self, cleanup=False):
        """ for any process that sends a work request message:
                add the process to the requester list
            if my work queue is not empty:
                distribute the work evenly
            else:
                send a "no work" message to each requester
            reset the requester list to empty
        """
        while True:
            st = MPI.Status()
            ret = self.comm.Iprobe(source=MPI.ANY_SOURCE, tag=T.WORK_REQUEST, status=st)
            if not ret:
                # no work request, break out of the loop
                break
            # we have a work request message
            rank = st.Get_source()
            buf = self.comm.recv(source=rank, tag=T.WORK_REQUEST, status=st)
            if buf == G.ABORT:
                self.logger.warn("Abort request from rank %s" % rank, extra=self.d)
                self.abort = True
                self.send_no_work(rank)
                return
            else:
                self.logger.debug("receive work request from requestor [%s]" % rank,
                                  extra=self.d)
                # add rank to requesters
                self.requestors.append(rank)

        # out of the while loop
        if not self.requestors:
            return
        else:
            self.logger.debug("have %s requesters, with %s work items in queue" %
                              (len(self.requestors), len(self.workq)), extra=self.d)
            # we have work requesters
            if len(self.workq) == 0 or cleanup:
                for rank in self.requestors:
                    self.send_no_work(rank)
            else:
                # we do have work
                self.send_work_to_many()
            self.requestors = []

    def spread_counts(self, rcount, wcount):
        """
        @rcount: number of requestors
        @wcount: number of work items
        @return: a list of counts that spreads the work evenly among all requesters

        case 1: wcount == rcount: base = 0, extra = wcount, each requestor gets 1
        case 2: wcount < rcount:  base = 0, extra = wcount, the first "wcount" requesters get 1
        case 3: wcount > rcount:  base may be > 0; extras go to the first requesters
        """
        if self.split != "equal":
            raise NotImplementedError

        base = wcount // (rcount + 1)  # leave ourselves a base number of work items
        extra = wcount - base * (rcount + 1)
        assert extra <= rcount
        sizes = [base] * rcount
        for i in range(extra):
            sizes[i] += 1
        return sizes

    def send_no_work(self, rank):
        """ send a no-work reply to someone requesting work """
        buf = {G.KEY: G.ABORT} if self.abort else {G.KEY: G.ZERO}
        r = self.comm.isend(buf, dest=rank, tag=T.WORK_REPLY)
        r.wait()
        self.logger.debug("Send no work reply to %s" % rank, extra=self.d)

    def send_work_to_many(self):
        rcount = len(self.requestors)
        wcount = len(self.workq)
        sizes = self.spread_counts(rcount, wcount)
        self.logger.debug("requester count: %s, work count: %s, spread: %s" %
                          (rcount, wcount, sizes), extra=self.d)
        for idx, dest in enumerate(self.requestors):
            self.send_work(dest, sizes[idx])

    def send_work(self, rank, witems):
        """
        @rank   - the rank of the requester
        @witems - the number of work items to send
        """
        if witems <= 0:
            self.send_no_work(rank)
            return

        # for termination detection
        if (rank < self.rank) or (rank == self.token.src):
            self.token.proc = G.BLACK

        # depending on whether the queue is memory-based or store-based,
        # we have different ways of constructing buf
        if self.useStore:
            objs, size = self.workq.mget(witems)
            buf = {G.KEY: witems, G.VAL: objs}
        else:
            buf = {G.KEY: witems, G.VAL: self.workq[0:witems]}

        self.comm.send(buf, dest=rank, tag=T.WORK_REPLY)
        self.logger.debug("%s work items sent to rank %s" % (witems, rank), extra=self.d)

        # remove the (witems) work items we just sent:
        # for DbStore, all we need is a count, not the actual objects;
        # for KVStore, we do need the object list for its key values.
        # the "size" is a bit awkward -- we know it after mget(), but it is not
        # readily available when we do mdel(), so we keep the earlier value and
        # pass it back in to save some time.
        if self.useStore:
            self.workq.mdel(witems, size)
        else:
            del self.workq[0:witems]

    def request_work(self, cleanup=False):
        if self.workreq_outstanding:
            st = MPI.Status()
            reply = self.comm.Iprobe(source=self.work_requested_rank,
                                     tag=T.WORK_REPLY, status=st)
            if reply:
                self.work_receive(self.work_requested_rank)
                # flip the flag to indicate we are no longer waiting for a reply
                self.workreq_outstanding = False
            # else:
            #     self.logger.debug("has req outstanding, dest = %s, no reply" %
            #                       self.work_requested_rank, extra=self.d)
        elif not cleanup:
            # send a request
            dest = self.next_proc()
            if dest == self.rank or dest == MPI.PROC_NULL:
                # have no one to ask, we are done
                return
            buf = G.ABORT if self.abort else G.MSG
            # blocking send
            self.logger.debug("send work request to rank %s : %s" % (dest, G.str[buf]),
                              extra=self.d)
            self.comm.send(buf, dest, T.WORK_REQUEST)
            self.workreq_outstanding = True
            self.work_requested_rank = dest

    def work_receive(self, rank):
        """ called when an incoming work reply is detected """
        buf = self.comm.recv(source=rank, tag=T.WORK_REPLY)

        if buf[G.KEY] == G.ABORT:
            self.logger.debug("receive abort signal", extra=self.d)
            self.abort = True
            return
        elif buf[G.KEY] == G.ZERO:
            self.logger.debug("receive no work signal", extra=self.d)
            return
        else:
            assert type(buf[G.VAL]) == list
            self.workq.extend(buf[G.VAL])

    def reduce(self, buf):
        # copy data from the user buffer
        self.reduce_buf = copy(buf)

    def reduce_check(self, cleanup=False):
        """ initiate and progress a reduce operation at the specified interval,
        ensure progress of the reduction in the background, and stop the
        reduction if the cleanup flag is True """

        if self.reduce_outstanding:
            # we have an outstanding reduce: check for messages from children
            for child in self.child_ranks:
                if self.comm.Iprobe(source=child, tag=T.REDUCE):
                    # receive message from child:
                    # the 'status' element is G.MSG_VALID or not; the rest is opaque
                    inbuf = self.comm.recv(source=child, tag=T.REDUCE)
                    self.reduce_replies += 1
                    self.logger.debug("client data from %s: %s" % (child, inbuf),
                                      extra=self.d)
                    if inbuf['status'] == G.MSG_INVALID:
                        self.reduce_status = False
                    else:
                        self.reduce_status = True
                        # invoke the user's callback to reduce user data
                        if hasattr(self.task, "reduce"):
                            self.task.reduce(self.reduce_buf, inbuf)

            # check if we have gotten replies from all children
            if self.reduce_replies == self.children:
                # all children replied: add our own contents to the reduce buffer
                # and send the message to our parent if we have one
                if self.parent_rank != MPI.PROC_NULL:
                    self.comm.send(self.reduce_buf, self.parent_rank, T.REDUCE)
                else:
                    # we are the root: print results if we have valid data
                    if self.reduce_status and hasattr(self.task, "reduce_report"):
                        self.task.reduce_report(self.reduce_buf)
                    # invoke the callback on root to deliver final results
                    if hasattr(self.task, "reduce_finish"):
                        self.task.reduce_finish(self.reduce_buf)
                # clear the flag to indicate we got what we want
                self.reduce_outstanding = False
        else:
            # we don't have an outstanding reduction; determine whether a new
            # reduce should be started -- only bother checking if we think it
            # is about time, or if we are in cleanup mode
            start_reduce = False

            time_now = MPI.Wtime()
            time_next = self.reduce_time_last + self.reduce_time_interval
            if time_now >= time_next or cleanup:
                if self.parent_rank == MPI.PROC_NULL:
                    # we are root, kick it off
                    start_reduce = True
                elif self.comm.Iprobe(source=self.parent_rank, tag=T.REDUCE):
                    # we are not root and the parent sent us a message:
                    # receive it and set the flag to start the reduce
                    self.comm.recv(source=self.parent_rank, tag=T.REDUCE)
                    start_reduce = True

            # it is critical that we don't start a reduce if we are in the
            # cleanup phase, because we may have already started the
            # non-blocking barrier; just send an invalid message back to the parent
            if start_reduce and cleanup:
                # avoid starting a reduce
                start_reduce = False
                # if we have a parent, send an invalid msg
                if self.parent_rank != MPI.PROC_NULL:
                    self.reduce_status = G.MSG_INVALID
                    self.comm.send(self.reduce_buf, self.parent_rank, T.REDUCE)

            if start_reduce:
                # set the flag to indicate we have a reduce outstanding
                # and initiate state for a fresh reduction
                self.reduce_time_last = time_now
                self.reduce_outstanding = True
                self.reduce_replies = 0
                self.reduce_status = G.MSG_VALID
                self.reduce_buf['status'] = G.MSG_VALID

                # invoke the callback to get input data
                if hasattr(self.task, "reduce_init"):
                    self.task.reduce_init(self.reduce_buf)

                # send a message to each child
                for child in self.child_ranks:
                    self.comm.send(None, child, T.REDUCE)

    @staticmethod
    def exit(code):
        MPI.Finalize()
        sys.exit(code)
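# Worked example (added for illustration; the numbers are not from the original
# source): with size=10 ranks and k=2, Circle.__init__ above builds a binary
# tree in which rank 0 gets children [1, 2], rank 1 gets [3, 4], and rank 4
# gets the truncated list [9]; the parent of rank r is (r - 1) // k, so rank 9
# reports to rank 4.  Likewise, spread_counts(rcount=3, wcount=10) computes
# base = 10 // 4 = 2 and extra = 2, keeps 2 items for the local rank, and
# returns [3, 3, 2] for the three requestors.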
class Circle:
    def __init__(self, name="Circle", split="equal", k=2, dbname=None, resume=False):
        random.seed()  # use system time to seed
        self.comm = MPI.COMM_WORLD
        self.comm.Set_name(name)
        self.size = self.comm.Get_size()
        self.rank = self.comm.Get_rank()
        self.host = MPI.Get_processor_name()
        self.pid = os.getpid()
        self.d = {"rank": "rank %s" % self.rank}
        self.logger = getLogger(__name__)
        self.split = split
        self.dbname = dbname
        self.resume = resume
        self.reduce_time_interval = G.reduce_interval
        self.task = None
        self.abort = False
        self.requestors = []

        # counters
        self.work_requested = 0
        self.work_processed = 0
        self.work_request_received = 0
        self.workreq_outstanding = False
        self.workreq_rank = None

        # reduction
        self.reduce_enabled = False
        self.reduce_time_last = MPI.Wtime()
        self.reduce_outstanding = False
        self.reduce_replies = 0
        self.reduce_buf = {}
        self.reduce_status = None

        # periodic report
        self.report_enabled = False
        self.report_interval = 60
        self.report_last = MPI.Wtime()
        self.report_processed = 0

        # barriers
        self.barrier_started = False
        self.barrier_up = False  # flag to indicate barrier sent to parent
        self.barrier_replies = 0

        self.workdir = os.getcwd()
        if not G.tempdir:
            G.tempdir = os.path.join(os.getcwd(), ".pcircle" + utils.timestamp())
            G.tempdir = self.comm.bcast(G.tempdir)
        if not os.path.exists(G.tempdir):
            try:
                os.mkdir(G.tempdir)
            except OSError:
                pass

        # token
        self.token = Token(self)

        # tree init
        self.k = k
        self.parent_rank = MPI.PROC_NULL
        self.child_ranks = []  # [MPI.PROC_NULL] * k is too much C
        self.children = 0

        # compute rank of parent if we have one
        if self.rank > 0:
            self.parent_rank = (self.rank - 1) // k

        # identify ranks of what would be leftmost and rightmost children
        left = self.rank * k + 1
        right = self.rank * k + k

        # if we have at least one child,
        # compute the number of children and the list of child ranks
        if left < self.size:
            # adjust right child in case we don't have a full set of k
            if right >= self.size:
                right = self.size - 1
            self.children = right - left + 1
            for i in range(self.children):
                self.child_ranks.append(left + i)

        self.logger.debug("parent: %s, children: %s" % (self.parent_rank, self.child_ranks),
                          extra=self.d)

        # workq init
        # TODO: compare list vs. deque
        # three possible work queues: workq (in memory), workq_buf (an in-memory
        # buffer used when pushing to or retrieving from the database), and
        # workq_db (the on-disk store, created lazily)
        self.workq = deque()
        # workq buffer
        self.workq_buf = deque()
        # flag that indicates the database is used for the workq
        self.use_store = False
        if G.resume:
            self.workq_init(self.dbname, G.resume)

        self.logger.debug("Circle initialized", extra=self.d)

    def finalize(self, cleanup=True):
        if cleanup and hasattr(self, "workq_db"):
            self.workq_db.cleanup()
        if os.path.exists(G.tempdir) and cleanup:
            try:
                shutil.rmtree(G.tempdir)
            except:
                pass

    def workq_init(self, dbname=None, resume=False):
        # NOTE: the db filename and the rank are separated with "-"; we rely on
        # that to split them apart, so the filename itself (within our control)
        # should not use a dash -- the default separator is ".".
        # Yes, this is fragile; hopefully we will fix it later.
        if G.resume:
            self.dbname = os.path.join(self.workdir,
                                       ".pcp_workq.%s.%s.db" % (G.rid, self.rank))
            if os.path.exists(self.dbname):
                self.workq_db = DbStore(self.dbname, G.resume)
        else:
            if dbname is None:
                self.dbname = os.path.join(G.tempdir, "workq-%s" % self.rank)
            else:
                self.dbname = os.path.join(G.tempdir, "%s.workq-%s" % (dbname, self.rank))
            self.workq_db = DbStore(self.dbname, resume=G.resume)

    # after task (fcp) creation, push work items left in workq_buf into workq_db
    def push_remaining_buf(self):
        if len(self.workq_buf) > 0:
            self.workq_db.mput(self.workq_buf)
            # flush the in-memory buffer once its contents are in the store
            self.workq_buf.clear()

    def next_proc(self):
        """ Note: next proc could return the rank of itself """
        if self.size == 1:
            return MPI.PROC_NULL
        else:
            return random.randint(0, self.size - 1)

    def workq_info(self):
        s = "has %s items in work queue\n" % self.qsize()
        return s

    def qsize(self):
        qsize = len(self.workq) + len(self.workq_buf)
        if hasattr(self, "workq_db"):
            qsize += len(self.workq_db)
        return qsize

    def begin(self, task):
        """ entry point to work """
        self.task = task
        self.task.create()
        self.comm.barrier()
        self.loop()
        self.cleanup()
        if self.report_enabled:
            self.do_periodic_report(prefix="Circle final report")
        self.comm.barrier()

        if self.qsize() != 0:
            pprint("Rank %s workq.len = %s" % (self.rank, self.qsize()))
            pprint(self.__dict__)
            sys.stdout.flush()
            self.comm.Abort(1)

    def loop(self):
        """ central loop to finish the work """
        while True:
            # check if we shall do a report
            cur_time = MPI.Wtime()
            if self.report_enabled and (cur_time - self.report_last > self.report_interval):
                self.report_last = cur_time
                self.do_periodic_report()

            # check for and service requests
            self.workreq_check()

            if self.reduce_enabled:
                self.reduce_check()

            if self.qsize() == 0:
                self.request_work()

            # if I have work, and no abort signal, process one
            if self.qsize() > 0 and not self.abort:
                self.task.process()
                self.work_processed += 1
            else:
                status = self.token.check_for_term()
                if status == G.TERMINATE:
                    break

    def enq(self, work):
        if work is None:
            self.logger.warn("enq work item is None", extra=self.d)
            return
        if len(self.workq) < G.memitem_threshold:
            self.workq.append(work)
            return
        else:
            self.workq_buf.append(work)
            if len(self.workq_buf) == G.DB_BUFSIZE:
                if not self.use_store:
                    self.workq_init(self.dbname, G.resume)
                    self.use_store = True
                self.workq_db.mput(self.workq_buf)
                self.workq_buf.clear()

    def preq(self, work):
        self.workq.appendleft(work)

    def setq(self, q):
        self.workq = q

    def deq(self):
        # dequeue a work item: first from workq, then workq_buf, then workq_db
        if len(self.workq) > 0:
            return self.workq.pop()
        elif len(self.workq_buf) > 0:
            return self.workq_buf.pop()
        elif hasattr(self, "workq_db") and len(self.workq_db) > 0:
            # read a batch of work items into memory
            workq, objs_size = self.workq_db.mget(G.memitem_threshold)
            self.workq = deque(workq)
            self.workq_db.mdel(G.memitem_threshold, objs_size)
            if len(self.workq) > 0:
                return self.workq.pop()
        else:
            return None

    def barrier_start(self):
        self.barrier_started = True

    def barrier_test(self):
        if not self.barrier_started:
            return False

        # check if we have received a message from all children
        if self.barrier_replies < self.children:
            # still waiting for barriers from children
            st = MPI.Status()
            flag = self.comm.Iprobe(MPI.ANY_SOURCE, T.BARRIER, st)
            if flag:
                child = st.Get_source()
                self.comm.recv(source=child, tag=T.BARRIER)
                self.barrier_replies += 1

        # if we have not sent a message to our parent, and we have received a
        # message from all of our children (or we have no children),
        # send a message to our parent
        if not self.barrier_up and self.barrier_replies == self.children:
            if self.parent_rank != MPI.PROC_NULL:
                self.comm.send(None, self.parent_rank, T.BARRIER)
            # transition to the state where we're waiting for the parent
            # to notify us that the barrier is complete
            self.barrier_up = True

        # wait for a message to come back down from the parent
        # to mark the end of the barrier
        complete = False
        if self.barrier_up:
            if self.parent_rank != MPI.PROC_NULL:
                # check for message from parent
                flag = self.comm.Iprobe(self.parent_rank, T.BARRIER)
                if flag:
                    self.comm.recv(source=self.parent_rank, tag=T.BARRIER)
                    # mark barrier as complete
                    complete = True
            else:
                # we have no parent, we must be root,
                # so mark the barrier complete
                complete = True

        # barrier is complete: send messages to children, if any, and return True
        if complete:
            for child in self.child_ranks:
                self.comm.send(None, dest=child, tag=T.BARRIER)
            # reset state for another barrier
            self.barrier_started = False
            self.barrier_up = False
            self.barrier_replies = 0
            return True

        # barrier still not complete
        return False

    def bcast_abort(self):
        self.abort = True
        buf = G.ABORT
        for i in range(self.size):
            if i != self.rank:
                self.comm.send(buf, dest=i, tag=T.WORK_REQUEST)
                self.logger.warn("abort message sent to %s" % i, extra=self.d)

    def cleanup(self):
        while True:
            # start non-blocking barrier if we have no outstanding items
            if not self.reduce_outstanding and \
                    not self.workreq_outstanding and \
                    self.token.send_req == MPI.REQUEST_NULL:
                self.barrier_start()

            # break the loop when the non-blocking barrier completes
            if self.barrier_test():
                break

            # send a no-work message for any work request that comes in
            self.workreq_check(cleanup=True)

            # clean up any outstanding reduction
            if self.reduce_enabled:
                self.reduce_check(cleanup=True)

            # recv any incoming work reply messages
            self.request_work(cleanup=True)

            # check and recv any incoming token
            self.token.check_and_recv()

            # if we have an outstanding token, check if it has been recv'ed
            # (this may not be needed, as there seems to be no side effect)
            if self.token.send_req != MPI.REQUEST_NULL:
                if self.token.send_req.Test():
                    self.token.send_req = MPI.REQUEST_NULL

    def workreq_check(self, cleanup=False):
        """ for any process that sends a work request message:
                add the process to the requester list
            if my work queue is not empty:
                distribute the work evenly
            else:
                send a "no work" message to each requester
            reset the requester list to empty
        """
        while True:
            st = MPI.Status()
            ret = self.comm.Iprobe(source=MPI.ANY_SOURCE, tag=T.WORK_REQUEST, status=st)
            if not ret:
                # no work request, break out of the loop
                break
            # we have a work request message
            rank = st.Get_source()
            buf = self.comm.recv(source=rank, tag=T.WORK_REQUEST, status=st)
            if buf == G.ABORT:
                self.logger.warn("Abort request from rank %s" % rank, extra=self.d)
                self.abort = True
                self.send_no_work(rank)
                return
            else:
                self.logger.debug("receive work request from requestor [%s]" % rank,
                                  extra=self.d)
                # add rank to requesters
                self.requestors.append(rank)

        # out of the while loop
        if not self.requestors:
            return
        else:
            # first combine workq and workq_buf; both of them are in memory
            if len(self.workq_buf) > 0:
                self.workq.extend(self.workq_buf)
                self.workq_buf.clear()
            # if the in-memory workq is empty, get a batch of work items from the database
            if len(self.workq) == 0 and hasattr(self, "workq_db"):
                if len(self.workq_db) > 0:
                    workq, objs_size = self.workq_db.mget(G.memitem_threshold)
                    self.workq = deque(workq)
                    self.workq_db.mdel(G.memitem_threshold, objs_size)
            self.logger.debug("have %s requesters, with %s work items in queue" %
                              (len(self.requestors), len(self.workq)), extra=self.d)
            # we have work requesters
            if self.qsize() == 0 or cleanup:
                for rank in self.requestors:
                    self.send_no_work(rank)
            else:
                # we do have work
                self.send_work_to_many()
            self.requestors = []

    def spread_counts(self, rcount, wcount):
        """
        @rcount: number of requestors
        @wcount: number of work items
        @return: a list of counts that spreads the work evenly among all requesters

        case 1: wcount == rcount: base = 0, extra = wcount, each requestor gets 1
        case 2: wcount < rcount:  base = 0, extra = wcount, the first "wcount" requesters get 1
        case 3: wcount > rcount:  base may be > 0; extras go to the first requesters
        """
        if self.split != "equal":
            raise NotImplementedError

        base = wcount // (rcount + 1)  # leave ourselves a base number of work items
        extra = wcount - base * (rcount + 1)
        assert extra <= rcount
        sizes = [base] * rcount
        for i in range(extra):
            sizes[i] += 1
        return sizes

    def send_no_work(self, rank):
        """ send a no-work reply to someone requesting work """
        buf = {G.KEY: G.ABORT} if self.abort else {G.KEY: G.ZERO}
        r = self.comm.isend(buf, dest=rank, tag=T.WORK_REPLY)
        r.wait()
        self.logger.debug("Send no work reply to %s" % rank, extra=self.d)

    def send_work_to_many(self):
        rcount = len(self.requestors)
        wcount = len(self.workq)
        sizes = self.spread_counts(rcount, wcount)
        self.logger.debug("requester count: %s, work count: %s, spread: %s" %
                          (rcount, wcount, sizes), extra=self.d)
        for idx, dest in enumerate(self.requestors):
            self.send_work(dest, sizes[idx])

    def send_work(self, rank, witems):
        """
        @rank   - the rank of the requester
        @witems - the number of work items to send
        """
        if witems <= 0:
            self.send_no_work(rank)
            return

        # for termination detection
        if (rank < self.rank) or (rank == self.token.src):
            self.token.proc = G.BLACK

        # construct buf from the head of the in-memory workq
        sliced = list(itertools.islice(self.workq, 0, witems))
        buf = {G.KEY: witems, G.VAL: sliced}

        self.comm.send(buf, dest=rank, tag=T.WORK_REPLY)
        self.logger.debug("%s work items sent to rank %s" % (witems, rank), extra=self.d)

        # remove the (witems) work items we just sent from the head of the queue
        for i in range(witems):
            self.workq.popleft()

    def request_work(self, cleanup=False):
        if self.workreq_outstanding:
            st = MPI.Status()
            reply = self.comm.Iprobe(source=self.work_requested_rank,
                                     tag=T.WORK_REPLY, status=st)
            if reply:
                self.work_receive(self.work_requested_rank)
                # flip the flag to indicate we are no longer waiting for a reply
                self.workreq_outstanding = False
            # else:
            #     self.logger.debug("has req outstanding, dest = %s, no reply" %
            #                       self.work_requested_rank, extra=self.d)
        elif not cleanup:
            # send a request
            dest = self.next_proc()
            if dest == self.rank or dest == MPI.PROC_NULL:
                # have no one to ask, we are done
                return
            buf = G.ABORT if self.abort else G.MSG
            # blocking send
            self.logger.debug("send work request to rank %s : %s" % (dest, G.str[buf]),
                              extra=self.d)
            self.comm.send(buf, dest, T.WORK_REQUEST)
            self.workreq_outstanding = True
            self.work_requested_rank = dest

    def work_receive(self, rank):
        """ called when an incoming work reply is detected """
        buf = self.comm.recv(source=rank, tag=T.WORK_REPLY)

        if buf[G.KEY] == G.ABORT:
            self.logger.debug("receive abort signal", extra=self.d)
            self.abort = True
            return
        elif buf[G.KEY] == G.ZERO:
            self.logger.debug("receive no work signal", extra=self.d)
            return
        else:
            assert type(buf[G.VAL]) == list
            self.workq.extend(buf[G.VAL])

    def reduce(self, buf):
        # copy data from the user buffer
        self.reduce_buf = copy(buf)

    def reduce_check(self, cleanup=False):
        """ initiate and progress a reduce operation at the specified interval,
        ensure progress of the reduction in the background, and stop the
        reduction if the cleanup flag is True """

        if self.reduce_outstanding:
            # we have an outstanding reduce: check for messages from children
            for child in self.child_ranks:
                if self.comm.Iprobe(source=child, tag=T.REDUCE):
                    # receive message from child:
                    # the 'status' element is G.MSG_VALID or not; the rest is opaque
                    inbuf = self.comm.recv(source=child, tag=T.REDUCE)
                    self.reduce_replies += 1
                    self.logger.debug("client data from %s: %s" % (child, inbuf),
                                      extra=self.d)
                    if inbuf['status'] == G.MSG_INVALID:
                        self.reduce_status = False
                    else:
                        self.reduce_status = True
                        # invoke the user's callback to reduce user data
                        if hasattr(self.task, "reduce"):
                            self.task.reduce(self.reduce_buf, inbuf)

            # check if we have gotten replies from all children
            if self.reduce_replies == self.children:
                # all children replied: add our own contents to the reduce buffer
                # and send the message to our parent if we have one
                if self.parent_rank != MPI.PROC_NULL:
                    self.comm.send(self.reduce_buf, self.parent_rank, T.REDUCE)
                else:
                    # we are the root: print results if we have valid data
                    if self.reduce_status and hasattr(self.task, "reduce_report"):
                        self.task.reduce_report(self.reduce_buf)
                    # invoke the callback on root to deliver final results
                    if hasattr(self.task, "reduce_finish"):
                        self.task.reduce_finish(self.reduce_buf)
                # clear the flag to indicate we got what we want
                self.reduce_outstanding = False
        else:
            # we don't have an outstanding reduction; determine whether a new
            # reduce should be started -- only bother checking if we think it
            # is about time, or if we are in cleanup mode
            start_reduce = False

            time_now = MPI.Wtime()
            time_next = self.reduce_time_last + self.reduce_time_interval
            if time_now >= time_next or cleanup:
                if self.parent_rank == MPI.PROC_NULL:
                    # we are root, kick it off
                    start_reduce = True
                elif self.comm.Iprobe(source=self.parent_rank, tag=T.REDUCE):
                    # we are not root and the parent sent us a message:
                    # receive it and set the flag to start the reduce
                    self.comm.recv(source=self.parent_rank, tag=T.REDUCE)
                    start_reduce = True

            # it is critical that we don't start a reduce if we are in the
            # cleanup phase, because we may have already started the
            # non-blocking barrier; just send an invalid message back to the parent
            if start_reduce and cleanup:
                # avoid starting a reduce
                start_reduce = False
                # if we have a parent, send an invalid msg
                if self.parent_rank != MPI.PROC_NULL:
                    self.reduce_status = G.MSG_INVALID
                    self.comm.send(self.reduce_buf, self.parent_rank, T.REDUCE)

            if start_reduce:
                # set the flag to indicate we have a reduce outstanding
                # and initiate state for a fresh reduction
                self.reduce_time_last = time_now
                self.reduce_outstanding = True
                self.reduce_replies = 0
                self.reduce_status = G.MSG_VALID
                self.reduce_buf['status'] = G.MSG_VALID

                # invoke the callback to get input data
                if hasattr(self.task, "reduce_init"):
                    self.task.reduce_init(self.reduce_buf)

                # send a message to each child
                for child in self.child_ranks:
                    self.comm.send(None, child, T.REDUCE)

    def do_periodic_report(self, prefix="Circle report"):
        delta = self.work_processed - self.report_processed
        rate = int(delta / self.report_interval)
        self.report_processed = self.work_processed
        s = "\n%s on [rank: %s %s/%s] at %s\n" % \
            (prefix, self.rank, self.host, self.pid, time.strftime("%Y-%m-%d %H:%M:%S"))
        s += "\t{:<20}{:<10,}{:5}{:<20}{:<12,}\n".format(
            "work queue size:", len(self.workq), "|", "work processed:", self.work_processed)
        s += "\t{:<20}{:<10,}{:5}{:<20}{:<10}\n".format(
            "work delta:", delta, "|", "rate:", "%s /s" % rate)
        print(s)

    @staticmethod
    def exit(code):
        MPI.Finalize()
        sys.exit(code)
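# Usage sketch (an illustration, not part of the original module): Circle.begin()
# drives a task object that provides create() and process() callbacks, as seen
# in loop() above; reduce_init()/reduce()/reduce_report()/reduce_finish() are
# optional and only consulted when reduce_enabled is set.  NoopTask below is a
# hypothetical minimal task written for this sketch; whether it terminates
# cleanly also depends on the Token implementation and the G settings supplied
# by the surrounding package.
if __name__ == "__main__":
    class NoopTask:
        def __init__(self, circle):
            self.circle = circle

        def create(self):
            # seed the queue on rank 0 only; other ranks will steal work
            if self.circle.rank == 0:
                for i in range(100):
                    self.circle.enq(("noop", i))

        def process(self):
            work = self.circle.deq()
            # a real task would act on `work` here and may enq() follow-on items

    circle = Circle(name="demo")
    circle.begin(NoopTask(circle))
    circle.finalize()
    Circle.exit(0)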