def __init__(self, cs):
    try:
        Operator.__init__(self, cs)
    except:
        # kill the workers too so everything dies.
        runLog.important("Master node failed on init. Quitting.")
        if armi.MPI_COMM:  # else it's a single cpu case.
            armi.MPI_COMM.bcast("quit", root=0)
        raise
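# The bcast above is matched collectively on the worker side, where workers sit
# in a loop waiting for broadcast commands. A minimal sketch of that receiving
# end, assuming armi.MPI_COMM wraps an mpi4py communicator; `workerLoop` and
# `doWork` are hypothetical names for illustration only.
def workerLoop(comm):
    while True:
        cmd = comm.bcast(None, root=0)  # blocks until the master broadcasts
        if cmd == "quit":
            break  # master failed on init (or finished); shut down cleanly
        doWork(cmd)  # hypothetical dispatch for any other command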
def operate(self):
    """
    Operate method for all nodes.

    Calls the regular Operator.operate on the master node and workerOperate
    on the worker nodes, depending on which MPI rank we are, and handles errors.
    """
    runLog.debug("OperatorMPI.operate")
    if armi.MPI_RANK == 0:
        # this is the master
        try:
            # run the regular old operate function
            Operator.operate(self)
            runLog.important(time.ctime())
        except Exception as ee:
            runLog.error(
                "Error in Master Node. Check STDERR for a traceback.\n{}".format(ee)
            )
            raise
        finally:
            if armi.MPI_SIZE > 0:
                runLog.important("Stopping all MPI worker nodes and cleaning temps.")
                armi.MPI_COMM.bcast("quit", root=0)  # send the quit command to the workers.
                runLog.debug("Waiting for all nodes to close down")
                armi.MPI_COMM.bcast("finished", root=0)  # wait until they're done cleaning up.
                runLog.important("All worker nodes stopped.")
                time.sleep(1)  # even though we waited, still need more time to close stdout.
            runLog.debug("Main operate finished")
            runLog.LOG.close()  # concatenate all logs.
    else:
        try:
            self.workerOperate()
        except:
            # grab the final command
            runLog.warning(
                "An error has occurred in one of the worker nodes. See STDERR for traceback."
            )
            # bcasting quit won't work if the main is sitting around waiting for a
            # different bcast or gather.
            traceback.print_exc()
            runLog.debug("Worker failed")
            runLog.LOG.close()
            raise
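# A minimal, self-contained sketch of the rank-0/worker dispatch pattern that
# operate() implements, assuming mpi4py; run with e.g. `mpiexec -n 4 python
# sketch.py`. It illustrates the quit/finished handshake above (the second
# bcast acts as a synchronization point so the master knows workers are done),
# not ARMI's actual API.
from mpi4py import MPI

comm = MPI.COMM_WORLD

if comm.rank == 0:
    try:
        print("master: the real work would run here")
    finally:
        comm.bcast("quit", root=0)  # tell workers to stop
        comm.bcast("finished", root=0)  # wait for them to finish cleanup
else:
    while True:
        cmd = comm.bcast(None, root=0)  # receive the master's command
        if cmd == "quit":
            # worker cleanup would go here; then join the "finished" bcast
            # so the master knows this worker has shut down
            comm.bcast(None, root=0)
            break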
def setUp(self):
    self.standaloneDetailedCS = Settings()
    self.standaloneDetailedCS.loadFromString(self.detailedCyclesSettings)
    self.detailedOperator = Operator(self.standaloneDetailedCS)
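# A minimal sketch of the test-class scaffolding this setUp might sit in,
# assuming a unittest.TestCase where detailedCyclesSettings is a class-level
# YAML settings string (elided here) and Settings/Operator come from the armi
# package. The test body and class name are hypothetical, for illustration.
import unittest

class TestDetailedOperator(unittest.TestCase):
    detailedCyclesSettings = "..."  # YAML settings text (elided in this sketch)

    def setUp(self):
        self.standaloneDetailedCS = Settings()
        self.standaloneDetailedCS.loadFromString(self.detailedCyclesSettings)
        self.detailedOperator = Operator(self.standaloneDetailedCS)

    def test_operatorBuilt(self):
        # hypothetical check: the operator holds the settings it was built from
        self.assertIs(self.detailedOperator.cs, self.standaloneDetailedCS)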