Example #1
0
 def __init__(self, cs):
     """
     Initialize through ``Operator.__init__``, taking the workers down on failure.

     Parameters
     ----------
     cs
         Forwarded unchanged to ``Operator.__init__``.

     Raises
     ------
     BaseException
         Anything raised by ``Operator.__init__`` is re-raised after the
         failure has been logged and "quit" has been broadcast from root 0
         (the broadcast only happens when ``armi.MPI_COMM`` is truthy).
     """
     try:
         Operator.__init__(self, cs)
     except BaseException:
         # Same reach as a bare ``except``: whatever went wrong, make sure the
         # workers are told to stop so the whole run dies together.
         runLog.important("Master node failed on init. Quitting.")
         comm = armi.MPI_COMM
         if comm:  # a falsy communicator means a single cpu case.
             comm.bcast("quit", root=0)
         raise
Example #2
0
    def operate(self):
        """
        Operate method for all nodes.

        On MPI rank 0 this runs ``Operator.operate`` and then, whether or not it
        succeeded, broadcasts "quit" and "finished" to the other ranks and closes
        the log. On every other rank it runs ``self.workerOperate``.

        Raises
        ------
        Exception
            On rank 0, any exception from ``Operator.operate`` is logged and
            re-raised once the shutdown steps in ``finally`` have run.
        BaseException
            On other ranks, anything raised by ``self.workerOperate`` is logged,
            its traceback is printed, the log is closed, and it is re-raised.
        """
        runLog.debug("OperatorMPI.operate")
        if armi.MPI_RANK == 0:
            # this is the master
            try:
                # run the regular old operate function
                Operator.operate(self)
                runLog.important(time.ctime())
            except Exception as ee:
                runLog.error(
                    "Error in Master Node. Check STDERR for a traceback.\n{}".
                    format(ee))
                raise
            finally:
                # The size check alone does not cover a single cpu case where
                # armi.MPI_COMM is falsy: calling bcast on it would raise right
                # here, replacing the real error and skipping the log close
                # below. Only talk to the workers when a communicator exists.
                if armi.MPI_COMM and armi.MPI_SIZE > 0:
                    runLog.important(
                        "Stopping all MPI worker nodes and cleaning temps.")
                    armi.MPI_COMM.bcast(
                        "quit",
                        root=0)  # send the quit command to the workers.
                    runLog.debug("Waiting for all nodes to close down")
                    armi.MPI_COMM.bcast(
                        "finished",
                        root=0)  # wait until they're done cleaning up.
                    runLog.important("All worker nodes stopped.")
                time.sleep(
                    1
                )  # even though we waited, still need more time to close stdout.
                runLog.debug("Main operate finished")
                runLog.LOG.close()  # concatenate all logs.
        else:
            try:
                self.workerOperate()
            except BaseException:
                # Catch everything (same reach as a bare ``except``) so the
                # failure is always reported before it propagates.
                runLog.warning(
                    "An error has occurred in one of the worker nodes. See STDERR for traceback."
                )
                # bcasting quit won't work if the main is sitting around waiting for a
                # different bcast or gather.
                traceback.print_exc()
                runLog.debug("Worker failed")
                runLog.LOG.close()
                raise
Example #3
0
 def setUp(self):
     """
     Build the fixtures used by the tests.

     Creates a fresh ``Settings`` object, loads ``self.detailedCyclesSettings``
     into it with ``loadFromString``, and constructs an ``Operator`` from it.
     The results are stored as ``self.standaloneDetailedCS`` and
     ``self.detailedOperator``.
     """
     # Publish the settings on the instance before loading, then keep a short
     # local alias for the remaining steps.
     settings = self.standaloneDetailedCS = Settings()
     settings.loadFromString(self.detailedCyclesSettings)
     self.detailedOperator = Operator(settings)