# Imports needed by the code in this section. The stdlib imports are certain;
# the mpi.* module paths are assumptions about the pupympi package layout.
import os
import sys
import time
import threading
from threading import Thread

from mpi import constants
from mpi.logger import Logger
from mpi.exceptions import MPIException
from mpi.network import Network
from mpi.group import Group
from mpi.communicator import Communicator


def wait_for_shutdown(process_list):
    """
    Go through the list of processes and make sure they have all terminated.
    """
    logger = Logger()
    exit_codes = []
    while process_list:
        remove = []
        for p in process_list:
            returncode = p.poll()
            #logger.debug("Got return code: %s" % returncode)

            if returncode is None:
                # Still alive
                pass
            elif returncode == 0:
                # Exited correctly
                exit_codes += [returncode]
                remove.append(p)
                logger.debug("A process exited with a status of 0. And we have %i left." % (len(process_list) - len(remove)))
            else:
                # Error code
                exit_codes += [returncode]
                remove.append(p)
                logger.debug("A process exited with return code %d. And we have %i left." % (returncode, len(process_list) - len(remove)))

        # We remove outside the iteration over the list just to be safe
        for p in remove:
            process_list.remove(p)

        time.sleep(1)

    # io_target_list is empty unless the option process_io=localfile is
    # specified, in which case we close the file descriptors of all the log
    # files made
    for t in io_target_list:
        t.close()

    return exit_codes
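
# Illustrative sketch (not part of the original source): how a launcher might
# build the process list that wait_for_shutdown() polls. Only subprocess.Popen
# and the polling contract above are assumed; the command lines are
# placeholders, and the module-level io_target_list read at the end of
# wait_for_shutdown() must have been populated elsewhere.
def _example_launch(commands):
    import subprocess
    # One Popen handle per child; wait_for_shutdown() calls poll() on these.
    process_list = [subprocess.Popen(cmd) for cmd in commands]
    # Blocks until every child has exited, then returns their exit codes.
    return wait_for_shutdown(process_list)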
def __init__(self):
    """
    Initializes the MPI environment. This will give each process a separate
    rank in the MPI_COMM_WORLD communicator along with the total number of
    processes in the communicator. Both attributes can be read just after
    startup::

        from mpi import MPI

        mpi = MPI()
        rank = mpi.MPI_COMM_WORLD.rank()
        size = mpi.MPI_COMM_WORLD.size()

        print "Proc %d of %d started" % (rank, size)

        mpi.finalize()
    """
    Thread.__init__(self)

    self.name = "MPI" # Thread name

    # Startup time. Used in the Wtime() implementation.
    self.startup_timestamp = time.time()

    # Event for handling thread packing.
    self.packing = threading.Event()

    # Data structures for jobs.
    # The locks are for guarding the data structures.
    # The events are for signalling changes in the data structures.

    # Pending requests are receive requests where the data may or may not have arrived.
    self.pending_requests = []
    self.pending_requests_lock = threading.Lock()
    self.pending_requests_has_work = threading.Event()

    # Raw data are messages that have arrived but have not been unpickled yet.
    self.raw_data_queue = []
    self.raw_data_lock = threading.Lock()
    self.raw_data_has_work = threading.Event()

    # Received data are messages that have arrived and are unpickled
    # (ie. ready for matching with a posted recv request).
    # There are no events as this is handled through the "pending_request_" event.
    self.received_data = []
    self.received_data_lock = threading.Lock()

    # General event to wake up the main MPI thread.
    self.has_work_event = threading.Event()

    # Shutdown signals.
    self.shutdown_event = threading.Event() # MPI finalize has been called, shutdown in progress

    # Lock and counter for enumerating request ids.
    self.current_request_id_lock = threading.Lock()
    self.current_request_id = 0

    # Pending system commands. These will be executed at the first chance we
    # have (we need access to the user code). We also have a lock around the
    # list, to ensure proper access.
    self.pending_systems_commands = []
    self.pending_systems_commands_lock = threading.Lock()

    # Unstarted collective requests.
    self.unstarted_collective_requests = []
    self.unstarted_collective_requests_lock = threading.Lock()
    self.unstarted_collective_requests_has_work = threading.Event()

    # When the collective requests are started they are moved to this queue
    # until they are finished.
    self.pending_collective_requests = []
    self.received_collective_data_lock = threading.Lock()
    self.received_collective_data = []
    self.pending_collective_requests_has_work = threading.Event()

    # The settings module. This will be handled properly by the
    # function ``generate_settings``.
    self.settings = None
    self.config_callbacks = []

    # Append callbacks
    from mpi.settings import standard_callbacks
    self.config_callbacks.extend(standard_callbacks)

    options = self.parse_options()

    # TODO: See if the logger initialisations below shouldn't be refactored into one.

    # Decide how to deal with I/O
    if options.process_io == "remotefile":
        # Initialise the logger
        logger = Logger(os.path.join(options.logdir, "remotelog"), "proc-%d" % options.rank, options.debug, options.verbosity, True)
        filename = constants.DEFAULT_LOGDIR + 'mpi.local.rank%s.log' % options.rank
        logger.debug("Opening file for I/O: %s" % filename)
        try:
            output = open(filename, "w")
        except:
            raise MPIException("File for I/O not writeable - check that this path exists and is writeable:\n%s" % constants.DEFAULT_LOGDIR)

        sys.stdout = output
        sys.stderr = output
    elif options.process_io == "none":
        # Initialise the logger
        logger = Logger(options.logdir + "mpi", "proc-%d" % options.rank, options.debug, options.verbosity, True)
        logger.debug("Closing stdout")
        sys.stdout = None
    else:
        # Initialise the logger
        logger = Logger(options.logdir + "mpi", "proc-%d" % options.rank, options.debug, options.verbosity, options.quiet)

    # TODO: Put this info under settings when they start to work properly.
    # Also we should check that the path here is accessible and valid.

    # If the path starts with something other than /, it is a relative path
    # and we assume it is relative to the pupympi dir.
    if not options.logdir.startswith('/'):
        _BASE = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.logdir = os.path.join(_BASE, options.logdir)
    else:
        self.logdir = options.logdir

    # Parse and save settings.
    self.generate_settings(options.settings)

    # Attributes for the security component.
    self.disable_utilities = options.disable_utilities
    self.security_component = None

    # First check for the required Python version
    self._version_check()

    # Check for yappi support
    self._yappi_enabled = False
    if options.yappi:
        try:
            import yappi
            self._yappi_enabled = True
            self._yappi_sorttype = yappi.SORTTYPE_NCALL

            if options.yappi_sorttype:
                if options.yappi_sorttype == 'name':
                    self._yappi_sorttype = yappi.SORTTYPE_NAME
                elif options.yappi_sorttype == 'ncall':
                    self._yappi_sorttype = yappi.SORTTYPE_NCALL
                elif options.yappi_sorttype == 'ttotal':
                    self._yappi_sorttype = yappi.SORTTYPE_TTOTAL
                elif options.yappi_sorttype == 'tsub':
                    self._yappi_sorttype = yappi.SORTTYPE_TSUB
                elif options.yappi_sorttype == 'tavg':
                    self._yappi_sorttype = yappi.SORTTYPE_TAVG
                else:
                    logger.warn("Unknown yappi sorttype '%s' - defaulting to ncall." % options.yappi_sorttype)
        except ImportError:
            logger.warn("Yappi is not supported on this system. Statistics will not be logged.")
            self._yappi_enabled = False

    # Start the built-in profiling facility
    self._profiler_enabled = False
    if options.enable_profiling:
        if self._yappi_enabled:
            logger.warn("Running yappi and pupyprof simultaneously is impossible. Pupyprof has been disabled.")
        else:
            try:
                import pupyprof
                self._profiler_enabled = True
            except ImportError:
                logger.warn("Pupyprof is not supported on this system. Tracefile will not be generated.")
                self._profiler_enabled = False

    # Set a resume parameter indicating whether we are resuming a packed job.
    # This will (maybe) be changed in the network startup.
    self.resume = False

    # Enable a register for the users to put values in. This register can be
    # read with the readregister.py script found in bin/utils/.
    self.user_register = {}

    # Place to keep functions needed when packing / unpacking the running MPI
    # instance. The best place to start is migrate.py.
    self.migrate_onpack = None

    self.network = Network(self, options)

    # Create the initial global Group, and assign the network all_procs as members
    world_Group = Group(options.rank)
    world_Group.members = self.network.all_procs

    # Create the initial communicator MPI_COMM_WORLD. It is initialized with
    # the rank of the process that holds it and the size.
    # The members are filled out after the network is initialized.
    self.communicators = {}
    self.MPI_COMM_WORLD = Communicator(self, options.rank, options.size, self.network, world_Group, comm_root=None)

    # Tell the network about the global MPI_COMM_WORLD, and let it start to
    # listen on the corresponding network channels
    self.network.MPI_COMM_WORLD = self.MPI_COMM_WORLD

    # Change the contents of sys.argv at runtime, so the user processes
    # can't see all the MPI specific parameters we start with.
    user_options = [sys.argv[0]]
    user_options.extend(sys.argv[sys.argv.index("--")+1:])
    sys.argv = user_options

    # Set up the global MPI constants
    constants.MPI_GROUP_EMPTY = Group()

    self.daemon = True

    resumer = None
    if self.resume:
        resumer = self.resume_packed_state()

    self.start()

    # Make every node connect to each other if the settings specify it
    if not options.disable_full_network_startup:
        self.network.start_full_network()

    self.initinfo = (self.MPI_COMM_WORLD, self.MPI_COMM_WORLD.rank(), self.MPI_COMM_WORLD.size())

    # Set a static attribute on the class so we know it is initialised.
    self.__class__._initialized = True

    if self._profiler_enabled:
        pupyprof.start()

    if self.resume and resumer:
        resumer(self)
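
# Illustrative sketch (not part of the original source): a minimal user
# program driving the MPI class initialized above. The rank(), size() and
# finalize() calls follow the docstring in __init__; the send/recv calls and
# their argument order are assumptions about the communicator API, not
# confirmed by this section.
def _example_ping():
    from mpi import MPI

    mpi = MPI()
    world = mpi.MPI_COMM_WORLD
    if world.rank() == 0:
        # Assumed signature: send(content, destination, tag)
        world.send("ping", 1, 42)
    elif world.rank() == 1:
        # Assumed signature: recv(source, tag)
        print world.recv(0, 42)
    mpi.finalize()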
def terminate_children():
    logger = Logger()
    for p in process_list:
        logger.debug("Killing %s" % p)
        p.terminate()
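
# Illustrative sketch (not part of the original source): how a launcher might
# combine wait_for_shutdown() and terminate_children(). process_list is the
# module-level list both functions read; the Ctrl-C handling shown here is an
# assumption about the caller, not taken from this section.
def _example_supervise():
    try:
        # Normal path: block until every child has exited.
        return wait_for_shutdown(process_list)
    except KeyboardInterrupt:
        # On interrupt, forcibly terminate whatever is still running.
        terminate_children()
        raise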