class SerialBackend(Backend):
    """ A backend that allows for easy debugging since the program flow is
    neither threaded nor distributed over several OS processes.
    """

    def __init__(self):
        super(SerialBackend, self).__init__()
        self.state = "idling"
        self.current_process = 0

    def stage_in(self, operation):
        """ Stage the current operation """
        super(SerialBackend, self).stage_in(operation)
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ',
                   ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets,
            maxval=self.current_operation.number_processes)
        self.progress_bar.start()
        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """ Execute all processes specified in the currently staged operation """
        assert self.state == "staged"
        self.state = "executing"
        self._log("Operation - executing")
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        # Fetch processes from the queue until the sentinel value False is
        # returned (`partial` comes from the functools module)
        get_process = partial(self.current_operation.processes.get,
                              timeout=timeout)
        for process in iter(get_process, False):
            process.prepare(pySPACE.configuration, handler_class,
                            handler_args)
            # Execute the process and update the progress bar
            try:
                process()
            except KeyboardInterrupt:
                # if ctrl+c is pressed we want to stop everything immediately
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                break
            except Exception:
                # if an exception is raised somewhere in the process we still
                # want to try the remaining processes
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
            else:
                self.current_process += 1
                self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return the fraction of processes of the current operation
        that have already finished

        .. todo:: do we really need this method???
        """
        return float(self.current_process) / \
            self.current_operation.number_processes

    def retrieve(self, timeout=1e6):
        """ Return the result of the operation

        This is trivial in this backend since :func:`execute` blocks.
        """
        assert self.state == "executing"
        self._log("Operation - retrieved")
        self.current_operation.processes.close()
        # if process creation has another thread, wait for it to finish
        if hasattr(self.current_operation, "create_process") \
                and self.current_operation.create_process is not None:
            self.current_operation.create_process.join(timeout=timeout)
        # Change the state to retrieved
        self.state = "retrieved"

    def consolidate(self):
        """ Consolidate the results of the single processes into a
        consistent result of the whole operation
        """
        assert self.state == "retrieved"
        try:
            self.current_operation.consolidate()
        except Exception:
            self._log(traceback.format_exc(), level=logging.CRITICAL)
        self._log("Operation - consolidated")
        self.state = "consolidated"

    def cleanup(self):
        """ Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"
        self._log("Operation - cleaned up")
        self._log("Idling...")
        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        self.current_operation = None
        self.current_process = 0

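# All backends in this module share the same life cycle
# (idling -> staged -> executing -> retrieved -> consolidated -> idling).
# A minimal, hypothetical driver sketch -- how the `operation` object is
# created (e.g. by pySPACE's operation factories) is assumed here and not
# part of this module:
#
#     backend = SerialBackend()
#     backend.stage_in(operation)   # state: "staged"
#     backend.execute()             # state: "executing", runs all processes
#     backend.retrieve()            # state: "retrieved"
#     backend.consolidate()         # state: "consolidated"
#     backend.cleanup()             # state: "idling"
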
class MpiBackend(Backend):
    """ A Message Passing Interface (MPI) backend to pySPACE

    In order to use this backend, you need a working MPI distribution and
    mpi4py. You can download mpi4py from http://code.google.com/p/mpi4py/.
    mpi4py is compatible with Python 2.3 to 2.7 and 3.0 to 3.1.

    This backend assumes a global file system that is seen by all nodes
    running the processes.

    **Parameters**

        :pool_size:
            Defines how many MPI processes are started in parallel. This
            should not exceed the number of available processors (or the
            number of MPI slots defined in the hostsfile).

            (*recommended, default: 156*)
    """

    def __init__(self, pool_size=156):
        super(MpiBackend, self).__init__()
        #self.COMMAND_MPI = '/usr/lib64/openmpi/bin/mpirun'
        self.COMMAND_MPI = 'mpirun'
        self.COMMAND_PYTHON = sys.executable
        self.runner_script = os.sep.join([pySPACE.configuration.root_dir,
                                          "environments", "backends",
                                          "mpi_runner.py"])
        # start as many processes as there are slots available
        self.NumberOfProcessesToRunAtBeginning = pool_size
        self.NumberOfProcessesToRunLater = pool_size

    def __del__(self):
        pass

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MpiBackend, self).stage_in(operation)
        # init of process lists, because backend is only initialized once
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ',
                   ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets,
            maxval=self.current_operation.number_processes)
        self.progress_bar.start()
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        # Set up stage-in directory
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        # Check that the hosts file exists in the right directory
        hostfile_path = pySPACE.configuration.root_dir + "/" + 'hostsfile'
        if not os.path.isfile(hostfile_path):
            print "*" * 99
            print "hostsfile not found!"
            print "Please create the hosts file with the filename 'hostsfile' under", \
                pySPACE.configuration.root_dir
            print "*" * 99
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # Prepare all processes from the queue for remote execution:
        # each process is pickled into the stage-in directory and its
        # file name is recorded as an argument for the MPI runner script
        i = 0
        while process != False:
            process.prepare(pySPACE.configuration, handler_class,
                            handler_args)
            proc_file_name = os.sep.join([stagein_dir,
                                          "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "w")
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i += 1

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """ Execute all processes specified in the currently staged operation

        In this backend the MPI processes are actually launched and
        monitored in :func:`retrieve`, so this method only checks the state.
        """
        assert self.state == "staged"

    def check_status(self):
        """ Return a description of the current state of the operation's execution """
        return 1.0

    def not_xor(self, a, b):
        """ Return True if either both or neither of the arguments are true """
        return not ((a or b) and not (a and b))

    def retrieve(self, timeout=1e6):
        """ Return the result of the operation

        In this backend the MPI processes are launched in batches and
        monitored here until all of them have finished.
        """
        self.state = "executing"
        self._log("Operation - executing")
        if self.NumberOfProcessesToRunAtBeginning > len(self.process_args_list):
            # All processes fit into the first batch
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(self.process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    self.process_args_list)
            # Start the processes.
            self._log("mpi-parameters: %s" % args, level=logging.DEBUG)
            self._log("mpi-parameters-joined: %s" % " ".join(args),
                      level=logging.DEBUG)
            p = subprocess.Popen(args)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
        else:
            # copy the arguments of the first batch of processes to run
            sub_process_args_list = (
                self.process_args_list[self.IndexCopyStart:
                                       self.NumberOfProcessesToRunAtBeginning])
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(sub_process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    sub_process_args_list)
            # Start the processes.
            p = subprocess.Popen(args)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
        # Create a list of booleans recording which processes have finished.
        # Initially no process has finished, so every element is False.
        FinishedProcesses = [False for i in range(len(self.process_args_list))]
        # Wait until all processes finish and start new processes
        # when old ones finish
        print "Waiting for the processes to finish...."
        # Counter for the processes which have finished; it is reset
        # after 'NumberOfProcessesToRunLater' processes are finished
        CounterProcessesFinished = 0
        processes_Finished = False
        while not processes_Finished:
            try:
                processes_Finished = True
                for LoopCounter, process_args in enumerate(
                        self.process_args_list):
                    if self.not_xor(os.path.isfile(process_args + "_Finished"),
                                    os.path.isfile(process_args + "_Crashed")):
                        # The process has not yet produced exactly one marker
                        # file, so it is still considered running
                        processes_Finished = False
                    else:
                        if FinishedProcesses[LoopCounter] == False:
                            # Record that the process is finished
                            FinishedProcesses[LoopCounter] = True
                            # If the process crashed, take note of that
                            if os.path.isfile(process_args + "_Crashed"):
                                self.CrashedProcesses.append(process_args)
                            # Increment the counters for finished processes
                            CounterProcessesFinished += 1
                            self.TotalProcessesFinished += 1
                            # update the progress bar
                            self.progress_bar.update(
                                self.TotalProcessesFinished)
                            if (CounterProcessesFinished ==
                                    self.NumberOfProcessesToRunLater):
                                # Determine the next batch of processes to run
                                sub_process_args_list = []
                                if self.IndexCopyStart == len(self.process_args_list):
                                    break
                                elif (self.IndexCopyStart +
                                      self.NumberOfProcessesToRunLater <
                                      len(self.process_args_list)):
                                    sub_process_args_list = self.process_args_list[
                                        self.IndexCopyStart:
                                        self.IndexCopyStart +
                                        self.NumberOfProcessesToRunLater]
                                else:
                                    sub_process_args_list = self.process_args_list[
                                        self.IndexCopyStart:]
                                args = ([self.COMMAND_MPI] +
                                        ['--loadbalance'] + ['--nolocal'] +
                                        ['--hostfile'] +
                                        [pySPACE.configuration.root_dir +
                                         "/" + 'hostsfile'] +
                                        ['-n', str(len(sub_process_args_list))] +
                                        [self.COMMAND_PYTHON] +
                                        [self.runner_script] +
                                        sub_process_args_list)
                                # Start the next batch of processes
                                if len(sub_process_args_list) > 0:
                                    p = subprocess.Popen(args)
                                # Adjust the start index
                                self.IndexCopyStart += \
                                    self.NumberOfProcessesToRunLater
                                # Reset the counter for finished processes
                                CounterProcessesFinished = 0
                # sleep for one second
                time.sleep(1)
            except (KeyboardInterrupt, SystemExit):
                # if processes hang forever
                self.ProcessingSuccessful = False
                print "*" * 99
                print "pySPACE forced to stop ..."
                print "Please wait until the mpi_backend has finished " \
                      "consolidating the generated results and cleaning up ..."
                print "*" * 99
                from pySPACE.resources.dataset_defs.performance_result \
                    import PerformanceResultSummary
                # merge the remaining result files
                print "*" * 99
                print "Starting merging . . ."
                PerformanceResultSummary.merge_performance_results(
                    self.current_operation.result_directory)
                print "Merging complete . . ."
print "***************************************************************************************************" break #The while loop will break self._log("Operation - processing finished") # Change the state to retrieved self.state = "retrieved" return None def consolidate(self): """ Consolidates the results of the single processes into a consistent result of the whole operation """ assert (self.state == "retrieved") if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) == 0)): self.current_operation.consolidate() if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) != 0)): import pySPACE.resources.dataset_defs.performance_result.PerformanceResultSummary as PerformanceResultSummary # merge the remaining files print "***************************************************************************************************" print "Starting merging . . ." PerformanceResultSummary.merge_performance_results( self.current_operation.result_directory) print "Merging complete . . ." print "***************************************************************************************************" self._log("Operation - consolidated") self.state = "consolidated" def cleanup(self): """ Remove the current operation and all potential results that have been stored in this object """ self.state = "idling" # Cleaning up... stagein_dir = os.sep.join( [self.current_operation.result_directory, ".stagein"]) if ((self.ProcessingSuccessful == True) and (len(self.CrashedProcesses) == 0)): deleted = False while not deleted: try: os.chdir("..") shutil.rmtree(stagein_dir) deleted = True except OSError, e: if e.errno == 66: self._log( "Could not remove .stagein dir " ", waiting for NFS lock", level=logging.WARNING) time.sleep(5) self._log("Operation - cleaned up") self._log("Idling...") # Remove the file logger for this operation logging.getLogger('').removeHandler(self.file_handler) # close listener socket self.sock.close() self.current_operation = None
class MulticoreBackend(Backend):
    """ Execute as many processes in parallel as there are (logical) CPUs
    on the local machine

    This backend is based on the multiprocessing package and should work
    on every multicore system without additional settings, even on virtual
    machines. Each process corresponds to one combination of input data set
    and parameter choice.

    :Author: Anett Seeland ([email protected])
    :LastChange: 2012/09/24
    """

    def __init__(self, pool_size=None):
        super(MulticoreBackend, self).__init__()
        # Set the number of processes in the pool
        # per default to the number of CPUs
        if pool_size is None:
            pool_size = MulticoreBackend.detect_CPUs()
        self.pool_size = pool_size
        self.state = "idling"
        # list of result handles of the submitted processes
        self.result_handlers = []
        self.pool = None
        self.current_process = 0
        self._log("Created MulticoreBackend with pool size %s" % pool_size)

    def reset_queue(self):
        """ Reset the execution queue """
        self.result_handlers = []

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)
        # Set up progress bar
        widgets = ["Operation progress: ", Percentage(), " ", Bar(), " ",
                   ETA()]
        self.progress_bar = ProgressBar(
            widgets=widgets,
            maxval=self.current_operation.number_processes)
        self.progress_bar.start()
        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=None):
        """ Execute all processes specified in the currently staged operation

        This call blocks until all processes have been submitted to the
        worker pool; waiting for their results happens in :func:`retrieve`.
        """
        assert self.state == "staged"
        self._log("Operation - executing")
        self.state = "executing"
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        backend_com = (self.SERVER_IP, self.SERVER_PORT)
        # A socket communication thread to handle e.g. subflows
        self.listener = LocalComHandler(self.sock)
        self.listener.start()
        try:
            process = self.current_operation.processes.get(timeout=timeout)
        except KeyboardInterrupt:
            process = False
        # Prepare all processes from the queue and submit them to the
        # worker pool for asynchronous execution
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class,
                            handler_args, backend_com)
            # Submit the process to the pool but return immediately;
            # keep the result handle so that retrieve() can wait for it
            self.result_handlers.append(
                self.pool.apply_async(process, callback=self.dequeue_process))
            process = self.current_operation.processes.get(timeout=timeout)
            time.sleep(0.1)

    def dequeue_process(self, result):
        """ Callback function for finished processes """
        self.current_process += 1
        self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return the fraction of processes of the current operation
        that have already finished

        .. todo:: do we really need this method???
        """
        return float(self.current_process) / \
            self.current_operation.number_processes

    def retrieve(self, timeout=0):
        """ Wait for all results of the operation

        This call blocks until all processes are finished.
""" assert self.state == "executing" # Prevent any other processes from being submitted to the pool # (necessary for join) self.pool.close() self._log("Closing pool", level=logging.DEBUG) self._log("Operation - retrieved") self.current_operation.processes.close() # if process creation has another thread if hasattr(self.current_operation, "create_process") and self.current_operation.create_process != None: self.current_operation.create_process.join() # Close the result handler and wait for every process # to terminate try: for result in self.result_handlers: result.wait(timeout=timeout) except multiprocessing.TimeoutError: # A timeout occurred, terminate the pool self._log("Timeout occurred, terminating worker processes") self.pool.terminate() return False finally: self.pool.join() # Wait for worker processes to exit # inform listener that its time to die self.listener.operation_finished = True time.sleep(1) self.listener.join() # Change the state to finished self.state = "retrieved" self._log("Worker processes have exited gracefully") return True def consolidate(self): """ Consolidate the single processes' results into a consistent result of the whole operation """ assert self.state == "retrieved" try: self.current_operation.consolidate() except Exception: import traceback self._log(traceback.format_exc(), level=logging.ERROR) self._log("Operation - consolidated") self.state = "consolidated" def cleanup(self): """ Remove the current operation and all potential results that have been stored in this object """ self.state = "idling" self._log("Operation - cleaned up") self._log("Idling...") # Remove the file logger for this operation logging.getLogger("").removeHandler(self.file_handler) # close listener socket self.sock.close() self.current_operation = None self.current_process = 0 @classmethod def detect_CPUs(cls): """ Detects the number of CPUs on a system. Cribbed from pp. :from: http://codeliberates.blogspot.com/2008/05/detecting-cpuscores-in-python.html """ ncpus = None # Linux, Unix and MacOS: if hasattr(os, "sysconf"): if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"): # Linux & Unix: ncpus = os.sysconf("SC_NPROCESSORS_ONLN") if isinstance(ncpus, int) and ncpus > 0: return ncpus else: # OSX: return int(os.popen2("sysctl -n hw.ncpu")[1].read()) # Windows: if os.environ.has_key("NUMBER_OF_PROCESSORS"): ncpus = int(os.environ["NUMBER_OF_PROCESSORS"]) if ncpus > 0: return ncpus return 1 # Default