Code example #1
class MpiBackend(Backend):
    """ 
    A message passing interface (mpi) backend to pySPACE
    
    In order to use this backend, you need a working MPI distribution and mpi4py. 
    You can download mpi4py from http://code.google.com/p/mpi4py/. mpi4py is 
    compatible with a python 2.3  to 2.7 or 3.0 to 3.1 distribution. 

    This backend assumes a global file system that is seen by all nodes running 
    the processes. 
 
    """
    
    def __init__(self):
        super(MpiBackend, self).__init__()
        
        self.COMMAND_MPI = '/usr/lib64/openmpi/bin/mpirun'
        self.COMMAND_PYTHON = sys.executable
        self.runner_script = os.sep.join([pySPACE.configuration.root_dir,
                             "app",
                             "backends",
                             "mpi_runner.py"])
        # Number of processes to launch per batch; these hardcoded values
        # should match the total number of processors (MPI slots) available
        self.NumberOfProcessesToRunAtBeginning = 156
        self.NumberOfProcessesToRunLater = 39

    def __del__(self):
        pass
        

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # Initialize the process bookkeeping here, because the backend object is only created once
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                                       maxval = self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        
        # Set up stage in directory
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        # Check if the hosts file exists in the right directory
        HostfileCreated = pySPACE.configuration.root_dir+ "/" +'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)   

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # Prepare each process from the queue for remote execution until the
        # queue is exhausted (it yields the sentinel value False when done)
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Preparing a process may be much faster than executing it, so a
            # second queue would be needed to track processes that have
            # finished execution (disabled in this backend):
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join([stagein_dir,
                                          "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "wb")  # binary mode for pickling
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i+=1

        self._log("Operation - staged")
        self.state = "staged"        
        
    def execute(self):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert(self.state == "staged")
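        # Intentionally a no-op here: the MPI processes are launched and
        # monitored in retrieve()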
        
        
    def check_status(self):
        """
        Returns a description of the current state of the operation's
        execution.
        """
        #self.progress_bar.update(float(self.current_job.info()["percentDone"]))
        #return float(self.current_job.info()["percentDone"]) / 100.0
        #return float(self.current_process) / self.current_operation.number_processes
        return 1.0

    def not_xor(self, a, b):
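        """ Return True iff a and b have the same truth value (logical XNOR) """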
        return not((a or b) and not (a and b))
    
    def retrieve(self):
        """
        Returns the result of the operation.
        """
        
        self.state = "executing" 
        self._log("Operation - executing") 
        if (self.NumberOfProcessesToRunAtBeginning > len(self.process_args_list)):
            args = ([self.COMMAND_MPI] +
                ['--loadbalance']+
                ['--nolocal']+
                ['--hostfile'] +
                [pySPACE.configuration.root_dir+ "/" +'hostsfile'] +
                ['-n', str(len(self.process_args_list))] +
                [self.COMMAND_PYTHON] +  
                [self.runner_script] + 
                self.process_args_list)
            # Start the processes. 
            p = subprocess.Popen(args)
            #self.pids.append(p)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args
        else:
            # Copy the arguments of the first batch of processes to run
            # (IndexCopyStart is still 0 at this point)
            sub_process_args_list = (self.process_args_list[self.IndexCopyStart:
                                     self.NumberOfProcessesToRunAtBeginning])
            args = ([self.COMMAND_MPI] +
                ['--loadbalance']+
                ['--nolocal']+
                ['--hostfile'] +
                [pySPACE.configuration.root_dir+ "/" +'hostsfile'] +
                ['-n', str(len(sub_process_args_list))] +
                [self.COMMAND_PYTHON] +  
                [self.runner_script] + 
                sub_process_args_list)
            # Start the processes. 
            p = subprocess.Popen(args)
            #self.pids.append(p) # TODO: call p.poll() for p in self.pids after all processes have exited
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args

        # Track which processes have already been seen as finished; initially
        # none have, so every element of the list starts out False
        FinishedProcesses = [False for i in range(len(self.process_args_list))]
        
        # Wait until all processes finish and start new processes
        # when old ones finish

        print "Waiting for the processes to finish...."

        # Counter for the processes which are finished. It will be reset
        # after 'NumberOfProcessesToRunLater' processes are finished
        CounterProcessesFinished = 0
        processes_Finished = False

        while not processes_Finished:
            try:
                processes_Finished = True
                for LoopCounter, process_args in enumerate(self.process_args_list):
                    if (self.not_xor(os.path.isfile(process_args + "_Finished"),
                                     os.path.isfile(process_args + "_Crashed"))):
                        processes_Finished = False
                    else:
                        if not FinishedProcesses[LoopCounter]:
                            # Record that the process is finished
                            FinishedProcesses[LoopCounter] = True
                            # If the process crashed, take note of that
                            if (os.path.isfile(process_args + "_Crashed")):
                                self.CrashedProcesses.append(process_args)
                            # Increment the counter of finished processes by one
                            CounterProcessesFinished += 1
                            self.TotalProcessesFinished += 1
                            # update the progress bar
                            self.progress_bar.update(self.TotalProcessesFinished)
                            if (CounterProcessesFinished == self.NumberOfProcessesToRunLater):
                                # Determine the subset of processes to run next
                                sub_process_args_list = []
                                if (self.IndexCopyStart == len(self.process_args_list)):
                                    break
                                elif ((self.IndexCopyStart + self.NumberOfProcessesToRunLater) < len(self.process_args_list)):
                                    sub_process_args_list = (self.process_args_list[self.IndexCopyStart:
                                                             self.IndexCopyStart + self.NumberOfProcessesToRunLater])
                                else:
                                    sub_process_args_list = self.process_args_list[self.IndexCopyStart:len(self.process_args_list)]
                                args = ([self.COMMAND_MPI] +
                                        ['--loadbalance'] +
                                        ['--nolocal'] +
                                        ['--hostfile'] +
                                        [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                                        ['-n', str(len(sub_process_args_list))] +
                                        [self.COMMAND_PYTHON] +
                                        [self.runner_script] +
                                        sub_process_args_list)
                                # Start the processes
                                if (len(sub_process_args_list) > 0):
                                    p = subprocess.Popen(args)
                                #print args
                                # Adjust the start index
                                self.IndexCopyStart += self.NumberOfProcessesToRunLater
                                # Reset the counter for processes finished
                                CounterProcessesFinished = 0
                # sleep for one second
                time.sleep(1)
            except (KeyboardInterrupt, SystemExit):  # if processes hang forever
                self.ProcessingSuccessful = False
                print "*********************************************************************************************************"
                print "pySPACE forced to stop ..."
                print "Please wait until the MPI backend has finished consolidating the generated results and cleaning up ..."
                print "**********************************************************************************************************"
                from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
                # merge the remaining files
                print "***************************************************************************************************"
                print "Starting merging . . ."
                PerformanceResultSummary.merge_performance_results(self.current_operation.result_directory)
                print "Merging complete . . ."
                print "***************************************************************************************************"
                break  # leave the polling loop

        self._log("Operation - processing finished")
        
        # Change the state to retrieved
        self.state = "retrieved"
        
        return None


    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert(self.state == "retrieved")
        
        if self.ProcessingSuccessful and len(self.CrashedProcesses) == 0:
            self.current_operation.consolidate()

        if self.ProcessingSuccessful and len(self.CrashedProcesses) != 0:
            from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
            # merge the remaining files
            print "***************************************************************************************************"
            print "Starting merging . . ."
            PerformanceResultSummary.merge_performance_results(self.current_operation.result_directory)
            print "Merging complete . . ."
            print "***************************************************************************************************"

        self._log("Operation - consolidated")
        
        self.state = "consolidated"
        
        
    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        # Cleaning up...
        stagein_dir = os.sep.join([self.current_operation.result_directory,
                                   ".stagein"])
        if self.ProcessingSuccessful and len(self.CrashedProcesses) == 0:
            deleted = False

            while not deleted:
                try:
                    os.chdir("..")
                    shutil.rmtree(stagein_dir)
                    deleted = True
                except OSError, e:
                    if e.errno == 66:
                        self._log("Could not remove .stagein dir, "
                                  "waiting for NFS lock",
                                  level=logging.WARNING)
                    time.sleep(5)
               
        self._log("Operation - cleaned up")
        self._log("Idling...")
        
        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)       
        # close listener socket
        self.sock.close()
        self.current_operation = None 
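
A minimal sketch of the launch pattern used by MpiBackend.retrieve above: build an OpenMPI command line that runs a Python runner script once per pickled process file, then start it with subprocess. The mpirun path, the 'hostsfile' name, and the runner script are taken from the code above; treat them as placeholders for your installation.

import subprocess
import sys

def launch_batch(pickle_files, hostfile, runner_script,
                 mpirun='/usr/lib64/openmpi/bin/mpirun'):
    """Start one MPI rank per pickled process file and return the Popen."""
    # --nolocal keeps the launching node free for monitoring;
    # --loadbalance spreads the ranks evenly over the hosts in the hostfile
    args = ([mpirun, '--loadbalance', '--nolocal',
             '--hostfile', hostfile,
             '-n', str(len(pickle_files)),
             sys.executable, runner_script] + list(pickle_files))
    return subprocess.Popen(args)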
Code example #2
File: serial.py Project: Crespo911/pyspace
class SerialBackend(Backend):
    """ A backend that allows for easy debugging since the program flow
    is not threaded or distributed over several OS processes.
    """
    
    def __init__(self):
        super(SerialBackend, self).__init__()
              
        self.state = "idling"  
        self.current_process = 0
        
        
    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(SerialBackend, self).stage_in(operation)
        
        # Set up progress bar
        widgets = ['Operation progress: ', Percentage(), ' ', Bar(), ' ', ETA()]
        self.progress_bar = ProgressBar(widgets = widgets, 
                                        maxval = self.current_operation.number_processes)
        self.progress_bar.start()
        
        self._log("Operation - staged")
        self.state = "staged"
        
    def execute(self):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert(self.state == "staged")
        
        self.state = "executing" 
        self._log("Operation - executing")
        
        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host" : self.host, "port" : self.port}
        
        try:
            process = self.current_operation.processes.get()
        except KeyboardInterrupt:
            self._log(traceback.format_exc(), level=logging.CRITICAL)
            process = False
        # while there are processes in the queue ...
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Execute process, update progress bar and get next queue-element
            try:
                process()
            # if an exception is raised somewhere in the code, we might want
            # to keep trying the other processes
            except Exception:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            # if ctrl+c is pressed we want to immediately stop everything
            except KeyboardInterrupt:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            else:    
                self.current_process += 1
                self.progress_bar.update(self.current_process)
                process = self.current_operation.processes.get()

    def check_status(self):
        """
        Returns a description of the current state of the operation's
        execution.
        
        .. todo:: do we really need this method???
        """
        # Return the fraction of the current operation's processes that
        # have already finished
        return float(self.current_process)/self.current_operation.number_processes
    
    def retrieve(self):
        """
        Returns the result of the operation.
        
        This is trivial in the Debug-Backend since execute blocks.
        """
        assert(self.state == "executing")
        
        self._log("Operation - retrieved")
        
        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") \
                        and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
            
        # Change the state to retrieved
        self.state = "retrieved"
    
    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert(self.state == "retrieved")
        
        try:
            self.current_operation.consolidate()
        except Exception:
            self._log(traceback.format_exc(), level=logging.CRITICAL)
        
        self._log("Operation - consolidated")
        self.state = "consolidated"
    
    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"
        
        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        
        self.current_operation = None
        self.current_process = 0      
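
SerialBackend.execute consumes a queue that yields process objects and finally the sentinel value False. A minimal, self-contained sketch of the same consumption pattern; the string jobs are hypothetical stand-ins for pySPACE processes:

try:
    from queue import Queue   # Python 3
except ImportError:
    from Queue import Queue   # Python 2

q = Queue()
q.put("job-1")
q.put("job-2")
q.put(False)              # sentinel: no more work

job = q.get()
while job is not False:   # same loop condition as in execute()
    print(job)            # the backend would call the process here
    job = q.get()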
Code example #3
File: serial.py Project: jhuebotter/pyspace
class SerialBackend(Backend):
    """ A backend that allows for easy debugging since the program flow
    is not threaded or distributed over several OS processes.
    """
    def __init__(self):
        super(SerialBackend, self).__init__()

        self.state = "idling"
        self.current_process = 0

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(SerialBackend, self).stage_in(operation)

        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert (self.state == "staged")

        self.state = "executing"
        self._log("Operation - executing")

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}

        get_process = partial(self.current_operation.processes.get,
                              timeout=timeout)
        for process in iter(get_process, False):
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Execute process, update progress bar and get next queue-element
            try:
                process()
            # if an exception is raised somewhere in the code, we might want
            # to keep trying the other processes
            except Exception:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            # if ctrl+c is pressed we want to immediately stop everything
            except KeyboardInterrupt:
                self._log(traceback.format_exc(), level=logging.CRITICAL)
                process.post_benchmarking()
                process = False
            else:
                self.current_process += 1
                self.progress_bar.update(self.current_process)

    def check_status(self):
        """
        Returns a description of the current state of the operation's
        execution.
        
        .. todo:: do we really need this method???
        """
        # Return the fraction of the current operation's processes that
        # have already finished
        return float(
            self.current_process) / self.current_operation.number_processes

    def retrieve(self, timeout=1e6):
        """
        Returns the result of the operation.
        
        This is trivial in the Debug-Backend since execute blocks.
        """
        assert (self.state == "executing")

        self._log("Operation - retrieved")

        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") \
                        and self.current_operation.create_process is not None:
            self.current_operation.create_process.join(timeout=timeout)  # use the caller's timeout

        # Change the state to retrieved
        self.state = "retrieved"

    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert (self.state == "retrieved")

        try:
            self.current_operation.consolidate()
        except Exception:
            self._log(traceback.format_exc(), level=logging.CRITICAL)

        self._log("Operation - consolidated")
        self.state = "consolidated"

    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()

        self.current_operation = None
        self.current_process = 0
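
Example #3 replaces the explicit while loop of example #2 with iter(callable, sentinel): the built-in iter calls get_process() repeatedly and stops as soon as the return value equals False. Note that rebinding the loop variable (process = False) inside the except handlers does not terminate such a loop; only the sentinel returned by the queue does. A minimal illustration of the pattern:

from functools import partial

try:
    from queue import Queue   # Python 3
except ImportError:
    from Queue import Queue   # Python 2

q = Queue()
q.put("job-1")
q.put("job-2")
q.put(False)                          # sentinel value

get_process = partial(q.get, timeout=5)
for job in iter(get_process, False):  # stops when get_process() returns False
    print(job)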
Code example #4
File: multicore.py Project: pyspace/test
class MulticoreBackend(Backend):
    """ Execute as many processes in parallel as there are (logical) CPUs on the local machine
    
    This backend is based on the multiprocessing package and should work on every
    multicore system without additional settings even on virtual machines.
    Each process corresponds to one combination of input data set and
    parameter choice.
    
    :Author: Anett Seeland ([email protected])
    :LastChange: 2012/09/24
    
    """
    def __init__(self, pool_size=None):
        super(MulticoreBackend, self).__init__()

        # Set the number of processes in the pool
        # by default to the number of CPUs
        if pool_size is None:
            pool_size = MulticoreBackend.detect_CPUs()

        self.pool_size = pool_size

        self.state = "idling"

        # queue for execution
        self.result_handlers = multiprocessing.Queue(pool_size + 2)

        self.pool = None
        self.current_process = 0

        self._log("Created MulticoreBackend with pool size %s" % pool_size)

    def reset_queue(self):
        """ Resets the execution queue"""
        self.result_handlers = multiprocessing.Queue(self.pool_size + 2)

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self):
        """ Execute all processes specified in the currently staged operation """
        assert (self.state == "staged")

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        backend_com = (self.SERVER_IP, self.SERVER_PORT)

        # A socket communication thread to handle e.g. subflows
        self.listener = LocalComHandler(self.sock)
        self.listener.start()

        try:
            process = self.current_operation.processes.get()
        except KeyboardInterrupt:
            process = False
        # Prepare each process from the queue for remote execution and
        # execute it, until the queue is exhausted (sentinel value False)
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args,
                            backend_com)
            # Preparing a process can be much faster than executing it, so a
            # bounded queue throttles submission until running processes
            # have finished execution
            self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            self.pool.apply_async(process, callback=self.dequeue_process)
            process = self.current_operation.processes.get()
            time.sleep(0.1)

    def dequeue_process(self, result):
        """ Callback function for finished processes """
        self.current_process += 1
        self.result_handlers.get()
        self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Return the fraction of the current operation's processes that
        # have already finished
        return float(
            self.current_process) / self.current_operation.number_processes

    def retrieve(self):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert (self.state == "executing")

        # Prevent any other processes from being submitted to the pool
        # (necessary for join)
        self.pool.close()
        self._log("Closing pool", level=logging.DEBUG)

        self._log("Operation - retrieved")
        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") \
            and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        self.pool.join()  # Wait for worker processes to exit
        self._log("Worker processes have exited gracefully")
        self.result_handlers.close()
        # inform the listener that it's time to die
        self.listener.operation_finished = True
        time.sleep(1)
        self.listener.join()
        # Change the state to finished
        self.state = "retrieved"

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert (self.state == "retrieved")

        try:
            self.current_operation.consolidate()
        except Exception:
            import traceback
            self._log(traceback.format_exc(), level=logging.ERROR)

        self._log("Operation - consolidated")

        self.state = "consolidated"

    def cleanup(self):
        """ Remove the current operation and all potential results that have been stored in this object """
        self.state = "idling"

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()

        self.current_operation = None
        self.current_process = 0

    @classmethod
    def detect_CPUs(cls):
        """ Detects the number of CPUs on a system. Cribbed from pp.
        
        :from: http://codeliberates.blogspot.com/2008/05/detecting-cpuscores-in-python.html
        """
        ncpus = None
        # Linux, Unix and MacOS:
        if hasattr(os, "sysconf"):
            if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"):
                # Linux & Unix:
                ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
            if isinstance(ncpus, int) and ncpus > 0:
                return ncpus
            else:  # OSX:
                return int(os.popen2("sysctl -n hw.ncpu")[1].read())
        # Windows:
        if os.environ.has_key("NUMBER_OF_PROCESSORS"):
            ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
            if ncpus > 0:
                return ncpus
        return 1  # Default
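
MulticoreBackend throttles submission by pairing Pool.apply_async with a bounded queue: a token is put before each submission (blocking once pool_size + 2 jobs are in flight) and the completion callback removes one. A minimal sketch of that backpressure pattern; the square function is a hypothetical stand-in for a pySPACE process, and multiprocessing.cpu_count() is the standard-library equivalent of detect_CPUs:

import multiprocessing

def square(x):
    return x * x

if __name__ == "__main__":
    pool_size = multiprocessing.cpu_count()
    tokens = multiprocessing.Queue(pool_size + 2)   # bounded, like result_handlers
    pool = multiprocessing.Pool(processes=pool_size)

    def on_done(result):
        tokens.get()          # free a slot, mirroring dequeue_process

    for x in range(10):
        tokens.put(1)         # blocks while pool_size + 2 jobs are in flight
        pool.apply_async(square, (x,), callback=on_done)

    pool.close()
    pool.join()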
Code example #5
class MpiBackend(Backend):
    """ 
    A message passing interface (mpi) backend to pySPACE
    
    In order to use this backend, you need a working MPI distribution and mpi4py. 
    You can download mpi4py from http://code.google.com/p/mpi4py/. mpi4py is 
    compatible with a python 2.3  to 2.7 or 3.0 to 3.1 distribution. 

    This backend assumes a global file system that is seen by all nodes running 
    the processes. 
    
    **Parameters**
        :pool_size: Define how many MPI processes should be started in parallel.
                    This should not exceed the number of available processors
                    (or the number of mpi slots defined in the hostsfile)
        
            (*recommended, default: 156*)
 
    """
    def __init__(self, pool_size=156):
        super(MpiBackend, self).__init__()
        #self.COMMAND_MPI = '/usr/lib64/openmpi/bin/mpirun'
        self.COMMAND_MPI = 'mpirun'
        self.COMMAND_PYTHON = sys.executable
        self.runner_script = os.sep.join([
            pySPACE.configuration.root_dir, "environments", "backends",
            "mpi_runner.py"
        ])
        # start as many processes as the total number of processors
        # available
        self.NumberOfProcessesToRunAtBeginning = pool_size
        self.NumberOfProcessesToRunLater = pool_size  #39

    def __del__(self):
        pass

    def stage_in(self, operation):
        """
        Stage the current operation
        """
        super(MpiBackend, self).stage_in(operation)
        # Initialize the process bookkeeping here, because the backend object is only created once
        self.process_args_list = []
        self.IndexCopyStart = 0
        self.ProcessingSuccessful = True
        self.TotalProcessesFinished = 0
        self.CrashedProcesses = []
        # Set up progress bar
        widgets = [
            'Operation progress: ',
            Percentage(), ' ',
            Bar(), ' ',
            ETA()
        ]
        self.progress_bar = ProgressBar(
            widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}

        # Set up stage in directory
        stagein_dir = os.sep.join(
            [self.current_operation.result_directory, ".stagein"])
        # Check if the hosts file exists in the right directory
        HostfileCreated = pySPACE.configuration.root_dir + "/" + 'hostsfile'
        if (not os.path.isfile(HostfileCreated)):
            print "***************************************************************************************************"
            print "hostsfile not created !"
            print "Please create the hosts file with a filename 'hostsfile' under ", pySPACE.configuration.root_dir
            print "***************************************************************************************************"
            raise UserWarning('Missing hostsfile.')
        if not os.path.exists(stagein_dir):
            os.mkdir(stagein_dir)

        process = self.current_operation.processes.get()
        print "Preparing processes. This might take a few minutes...."
        # Prepare each process from the queue for remote execution until the
        # queue is exhausted (it yields the sentinel value False when done)
        i = 0
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args)
            # Preparing a process may be much faster than executing it, so a
            # second queue would be needed to track processes that have
            # finished execution (disabled in this backend):
            #self.result_handlers.put(1)
            # Execute all functions in the process pool but return immediately
            #self.pool.apply_async(process, callback=self.dequeue_process)
            proc_file_name = os.sep.join(
                [stagein_dir, "process_%d.pickle" % i])
            proc_file = open(proc_file_name, "wb")  # binary mode for pickling
            cPickle.dump(process, proc_file)
            proc_file.close()
            # Add task to job specification
            self.process_args_list.append(proc_file_name)
            # Get the next process
            process = self.current_operation.processes.get()
            i += 1

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=1e6):
        """
        Executes all processes specified in the currently staged
        operation.
        """
        assert (self.state == "staged")
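        # Intentionally a no-op here: the MPI processes are launched and
        # monitored in retrieve()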

    def check_status(self):
        """
        Returns a description of the current state of the operation's
        execution.
        """
        #self.progress_bar.update(float(self.current_job.info()["percentDone"]))
        #return float(self.current_job.info()["percentDone"]) / 100.0
        #return float(self.current_process) / self.current_operation.number_processes
        return 1.0

    def not_xor(self, a, b):
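        """ Return True iff a and b have the same truth value (logical XNOR) """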
        return not ((a or b) and not (a and b))

    def retrieve(self, timeout=1e6):
        """
        Returns the result of the operation.
        """

        self.state = "executing"
        self._log("Operation - executing")
        if (self.NumberOfProcessesToRunAtBeginning > len(
                self.process_args_list)):
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(self.process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    self.process_args_list)
            # Start the processes.
            self._log("mpi-parameters: %s" % args, level=logging.DEBUG)
            self._log("mpi-parameters-joined: %s" % os.path.join(args),
                      level=logging.DEBUG)
            p = subprocess.Popen(args)
            #self.pids.append(p)
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args
        else:
            # Copy the arguments of the first batch of processes to run
            # (IndexCopyStart is still 0 at this point)
            sub_process_args_list = (
                self.process_args_list[self.IndexCopyStart:self.
                                       NumberOfProcessesToRunAtBeginning])
            args = ([self.COMMAND_MPI] + ['--loadbalance'] + ['--nolocal'] +
                    ['--hostfile'] +
                    [pySPACE.configuration.root_dir + "/" + 'hostsfile'] +
                    ['-n', str(len(sub_process_args_list))] +
                    [self.COMMAND_PYTHON] + [self.runner_script] +
                    sub_process_args_list)
            # Start the processes.
            p = subprocess.Popen(args)
            #self.pids.append(p) # TODO: call p.poll() for p in self.pids after all processes have exited
            self.IndexCopyStart += self.NumberOfProcessesToRunAtBeginning
            #print args

        # Track which processes have already been seen as finished; initially
        # none have, so every element of the list starts out False
        FinishedProcesses = [False for i in range(len(self.process_args_list))]

        # Wait until all processes finish and start new processes
        # when old ones finish

        print "Waiting for the processes to finish...."

        # Counter for the processes which are finished. It will be reset
        # after 'NumberOfProcessesToRunLater' processes are finished
        CounterProcessesFinished = 0
        processes_Finished = False

        while not processes_Finished:
            try:
                processes_Finished = True
                for LoopCounter, process_args in enumerate(
                        self.process_args_list):
                    if (self.not_xor(
                            os.path.isfile(process_args + "_Finished"),
                            os.path.isfile(process_args + "_Crashed"))):
                        processes_Finished = False
                    else:
                        if not FinishedProcesses[LoopCounter]:
                            # Record that the process is finished
                            FinishedProcesses[LoopCounter] = True
                            # If the process crashed, take note of that
                            if (os.path.isfile(process_args + "_Crashed")):
                                self.CrashedProcesses.append(process_args)
                            # Increment the counter for the number of processes finished
                            # by one
                            CounterProcessesFinished += 1
                            self.TotalProcessesFinished += 1
                            # update the progress bar
                            self.progress_bar.update(
                                self.TotalProcessesFinished)
                            if (CounterProcessesFinished ==
                                    self.NumberOfProcessesToRunLater):
                                # Determine the subset of processes to run next
                                sub_process_args_list = []
                                if (self.IndexCopyStart == len(
                                        self.process_args_list)):
                                    break
                                elif ((self.IndexCopyStart +
                                       self.NumberOfProcessesToRunLater) < len(
                                           self.process_args_list)):
                                    sub_process_args_list = (
                                        self.process_args_list[
                                            self.IndexCopyStart:self.
                                            IndexCopyStart +
                                            self.NumberOfProcessesToRunLater])
                                else:
                                    sub_process_args_list = self.process_args_list[
                                        self.IndexCopyStart:len(
                                            self.process_args_list)]
                                args = (
                                    [self.COMMAND_MPI] + ['--loadbalance'] +
                                    ['--nolocal'] + ['--hostfile'] + [
                                        pySPACE.configuration.root_dir + "/" +
                                        'hostsfile'
                                    ] +
                                    ['-n',
                                     str(len(sub_process_args_list))] +
                                    [self.COMMAND_PYTHON] +
                                    [self.runner_script] +
                                    sub_process_args_list)
                                # Start the processes
                                if (len(sub_process_args_list) > 0):
                                    p = subprocess.Popen(args)
                                #print args
                                # Adjust the start index
                                self.IndexCopyStart += self.NumberOfProcessesToRunLater
                                # Reset the counter for processes finished
                                CounterProcessesFinished = 0
                # sleep for one second
                time.sleep(1)
            except (KeyboardInterrupt,
                    SystemExit):  # if processes hang forever
                self.ProcessingSuccessful = False
                print "*********************************************************************************************************"
                print "pySPACE forced to stop ..."
                print "Please wait until mpi_backend is finished with consolidating the results generated and with clean up ..."
                print "**********************************************************************************************************"
                from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
                # merge the remaining files
                print "***************************************************************************************************"
                print "Starting merging . . ."
                PerformanceResultSummary.merge_performance_results(
                    self.current_operation.result_directory)
                print "Merging complete . . ."
                print "***************************************************************************************************"
                break  # leave the polling loop

        self._log("Operation - processing finished")

        # Change the state to retrieved
        self.state = "retrieved"

        return None

    def consolidate(self):
        """
        Consolidates the results of the single processes into a consistent result of the whole
        operation
        """
        assert (self.state == "retrieved")

        if self.ProcessingSuccessful and len(self.CrashedProcesses) == 0:
            self.current_operation.consolidate()

        if self.ProcessingSuccessful and len(self.CrashedProcesses) != 0:
            from pySPACE.resources.dataset_defs.performance_result import PerformanceResultSummary
            # merge the remaining files
            print "***************************************************************************************************"
            print "Starting merging . . ."
            PerformanceResultSummary.merge_performance_results(
                self.current_operation.result_directory)
            print "Merging complete . . ."
            print "***************************************************************************************************"

        self._log("Operation - consolidated")

        self.state = "consolidated"

    def cleanup(self):
        """
        Remove the current operation and all potential results that
        have been stored in this object
        """
        self.state = "idling"

        # Cleaning up...
        stagein_dir = os.sep.join(
            [self.current_operation.result_directory, ".stagein"])
        if self.ProcessingSuccessful and len(self.CrashedProcesses) == 0:
            deleted = False

            while not deleted:
                try:
                    os.chdir("..")
                    shutil.rmtree(stagein_dir)
                    deleted = True
                except OSError, e:
                    if e.errno == 66:
                        self._log(
                            "Could not remove .stagein dir, "
                            "waiting for NFS lock",
                            level=logging.WARNING)
                    time.sleep(5)

        self._log("Operation - cleaned up")
        self._log("Idling...")

        # Remove the file logger for this operation
        logging.getLogger('').removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        self.current_operation = None
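
The polling loop in MpiBackend.retrieve relies on a file-based completion protocol: for every pickled process file, the runner is expected to create either "<file>_Finished" or "<file>_Crashed" next to it. A runner-side sketch of that contract, inferred from the backend's polling code rather than taken from the actual mpi_runner.py:

import pickle
import sys

def run_pickled_process(proc_file_name):
    marker = "_Finished"
    try:
        with open(proc_file_name, "rb") as f:
            process = pickle.load(f)
        process()                      # execute the unpickled process object
    except Exception:
        marker = "_Crashed"            # signal the failure to the backend
    # touch the sentinel file the backend polls for
    open(proc_file_name + marker, "w").close()

if __name__ == "__main__":
    run_pickled_process(sys.argv[1])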
Code example #6
File: multicore.py Project: Hansa064/pyspace
class MulticoreBackend(Backend):
    """ Execute as many processes in parallel as there are (logical) CPUs on the local machine
    
    This backend is based on the multiprocessing package and should work on every
    multicore system without additional settings even on virtual machines.
    Each process corresponds to one combination of input data set and
    parameter choice.
    
    :Author: Anett Seeland ([email protected])
    :LastChange: 2012/09/24
    
    """

    def __init__(self, pool_size=None):
        super(MulticoreBackend, self).__init__()

        # Set the number of processes in the pool
        # by default to the number of CPUs
        if pool_size is None:
            pool_size = MulticoreBackend.detect_CPUs()
        self.pool_size = pool_size
        self.state = "idling"

        # list of result handles for submitted jobs
        self.result_handlers = []
        self.pool = None
        self.current_process = 0
        self._log("Created MulticoreBackend with pool size %s" % pool_size)

    def reset_queue(self):
        """ Resets the execution queue"""
        self.result_handlers = []

    def stage_in(self, operation):
        """ Stage the current operation """
        super(MulticoreBackend, self).stage_in(operation)
        self.pool = multiprocessing.Pool(processes=self.pool_size)

        # Set up progress bar
        widgets = ["Operation progress: ", Percentage(), " ", Bar(), " ", ETA()]
        self.progress_bar = ProgressBar(widgets=widgets, maxval=self.current_operation.number_processes)
        self.progress_bar.start()

        self._log("Operation - staged")
        self.state = "staged"

    def execute(self, timeout=None):
        """ Execute all processes specified in the currently staged operation """
        # This blocks until all results are available, hence this call is synchronous
        assert self.state == "staged"

        self._log("Operation - executing")
        self.state = "executing"

        # The handler that is used remotely for logging
        handler_class = logging.handlers.SocketHandler
        handler_args = {"host": self.host, "port": self.port}
        backend_com = (self.SERVER_IP, self.SERVER_PORT)

        # A socket communication thread to handle e.g. subflows
        self.listener = LocalComHandler(self.sock)
        self.listener.start()

        try:
            process = self.current_operation.processes.get(timeout=timeout)
        except KeyboardInterrupt:
            process = False
        # Prepare each process from the queue for remote execution and
        # execute it, until the queue is exhausted (sentinel value False)
        while process is not False:
            process.prepare(pySPACE.configuration, handler_class, handler_args, backend_com)
            # Execute all functions in the process pool but return immediately
            self.pool.apply_async(process, callback=self.dequeue_process)
            process = self.current_operation.processes.get(timeout=timeout)
            time.sleep(0.1)

    def dequeue_process(self, result):
        """ Callback function for finished processes """
        self.current_process += 1
        self.progress_bar.update(self.current_process)

    def check_status(self):
        """ Return a description of the current state of the operations execution
        
        .. todo:: do we really need this method???
        """
        # Return the fraction of the current operation's processes that
        # have already finished
        return float(self.current_process) / self.current_operation.number_processes

    def retrieve(self, timeout=0):
        """ Wait for all results of the operation
        
        This call blocks until all processes are finished.
        """
        assert self.state == "executing"

        # Prevent any other processes from being submitted to the pool
        # (necessary for join)
        self.pool.close()
        self._log("Closing pool", level=logging.DEBUG)

        self._log("Operation - retrieved")
        self.current_operation.processes.close()
        # if process creation has another thread
        if hasattr(self.current_operation, "create_process") and self.current_operation.create_process is not None:
            self.current_operation.create_process.join()
        # Wait for every submitted job to terminate (note: for this loop to
        # see anything, execute() would have to append its apply_async
        # handles to result_handlers)
        try:
            for result in self.result_handlers:
                result.wait(timeout=timeout)
        except multiprocessing.TimeoutError:
            # A timeout occurred, terminate the pool
            self._log("Timeout occurred, terminating worker processes")
            self.pool.terminate()
            return False
        finally:
            self.pool.join()  # Wait for worker processes to exit
            # inform the listener that it's time to die
            self.listener.operation_finished = True
            time.sleep(1)
            self.listener.join()
            # Change the state to finished
            self.state = "retrieved"
        self._log("Worker processes have exited gracefully")
        return True

    def consolidate(self):
        """ Consolidate the single processes' results into a consistent result of the whole operation """
        assert self.state == "retrieved"
        try:
            self.current_operation.consolidate()
        except Exception:
            import traceback

            self._log(traceback.format_exc(), level=logging.ERROR)
        self._log("Operation - consolidated")
        self.state = "consolidated"

    def cleanup(self):
        """ Remove the current operation and all potential results that have been stored in this object """
        self.state = "idling"
        self._log("Operation - cleaned up")
        self._log("Idling...")
        # Remove the file logger for this operation
        logging.getLogger("").removeHandler(self.file_handler)
        # close listener socket
        self.sock.close()
        self.current_operation = None
        self.current_process = 0

    @classmethod
    def detect_CPUs(cls):
        """ Detects the number of CPUs on a system. Cribbed from pp.
        
        :from: http://codeliberates.blogspot.com/2008/05/detecting-cpuscores-in-python.html
        """
        ncpus = None
        # Linux, Unix and MacOS:
        if hasattr(os, "sysconf"):
            if os.sysconf_names.has_key("SC_NPROCESSORS_ONLN"):
                # Linux & Unix:
                ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
            if isinstance(ncpus, int) and ncpus > 0:
                return ncpus
            else:  # OSX:
                return int(os.popen2("sysctl -n hw.ncpu")[1].read())
        # Windows:
        if os.environ.has_key("NUMBER_OF_PROCESSORS"):
            ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
            if ncpus > 0:
                return ncpus
        return 1  # Default
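
One caveat about the retrieve() implementation above: AsyncResult.wait(timeout) never raises multiprocessing.TimeoutError, it simply returns once the timeout elapses, so the except branch is only reachable if AsyncResult.get(timeout) is used instead. A small sketch of a timeout-aware wait built on get(); the slow function is a hypothetical stand-in for a pySPACE process:

import multiprocessing
import time

def slow(x):
    time.sleep(x)
    return x

if __name__ == "__main__":
    pool = multiprocessing.Pool(2)
    handles = [pool.apply_async(slow, (s,)) for s in (0.1, 0.2)]
    pool.close()
    try:
        # get() raises multiprocessing.TimeoutError when the timeout expires
        results = [h.get(timeout=5) for h in handles]
        print(results)
    except multiprocessing.TimeoutError:
        pool.terminate()   # give up and kill the workers
    finally:
        pool.join()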