Ejemplo n.º 1
0
        def __init__(self):
            """Create the cluster handle and the direct command client."""

            # Reference to inner cluster object (equivalent to parallel_go)
            self._cluster = MPIInterfaceCore()

            # Direct reference to MPICommandClient for methods not resorting to the inner cluster
            self.__command_client = MPICommandClient()
Ejemplo n.º 2
0
 def __init__(self):
     """Create the cluster handle and the direct command client."""

     # Reference to inner cluster object (equivalent to parallel_go)
     self._cluster = MPIInterfaceCore()

     # Direct reference to MPICommandClient for methods not resorting to the inner cluster
     self.__command_client = MPICommandClient()
Ejemplo n.º 3
0
        def __init__(self):
            """Create the command and monitor clients used by this object."""

            # Client used to submit command requests to the MPI servers.
            self.__command_client = MPICommandClient()
            # Client used to query per-server status information.
            self.__monitor_client = MPIMonitorClient()
Ejemplo n.º 4
0
    class __MPIInterfaceCoreImpl:
        """ Implementation of the MPIInterfaceCore singleton interface """
        def __init__(self):
            """Create the command and monitor clients used by every method."""

            self.__command_client = MPICommandClient()
            self.__monitor_client = MPIMonitorClient()

        def start_cluster(self, cl_file=None):
            """Configure the log mode and start the command services.

            cl_file is accepted for interface compatibility; it is not
            used by this implementation.
            """

            self.__command_client.set_log_mode(MPIInterface._log_mode)
            self.__command_client.start_services()

        def stop_cluster(self):
            """Stop the command client services."""

            self.__command_client.stop_services()

        def get_engines(self):
            """Return the list of MPI server ranks (engine Ids)."""

            return MPIEnvironment.mpi_server_rank_list()

        def get_nodes(self):
            """Return the distinct hostnames the MPI servers run on."""

            # Get list of all server Ids
            servers_list = MPIEnvironment.mpi_server_rank_list()

            # Get list of hostnames for each server Id
            hostnames = []
            for server in servers_list:
                hostnames.append(
                    self.__monitor_client.get_server_status_keyword(
                        server, 'processor'))

            # Remove duplicates to get the list of host used in general
            hostnames_not_repeated = list(set(hostnames))

            return hostnames_not_repeated

        def pgc(self, commands, block=True):
            """Execute commands on the engines, in one of two modes:

            - When ``commands`` is a dictionary, execute each value as a
              command taking its key as the target server (equivalent to
              several calls to odo/execute).
            - When ``commands`` is a single command, execute it on all
              engines.

            When ``block`` is True, wait for and return the responses;
            otherwise return immediately with the request result.
            """

            # Get list of jobs and commands
            ret = None
            if isinstance(commands, dict):

                # Spawn jobs in non-blocking mode
                jobId_list = []
                for server in commands:
                    cmd = commands[server]
                    jobId = self.__command_client.push_command_request(
                        cmd, block=False, target_server=server)
                    jobId_list.append(jobId[0])

                # If user requests blocking mode wait until execution is completed
                ret = self.__command_client.get_command_response(jobId_list,
                                                                 block=block,
                                                                 verbose=True)

            else:
                cmd = commands
                # Get list of all servers
                all_servers_list = MPIEnvironment.mpi_server_rank_list()
                # Execute command in all servers
                ret = self.__command_client.push_command_request(
                    cmd, block=block, target_server=all_servers_list)

            # Return result
            return ret

        def odo(self, job, nodes):
            """Execute a job on a subset of engines in non-blocking mode"""

            cmd_ids = self.__command_client.push_command_request(
                job, block=False, target_server=nodes)

            return cmd_ids

        def execute(self, job, nodes):
            """Execute a job on a subset of engines in blocking mode"""

            result_list = self.__command_client.push_command_request(
                job, block=True, target_server=nodes)

            return result_list

        def push(self, variables, targets=None):
            """Set variables in a sub-set of engines.

            ``targets`` may be None or 'all' (push to every server) or an
            iterable of server Ids.
            """

            # Determine target servers
            target_server = []
            if targets is None or targets == 'all':
                target_server = MPIEnvironment.mpi_server_rank_list()
            else:
                target_server = list(targets)

            # Push variables
            ret = self.__command_client.push_command_request(
                "push",
                block=True,
                target_server=target_server,
                parameters=dict(variables))

            # Return request result for further processing
            return ret

        def pull(self, varname="", targets=None):
            """Retrieve a variable from a sub-set of engines.

            Returns a dictionary mapping server Id to the pulled value.
            """

            # Pull variable from all servers
            result_list = []
            if targets is None:
                result_list = self.pgc(varname)
            else:
                result_list = self.execute(varname, nodes=targets)

            # Extract result vars and corresponding server Ids
            res = {}
            for result in result_list:
                res[result['server']] = result['ret']

            # Return result
            return res

        def check_job(self, jobId, verbose=True):
            """Check the status of a non-blocking job"""

            jobId_list = list(jobId)
            command_response_list = self.__command_client.get_command_response(
                jobId_list, block=False, verbose=verbose)

            # Aggregate exceptions and completed jobIds
            error_msg = ''
            completed_jobs = []
            for command_response in command_response_list:
                if not command_response['successful']:
                    if len(error_msg) > 0:
                        error_msg += "\n"

                    error_msg += "Exception executing command in server %s: %s" % (
                        command_response['server'],
                        command_response['traceback'])
                else:
                    completed_jobs.append(command_response['id'])

            # Re-throw aggregated exception
            if len(error_msg) > 0:
                casalog.post(error_msg, "SEVERE",
                             "MPIInterfaceCore::check_job")
                raise Exception(error_msg)

            # Check that all jobs have been completed
            # NOTE(review): the loop variable below shadows the jobId
            # parameter (already consumed into jobId_list above).
            completed = True
            for jobId in jobId_list:
                if jobId not in completed_jobs:
                    completed = False
                    break

            # Return completion status
            return completed

        def get_server_status(self):
            """Return the server status structure from the command client."""

            return self.__command_client.get_server_status()

        def get_command_request_list(self):
            """Return the command request list from the command client."""

            return self.__command_client.get_command_request_list()

        def get_command_response_list(self):
            """Return the command response list from the command client."""

            return self.__command_client.get_command_response_list()
Ejemplo n.º 5
0
    class __MPIInterfaceImpl:
        """ Implementation of the MPIInterface singleton interface """
        def __init__(self):
            """Create the cluster handle and the direct command client."""

            # Reference to inner cluster object (equivalent to parallel_go)
            self._cluster = MPIInterfaceCore()

            # Direct reference to MPICommandClient for methods not resorting to the inner cluster
            self.__command_client = MPICommandClient()

        def isClusterRunning(self):
            """Return True when the command client lifecycle state is 1.

            NOTE(review): state 1 presumably means 'services running';
            confirm against MPICommandClient. The spelling
            'get_lifecyle_state' (sic) is the name exposed by the client
            API, so it must not be changed here.
            """

            # Direct comparison instead of the verbose
            # if/return True/return False pattern.
            return self.__command_client.get_lifecyle_state() == 1

        def init_cluster(self, clusterfile=None, project=None):
            """Start the cluster.

            clusterfile and project are accepted for backward
            compatibility and are not used here.
            """

            # NOTE: In the MPI framework the clusterfile is processed by mpirun
            #       So it is not necessary to process and validate clusterfile here
            self.start_cluster()

        def start_cluster(self):
            """Start the inner cluster services."""

            # TODO: This should set OMP_NUM_THREADS as well
            self._cluster.start_cluster()

        def stop_cluster(self):
            """Stop the inner cluster services."""

            self._cluster.stop_cluster()

        def do_and_record(self, cmd, id=None, group='', subMS=''):
            """Submit cmd to the engines given by id (non-blocking).

            group and subMS are accepted for interface compatibility and
            are not used by this implementation. Returns the job Id(s).
            """

            jobId = self._cluster.odo(cmd, id)
            return jobId

        def get_engine_store(self, id):
            """Return the current working directory of engine id."""

            # Blocking execution; the result payload holds os.getcwd().
            res = self._cluster.execute('os.getcwd()', id)

            return res[0]['ret']

        def get_status(self):
            """Print and return a table describing every command request.

            Rows are grouped by queue status: running jobs first, then the
            ones holding the queue, then timed-out and processed ones. The
            first row of the returned list is the column header.
            """

            command_request_list = self._cluster.get_command_request_list()
            command_response_list = self._cluster.get_command_response_list()
            server_status = self._cluster.get_server_status()

            # Show first jobs running, then the ones holding the queue and finally the ones already processed
            status_list = [
                'request sent', 'holding queue', 'timeout', 'response received'
            ]

            # Generate job status table (header row first)
            job_status_list = [[
                'JOB ID', 'SERVER', 'HOSTNAME', 'QUEUE STATUS', 'COMMAND',
                'ELAPSED TIME (s)', 'EXECUTION STATUS'
            ]]
            for status in status_list:
                for jobId in command_request_list:

                    # Only emit rows for the status group being processed
                    if command_request_list[jobId]['status'] != status:
                        continue

                    # Get server to access info from the server status
                    server = command_request_list[jobId]['server']
                    hostname = server_status[server]['processor']

                    # Create job status info
                    job_status = [
                        str(jobId),
                        str(server),
                        hostname,
                        command_request_list[jobId]['status'],
                        command_request_list[jobId]['command'],
                    ]

                    # Add run time: depends on whether the job is queued,
                    # still running, or already finished.
                    elapsed_time = ''
                    if status == status_list[1]:  # holding queue
                        elapsed_time = 'N/A'
                    elif status in (status_list[0], status_list[2]):
                        # request sent / timeout: measure from now
                        start_time = server_status[server][
                            'command_start_time']
                        if start_time is not None:
                            elapsed_time = "%.2f" % (time.time() - start_time)
                        else:
                            elapsed_time = 'unset'
                    elif status == status_list[3]:  # 'response received'
                        start_time = command_response_list[jobId][
                            'command_start_time']
                        stop_time = command_response_list[jobId][
                            'command_stop_time']
                        if start_time is not None and stop_time is not None:
                            elapsed_time = "%.2f" % (stop_time - start_time)
                        else:
                            elapsed_time = 'unset'

                    job_status.append(elapsed_time)

                    # Add job execution status
                    execution_status = ''
                    if status in (status_list[0], status_list[1]):
                        # request sent / holding queue
                        execution_status = 'pending'
                    elif status == status_list[2]:  # timeout
                        execution_status = 'timeout'
                    elif status == status_list[3]:  # response received
                        if command_response_list[jobId]['successful']:
                            ret = command_response_list[jobId]['ret']
                            if isinstance(ret, bool):
                                # Spell out boolean results explicitly
                                if ret is True:
                                    execution_status = 'completed - True'
                                else:
                                    execution_status = 'completed - False'
                            else:
                                execution_status = 'completed'
                        else:
                            execution_status = 'exception raised'

                    job_status.append(execution_status)

                    # Append job status info to list
                    job_status_list.append(job_status)

            # Print job status table
            MPIInterface.print_table(job_status_list)

            # Return job status table
            return job_status_list

        def set_log_level(self, log_level):
            """Forward log_level to the command client."""

            self.__command_client.set_log_level(log_level)
Ejemplo n.º 6
0
 def __init__(self):
     """Create the command and monitor clients used by this object."""

     # Client used to submit command requests to the MPI servers.
     self.__command_client = MPICommandClient()
     # Client used to query per-server status information.
     self.__monitor_client = MPIMonitorClient()
Ejemplo n.º 7
0
    class __MPIInterfaceCoreImpl:
        """ Implementation of the MPIInterfaceCore singleton interface """

        def __init__(self):
            """Create the command and monitor clients used by every method."""
            self.__command_client = MPICommandClient()
            self.__monitor_client = MPIMonitorClient()

        def start_cluster(self, cl_file=None):
            """Configure the log mode and start the command services.

            cl_file is accepted for interface compatibility; it is not
            used by this implementation.
            """
            client = self.__command_client
            client.set_log_mode(MPIInterface._log_mode)
            client.start_services()

        def stop_cluster(self):
            """Stop the command client services."""
            self.__command_client.stop_services()

        def get_engines(self):
            """Return the list of MPI server ranks (engine Ids)."""
            return MPIEnvironment.mpi_server_rank_list()

        def get_nodes(self):
            """Return the distinct hostnames the MPI servers run on."""
            # One hostname per server rank, de-duplicated via a set.
            hostnames = [
                self.__monitor_client.get_server_status_keyword(rank,
                                                                'processor')
                for rank in MPIEnvironment.mpi_server_rank_list()
            ]
            return list(set(hostnames))

        def pgc(self, commands, block=True):
            """Execute commands on the engines, in one of two modes:

            - When ``commands`` is a dictionary, execute each value as a
              command taking its key as the target server (equivalent to
              several calls to odo/execute).
            - When ``commands`` is a single command, execute it on all
              engines.
            """
            if isinstance(commands, dict):
                # Spawn one non-blocking job per (server, command) pair
                pending = []
                for server, cmd in commands.items():
                    job = self.__command_client.push_command_request(
                        cmd, block=False, target_server=server)
                    pending.append(job[0])
                # Collect the responses, honoring the caller's blocking mode
                return self.__command_client.get_command_response(
                    pending, block=block, verbose=True)

            # Single command: broadcast it to every server
            return self.__command_client.push_command_request(
                commands,
                block=block,
                target_server=MPIEnvironment.mpi_server_rank_list())

        def odo(self, job, nodes):
            """Execute a job on a subset of engines in non-blocking mode"""
            return self.__command_client.push_command_request(
                job, block=False, target_server=nodes)

        def execute(self, job, nodes):
            """Execute a job on a subset of engines in blocking mode"""
            return self.__command_client.push_command_request(
                job, block=True, target_server=nodes)

        def push(self, variables, targets=None):
            """Set variables in a sub-set of engines"""
            # None or 'all' addresses every server; otherwise use the
            # caller-supplied iterable of server Ids.
            if targets is None or targets == 'all':
                target_server = MPIEnvironment.mpi_server_rank_list()
            else:
                target_server = list(targets)

            return self.__command_client.push_command_request(
                "push",
                block=True,
                target_server=target_server,
                parameters=dict(variables))

        def pull(self, varname="", targets=None):
            """Retrieve a variable from a sub-set of engines"""
            # Blocking pull from either all servers or the given targets
            if targets is None:
                responses = self.pgc(varname)
            else:
                responses = self.execute(varname, nodes=targets)

            # Map server Id -> pulled value
            return {item['server']: item['ret'] for item in responses}

        def check_job(self, jobId, verbose=True):
            """Check the status of a non-blocking job"""
            requested = list(jobId)
            responses = self.__command_client.get_command_response(
                requested, block=False, verbose=verbose)

            # Separate failures from completed job Ids
            errors = []
            done = []
            for response in responses:
                if response['successful']:
                    done.append(response['id'])
                else:
                    errors.append(
                        "Exception executing command in server %s: %s" %
                        (response['server'], response['traceback']))

            # Re-throw aggregated exception
            if errors:
                error_msg = "\n".join(errors)
                casalog.post(error_msg, "SEVERE",
                             "MPIInterfaceCore::check_job")
                raise Exception(error_msg)

            # Completed only when every requested job has responded
            return all(jid in done for jid in requested)

        def get_server_status(self):
            """Return the server status structure from the command client."""
            return self.__command_client.get_server_status()

        def get_command_request_list(self):
            """Return the command request list from the command client."""
            return self.__command_client.get_command_request_list()

        def get_command_response_list(self):
            """Return the command response list from the command client."""
            return self.__command_client.get_command_response_list()
Ejemplo n.º 8
0
    class __MPIInterfaceImpl:
        """ Implementation of the MPIInterface singleton interface """

        def __init__(self):
            """Create the cluster handle and the direct command client."""

            # Reference to inner cluster object (equivalent to parallel_go)
            self._cluster = MPIInterfaceCore()

            # Direct reference to MPICommandClient for methods not resorting to the inner cluster
            self.__command_client = MPICommandClient()

        def isClusterRunning(self):
            """Return True when the command client lifecycle state is 1.

            NOTE(review): state 1 presumably means 'services running';
            confirm against MPICommandClient. The spelling
            'get_lifecyle_state' (sic) is the name exposed by the client
            API, so it must not be changed here.
            """

            # Direct comparison instead of the verbose
            # if/return True/return False pattern.
            return self.__command_client.get_lifecyle_state() == 1

        def init_cluster(self, clusterfile=None, project=None):
            """Start the cluster.

            clusterfile and project are accepted for backward
            compatibility and are not used here.
            """

            # NOTE: In the MPI framework the clusterfile is processed by mpirun
            #       So it is not necessary to process and validate clusterfile here
            self.start_cluster()

        def start_cluster(self):
            """Start the inner cluster services."""

            # TODO: This should set OMP_NUM_THREADS as well
            self._cluster.start_cluster()

        def stop_cluster(self):
            """Stop the inner cluster services."""

            self._cluster.stop_cluster()

        def do_and_record(self, cmd, id=None, group='', subMS=''):
            """Submit cmd to the engines given by id (non-blocking).

            group and subMS are accepted for interface compatibility and
            are not used by this implementation. Returns the job Id(s).
            """

            jobId = self._cluster.odo(cmd, id)
            return jobId

        def get_engine_store(self, id):
            """Return the current working directory of engine id."""

            # Blocking execution; the result payload holds os.getcwd().
            res = self._cluster.execute('os.getcwd()', id)

            return res[0]['ret']

        def get_status(self):
            """Print and return a table describing every command request.

            Rows are grouped by queue status: running jobs first, then the
            ones holding the queue, then timed-out and processed ones. The
            first row of the returned list is the column header.
            """

            command_request_list = self._cluster.get_command_request_list()
            command_response_list = self._cluster.get_command_response_list()
            server_status = self._cluster.get_server_status()

            # Show first jobs running, then the ones holding the queue and
            # finally the ones already processed
            status_list = [
                'request sent', 'holding queue', 'timeout', 'response received'
            ]

            # Generate job status table (header row first)
            job_status_list = [[
                'JOB ID', 'SERVER', 'HOSTNAME', 'QUEUE STATUS', 'COMMAND',
                'ELAPSED TIME (s)', 'EXECUTION STATUS'
            ]]
            for status in status_list:
                for jobId in command_request_list:

                    # Only emit rows for the status group being processed
                    if command_request_list[jobId]['status'] != status:
                        continue

                    # Get server to access info from the server status
                    server = command_request_list[jobId]['server']
                    hostname = server_status[server]['processor']

                    # Create job status info
                    job_status = [
                        str(jobId),
                        str(server),
                        hostname,
                        command_request_list[jobId]['status'],
                        command_request_list[jobId]['command'],
                    ]

                    # Add run time: depends on whether the job is queued,
                    # still running, or already finished.
                    elapsed_time = ''
                    if status == status_list[1]:  # holding queue
                        elapsed_time = 'N/A'
                    elif status in (status_list[0], status_list[2]):
                        # request sent / timeout: measure from now
                        start_time = server_status[server][
                            'command_start_time']
                        if start_time is not None:
                            elapsed_time = "%.2f" % (time.time() - start_time)
                        else:
                            elapsed_time = 'unset'
                    elif status == status_list[3]:  # 'response received'
                        start_time = command_response_list[jobId][
                            'command_start_time']
                        stop_time = command_response_list[jobId][
                            'command_stop_time']
                        if start_time is not None and stop_time is not None:
                            elapsed_time = "%.2f" % (stop_time - start_time)
                        else:
                            elapsed_time = 'unset'

                    job_status.append(elapsed_time)

                    # Add job execution status
                    execution_status = ''
                    if status in (status_list[0], status_list[1]):
                        # request sent / holding queue
                        execution_status = 'pending'
                    elif status == status_list[2]:  # timeout
                        execution_status = 'timeout'
                    elif status == status_list[3]:  # response received
                        if command_response_list[jobId]['successful']:
                            ret = command_response_list[jobId]['ret']
                            if isinstance(ret, bool):
                                # PEP 8: use identity, not '== True', for
                                # the bool singletons.
                                if ret is True:
                                    execution_status = 'completed - True'
                                else:
                                    execution_status = 'completed - False'
                            else:
                                execution_status = 'completed'
                        else:
                            execution_status = 'exception raised'

                    job_status.append(execution_status)

                    # Append job status info to list
                    job_status_list.append(job_status)

            # Print job status table
            MPIInterface.print_table(job_status_list)

            # Return job status table
            return job_status_list

        def set_log_level(self, log_level):
            """Forward log_level to the command client."""

            self.__command_client.set_log_level(log_level)