Beispiel #1
0
        def __init__(self, start_services=True):
            """Initialize the per-server status records and the service control state.

            One status record is created for each rank returned by
            ``MPIEnvironment.mpi_server_rank_list()``.

            :param start_services: when true, ``self.start_services()`` is called
                after all state has been initialized.
            """

            # One status record per server rank, every field at its initial value
            self.__server_status_list = {}
            for server_rank in MPIEnvironment.mpi_server_rank_list():
                self.__server_status_list[server_rank] = {
                    'rank': server_rank,
                    'processor': None,
                    'pid': None,
                    'busy': False,
                    'command': None,
                    'command_start_time': None,
                    'pong_pending': False,
                    'ping_time': None,
                    'pong_time': None,
                    'timeout': False,
                }

            # Monitor status service: on/running flags and thread handle
            self.__monitor_status_service_on = self.__monitor_status_service_running = False
            self.__monitor_status_service_thread = None

            # Ping status response handler service: on/running flags and thread handle
            self.__ping_status_response_handler_service_on = \
                self.__ping_status_response_handler_service_running = False
            self.__ping_status_response_handler_service_thread = None

            # Communicator used by this object
            self.__communicator = MPICommunicator()

            # Optionally bring the services up right away
            if start_services:
                self.start_services()
Beispiel #2
0
        def __init__(self, start_services=True):
            """Initialize this process' status record and the ping handler service state.

            :param start_services: when true, ``self.start_services()`` is called
                after all state has been initialized.
            """

            # Status record describing this process; it starts out idle
            self.__status = {
                'rank': MPIEnvironment.mpi_processor_rank,
                'processor': MPIEnvironment.hostname,
                'pid': os.getpid(),
                'busy': False,
                'command': None,
                'command_start_time': None,
                'command_stop_time': None,
            }

            # Ping status request handler service: control flags, thread handle,
            # time of the last ping request and client timeout flag
            self.__ping_status_request_handler_service_on = \
                self.__ping_status_request_handler_service_final_round = \
                self.__ping_status_request_handler_service_running = False
            self.__ping_status_request_handler_service_thread = \
                self.__last_ping_status_request_time = None
            self.__client_timeout = False

            # Communicator used by this object
            self.__communicator = MPICommunicator()

            # Optionally bring the services up right away
            if start_services:
                self.start_services()
Beispiel #3
0
 def __init__(self, start_services=False):
     """Initialize command bookkeeping, service state and collaborators.

     Also registers ``self.stop_services`` (with
     ``force_command_request_interruption=True``) as an ``atexit`` handler.

     :param start_services: when true, ``self.start_services()`` is called
         once the collaborators have been instantiated.
     """

     # Life cycle state: 0 -> services not started,
     #                   1 -> services started,
     #                   2 -> stop service signal sent
     self.__life_cycle_state = 0

     # Command request id counter plus request/response tables
     self.__command_request_counter = 1
     self.__command_request_list = {}
     self.__command_response_list = {}

     # Command group response id counter and table
     self.__command_group_response_counter = 1
     self.__command_group_response_list = {}

     # Command response handler service: flags, thread handle and a cleared event
     self.__command_response_handler_service_on = \
         self.__command_response_handler_service_running = False
     self.__command_response_handler_service_thread = None
     response_event = threading.Event()
     self.__command_response_handler_service_event_controller = response_event
     response_event.clear()

     # Command request queue service: queue, flags, thread handle and a cleared event
     self.__command_request_queue = []
     self.__command_request_queue_service_on = \
         self.__command_request_queue_service_running = False
     self.__command_request_queue_service_thread = None
     queue_event = threading.Event()
     self.__command_request_queue_service_event_controller = queue_event
     queue_event.clear()

     # Input queue where jobs are appended for the command request queue
     # service to pick up, together with the lock created for it
     self.__command_request_input_queue = []
     self.__command_request_input_queue_lock = threading.Lock()

     # Collaborators: communicator and monitor client
     # (the monitor client is constructed with False as its only argument)
     self.__communicator = MPICommunicator()
     self.__monitor_client = MPIMonitorClient(False)

     # Optionally bring the services up right away
     if start_services:
         self.start_services()

     # Log mode
     # NOTE(review): assigned after the optional start_services() call above;
     #               confirm start_services() does not read it.
     self.__log_mode = 'unified'

     # Exit handler
     # NOTE: It is not guaranteed that __del__() methods are called
     #       for objects that still exist when the interpreter exits.
     atexit.register(self.stop_services, force_command_request_interruption=True)
Beispiel #4
0
 def __init__(self, start_services=True):
     """Initialize this process' status record and the ping handler service state.

     :param start_services: when true, ``self.start_services()`` is called
         after all state has been initialized.
     """

     # Status record describing this process; it starts out idle
     self.__status = {
         'rank': MPIEnvironment.mpi_processor_rank,
         'processor': MPIEnvironment.hostname,
         'pid': os.getpid(),
         'busy': False,
         'command': None,
         'command_start_time': None,
         'command_stop_time': None,
     }

     # Ping status request handler service: control flags, thread handle,
     # time of the last ping request and client timeout flag
     self.__ping_status_request_handler_service_on = \
         self.__ping_status_request_handler_service_final_round = \
         self.__ping_status_request_handler_service_running = False
     self.__ping_status_request_handler_service_thread = \
         self.__last_ping_status_request_time = None
     self.__client_timeout = False

     # Communicator used by this object
     self.__communicator = MPICommunicator()

     # Optionally bring the services up right away
     if start_services:
         self.start_services()
Beispiel #5
0
 def __init__(self, start_services=True):
     """Initialize the per-server status records and the service control state.

     One status record is created for each rank returned by
     ``MPIEnvironment.mpi_server_rank_list()``.

     :param start_services: when true, ``self.start_services()`` is called
         after all state has been initialized.
     """

     # One status record per server rank, every field at its initial value
     self.__server_status_list = {}
     for server_rank in MPIEnvironment.mpi_server_rank_list():
         self.__server_status_list[server_rank] = {
             'rank': server_rank,
             'processor': None,
             'pid': None,
             'busy': False,
             'command': None,
             'command_start_time': None,
             'pong_pending': False,
             'ping_time': None,
             'pong_time': None,
             'timeout': False,
         }

     # Monitor status service: on/running flags and thread handle
     self.__monitor_status_service_on = self.__monitor_status_service_running = False
     self.__monitor_status_service_thread = None

     # Ping status response handler service: on/running flags and thread handle
     self.__ping_status_response_handler_service_on = \
         self.__ping_status_response_handler_service_running = False
     self.__ping_status_response_handler_service_thread = None

     # Communicator used by this object
     self.__communicator = MPICommunicator()

     # Optionally bring the services up right away
     if start_services:
         self.start_services()
Beispiel #6
0
        def __init__(self, start_services=False):
            """Initialize command bookkeeping, service state and collaborators.

            Also registers ``self.stop_services`` (with
            ``force_command_request_interruption=True``) as an ``atexit`` handler.

            :param start_services: when true, ``self.start_services()`` is called
                once the collaborators have been instantiated.
            """

            # Life cycle state: 0 -> services not started,
            #                   1 -> services started,
            #                   2 -> stop service signal sent
            self.__life_cycle_state = 0

            # Command request id counter plus request/response tables
            self.__command_request_counter = 1
            self.__command_request_list = {}
            self.__command_response_list = {}

            # Command group response id counter and table
            self.__command_group_response_counter = 1
            self.__command_group_response_list = {}

            # Command response handler service: flags, thread handle and a cleared event
            self.__command_response_handler_service_on = \
                self.__command_response_handler_service_running = False
            self.__command_response_handler_service_thread = None
            response_event = threading.Event()
            self.__command_response_handler_service_event_controller = response_event
            response_event.clear()

            # Command request queue service: queue, flags, thread handle and a cleared event
            self.__command_request_queue = []
            self.__command_request_queue_service_on = \
                self.__command_request_queue_service_running = False
            self.__command_request_queue_service_thread = None
            queue_event = threading.Event()
            self.__command_request_queue_service_event_controller = queue_event
            queue_event.clear()

            # Input queue where jobs are appended for the command request queue
            # service to pick up, together with the lock created for it
            self.__command_request_input_queue = []
            self.__command_request_input_queue_lock = threading.Lock()

            # Collaborators: communicator and monitor client
            # (the monitor client is constructed with False as its only argument)
            self.__communicator = MPICommunicator()
            self.__monitor_client = MPIMonitorClient(False)

            # Optionally bring the services up right away
            if start_services:
                self.start_services()

            # Log mode
            # NOTE(review): assigned after the optional start_services() call above;
            #               confirm start_services() does not read it.
            self.__log_mode = "unified"

            # Exit handler
            # NOTE: It is not guaranteed that __del__() methods are called
            #       for objects that still exist when the interpreter exits.
            atexit.register(self.stop_services, force_command_request_interruption=True)
Beispiel #7
0
 def __init__(self, start_services=False):
     """Initialize service state, collaborators, log file handle and frame buffer state.

     Opens the file named by ``casalog.logfile()`` in append mode and keeps
     the handle on the instance.

     :param start_services: when true, ``self.start_services()`` is called
         after all state has been initialized.
     """

     # Command request handler service: on/running flags and thread handle
     self.__command_request_handler_service_on = \
         self.__command_request_handler_service_running = False
     self.__command_request_handler_service_thread = None

     # Collaborators: communicator and monitor server
     # (the monitor server is constructed with False as its only argument)
     self.__communicator = MPICommunicator()
     self.__monitor_server = MPIMonitorServer(False)

     # Log file handle, opened for appending
     logfile_path = casalog.logfile()
     self.__logfile_descriptor = open(logfile_path, 'a')

     # Virtual frame buffer: nothing deployed yet
     self.__virtual_frame_buffer_port = self.__virtual_frame_buffer_process = None

     # Optionally bring the services up right away
     if start_services:
         self.start_services()
Beispiel #8
0
        def __init__(self, start_services=False):
            """Initialize service state, collaborators, log file handle and frame buffer state.

            Opens the file named by ``casalog.logfile()`` in append mode and keeps
            the handle on the instance.

            :param start_services: when true, ``self.start_services()`` is called
                after all state has been initialized.
            """

            # Command request handler service: on/running flags and thread handle
            self.__command_request_handler_service_on = \
                self.__command_request_handler_service_running = False
            self.__command_request_handler_service_thread = None

            # Collaborators: communicator and monitor server
            # (the monitor server is constructed with False as its only argument)
            self.__communicator = MPICommunicator()
            self.__monitor_server = MPIMonitorServer(False)

            # Log file handle, opened for appending
            logfile_path = casalog.logfile()
            self.__logfile_descriptor = open(logfile_path, "a")

            # Virtual frame buffer: nothing deployed yet
            self.__virtual_frame_buffer_port = self.__virtual_frame_buffer_process = None

            # Optionally bring the services up right away
            if start_services:
                self.start_services()
Beispiel #9
0
    class __MPICommandServerImpl:
        """ Implementation of the MPICommandServer singleton interface """

        def __init__(self, start_services=False):
            """Initialize service state, collaborators, log file handle and frame buffer state.

            Opens the file named by ``casalog.logfile()`` in append mode and keeps
            the handle on the instance.

            :param start_services: when true, ``self.start_services()`` is called
                after all state has been initialized.
            """

            # Command request handler service: on/running flags and thread handle
            self.__command_request_handler_service_on = \
                self.__command_request_handler_service_running = False
            self.__command_request_handler_service_thread = None

            # Collaborators: communicator and monitor server
            # (the monitor server is constructed with False as its only argument)
            self.__communicator = MPICommunicator()
            self.__monitor_server = MPIMonitorServer(False)

            # Log file handle, opened for appending
            logfile_path = casalog.logfile()
            self.__logfile_descriptor = open(logfile_path, "a")

            # Virtual frame buffer: nothing deployed yet
            self.__virtual_frame_buffer_port = self.__virtual_frame_buffer_process = None

            # Optionally bring the services up right away
            if start_services:
                self.start_services()

        ################################################################################################################
        # Private methods ##############################################################################################
        ################################################################################################################

        def __command_request_handler_service(self):
            """Service loop: receive command requests, run them and send back responses.

            The loop runs for as long as ``__command_request_handler_service_on`` is true.
            ``__command_request_handler_service_running`` is set on entry and cleared on exit.

            Each iteration probes the communicator for a command request. If one is
            received it is run according to ``command_request["mode"]`` ("eval", "exec" or
            "push"), and a response built from a copy of the request plus timing, result
            and traceback fields is sent back. When no request was received the loop sleeps
            for ``MPIEnvironment.mpi_command_request_handler_service_sleep_time``.

            Exceptions from probing, receiving, executing and sending are logged through
            ``casalog`` rather than raised.

            NOTE(review): the command text is passed to ``eval``/``exec`` as received from
            the communicator; this assumes requests come from a trusted peer -- confirm.
            """

            casalog_call_origin = "MPICommandServer::command_request_handler_service"

            # Mark service as running
            self.__command_request_handler_service_running = True

            while self.__command_request_handler_service_on:

                # First check if there is a command request msg available
                msg_available = False
                try:
                    msg_available = self.__communicator.command_request_probe()
                except Exception as instance:
                    casalog.post(
                        "Exception checking if command request msg is available: %s" % str(instance),
                        "SEVERE",
                        casalog_call_origin,
                    )
                    msg_available = False

                # Then receive command request msg
                msg_received = False
                if msg_available:
                    try:
                        command_request = self.__communicator.command_request_recv()
                        casalog.post(
                            "Received command request msg: %s" % command_request["command"],
                            MPIEnvironment.command_handling_log_level,
                            casalog_call_origin,
                        )
                        msg_received = True
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception receiving command request msg: %s" % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                        msg_received = False

                # Finally process command request and send back response
                if msg_received:

                    # Start timer
                    command_start_time = time.time()

                    # Update server state
                    self.__monitor_server.set_status("busy", True)
                    self.__monitor_server.set_status("command", command_request["command"])
                    self.__monitor_server.set_status("command_start_time", command_start_time)
                    # Get command request id
                    command_request_id = command_request["id"]
                    # Prepare command response (shallow copy of the request, extended below)
                    command_response = dict(command_request)

                    # Set command start time
                    command_response["command_start_time"] = command_start_time

                    # Execute/Evaluate command request
                    try:
                        # Add dict-defined parameters to globals
                        # (this makes them visible by name to the eval/exec calls below)
                        if isinstance(command_request["parameters"], dict):
                            globals().update(command_request["parameters"])

                        # Execute command
                        # NOTE(review): a mode other than "eval", "exec" or "push" matches no
                        # branch; "ret" is then not set here, yet the response is still
                        # marked successful -- confirm this is intended.
                        if command_request["mode"] == "eval":
                            casalog.post(
                                "Going to evaluate command request with id# %s as an expression via eval: %s"
                                % (str(command_request_id), str(command_request["command"])),
                                MPIEnvironment.command_handling_log_level,
                                casalog_call_origin,
                            )
                            command_response["ret"] = eval(command_request["command"])
                        elif command_request["mode"] == "exec":
                            casalog.post(
                                "Going to execute command request with id# %s as a statement via exec: %s"
                                % (str(command_request_id), command_request["command"]),
                                MPIEnvironment.command_handling_log_level,
                                casalog_call_origin,
                            )
                            code = compile(command_request["command"], casalog_call_origin, "exec")
                            # No explicit namespaces are passed, so the code runs against
                            # this module's globals and this method's local scope
                            exec(code)
                            command_response["ret"] = None
                        elif command_request["mode"] == "push":
                            # Push only injects the parameters into globals (done above);
                            # they are deliberately not removed in the finally clause
                            casalog.post(
                                "Command request with id# %s is a push operation" % str(command_request_id),
                                MPIEnvironment.command_handling_log_level,
                                casalog_call_origin,
                            )
                            command_response["ret"] = None

                        # Set command response parameters
                        command_response["successful"] = True
                        command_response["traceback"] = None

                    # Bare except: any failure of the command is reported in the response
                    # instead of terminating the service loop
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception executing command request via %s: %s"
                            % (command_request["mode"], str(formatted_traceback)),
                            "SEVERE",
                            casalog_call_origin,
                        )
                        # Set command response parameters
                        command_response["successful"] = False
                        command_response["traceback"] = formatted_traceback
                        command_response["ret"] = None
                    # Variables are cleaned from the environment regardless of the result
                    finally:
                        # Clear parameter variables (kept for "push" requests)
                        if isinstance(command_request["parameters"], dict) and command_request["mode"] != "push":
                            for parameter in command_request["parameters"]:
                                try:
                                    del globals()[parameter]
                                except:
                                    formatted_traceback = traceback.format_exc()
                                    casalog.post(
                                        "Exception deleting parameter variable '%s' from global environment: %s"
                                        % (str(parameter), str(formatted_traceback)),
                                        "WARN",
                                        casalog_call_origin,
                                    )

                    # Set command stop time
                    command_stop_time = time.time()
                    command_response["command_stop_time"] = command_stop_time

                    # Update server state
                    self.__monitor_server.set_status("busy", False)
                    self.__monitor_server.set_status("command", None)
                    self.__monitor_server.set_status("command_start_time", None)
                    # Send response back (successful or not)
                    # NOTE(review): the message below says "successfully processed" even when
                    # command_response["successful"] is False.
                    try:
                        casalog.post(
                            "Command request with id %s successfully processed in %s mode, sending back response ..."
                            % (str(command_response["id"]), str(command_response["mode"])),
                            MPIEnvironment.command_handling_log_level,
                            casalog_call_origin,
                        )
                        self.__communicator.command_response_send(response=command_response)
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception sending back command response: %s" % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                else:
                    # Nothing was received this round (no message, or receiving failed):
                    # wait before probing again
                    time.sleep(MPIEnvironment.mpi_command_request_handler_service_sleep_time)

            # Mark service as not running
            self.__command_request_handler_service_running = False

        def __start_command_request_handler_service(self):
            """Start the command request handler service in a new thread.

            If the service is already running a warning is logged and nothing else
            happens. Otherwise the "on" flag is raised, the service loop is launched
            via ``thread.start_new_thread`` and this method polls until the loop has
            marked itself as running.

            :return: ``True`` if the service is running on return, ``False`` if
                launching the thread raised an exception.
            """

            log_origin = "MPICommandServer::start_command_request_handler_service"

            if not self.__command_request_handler_service_running:
                try:
                    self.__command_request_handler_service_on = True
                    self.__command_request_handler_service_thread = thread.start_new_thread(
                        self.__command_request_handler_service, ()
                    )
                except Exception as error:
                    # Roll the flags back so a later start attempt begins from a clean state
                    self.__command_request_handler_service_on = False
                    self.__command_request_handler_service_running = False
                    failure_msg = "Exception starting MPI command request handler service: %s" % str(error)
                    casalog.post(failure_msg, "SEVERE", log_origin)
                    return False

                # Block until the service loop has set its running flag
                while not self.__command_request_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

                casalog.post("MPI command request handler service started", "INFO", log_origin)
            else:
                casalog.post("MPI command request handler service is already running", "WARN", log_origin)

            return True

        def __stop_command_request_handler_service(self):
            """Ask the command request handler service to stop and wait until it has.

            If the service is not running only a warning is logged. Otherwise the
            "on" flag is lowered and this method polls until the service loop has
            cleared its running flag.
            """

            log_origin = "MPICommandServer::stop_command_request_handler_service"

            if self.__command_request_handler_service_running:
                # Signal the service loop to leave after its current iteration
                self.__command_request_handler_service_on = False

                # Block until the service loop has cleared its running flag
                while self.__command_request_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI command request handler service stopped", "INFO", log_origin)
            else:
                casalog.post("MPI command request handler service is not running", "WARN", log_origin)

        ################################################################################################################
        # Public methods ###############################################################################################
        ################################################################################################################

        def start_virtual_frame_buffer(self):
            """Launch an Xvfb server on a free display and point this process at it.

            The display number search starts at this process' pid and skips every number
            for which a ``/tmp/.X<n>-lock`` file exists. A temporary Xauthority file is
            created and a cookie is registered in it with ``xauth``. If Xvfb starts,
            ``DISPLAY`` and ``XAUTHORITY`` are exported through ``os.environ``.

            Failures are logged rather than raised. If Xvfb cannot be started the stored
            process handle is reset to ``None``. Failing to register the cookie, including
            ``xauth`` not being installed, is tolerated.
            """

            casalog_call_origin = "MPICommandServer::start_virtual_frame_buffer"

            # Pick the first display number, starting from our pid, that has no X lock file
            displayport = os.getpid()
            while os.path.exists("/tmp/.X%d-lock" % displayport):
                displayport += 1

            self.__virtual_frame_buffer_port = ":%d" % displayport

            # Kept on the instance: the file is deleted as soon as this object is closed
            self.__xauthfile = tempfile.NamedTemporaryFile()

            try:
                cookie = subprocess.check_output(["mcookie"], universal_newlines=True).strip()
            except Exception:
                # mcookie missing or failing: fall back to a random 32 hex digit cookie
                cookie = str(uuid.uuid4()).replace("-", "")

            # sometimes also works without auth, so accept failure
            # (a non-zero exit status is ignored; a missing xauth binary raises OSError)
            try:
                subprocess.call(
                    ["xauth", "-f", self.__xauthfile.name, "add", self.__virtual_frame_buffer_port, ".", cookie],
                    stdout=self.__logfile_descriptor,
                    stderr=self.__logfile_descriptor,
                )
            except OSError:
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception registering xauth cookie for virtual frame buffer at %s: %s"
                    % (self.__virtual_frame_buffer_port, str(formatted_traceback)),
                    "WARN",
                    casalog_call_origin,
                )

            try:
                self.__virtual_frame_buffer_process = subprocess.Popen(
                    ["Xvfb", self.__virtual_frame_buffer_port, "-auth", self.__xauthfile.name],
                    stdout=self.__logfile_descriptor,
                    stderr=self.__logfile_descriptor,
                    shell=False,
                )
                # Only advertise the display once the server process has been spawned
                os.environ["DISPLAY"] = self.__virtual_frame_buffer_port
                os.environ["XAUTHORITY"] = self.__xauthfile.name
                casalog.post(
                    "Deployed virtual frame buffer at %s with pid %s"
                    % (self.__virtual_frame_buffer_port, str(self.__virtual_frame_buffer_process.pid)),
                    "INFO",
                    casalog_call_origin,
                )
            except Exception:
                self.__virtual_frame_buffer_process = None
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception deploying virtual frame buffer at %s: %s"
                    % (self.__virtual_frame_buffer_port, str(formatted_traceback)),
                    "SEVERE",
                    casalog_call_origin,
                )

        def stop_virtual_frame_buffer(self):
            """Terminate the Xvfb process, if any, and discard its Xauthority file.

            If no process handle is stored a warning is logged. The xauth entry for the
            display is then removed and the temporary Xauthority file is closed, but only
            if ``start_virtual_frame_buffer()`` created one. Calling this method before
            any start, or more than once, is therefore safe. Failures are logged rather
            than raised.
            """

            casalog_call_origin = "MPICommandServer::stop_virtual_frame_buffer"

            if self.__virtual_frame_buffer_process is not None:
                try:
                    self.__virtual_frame_buffer_process.terminate()
                    casalog.post(
                        "Virtual frame buffer deployed at %s with pid %s successfully shutdown"
                        % (self.__virtual_frame_buffer_port, str(self.__virtual_frame_buffer_process.pid)),
                        "DEBUG",
                        casalog_call_origin,
                    )
                    self.__virtual_frame_buffer_process = None
                except Exception:
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception shutting down virtual frame buffer deployed at %s with pid %s: %s"
                        % (
                            self.__virtual_frame_buffer_port,
                            str(self.__virtual_frame_buffer_process.pid),
                            str(formatted_traceback),
                        ),
                        "SEVERE",
                        casalog_call_origin,
                    )
            else:
                casalog.post("Virtual frame buffer not deployed", "WARN", casalog_call_origin)

            # The Xauthority file attribute only exists once start_virtual_frame_buffer()
            # has run, and is reset below after it has been cleaned up
            try:
                xauthfile = self.__xauthfile
            except AttributeError:
                xauthfile = None
            if xauthfile is None:
                return

            try:
                subprocess.call(
                    ["xauth", "-f", xauthfile.name, "remove", self.__virtual_frame_buffer_port],
                    stdout=self.__logfile_descriptor,
                    stderr=self.__logfile_descriptor,
                )
            except OSError:
                # e.g. xauth is not installed; the file is discarded below anyway
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception removing xauth entry for virtual frame buffer at %s: %s"
                    % (self.__virtual_frame_buffer_port, str(formatted_traceback)),
                    "WARN",
                    casalog_call_origin,
                )
            finally:
                # Closing the temporary file deletes it; forget it so that a repeated
                # stop does not run xauth against the stale file name
                xauthfile.close()
                self.__xauthfile = None

        def start_services(self):
            """Start the monitor server services, the command request handler service
            and the virtual frame buffer, in that order."""

            # Monitor server first, then the command handler, then the virtual frame buffer
            self.__monitor_server.start_services()
            self.__start_command_request_handler_service()
            self.start_virtual_frame_buffer()

        def stop_services(self, force_command_request_interruption=False):
            """Close the log file handle and stop the services.

            The log file handle is closed (and forgotten) first, then the monitor
            server services are stopped.

            :param force_command_request_interruption: when true, the command
                request handler service is not asked to stop (and therefore not
                waited for); when false it is stopped as well.
            """

            # Release the log file handle, if it is still open
            logfile = self.__logfile_descriptor
            if logfile is not None:
                logfile.close()
                self.__logfile_descriptor = None

            self.__monitor_server.stop_services()

            # In force mode the command request handler service is left alone
            if force_command_request_interruption:
                return

            self.__stop_command_request_handler_service()

        def serve(self):

            casalog_call_origin = "MPICommandServer::serve"

            # First start command and ping status services
            casalog.post("Starting services...", "INFO", casalog_call_origin)
            self.start_services()

            # Notify to MPICommandClient that service is up and running
            self.__communicator.control_service_response_send(response=self.__monitor_server.get_status())

            # Keep serving until a stop signal service is received
            control_service_request = {}
            stop_service_requested = False
            while (not stop_service_requested) and (not self.__monitor_server.get_client_timeout()):

                # Check if there is an incoming control service msg
                msg_available = False
                try:
                    msg_available = self.__communicator.control_service_request_probe()
                except:
                    msg_available = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception checking if control service msg is available: %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )

                # Notify to MPICommandClient that control signal has been processed
                if msg_available:

                    # Receive control service msg
                    msg_received = False
                    control_service_request = {}
                    try:
                        control_service_request = self.__communicator.control_service_request_recv()
                        msg_received = True
                    except:
                        msg_received = False
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception receiving control service msg: %s" % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                        continue

                    # Process control service msg
                    cmd = None
                    send_response = False
                    if msg_received:
                        try:
                            cmd = control_service_request["command"]
                            send_response = control_service_request["send_response"]
                            code = compile(cmd, casalog_call_origin, "exec")
                            exec(code)
                            casalog.post(
                                "Control signal %s successfully handled by server %s"
                                % (str(cmd), str(MPIEnvironment.mpi_processor_rank)),
                                "INFO",
                                casalog_call_origin,
                            )
                        except:
                            formatted_traceback = traceback.format_exc()
                            casalog.post(
                                "Exception handling control signal command %s in server %s: %s"
                                % (
                                    str(control_service_request),
                                    str(MPIEnvironment.mpi_processor_rank),
                                    str(formatted_traceback),
                                ),
                                "SEVERE",
                                casalog_call_origin,
                            )

                    # Notify to MPICommandClient that control signal has been processed
                    if send_response:
                        try:
                            self.__communicator.control_service_response_send(
                                response=self.__monitor_server.get_status()
                            )
                        except:
                            formatted_traceback = traceback.format_exc()
                            casalog.post(
                                "Exception sending response to control signal command %s in server %s: %s"
                                % (str(cmd), str(MPIEnvironment.mpi_processor_rank), str(formatted_traceback)),
                                "SEVERE",
                                casalog_call_origin,
                            )

                time.sleep(MPIEnvironment.mpi_stop_service_sleep_time)

            # Process stop service request
            if stop_service_requested:

                # Check if force mode is needed
                force_command_request_interruption = control_service_request["force_command_request_interruption"]
                finalize_mpi_environment = control_service_request["finalize_mpi_environment"]
                busy = self.__monitor_server.get_status("busy")
                if force_command_request_interruption and busy:
                    casalog.post(
                        "force-stop service signal received, stopping services, "
                        + "command request handler service will be interrupted...",
                        "INFO",
                        casalog_call_origin,
                    )
                else:
                    force_command_request_interruption = False
                    casalog.post("stop service signal received, stopping services...", "INFO", casalog_call_origin)
            else:
                force_command_request_interruption = True
                casalog.post(
                    "client timeout, forcing disconnection, " + "command request handler service will be interrupted..",
                    "INFO",
                    casalog_call_origin,
                )

            # Stop services
            self.stop_services(force_command_request_interruption)

            # Finalize MPI environment
            if finalize_mpi_environment:
                try:
                    casalog.post("Going to finalize MPI environment", "INFO", casalog_call_origin)
                    MPIEnvironment.finalize_mpi_environment()
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception finalizing MPI environment %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )

            # Exit
            casalog.post("Exiting", "INFO", casalog_call_origin)
Beispiel #10
0
    class __MPIMonitorClientImpl:
        """ Implementation of the MPIMonitorClient singleton interface """
        def __init__(self, start_services=True):
            """Build the client-side monitoring state for every MPI server.

            Args:
                start_services: If true (the default), ``start_services()`` is
                    called once all state below has been initialised.
            """

            # One status record per server rank, filled with initial values
            self.__server_status_list = {}
            for server_rank in MPIEnvironment.mpi_server_rank_list():
                self.__server_status_list[server_rank] = {
                    'rank': server_rank,
                    'processor': None,
                    'pid': None,
                    'busy': False,
                    'command': None,
                    'command_start_time': None,
                    'pong_pending': False,
                    'ping_time': None,
                    'pong_time': None,
                    'timeout': False,
                }

            # Monitor service: "requested" flag, "running" flag, thread handle
            self.__monitor_status_service_on = self.__monitor_status_service_running = False
            self.__monitor_status_service_thread = None

            # Ping status response handler service: same three pieces of state
            self.__ping_status_response_handler_service_on = False
            self.__ping_status_response_handler_service_running = False
            self.__ping_status_response_handler_service_thread = None

            # Communicator used to send pings and receive their responses
            self.__communicator = MPICommunicator()

            # Optionally bring the services up right away
            if start_services:
                self.start_services()

        ################################################################################################################
        # Private methods ##############################################################################################
        ################################################################################################################

        def __monitor_status_service(self):
            """Heartbeat loop: ping each server and flag the ones that stop answering.

            Runs while ``__monitor_status_service_on`` is set, sleeping for one
            heartbeat interval between rounds. A server with no outstanding ping
            gets a new one; a server whose ping is still unanswered has its check
            counter increased, and is marked as 'timeout' once
            heartbeat * checks exceeds the configured timeout (only while the
            timeout mechanism is enabled).
            """

            origin = "MPIMonitorClient::monitor_status_service"

            # Mark service as running
            self.__monitor_status_service_running = True

            server_ranks = MPIEnvironment.mpi_server_rank_list()

            while self.__monitor_status_service_on:
                for rank in server_ranks:
                    status = self.__server_status_list[rank]

                    if not status['pong_pending']:
                        # Nothing outstanding for this server: send a fresh ping
                        try:
                            self.__communicator.ping_status_request_send(server=rank)
                            status['ping_time'] = time.time()
                            status['pong_pending'] = True
                            status['pong_checks'] = 0
                        except:
                            casalog.post(
                                "Exception sending ping status request to server %s: %s"
                                % (str(rank), str(traceback.format_exc())),
                                "SEVERE", origin)
                        continue

                    # A ping is still unanswered: count this round and estimate
                    # the waiting time from the number of rounds, not the clock
                    status['pong_checks'] += 1
                    waited = (MPIEnvironment.mpi_monitor_status_service_heartbeat
                              * status['pong_checks'])

                    newly_timed_out = (
                        MPIEnvironment.mpi_monitor_status_service_timeout_enabled
                        and waited > MPIEnvironment.mpi_monitor_status_service_timeout
                        and not status['timeout'])
                    if newly_timed_out:
                        # Reported only once per timeout episode
                        casalog.post(
                            "Ping status response from server %s not received "
                            "in the last %ss. Setting its status to 'timeout'"
                            % (str(rank), str(int(waited))),
                            "SEVERE", origin)
                        status['timeout'] = True

                # Sleep before next round
                time.sleep(MPIEnvironment.mpi_monitor_status_service_heartbeat)

            # Mark service as not running
            self.__monitor_status_service_running = False

        def __start_monitor_status_service(self):
            """Start the monitor status loop in a new thread and wait until it is up.

            Returns:
                True when the service is running on return (including the case
                where it was already running), False when starting the thread
                raised.
            """

            origin = "MPIMonitorClient::start_monitor_status_service"

            if self.__monitor_status_service_running:
                casalog.post("MPI monitor status service is already running",
                             "WARN", origin)
                return True

            try:
                # The flag is raised first because the service loop tests it
                self.__monitor_status_service_on = True
                handle = thread.start_new_thread(self.__monitor_status_service, ())
                self.__monitor_status_service_thread = handle
            except:
                failure = traceback.format_exc()
                self.__monitor_status_service_on = False
                self.__monitor_status_service_running = False
                casalog.post(
                    "Exception starting MPI monitor status service: %s" % str(failure),
                    "SEVERE", origin)
                return False

            # Block until the new thread reports itself as running
            while True:
                if self.__monitor_status_service_running:
                    break
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI monitor status service started", "INFO", origin)

            return True

        def __stop_monitor_status_service(self):
            """Ask the monitor status loop to stop and block until it has exited.

            Logs a warning and returns immediately when the service is not
            running. Otherwise clears the service flag and polls until the
            service loop marks itself as no longer running.
            """

            casalog_call_origin = "MPIMonitorClient::stop_monitor_status_service"

            if not self.__monitor_status_service_running:
                # The warning must name the monitor status service itself, not
                # the ping status response handler service
                casalog.post("MPI monitor status service is not running",
                             "WARN", casalog_call_origin)
                return

            # The service loop checks this flag once per round
            self.__monitor_status_service_on = False

            while self.__monitor_status_service_running:
                time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

            casalog.post("MPI monitor status service stopped", "INFO",
                         casalog_call_origin)

        def __ping_status_response_handler_service(self):
            """Receive ping status responses and store them in the server records.

            Runs while ``__ping_status_response_handler_service_on`` is set. Each
            round probes for a response; when one is available it is received and
            the matching server record is updated, otherwise the loop sleeps for
            the configured interval.
            """

            origin = "MPIMonitorClient::ping_status_response_handler_service"

            # Mark service as running
            self.__ping_status_response_handler_service_running = True

            while self.__ping_status_response_handler_service_on:

                # Step 1: is a ping status response waiting?
                response_waiting = False
                try:
                    response_waiting = self.__communicator.ping_status_response_probe()
                except:
                    response_waiting = False
                    casalog.post(
                        "Exception checking if ping status response msg is available: %s"
                        % str(traceback.format_exc()), "SEVERE", origin)

                if not response_waiting:
                    # Nothing to do this round
                    time.sleep(
                        MPIEnvironment.mpi_ping_status_response_handler_service_sleep_time)
                    continue

                # Step 2: receive the response and update the server record
                try:
                    response = self.__communicator.ping_status_response_recv()
                    received_at = time.time()
                    rank = response['rank']
                    reported_command = response['command']
                    status = self.__server_status_list[rank]
                    status['command'] = reported_command
                    status['command_start_time'] = response['command_start_time']
                    status['pong_time'] = received_at
                    status['pong_pending'] = False
                    round_trip = received_at - status['ping_time']
                    # A late answer clears the timeout flag and is reported
                    if status['timeout']:
                        status['timeout'] = False
                        casalog.post(
                            "Ping status response from server %s received after %ss"
                            % (str(rank), str(int(round_trip))), "WARN", origin)
                except:
                    casalog.post(
                        "Exception receiving ping status response msg: %s"
                        % str(traceback.format_exc()), "SEVERE", origin)

            # Mark service as not running
            self.__ping_status_response_handler_service_running = False

        def __start_ping_status_response_handler_service(self):
            """Start the ping status response handler thread and wait until it is up.

            Returns:
                True when the service is running on return (including the case
                where it was already running), False when starting the thread
                raised.
            """

            origin = "MPIMonitorClient::start_ping_status_response_handler_service"

            if self.__ping_status_response_handler_service_running:
                casalog.post(
                    "MPI ping status response handler service is already running",
                    "WARN", origin)
                return True

            try:
                # The flag is raised first because the service loop tests it
                self.__ping_status_response_handler_service_on = True
                handle = thread.start_new_thread(
                    self.__ping_status_response_handler_service, ())
                self.__ping_status_response_handler_service_thread = handle
            except:
                failure = traceback.format_exc()
                self.__ping_status_response_handler_service_on = False
                self.__ping_status_response_handler_service_running = False
                casalog.post(
                    "Exception starting MPI ping status response handler service: %s"
                    % str(failure), "SEVERE", origin)
                return False

            # Block until the new thread reports itself as running
            while True:
                if self.__ping_status_response_handler_service_running:
                    break
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI ping status response handler service started",
                         "INFO", origin)

            return True

        def __stop_ping_status_response_handler_service(self):
            """Ask the ping status response handler to stop and wait for it to exit.

            Logs a warning and does nothing else when the service is not running.
            """

            origin = "MPIMonitorClient::stop_ping_status_response_handler_service"

            if self.__ping_status_response_handler_service_running:
                # The service loop checks this flag on every round
                self.__ping_status_response_handler_service_on = False

                while self.__ping_status_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI ping status response handler service stopped",
                             "INFO", origin)
            else:
                casalog.post(
                    "MPI ping status response handler service is not running",
                    "WARN", origin)

        ################################################################################################################
        # Public methods ###############################################################################################
        ################################################################################################################

        def start_services(self):
            """Start the ping status response handler first, then the monitor."""

            launchers = (
                self.__start_ping_status_response_handler_service,
                self.__start_monitor_status_service,
            )
            for launch in launchers:
                launch()

        def stop_services(self):
            """Stop the monitor first, then the ping status response handler."""

            stoppers = (
                self.__stop_monitor_status_service,
                self.__stop_ping_status_response_handler_service,
            )
            for stop in stoppers:
                stop()

        def get_server_status(self, server=None):
            """Return a copy of the status of one server, or of all servers.

            Args:
                server: Rank of the server to query. When None, the status of
                    every server is returned.

            Returns:
                With ``server=None``, a new dict mapping each rank to its status
                record (a shallow copy: the per-server records themselves are
                not copied). Otherwise a copy of that server's status record, or
                None after logging a warning when the rank is unknown.
            """

            casalog_call_origin = "MPIMonitorClient::get_server_status"

            if server is None:
                return dict(self.__server_status_list)

            # Membership test with 'in': dict.has_key() was removed in Python 3
            if server in self.__server_status_list:
                return dict(self.__server_status_list[server])

            casalog.post("Server n# %s is out of range" % str(server),
                         "WARN", casalog_call_origin)
            return None

        def get_server_status_keyword(self, server, keyword):
            """Return one field of a server's status record.

            Args:
                server: Rank of the server to query.
                keyword: Name of the status field to read.

            Returns:
                The stored value, or None after logging a warning when either
                the rank or the keyword is unknown.
            """

            casalog_call_origin = "MPIMonitorClient::get_server_status_keyword"

            # Membership tests with 'in': dict.has_key() was removed in Python 3
            if server not in self.__server_status_list:
                casalog.post("Server n# %s is out of range" % str(server),
                             "WARN", casalog_call_origin)
                return None

            status = self.__server_status_list[server]
            if keyword not in status:
                casalog.post(
                    "Status keyword %s not defined" % str(keyword), "WARN",
                    casalog_call_origin)
                return None

            return status[keyword]

        def set_server_status_keyword(self, server, keyword, value):
            """Overwrite one existing field of a server's status record.

            Only keywords already present in the record can be set; an unknown
            rank or keyword is reported with a warning and nothing is changed.

            Args:
                server: Rank of the server to update.
                keyword: Name of the status field to overwrite.
                value: New value for the field.
            """

            casalog_call_origin = "MPIMonitorClient::set_server_status_keyword"

            # Membership tests with 'in': dict.has_key() was removed in Python 3
            if server not in self.__server_status_list:
                casalog.post("Server n# %s is out of range" % str(server),
                             "WARN", casalog_call_origin)
                return

            status = self.__server_status_list[server]
            if keyword not in status:
                casalog.post(
                    "Status keyword %s not defined" % str(keyword), "WARN",
                    casalog_call_origin)
                return

            status[keyword] = value

        def get_server_rank_available(self, verbose=False):
            """Return the ranks of servers that are neither busy nor in timeout.

            Args:
                verbose: Accepted but not used by this method.

            Returns:
                A new list of ranks.
            """

            return [
                rank
                for rank, status in self.__server_status_list.items()
                if not status['busy'] and not status['timeout']
            ]

        def get_server_rank_online(self, verbose=False):
            """Return the ranks of servers that are not in timeout status.

            Args:
                verbose: Accepted but not used by this method.

            Returns:
                A new list of ranks.
            """

            return [
                rank
                for rank, status in self.__server_status_list.items()
                if not status['timeout']
            ]

        def get_server_timeout(self):
            """Return the ranks of servers whose 'timeout' flag is exactly True.

            The number of such servers is also posted to the log at INFO level.
            """

            timed_out_ranks = [
                rank
                for rank, status in self.__server_status_list.items()
                if status['timeout'] is True
            ]

            summary = 'Found {} server in timeout status'.format(len(timed_out_ranks))
            casalog.post(summary, "INFO", "MPIMonitorClient::get_server_timeout")
            return timed_out_ranks

        def start_debugging_mode(self):
            """Switch to debugging/development mode.

            Turns off the heart-beat timeout mechanism, which would otherwise
            fire while a debugger is attached to MPI server processes. No
            further servers are flagged as 'timeout' until
            stop_debugging_mode() is called.
            """

            origin = "MPIMonitorClient::start_debugging_mode"

            # The monitor loop reads this switch before flagging a timeout
            MPIEnvironment.mpi_monitor_status_service_timeout_enabled = False
            casalog.post("Started debugging mode. Timeout mechanism disabled.", "INFO", origin)

        def stop_debugging_mode(self):
            """Switch back from debugging/development mode.

            Resets the ping/pong bookkeeping of every server that is not
            flagged as 'timeout', then re-enables the heart-beat timeout
            mechanism.
            """

            origin = "MPIMonitorClient::stop_debugging_mode"

            # Forget outstanding pings so that counting restarts from zero
            for status in self.__server_status_list.values():
                if status['timeout'] is not True:
                    status['pong_pending'] = False
                    status['pong_checks'] = 0

            MPIEnvironment.mpi_monitor_status_service_timeout_enabled = True

            casalog.post("Stopped debugging mode. Timeout mechanism enabled.", "INFO", origin)
Beispiel #11
0
    class __MPICommandServerImpl:
        """ Implementation of the MPICommandServer singleton interface """    
    
        def __init__(self, start_services=False):
            """Set up the command server state.

            Args:
                start_services: If true, call ``start_services()`` at the end
                    of construction. Defaults to False.
            """

            # Command request handler service: "requested" flag, "running"
            # flag, thread handle
            self.__command_request_handler_service_on = False
            self.__command_request_handler_service_running = False
            self.__command_request_handler_service_thread = None

            # Communicator used to receive requests and send responses
            self.__communicator = MPICommunicator()

            # MPIMonitorServer reference, used to publish this server's status
            self.__monitor_server = MPIMonitorServer(False)

            # Log file opened in append mode; passed as stdout/stderr to the
            # subprocesses started by start_virtual_frame_buffer
            self.__logfile_descriptor = open(casalog.logfile(), 'a')

            # No virtual frame buffer deployed yet
            self.__virtual_frame_buffer_port = self.__virtual_frame_buffer_process = None

            # Optionally bring the services up right away
            if start_services:
                self.start_services()
                

        ################################################################################################################            
        # Private methods ##############################################################################################
        ################################################################################################################   
        
        
        def __command_request_handler_service(self):
            """Service loop that receives command requests, runs them and replies.

            Runs while ``__command_request_handler_service_on`` is set. Each
            round probes for a command request; when one is received, the server
            status is set to busy, the command is run according to
            ``command_request['mode']`` ('eval', 'exec' or 'push'), and a
            response built from a copy of the request is sent back whether or
            not the command succeeded. When nothing is received the loop sleeps
            for the configured interval.

            NOTE(review): the command string is taken from the received request
            and passed to eval()/exec() as-is, so this assumes requests come
            from a trusted sender - confirm against the caller.
            """

            casalog_call_origin = "MPICommandServer::command_request_handler_service"

            # Mark service as running
            self.__command_request_handler_service_running = True

            while (self.__command_request_handler_service_on):

                # First check if there is a command request msg available
                msg_available = False
                try:
                    msg_available = self.__communicator.command_request_probe()
                except Exception as instance:
                    casalog.post("Exception checking if command request msg is available: %s"
                                 % str(instance),"SEVERE",casalog_call_origin)
                    msg_available = False

                # Then receive command request msg
                msg_received = False
                if (msg_available):
                    try:
                        command_request = self.__communicator.command_request_recv()
                        casalog.post("Received command request msg: %s"
                                     % command_request['command'],MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        msg_received = True
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception receiving command request msg: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                        msg_received = False

                # Finally process command request and send back response
                if (msg_received):

                    # Start timer
                    command_start_time = time.time()

                    # Update server state
                    self.__monitor_server.set_status('busy',True)
                    self.__monitor_server.set_status('command',command_request['command'])
                    self.__monitor_server.set_status('command_start_time',command_start_time)
                    # Get command request id
                    command_request_id = command_request['id']
                    # Prepare command response (starts as a shallow copy of the request)
                    command_response = dict(command_request)

                    # Set command start time
                    command_response['command_start_time'] = command_start_time

                    # Execute/Evaluate command request
                    try:
                        # Add dict-defined parameters to globals, so the command
                        # can refer to them by name
                        if isinstance(command_request['parameters'],dict):
                            globals().update(command_request['parameters'])

                        # Execute command
                        if command_request['mode']=='eval':
                            casalog.post("Going to evaluate command request with id# %s as an expression via eval: %s"
                                         % (str(command_request_id),str(command_request['command'])),
                                         MPIEnvironment.command_handling_log_level,casalog_call_origin)
                            # The value of the expression becomes the response's 'ret'
                            command_response['ret'] = eval(command_request['command'])
                        elif command_request['mode']=='exec':
                            casalog.post("Going to execute command request with id# %s as a statement via exec: %s"
                                         % (str(command_request_id),command_request['command']),
                                         MPIEnvironment.command_handling_log_level,casalog_call_origin)
                            code = compile(command_request['command'], casalog_call_origin, 'exec')
                            exec(code)
                            command_response['ret'] = None
                        elif command_request['mode']=='push':
                            # Nothing to run: the parameters were already added
                            # to globals above, and the cleanup below skips
                            # 'push' requests, so they stay defined
                            casalog.post("Command request with id# %s is a push operation"
                                         % str(command_request_id),
                                         MPIEnvironment.command_handling_log_level,casalog_call_origin)
                            command_response['ret'] = None

                        # Set command response parameters
                        # NOTE(review): a mode other than eval/exec/push reaches
                        # this point without 'ret' being set - confirm intended
                        command_response['successful'] = True
                        command_response['traceback'] = None

                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception executing command request via %s: %s"
                                     % (command_request['mode'],str(formatted_traceback)),"SEVERE",casalog_call_origin)
                        # Set command response parameters
                        command_response['successful'] = False
                        command_response['traceback']=formatted_traceback
                        command_response['ret']=None
                    # Variables are cleaned from the environment regardless of the result
                    finally:
                        # Clear parameter variables (kept for 'push' requests)
                        if isinstance(command_request['parameters'],dict) and command_request['mode']!='push':
                            for parameter in command_request['parameters']:
                                try:
                                    del globals()[parameter]
                                except:
                                    # A failed deletion is only reported as a warning
                                    formatted_traceback = traceback.format_exc()
                                    casalog.post("Exception deleting parameter variable '%s' from global environment: %s"
                                                 % (str(parameter),str(formatted_traceback)),"WARN",casalog_call_origin)

                    # Set command stop time
                    command_stop_time = time.time()
                    command_response['command_stop_time'] = command_stop_time

                    # Update server state
                    self.__monitor_server.set_status('busy',False)
                    self.__monitor_server.set_status('command',None)
                    self.__monitor_server.set_status('command_start_time',None)
                    # Send response back (successful or not)
                    # NOTE(review): the log message below says "successfully
                    # processed" even when command_response['successful'] is False
                    try:
                        casalog.post("Command request with id %s successfully processed in %s mode, sending back response ..."
                                     % (str(command_response['id']),str(command_response['mode'])),
                                     MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        self.__communicator.command_response_send(response=command_response)
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception sending back command response: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                else:
                    # Nothing received this round: wait before probing again
                    time.sleep(MPIEnvironment.mpi_command_request_handler_service_sleep_time)

            # Mark service as not running
            self.__command_request_handler_service_running = False

            
        def __start_command_request_handler_service(self):
            """Start the command request handler thread and wait until it is up.

            Returns:
                True when the service is running on return (including the case
                where it was already running), False when starting the thread
                raised an Exception.
            """

            origin = "MPICommandServer::start_command_request_handler_service"

            if self.__command_request_handler_service_running:
                casalog.post("MPI command request handler service is already running", "WARN", origin)
                return True

            try:
                # The flag is raised first because the service loop tests it
                self.__command_request_handler_service_on = True
                handle = thread.start_new_thread(self.__command_request_handler_service, ())
                self.__command_request_handler_service_thread = handle
            except Exception as error:
                self.__command_request_handler_service_on = False
                self.__command_request_handler_service_running = False
                casalog.post(
                    "Exception starting MPI command request handler service: %s" % str(error),
                    "SEVERE", origin)
                return False

            # Block until the new thread reports itself as running
            while True:
                if self.__command_request_handler_service_running:
                    break
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI command request handler service started", "INFO", origin)

            return True
        
        
        def __stop_command_request_handler_service(self):
            """Ask the command request handler to stop and wait for it to exit.

            Logs a warning and does nothing else when the service is not running.
            """

            origin = "MPICommandServer::stop_command_request_handler_service"

            if self.__command_request_handler_service_running:
                # The service loop checks this flag on every round
                self.__command_request_handler_service_on = False

                while self.__command_request_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI command request handler service stopped", "INFO", origin)
            else:
                casalog.post("MPI command request handler service is not running", "WARN", origin)
      
            
        ################################################################################################################            
        # Public methods ###############################################################################################
        ################################################################################################################            
            
            
        def start_virtual_frame_buffer(self):
            """Launch an Xvfb server on a free display and point this process at it.

            The display number search starts at this process' PID and skips every
            number whose /tmp/.X<n>-lock file exists. A temporary Xauthority file
            is filled in through xauth, Xvfb is spawned with it, and DISPLAY and
            XAUTHORITY are exported through os.environ. If spawning (or anything
            after it) fails, the process handle is reset to None and the traceback
            is logged.
            """

            origin = "MPICommandServer::start_virtual_frame_buffer"

            # Find a display number that has no X lock file yet
            display_number = os.getpid()
            while os.path.exists('/tmp/.X%d-lock' % display_number):
                display_number += 1
            display = ":%d" % display_number
            self.__virtual_frame_buffer_port = display

            # Temporary Xauthority file, kept open on the instance
            self.__xauthfile = tempfile.NamedTemporaryFile()
            auth_path = self.__xauthfile.name

            # Take the cookie from mcookie when possible, otherwise use a random UUID
            try:
                cookie = subprocess.check_output(['mcookie'], universal_newlines=True).strip()
            except:
                cookie = str(uuid.uuid4()).replace('-', '')

            # Sometimes it also works without auth, so the xauth result is not checked
            xauth_add_cmd = ['xauth', '-f', auth_path, 'add', display, '.', cookie]
            subprocess.call(xauth_add_cmd,
                            stdout=self.__logfile_descriptor,
                            stderr=self.__logfile_descriptor)

            try:
                xvfb_cmd = ['Xvfb', display, '-auth', auth_path]
                self.__virtual_frame_buffer_process = subprocess.Popen(xvfb_cmd,
                                                                       stdout=self.__logfile_descriptor,
                                                                       stderr=self.__logfile_descriptor,
                                                                       shell=False)
                os.environ['DISPLAY'] = display
                os.environ['XAUTHORITY'] = auth_path
                xvfb_pid = str(self.__virtual_frame_buffer_process.pid)
                casalog.post("Deployed virtual frame buffer at %s with pid %s" % (display, xvfb_pid),
                             "INFO",origin)
            except:
                self.__virtual_frame_buffer_process = None
                failure = traceback.format_exc()
                casalog.post("Exception deploying virtual frame buffer at %s: %s" % (display, str(failure)),
                             "SEVERE",origin)
                
                
        def stop_virtual_frame_buffer(self):
            """Terminate the Xvfb process, if any, and drop its xauth entry.

            A warning is logged when no virtual frame buffer process is recorded.
            In every case the display entry is then removed from the temporary
            Xauthority file through xauth and that file is closed.
            """

            origin = "MPICommandServer::stop_virtual_frame_buffer"

            xvfb = self.__virtual_frame_buffer_process
            if xvfb is None:
                casalog.post("Virtual frame buffer not deployed","WARN",origin)
            else:
                try:
                    xvfb.terminate()
                    casalog.post("Virtual frame buffer deployed at %s with pid %s successfully shutdown" %
                                 (self.__virtual_frame_buffer_port, str(xvfb.pid)),
                                 "DEBUG",origin)
                    # Forget the handle only once termination has been requested and logged
                    self.__virtual_frame_buffer_process = None
                except:
                    failure = traceback.format_exc()
                    casalog.post("Exception shutting down virtual frame buffer deployed at %s with pid %s: %s"
                                 % (self.__virtual_frame_buffer_port, str(xvfb.pid), str(failure)),
                                 "SEVERE",origin)

            # Remove the display entry from the Xauthority file, then release the file
            xauth_remove_cmd = ['xauth', '-f', self.__xauthfile.name, 'remove',
                                self.__virtual_frame_buffer_port]
            subprocess.call(xauth_remove_cmd,
                            stdout=self.__logfile_descriptor,
                            stderr=self.__logfile_descriptor)
            self.__xauthfile.close()
            
        def start_services(self):
            """Start, in this order, the monitor server services, the command
            request handler service and the virtual frame buffer."""

            startup_sequence = (
                self.__monitor_server.start_services,
                self.__start_command_request_handler_service,
                self.start_virtual_frame_buffer,
            )
            for launch in startup_sequence:
                launch()
        
        
        def stop_services(self,force_command_request_interruption=False):
            """Close the log file descriptor and stop the server side services.

            force_command_request_interruption -- when True the command request
                handler service is not asked to stop (and therefore not waited for).
            """

            # Close the log file descriptor once and forget it
            log_descriptor = self.__logfile_descriptor
            if log_descriptor is not None:
                log_descriptor.close()
                self.__logfile_descriptor = None

            self.__monitor_server.stop_services()

            if force_command_request_interruption:
                return
            self.__stop_command_request_handler_service()
            
            
        def serve(self):
            """Run the server main loop until a stop is requested or the client times out.

            Starts the services, sends the monitor server status to the client as
            the "up and running" notification, then polls for control service
            requests. Each received request carries a 'command' string that is
            compiled and executed in this method, and a 'send_response' flag that
            decides whether the monitor server status is sent back afterwards.
            When the loop ends the services are stopped and, if requested by the
            stop request, the MPI environment is finalized.
            """

            casalog_call_origin = "MPICommandServer::serve"

            # First start command and ping status services
            casalog.post("Starting services...","INFO",casalog_call_origin)
            self.start_services()

            # Notify to MPICommandClient that service is up and running
            self.__communicator.control_service_response_send(response=self.__monitor_server.get_status())

            # Keep serving until a stop signal service is received
            # NOTE: nothing in this method assigns True to stop_service_requested;
            #       it is presumably set by the executed control command itself,
            #       which relies on exec being able to write to this function's
            #       local variables (Python 2 exec semantics) - TODO confirm.
            control_service_request = {}
            stop_service_requested = False
            while ((not stop_service_requested) and (not self.__monitor_server.get_client_timeout())):

                # Check if there is an incoming control service msg
                msg_available = False
                try:
                    msg_available = self.__communicator.control_service_request_probe()
                except:
                    msg_available = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception checking if control service msg is available: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)

                # Receive and handle the incoming control service msg
                if msg_available:

                    # Receive control service msg
                    msg_received = False
                    control_service_request = {}
                    try:
                        control_service_request = self.__communicator.control_service_request_recv()
                        msg_received = True
                    except:
                        msg_received = False
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception receiving control service msg: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                        continue

                    # Process control service msg
                    cmd = None
                    send_response = False
                    if msg_received:
                        try:
                            cmd = control_service_request['command']
                            send_response = control_service_request['send_response']
                            # SECURITY NOTE: the command text comes from the received
                            # request and is executed as-is; this is only acceptable
                            # while the sender is fully trusted.
                            code = compile(cmd, casalog_call_origin, 'exec')
                            exec(code)
                            casalog.post("Control signal %s successfully handled by server %s"
                                         % (str(cmd),str(MPIEnvironment.mpi_processor_rank)),
                                         "INFO",casalog_call_origin)
                        except:
                            formatted_traceback = traceback.format_exc()
                            casalog.post("Exception handling control signal command %s in server %s: %s"
                                         % (str(control_service_request),
                                            str(MPIEnvironment.mpi_processor_rank),
                                            str(formatted_traceback)),
                                         "SEVERE",casalog_call_origin)

                    # Notify to MPICommandClient that control signal has been processed
                    if send_response:
                        try:
                            self.__communicator.control_service_response_send(response=self.__monitor_server.get_status())
                        except:
                            formatted_traceback = traceback.format_exc()
                            casalog.post("Exception sending response to control signal command %s in server %s: %s"
                                         % (str(cmd),str(MPIEnvironment.mpi_processor_rank),str(formatted_traceback)),
                                         "SEVERE",casalog_call_origin)

                time.sleep(MPIEnvironment.mpi_stop_service_sleep_time)

            # Process stop service request
            if stop_service_requested:

                # Check if force mode is needed
                force_command_request_interruption = control_service_request['force_command_request_interruption']
                finalize_mpi_environment = control_service_request['finalize_mpi_environment']
                busy = self.__monitor_server.get_status('busy')
                if force_command_request_interruption and busy:
                    casalog.post("force-stop service signal received, stopping services, " +
                                 "command request handler service will be interrupted...","INFO",casalog_call_origin)
                else:
                    force_command_request_interruption = False
                    casalog.post("stop service signal received, stopping services...","INFO",casalog_call_origin)
            else:
                force_command_request_interruption = True
                # No stop request was received on a client timeout, so there is no
                # 'finalize_mpi_environment' value to read. Define it here so the
                # check below does not fail on an unbound name; as before, the MPI
                # environment is not finalized on this path.
                finalize_mpi_environment = False
                casalog.post("client timeout, forcing disconnection, " +
                             "command request handler service will be interrupted.." ,"INFO",casalog_call_origin)

            # Stop services
            self.stop_services(force_command_request_interruption)

            # Finalize MPI environment
            if finalize_mpi_environment:
                try:
                    casalog.post("Going to finalize MPI environment","INFO",casalog_call_origin)
                    MPIEnvironment.finalize_mpi_environment()
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception finalizing MPI environment %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)

            # Exit
            casalog.post("Exiting","INFO",casalog_call_origin)
Beispiel #12
0
    class __MPICommandClientImpl:
        """ Implementation of the MPICommandClient singleton interface """

        def __init__(self, start_services=False):
            """Initialise the client state and optionally start the services.

            start_services -- when True, self.start_services() is called once all
                state (including the log mode) has been initialised.
            """

            # Initialize life cycle state
            # 0-> Services not started
            # 1-> Services started
            # 2-> Stop service signal sent
            self.__life_cycle_state = 0

            # Initialize command request id counter and list
            self.__command_request_counter = 1
            self.__command_request_list = {}
            self.__command_response_list = {}

            # Initialize command group response state
            self.__command_group_response_counter = 1
            self.__command_group_response_list = {}

            # Initialize command response handler service state
            self.__command_response_handler_service_on = False
            self.__command_response_handler_service_running = False
            self.__command_response_handler_service_thread = None
            self.__command_response_handler_service_event_controller = threading.Event()
            self.__command_response_handler_service_event_controller.clear()

            # Initialize command request queue service state
            self.__command_request_queue = []
            self.__command_request_queue_service_on = False
            self.__command_request_queue_service_running = False
            self.__command_request_queue_service_thread = None
            self.__command_request_queue_service_event_controller = threading.Event()
            self.__command_request_queue_service_event_controller.clear()

            # Setup a command request input queue to append the jobs
            # to be picked up by the command request queue service
            self.__command_request_input_queue = []
            self.__command_request_input_queue_lock = threading.Lock()

            # Instantiate MPICommunicator reference
            self.__communicator = MPICommunicator()

            # Instantiate MPIMonitorClient reference
            self.__monitor_client = MPIMonitorClient(False)

            # Log mode
            # NOTE: Set before the optional auto-start below, because the start
            #       service signal sent to the servers includes the log mode.
            self.__log_mode = "unified"

            # Automatically start services
            if start_services:
                self.start_services()

            # Register exit handler
            # NOTE: It is not guaranteed that __del__() methods are called
            #       for objects that still exist when the interpreter exits.
            atexit.register(self.stop_services, force_command_request_interruption=True)

        ################################################################################################################
        # Private methods ##############################################################################################
        ################################################################################################################

        def __command_response_handler_service(self):
            """Service loop that receives command responses and records them.

            Runs while the service 'on' flag is set. For each response received
            through the communicator it marks the answering server as not busy,
            stores the response under its command id, updates the status of the
            request and response entries, logs the outcome and, when the request
            belongs to a group, removes it from the group's pending list and sets
            the group event once that list is empty. The 'running' flag is raised
            on entry and lowered on exit.
            """

            casalog_call_origin = "MPICommandClient::command_response_handler_service"

            # Mark service as running
            self.__command_response_handler_service_running = True

            while self.__command_response_handler_service_on:

                # Wait until there are command request whose response is pending
                if len(self.__command_response_list) == len(self.__command_request_list):
                    self.__command_response_handler_service_event_controller.wait()

                # First check if there is a command response msg available
                msg_available = False
                try:
                    msg_available = self.__communicator.command_response_probe()
                except:
                    msg_available = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception checking if command response msg is available: %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )

                # Then receive, store and post command response msg
                if msg_available:
                    try:
                        # Receive command response
                        command_response = self.__communicator.command_response_recv()
                        server = command_response["server"]
                        successful = command_response["successful"]
                        command_id = command_response["id"]
                        # Mark immediately server as not-busy
                        self.__monitor_client.set_server_status_keyword(server, "busy", False)
                        # Store command response
                        self.__command_response_list[command_id] = command_response
                        # If there are no pending command responses clear the event controller
                        if len(self.__command_response_list) == len(self.__command_request_list):
                            self.__command_response_handler_service_event_controller.clear()
                        # Mark command request as received
                        self.__command_request_list[command_id]["status"] = "response received"
                        self.__command_response_list[command_id]["status"] = "response received"
                        # Notify that command response has been received
                        if successful:
                            casalog.post(
                                "Command request with id %s successfully handled by server n# %s"
                                % (str(command_id), str(server)),
                                MPIEnvironment.command_handling_log_level,
                                casalog_call_origin,
                            )
                        else:
                            casalog.post(
                                "Command request with id %s failed in server n# %s with traceback %s"
                                % (str(command_id), str(server), str(command_response["traceback"])),
                                "SEVERE",
                                casalog_call_origin,
                            )
                        # If this request belongs to a group update the group response object
                        # (membership test with 'in' instead of dict.has_key, which
                        # only exists on Python 2)
                        if "group" in self.__command_request_list[command_id]:
                            command_group_response_id = self.__command_request_list[command_id]["group"]
                            self.__command_group_response_list[command_group_response_id]["list"].remove(command_id)
                            # If there are no requests pending from this group send the group response signal
                            if len(self.__command_group_response_list[command_group_response_id]["list"]) == 0:
                                self.__command_group_response_list[command_group_response_id]["event"].set()
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception receiving command request response msg: %s" % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                else:
                    time.sleep(MPIEnvironment.mpi_command_response_handler_service_sleep_time)

            # Mark service as not running
            self.__command_response_handler_service_running = False

        def __start_command_response_handler_service(self):
            """Start the command response handler service in a new thread.

            Returns True when the service is (or already was) running, False when
            the thread could not be started.
            """

            origin = "MPICommandClient::start_command_response_handler_service"

            if self.__command_response_handler_service_running:
                casalog.post("MPI command response handler service is already running", "WARN", origin)
                return True

            try:
                # Raise the 'on' flag first so the service loop keeps going once started
                self.__command_response_handler_service_on = True
                self.__command_response_handler_service_thread = thread.start_new_thread(
                    self.__command_response_handler_service, ()
                )
            except:
                failure = traceback.format_exc()
                self.__command_response_handler_service_on = False
                self.__command_response_handler_service_running = False
                casalog.post(
                    "Exception starting MPI command response handler service: %s" % str(failure),
                    "SEVERE",
                    origin,
                )
                return False
            else:
                # Poll until the service thread reports itself as running
                while not self.__command_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)
                casalog.post("MPI command response handler service started", "INFO", origin)
                return True

        def __stop_command_response_handler_service(self):
            """Switch the command response handler service off and wait until it has exited.

            When the service is not flagged as running a warning is logged and
            nothing else happens.
            """

            origin = "MPICommandClient::stop_command_response_handler_service"

            if self.__command_response_handler_service_running:
                self.__command_response_handler_service_on = False
                # Set the event so a service thread blocked in wait() wakes up
                self.__command_response_handler_service_event_controller.set()
                # Poll until the service loop has lowered its 'running' flag
                while self.__command_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)
                casalog.post("MPI command response handler service stopped", "INFO", origin)
            else:
                casalog.post("MPI command response handler service is not running", "WARN", origin)

        def __command_request_queue_service(self):
            """Service loop that hands queued command requests over to available servers.

            Runs while the service 'on' flag is set. On every pass the jobs in the
            input queue are moved to the service queue, the available servers are
            matched against the queued requests, and every matched request is
            removed from the queue and sent to its server. A failure while sending
            is recorded as an unsuccessful response carrying the traceback. The
            'running' flag is raised on entry and lowered on exit.
            """

            origin = "MPICommandClient::command_request_queue_service"

            # Mark service as running
            self.__command_request_queue_service_running = True

            while self.__command_request_queue_service_on:

                # With both queues empty, block until the event controller is set
                with self.__command_request_input_queue_lock:
                    nothing_queued = (
                        len(self.__command_request_queue) == 0
                        and len(self.__command_request_input_queue) == 0
                    )
                if nothing_queued:
                    self.__command_request_queue_service_event_controller.wait()

                # Move the jobs from the input queue to the service queue, keeping their order
                with self.__command_request_input_queue_lock:
                    self.__command_request_queue.extend(self.__command_request_input_queue)
                    del self.__command_request_input_queue[:]

                # Without available servers just sleep in order not to saturate the system
                available_servers_list = self.__monitor_client.get_server_rank_available()
                if len(available_servers_list) < 1:
                    time.sleep(MPIEnvironment.mpi_command_request_queue_service_sleep_time)
                    continue

                # Ids of the queued requests that can be served right now
                matched_ids = self.__match_available_servers_with_command_requests(available_servers_list)

                for command_request_id in matched_ids:

                    # Find the position of the request in the queue
                    for queue_position, queued_request in enumerate(self.__command_request_queue):
                        if queued_request["id"] == command_request_id:
                            break
                    else:
                        casalog.post(
                            "Command request with id# %s not found" % str(command_request_id),
                            "SEVERE",
                            origin,
                        )
                        continue

                    # Take the request out of the queue
                    command_request = self.__command_request_queue.pop(queue_position)
                    # If the command request queue is now empty clear the event controller
                    if len(self.__command_request_queue) == 0:
                        self.__command_request_queue_service_event_controller.clear()
                    server = command_request["server"]

                    try:
                        # Mark assigned server as busy and set command info in server status
                        self.__monitor_client.set_server_status_keyword(server, "busy", True)
                        self.__monitor_client.set_server_status_keyword(
                            server, "command", command_request["command"]
                        )
                        # Send the request and record that it has been sent
                        self.__communicator.command_request_send(request=command_request, server=server)
                        self.__command_request_list[command_request_id]["status"] = "request sent"
                        casalog.post(
                            "Command request with id# %s sent to server n# %s"
                            % (str(command_request_id), str(server)),
                            MPIEnvironment.command_handling_log_level,
                            origin,
                        )
                    except:
                        failure = traceback.format_exc()
                        # Simulate an unsuccessful response carrying the traceback
                        simulated_response = dict(command_request)
                        simulated_response["successful"] = False
                        simulated_response["traceback"] = failure
                        self.__command_response_list[command_request_id] = simulated_response
                        casalog.post(
                            "Exception sending command request with id# %s to server n# %s: %s"
                            % (str(command_request_id), str(server), str(failure)),
                            "SEVERE",
                            origin,
                        )

            # Mark service as not running
            self.__command_request_queue_service_running = False

        def __match_available_servers_with_command_requests(self, available_servers):
            """Select the queued command requests that can be served right now.

            The command request queue is walked in order. A request without a
            pre-assigned server (its "server" entry is None) always matches; a
            request with a pre-assigned server matches only if that server is
            still free in available_servers. At most len(available_servers)
            requests are selected. Every selected request without a pre-assigned
            server then gets one of the remaining servers written into its
            "server" entry.

            available_servers -- sized container of server ranks that are free;
                it is not modified.

            Returns the list of ids of the selected requests, in queue order. An
            empty available_servers yields an empty list and leaves the queue
            untouched.
            """

            # Nothing can be matched without servers. Without this guard an
            # unassigned request at the head of the queue would be selected and
            # the assignment loop below would pop from an empty list.
            if len(available_servers) == 0:
                return []

            matching_command_request_id_list = []
            unassigned_command_request_id_list = []
            available_servers_left = list(available_servers)
            for command_request in self.__command_request_queue:
                server = command_request["server"]
                command_request_id = command_request["id"]
                # Command request does not have any pre-assigned server
                if server is None:
                    matching_command_request_id_list.append(command_request_id)
                    unassigned_command_request_id_list.append(command_request_id)
                # Assigned server is within the list of available servers
                elif server in available_servers_left:
                    matching_command_request_id_list.append(command_request_id)
                    # Remove server from the list of available servers to avoid multiple assignment
                    available_servers_left.remove(server)

                # Exit loop if we have enough matching requests
                if len(matching_command_request_id_list) >= len(available_servers):
                    break

            # Assign servers to the remaining requests
            for command_request in self.__command_request_queue:
                if command_request["id"] in unassigned_command_request_id_list:
                    command_request["server"] = available_servers_left.pop()

            # Return matching command request id list
            return matching_command_request_id_list

        def __start_command_request_queue_service(self):
            """Start the command request queue service in a new thread.

            Returns True when the service is (or already was) running, False when
            the thread could not be started.
            """

            origin = "MPICommandClient::start_command_request_queue_service"

            if self.__command_request_queue_service_running:
                casalog.post("MPI command request queue service is already running", "WARN", origin)
                return True

            try:
                # Raise the 'on' flag first so the service loop keeps going once started
                self.__command_request_queue_service_on = True
                self.__command_request_queue_service_thread = thread.start_new_thread(
                    self.__command_request_queue_service, ()
                )
            except:
                failure = traceback.format_exc()
                self.__command_request_queue_service_on = False
                self.__command_request_queue_service_running = False
                casalog.post(
                    "Exception starting MPI command request queue service: %s" % str(failure),
                    "SEVERE",
                    origin,
                )
                return False
            else:
                # Poll until the service thread reports itself as running
                while not self.__command_request_queue_service_running:
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)
                casalog.post("MPI command request queue service started", "INFO", origin)
                return True

        def __stop_command_request_queue_service(self):
            """Switch the command request queue service off and wait until it has exited.

            When the service is not flagged as running a warning is logged and
            nothing else happens.
            """

            origin = "MPICommandClient::stop_command_request_queue_service"

            if self.__command_request_queue_service_running:
                self.__command_request_queue_service_on = False
                # Set the event so a service thread blocked in wait() wakes up
                self.__command_request_queue_service_event_controller.set()
                # Poll until the service loop has lowered its 'running' flag
                while self.__command_request_queue_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)
                casalog.post("MPI command request queue service stopped", "INFO", origin)
            else:
                casalog.post("MPI command request queue service is not running", "WARN", origin)

        def __send_start_service_signal(self):
            """Broadcast the start service signal and wait for every server to answer.

            The request carries the signal name, the global casa dictionary and
            the current log mode. For each response received, the processor name
            and PID reported by the server are stored in the monitor client. The
            method returns once a response has arrived from every rank listed by
            MPIEnvironment.mpi_server_rank_list(); there is no timeout.
            """

            casalog_call_origin = "MPICommandClient::send_start_service_signal"

            casalog.post("Sending start service signal to all servers", "INFO", casalog_call_origin)

            # Prepare start service request
            request = {}
            request["signal"] = "start"
            request["casa"] = casa  # The request contains the global casa dictionary to be used by the servers
            request["logmode"] = self.__log_mode

            # Send request to all servers
            self.__communicator.control_service_request_broadcast(request, casalog)

            # Then wait until all servers have handled the signal
            # NOTE: Work on a private list copy, so that removing ranks below
            #       neither alters the object handed out by MPIEnvironment nor
            #       depends on that object supporting remove().
            pending_server_rank_list = list(MPIEnvironment.mpi_server_rank_list())
            while len(pending_server_rank_list) > 0:
                response_available = self.__communicator.control_service_response_probe()
                if response_available:
                    # Receive start service response to know what server has started
                    response = self.__communicator.control_service_response_recv()
                    rank = response["rank"]
                    # Store processor name and PID info in the MPIMonitorClient
                    self.__monitor_client.set_server_status_keyword(rank, "processor", response["processor"])
                    self.__monitor_client.set_server_status_keyword(rank, "pid", response["pid"])
                    # Remove server from list
                    pending_server_rank_list.remove(rank)
                    # Communicate that server response to start service signal has been received
                    casalog.post(
                        "Server with rank %s started at %s with PID %s"
                        % (str(rank), str(response["processor"]), str(response["pid"])),
                        "INFO",
                        casalog_call_origin,
                    )
                else:
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("Received response from all servers to start service signal", "INFO", casalog_call_origin)

        def __send_control_signal(self, signal, check_response=True):
            """Broadcast a control signal to all servers and optionally wait for their replies.

            Args:
                signal: Dict describing the signal; its "command" entry is used in the
                    log messages. The key "send_response" is added to it in place.
                check_response: When True, poll for control responses until every
                    server reported online by the monitor client has answered.

            Any exception raised while broadcasting, querying the online servers or
            probing for a response is logged at SEVERE level and ends the method.
            """
            origin = "MPICommandClient::send_app_control_signal"
            command = signal["command"]

            casalog.post("Sending control signal to all servers: %s" % command, "INFO", origin)

            # Tell the servers whether a response is expected
            signal["send_response"] = check_response

            # Broadcast the request
            try:
                self.__communicator.control_service_request_broadcast(signal, casalog)
            except:
                casalog.post(
                    "Exception sending control signal to all servers: %s" % str(traceback.format_exc()),
                    "SEVERE",
                    origin,
                )
                return

            # Fire-and-forget mode: nothing else to do
            if not check_response:
                casalog.post("Control signal sent to all servers: %s" % command, "INFO", origin)
                return

            # Only servers currently online are awaited
            try:
                awaited_ranks = self.__monitor_client.get_server_rank_online()
            except:
                casalog.post(
                    "Exception checking for response to control signal: %s" % str(traceback.format_exc()),
                    "SEVERE",
                    origin,
                )
                return

            while len(awaited_ranks) > 0:

                try:
                    got_response = self.__communicator.control_service_response_probe()
                except:
                    casalog.post(
                        "Exception getting response to control signal: %s" % str(traceback.format_exc()),
                        "SEVERE",
                        origin,
                    )
                    return

                if not got_response:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)
                    continue

                # Receive control signal response
                rank = self.__communicator.control_service_response_recv()["rank"]

                # CAS-7721: Control signals are sent to all servers, even if not responsive,
                # so a response may come from a server that is not in the awaited list
                if awaited_ranks.count(rank):
                    awaited_ranks.remove(rank)
                    casalog.post(
                        "Server with rank %s handled control signal %s" % (str(rank), command),
                        "DEBUG",
                        origin,
                    )

            casalog.post("Control signal handled by all servers: %s" % command, "INFO", origin)

        def __validate_target_servers(self, target_server):
            """Validate the target server specification of a command request.

            Args:
                target_server: A single server rank (int) or a non-empty list of
                    server ranks (ints).

            Returns:
                A list of validated ranks (the input list itself when a list was
                given, a one-element list for an int), or None when the format is
                wrong, a rank is not in the MPI server rank list, or the monitor
                client reports the server as timed out. Every rejection is logged
                at SEVERE level.
            """
            casalog_call_origin = "MPICommandClient::validate_target_servers"

            # Get list of valid MPIServer ranks
            mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list()

            # Normalize the accepted formats (int, non-empty list of int) to a list
            if isinstance(target_server, int):
                candidates = [target_server]
            elif (
                isinstance(target_server, list)
                and len(target_server) >= 1
                and all(isinstance(server, int) for server in target_server)
            ):
                candidates = target_server
            else:
                casalog.post(
                    "target_server has wrong format (%s), accepted formats are int and list(int)"
                    % str(type(target_server)),
                    "SEVERE",
                    casalog_call_origin,
                )
                return None

            # Apply the same checks to both formats: a single int target must
            # also be rejected when its server has timed out
            for server in candidates:
                if server not in mpi_server_rank_list:
                    casalog.post("Server #%s does not exist" % str(server), "SEVERE", casalog_call_origin)
                    return None
                if self.__monitor_client.get_server_status_keyword(server, "timeout"):
                    casalog.post("Server #%s has timed out" % str(server), "SEVERE", casalog_call_origin)
                    return None

            return candidates

        def __register_command_request(self, command_request, server):
            """Register a command request and append it to the input queue.

            Args:
                command_request: Dict with the request definition; it is shallow-copied,
                    so the caller's dict is not modified.
                server: Rank of the target server, or None when no target was given.

            Returns:
                The id assigned to the registered command request.
            """
            # Get command request id
            command_request_id = self.__command_request_counter

            # Complete command request definition
            command_request_complete = dict(command_request)
            command_request_complete["id"] = command_request_id
            command_request_complete["server"] = server
            command_request_complete["status"] = "holding queue"

            # Register command request
            self.__command_request_list[command_request_id] = command_request_complete

            # Append job to input queue; the context manager guarantees the lock
            # is released even if the append raises
            with self.__command_request_input_queue_lock:
                self.__command_request_input_queue.append(command_request_complete)

            # Increment command id counter
            self.__command_request_counter = self.__command_request_counter + 1

            # Return command request id
            return command_request_id

        def __format_command_response_timeout(self, command_request_id):
            """Build a synthetic, unsuccessful response for a request whose server timed out.

            The response is a copy of the registered command request with status
            "timeout", successful False, ret None and a "traceback" message naming
            the server rank, its processor and its PID as known by the monitor client.
            """
            # Start from a copy of the request so the registered entry stays untouched
            timeout_response = dict(
                self.__command_request_list[command_request_id],
                status="timeout",
                successful=False,
                ret=None,
            )

            # Identify which server timed out
            rank = timeout_response["server"]
            host = self.__monitor_client.get_server_status_keyword(rank, "processor")
            process_id = self.__monitor_client.get_server_status_keyword(rank, "pid")

            # The message takes the place of a real trace-back
            timeout_response["traceback"] = "".join(
                [
                    "Timeout of assigned server n# ",
                    str(rank),
                    " deployed at ",
                    str(host),
                    " with PID ",
                    str(process_id),
                ]
            )

            return timeout_response

        ################################################################################################################
        # Public methods ###############################################################################################
        ################################################################################################################

        def get_lifecyle_state(self):
            """Return the current life cycle state value of the client."""
            return self.__life_cycle_state

        def start_services(self):
            """Start the servers and every client-side service.

            Does nothing except log a warning when the life cycle state is 1
            (already started) or 2 (finalized). Otherwise sends the start service
            signal, starts the monitor client services, the command request queue
            service and the command response handler service, in that order, and
            sets the life cycle state to 1.
            """
            origin = "MPICommandClient::start_services"

            current_state = self.__life_cycle_state
            if current_state == 1:
                casalog.post("Services already started", "WARN", origin)
                return
            if current_state == 2:
                casalog.post("MPICommandClient life cycle finalized", "WARN", origin)
                return

            # Servers first, then monitoring, then the request/response services
            self.__send_start_service_signal()
            self.__monitor_client.start_services()
            self.__start_command_request_queue_service()
            self.__start_command_response_handler_service()

            # Services are now up
            self.__life_cycle_state = 1

            casalog.post("All services started", "INFO", origin)

        def stop_services(self, force_command_request_interruption=False):
            """Stop all client services and send the stop signal to the servers.

            Logs every command request that has no response yet as aborted, stops
            the monitor, request queue and response handler services, sends the
            process control and stop signals, and finalizes the MPI environment
            unless some server is in timeout condition. On completion the life
            cycle state is set to 2. Logs a warning and returns when the services
            were never started (state 0) or are already finalized (state 2).

            Args:
                force_command_request_interruption: Forwarded to the servers in the
                    stop signal; forced to True when any server has timed out.
            """
            # jagonzal: This method is called by the atexit module and if it fails it
            # causes ipython to crash, producing a report and waiting for user input
            # so we cannot risk under any circumstances such an event
            try:

                casalog_call_origin = "MPICommandClient::stop_services"

                if self.__life_cycle_state == 0:
                    casalog.post("Services not started", "WARN", casalog_call_origin)
                    return
                elif self.__life_cycle_state == 2:
                    casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin)
                    return

                # Check if any server is in timeout condition before stopping the monitoring service
                server_rank_timeout = self.__monitor_client.get_server_timeout()
                finalize_mpi_environment = True
                if len(server_rank_timeout) > 0:
                    finalize_mpi_environment = False
                    force_command_request_interruption = True

                # Stop client monitoring services
                self.__monitor_client.stop_services()

                # Notify command requests which are going to be interrupted
                for command_request_id in self.__command_request_list:
                    if command_request_id not in self.__command_response_list:
                        casalog.post(
                            "Aborting command request with id# %s: %s"
                            % (str(command_request_id), str(self.__command_request_list[command_request_id])),
                            "SEVERE",
                            casalog_call_origin,
                        )

                # Stop client command request-response services
                self.__stop_command_request_queue_service()
                self.__stop_command_response_handler_service()

                # Shutdown plotms process
                self.__send_control_signal(
                    {"command": "pm.killApp()", "signal": "process_control"}, check_response=True
                )

                # Shutdown virtual frame buffer
                self.__send_control_signal(
                    {"command": "self.stop_virtual_frame_buffer()", "signal": "process_control"}, check_response=True
                )

                # Send stop signal to servers
                self.__send_control_signal(
                    {
                        "command": "stop_service_requested = True",
                        "signal": "stop",
                        "force_command_request_interruption": force_command_request_interruption,
                        "finalize_mpi_environment": finalize_mpi_environment,
                    },
                    check_response=False,
                )

                # Finalize MPI environment
                if finalize_mpi_environment:
                    try:
                        casalog.post("Going to finalize MPI environment", "INFO", casalog_call_origin)
                        MPIEnvironment.finalize_mpi_environment()
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception finalizing MPI environment %s" % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                else:
                    casalog.post(
                        "MPIServers with rank %s are in timeout condition, skipping MPI_Finalize()"
                        % str(server_rank_timeout),
                        "SEVERE",
                        casalog_call_origin,
                    )

                # UnMark MPI environment to be finalized by the MPICommunicator destructor
                # (Either because it is already finalized or due to a
                # server not responsive that prevents graceful finalization)
                self.__communicator.set_finalize_mpi_environment(False)

                # Set life cycle state
                self.__life_cycle_state = 2

                casalog.post("All services stopped", "INFO", casalog_call_origin)

            except:
                # Deliberately catch everything (see note at the top): nothing may
                # propagate out of an atexit handler. The single-argument call form
                # of print behaves the same as a statement and as a function.
                formatted_traceback = traceback.format_exc()
                print("Unhandled exception in MPICommandClient::stop_services %s" % (formatted_traceback))

        def push_command_request(self, command, block=False, target_server=None, parameters=None):
            """Register a command request for the servers and wake up the service threads.

            Args:
                command: Either the literal "push" (push operation) or a string of
                    Python code; code that compiles as an expression is registered
                    in "eval" mode, otherwise in "exec" mode.
                block: When True, wait for the responses and return them.
                target_server: Optional server rank (int) or list of ranks; one
                    request is registered per validated target. When None a
                    single request with no assigned server is registered.
                parameters: Stored in the request under the "parameters" key.

            Returns:
                None when the services are not running, the command cannot be
                compiled, or the target servers are not valid. Otherwise the list
                of command request ids (non-blocking mode) or the list of command
                responses returned by get_command_response (blocking mode).
            """
            casalog_call_origin = "MPICommandClient::push_command_request"

            if self.__life_cycle_state == 0:
                casalog.post("Services not started", "WARN", casalog_call_origin)
                return
            elif self.__life_cycle_state == 2:
                casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin)
                return

            command_request = {}
            command_request["command"] = command
            command_request["parameters"] = parameters

            if command == "push":
                command_request["mode"] = "push"
                casalog.post("Requested push operation", "DEBUG", casalog_call_origin)
            else:
                # Determine whether command is a statement or an expression.
                # The compiled code object is not needed, only whether compilation succeeds.
                try:
                    compile(command, "send_command_request", "eval")
                except Exception:
                    try:
                        compile(command, "send_command_request", "exec")
                    except Exception:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Command cannot be executed neither as a statement nor as an expression, it will be rejected: %s"
                            % str(formatted_traceback),
                            "SEVERE",
                            casalog_call_origin,
                        )
                        return None
                    else:
                        command_request["mode"] = "exec"
                        casalog.post(
                            "Command will be executed as an statement w/o return code", "DEBUG", casalog_call_origin
                        )
                else:
                    command_request["mode"] = "eval"
                    casalog.post(
                        "Command will be evaluated as an expression with return value", "DEBUG", casalog_call_origin
                    )

            # Validate target servers
            target_server_validated = None
            if target_server is not None:
                target_server_validated = self.__validate_target_servers(target_server)
                # Exit if target server is not validated
                if target_server_validated is None:
                    return None

            # Create command request list: one request per target, or a single untargeted one
            command_request_id_list = []
            if target_server_validated is not None:
                for server in target_server_validated:
                    command_request_id_list.append(self.__register_command_request(command_request, server))
            else:
                command_request_id_list.append(self.__register_command_request(command_request, None))

            # Wake up command request/response service threads
            self.__command_request_queue_service_event_controller.set()
            self.__command_response_handler_service_event_controller.set()

            # In blocking mode wait until command response is received
            if block:
                return self.get_command_response(command_request_id_list, True, False)

            # Otherwise we simply return the command request ids
            return command_request_id_list

        def get_command_response(self, command_request_id_list, block=False, verbose=True):
            """Collect the responses available for a list of command requests.

            Args:
                command_request_id_list: Ids of the command requests of interest.
                block: When True, poll until every request has either a response
                    or an assigned server flagged as timed out by the monitor client.
                verbose: In non-blocking mode, log the state of requests that have
                    no response yet.

            Returns:
                A list of command response dicts (copies). Requests whose server
                timed out get a synthetic response with status "timeout". In
                non-blocking mode, requests that are still pending are omitted.
            """
            casalog_call_origin = "MPICommandClient::get_command_response"

            command_response_list = []

            if block:

                # Wait until command request response is received or timeout
                pending_command_request_id_list = list(command_request_id_list)
                while len(pending_command_request_id_list) > 0:
                    # Iterate over a snapshot because ids are removed while looping
                    for command_request_id in list(pending_command_request_id_list):
                        if command_request_id in self.__command_response_list:
                            pending_command_request_id_list.remove(command_request_id)
                            continue
                        server = self.__command_request_list[command_request_id]["server"]
                        if server is not None and self.__monitor_client.get_server_status_keyword(
                            server, "timeout"
                        ):
                            casalog.post(
                                "Command request with id# %s sent to server n# %s, but the server has timed out"
                                % (str(command_request_id), str(server)),
                                "SEVERE",
                                casalog_call_origin,
                            )
                            pending_command_request_id_list.remove(command_request_id)

                    # Only sleep when there is still something to wait for
                    if len(pending_command_request_id_list) > 0:
                        time.sleep(MPIEnvironment.mpi_push_command_request_block_mode_sleep_time)

                # Gather command response list
                for command_request_id in command_request_id_list:
                    if command_request_id in self.__command_response_list:
                        command_response = dict(self.__command_response_list[command_request_id])
                    else:
                        command_response = self.__format_command_response_timeout(command_request_id)
                    command_response_list.append(command_response)

                return command_response_list

            for command_request_id in command_request_id_list:
                if command_request_id in self.__command_response_list:
                    command_response_list.append(dict(self.__command_response_list[command_request_id]))
                    continue

                server = self.__command_request_list[command_request_id]["server"]
                # A request may not have a server assigned (server is None); only
                # query the timeout status of an actual server, as in blocking mode
                timeout = server is not None and self.__monitor_client.get_server_status_keyword(
                    server, "timeout"
                )
                if timeout:
                    casalog.post(
                        "Command request with id# %s sent to server n# %s, but the server has timed out"
                        % (str(command_request_id), str(server)),
                        "SEVERE",
                        casalog_call_origin,
                    )
                    command_response_list.append(self.__format_command_response_timeout(command_request_id))
                elif verbose:
                    status = self.__command_request_list[command_request_id]["status"]
                    casalog.post(
                        "Command request with id# %s is in %s state assigned to server %s"
                        % (str(command_request_id), status, str(server)),
                        "INFO",
                        casalog_call_origin,
                    )

            return command_response_list

        def get_command_response_event(self, command_request_id_list):
            """Return an event that is set once all listed command requests have a response.

            A command group response is registered holding the ids that still lack
            a response, and each of those requests is tagged with the group id
            under its "group" key. Requests that already have a response when this
            method is called are not waited for; if none is pending (including an
            empty input list) the returned event is already set.

            Args:
                command_request_id_list: Ids of the command requests forming the group.

            Returns:
                The threading.Event associated with the group.
            """
            # Get command group response id
            command_group_response_id = self.__command_group_response_counter

            # Setup event object (a new Event starts cleared)
            command_group_response_event = threading.Event()

            # Only requests without a response can still be reported later, so
            # already answered requests must not be part of the pending list
            pending_command_request_id_list = [
                command_request_id
                for command_request_id in command_request_id_list
                if command_request_id not in self.__command_response_list
            ]

            # Setup command group response
            command_group_response = {}
            command_group_response["id"] = command_group_response_id
            command_group_response["list"] = pending_command_request_id_list
            command_group_response["event"] = command_group_response_event

            # Register command group response before tagging the requests with it
            self.__command_group_response_list[command_group_response_id] = command_group_response
            for command_request_id in pending_command_request_id_list:
                self.__command_request_list[command_request_id]["group"] = command_group_response_id

            # Increment command group id counter
            self.__command_group_response_counter = self.__command_group_response_counter + 1

            # Nothing left to wait for: signal completion right away
            if len(pending_command_request_id_list) == 0:
                command_group_response_event.set()

            # Return command response event object
            return command_group_response_event

        def get_server_status(self, server=None):
            """Return what the monitor client reports as status for `server` (argument forwarded as is)."""
            monitor = self.__monitor_client
            return monitor.get_server_status(server)

        def get_command_request_list(self):
            """Return the internal dict of registered command requests (not a copy)."""
            return self.__command_request_list

        def get_command_response_list(self):
            """Return the internal dict of received command responses (not a copy)."""
            return self.__command_response_list

        def set_log_mode(self, logmode):
            """Store `logmode` as the log mode of this client; returns nothing."""
            self.__log_mode = logmode

        def set_log_level(self, log_level):
            """Set the command handling log level locally and on all servers.

            Logs a warning and returns without changes when the services are not
            started (state 0), the life cycle is finalized (state 2), or
            `log_level` is not one of the recognized `log_levels`. Otherwise the
            level is stored in MPIEnvironment.command_handling_log_level and the
            same assignment is sent to the servers as a process control signal,
            waiting for their responses.
            """
            origin = "MPICommandClient::set_log_level"

            current_state = self.__life_cycle_state
            if current_state == 0:
                casalog.post("Services not started", "WARN", origin)
                return
            if current_state == 2:
                casalog.post("MPICommandClient life cycle finalized", "WARN", origin)
                return

            if log_level not in log_levels:
                warning = "Unknown log level %s, recognized levels are: %s" % (str(log_level), str(log_levels))
                casalog.post(warning, "WARN", origin)
                return

            # Client side
            MPIEnvironment.command_handling_log_level = log_level

            # Server side: the servers run the same assignment
            control_signal = {
                "command": "MPIEnvironment.command_handling_log_level = '%s'" % log_level,
                "signal": "process_control",
            }
            self.__send_control_signal(control_signal, check_response=True)
Beispiel #13
0
    class __MPICommandClientImpl:
        """ Implementation of the MPICommandClient singleton interface """
        
        
        def __init__(self,start_services=False):
            """Initialize the client state and optionally start all services.

            Args:
                start_services: When True, start_services() is called once the
                    whole state has been initialized.

            stop_services is registered as an exit handler with
            force_command_request_interruption=True.
            """

            # Initialize life cycle state
            # 0-> Services not started
            # 1-> Services started
            # 2-> Stop service signal sent
            self.__life_cycle_state = 0

            # Initialize command request id counter and list
            self.__command_request_counter = 1
            self.__command_request_list = {}
            self.__command_response_list = {}

            # Initialize command group response state
            self.__command_group_response_counter = 1
            self.__command_group_response_list = {}

            # Initialize command response handler service state
            # (a new Event starts cleared; the explicit clear() is kept for clarity)
            self.__command_response_handler_service_on = False
            self.__command_response_handler_service_running = False
            self.__command_response_handler_service_thread = None
            self.__command_response_handler_service_event_controller = threading.Event()
            self.__command_response_handler_service_event_controller.clear()

            # Initialize command request queue service state
            self.__command_request_queue = []
            self.__command_request_queue_service_on = False
            self.__command_request_queue_service_running = False
            self.__command_request_queue_service_thread = None
            self.__command_request_queue_service_event_controller = threading.Event()
            self.__command_request_queue_service_event_controller.clear()

            # Setup a command request input queue to append the jobs
            # to be picked up by the command request queue service
            self.__command_request_input_queue = []
            self.__command_request_input_queue_lock = threading.Lock()

            # Instantiate MPICommunicator reference
            self.__communicator = MPICommunicator()

            # Instantiate MPIMonitorClient reference
            self.__monitor_client = MPIMonitorClient(False)

            # Log mode
            # Must be set before the services are started: the start service
            # request sent to the servers includes the log mode
            self.__log_mode = 'unified'

            # Automatically start services
            if start_services:
                self.start_services()

            # Register exit handler
            # NOTE: It is not guaranteed that __del__() methods are called
            #       for objects that still exist when the interpreter exits.
            atexit.register(self.stop_services,force_command_request_interruption=True)
                                
                                   
        ################################################################################################################            
        # Private methods ##############################################################################################
        ################################################################################################################                                   
            
            
        def __command_response_handler_service(self):
            """Service loop that receives command responses from the servers.

            Runs while the command response handler service flag is on. Each
            iteration probes the communicator for a command response; when one
            is available it is received and stored by command id, the answering
            server is marked as not busy, the request/response status is set to
            'response received', the outcome is logged and, if the request
            belongs to a command group, the group bookkeeping is updated (the
            group event is set once its pending list becomes empty). When no
            response is available the loop sleeps before probing again.

            Exceptions raised while probing or handling a response are logged
            at SEVERE level and the loop keeps running.
            """

            casalog_call_origin = "MPICommandClient::command_response_handler_service"

            # Mark service as running
            self.__command_response_handler_service_running = True

            while (self.__command_response_handler_service_on):

                # Wait until there are command request whose response is pending
                # (same number of responses and requests means nothing is pending,
                # so block on the event controller until it is set)
                if len(self.__command_response_list) == len(self.__command_request_list):
                    self.__command_response_handler_service_event_controller.wait()

                # First check if there is a command response msg available
                msg_available = False
                try:
                    msg_available = self.__communicator.command_response_probe()
                except:
                    # A failed probe is treated as "no message available"
                    msg_available = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception checking if command response msg is available: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)

                # Then receive, store and post command response msg
                if (msg_available):
                    try:
                        # Receive command response
                        command_response = self.__communicator.command_response_recv()
                        server = command_response['server']
                        successful = command_response['successful']
                        command_id = command_response['id']
                        # Mark immediately server as not-busy
                        self.__monitor_client.set_server_status_keyword(server,'busy',False)
                        # Store command response
                        self.__command_response_list[command_id] = command_response
                        # If there are no pending command responses clear the event controller
                        # (so that the next iteration blocks in wait() above)
                        if len(self.__command_response_list) == len(self.__command_request_list):
                            self.__command_response_handler_service_event_controller.clear()
                        # Mark command request as received
                        self.__command_request_list[command_id]['status'] = 'response received'
                        self.__command_response_list[command_id]['status'] = 'response received'
                        # Notify that command response has been received
                        if successful:
                            casalog.post("Command request with id %s successfully handled by server n# %s"
                                         % (str(command_id),str(server)),MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        else:
                            casalog.post("Command request with id %s failed in server n# %s with traceback %s"
                                         % (str(command_id),str(server),str(command_response['traceback'])),
                                         "SEVERE",casalog_call_origin)
                        # If this request belongs to a group update the group response object
                        if self.__command_request_list[command_id].has_key('group'):
                            command_group_response_id = self.__command_request_list[command_id]['group']
                            self.__command_group_response_list[command_group_response_id]['list'].remove(command_id)
                            # If there are no requests pending from this group send the group response signal
                            if len(self.__command_group_response_list[command_group_response_id]['list']) == 0:
                                self.__command_group_response_list[command_group_response_id]['event'].set()
                    except:
                        # Any failure while handling the response is logged; the service keeps running
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception receiving command request response msg: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                else:
                    # Nothing to receive yet: sleep before probing again
                    time.sleep(MPIEnvironment.mpi_command_response_handler_service_sleep_time)

            # Mark service as not running
            self.__command_response_handler_service_running = False

            
        def __start_command_response_handler_service(self):
            """Launch the command response handler service in its own thread.

            Returns True when the service is running (either because it was
            already running or because the new thread reported itself as
            running), and False when the thread could not be started.
            """

            log_origin = "MPICommandClient::start_command_response_handler_service"

            # Nothing to do when the service is already up
            if self.__command_response_handler_service_running:
                casalog.post("MPI command response handler service is already running","WARN",log_origin)
                return True

            try:
                # Raise the 'on' flag first, then spawn the service thread
                self.__command_response_handler_service_on = True
                service_thread = thread.start_new_thread(self.__command_response_handler_service, ())
                self.__command_response_handler_service_thread = service_thread
            except:
                # Roll back the service flags and report the failure
                error_trace = traceback.format_exc()
                self.__command_response_handler_service_on = False
                self.__command_response_handler_service_running = False
                casalog.post("Exception starting MPI command response handler service: %s"
                             % str(error_trace),"SEVERE",log_origin)
                return False

            # Poll until the running flag is raised
            # (presumably by the service thread itself - its first lines are not visible here)
            while not self.__command_response_handler_service_running:
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI command response handler service started","INFO",log_origin)

            return True
        
        
        def __stop_command_response_handler_service(self):
            """Stop the command response handler service and wait until it ends.

            Only logs a warning when the service is not running.
            """

            log_origin = "MPICommandClient::stop_command_response_handler_service"

            if self.__command_response_handler_service_running:
                # Lower the 'on' flag and wake the thread up in case it is waiting on the event
                self.__command_response_handler_service_on = False
                self.__command_response_handler_service_event_controller.set()

                # Poll until the service marks itself as not running
                while self.__command_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI command response handler service stopped","INFO",log_origin)
            else:
                casalog.post("MPI command response handler service is not running","WARN",log_origin)
            
            
        def __command_request_queue_service(self):
            """Service loop that dispatches queued command requests to available servers.

            Runs while self.__command_request_queue_service_on is set. On each pass it:
              1. waits on the event controller when both the internal queue and
                 the input queue are empty,
              2. moves all requests from the input queue to the internal queue,
              3. asks the monitor client for the available servers, matches them
                 with queued requests and sends each matched request.

            When sending a request raises, a simulated unsuccessful response
            (copy of the request plus 'successful'=False and the traceback) is
            stored in self.__command_response_list.

            The running flag is raised on entry and lowered on exit so that the
            start/stop methods can poll it.
            """
            
            casalog_call_origin = "MPICommandClient::command_request_queue_service"
            
            # Mark service as running
            self.__command_request_queue_service_running = True                    
                       
            while self.__command_request_queue_service_on:
                
                # Wait until there are pending command requests
                # (the lock only guards the emptiness check, it is released before waiting)
                self.__command_request_input_queue_lock.acquire()
                if len(self.__command_request_queue) == 0 and len(self.__command_request_input_queue) == 0:
                    self.__command_request_input_queue_lock.release()
                    self.__command_request_queue_service_event_controller.wait()
                else:
                    self.__command_request_input_queue_lock.release()
                    
                # Pick up jobs from input queue (preserving their order)
                self.__command_request_input_queue_lock.acquire()
                while len(self.__command_request_input_queue) > 0:
                    self.__command_request_queue.append(self.__command_request_input_queue.pop(0))
                self.__command_request_input_queue_lock.release()

                # Get list of available servers
                available_servers_list = self.__monitor_client.get_server_rank_available()
                if len(available_servers_list) >= 1:
                    # Get list of matching command requests
                    # (this call also fills in the 'server' field of requests without a pre-assigned server)
                    matching_command_request_id_list = self.__match_available_servers_with_command_requests(available_servers_list)
                    # Iterate over matching command request list
                    for command_request_id in matching_command_request_id_list:
                        command_request_found = False
                        command_request_queue_idx = 0
                        # Iterate over command request queue to find the corresponding index
                        for command_request in self.__command_request_queue:
                            if command_request['id'] == command_request_id:
                                command_request_found = True
                                break
                            else:
                                command_request_queue_idx = command_request_queue_idx + 1
                        # Extract command request from queue and send it
                        if command_request_found:
                            command_request = self.__command_request_queue.pop(command_request_queue_idx)
                            # If command request queue is empty clear the event controller
                            if len(self.__command_request_queue) == 0:
                                self.__command_request_queue_service_event_controller.clear()                            
                            server = command_request['server']
                            try:
                                # Mark assigned server as busy and set command info in server status
                                self.__monitor_client.set_server_status_keyword(server,'busy',True)
                                self.__monitor_client.set_server_status_keyword(server,'command',command_request['command'])
                                # Send command request
                                self.__communicator.command_request_send(request=command_request,server=server)
                                # Mark command request as sent
                                self.__command_request_list[command_request_id]['status']='request sent'
                                # Notify that command request has been sent
                                casalog.post("Command request with id# %s sent to server n# %s" 
                                             % (str(command_request_id),str(server)),MPIEnvironment.command_handling_log_level,casalog_call_origin)
                            except:
                                # NOTE(review): the 'busy' flag set above is not reset in this branch,
                                # and a 'group' the request may belong to is not updated here - confirm
                                # whether this is handled elsewhere
                                # Get and format traceback
                                formatted_traceback = traceback.format_exc()
                                # Simulate response
                                command_response = dict(command_request)
                                command_response['successful']=False
                                command_response['traceback']=formatted_traceback
                                self.__command_response_list[command_request_id]=command_response
                                # Notify exception
                                casalog.post("Exception sending command request with id# %s to server n# %s: %s"
                                             % (str(command_request_id),str(server),str(formatted_traceback)),
                                             "SEVERE",casalog_call_origin)                       
                        else:
                            casalog.post("Command request with id# %s not found" % 
                                         str(command_request_id),"SEVERE",casalog_call_origin)
                else:
                    # Sleep in order not to saturate the system
                    time.sleep(MPIEnvironment.mpi_command_request_queue_service_sleep_time)
                
            # Mark service as not running
            self.__command_request_queue_service_running = False
            
            
        def __match_available_servers_with_command_requests(self,available_servers):
            """Select the queued command requests that can be sent to the available servers.

            Walks the command request queue in order and picks requests that
            either have no pre-assigned server or whose pre-assigned server is
            still free, stopping once as many requests as available servers
            have been picked. Picked requests without a pre-assigned server get
            one of the remaining free servers written into their 'server' field.

            available_servers -- list of server ranks currently available
                                 (not modified, a copy is consumed instead)

            Returns the list of ids of the selected command requests.
            """

            max_matches = len(available_servers)
            free_servers = list(available_servers)
            matched_ids = []
            floating_ids = set()

            for queued_request in self.__command_request_queue:
                preassigned_server = queued_request['server']
                request_id = queued_request['id']
                if preassigned_server is None:
                    # No pre-assigned server: any free server will do, pick it later
                    matched_ids.append(request_id)
                    floating_ids.add(request_id)
                elif preassigned_server in free_servers:
                    # Reserve the pre-assigned server so it is not handed out twice
                    matched_ids.append(request_id)
                    free_servers.remove(preassigned_server)

                # One request per available server at most
                if len(matched_ids) >= max_matches:
                    break

            # Hand the remaining free servers to the requests without pre-assigned server
            for queued_request in self.__command_request_queue:
                if queued_request['id'] in floating_ids:
                    queued_request['server'] = free_servers.pop()

            return matched_ids
        
            
        def __start_command_request_queue_service(self):
            """Launch the command request queue service in its own thread.

            Returns True when the service is running (either because it was
            already running or because the new thread reported itself as
            running), and False when the thread could not be started.
            """

            log_origin = "MPICommandClient::start_command_request_queue_service"

            # Nothing to do when the service is already up
            if self.__command_request_queue_service_running:
                casalog.post("MPI command request queue service is already running","WARN",log_origin)
                return True

            try:
                # Raise the 'on' flag first: the service loop runs only while it is set
                self.__command_request_queue_service_on = True
                service_thread = thread.start_new_thread(self.__command_request_queue_service, ())
                self.__command_request_queue_service_thread = service_thread
            except:
                # Roll back the service flags and report the failure
                error_trace = traceback.format_exc()
                self.__command_request_queue_service_on = False
                self.__command_request_queue_service_running = False
                casalog.post("Exception starting MPI command request queue service: %s"
                             % str(error_trace),"SEVERE",log_origin)
                return False

            # Poll until the service thread raises its running flag
            while not self.__command_request_queue_service_running:
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI command request queue service started","INFO",log_origin)

            return True
        
        
        def __stop_command_request_queue_service(self):
            """Stop the command request queue service and wait until it ends.

            Only logs a warning when the service is not running.
            """

            log_origin = "MPICommandClient::stop_command_request_queue_service"

            if self.__command_request_queue_service_running:
                # Lower the 'on' flag and wake the thread up in case it is waiting on the event
                self.__command_request_queue_service_on = False
                self.__command_request_queue_service_event_controller.set()

                # Poll until the service thread lowers its running flag
                while self.__command_request_queue_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI command request queue service stopped","INFO",log_origin)
            else:
                casalog.post("MPI command request queue service is not running","WARN",log_origin)
            
            
        def __send_start_service_signal(self):
            """Broadcast the start service signal and wait for every server to answer.

            The request carries the global casa dictionary and the log mode.
            For each response received, the processor name and PID reported by
            the server are stored in the monitor client. The method returns
            only after all ranks in the MPI server rank list have answered.
            """

            log_origin = "MPICommandClient::send_start_service_signal"

            casalog.post("Sending start service signal to all servers","INFO",log_origin)

            # Prepare start service request
            start_request = {
                'signal': 'start',
                # The request contains the global casa dictionary to be used by the servers
                'casa': casa,
                'logmode': self.__log_mode,
            }

            # Send request to all servers
            self.__communicator.control_service_request_broadcast(start_request,casalog)

            # Then wait until all servers have handled the signal
            pending_ranks = MPIEnvironment.mpi_server_rank_list()
            while pending_ranks:

                # No response yet: sleep and probe again
                if not self.__communicator.control_service_response_probe():
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)
                    continue

                # Receive start service response to know which server has started
                response = self.__communicator.control_service_response_recv()
                rank = response['rank']

                # Store processor name and PID info in the MPIMonitorClient
                self.__monitor_client.set_server_status_keyword(rank,'processor',response['processor'])
                self.__monitor_client.set_server_status_keyword(rank,'pid',response['pid'])

                # This server is no longer pending
                pending_ranks.remove(rank)

                # Communicate that server response to start service signal has been received
                casalog.post("Server with rank %s started at %s with PID %s"
                             % (str(rank),str(response['processor']),str(response['pid'])),
                             "INFO",log_origin)

            casalog.post("Received response from all servers to start service signal","INFO",log_origin)
            
            
        def __send_control_signal(self,signal,check_response=True):
            """Broadcast a control signal to all servers and optionally wait for the answers.

            signal         -- dict describing the signal; its 'command' entry is used
                              in the log messages and a 'send_response' entry is added
                              to it (the caller's dict is modified)
            check_response -- when True, wait until every server reported online by
                              the monitor client has answered

            Exceptions raised while broadcasting, getting the online servers or
            probing for responses are logged as SEVERE and end the method early.
            """

            log_origin = "MPICommandClient::send_app_control_signal"

            casalog.post("Sending control signal to all servers: %s" % signal['command'],"INFO",log_origin)

            # Tell the servers whether a response is expected
            signal['send_response'] = check_response

            # Send request to all servers
            try:
                self.__communicator.control_service_request_broadcast(signal,casalog)
            except:
                error_trace = traceback.format_exc()
                casalog.post("Exception sending control signal to all servers: %s" % str(error_trace),
                             "SEVERE",log_origin)
                return

            # Fire and forget mode: nothing else to do
            if not check_response:
                casalog.post("Control signal sent to all servers: %s" % signal['command'],"INFO",log_origin)
                return

            # Otherwise wait until all online servers have handled the signal
            try:
                pending_ranks = self.__monitor_client.get_server_rank_online()
            except:
                error_trace = traceback.format_exc()
                casalog.post("Exception checking for response to control signal: %s" % str(error_trace),
                             "SEVERE",log_origin)
                return

            while len(pending_ranks)>0:

                try:
                    response_available = self.__communicator.control_service_response_probe()
                except:
                    error_trace = traceback.format_exc()
                    casalog.post("Exception getting response to control signal: %s" % str(error_trace),
                                 "SEVERE",log_origin)
                    return

                # No response yet: sleep and probe again
                if not response_available:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)
                    continue

                # Receive control signal response
                response = self.__communicator.control_service_response_recv()
                rank = response['rank']

                # CAS-7721: Control signals are sent to all servers, even if not responsive
                # So we may get a response from a server which is not in the initial online servers list
                if rank in pending_ranks:
                    pending_ranks.remove(rank)
                    # Communicate that server response to control signal has been received
                    casalog.post("Server with rank %s handled control signal %s"
                                 % (str(rank),signal['command']),
                                 "DEBUG",log_origin)

            casalog.post("Control signal handled by all servers: %s" % signal['command'],"INFO",log_origin)
            
            
        def __validate_target_servers(self,target_server):
            """Validate the target server specification of a command request.

            target_server -- either a single server rank (int) or a non-empty
                             list of server ranks (list of int)

            Returns the list of validated server ranks (the input list itself, or
            a one-element list for a single rank). Returns None, after logging a
            SEVERE message, when the format is wrong, when a rank is not in the
            MPI server rank list, or when a server is marked as timed out.

            A single rank goes through the same checks as a list of ranks, so a
            timed-out server is rejected in both cases.
            """

            casalog_call_origin = "MPICommandClient::validate_target_servers"

            # Get list of valid MPIServer ranks
            mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list()

            # Normalize the accepted formats (int, non-empty list of int) to a list
            if isinstance(target_server,int):
                candidate_servers = [target_server]
            elif isinstance(target_server,list) and (len(target_server)>=1) and all(isinstance(server, int) for server in target_server):
                candidate_servers = target_server
            else:
                casalog.post("target_server has wrong format (%s), accepted formats are int and list(int)"
                             % str(type(target_server)),"SEVERE",casalog_call_origin)
                return None

            # Every requested server must exist and must not be in timeout condition
            for server in candidate_servers:
                if server not in mpi_server_rank_list:
                    casalog.post("Server #%s does not exist" % str(server),"SEVERE",casalog_call_origin)
                    return None
                elif self.__monitor_client.get_server_status_keyword(server,'timeout'):
                    casalog.post("Server #%s has timed out" % str(server),"SEVERE",casalog_call_origin)
                    return None

            # Return server list validated
            return candidate_servers
            
            
        def __register_command_request(self,command_request,server):
            """Register a command request and append it to the input queue.

            command_request -- dict with the command request definition (it is
                               copied, the caller's dict is not modified)
            server          -- rank of the pre-assigned server, or None

            The copy is completed with its 'id' (current value of the command
            request counter), 'server' and a 'holding queue' status, stored in
            the command request list and appended to the input queue under the
            input queue lock. The counter is incremented afterwards.

            Returns the command request id.
            """

            # Get command request id
            new_request_id = self.__command_request_counter

            # Complete command request definition
            registered_request = dict(command_request,
                                      id=new_request_id,
                                      server=server,
                                      status='holding queue')

            # Register command request
            self.__command_request_list[new_request_id] = registered_request

            # Append job to input queue
            self.__command_request_input_queue_lock.acquire()
            self.__command_request_input_queue.append(registered_request)
            self.__command_request_input_queue_lock.release()

            # Increment command id counter
            self.__command_request_counter += 1

            # Return command request id
            return new_request_id
        
        def __format_command_response_timeout(self,command_request_id):
            """Build a simulated command response for a request whose server timed out.

            The response is a copy of the registered command request marked as
            'timeout', not successful, with no return value, and with a
            'traceback' message identifying the server (rank, processor and PID
            as known by the monitor client).

            Returns the simulated command response dict.
            """

            # Create a fake command response copying the command request and marking it as not successful
            timeout_response = dict(self.__command_request_list[command_request_id])
            timeout_response.update({'status':'timeout',
                                     'successful':False,
                                     'ret':None})

            # Get server, processor and pid to identify which server timed out
            assigned_server = timeout_response['server']
            processor = self.__monitor_client.get_server_status_keyword(assigned_server,'processor')
            pid = self.__monitor_client.get_server_status_keyword(assigned_server,'pid')

            # Create command response trace-back msg
            timeout_response['traceback'] = "".join(["Timeout of assigned server n# ", str(assigned_server),
                                                     " deployed at ", str(processor),
                                                     " with PID ", str(pid)])

            return timeout_response
            
        ################################################################################################################            
        # Public methods ###############################################################################################
        ################################################################################################################
            
            
        def get_lifecyle_state(self):
            """Return the current life cycle state of the client.

            As used by start_services, stop_services and push_command_request:
            0 means services not started, 1 services started and 2 life cycle
            finalized.
            """
            
            return self.__life_cycle_state
        
            
        def start_services(self):
            """Start the servers and all the client services.

            Does nothing but log a warning when the services are already started
            (life cycle state 1) or the life cycle is finalized (state 2).
            Otherwise sends the start signal to the servers, starts the monitor
            client services, the command request queue service and the command
            response handler service, in that order, and sets the life cycle
            state to 1.
            """

            log_origin = "MPICommandClient::start_services"

            # States in which starting is refused, with the warning to log
            refusal_by_state = {1: "Services already started",
                                2: "MPICommandClient life cycle finalized"}
            current_state = self.__life_cycle_state
            if current_state in refusal_by_state:
                casalog.post(refusal_by_state[current_state],"WARN",log_origin)
                return

            # 1st: start servers
            self.__send_start_service_signal()

            # 2nd: start monitoring servers
            self.__monitor_client.start_services()

            # 3rd and 4th: start command request queue and command response handler services
            # NOTE(review): both start methods return False on failure but the
            # result is not checked here - confirm whether that is intended
            self.__start_command_request_queue_service()
            self.__start_command_response_handler_service()

            # Set life cycle state
            self.__life_cycle_state = 1

            casalog.post("All services started","INFO",log_origin)
        
        
        def stop_services(self,force_command_request_interruption=False):
            """Stop all client services and send the stop signal to the servers.

            force_command_request_interruption -- forwarded to the servers in the
                stop signal; forced to True when any server is in timeout condition

            Does nothing but log a warning when the services are not started
            (life cycle state 0) or the life cycle is already finalized (state 2).
            Otherwise, in this order: stops the monitor client services, logs the
            command requests that have no response yet, stops the request queue
            and response handler services, sends the plotms and virtual frame
            buffer shutdown signals, sends the stop signal, finalizes the MPI
            environment (skipped when any server timed out) and sets the life
            cycle state to 2.

            No exception leaves this method: anything unhandled is printed.
            """

            # jagonzal: This method is called by the atexit module and if it fails it
            # causes ipython to crash, producing a report and waiting for user input
            # so we cannot risk under any circumstances such an event
            try:

                casalog_call_origin = "MPICommandClient::stop_services"

                if self.__life_cycle_state == 0:
                    casalog.post("Services not started","WARN",casalog_call_origin)
                    return
                elif self.__life_cycle_state == 2:
                    casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin)
                    return

                # Check if any server is in timeout condition before stopping the monitoring service
                server_rank_timeout = self.__monitor_client.get_server_timeout()
                finalize_mpi_environment = True
                if len(server_rank_timeout) > 0:
                    finalize_mpi_environment = False
                    force_command_request_interruption = True

                # Stop client monitoring services
                self.__monitor_client.stop_services()

                # Notify command requests which are going to be interrupted
                # (those registered requests for which no response has been stored)
                for command_request_id in self.__command_request_list:
                    if command_request_id not in self.__command_response_list:
                        casalog.post("Aborting command request with id# %s: %s"
                                     % (str(command_request_id),str(self.__command_request_list[command_request_id])),
                                     "SEVERE",casalog_call_origin)

                # Stop client command request-response services
                self.__stop_command_request_queue_service()
                self.__stop_command_response_handler_service()

                # Shutdown plotms process
                self.__send_control_signal({'command':'pm.killApp()',
                                            'signal':'process_control'},
                                           check_response=True)

                # Shutdown virtual frame buffer
                self.__send_control_signal({'command':'self.stop_virtual_frame_buffer()',
                                            'signal':'process_control'},
                                           check_response=True)

                # Send stop signal to servers (no response is awaited)
                self.__send_control_signal({'command':'stop_service_requested = True',
                                            'signal':'stop',
                                            'force_command_request_interruption':force_command_request_interruption,
                                            'finalize_mpi_environment':finalize_mpi_environment},
                                           check_response=False)

                # Finalize MPI environment
                if finalize_mpi_environment:
                    try:
                        casalog.post("Going to finalize MPI environment","INFO",casalog_call_origin)
                        MPIEnvironment.finalize_mpi_environment()
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception finalizing MPI environment %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                else:
                    casalog.post("MPIServers with rank %s are in timeout condition, skipping MPI_Finalize()"
                                 % str(server_rank_timeout),"SEVERE",casalog_call_origin)

                # UnMark MPI environment to be finalized by the MPICommunicator destructor
                # (Either because it is already finalized or due to a
                # server not responsive that prevents graceful finalization)
                self.__communicator.set_finalize_mpi_environment(False)

                # Set life cycle state
                self.__life_cycle_state = 2

                casalog.post("All services stopped","INFO",casalog_call_origin)

            except:
                # Deliberately catch everything (see note at the top of the method).
                # The parenthesized single-argument form prints the same text under
                # the Python 2 print statement and the print() function.
                formatted_traceback = traceback.format_exc()
                print("Unhandled exception in MPICommandClient::stop_services %s" % (formatted_traceback))
           

        def push_command_request(self,command,block=False,target_server=None,parameters=None):
            """Register a command request for one or several servers.

            command       -- string with the command; the literal "push" selects
                             the 'push' mode, any other command is classified as
                             an expression ('eval' mode) or a statement ('exec'
                             mode) by compiling it
            block         -- when True wait for the command responses and return
                             the result of get_command_response
            target_server -- None (no pre-assigned server), a server rank (int)
                             or a list of server ranks; one request is registered
                             per target server
            parameters    -- stored as is in the 'parameters' entry of the request

            Returns None when the services are not started, the life cycle is
            finalized, the command cannot be compiled or the target servers are
            not valid. Otherwise returns the list of command request ids, or in
            blocking mode whatever get_command_response returns for them.
            """

            casalog_call_origin = "MPICommandClient::push_command_request"

            if self.__life_cycle_state == 0:
                casalog.post("Services not started","WARN",casalog_call_origin)
                return
            elif self.__life_cycle_state == 2:
                casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin)
                return

            command_request = {}
            command_request['command']=command
            command_request['parameters'] = parameters

            # The special "push" command is not compiled
            if command == "push":
                command_request['mode']='push'
                casalog.post("Requested push operation","DEBUG",casalog_call_origin)
            else:
                # Determine whether command is a statement or an expression:
                # only the outcome of compile() matters, the code object is discarded
                try:
                    compile(command_request['command'],"send_command_request", "eval")
                    command_request['mode']='eval'
                    casalog.post("Command will be evaluated as an expression with return value",
                                 "DEBUG",casalog_call_origin)
                except Exception:
                    try:
                        compile(command_request['command'],"send_command_request", "exec")
                        command_request['mode']='exec'
                        casalog.post("Command will be executed as an statement w/o return code",
                                     "DEBUG",casalog_call_origin)
                    except Exception:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Command cannot be executed neither as a statement nor as an expression, it will be rejected: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                        return None

            # Validate target servers
            target_server_validated = None
            if target_server is not None:
                target_server_validated = self.__validate_target_servers(target_server)
                # Exit if target server is not validated
                if target_server_validated is None:
                    return None

            # Create command request list (one request per target server)
            command_request_id_list = []
            if target_server_validated is not None:
                for server in target_server_validated:
                    command_request_id = self.__register_command_request(command_request,server)
                    command_request_id_list.append(command_request_id)
            else:
                command_request_id = self.__register_command_request(command_request,None)
                command_request_id_list.append(command_request_id)

            # Wake up command request/response service threads
            self.__command_request_queue_service_event_controller.set()
            self.__command_response_handler_service_event_controller.set()

            # In blocking mode wait until command response is received
            if block:
                command_return_code_list = self.get_command_response(command_request_id_list,True,False)
                return command_return_code_list
            # Otherwise we simply return the command request id list
            else:
                return command_request_id_list
        
        
        def get_command_response(self,command_request_id_list,block=False,verbose=True):
            """Collect the responses available for a list of command request ids.

            Args:
                command_request_id_list: Ids of the command requests to look up.
                block: If True, poll until every request has either received a
                    response or was sent to a server flagged as timed out by the
                    monitor client. If False, return immediately with whatever
                    is available.
                verbose: Non-blocking mode only; if True, log the state of every
                    request that has no response yet and has not timed out.

            Returns:
                A list of response dicts. Stored responses are returned as
                shallow copies; requests without a response are represented by
                the dict built by __format_command_response_timeout. In blocking
                mode there is one entry per requested id, in request order. In
                non-blocking mode requests that are simply still pending are
                left out of the list.
            """

            casalog_call_origin = "MPICommandClient::get_command_response"

            command_response_list = []

            if block:

                # Wait until every command request got a response or its server timed out
                pending_command_request_id_list = list(command_request_id_list)
                while pending_command_request_id_list:
                    # Walk a snapshot so ids can be removed from the pending list while looping
                    for command_request_id in list(pending_command_request_id_list):
                        if command_request_id in self.__command_response_list:
                            pending_command_request_id_list.remove(command_request_id)
                            continue

                        server = self.__command_request_list[command_request_id]['server']
                        if server is not None and self.__monitor_client.get_server_status_keyword(server,'timeout'):
                            casalog.post("Command request with id# %s sent to server n# %s, but the server has timed out"
                                         % (str(command_request_id),str(server)),"SEVERE",casalog_call_origin)
                            pending_command_request_id_list.remove(command_request_id)

                    # Only sleep when there is still something to wait for
                    if pending_command_request_id_list:
                        time.sleep(MPIEnvironment.mpi_push_command_request_block_mode_sleep_time)

                # Gather command response list (one entry per requested id)
                for command_request_id in command_request_id_list:
                    if command_request_id in self.__command_response_list:
                        command_response = dict(self.__command_response_list[command_request_id])
                    else:
                        command_response = self.__format_command_response_timeout(command_request_id)
                    command_response_list.append(command_response)

                return command_response_list

            for command_request_id in command_request_id_list:
                if command_request_id in self.__command_response_list:
                    command_response_list.append(dict(self.__command_response_list[command_request_id]))
                    continue

                server = self.__command_request_list[command_request_id]['server']
                # Requests can be registered without a server (None); only ask the
                # monitor client about a timeout for an actual server, exactly as
                # the blocking branch does.
                timeout = (server is not None
                           and self.__monitor_client.get_server_status_keyword(server,'timeout'))
                if timeout:
                    casalog.post("Command request with id# %s sent to server n# %s, but the server has timed out"
                                 % (str(command_request_id),str(server)),"SEVERE",casalog_call_origin)
                    command_response_list.append(self.__format_command_response_timeout(command_request_id))
                elif verbose:
                    status = self.__command_request_list[command_request_id]['status']
                    casalog.post("Command request with id# %s is in %s state assigned to server %s"
                                 % (str(command_request_id),status,str(server)),"INFO",casalog_call_origin)

            return command_response_list
            
            
        def get_command_response_event(self,command_request_id_list):
            """Register a group of command requests and return the event tied to it.

            The current value of the group counter becomes the group id. A
            group record (id, copy of the request id list, event) is stored
            under that id, every listed request is tagged with the group id,
            and the counter is advanced by one.

            Args:
                command_request_id_list: Ids of the command requests forming the group.

            Returns:
                The threading.Event stored in the group record; it is unset
                when returned.
            """

            group_id = self.__command_group_response_counter

            # Fresh event, explicitly left in the unset state
            group_event = threading.Event()
            group_event.clear()

            # Store the group record; the id list is copied so later changes
            # to the caller's list do not affect the record
            self.__command_group_response_list[group_id] = {
                'id': group_id,
                'list': list(command_request_id_list),
                'event': group_event,
            }

            # Tag each request with the group it belongs to
            for request_id in command_request_id_list:
                self.__command_request_list[request_id]['group'] = group_id

            # Next group gets the next id
            self.__command_group_response_counter += 1

            return group_event
        
        
        def get_server_status(self,server=None):
            """Return whatever the monitor client's get_server_status reports.

            The server argument is forwarded unchanged (including the default None).
            """
            monitor_client = self.__monitor_client
            return monitor_client.get_server_status(server)
        
        
        def get_command_request_list(self):
            """Return the internal command request registry itself (not a copy)."""
            command_requests = self.__command_request_list
            return command_requests
        
        
        def get_command_response_list(self):
            """Return the internal command response registry itself (not a copy)."""
            command_responses = self.__command_response_list
            return command_responses
        
        
        def set_log_mode(self,logmode):
            """Store logmode as this client's log mode; no validation is done here."""
            self.__log_mode = logmode
            
            
        def set_log_level(self,log_level):
            """Change the command handling log level locally and via control signal.

            Nothing is changed (only a warning is logged) when the life cycle
            state is 0 (services not started) or 2 (finalized), or when
            log_level is not one of the recognized log_levels. Otherwise
            MPIEnvironment.command_handling_log_level is set and a
            'process_control' control signal carrying the equivalent assignment
            command is sent through __send_control_signal with
            check_response=True.

            Args:
                log_level: Name of the requested log level.
            """

            origin = "MPICommandClient::set_log_level"

            # Life cycle guards
            if self.__life_cycle_state == 0:
                casalog.post("Services not started","WARN",origin)
                return
            if self.__life_cycle_state == 2:
                casalog.post("MPICommandClient life cycle finalized","WARN",origin)
                return

            # Reject anything that is not a known level
            if log_level not in log_levels:
                casalog.post("Unknown log level %s, recognized levels are: %s" % (str(log_level),str(log_levels)),
                             "WARN",origin)
                return

            # Apply locally first ...
            MPIEnvironment.command_handling_log_level = log_level

            # ... then send out the same assignment as a control signal
            control_signal = {
                'command': "MPIEnvironment.command_handling_log_level = '%s'" % log_level,
                'signal': 'process_control',
            }
            self.__send_control_signal(control_signal,check_response=True)
            
            
                      
            
            
   
# EOF
Beispiel #14
0
    class __MPIMonitorServerImpl:
        """Backing implementation of the MPIMonitorServer singleton interface.

        Holds a status dictionary for the local process and runs a background
        thread that replies to ping status requests with that dictionary.
        """

        def __init__(self, start_services=True):
            """Build the status record and the service bookkeeping.

            Args:
                start_services: When true, start_services() is called at the
                    end of construction.
            """

            # Status record that is sent back with every ping status response
            self.__status = {
                'rank': MPIEnvironment.mpi_processor_rank,
                'processor': MPIEnvironment.hostname,
                'pid': os.getpid(),
                'busy': False,
                'command': None,
                'command_start_time': None,
                'command_stop_time': None,
            }

            # Flags and bookkeeping of the ping status request handler thread
            self.__ping_status_request_handler_service_on = False
            self.__ping_status_request_handler_service_final_round = False
            self.__ping_status_request_handler_service_running = False
            self.__ping_status_request_handler_service_thread = None
            self.__last_ping_status_request_time = None
            self.__client_timeout = False

            # Communication layer used to probe, receive and answer pings
            self.__communicator = MPICommunicator()

            if start_services:
                self.start_services()

        ################################################################################################################
        # Private methods ##############################################################################################
        ################################################################################################################

        def __ping_status_request_handler_service(self):
            """Thread body: answer ping status requests until switched off.

            Every pass probes for a request, receives it when one is available
            and replies with the current status dictionary. When nothing was
            received the thread sleeps before the next pass. The loop keeps
            going while the service is on or the final-round flag is set; the
            flag is cleared at the end of the pass that sees it.
            """

            casalog_call_origin = "MPIMonitorServer::ping_status_request_handler_service"

            # Tell the starter that the thread is alive
            self.__ping_status_request_handler_service_running = True

            while (self.__ping_status_request_handler_service_on
                   or self.__ping_status_request_handler_service_final_round):

                # Step 1: is there a request waiting?
                try:
                    request_waiting = self.__communicator.ping_status_request_probe()
                except Exception as probe_error:
                    request_waiting = False
                    casalog.post(
                        "Exception checking if ping status request msg is available: %s"
                        % str(probe_error), "SEVERE", casalog_call_origin)

                # Step 2: take the request off the wire
                request_taken = False
                if request_waiting:
                    self.__last_ping_status_request_time = time.time()
                    try:
                        self.__communicator.ping_status_request_recv()
                    except Exception as recv_error:
                        casalog.post(
                            "Exception receiving ping status request msg: %s" %
                            str(recv_error), "SEVERE", casalog_call_origin)
                    else:
                        request_taken = True

                # NOTE (jagonzal): the client heartbeat check below stays disabled.
                #   Intensive activity in the client can slow down the monitoring client
                #   service because of Python's GIL, which is acquired by the CASA SWIG
                #   components; SWIG's thread option can release the GIL inside them
                #   (see test_mpi4casa[test1_applycal_fluxscale_gcal_bcal]).
                # elif self.__last_ping_status_request_time is not None:
                #    elapsed_time = time.time() - self.__last_ping_status_request_time
                #    if (elapsed_time > MPIEnvironment.mpi_ping_status_request_handler_service_timeout):
                #        casalog.post("Heartbeat from client not received in the last %ss" %
                #                     str(int(round(elapsed_time))),"WARN",casalog_call_origin)
                #        self.__client_timeout = True

                # Step 3: reply, or idle for a while when there was nothing to reply to
                if not request_taken:
                    time.sleep(
                        MPIEnvironment.mpi_ping_status_request_handler_service_sleep_time)
                else:
                    try:
                        self.__communicator.ping_status_response_send(
                            response=self.__status)
                    except:
                        send_failure = traceback.format_exc()
                        casalog.post(
                            "Exception sending back ping status response: %s" %
                            str(send_failure), "SEVERE", casalog_call_origin)

                # The final round, if requested, has now been served
                if self.__ping_status_request_handler_service_final_round:
                    self.__ping_status_request_handler_service_final_round = False

            # Tell the stopper that the thread is done
            self.__ping_status_request_handler_service_running = False

        def __start_ping_status_request_handler_service(self):
            """Launch the ping status request handler thread.

            Returns:
                True when the service is running (already or newly started),
                False when the thread could not be started. After a successful
                launch the call waits until the thread has marked itself as
                running.
            """

            casalog_call_origin = "MPIMonitorServer::start_ping_status_request_handler_service"

            if self.__ping_status_request_handler_service_running:
                casalog.post(
                    "MPI ping status request handler service is already running",
                    "WARN", casalog_call_origin)
                return True

            try:
                self.__ping_status_request_handler_service_on = True
                self.__ping_status_request_handler_service_thread = thread.start_new_thread(
                    self.__ping_status_request_handler_service, ())
            except Exception as start_error:
                # Roll the flags back so a later start attempt begins cleanly
                self.__ping_status_request_handler_service_on = False
                self.__ping_status_request_handler_service_running = False
                casalog.post(
                    "Exception starting MPI ping status request handler service: %s"
                    % str(start_error), "SEVERE", casalog_call_origin)
                return False

            # Wait for the thread to report in
            while not self.__ping_status_request_handler_service_running:
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI ping status request handler service started",
                         "INFO", casalog_call_origin)

            return True

        def __stop_ping_status_request_handler_service(self):
            """Switch the handler thread off and wait until it has finished.

            A final round is requested before the service is switched off. If
            the service is not switched on, only a warning is logged.
            """

            casalog_call_origin = "MPIMonitorServer::stop_ping_status_request_handler_service"

            if not self.__ping_status_request_handler_service_on:
                casalog.post(
                    "MPI ping status request handler service is not running",
                    "WARN", casalog_call_origin)
                return

            # Ask for one last pass, then switch the loop off
            self.__ping_status_request_handler_service_final_round = True
            self.__ping_status_request_handler_service_on = False

            # Wait for the thread to leave its loop
            while self.__ping_status_request_handler_service_running:
                time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

            casalog.post("MPI ping status request handler service stopped",
                         "INFO", casalog_call_origin)

        ################################################################################################################
        # Public methods ###############################################################################################
        ################################################################################################################

        def start_services(self):
            """Start the ping status request handler service."""

            self.__start_ping_status_request_handler_service()

        def stop_services(self):
            """Stop the ping status request handler service."""

            self.__stop_ping_status_request_handler_service()

        def get_client_timeout(self):
            """Return the client timeout flag."""

            return self.__client_timeout

        def get_status(self, keyword=None):
            """Read the status record.

            Args:
                keyword: Status key to read, or None for the whole record.

            Returns:
                A copy of the status dictionary when keyword is None, the
                stored value when keyword is a known key; otherwise a warning
                is logged and None is returned.
            """

            casalog_call_origin = "MPIMonitorServer::get_status"

            if keyword is None:
                # Hand out a copy so callers cannot alter the live record
                return dict(self.__status)

            if keyword in self.__status:
                return self.__status[keyword]

            casalog.post("Status keyword %s not defined" % str(keyword),
                         "WARN", casalog_call_origin)

        def set_status(self, keyword, value):
            """Overwrite one existing status entry.

            Args:
                keyword: Key of the entry; unknown keys are not added, only a
                    warning is logged.
                value: New value for the entry.
            """

            casalog_call_origin = "MPIMonitorServer::set_status"

            if keyword not in self.__status:
                casalog.post("Status keyword %s not defined" % str(keyword),
                             "WARN", casalog_call_origin)
                return

            self.__status[keyword] = value
Beispiel #15
0
    class __MPIMonitorServerImpl:
        """ Implementation of the MPIMonitorServer singleton interface

        Keeps a status dictionary describing this process and runs a background
        thread that answers ping status requests with that dictionary.
        """

        def __init__(self,start_services=True):
            """Initialize status and service state.

            Args:
                start_services: If true, start_services() is called once the
                    state has been initialized.
            """

            # Initialize status state dict
            self.__status = {}
            self.__status['rank'] = MPIEnvironment.mpi_processor_rank
            self.__status['processor'] = MPIEnvironment.hostname
            self.__status['pid'] = os.getpid()
            self.__status['busy'] = False
            self.__status['command'] = None
            self.__status['command_start_time'] = None
            self.__status['command_stop_time'] = None

            # Initialize ping status request handler service state
            self.__ping_status_request_handler_service_on = False
            self.__ping_status_request_handler_service_final_round = False
            self.__ping_status_request_handler_service_running = False
            self.__ping_status_request_handler_service_thread = None
            self.__last_ping_status_request_time = None
            self.__client_timeout = False

            # Instantiate MPICommunicator reference
            self.__communicator = MPICommunicator()

            # Automatically start services
            if start_services:
                self.start_services()

        ################################################################################################################
        # Private methods ##############################################################################################
        ################################################################################################################


        def __ping_status_request_handler_service(self):
            """Service loop run in its own thread: reply to ping status requests.

            Loops while the service is on or a final round has been requested.
            Each iteration probes for a request, receives it if available and
            sends back the status dictionary; if no request was received it
            sleeps instead.
            """

            casalog_call_origin = "MPIMonitorServer::ping_status_request_handler_service"

            # Mark service as running
            self.__ping_status_request_handler_service_running = True

            while (self.__ping_status_request_handler_service_on or self.__ping_status_request_handler_service_final_round):

                # First check if there is a msg available
                msg_available = False
                try:
                    msg_available = self.__communicator.ping_status_request_probe()
                except Exception as instance:
                    casalog.post("Exception checking if ping status request msg is available: %s"
                                 % str(instance),"SEVERE",casalog_call_origin)
                    msg_available = False

                # Then receive ping status request msg
                msg_received = False
                if (msg_available):
                    self.__last_ping_status_request_time = time.time()
                    try:
                        self.__communicator.ping_status_request_recv()
                        msg_received = True
                    except Exception as instance:
                        casalog.post("Exception receiving ping status request msg: %s"
                                     % str(instance),"SEVERE",casalog_call_origin)
                        msg_received = False

                # jagonzal: Intensive activity in the client can cause monitoring client service to be slowed down
                #           This is due to Python's GIL which is acquired by the CASA SWIG components
                #           Using SWIG's thread option it is possible to disable GIL within the SWIG components
                #           (see test_mpi4casa[test1_applycal_fluxscale_gcal_bcal])
                # Check when we last received a ping status request
                # elif self.__last_ping_status_request_time is not None:
                #    elapsed_time = time.time() - self.__last_ping_status_request_time
                #    if (elapsed_time > MPIEnvironment.mpi_ping_status_request_handler_service_timeout):
                #        casalog.post("Heartbeat from client not received in the last %ss" %
                #                     str(int(round(elapsed_time))),"WARN",casalog_call_origin)
                #        self.__client_timeout = True

                # Send back response
                if (msg_received):
                    try:
                        self.__communicator.ping_status_response_send(response=self.__status)
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception sending back ping status response: %s"
                                     % str(formatted_traceback),"SEVERE",casalog_call_origin)
                else:
                    time.sleep(MPIEnvironment.mpi_ping_status_request_handler_service_sleep_time)

                # Check if this was last round
                if (self.__ping_status_request_handler_service_final_round):
                    self.__ping_status_request_handler_service_final_round = False

            # Mark service as not running
            self.__ping_status_request_handler_service_running = False


        def __start_ping_status_request_handler_service(self):
            """Start the handler thread and wait until it reports itself running.

            Returns:
                True if the service is running (already or newly started),
                False if starting the thread raised an exception.
            """

            casalog_call_origin = "MPIMonitorServer::start_ping_status_request_handler_service"

            if self.__ping_status_request_handler_service_running:
                casalog.post("MPI ping status request handler service is already running","WARN",casalog_call_origin)
                return True

            try:
                self.__ping_status_request_handler_service_on = True
                self.__ping_status_request_handler_service_thread = thread.start_new_thread(self.__ping_status_request_handler_service, ())
            except Exception as instance:
                self.__ping_status_request_handler_service_on = False
                self.__ping_status_request_handler_service_running = False
                casalog.post("Exception starting MPI ping status request handler service: %s"
                             % str(instance),"SEVERE",casalog_call_origin)
                return False

            # The thread sets the running flag as its first action
            while (not self.__ping_status_request_handler_service_running):
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI ping status request handler service started","INFO",casalog_call_origin)

            return True


        def __stop_ping_status_request_handler_service(self):
            """Request a final round, switch the service off and wait for the thread to end.

            Only logs a warning when the service is not switched on.
            """

            casalog_call_origin = "MPIMonitorServer::stop_ping_status_request_handler_service"

            if not self.__ping_status_request_handler_service_on:
                casalog.post("MPI ping status request handler service is not running","WARN",casalog_call_origin)
                return

            self.__ping_status_request_handler_service_final_round = True
            self.__ping_status_request_handler_service_on = False

            while (self.__ping_status_request_handler_service_running):
                time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

            casalog.post("MPI ping status request handler service stopped","INFO",casalog_call_origin)


        ################################################################################################################
        # Public methods ###############################################################################################
        ################################################################################################################

        def start_services(self):
            """Start the ping status request handler service."""

            self.__start_ping_status_request_handler_service()


        def stop_services(self):
            """Stop the ping status request handler service."""

            self.__stop_ping_status_request_handler_service()


        def get_client_timeout(self):
            """Return the client timeout flag."""

            return self.__client_timeout


        def get_status(self,keyword=None):
            """Return the status dictionary or one of its values.

            Args:
                keyword: Status key to look up; None selects the whole dictionary.

            Returns:
                A copy of the status dictionary if keyword is None, the mapped
                value if keyword is defined, otherwise None (a warning is logged).
            """

            casalog_call_origin = "MPIMonitorServer::get_status"

            # If no keyword is provided return a copy of the status dictionary
            if keyword is None:
                return dict(self.__status)
            # If keyword is provided check existence and return the mapped value
            # ('in' instead of dict.has_key, which no longer exists in Python 3)
            elif keyword in self.__status:
                return self.__status[keyword]
            else:
                casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)
                return None


        def set_status(self,keyword,value):
            """Update the value of an existing status keyword.

            Args:
                keyword: Status key to update; an undefined key is not added,
                    only a warning is logged.
                value: New value for the key.
            """

            casalog_call_origin = "MPIMonitorServer::set_status"

            # 'in' instead of dict.has_key, which no longer exists in Python 3
            if keyword in self.__status:
                self.__status[keyword] = value
            else:
                casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)
Beispiel #16
0
    class __MPIMonitorClientImpl:
        """ Implementation of the MPIMonitorClient singleton interface """
        
        
        def __init__(self,start_services=True):
            """Prepare per-server status records and service bookkeeping.

            Args:
                start_services: When true, start_services() is called at the
                    end of construction.
            """

            # One status record per server rank reported by the MPI environment
            self.__server_status_list = {}
            for rank in MPIEnvironment.mpi_server_rank_list():
                self.__server_status_list[rank] = {
                    'rank': rank,
                    'processor': None,
                    'pid': None,
                    'busy': False,
                    'command': None,
                    'command_start_time': None,
                    'pong_pending': False,
                    'ping_time': None,
                    'pong_time': None,
                    'timeout': False,
                }

            # Monitor status service bookkeeping
            self.__monitor_status_service_on = False
            self.__monitor_status_service_running = False
            self.__monitor_status_service_thread = None

            # Ping status response handler service bookkeeping
            self.__ping_status_response_handler_service_on = False
            self.__ping_status_response_handler_service_running = False
            self.__ping_status_response_handler_service_thread = None

            # Communication layer
            self.__communicator = MPICommunicator()

            if start_services:
                self.start_services()
                
                
        ################################################################################################################            
        # Private methods ##############################################################################################
        ################################################################################################################                   
                
                
        def __monitor_status_service(self):
            """Thread body: ping every server once per heartbeat and flag timeouts.

            For each server rank a ping status request is sent when none is
            pending. While a request is pending, the waiting time is estimated
            as heartbeat * number of checks made since the ping was sent; once
            that exceeds the configured timeout the server is flagged as timed
            out and the condition is logged (once per server).
            """

            casalog_call_origin = "MPIMonitorClient::monitor_status_service"

            # Tell the starter that the thread is alive
            self.__monitor_status_service_running = True

            server_ranks = MPIEnvironment.mpi_server_rank_list()

            while self.__monitor_status_service_on:

                for rank in server_ranks:

                    if self.__server_status_list[rank]['pong_pending']:
                        # Still waiting for the answer to the previous ping: count this
                        # check and derive the waiting time from the number of checks
                        # (not from the wall clock)
                        self.__server_status_list[rank]['pong_checks'] += 1
                        waited = (MPIEnvironment.mpi_monitor_status_service_heartbeat
                                  * self.__server_status_list[rank]['pong_checks'])
                        over_limit = waited > MPIEnvironment.mpi_monitor_status_service_timeout
                        # Report the timeout only the first time it is seen
                        if over_limit and not self.__server_status_list[rank]['timeout']:
                            casalog.post("Ping status response from server %s not received in the last %ss" % 
                                         (str(rank),str(int(waited))),"SEVERE",casalog_call_origin)
                            self.__server_status_list[rank]['timeout'] = True
                        continue

                    # Nothing pending for this server: send a new ping
                    try:
                        self.__communicator.ping_status_request_send(server=rank)
                        self.__server_status_list[rank]['ping_time'] = time.time()
                        self.__server_status_list[rank]['pong_pending'] = True
                        self.__server_status_list[rank]['pong_checks'] = 0
                    except:
                        send_failure = traceback.format_exc()
                        casalog.post("Exception sending ping status request to server %s: %s" % 
                                     (str(rank),str(send_failure)),"SEVERE",casalog_call_origin)

                # One heartbeat between rounds
                time.sleep(MPIEnvironment.mpi_monitor_status_service_heartbeat)

            # Tell the stopper that the thread is done
            self.__monitor_status_service_running = False

            
        def __start_monitor_status_service(self):
            """Launch the monitor status thread.

            Returns:
                True when the service is running (already or newly started),
                False when the thread could not be started. After a successful
                launch the call waits until the thread has marked itself as
                running.
            """

            origin = "MPIMonitorClient::start_monitor_status_service"

            if self.__monitor_status_service_running:
                casalog.post("MPI monitor status service is already running","WARN",origin)
                return True

            try:
                self.__monitor_status_service_on = True
                self.__monitor_status_service_thread = thread.start_new_thread(self.__monitor_status_service, ())
            except:
                failure_details = traceback.format_exc()
                # Roll the flags back so a later start attempt begins cleanly
                self.__monitor_status_service_on = False
                self.__monitor_status_service_running = False
                casalog.post("Exception starting MPI monitor status service: %s" 
                             % str(failure_details),"SEVERE",origin)
                return False

            # Wait for the thread to report in
            while not self.__monitor_status_service_running:
                time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

            casalog.post("MPI monitor status service started","INFO",origin)

            return True
        
        
        def __stop_monitor_status_service(self):
            """Request the monitor status service to stop and wait for it to finish.

            Posts a warning and returns immediately when the service is not
            running. Otherwise clears the on-flag and polls until the running
            flag is cleared. Always returns None.
            """

            casalog_call_origin = "MPIMonitorClient::stop_monitor_status_service"

            if not self.__monitor_status_service_running:
                # The warning must name the monitor status service (it used to
                # report the ping status response handler service by mistake)
                casalog.post("MPI monitor status service is not running","WARN",casalog_call_origin)
                return

            # Lower the on-flag to request the service to stop
            self.__monitor_status_service_on = False

            # Wait until the running flag is cleared
            while self.__monitor_status_service_running:
                time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

            casalog.post("MPI monitor status service stopped","INFO",casalog_call_origin)
                
                
        def __ping_status_response_handler_service(self):
            """Service loop that collects ping status responses from the servers.

            While the on-flag is set, probes the communicator for a pending
            ping status response. When one is available it is received and the
            matching entry of the server status list is updated; otherwise the
            loop sleeps before probing again. The running flag is set on entry
            and cleared when the loop ends.
            """

            casalog_call_origin = "MPIMonitorClient::ping_status_response_handler_service"

            # Flag the service as running
            self.__ping_status_response_handler_service_running = True

            while self.__ping_status_response_handler_service_on:

                # Probe for a pending response; a failed probe counts as "nothing available"
                try:
                    response_ready = self.__communicator.ping_status_response_probe()
                except:
                    response_ready = False
                    casalog.post("Exception checking if ping status response msg is available: %s"
                                 % str(traceback.format_exc()),"SEVERE",casalog_call_origin)

                # Nothing to receive: sleep and probe again
                if not response_ready:
                    time.sleep(MPIEnvironment.mpi_ping_status_response_handler_service_sleep_time)
                    continue

                # Receive the response and record it in the status entry of its rank
                try:
                    response = self.__communicator.ping_status_response_recv()
                    received_at = time.time()
                    rank = response['rank']
                    rank_status = self.__server_status_list[rank]
                    rank_status['command'] = response['command']
                    rank_status['command_start_time'] = response['command_start_time']
                    rank_status['pong_time'] = received_at
                    rank_status['pong_pending'] = False
                    elapsed_time = received_at - rank_status['ping_time']
                    # A response arriving after the timeout was flagged clears the flag and is reported
                    if rank_status['timeout']:
                        rank_status['timeout'] = False
                        casalog.post("Ping status response from server %s finally received after %ss" %
                                     (str(rank),str(int(elapsed_time))),"WARN",casalog_call_origin)
                except:
                    casalog.post("Exception receiving ping status response msg: %s"
                                 % str(traceback.format_exc()),"SEVERE",casalog_call_origin)

            # Flag the service as no longer running
            self.__ping_status_response_handler_service_running = False

            
        def __start_ping_status_response_handler_service(self):
            """Launch the ping status response handler service in a new thread.

            Returns True if the service was already running, or once the
            service thread has raised the running flag. Returns False if
            starting the thread raised an exception.
            """

            casalog_call_origin = "MPIMonitorClient::start_ping_status_response_handler_service"

            # Nothing to launch when the running flag is already set
            if self.__ping_status_response_handler_service_running:
                casalog.post("MPI ping status response handler service is already running","WARN",casalog_call_origin)
                return True

            try:
                # The service loop runs only while the on-flag is set, so raise it before spawning
                self.__ping_status_response_handler_service_on = True
                self.__ping_status_response_handler_service_thread = thread.start_new_thread(self.__ping_status_response_handler_service, ())
            except:
                failure_report = str(traceback.format_exc())
                # Roll both flags back so a later start attempt is possible
                self.__ping_status_response_handler_service_on = False
                self.__ping_status_response_handler_service_running = False
                casalog.post("Exception starting MPI ping status response handler service: %s"
                             % failure_report,"SEVERE",casalog_call_origin)
                return False
            else:
                # Poll until the service thread raises the running flag
                while not self.__ping_status_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

                casalog.post("MPI ping status response handler service started","INFO",casalog_call_origin)
                return True
        
        
        def __stop_ping_status_response_handler_service(self):
            """Request the ping status response handler service to stop and wait for it.

            Posts a warning when the service is not running. Otherwise clears
            the on-flag (which ends the service loop) and polls until the
            service clears its running flag. Always returns None.
            """

            casalog_call_origin = "MPIMonitorClient::stop_ping_status_response_handler_service"

            if self.__ping_status_response_handler_service_running:
                # Lower the on-flag so the service loop terminates
                self.__ping_status_response_handler_service_on = False

                # Wait for the service to acknowledge by clearing the running flag
                while self.__ping_status_response_handler_service_running:
                    time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

                casalog.post("MPI ping status response handler service stopped","INFO",casalog_call_origin)
            else:
                casalog.post("MPI ping status response handler service is not running","WARN",casalog_call_origin)
            
            
        ################################################################################################################            
        # Public methods ###############################################################################################
        ################################################################################################################
        
                    
        def start_services(self):
            """Start the ping status response handler service, then the monitor status service."""

            # The response handler is started first, the monitor second
            for launch_service in (self.__start_ping_status_response_handler_service,
                                   self.__start_monitor_status_service):
                launch_service()
        
        
        def stop_services(self):
            """Stop the monitor status service, then the ping status response handler service."""

            # Reverse of the start order: the monitor is stopped first, the response handler second
            for halt_service in (self.__stop_monitor_status_service,
                                 self.__stop_ping_status_response_handler_service):
                halt_service()
            
            
        def get_server_status(self,server=None):
            """Return a copy of the status information held for the servers.

            Parameters:
                server: Key (rank) of the server to look up, or None for all servers.

            Returns:
                With server=None, a shallow copy of the whole server status
                dict (the per-server dicts inside it are not copied).
                With a known server, a copy of that server's status dict.
                With an unknown server, a warning is posted and None is returned.
            """

            casalog_call_origin = "MPIMonitorClient::get_server_status"

            if server is None:
                return dict(self.__server_status_list)

            # The "in" operator replaces the deprecated dict.has_key(), which Python 3 removed
            if server in self.__server_status_list:
                return dict(self.__server_status_list[server])

            casalog.post("Server n# %s is out of range" % str(server),"WARN",casalog_call_origin)
            return None
            
            
        def get_server_status_keyword(self,server,keyword):
            """Return a single status value of a single server.

            Parameters:
                server: Key (rank) of the server in the server status list.
                keyword: Name of the status entry to read.

            Returns:
                The stored value. When either the server or the keyword is
                unknown, a warning is posted and None is returned.
            """

            casalog_call_origin = "MPIMonitorClient::get_server_status_keyword"

            # The "in" operator replaces the deprecated dict.has_key(), which Python 3 removed
            if server not in self.__server_status_list:
                casalog.post("Server n# %s is out of range" % str(server),"WARN",casalog_call_origin)
                return None

            server_status = self.__server_status_list[server]
            if keyword not in server_status:
                casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)
                return None

            return server_status[keyword]
            
            
        def set_server_status_keyword(self,server,keyword,value):
            """Overwrite a single status value of a single server.

            Only existing keywords of existing servers can be set; new entries
            are never created. When either the server or the keyword is
            unknown, a warning is posted and nothing is modified.

            Parameters:
                server: Key (rank) of the server in the server status list.
                keyword: Name of the status entry to overwrite.
                value: New value to store.

            Returns:
                None in every case.
            """

            casalog_call_origin = "MPIMonitorClient::set_server_status_keyword"

            # The "in" operator replaces the deprecated dict.has_key(), which Python 3 removed
            if server not in self.__server_status_list:
                casalog.post("Server n# %s is out of range" % str(server),"WARN",casalog_call_origin)
                return

            server_status = self.__server_status_list[server]
            if keyword not in server_status:
                casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)
                return

            server_status[keyword] = value
         
        
        def get_server_rank_available(self,verbose=False):
            """Return the ranks of the servers that are neither busy nor in timeout.

            The verbose argument is accepted but not used.
            """

            status_by_rank = self.__server_status_list

            # Available means: 'busy' is false and 'timeout' is false
            return [rank for rank in status_by_rank
                    if not status_by_rank[rank]['busy'] and not status_by_rank[rank]['timeout']]
        
        
        def get_server_rank_online(self,verbose=False):
            """Return the ranks of the servers that are not flagged as timed out.

            The verbose argument is accepted but not used.
            """

            status_by_rank = self.__server_status_list

            # Online means: 'timeout' is false, regardless of the 'busy' state
            return [rank for rank in status_by_rank if not status_by_rank[rank]['timeout']]
        
        
        def get_server_timeout(self):
            """Return the ranks of the servers whose 'timeout' flag is exactly True."""

            status_by_rank = self.__server_status_list

            # Identity comparison on purpose: only the True singleton counts, not any truthy value
            return [rank for rank in status_by_rank if status_by_rank[rank]['timeout'] is True]