def __init__(self, start_services=True):
    """Build the per-server status table and service flags.

    One status record is created for every rank returned by
    MPIEnvironment.mpi_server_rank_list(). When `start_services` is true,
    self.start_services() is called at the end of construction.
    """
    # Server status state: one record per MPI server rank
    self.__server_status_list = {}
    for server_rank in MPIEnvironment.mpi_server_rank_list():
        self.__server_status_list[server_rank] = {
            'rank': server_rank,
            'processor': None,
            'pid': None,
            'busy': False,
            'command': None,
            'command_start_time': None,
            'pong_pending': False,
            'ping_time': None,
            'pong_time': None,
            'timeout': False,
        }

    # Monitor service state
    self.__monitor_status_service_on = False
    self.__monitor_status_service_running = False
    self.__monitor_status_service_thread = None

    # Ping status response handler service state
    self.__ping_status_response_handler_service_on = False
    self.__ping_status_response_handler_service_running = False
    self.__ping_status_response_handler_service_thread = None

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # Services are started last, once all state above exists
    if start_services:
        self.start_services()
def __init__(self, start_services=True):
    """Record this process' status and reset the ping handler state.

    The status record takes rank and hostname from MPIEnvironment and the
    pid from os.getpid(). When `start_services` is true,
    self.start_services() is called at the end of construction.
    """
    # Status state dict describing this process
    self.__status = {
        'rank': MPIEnvironment.mpi_processor_rank,
        'processor': MPIEnvironment.hostname,
        'pid': os.getpid(),
        'busy': False,
        'command': None,
        'command_start_time': None,
        'command_stop_time': None,
    }

    # Ping status request handler service state
    self.__ping_status_request_handler_service_on = False
    self.__ping_status_request_handler_service_final_round = False
    self.__ping_status_request_handler_service_running = False
    self.__ping_status_request_handler_service_thread = None
    self.__last_ping_status_request_time = None
    self.__client_timeout = False

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # Services are started last, once all state above exists
    if start_services:
        self.start_services()
def __init__(self, start_services=False):
    """Reset command bookkeeping and service state, then register an exit hook.

    When `start_services` is true, self.start_services() is called before
    the log mode is set and before the atexit handler is registered; that
    ordering is kept exactly as it was.
    """
    # Life cycle state:
    #   0 -> Services not started
    #   1 -> Services started
    #   2 -> Stop service signal sent
    self.__life_cycle_state = 0

    # Command request id counter plus request/response tables
    self.__command_request_counter = 1
    self.__command_request_list = {}
    self.__command_response_list = {}

    # Command group response state
    self.__command_group_response_counter = 1
    self.__command_group_response_list = {}

    # Command response handler service state (event starts cleared)
    self.__command_response_handler_service_on = False
    self.__command_response_handler_service_running = False
    self.__command_response_handler_service_thread = None
    response_handler_event = threading.Event()
    response_handler_event.clear()
    self.__command_response_handler_service_event_controller = response_handler_event

    # Command request queue service state (event starts cleared)
    self.__command_request_queue = []
    self.__command_request_queue_service_on = False
    self.__command_request_queue_service_running = False
    self.__command_request_queue_service_thread = None
    request_queue_event = threading.Event()
    request_queue_event.clear()
    self.__command_request_queue_service_event_controller = request_queue_event

    # Input queue where jobs are appended to be picked up by the
    # command request queue service, guarded by its own lock
    self.__command_request_input_queue = []
    self.__command_request_input_queue_lock = threading.Lock()

    # MPICommunicator and MPIMonitorClient references
    self.__communicator = MPICommunicator()
    self.__monitor_client = MPIMonitorClient(False)

    # Automatically start services
    if start_services:
        self.start_services()

    # Log mode
    self.__log_mode = 'unified'

    # Exit handler. NOTE: It is not guaranteed that __del__() methods are
    # called for objects that still exist when the interpreter exits.
    atexit.register(self.stop_services, force_command_request_interruption=True)
def __init__(self, start_services=True):
    """Create the local status record and the ping request handler state.

    Rank and hostname are read from MPIEnvironment, the pid from
    os.getpid(). self.start_services() runs last when `start_services`
    is true.
    """
    # Status state dict
    self.__status = {
        'rank': MPIEnvironment.mpi_processor_rank,
        'processor': MPIEnvironment.hostname,
        'pid': os.getpid(),
        'busy': False,
        'command': None,
        'command_start_time': None,
        'command_stop_time': None,
    }

    # Ping status request handler service state
    self.__ping_status_request_handler_service_thread = None
    self.__ping_status_request_handler_service_on = False
    self.__ping_status_request_handler_service_final_round = False
    self.__ping_status_request_handler_service_running = False
    self.__last_ping_status_request_time = None
    self.__client_timeout = False

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # Automatically start services
    if start_services:
        self.start_services()
def __init__(self, start_services=True):
    """Create one status record per MPI server rank and reset service state.

    Ranks come from MPIEnvironment.mpi_server_rank_list().
    self.start_services() runs last when `start_services` is true.
    """
    # Server status state, keyed by server rank
    self.__server_status_list = {}
    for rank in MPIEnvironment.mpi_server_rank_list():
        record = {}
        record['rank'] = rank
        record['processor'] = None
        record['pid'] = None
        record['busy'] = False
        record['command'] = None
        record['command_start_time'] = None
        record['pong_pending'] = False
        record['ping_time'] = None
        record['pong_time'] = None
        record['timeout'] = False
        self.__server_status_list[rank] = record

    # Monitor service state
    self.__monitor_status_service_thread = None
    self.__monitor_status_service_on = False
    self.__monitor_status_service_running = False

    # Ping status response handler service state
    self.__ping_status_response_handler_service_thread = None
    self.__ping_status_response_handler_service_on = False
    self.__ping_status_response_handler_service_running = False

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # Automatically start services
    if start_services:
        self.start_services()
def __init__(self, start_services=False):
    """Initialise command client state and register stop_services at exit.

    self.start_services() is invoked (when requested) before the log mode
    assignment and the atexit registration, matching the existing order.
    """
    # Life cycle state
    #   0 -> Services not started
    #   1 -> Services started
    #   2 -> Stop service signal sent
    self.__life_cycle_state = 0

    # Command request id counter and lists
    self.__command_request_counter = 1
    self.__command_request_list = {}
    self.__command_response_list = {}

    # Command group response state
    self.__command_group_response_counter = 1
    self.__command_group_response_list = {}

    # Command response handler service state
    self.__command_response_handler_service_on = False
    self.__command_response_handler_service_running = False
    self.__command_response_handler_service_thread = None
    handler_event = threading.Event()
    handler_event.clear()
    self.__command_response_handler_service_event_controller = handler_event

    # Command request queue service state
    self.__command_request_queue = []
    self.__command_request_queue_service_on = False
    self.__command_request_queue_service_running = False
    self.__command_request_queue_service_thread = None
    queue_event = threading.Event()
    queue_event.clear()
    self.__command_request_queue_service_event_controller = queue_event

    # Command request input queue: jobs are appended here to be picked up
    # by the command request queue service
    self.__command_request_input_queue = []
    self.__command_request_input_queue_lock = threading.Lock()

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # MPIMonitorClient reference
    self.__monitor_client = MPIMonitorClient(False)

    # Automatically start services
    if start_services:
        self.start_services()

    # Log mode
    self.__log_mode = "unified"

    # Register exit handler
    # NOTE: It is not guaranteed that __del__() methods are called
    # for objects that still exist when the interpreter exits.
    atexit.register(self.stop_services, force_command_request_interruption=True)
def __init__(self, start_services=False):
    """Reset command server state and open the log file for appending.

    The log file path is taken from casalog.logfile(). When
    `start_services` is true, self.start_services() is called last.
    """
    # Command request handler service state
    self.__command_request_handler_service_thread = None
    self.__command_request_handler_service_on = False
    self.__command_request_handler_service_running = False

    # MPICommunicator reference
    self.__communicator = MPICommunicator()

    # MPIMonitorServer reference
    self.__monitor_server = MPIMonitorServer(False)

    # Logfile descriptor, opened in append mode
    logfile_path = casalog.logfile()
    self.__logfile_descriptor = open(logfile_path, 'a')

    # Virtual frame buffer state
    self.__virtual_frame_buffer_port = None
    self.__virtual_frame_buffer_process = None

    # Automatically start services
    if start_services:
        self.start_services()
def __init__(self, start_services=False):
    """Initialise handler flags, collaborators, log file and frame buffer state.

    The log file named by casalog.logfile() is opened in append mode.
    self.start_services() runs last when `start_services` is true.
    """
    # Command request handler service state
    self.__command_request_handler_service_on = False
    self.__command_request_handler_service_running = False
    self.__command_request_handler_service_thread = None

    # Collaborators: communicator and monitor server
    self.__communicator = MPICommunicator()
    self.__monitor_server = MPIMonitorServer(False)

    # Logfile descriptor
    self.__logfile_descriptor = open(casalog.logfile(), "a")

    # Virtual frame buffer state
    self.__virtual_frame_buffer_process = None
    self.__virtual_frame_buffer_port = None

    # Automatically start services
    if start_services:
        self.start_services()
class __MPICommandServerImpl:
    """
    Implementation of the MPICommandServer singleton interface

    The server runs a command request handler service in a background
    thread, deploys a virtual frame buffer (Xvfb) and, in serve(), loops on
    control service messages until a stop is requested or the monitor
    server reports a client timeout.

    NOTE(review): command and control requests are run through
    eval()/exec(); this assumes the requests come from a trusted
    MPICommandClient -- confirm before exposing to other senders.
    """

    def __init__(self, start_services=False):
        """Initialise service state and optionally start the services.

        Args:
            start_services: when true, start_services() is called at the
                end of construction.
        """
        # Initialize command request handler service state
        self.__command_request_handler_service_on = False
        self.__command_request_handler_service_running = False
        self.__command_request_handler_service_thread = None

        # Instantiate MPICommunicator reference
        self.__communicator = MPICommunicator()

        # Instantiate MPIMonitorServer reference
        self.__monitor_server = MPIMonitorServer(False)

        # Initialize logfile descriptor
        self.__logfile_descriptor = open(casalog.logfile(), "a")

        # Initialize virtual frame buffer state. The xauth file is created by
        # start_virtual_frame_buffer(); it is set to None here so that
        # stop_virtual_frame_buffer() is safe to call before a start.
        self.__virtual_frame_buffer_port = None
        self.__virtual_frame_buffer_process = None
        self.__xauthfile = None

        # Automatically start services
        if start_services:
            self.start_services()

    ################################################################################################################
    # Private methods ##############################################################################################
    ################################################################################################################

    def __command_request_handler_service(self):
        """Service loop: probe, receive, run and answer command requests.

        Runs until the service "on" flag is cleared. Every received request
        is answered with a response dict (a copy of the request plus timing,
        'ret', 'successful' and 'traceback'), whether or not the command
        raised.
        """
        casalog_call_origin = "MPICommandServer::command_request_handler_service"

        # Mark service as running
        self.__command_request_handler_service_running = True

        while self.__command_request_handler_service_on:
            # First check if there is a command request msg available
            msg_available = False
            try:
                msg_available = self.__communicator.command_request_probe()
            except Exception as instance:
                casalog.post(
                    "Exception checking if command request msg is available: %s" % str(instance),
                    "SEVERE",
                    casalog_call_origin,
                )
                msg_available = False

            # Then receive command request msg
            msg_received = False
            if msg_available:
                try:
                    command_request = self.__communicator.command_request_recv()
                    casalog.post(
                        "Received command request msg: %s" % command_request["command"],
                        MPIEnvironment.command_handling_log_level,
                        casalog_call_origin,
                    )
                    msg_received = True
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception receiving command request msg: %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )
                    msg_received = False

            # Finally process command request and send back response
            if msg_received:
                # Start timer
                command_start_time = time.time()

                # Update server state
                self.__monitor_server.set_status("busy", True)
                self.__monitor_server.set_status("command", command_request["command"])
                self.__monitor_server.set_status("command_start_time", command_start_time)

                # Get command request id
                command_request_id = command_request["id"]

                # Prepare command response
                command_response = dict(command_request)

                # Set command start time
                command_response["command_start_time"] = command_start_time

                # Execute/Evaluate command request
                try:
                    # Add dict-defined parameters to globals
                    if isinstance(command_request["parameters"], dict):
                        globals().update(command_request["parameters"])

                    # Execute command
                    if command_request["mode"] == "eval":
                        casalog.post(
                            "Going to evaluate command request with id# %s as an expression via eval: %s"
                            % (str(command_request_id), str(command_request["command"])),
                            MPIEnvironment.command_handling_log_level,
                            casalog_call_origin,
                        )
                        command_response["ret"] = eval(command_request["command"])
                    elif command_request["mode"] == "exec":
                        casalog.post(
                            "Going to execute command request with id# %s as a statement via exec: %s"
                            % (str(command_request_id), command_request["command"]),
                            MPIEnvironment.command_handling_log_level,
                            casalog_call_origin,
                        )
                        code = compile(command_request["command"], casalog_call_origin, "exec")
                        exec(code)
                        command_response["ret"] = None
                    elif command_request["mode"] == "push":
                        casalog.post(
                            "Command request with id# %s is a push operation" % str(command_request_id),
                            MPIEnvironment.command_handling_log_level,
                            casalog_call_origin,
                        )
                        command_response["ret"] = None

                    # Set command response parameters
                    command_response["successful"] = True
                    command_response["traceback"] = None
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception executing command request via %s: %s"
                        % (command_request["mode"], str(formatted_traceback)),
                        "SEVERE",
                        casalog_call_origin,
                    )
                    # Set command response parameters
                    command_response["successful"] = False
                    command_response["traceback"] = formatted_traceback
                    command_response["ret"] = None
                # Variables are cleaned from the environment regardless of the result
                finally:
                    # Clear parameter variables ("push" parameters are meant to stay in globals)
                    if isinstance(command_request["parameters"], dict) and command_request["mode"] != "push":
                        for parameter in command_request["parameters"]:
                            try:
                                del globals()[parameter]
                            except:
                                formatted_traceback = traceback.format_exc()
                                casalog.post(
                                    "Exception deleting parameter variable '%s' from global environment: %s"
                                    % (str(parameter), str(formatted_traceback)),
                                    "WARN",
                                    casalog_call_origin,
                                )

                # Set command stop time
                command_stop_time = time.time()
                command_response["command_stop_time"] = command_stop_time

                # Update server state
                self.__monitor_server.set_status("busy", False)
                self.__monitor_server.set_status("command", None)
                self.__monitor_server.set_status("command_start_time", None)

                # Send response back (successful or not)
                try:
                    casalog.post(
                        "Command request with id %s successfully processed in %s mode, sending back response ..."
                        % (str(command_response["id"]), str(command_response["mode"])),
                        MPIEnvironment.command_handling_log_level,
                        casalog_call_origin,
                    )
                    self.__communicator.command_response_send(response=command_response)
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception sending back command response: %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )
            else:
                # Nothing to process: back off before probing again
                time.sleep(MPIEnvironment.mpi_command_request_handler_service_sleep_time)

        # Mark service as not running
        self.__command_request_handler_service_running = False

    def __start_command_request_handler_service(self):
        """Start the handler thread and wait until it reports running.

        Returns:
            True if the service is (already) running, False if the thread
            could not be started.
        """
        casalog_call_origin = "MPICommandServer::start_command_request_handler_service"

        if self.__command_request_handler_service_running:
            casalog.post("MPI command request handler service is already running", "WARN", casalog_call_origin)
            return True

        try:
            self.__command_request_handler_service_on = True
            self.__command_request_handler_service_thread = thread.start_new_thread(
                self.__command_request_handler_service, ()
            )
        except Exception as instance:
            self.__command_request_handler_service_on = False
            self.__command_request_handler_service_running = False
            casalog.post(
                "Exception starting MPI command request handler service: %s" % str(instance),
                "SEVERE",
                casalog_call_origin,
            )
            return False

        # The service thread sets the running flag itself; poll until it does
        while not self.__command_request_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI command request handler service started", "INFO", casalog_call_origin)

        return True

    def __stop_command_request_handler_service(self):
        """Clear the "on" flag and wait until the service loop has exited."""
        casalog_call_origin = "MPICommandServer::stop_command_request_handler_service"

        if not self.__command_request_handler_service_running:
            casalog.post("MPI command request handler service is not running", "WARN", casalog_call_origin)
            return

        self.__command_request_handler_service_on = False

        while self.__command_request_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI command request handler service stopped", "INFO", casalog_call_origin)

    ################################################################################################################
    # Public methods ###############################################################################################
    ################################################################################################################

    def start_virtual_frame_buffer(self):
        """Deploy an Xvfb instance and point DISPLAY/XAUTHORITY at it.

        The display number starts at this process' pid and is incremented
        while a matching /tmp/.X<n>-lock file exists. Failures to launch
        Xvfb are logged and leave the frame buffer process set to None.
        """
        casalog_call_origin = "MPICommandServer::start_virtual_frame_buffer"

        displayport = os.getpid()
        while os.path.exists("/tmp/.X%d-lock" % displayport):
            displayport += 1

        self.__virtual_frame_buffer_port = ":%d" % displayport

        self.__xauthfile = tempfile.NamedTemporaryFile()

        try:
            cookie = subprocess.check_output(["mcookie"], universal_newlines=True).strip()
        except:
            # mcookie not usable: fall back to a uuid-derived cookie
            cookie = str(uuid.uuid4()).replace("-", "")

        # sometimes also works without auth, so accept failure
        subprocess.call(
            ["xauth", "-f", self.__xauthfile.name, "add", self.__virtual_frame_buffer_port, ".", cookie],
            stdout=self.__logfile_descriptor,
            stderr=self.__logfile_descriptor,
        )

        try:
            self.__virtual_frame_buffer_process = subprocess.Popen(
                ["Xvfb", self.__virtual_frame_buffer_port, "-auth", self.__xauthfile.name],
                stdout=self.__logfile_descriptor,
                stderr=self.__logfile_descriptor,
                shell=False,
            )
            os.environ["DISPLAY"] = self.__virtual_frame_buffer_port
            os.environ["XAUTHORITY"] = self.__xauthfile.name
            casalog.post(
                "Deployed virtual frame buffer at %s with pid %s"
                % (self.__virtual_frame_buffer_port, str(self.__virtual_frame_buffer_process.pid)),
                "INFO",
                casalog_call_origin,
            )
        except:
            self.__virtual_frame_buffer_process = None
            formatted_traceback = traceback.format_exc()
            casalog.post(
                "Exception deploying virtual frame buffer at %s: %s"
                % (self.__virtual_frame_buffer_port, str(formatted_traceback)),
                "SEVERE",
                casalog_call_origin,
            )

    def stop_virtual_frame_buffer(self):
        """Terminate the Xvfb process (if any) and remove its xauth entry.

        Safe to call when the frame buffer was never started or was already
        stopped: the xauth cleanup is skipped when no xauth file exists.
        """
        casalog_call_origin = "MPICommandServer::stop_virtual_frame_buffer"

        if self.__virtual_frame_buffer_process is not None:
            try:
                self.__virtual_frame_buffer_process.terminate()
                casalog.post(
                    "Virtual frame buffer deployed at %s with pid %s successfully shutdown"
                    % (self.__virtual_frame_buffer_port, str(self.__virtual_frame_buffer_process.pid)),
                    "DEBUG",
                    casalog_call_origin,
                )
                self.__virtual_frame_buffer_process = None
            except:
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception shutting down virtual frame buffer deployed at %s with pid %s: %s"
                    % (
                        self.__virtual_frame_buffer_port,
                        str(self.__virtual_frame_buffer_process.pid),
                        str(formatted_traceback),
                    ),
                    "SEVERE",
                    casalog_call_origin,
                )
        else:
            casalog.post("Virtual frame buffer not deployed", "WARN", casalog_call_origin)

        # The xauth file only exists after start_virtual_frame_buffer(); without
        # this guard a stop-before-start (or a second stop) raised AttributeError.
        if self.__xauthfile is not None:
            subprocess.call(
                ["xauth", "-f", self.__xauthfile.name, "remove", self.__virtual_frame_buffer_port],
                stdout=self.__logfile_descriptor,
                stderr=self.__logfile_descriptor,
            )
            self.__xauthfile.close()
            self.__xauthfile = None

    def start_services(self):
        """Start the monitor server, the command handler and the frame buffer."""
        self.__monitor_server.start_services()
        self.__start_command_request_handler_service()
        self.start_virtual_frame_buffer()

    def stop_services(self, force_command_request_interruption=False):
        """Close the log file and stop the monitor and command services.

        Args:
            force_command_request_interruption: when true, the command
                request handler service is not waited for.

        NOTE(review): the virtual frame buffer started by start_services()
        is not stopped here -- confirm whether that is intentional.
        """
        if self.__logfile_descriptor is not None:
            self.__logfile_descriptor.close()
            self.__logfile_descriptor = None

        self.__monitor_server.stop_services()

        if not force_command_request_interruption:
            self.__stop_command_request_handler_service()

    def serve(self):
        """Start services and process control signals until told to stop.

        The loop ends when a control command sets `stop_service_requested`
        or when the monitor server reports a client timeout. Afterwards the
        services are stopped and, if the stop request asked for it, the MPI
        environment is finalized.
        """
        casalog_call_origin = "MPICommandServer::serve"

        # First start command and ping status services
        casalog.post("Starting services...", "INFO", casalog_call_origin)
        self.start_services()

        # Notify to MPICommandClient that service is up and running
        self.__communicator.control_service_response_send(response=self.__monitor_server.get_status())

        # Keep serving until a stop signal service is received
        control_service_request = {}
        stop_service_requested = False
        while (not stop_service_requested) and (not self.__monitor_server.get_client_timeout()):
            # Check if there is an incoming control service msg
            msg_available = False
            try:
                msg_available = self.__communicator.control_service_request_probe()
            except:
                msg_available = False
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception checking if control service msg is available: %s" % str(formatted_traceback),
                    "SEVERE",
                    casalog_call_origin,
                )

            if msg_available:
                # Receive control service msg
                msg_received = False
                control_service_request = {}
                try:
                    control_service_request = self.__communicator.control_service_request_recv()
                    msg_received = True
                except:
                    msg_received = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post(
                        "Exception receiving control service msg: %s" % str(formatted_traceback),
                        "SEVERE",
                        casalog_call_origin,
                    )
                    continue

                # Process control service msg
                cmd = None
                send_response = False
                if msg_received:
                    try:
                        cmd = control_service_request["command"]
                        send_response = control_service_request["send_response"]
                        code = compile(cmd, casalog_call_origin, "exec")
                        # A bare exec(code) cannot rebind function locals on
                        # Python 3, so the stop flag set by the control command
                        # would be lost. Run the command in an explicit copy of
                        # the local namespace and read the flag back from it.
                        namespace = dict(locals())
                        exec(code, globals(), namespace)
                        stop_service_requested = namespace.get(
                            "stop_service_requested", stop_service_requested
                        )
                        casalog.post(
                            "Control signal %s successfully handled by server %s"
                            % (str(cmd), str(MPIEnvironment.mpi_processor_rank)),
                            "INFO",
                            casalog_call_origin,
                        )
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception handling control signal command %s in server %s: %s"
                            % (
                                str(control_service_request),
                                str(MPIEnvironment.mpi_processor_rank),
                                str(formatted_traceback),
                            ),
                            "SEVERE",
                            casalog_call_origin,
                        )

                # Notify to MPICommandClient that control signal has been processed
                if send_response:
                    try:
                        self.__communicator.control_service_response_send(
                            response=self.__monitor_server.get_status()
                        )
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post(
                            "Exception sending response to control signal command %s in server %s: %s"
                            % (str(cmd), str(MPIEnvironment.mpi_processor_rank), str(formatted_traceback)),
                            "SEVERE",
                            casalog_call_origin,
                        )

            time.sleep(MPIEnvironment.mpi_stop_service_sleep_time)

        # Only a stop request carries a finalize flag. Defaulting to False keeps
        # the client-timeout path from hitting an unbound local below.
        # NOTE(review): confirm that skipping MPI finalization is the desired
        # behaviour when the client has timed out.
        finalize_mpi_environment = False

        # Process stop service request
        if stop_service_requested:
            # Check if force mode is needed
            force_command_request_interruption = control_service_request["force_command_request_interruption"]
            finalize_mpi_environment = control_service_request["finalize_mpi_environment"]
            busy = self.__monitor_server.get_status("busy")
            if force_command_request_interruption and busy:
                casalog.post(
                    "force-stop service signal received, stopping services, "
                    + "command request handler service will be interrupted...",
                    "INFO",
                    casalog_call_origin,
                )
            else:
                force_command_request_interruption = False
                casalog.post("stop service signal received, stopping services...", "INFO", casalog_call_origin)
        else:
            force_command_request_interruption = True
            casalog.post(
                "client timeout, forcing disconnection, " + "command request handler service will be interrupted..",
                "INFO",
                casalog_call_origin,
            )

        # Stop services
        self.stop_services(force_command_request_interruption)

        # Finalize MPI environment
        if finalize_mpi_environment:
            try:
                casalog.post("Going to finalize MPI environment", "INFO", casalog_call_origin)
                MPIEnvironment.finalize_mpi_environment()
            except:
                formatted_traceback = traceback.format_exc()
                casalog.post(
                    "Exception finalizing MPI environment %s" % str(formatted_traceback),
                    "SEVERE",
                    casalog_call_origin,
                )

        # Exit
        casalog.post("Exiting", "INFO", casalog_call_origin)
class __MPIMonitorClientImpl: """ Implementation of the MPIMonitorClient singleton interface """ def __init__(self, start_services=True): # Initialize server status state self.__server_status_list = {} mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() for rank in mpi_server_rank_list: self.__server_status_list[rank] = {} self.__server_status_list[rank]['rank'] = rank self.__server_status_list[rank]['processor'] = None self.__server_status_list[rank]['pid'] = None self.__server_status_list[rank]['busy'] = False self.__server_status_list[rank]['command'] = None self.__server_status_list[rank]['command_start_time'] = None self.__server_status_list[rank]['pong_pending'] = False self.__server_status_list[rank]['ping_time'] = None self.__server_status_list[rank]['pong_time'] = None self.__server_status_list[rank]['timeout'] = False # Initialize monitor service state self.__monitor_status_service_on = False self.__monitor_status_service_running = False self.__monitor_status_service_thread = None # Initialize ping status response handler service state self.__ping_status_response_handler_service_on = False self.__ping_status_response_handler_service_running = False self.__ping_status_response_handler_service_thread = None # Instantiate MPICommunicator reference self.__communicator = MPICommunicator() # Automatically start services if start_services: self.start_services() ################################################################################################################ # Private methods ############################################################################################## ################################################################################################################ def __monitor_status_service(self): casalog_call_origin = "MPIMonitorClient::monitor_status_service" # Mark service as running self.__monitor_status_service_running = True mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() while 
(self.__monitor_status_service_on): # Iterate over servers for rank in mpi_server_rank_list: # Send ping status request if there is none pending if not self.__server_status_list[rank]['pong_pending']: try: self.__communicator.ping_status_request_send( server=rank) self.__server_status_list[rank][ 'ping_time'] = time.time() self.__server_status_list[rank][ 'pong_pending'] = True self.__server_status_list[rank]['pong_checks'] = 0 except: formatted_traceback = traceback.format_exc() casalog.post( "Exception sending ping status request to server %s: %s" % (str(rank), str(formatted_traceback)), "SEVERE", casalog_call_origin) else: self.__server_status_list[rank]['pong_checks'] += 1 elapsed_time = MPIEnvironment.mpi_monitor_status_service_heartbeat elapsed_time *= self.__server_status_list[rank][ 'pong_checks'] # elapsed_time = int(round(time.time() - self.__server_status_list[rank]['ping_time'])) # Notify when a server reaches timeout condition if (MPIEnvironment. mpi_monitor_status_service_timeout_enabled and (elapsed_time > MPIEnvironment.mpi_monitor_status_service_timeout) and (not self.__server_status_list[rank]['timeout'])): casalog.post( "Ping status response from server %s not received " "in the last %ss. 
Setting its status to 'timeout'" % (str(rank), str(int(elapsed_time))), "SEVERE", casalog_call_origin) self.__server_status_list[rank]['timeout'] = True # Sleep before next round time.sleep(MPIEnvironment.mpi_monitor_status_service_heartbeat) # Mark service as not running self.__monitor_status_service_running = False def __start_monitor_status_service(self): casalog_call_origin = "MPIMonitorClient::start_monitor_status_service" if self.__monitor_status_service_running: casalog.post("MPI monitor status service is already running", "WARN", casalog_call_origin) return True try: self.__monitor_status_service_on = True self.__monitor_status_service_thread = thread.start_new_thread( self.__monitor_status_service, ()) except: formatted_traceback = traceback.format_exc() self.__monitor_status_service_on = False self.__monitor_status_service_running = False casalog.post( "Exception starting MPI monitor status service: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin) return False while (not self.__monitor_status_service_running): time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI monitor status service started", "INFO", casalog_call_origin) return True def __stop_monitor_status_service(self): casalog_call_origin = "MPIMonitorClient::stop_monitor_status_service" if not self.__monitor_status_service_running: casalog.post( "MPI ping status response handler service is not running", "WARN", casalog_call_origin) return self.__monitor_status_service_on = False while (self.__monitor_status_service_running): time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI monitor status service stopped", "INFO", casalog_call_origin) def __ping_status_response_handler_service(self): casalog_call_origin = "MPIMonitorClient::ping_status_response_handler_service" # Mark service as running self.__ping_status_response_handler_service_running = True while (self.__ping_status_response_handler_service_on): # First check if there is a 
ping_status response msg available msg_available = False try: msg_available = self.__communicator.ping_status_response_probe( ) except: msg_available = False formatted_traceback = traceback.format_exc() casalog.post( "Exception checking if ping status response msg is available: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin) # Then receive, store and post ping_status response msg if (msg_available): try: ping_status_response = self.__communicator.ping_status_response_recv( ) pong_time = time.time() rank = ping_status_response['rank'] self.__server_status_list[rank][ 'command'] = ping_status_response['command'] self.__server_status_list[rank][ 'command_start_time'] = ping_status_response[ 'command_start_time'] self.__server_status_list[rank][ 'pong_time'] = pong_time self.__server_status_list[rank]['pong_pending'] = False elapsed_time = pong_time - self.__server_status_list[ rank]['ping_time'] # Notify if the response has been received after timeout if self.__server_status_list[rank]['timeout']: self.__server_status_list[rank]['timeout'] = False casalog.post( "Ping status response from server %s received after %ss" % (str(rank), str(int(elapsed_time))), "WARN", casalog_call_origin) except: formatted_traceback = traceback.format_exc() casalog.post( "Exception receiving ping status response msg: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin) else: time.sleep( MPIEnvironment. 
mpi_ping_status_response_handler_service_sleep_time) # Mark service as not running self.__ping_status_response_handler_service_running = False def __start_ping_status_response_handler_service(self): casalog_call_origin = "MPIMonitorClient::start_ping_status_response_handler_service" if self.__ping_status_response_handler_service_running: casalog.post( "MPI ping status response handler service is already running", "WARN", casalog_call_origin) return True try: self.__ping_status_response_handler_service_on = True self.__ping_status_response_handler_service_thread = thread.start_new_thread( self.__ping_status_response_handler_service, ()) except: formatted_traceback = traceback.format_exc() self.__ping_status_response_handler_service_on = False self.__ping_status_response_handler_service_running = False casalog.post( "Exception starting MPI ping status response handler service: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin) return False while (not self.__ping_status_response_handler_service_running): time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI ping status response handler service started", "INFO", casalog_call_origin) return True def __stop_ping_status_response_handler_service(self): casalog_call_origin = "MPIMonitorClient::stop_ping_status_response_handler_service" if not self.__ping_status_response_handler_service_running: casalog.post( "MPI ping status response handler service is not running", "WARN", casalog_call_origin) return self.__ping_status_response_handler_service_on = False while (self.__ping_status_response_handler_service_running): time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI ping status response handler service stopped", "INFO", casalog_call_origin) ################################################################################################################ # Public methods 
###############################################################################################
################################################################################################################
# The definitions below are the public methods of the monitor client (they
# log under the "MPIMonitorClient::..." origin) and operate on the per-rank
# status dictionaries kept in self.__server_status_list.


def start_services(self):
    """ Start the ping status response handler first, then the monitor service. """

    self.__start_ping_status_response_handler_service()
    self.__start_monitor_status_service()


def stop_services(self):
    """ Stop the services in the reverse order of start_services(). """

    self.__stop_monitor_status_service()
    self.__stop_ping_status_response_handler_service()


def get_server_status(self, server=None):
    """ Return a copy of the status of one server, or of all servers.

    server: rank of the server of interest; None (default) selects all.

    Returns a new dict. With server=None the copy is shallow: the outer
    mapping rank -> status is new, but the per-rank status dicts are the
    live ones. Returns None (after logging a warning) for an unknown rank.
    """

    casalog_call_origin = "MPIMonitorClient::get_server_status"

    if server is None:
        return dict(self.__server_status_list)

    # 'in' replaces dict.has_key(), which no longer exists in Python 3
    if server in self.__server_status_list:
        return dict(self.__server_status_list[server])

    casalog.post("Server n# %s is out of range" % str(server), "WARN",
                 casalog_call_origin)
    return None


def get_server_status_keyword(self, server, keyword):
    """ Return one status value of one server.

    Returns None (after logging a warning) when either the server rank or
    the keyword is unknown.
    """

    casalog_call_origin = "MPIMonitorClient::get_server_status_keyword"

    if server not in self.__server_status_list:
        casalog.post("Server n# %s is out of range" % str(server), "WARN",
                     casalog_call_origin)
        return None

    server_status = self.__server_status_list[server]
    if keyword not in server_status:
        casalog.post("Status keyword %s not defined" % str(keyword), "WARN",
                     casalog_call_origin)
        return None

    return server_status[keyword]


def set_server_status_keyword(self, server, keyword, value):
    """ Overwrite one existing status value of one server.

    Only keywords already present in the server status can be set; an
    unknown server rank or keyword is logged as a warning and ignored.
    """

    casalog_call_origin = "MPIMonitorClient::set_server_status_keyword"

    if server not in self.__server_status_list:
        casalog.post("Server n# %s is out of range" % str(server), "WARN",
                     casalog_call_origin)
        return

    server_status = self.__server_status_list[server]
    if keyword not in server_status:
        casalog.post("Status keyword %s not defined" % str(keyword), "WARN",
                     casalog_call_origin)
        return

    server_status[keyword] = value


def get_server_rank_available(self, verbose=False):
    """ Return the ranks of the servers that are neither busy nor in timeout.

    verbose: accepted for interface compatibility; not used here.
    """

    status_list = self.__server_status_list
    return [rank for rank in status_list
            if not (status_list[rank]['busy'] or status_list[rank]['timeout'])]


def get_server_rank_online(self, verbose=False):
    """ Return the ranks of the servers that are not in timeout.

    verbose: accepted for interface compatibility; not used here.
    """

    status_list = self.__server_status_list
    return [rank for rank in status_list if not status_list[rank]['timeout']]


def get_server_timeout(self):
    """ Return the ranks of the servers flagged as timed out, logging the count. """

    casalog_call_origin = "MPIMonitorClient::get_server_timeout"

    status_list = self.__server_status_list
    # Identity test kept on purpose: only the exact value True counts
    server_rank_timeout = [rank for rank in status_list
                           if status_list[rank]['timeout'] is True]

    casalog.post(
        'Found {} server in timeout status'.format(len(server_rank_timeout)),
        "INFO", casalog_call_origin)

    return server_rank_timeout


def start_debugging_mode(self):
    """ Enter debugging/development mode. This disables the heart-beat time out
    mechanism (which would otherwise trigger when a debugger is attached to MPI
    server processes). After this no more servers will be flagged as 'timeout',
    until stop_debugging_mode() is called."""

    casalog_call_origin = "MPIMonitorClient::start_debugging_mode"

    MPIEnvironment.mpi_monitor_status_service_timeout_enabled = False
    casalog.post("Started debugging mode. Timeout mechanism disabled.", "INFO",
                 casalog_call_origin)


def stop_debugging_mode(self):
    """ Leave debugging/development mode. The heart-beat timeout mechanism is
    re-enabled. """

    casalog_call_origin = "MPIMonitorClient::stop_debugging_mode"

    # Clear all 'pong_pending' and start ping/pong counts anew; servers
    # already flagged as timed out are left untouched
    for rank in self.__server_status_list:
        server_status = self.__server_status_list[rank]
        if server_status['timeout'] is not True:
            server_status['pong_pending'] = False
            server_status['pong_checks'] = 0

    MPIEnvironment.mpi_monitor_status_service_timeout_enabled = True
    casalog.post("Stopped debugging mode. Timeout mechanism enabled.", "INFO",
                 casalog_call_origin)
class __MPICommandServerImpl:
    """ Implementation of the MPICommandServer singleton interface

    Receives command requests through the MPICommunicator, evaluates or
    executes them, and sends the responses back. serve() additionally handles
    control service messages until a stop is requested or the monitor server
    reports a client timeout.
    """

    def __init__(self,start_services=False):
        """ Initialize the service state.

        start_services: when True, start_services() is called once the state
        has been set up.
        """

        # Initialize command request handler service state
        self.__command_request_handler_service_on = False
        self.__command_request_handler_service_running = False
        self.__command_request_handler_service_thread = None

        # Instantiate MPICommunicator reference
        self.__communicator = MPICommunicator()

        # Instantiate MPIMonitorServer reference (its services are started
        # later, from start_services)
        self.__monitor_server = MPIMonitorServer(False)

        # Initialize logfile descriptor
        self.__logfile_descriptor = open(casalog.logfile(), 'a')

        # Initialize virtual frame buffer state
        self.__virtual_frame_buffer_port = None
        self.__virtual_frame_buffer_process = None
        # Xauthority temporary file, created by start_virtual_frame_buffer().
        # Defined here so that stop_virtual_frame_buffer() can be called
        # safely even if the frame buffer was never started.
        self.__xauthfile = None

        # Automatically start services
        if start_services:
            self.start_services()

    ################################################################################################################
    # Private methods ##############################################################################################
    ################################################################################################################

    def __command_request_handler_service(self):
        """ Thread body: poll for command requests, run them, send responses.

        Runs while the service 'on' flag is set; the 'running' flag is raised
        on entry and lowered on exit so that the start/stop methods can wait
        on it.

        SECURITY NOTE: the command text is passed verbatim to eval/exec and the
        request parameters are injected into the module globals.
        NOTE(review): this presumes requests only come from the trusted
        MPICommandClient - confirm before exposing it to other senders.
        """

        casalog_call_origin = "MPICommandServer::command_request_handler_service"

        # Mark service as running
        self.__command_request_handler_service_running = True

        while (self.__command_request_handler_service_on):

            # First check if there is a command request msg available
            msg_available = False
            try:
                msg_available = self.__communicator.command_request_probe()
            except Exception as instance:
                casalog.post("Exception checking if command request msg is available: %s"
                             % str(instance),"SEVERE",casalog_call_origin)
                msg_available = False

            # Then receive command request msg
            msg_received = False
            if (msg_available):
                try:
                    command_request = self.__communicator.command_request_recv()
                    casalog.post("Received command request msg: %s" % command_request['command'],
                                 MPIEnvironment.command_handling_log_level,casalog_call_origin)
                    msg_received = True
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception receiving command request msg: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)
                    msg_received = False

            # Finally process command request and send back response
            if (msg_received):

                # Start timer
                command_start_time = time.time()

                # Update server state
                self.__monitor_server.set_status('busy',True)
                self.__monitor_server.set_status('command',command_request['command'])
                self.__monitor_server.set_status('command_start_time',command_start_time)

                # Get command request id
                command_request_id = command_request['id']

                # Prepare command response (starts as a copy of the request)
                command_response = dict(command_request)

                # Set command start time
                command_response['command_start_time'] = command_start_time

                # Execute/Evaluate command request
                try:
                    # Add dict-defined parameters to globals
                    if isinstance(command_request['parameters'],dict):
                        globals().update(command_request['parameters'])

                    # Execute command
                    if command_request['mode']=='eval':
                        casalog.post("Going to evaluate command request with id# %s as an expression via eval: %s"
                                     % (str(command_request_id),str(command_request['command'])),
                                     MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        command_response['ret'] = eval(command_request['command'])
                    elif command_request['mode']=='exec':
                        casalog.post("Going to execute command request with id# %s as a statement via exec: %s"
                                     % (str(command_request_id),command_request['command']),
                                     MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        code = compile(command_request['command'], casalog_call_origin, 'exec')
                        exec(code)
                        command_response['ret'] = None
                    elif command_request['mode']=='push':
                        # Nothing to run: the parameters were already pushed
                        # into globals above and are not removed below
                        casalog.post("Command request with id# %s is a push operation"
                                     % str(command_request_id),
                                     MPIEnvironment.command_handling_log_level,casalog_call_origin)
                        command_response['ret'] = None

                    # Set command response parameters
                    command_response['successful'] = True
                    command_response['traceback'] = None

                # Bare except kept on purpose: whatever the command raises
                # must be reported back to the client as a failed response
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception executing command request via %s: %s"
                                 % (command_request['mode'],str(formatted_traceback)),
                                 "SEVERE",casalog_call_origin)
                    # Set command response parameters
                    command_response['successful'] = False
                    command_response['traceback']=formatted_traceback
                    command_response['ret']=None

                # Variables are cleaned from the environment regardless of the result
                finally:
                    # Clear parameter variables
                    if isinstance(command_request['parameters'],dict) and command_request['mode']!='push':
                        for parameter in command_request['parameters']:
                            try:
                                del globals()[parameter]
                            except:
                                formatted_traceback = traceback.format_exc()
                                casalog.post("Exception deleting parameter variable '%s' from global environment: %s"
                                             % (str(parameter),str(formatted_traceback)),
                                             "WARN",casalog_call_origin)

                # Set command stop time
                command_stop_time = time.time()
                command_response['command_stop_time'] = command_stop_time

                # Update server state
                self.__monitor_server.set_status('busy',False)
                self.__monitor_server.set_status('command',None)
                self.__monitor_server.set_status('command_start_time',None)

                # Send response back (successful or not)
                try:
                    casalog.post("Command request with id %s successfully processed in %s mode, sending back response ..."
                                 % (str(command_response['id']),str(command_response['mode'])),
                                 MPIEnvironment.command_handling_log_level,casalog_call_origin)
                    self.__communicator.command_response_send(response=command_response)
                except:
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception sending back command response: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)

            else:
                # Nothing to process: sleep before polling again
                time.sleep(MPIEnvironment.mpi_command_request_handler_service_sleep_time)

        # Mark service as not running
        self.__command_request_handler_service_running = False

    def __start_command_request_handler_service(self):
        """ Start the command request handler thread and wait until it runs.

        Returns True when the service is running (or was already running),
        False when the thread could not be started.
        """

        casalog_call_origin = "MPICommandServer::start_command_request_handler_service"

        if self.__command_request_handler_service_running:
            casalog.post("MPI command request handler service is already running",
                         "WARN",casalog_call_origin)
            return True

        try:
            # The flag must be set before the thread starts, it is the
            # loop condition of the service
            self.__command_request_handler_service_on = True
            self.__command_request_handler_service_thread = thread.start_new_thread(
                self.__command_request_handler_service, ())
        except Exception as instance:
            self.__command_request_handler_service_on = False
            self.__command_request_handler_service_running = False
            casalog.post("Exception starting MPI command request handler service: %s"
                         % str(instance),"SEVERE",casalog_call_origin)
            return False

        # Wait until the service thread has raised its 'running' flag
        while (not self.__command_request_handler_service_running):
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI command request handler service started","INFO",casalog_call_origin)

        return True

    def __stop_command_request_handler_service(self):
        """ Ask the command request handler thread to stop and wait for it.

        The service only re-checks its 'on' flag between iterations, so this
        blocks until a command in progress (if any) has finished.
        """

        casalog_call_origin = "MPICommandServer::stop_command_request_handler_service"

        if not self.__command_request_handler_service_running:
            casalog.post("MPI command request handler service is not running",
                         "WARN",casalog_call_origin)
            return

        self.__command_request_handler_service_on = False

        while (self.__command_request_handler_service_running):
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI command request handler service stopped","INFO",casalog_call_origin)

    ################################################################################################################
    # Public methods ###############################################################################################
    ################################################################################################################

    def start_virtual_frame_buffer(self):
        """ Launch an Xvfb process and point DISPLAY/XAUTHORITY at it.

        The display number starts at this process' PID and is incremented
        while an X lock file for that number exists under /tmp. Failures to
        launch Xvfb are logged, not raised.
        """

        casalog_call_origin = "MPICommandServer::start_virtual_frame_buffer"

        displayport = os.getpid()
        while os.path.exists('/tmp/.X%d-lock' % displayport):
            displayport += 1

        self.__virtual_frame_buffer_port = ":%d" % displayport

        self.__xauthfile = tempfile.NamedTemporaryFile()

        # Prefer mcookie; fall back to a uuid-derived cookie if it cannot be run
        try:
            cookie = subprocess.check_output(['mcookie'], universal_newlines=True).strip()
        except:
            cookie = str(uuid.uuid4()).replace('-', '')

        #sometimes also works without auth, so accept failure
        subprocess.call(['xauth', '-f', self.__xauthfile.name, 'add',
                         self.__virtual_frame_buffer_port, '.', cookie],
                        stdout=self.__logfile_descriptor,
                        stderr=self.__logfile_descriptor)

        try:
            self.__virtual_frame_buffer_process = subprocess.Popen(
                ['Xvfb',self.__virtual_frame_buffer_port, '-auth', self.__xauthfile.name],
                stdout=self.__logfile_descriptor,
                stderr=self.__logfile_descriptor,
                shell=False)
            os.environ['DISPLAY']=self.__virtual_frame_buffer_port
            os.environ['XAUTHORITY'] = self.__xauthfile.name
            casalog.post("Deployed virtual frame buffer at %s with pid %s" %
                         (self.__virtual_frame_buffer_port,
                          str(self.__virtual_frame_buffer_process.pid)),
                         "INFO",casalog_call_origin)
        except:
            self.__virtual_frame_buffer_process = None
            formatted_traceback = traceback.format_exc()
            casalog.post("Exception deploying virtual frame buffer at %s: %s"
                         % (self.__virtual_frame_buffer_port,
                            str(formatted_traceback)),
                         "SEVERE",casalog_call_origin)

    def stop_virtual_frame_buffer(self):
        """ Terminate the Xvfb process (if any) and remove its xauth entry.

        Safe to call when start_virtual_frame_buffer() was never called, and
        safe to call more than once.
        """

        casalog_call_origin = "MPICommandServer::stop_virtual_frame_buffer"

        if self.__virtual_frame_buffer_process is not None:
            try:
                self.__virtual_frame_buffer_process.terminate()
                casalog.post("Virtual frame buffer deployed at %s with pid %s successfully shutdown" %
                             (self.__virtual_frame_buffer_port,
                              str(self.__virtual_frame_buffer_process.pid)),
                             "DEBUG",casalog_call_origin)
                self.__virtual_frame_buffer_process = None
            except:
                formatted_traceback = traceback.format_exc()
                casalog.post("Exception shutting down virtual frame buffer deployed at %s with pid %s: %s"
                             % (self.__virtual_frame_buffer_port,
                                str(self.__virtual_frame_buffer_process.pid),
                                str(formatted_traceback)),
                             "SEVERE",casalog_call_origin)
        else:
            casalog.post("Virtual frame buffer not deployed","WARN",casalog_call_origin)

        # No Xauthority file means the frame buffer was never started (or was
        # already cleaned up): there is no xauth entry to remove
        if self.__xauthfile is None:
            return

        subprocess.call(['xauth', '-f', self.__xauthfile.name, 'remove',
                         self.__virtual_frame_buffer_port],
                        stdout=self.__logfile_descriptor,
                        stderr=self.__logfile_descriptor)
        self.__xauthfile.close()
        self.__xauthfile = None

    def start_services(self):
        """ Start the monitor server services, the command request handler
        service and the virtual frame buffer, in that order. """

        self.__monitor_server.start_services()
        self.__start_command_request_handler_service()
        self.start_virtual_frame_buffer()

    def stop_services(self,force_command_request_interruption=False):
        """ Close the logfile descriptor and stop the services.

        force_command_request_interruption: when True the command request
        handler service is NOT waited for (stopping it would block until the
        command in progress finishes).
        """

        if self.__logfile_descriptor is not None:
            self.__logfile_descriptor.close()
            self.__logfile_descriptor = None

        self.__monitor_server.stop_services()

        if not force_command_request_interruption:
            self.__stop_command_request_handler_service()

    def serve(self):
        """ Main loop of the server.

        Starts the services, notifies the client, and then handles control
        service messages until either a control command sets
        stop_service_requested or the monitor server reports a client timeout.
        Afterwards the services are stopped and, if the stop request asked for
        it, the MPI environment is finalized.
        """

        casalog_call_origin = "MPICommandServer::serve"

        # First start command and ping status services
        casalog.post("Starting services...","INFO",casalog_call_origin)
        self.start_services()

        # Notify to MPICommandClient that service is up and running
        self.__communicator.control_service_response_send(response=self.__monitor_server.get_status())

        # Keep serving until a stop signal service is received.
        # stop_service_requested is not assigned anywhere else in this method:
        # it can only be changed by the control command run through exec below.
        # NOTE(review): that relies on exec being able to rebind function
        # locals, which Python 3 does not support - confirm target interpreter.
        control_service_request = {}
        stop_service_requested = False
        # Only a stop request carries this setting; default it so that the
        # client-timeout path below does not hit an unassigned name
        finalize_mpi_environment = False
        while ((not stop_service_requested) and (not self.__monitor_server.get_client_timeout())):

            # Check if there is an incoming control service msg
            msg_available = False
            try:
                msg_available = self.__communicator.control_service_request_probe()
            except:
                msg_available = False
                formatted_traceback = traceback.format_exc()
                casalog.post("Exception checking if control service msg is available: %s"
                             % str(formatted_traceback),"SEVERE",casalog_call_origin)

            if msg_available:

                # Receive control service msg. On failure nothing is processed
                # and the loop falls through to the sleep below.
                msg_received = False
                control_service_request = {}
                try:
                    control_service_request = self.__communicator.control_service_request_recv()
                    msg_received = True
                except:
                    msg_received = False
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception receiving control service msg: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)

                # Process control service msg
                cmd = None
                send_response = False
                if msg_received:
                    try:
                        cmd = control_service_request['command']
                        send_response = control_service_request['send_response']
                        code = compile(cmd, casalog_call_origin, 'exec')
                        exec(code)
                        casalog.post("Control signal %s successfully handled by server %s"
                                     % (str(cmd),str(MPIEnvironment.mpi_processor_rank)),
                                     "INFO",casalog_call_origin)
                    except:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception handling control signal command %s in server %s: %s"
                                     % (str(control_service_request),
                                        str(MPIEnvironment.mpi_processor_rank),
                                        str(formatted_traceback)),
                                     "SEVERE",casalog_call_origin)

                    # Notify to MPICommandClient that control signal has been processed
                    if send_response:
                        try:
                            self.__communicator.control_service_response_send(
                                response=self.__monitor_server.get_status())
                        except:
                            formatted_traceback = traceback.format_exc()
                            casalog.post("Exception sending response to control signal command %s in server %s: %s"
                                         % (str(cmd),str(MPIEnvironment.mpi_processor_rank),str(formatted_traceback)),
                                         "SEVERE",casalog_call_origin)

            time.sleep(MPIEnvironment.mpi_stop_service_sleep_time)

        # Process stop service request
        if stop_service_requested:

            # Check if force mode is needed
            force_command_request_interruption = control_service_request['force_command_request_interruption']
            finalize_mpi_environment = control_service_request['finalize_mpi_environment']
            busy = self.__monitor_server.get_status('busy')
            if force_command_request_interruption and busy:
                casalog.post("force-stop service signal received, stopping services, " +
                             "command request handler service will be interrupted...","INFO",casalog_call_origin)
            else:
                # Nothing is running (or force was not requested): stop cleanly
                force_command_request_interruption = False
                casalog.post("stop service signal received, stopping services...","INFO",casalog_call_origin)
        else:
            force_command_request_interruption = True
            casalog.post("client timeout, forcing disconnection, " +
                         "command request handler service will be interrupted.." ,"INFO",casalog_call_origin)

        # Stop services
        self.stop_services(force_command_request_interruption)

        # Finalize MPI environment
        if finalize_mpi_environment:
            try:
                casalog.post("Going to finalize MPI environment","INFO",casalog_call_origin)
                MPIEnvironment.finalize_mpi_environment()
            except:
                formatted_traceback = traceback.format_exc()
                casalog.post("Exception finalizing MPI environment %s"
                             % str(formatted_traceback),"SEVERE",casalog_call_origin)

        # Exit
        casalog.post("Exiting","INFO",casalog_call_origin)
class __MPICommandClientImpl: """ Implementation of the MPICommandClient singleton interface """ def __init__(self, start_services=False): # Initialize life cycle state # 0-> Services not started # 1-> Services started # 2-> Stop service signal sent self.__life_cycle_state = 0 # Initialize command request id counter and list self.__command_request_counter = 1 self.__command_request_list = {} self.__command_response_list = {} # Initialize command group response state self.__command_group_response_counter = 1 self.__command_group_response_list = {} # Initialize command response handler service state self.__command_response_handler_service_on = False self.__command_response_handler_service_running = False self.__command_response_handler_service_thread = None self.__command_response_handler_service_event_controller = threading.Event() self.__command_response_handler_service_event_controller.clear() # Initialize command request queue service state self.__command_request_queue = [] self.__command_request_queue_service_on = False self.__command_request_queue_service_running = False self.__command_request_queue_service_thread = None self.__command_request_queue_service_event_controller = threading.Event() self.__command_request_queue_service_event_controller.clear() # Setup a command request input queue to append the jobs # to be picked up by the command request queue service self.__command_request_input_queue = [] self.__command_request_input_queue_lock = threading.Lock() # Instantiate MPICommunicator reference self.__communicator = MPICommunicator() # Instantiate MPIMonitorClient reference self.__monitor_client = MPIMonitorClient(False) # Automatically start services if start_services: self.start_services() # Log mode self.__log_mode = "unified" # Register exit handler # NOTE: It is not guaranteed that __del__() methods are called # for objects that still exist when the interpreter exits. 
atexit.register(self.stop_services, force_command_request_interruption=True) ################################################################################################################ # Private methods ############################################################################################## ################################################################################################################ def __command_response_handler_service(self): casalog_call_origin = "MPICommandClient::command_response_handler_service" # Mark service as running self.__command_response_handler_service_running = True while self.__command_response_handler_service_on: # Wait until there are command request whose response is pending if len(self.__command_response_list) == len(self.__command_request_list): self.__command_response_handler_service_event_controller.wait() # First check if there is a command response msg available msg_available = False try: msg_available = self.__communicator.command_response_probe() except: msg_available = False formatted_traceback = traceback.format_exc() casalog.post( "Exception checking if command response msg is available: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) # Then receive, store and post command response msg if msg_available: try: # Receive command response command_response = self.__communicator.command_response_recv() server = command_response["server"] successful = command_response["successful"] command_id = command_response["id"] # Mark immediately server as not-busy self.__monitor_client.set_server_status_keyword(server, "busy", False) # Store command response self.__command_response_list[command_id] = command_response # If there are no pending command responses clear the event controller if len(self.__command_response_list) == len(self.__command_request_list): self.__command_response_handler_service_event_controller.clear() # Mark command request as received self.__command_request_list[command_id]["status"] 
= "response received" self.__command_response_list[command_id]["status"] = "response received" # Notify that command response has been received if successful: casalog.post( "Command request with id %s successfully handled by server n# %s" % (str(command_id), str(server)), MPIEnvironment.command_handling_log_level, casalog_call_origin, ) else: casalog.post( "Command request with id %s failed in server n# %s with traceback %s" % (str(command_id), str(server), str(command_response["traceback"])), "SEVERE", casalog_call_origin, ) # If this request belongs to a group update the group response object if self.__command_request_list[command_id].has_key("group"): command_group_response_id = self.__command_request_list[command_id]["group"] self.__command_group_response_list[command_group_response_id]["list"].remove(command_id) # If there are no requests pending from this group send the group response signal if len(self.__command_group_response_list[command_group_response_id]["list"]) == 0: self.__command_group_response_list[command_group_response_id]["event"].set() except: formatted_traceback = traceback.format_exc() casalog.post( "Exception receiving command request response msg: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) else: time.sleep(MPIEnvironment.mpi_command_response_handler_service_sleep_time) # Mark service as not running self.__command_response_handler_service_running = False def __start_command_response_handler_service(self): casalog_call_origin = "MPICommandClient::start_command_response_handler_service" if self.__command_response_handler_service_running: casalog.post("MPI command response handler service is already running", "WARN", casalog_call_origin) return True try: self.__command_response_handler_service_on = True self.__command_response_handler_service_thread = thread.start_new_thread( self.__command_response_handler_service, () ) except: formatted_traceback = traceback.format_exc() self.__command_response_handler_service_on = False 
self.__command_response_handler_service_running = False casalog.post( "Exception starting MPI command response handler service: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return False while not self.__command_response_handler_service_running: time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI command response handler service started", "INFO", casalog_call_origin) return True def __stop_command_response_handler_service(self): casalog_call_origin = "MPICommandClient::stop_command_response_handler_service" if not self.__command_response_handler_service_running: casalog.post("MPI command response handler service is not running", "WARN", casalog_call_origin) return self.__command_response_handler_service_on = False # Send signal to the thread to be awakened self.__command_response_handler_service_event_controller.set() while self.__command_response_handler_service_running: time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI command response handler service stopped", "INFO", casalog_call_origin) def __command_request_queue_service(self): casalog_call_origin = "MPICommandClient::command_request_queue_service" # Mark service as running self.__command_request_queue_service_running = True while self.__command_request_queue_service_on: # Wait until there are pending command responses self.__command_request_input_queue_lock.acquire() if len(self.__command_request_queue) == 0 and len(self.__command_request_input_queue) == 0: self.__command_request_input_queue_lock.release() self.__command_request_queue_service_event_controller.wait() else: self.__command_request_input_queue_lock.release() # Pick up jobs from input queue self.__command_request_input_queue_lock.acquire() while len(self.__command_request_input_queue) > 0: self.__command_request_queue.append(self.__command_request_input_queue.pop(0)) self.__command_request_input_queue_lock.release() # Get list of available servers available_servers_list = 
self.__monitor_client.get_server_rank_available() if len(available_servers_list) >= 1: # Get list of matching command requests matching_command_request_id_list = self.__match_available_servers_with_command_requests( available_servers_list ) # Iterate over matching command request list for command_request_id in matching_command_request_id_list: command_request_found = False command_request_queue_idx = 0 # Iterate over command request queue to find the corresponding index for command_request in self.__command_request_queue: if command_request["id"] == command_request_id: command_request_found = True break else: command_request_queue_idx = command_request_queue_idx + 1 # Extract command request from queue and send it if command_request_found: command_request = self.__command_request_queue.pop(command_request_queue_idx) # If command request queue us empty clear the event controller if len(self.__command_request_queue) == 0: self.__command_request_queue_service_event_controller.clear() server = command_request["server"] try: # Mark assigned server as busy and set command info in server status self.__monitor_client.set_server_status_keyword(server, "busy", True) self.__monitor_client.set_server_status_keyword( server, "command", command_request["command"] ) # Send command request self.__communicator.command_request_send(request=command_request, server=server) # Mark command request as sent self.__command_request_list[command_request_id]["status"] = "request sent" # Notify that command request has been sent casalog.post( "Command request with id# %s sent to server n# %s" % (str(command_request_id), str(server)), MPIEnvironment.command_handling_log_level, casalog_call_origin, ) except: # Get and format traceback formatted_traceback = traceback.format_exc() # Simulate response command_response = dict(command_request) command_response["successful"] = False command_response["traceback"] = formatted_traceback self.__command_response_list[command_request_id] = command_response 
# Notify exception casalog.post( "Exception sending command request with id# %s to server n# %s: %s" % (str(command_request_id), str(server), str(formatted_traceback)), "SEVERE", casalog_call_origin, ) else: casalog.post( "Command request with id# %s not found" % str(command_request_id), "SEVERE", casalog_call_origin, ) else: # Sleep in order not to saturate the system time.sleep(MPIEnvironment.mpi_command_request_queue_service_sleep_time) # Mark service as not running self.__command_request_queue_service_running = False def __match_available_servers_with_command_requests(self, available_servers): matching_command_request_id_list = [] unassigned_command_request_id_list = [] available_servers_left = list(available_servers) for command_request in self.__command_request_queue: server = command_request["server"] command_request_id = command_request["id"] # Command request does not have any pre-assigned server if server is None: matching_command_request_id_list.append(command_request_id) unassigned_command_request_id_list.append(command_request_id) # Assigned server is within the list of available servers elif server in available_servers_left: matching_command_request_id_list.append(command_request_id) # Remove server from the list of available servers to avoid multiple assignment available_servers_left.remove(server) # Exit loop if we have enough matching requests if len(matching_command_request_id_list) >= len(available_servers): break # Assign servers to the remaining requests for command_request in self.__command_request_queue: if command_request["id"] in unassigned_command_request_id_list: server = available_servers_left.pop() command_request["server"] = server # Return matching command request id list return matching_command_request_id_list def __start_command_request_queue_service(self): casalog_call_origin = "MPICommandClient::start_command_request_queue_service" if self.__command_request_queue_service_running: casalog.post("MPI command request queue service is 
already running", "WARN", casalog_call_origin) return True try: self.__command_request_queue_service_on = True self.__command_request_queue_service_thread = thread.start_new_thread( self.__command_request_queue_service, () ) except: formatted_traceback = traceback.format_exc() self.__command_request_queue_service_on = False self.__command_request_queue_service_running = False casalog.post( "Exception starting MPI command request queue service: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return False while not self.__command_request_queue_service_running: time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI command request queue service started", "INFO", casalog_call_origin) return True def __stop_command_request_queue_service(self): casalog_call_origin = "MPICommandClient::stop_command_request_queue_service" if not self.__command_request_queue_service_running: casalog.post("MPI command request queue service is not running", "WARN", casalog_call_origin) return self.__command_request_queue_service_on = False # Send signal to the thread to be awakened self.__command_request_queue_service_event_controller.set() while self.__command_request_queue_service_running: time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI command request queue service stopped", "INFO", casalog_call_origin) def __send_start_service_signal(self): casalog_call_origin = "MPICommandClient::send_start_service_signal" casalog.post("Sending start service signal to all servers", "INFO", casalog_call_origin) # Prepare stop service request request = {} request["signal"] = "start" request["casa"] = casa # The request contains the global casa dictionary to be used by the servers request["logmode"] = self.__log_mode # Send request to all servers self.__communicator.control_service_request_broadcast(request, casalog) # Then wait until all servers have handled the signal mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() while 
len(mpi_server_rank_list) > 0: response_available = False response_available = self.__communicator.control_service_response_probe() if response_available: # Receive start service response to know what server has started response = self.__communicator.control_service_response_recv() rank = response["rank"] # Store processor name and PID info in the MPIMonitorClient self.__monitor_client.set_server_status_keyword(rank, "processor", response["processor"]) self.__monitor_client.set_server_status_keyword(rank, "pid", response["pid"]) # Remove server from list mpi_server_rank_list.remove(rank) # Communicate that server response to start service signal has been received casalog.post( "Server with rank %s started at %s with PID %s" % (str(rank), str(response["processor"]), str(response["pid"])), "INFO", casalog_call_origin, ) else: time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("Received response from all servers to start service signal", "INFO", casalog_call_origin) def __send_control_signal(self, signal, check_response=True): casalog_call_origin = "MPICommandClient::send_app_control_signal" casalog.post("Sending control signal to all servers: %s" % signal["command"], "INFO", casalog_call_origin) # Add check_response to signal signal["send_response"] = check_response # Send request to all servers try: self.__communicator.control_service_request_broadcast(signal, casalog) except: formatted_traceback = traceback.format_exc() casalog.post( "Exception sending control signal to all servers: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return # Then wait until all servers have handled the signal if check_response: try: mpi_server_rank_list = self.__monitor_client.get_server_rank_online() except: formatted_traceback = traceback.format_exc() casalog.post( "Exception checking for response to control signal: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return while len(mpi_server_rank_list) > 0: response_available 
= False try: response_available = self.__communicator.control_service_response_probe() except: response_available = False formatted_traceback = traceback.format_exc() casalog.post( "Exception getting response to control signal: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return if response_available: # Receive control signal response response = self.__communicator.control_service_response_recv() rank = response["rank"] # Remove server from list # CAS-7721: Control signals are sent to all servers, even if not responsive # So we may get a response from a server which is not in the initial online servers list if mpi_server_rank_list.count(rank): mpi_server_rank_list.remove(rank) # Communicate that server response to start service signal has been received casalog.post( "Server with rank %s handled control signal %s" % (str(rank), signal["command"]), "DEBUG", casalog_call_origin, ) else: time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post( "Control signal handled by all servers: %s" % signal["command"], "INFO", casalog_call_origin ) else: casalog.post("Control signal sent to all servers: %s" % signal["command"], "INFO", casalog_call_origin) def __validate_target_servers(self, target_server): casalog_call_origin = "MPICommandClient::validate_target_servers" # Get list of valid MPIServer ranks mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() # Check if target server is a list of integers if ( isinstance(target_server, list) and (len(target_server) >= 1) and all(isinstance(server, int) for server in target_server) ): # Check if server is within the server rank list for server in target_server: if server not in mpi_server_rank_list: casalog.post("Server #%s does not exist" % str(server), "SEVERE", casalog_call_origin) return None elif self.__monitor_client.get_server_status_keyword(server, "timeout"): casalog.post("Server #%s has timed out" % str(server), "SEVERE", casalog_call_origin) return None # Return input list 
validated return target_server # Check if target server is an integer elif isinstance(target_server, int): # Check if server is within the server rank list if target_server in mpi_server_rank_list: return [target_server] else: casalog.post("Server #%s does not exist" % str(target_server), "SEVERE", casalog_call_origin) return None else: casalog.post( "target_server has wrong format (%s), accepted formats are int and list(int)" % str(type(target_server)), "SEVERE", casalog_call_origin, ) return None def __register_command_request(self, command_request, server): # Get command request if command_request_id = self.__command_request_counter # Complete command request definition command_request_complete = dict(command_request) command_request_complete["id"] = command_request_id command_request_complete["server"] = server command_request_complete["status"] = "holding queue" # Register command request self.__command_request_list[command_request_id] = command_request_complete # Append jobs to input queue self.__command_request_input_queue_lock.acquire() self.__command_request_input_queue.append(command_request_complete) self.__command_request_input_queue_lock.release() # Increment command id counter self.__command_request_counter = self.__command_request_counter + 1 # Return command request id return command_request_id def __format_command_response_timeout(self, command_request_id): # Create a fake command response copying the command request and marking it as not successful command_response = dict(self.__command_request_list[command_request_id]) command_response["status"] = "timeout" command_response["successful"] = False command_response["ret"] = None # Get server, processor and pid to identify which server timed out server = command_response["server"] processor = self.__monitor_client.get_server_status_keyword(server, "processor") pid = self.__monitor_client.get_server_status_keyword(server, "pid") # Create command response trace-back msg timeout_msg = "Timeout of 
assigned server n# " + str(server) timeout_msg = timeout_msg + " deployed at " + str(processor) timeout_msg = timeout_msg + " with PID " + str(pid) command_response["traceback"] = timeout_msg return command_response ################################################################################################################ # Public methods ############################################################################################### ################################################################################################################ def get_lifecyle_state(self): return self.__life_cycle_state def start_services(self): casalog_call_origin = "MPICommandClient::start_services" if self.__life_cycle_state == 1: casalog.post("Services already started", "WARN", casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin) return # 1st: start servers self.__send_start_service_signal() # 2nd: start monitoring servers self.__monitor_client.start_services() # 3rd: start command request queue service self.__start_command_request_queue_service() # 4th: start command response handler service self.__start_command_response_handler_service() # Set life cycle state self.__life_cycle_state = 1 casalog.post("All services started", "INFO", casalog_call_origin) def stop_services(self, force_command_request_interruption=False): # jagonzal: This method is called by the atexit module and if it fails it # causes ipython to crash, producing a report and waiting for user input # so we cannot risk under any circumstances such an event try: casalog_call_origin = "MPICommandClient::stop_services" if self.__life_cycle_state == 0: casalog.post("Services not started", "WARN", casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin) return # Check if any server is in timeout condition before stopping the 
monitoring service server_rank_timeout = self.__monitor_client.get_server_timeout() finalize_mpi_environment = True if len(server_rank_timeout) > 0: finalize_mpi_environment = False force_command_request_interruption = True # Stop client monitoring services self.__monitor_client.stop_services() # Notify command requests which are going to be interrupted for command_request_id in self.__command_request_list: if not self.__command_response_list.has_key(command_request_id): server = self.__command_request_list[command_request_id]["server"] status = self.__command_request_list[command_request_id]["status"] casalog.post( "Aborting command request with id# %s: %s" % (str(command_request_id), str(self.__command_request_list[command_request_id])), "SEVERE", casalog_call_origin, ) # Stop client command request-response services self.__stop_command_request_queue_service() self.__stop_command_response_handler_service() # Shutdown plotms process self.__send_control_signal( {"command": "pm.killApp()", "signal": "process_control"}, check_response=True ) # Shutdown virtual frame buffer self.__send_control_signal( {"command": "self.stop_virtual_frame_buffer()", "signal": "process_control"}, check_response=True ) # Send stop signal to servers self.__send_control_signal( { "command": "stop_service_requested = True", "signal": "stop", "force_command_request_interruption": force_command_request_interruption, "finalize_mpi_environment": finalize_mpi_environment, }, check_response=False, ) # Finalize MPI environment if finalize_mpi_environment: try: casalog.post("Going to finalize MPI environment", "INFO", casalog_call_origin) MPIEnvironment.finalize_mpi_environment() except: formatted_traceback = traceback.format_exc() casalog.post( "Exception finalizing MPI environment %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) else: casalog.post( "MPIServers with rank %s are in timeout condition, skipping MPI_Finalize()" % str(server_rank_timeout), "SEVERE", casalog_call_origin, 
) # UnMark MPI environment to be finalized by the MPICommunicator destructor # (Either because it is already finalized or due to a # server not responsive that prevents graceful finalization) self.__communicator.set_finalize_mpi_environment(False) # Set life cycle state self.__life_cycle_state = 2 casalog.post("All services stopped", "INFO", casalog_call_origin) except: formatted_traceback = traceback.format_exc() print "Unhandled exception in MPICommandClient::stop_services %s" % (formatted_traceback) def push_command_request(self, command, block=False, target_server=None, parameters=None): casalog_call_origin = "MPICommandClient::push_command_request" if self.__life_cycle_state == 0: casalog.post("Services not started", "WARN", casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin) return command_request = {} command_request["command"] = command command_request["parameters"] = parameters # Determine whether command is a statement or an expression if command == "push": command_request["mode"] = "push" casalog.post("Requested push operation", "DEBUG", casalog_call_origin) else: # Determine whether command is a statement or an expression try: code = compile(command_request["command"], "send_command_request", "eval") command_request["mode"] = "eval" casalog.post( "Command will be evaluated as an expression with return value", "DEBUG", casalog_call_origin ) except: try: code = compile(command_request["command"], "send_command_request", "exec") command_request["mode"] = "exec" casalog.post( "Command will be executed as an statement w/o return code", "DEBUG", casalog_call_origin ) except: formatted_traceback = traceback.format_exc() casalog.post( "Command cannot be executed neither as a statement nor as an expression, it will be rejected: %s" % str(formatted_traceback), "SEVERE", casalog_call_origin, ) return None # Validate target servers target_server_validated = None if 
target_server is not None: target_server_validated = self.__validate_target_servers(target_server) # Exit if target server is not validated if target_server_validated is None: return None # Create command request list command_request_id_list = [] if target_server_validated is not None: for server in target_server_validated: command_request_id = self.__register_command_request(command_request, server) command_request_id_list.append(command_request_id) else: command_request_id = self.__register_command_request(command_request, None) command_request_id_list.append(command_request_id) # Wake up command request/response service threads self.__command_request_queue_service_event_controller.set() self.__command_response_handler_service_event_controller.set() # In blocking mode wait until command response is received otherwise return request id if block: command_return_code_list = self.get_command_response(command_request_id_list, True, False) return command_return_code_list # Otherwise we simply return the command request id else: return command_request_id_list def get_command_response(self, command_request_id_list, block=False, verbose=True): casalog_call_origin = "MPICommandClient::get_command_response" command_response_list = [] if block: # Wait until command request response is received or timeout pending_command_request_id_list = list(command_request_id_list) while len(pending_command_request_id_list) > 0: for command_request_id in command_request_id_list: # Check if command request id is still pending if command_request_id in pending_command_request_id_list: # Check if we have response for command request id if self.__command_response_list.has_key(command_request_id): # Remove command request id from pending list pending_command_request_id_list.remove(command_request_id) else: server = self.__command_request_list[command_request_id]["server"] if server is not None and self.__monitor_client.get_server_status_keyword( server, "timeout" ): casalog.post( "Command 
request with id# %s sent to server n# %s, but the server has timed out" % (str(command_request_id), str(server)), "SEVERE", casalog_call_origin, ) # Remove command request id from pending list pending_command_request_id_list.remove(command_request_id) time.sleep(MPIEnvironment.mpi_push_command_request_block_mode_sleep_time) # Gather command response list for command_request_id in command_request_id_list: if self.__command_response_list.has_key(command_request_id): command_response = dict(self.__command_response_list[command_request_id]) command_response_list.append(command_response) else: command_response = self.__format_command_response_timeout(command_request_id) command_response_list.append(command_response) # Gather return codes # command_return_code_list = [] # for command_response in command_response_list: # successful = command_response['successful'] # if not successful: # command_return_code_list.append([command_response['id'],False, command_response['traceback']]) # elif command_response['mode'] == 'eval': # command_return_code_list.append([command_response['id'],True,command_response['ret']]) # else: # command_return_code_list.append([command_response['id'],True,None]) # Return command return code list return command_response_list else: command_response_list = [] for command_request_id in command_request_id_list: if not self.__command_response_list.has_key(command_request_id): server = self.__command_request_list[command_request_id]["server"] timeout = self.__monitor_client.get_server_status_keyword(server, "timeout") if timeout: casalog.post( "Command request with id# %s sent to server n# %s, but the server has timed out" % (str(command_request_id), str(server)), "SEVERE", casalog_call_origin, ) command_response = self.__format_command_response_timeout(command_request_id) command_response_list.append(command_response) elif verbose: status = self.__command_request_list[command_request_id]["status"] casalog.post( "Command request with id# %s is in %s state 
assigned to server %s" % (str(command_request_id), status, str(server)), "INFO", casalog_call_origin, ) else: command_response = dict(self.__command_response_list[command_request_id]) command_response_list.append(command_response) return command_response_list def get_command_response_event(self, command_request_id_list): # Get command group response id command_group_response_id = self.__command_group_response_counter # Setup event object command_group_response_event = threading.Event() command_group_response_event.clear() # Setup command group response command_group_response = {} command_group_response["id"] = command_group_response_id command_group_response["list"] = list(command_request_id_list) # Make a copy of the list command_group_response["event"] = command_group_response_event # Register command group response self.__command_group_response_list[command_group_response_id] = command_group_response for command_request_id in command_request_id_list: self.__command_request_list[command_request_id]["group"] = command_group_response_id # Increment command id counter self.__command_group_response_counter = self.__command_group_response_counter + 1 # Return command response event object return command_group_response_event def get_server_status(self, server=None): return self.__monitor_client.get_server_status(server) def get_command_request_list(self): return self.__command_request_list def get_command_response_list(self): return self.__command_response_list def set_log_mode(self, logmode): self.__log_mode = logmode def set_log_level(self, log_level): casalog_call_origin = "MPICommandClient::set_log_level" if self.__life_cycle_state == 0: casalog.post("Services not started", "WARN", casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized", "WARN", casalog_call_origin) return if log_level not in log_levels: casalog.post( "Unknown log level %s, recognized levels are: %s" % (str(log_level), str(log_levels)), 
"WARN", casalog_call_origin, ) return MPIEnvironment.command_handling_log_level = log_level self.__send_control_signal( { "command": "MPIEnvironment.command_handling_log_level = '%s'" % log_level, "signal": "process_control", }, check_response=True, )
class __MPICommandClientImpl: """ Implementation of the MPICommandClient singleton interface """ def __init__(self,start_services=False): # Initialize life cycle state # 0-> Services not started # 1-> Services started # 2-> Stop service signal sent self.__life_cycle_state = 0 # Initialize command request id counter and list self.__command_request_counter = 1 self.__command_request_list = {} self.__command_response_list = {} # Initialize command group response state self.__command_group_response_counter = 1 self.__command_group_response_list = {} # Initialize command response handler service state self.__command_response_handler_service_on = False self.__command_response_handler_service_running = False self.__command_response_handler_service_thread = None self.__command_response_handler_service_event_controller = threading.Event() self.__command_response_handler_service_event_controller.clear() # Initialize command request queue service state self.__command_request_queue = [] self.__command_request_queue_service_on = False self.__command_request_queue_service_running = False self.__command_request_queue_service_thread = None self.__command_request_queue_service_event_controller = threading.Event() self.__command_request_queue_service_event_controller.clear() # Setup a command request input queue to append the jobs # to be picked up by the command request queue service self.__command_request_input_queue = [] self.__command_request_input_queue_lock = threading.Lock() # Instantiate MPICommunicator reference self.__communicator = MPICommunicator() # Instantiate MPIMonitorClient reference self.__monitor_client = MPIMonitorClient(False) # Automatically start services if start_services: self.start_services() # Log mode self.__log_mode = 'unified' # Register exit handler # NOTE: It is not guaranteed that __del__() methods are called # for objects that still exist when the interpreter exits. 
atexit.register(self.stop_services,force_command_request_interruption=True) ################################################################################################################ # Private methods ############################################################################################## ################################################################################################################ def __command_response_handler_service(self): casalog_call_origin = "MPICommandClient::command_response_handler_service" # Mark service as running self.__command_response_handler_service_running = True while (self.__command_response_handler_service_on): # Wait until there are command request whose response is pending if len(self.__command_response_list) == len(self.__command_request_list): self.__command_response_handler_service_event_controller.wait() # First check if there is a command response msg available msg_available = False try: msg_available = self.__communicator.command_response_probe() except: msg_available = False formatted_traceback = traceback.format_exc() casalog.post("Exception checking if command response msg is available: %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) # Then receive, store and post command response msg if (msg_available): try: # Receive command response command_response = self.__communicator.command_response_recv() server = command_response['server'] successful = command_response['successful'] command_id = command_response['id'] # Mark immediately server as not-busy self.__monitor_client.set_server_status_keyword(server,'busy',False) # Store command response self.__command_response_list[command_id] = command_response # If there are no pending command responses clear the event controller if len(self.__command_response_list) == len(self.__command_request_list): self.__command_response_handler_service_event_controller.clear() # Mark command request as received self.__command_request_list[command_id]['status'] = 
'response received' self.__command_response_list[command_id]['status'] = 'response received' # Notify that command response has been received if successful: casalog.post("Command request with id %s successfully handled by server n# %s" % (str(command_id),str(server)),MPIEnvironment.command_handling_log_level,casalog_call_origin) else: casalog.post("Command request with id %s failed in server n# %s with traceback %s" % (str(command_id),str(server),str(command_response['traceback'])), "SEVERE",casalog_call_origin) # If this request belongs to a group update the group response object if self.__command_request_list[command_id].has_key('group'): command_group_response_id = self.__command_request_list[command_id]['group'] self.__command_group_response_list[command_group_response_id]['list'].remove(command_id) # If there are no requests pending from this group send the group response signal if len(self.__command_group_response_list[command_group_response_id]['list']) == 0: self.__command_group_response_list[command_group_response_id]['event'].set() except: formatted_traceback = traceback.format_exc() casalog.post("Exception receiving command request response msg: %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) else: time.sleep(MPIEnvironment.mpi_command_response_handler_service_sleep_time) # Mark service as not running self.__command_response_handler_service_running = False def __start_command_response_handler_service(self): casalog_call_origin = "MPICommandClient::start_command_response_handler_service" if self.__command_response_handler_service_running: casalog.post("MPI command response handler service is already running","WARN",casalog_call_origin) return True try: self.__command_response_handler_service_on = True self.__command_response_handler_service_thread = thread.start_new_thread(self.__command_response_handler_service, ()) except: formatted_traceback = traceback.format_exc() self.__command_response_handler_service_on = False 
self.__command_response_handler_service_running = False casalog.post("Exception starting MPI command response handler service: %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) return False while (not self.__command_response_handler_service_running): time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI command response handler service started","INFO",casalog_call_origin) return True def __stop_command_response_handler_service(self): casalog_call_origin = "MPICommandClient::stop_command_response_handler_service" if not self.__command_response_handler_service_running: casalog.post("MPI command response handler service is not running","WARN",casalog_call_origin) return self.__command_response_handler_service_on = False # Send signal to the thread to be awakened self.__command_response_handler_service_event_controller.set() while (self.__command_response_handler_service_running): time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI command response handler service stopped","INFO",casalog_call_origin) def __command_request_queue_service(self): casalog_call_origin = "MPICommandClient::command_request_queue_service" # Mark service as running self.__command_request_queue_service_running = True while self.__command_request_queue_service_on: # Wait until there are pending command responses self.__command_request_input_queue_lock.acquire() if len(self.__command_request_queue) == 0 and len(self.__command_request_input_queue) == 0: self.__command_request_input_queue_lock.release() self.__command_request_queue_service_event_controller.wait() else: self.__command_request_input_queue_lock.release() # Pick up jobs from input queue self.__command_request_input_queue_lock.acquire() while len(self.__command_request_input_queue) > 0: self.__command_request_queue.append(self.__command_request_input_queue.pop(0)) self.__command_request_input_queue_lock.release() # Get list of available servers available_servers_list = 
self.__monitor_client.get_server_rank_available() if len(available_servers_list) >= 1: # Get list of matching command requests matching_command_request_id_list = self.__match_available_servers_with_command_requests(available_servers_list) # Iterate over matching command request list for command_request_id in matching_command_request_id_list: command_request_found = False command_request_queue_idx = 0 # Iterate over command request queue to find the corresponding index for command_request in self.__command_request_queue: if command_request['id'] == command_request_id: command_request_found = True break else: command_request_queue_idx = command_request_queue_idx + 1 # Extract command request from queue and send it if command_request_found: command_request = self.__command_request_queue.pop(command_request_queue_idx) # If command request queue us empty clear the event controller if len(self.__command_request_queue) == 0: self.__command_request_queue_service_event_controller.clear() server = command_request['server'] try: # Mark assigned server as busy and set command info in server status self.__monitor_client.set_server_status_keyword(server,'busy',True) self.__monitor_client.set_server_status_keyword(server,'command',command_request['command']) # Send command request self.__communicator.command_request_send(request=command_request,server=server) # Mark command request as sent self.__command_request_list[command_request_id]['status']='request sent' # Notify that command request has been sent casalog.post("Command request with id# %s sent to server n# %s" % (str(command_request_id),str(server)),MPIEnvironment.command_handling_log_level,casalog_call_origin) except: # Get and format traceback formatted_traceback = traceback.format_exc() # Simulate response command_response = dict(command_request) command_response['successful']=False command_response['traceback']=formatted_traceback self.__command_response_list[command_request_id]=command_response # Notify exception 
casalog.post("Exception sending command request with id# %s to server n# %s: %s" % (str(command_request_id),str(server),str(formatted_traceback)), "SEVERE",casalog_call_origin) else: casalog.post("Command request with id# %s not found" % str(command_request_id),"SEVERE",casalog_call_origin) else: # Sleep in order not to saturate the system time.sleep(MPIEnvironment.mpi_command_request_queue_service_sleep_time) # Mark service as not running self.__command_request_queue_service_running = False def __match_available_servers_with_command_requests(self,available_servers): matching_command_request_id_list = [] unassigned_command_request_id_list = [] available_servers_left = list(available_servers) for command_request in self.__command_request_queue: server = command_request['server'] command_request_id = command_request['id'] # Command request does not have any pre-assigned server if server is None: matching_command_request_id_list.append(command_request_id) unassigned_command_request_id_list.append(command_request_id) # Assigned server is within the list of available servers elif server in available_servers_left: matching_command_request_id_list.append(command_request_id) # Remove server from the list of available servers to avoid multiple assignment available_servers_left.remove(server) # Exit loop if we have enough matching requests if len(matching_command_request_id_list) >= len(available_servers): break # Assign servers to the remaining requests for command_request in self.__command_request_queue: if command_request['id'] in unassigned_command_request_id_list: server = available_servers_left.pop() command_request['server'] = server # Return matching command request id list return matching_command_request_id_list def __start_command_request_queue_service(self): casalog_call_origin = "MPICommandClient::start_command_request_queue_service" if self.__command_request_queue_service_running: casalog.post("MPI command request queue service is already 
running","WARN",casalog_call_origin) return True try: self.__command_request_queue_service_on = True self.__command_request_queue_service_thread = thread.start_new_thread(self.__command_request_queue_service, ()) except: formatted_traceback = traceback.format_exc() self.__command_request_queue_service_on = False self.__command_request_queue_service_running = False casalog.post("Exception starting MPI command request queue service: %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) return False while (not self.__command_request_queue_service_running): time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("MPI command request queue service started","INFO",casalog_call_origin) return True def __stop_command_request_queue_service(self): casalog_call_origin = "MPICommandClient::stop_command_request_queue_service" if not self.__command_request_queue_service_running: casalog.post("MPI command request queue service is not running","WARN",casalog_call_origin) return self.__command_request_queue_service_on = False # Send signal to the thread to be awakened self.__command_request_queue_service_event_controller.set() while (self.__command_request_queue_service_running): time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("MPI command request queue service stopped","INFO",casalog_call_origin) def __send_start_service_signal(self): casalog_call_origin = "MPICommandClient::send_start_service_signal" casalog.post("Sending start service signal to all servers","INFO",casalog_call_origin) # Prepare stop service request request = {} request['signal'] = 'start' request['casa'] = casa # The request contains the global casa dictionary to be used by the servers request['logmode'] = self.__log_mode # Send request to all servers self.__communicator.control_service_request_broadcast(request,casalog) # Then wait until all servers have handled the signal mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() while 
len(mpi_server_rank_list)>0: response_available = False response_available = self.__communicator.control_service_response_probe() if response_available: # Receive start service response to know what server has started response = self.__communicator.control_service_response_recv() rank = response['rank'] # Store processor name and PID info in the MPIMonitorClient self.__monitor_client.set_server_status_keyword(rank,'processor',response['processor']) self.__monitor_client.set_server_status_keyword(rank,'pid',response['pid']) # Remove server from list mpi_server_rank_list.remove(rank) # Communicate that server response to start service signal has been received casalog.post("Server with rank %s started at %s with PID %s" % (str(rank),str(response['processor']),str(response['pid'])), "INFO",casalog_call_origin) else: time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time) casalog.post("Received response from all servers to start service signal","INFO",casalog_call_origin) def __send_control_signal(self,signal,check_response=True): casalog_call_origin = "MPICommandClient::send_app_control_signal" casalog.post("Sending control signal to all servers: %s" % signal['command'],"INFO",casalog_call_origin) # Add check_response to signal signal['send_response'] = check_response # Send request to all servers try: self.__communicator.control_service_request_broadcast(signal,casalog) except: formatted_traceback = traceback.format_exc() casalog.post("Exception sending control signal to all servers: %s" % str(formatted_traceback), "SEVERE",casalog_call_origin) return # Then wait until all servers have handled the signal if check_response: try: mpi_server_rank_list = self.__monitor_client.get_server_rank_online() except: formatted_traceback = traceback.format_exc() casalog.post("Exception checking for response to control signal: %s" % str(formatted_traceback), "SEVERE",casalog_call_origin) return while len(mpi_server_rank_list)>0: response_available = False try: 
response_available = self.__communicator.control_service_response_probe() except: response_available = False formatted_traceback = traceback.format_exc() casalog.post("Exception getting response to control signal: %s" % str(formatted_traceback), "SEVERE",casalog_call_origin) return if response_available: # Receive control signal response response = self.__communicator.control_service_response_recv() rank = response['rank'] # Remove server from list # CAS-7721: Control signals are sent to all servers, even if not responsive # So we may get a response from a server which is not in the initial online servers list if mpi_server_rank_list.count(rank): mpi_server_rank_list.remove(rank) # Communicate that server response to start service signal has been received casalog.post("Server with rank %s handled control signal %s" % (str(rank),signal['command']), "DEBUG",casalog_call_origin) else: time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time) casalog.post("Control signal handled by all servers: %s" % signal['command'],"INFO",casalog_call_origin) else: casalog.post("Control signal sent to all servers: %s" % signal['command'],"INFO",casalog_call_origin) def __validate_target_servers(self,target_server): casalog_call_origin = "MPICommandClient::validate_target_servers" # Get list of valid MPIServer ranks mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list() # Check if target server is a list of integers if isinstance(target_server,list) and (len(target_server)>=1) and all(isinstance(server, int) for server in target_server): # Check if server is within the server rank list for server in target_server: if server not in mpi_server_rank_list: casalog.post("Server #%s does not exist" % str(server),"SEVERE",casalog_call_origin) return None elif self.__monitor_client.get_server_status_keyword(server,'timeout'): casalog.post("Server #%s has timed out" % str(server),"SEVERE",casalog_call_origin) return None # Return input list validated return target_server # Check if 
target server is an integer elif isinstance(target_server,int): # Check if server is within the server rank list if target_server in mpi_server_rank_list: return [target_server] else: casalog.post("Server #%s does not exist" % str(target_server),"SEVERE",casalog_call_origin) return None else: casalog.post("target_server has wrong format (%s), accepted formats are int and list(int)" % str(type(target_server)),"SEVERE",casalog_call_origin) return None def __register_command_request(self,command_request,server): # Get command request if command_request_id = self.__command_request_counter # Complete command request definition command_request_complete = dict(command_request) command_request_complete['id'] = command_request_id command_request_complete['server'] = server command_request_complete['status'] = 'holding queue' # Register command request self.__command_request_list[command_request_id]=command_request_complete # Append jobs to input queue self.__command_request_input_queue_lock.acquire() self.__command_request_input_queue.append(command_request_complete) self.__command_request_input_queue_lock.release() # Increment command id counter self.__command_request_counter = self.__command_request_counter + 1 # Return command request id return command_request_id def __format_command_response_timeout(self,command_request_id): # Create a fake command response copying the command request and marking it as not successful command_response = dict(self.__command_request_list[command_request_id]) command_response['status']='timeout' command_response['successful']=False command_response['ret']=None # Get server, processor and pid to identify which server timed out server = command_response['server'] processor = self.__monitor_client.get_server_status_keyword(server,'processor') pid = self.__monitor_client.get_server_status_keyword(server,'pid') # Create command response trace-back msg timeout_msg = "Timeout of assigned server n# " + str(server) timeout_msg = timeout_msg + " 
deployed at " + str(processor) timeout_msg = timeout_msg + " with PID " + str(pid) command_response['traceback'] = timeout_msg return command_response ################################################################################################################ # Public methods ############################################################################################### ################################################################################################################ def get_lifecyle_state(self): return self.__life_cycle_state def start_services(self): casalog_call_origin = "MPICommandClient::start_services" if self.__life_cycle_state == 1: casalog.post("Services already started","WARN",casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin) return # 1st: start servers self.__send_start_service_signal() # 2nd: start monitoring servers self.__monitor_client.start_services() # 3rd: start command request queue service self.__start_command_request_queue_service() # 4th: start command response handler service self.__start_command_response_handler_service() # Set life cycle state self.__life_cycle_state = 1 casalog.post("All services started","INFO",casalog_call_origin) def stop_services(self,force_command_request_interruption=False): # jagonzal: This method is called by the atexit module and if it fails it # causes ipython to crash, producing a report and waiting for user input # so we cannot risk under any circumstances such an event try: casalog_call_origin = "MPICommandClient::stop_services" if self.__life_cycle_state == 0: casalog.post("Services not started","WARN",casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin) return # Check if any server is in timeout condition before stopping the monitoring service server_rank_timeout = self.__monitor_client.get_server_timeout() 
finalize_mpi_environment = True if len(server_rank_timeout) > 0: finalize_mpi_environment = False force_command_request_interruption = True # Stop client monitoring services self.__monitor_client.stop_services() # Notify command requests which are going to be interrupted for command_request_id in self.__command_request_list: if not self.__command_response_list.has_key(command_request_id): server = self.__command_request_list[command_request_id]['server'] status = self.__command_request_list[command_request_id]['status'] casalog.post("Aborting command request with id# %s: %s" % (str(command_request_id),str(self.__command_request_list[command_request_id])), "SEVERE",casalog_call_origin) # Stop client command request-response services self.__stop_command_request_queue_service() self.__stop_command_response_handler_service() # Shutdown plotms process self.__send_control_signal({'command':'pm.killApp()', 'signal':'process_control'}, check_response=True) # Shutdown virtual frame buffer self.__send_control_signal({'command':'self.stop_virtual_frame_buffer()', 'signal':'process_control'}, check_response=True) # Send stop signal to servers self.__send_control_signal({'command':'stop_service_requested = True', 'signal':'stop', 'force_command_request_interruption':force_command_request_interruption, 'finalize_mpi_environment':finalize_mpi_environment}, check_response=False) # Finalize MPI environment if finalize_mpi_environment: try: casalog.post("Going to finalize MPI environment","INFO",casalog_call_origin) MPIEnvironment.finalize_mpi_environment() except: formatted_traceback = traceback.format_exc() casalog.post("Exception finalizing MPI environment %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) else: casalog.post("MPIServers with rank %s are in timeout condition, skipping MPI_Finalize()" % str(server_rank_timeout),"SEVERE",casalog_call_origin) # UnMark MPI environment to be finalized by the MPICommunicator destructor # (Either because it is already finalized 
or due to a # server not responsive that prevents graceful finalization) self.__communicator.set_finalize_mpi_environment(False) # Set life cycle state self.__life_cycle_state = 2 casalog.post("All services stopped","INFO",casalog_call_origin) except: formatted_traceback = traceback.format_exc() print "Unhandled exception in MPICommandClient::stop_services %s" %(formatted_traceback) def push_command_request(self,command,block=False,target_server=None,parameters=None): casalog_call_origin = "MPICommandClient::push_command_request" if self.__life_cycle_state == 0: casalog.post("Services not started","WARN",casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin) return command_request = {} command_request['command']=command command_request['parameters'] = parameters # Determine whether command is a statement or an expression if command == "push": command_request['mode']='push' casalog.post("Requested push operation","DEBUG",casalog_call_origin) else: # Determine whether command is a statement or an expression try: code = compile(command_request['command'],"send_command_request", "eval") command_request['mode']='eval' casalog.post("Command will be evaluated as an expression with return value", "DEBUG",casalog_call_origin) except: try: code = compile(command_request['command'],"send_command_request", "exec") command_request['mode']='exec' casalog.post("Command will be executed as an statement w/o return code", "DEBUG",casalog_call_origin) except: formatted_traceback = traceback.format_exc() casalog.post("Command cannot be executed neither as a statement nor as an expression, it will be rejected: %s" % str(formatted_traceback),"SEVERE",casalog_call_origin) return None # Validate target servers target_server_validated = None if target_server is not None: target_server_validated = self.__validate_target_servers(target_server) # Exit if target server is not validated if 
target_server_validated is None: return None # Create command request list command_request_id_list = [] if target_server_validated is not None: for server in target_server_validated: command_request_id = self.__register_command_request(command_request,server) command_request_id_list.append(command_request_id) else: command_request_id = self.__register_command_request(command_request,None) command_request_id_list.append(command_request_id) # Wake up command request/response service threads self.__command_request_queue_service_event_controller.set() self.__command_response_handler_service_event_controller.set() # In blocking mode wait until command response is received otherwise return request id if block: command_return_code_list = self.get_command_response(command_request_id_list,True,False) return command_return_code_list # Otherwise we simply return the command request id else: return command_request_id_list def get_command_response(self,command_request_id_list,block=False,verbose=True): casalog_call_origin = "MPICommandClient::get_command_response" command_response_list = [] if block: # Wait until command request response is received or timeout pending_command_request_id_list = list(command_request_id_list) while len(pending_command_request_id_list)>0: for command_request_id in command_request_id_list: # Check if command request id is still pending if command_request_id in pending_command_request_id_list: # Check if we have response for command request id if self.__command_response_list.has_key(command_request_id): # Remove command request id from pending list pending_command_request_id_list.remove(command_request_id) else: server = self.__command_request_list[command_request_id]['server'] if server is not None and self.__monitor_client.get_server_status_keyword(server,'timeout'): casalog.post("Command request with id# %s sent to server n# %s, but the server has timed out" % (str(command_request_id),str(server)),"SEVERE",casalog_call_origin) # Remove command 
request id from pending list pending_command_request_id_list.remove(command_request_id) time.sleep(MPIEnvironment.mpi_push_command_request_block_mode_sleep_time) # Gather command response list for command_request_id in command_request_id_list: if self.__command_response_list.has_key(command_request_id): command_response = dict(self.__command_response_list[command_request_id]) command_response_list.append(command_response) else: command_response = self.__format_command_response_timeout(command_request_id) command_response_list.append(command_response) # Gather return codes #command_return_code_list = [] #for command_response in command_response_list: # successful = command_response['successful'] # if not successful: # command_return_code_list.append([command_response['id'],False, command_response['traceback']]) # elif command_response['mode'] == 'eval': # command_return_code_list.append([command_response['id'],True,command_response['ret']]) # else: # command_return_code_list.append([command_response['id'],True,None]) # Return command return code list return command_response_list else: command_response_list = [] for command_request_id in command_request_id_list: if not self.__command_response_list.has_key(command_request_id): server = self.__command_request_list[command_request_id]['server'] timeout = self.__monitor_client.get_server_status_keyword(server,'timeout') if timeout: casalog.post("Command request with id# %s sent to server n# %s, but the server has timed out" % (str(command_request_id),str(server)),"SEVERE",casalog_call_origin) command_response = self.__format_command_response_timeout(command_request_id) command_response_list.append(command_response) elif verbose: status = self.__command_request_list[command_request_id]['status'] casalog.post("Command request with id# %s is in %s state assigned to server %s" % (str(command_request_id),status,str(server)),"INFO",casalog_call_origin) else: command_response = 
dict(self.__command_response_list[command_request_id]) command_response_list.append(command_response) return command_response_list def get_command_response_event(self,command_request_id_list): # Get command group response id command_group_response_id = self.__command_group_response_counter # Setup event object command_group_response_event = threading.Event() command_group_response_event.clear() # Setup command group response command_group_response = {} command_group_response['id'] = command_group_response_id command_group_response['list'] = list(command_request_id_list) # Make a copy of the list command_group_response['event'] = command_group_response_event # Register command group response self.__command_group_response_list[command_group_response_id]=command_group_response for command_request_id in command_request_id_list: self.__command_request_list[command_request_id]['group'] = command_group_response_id # Increment command id counter self.__command_group_response_counter = self.__command_group_response_counter + 1 # Return command response event object return command_group_response_event def get_server_status(self,server=None): return self.__monitor_client.get_server_status(server) def get_command_request_list(self): return self.__command_request_list def get_command_response_list(self): return self.__command_response_list def set_log_mode(self,logmode): self.__log_mode = logmode def set_log_level(self,log_level): casalog_call_origin = "MPICommandClient::set_log_level" if self.__life_cycle_state == 0: casalog.post("Services not started","WARN",casalog_call_origin) return elif self.__life_cycle_state == 2: casalog.post("MPICommandClient life cycle finalized","WARN",casalog_call_origin) return if log_level not in log_levels: casalog.post("Unknown log level %s, recognized levels are: %s" % (str(log_level),str(log_levels)), "WARN",casalog_call_origin) return MPIEnvironment.command_handling_log_level = log_level 
self.__send_control_signal({'command':"MPIEnvironment.command_handling_log_level = '%s'" % log_level, 'signal':'process_control'}, check_response=True) # EOF
class __MPIMonitorServerImpl:
    """
    Implementation of the MPIMonitorServer singleton interface.

    Holds a status dictionary describing this process (rank, processor,
    pid, busy flag, current command and its start/stop times) and runs a
    background service that answers every ping status request received
    through the MPICommunicator with that dictionary.
    """

    def __init__(self, start_services=True):
        """
        Set up the status dictionary and the service bookkeeping flags.

        start_services -- when true (the default) the ping status request
                          handler service is started right away.
        """

        # Status dictionary sent back on every ping status request
        self.__status = {
            'rank': MPIEnvironment.mpi_processor_rank,
            'processor': MPIEnvironment.hostname,
            'pid': os.getpid(),
            'busy': False,
            'command': None,
            'command_start_time': None,
            'command_stop_time': None,
        }

        # State of the ping status request handler service
        self.__ping_status_request_handler_service_on = False
        self.__ping_status_request_handler_service_final_round = False
        self.__ping_status_request_handler_service_running = False
        self.__ping_status_request_handler_service_thread = None
        self.__last_ping_status_request_time = None
        self.__client_timeout = False

        # MPICommunicator reference used to probe/receive/answer pings
        self.__communicator = MPICommunicator()

        # Optionally start the services immediately
        if start_services:
            self.start_services()

    ################################################################################################################
    # Private methods ##############################################################################################
    ################################################################################################################

    def __ping_status_request_handler_service(self):
        """
        Body of the service thread: probe for ping status requests, receive
        them and send the status dictionary back.

        The loop keeps going while the service is switched on or while the
        final-round flag is set; the running flag is raised on entry and
        cleared when the loop is left.
        """

        origin = "MPIMonitorServer::ping_status_request_handler_service"

        # Tell the start/stop methods that the thread is alive
        self.__ping_status_request_handler_service_running = True

        while (self.__ping_status_request_handler_service_on
               or self.__ping_status_request_handler_service_final_round):

            # 1st: find out whether a ping status request is waiting
            try:
                request_pending = self.__communicator.ping_status_request_probe()
            except Exception as error:
                request_pending = False
                casalog.post(
                    "Exception checking if ping status request msg is available: %s"
                    % str(error), "SEVERE", origin)

            # 2nd: receive it, remembering when the request was seen
            request_received = False
            if request_pending:
                self.__last_ping_status_request_time = time.time()
                try:
                    self.__communicator.ping_status_request_recv()
                except Exception as error:
                    casalog.post(
                        "Exception receiving ping status request msg: %s" %
                        str(error), "SEVERE", origin)
                else:
                    request_received = True

            # jagonzal: Intensive activity in the client can cause monitoring client service to be slowed down
            #           This is due to Python's GIL which is acquired by the CASA SWIG components
            #           Using SWIG's thread option it is possible to disable GIL within the SWIG components
            #           (see test_mpi4casa[test1_applycal_fluxscale_gcal_bcal])
            # The client heartbeat check below is therefore disabled, which means
            # nothing in this class ever sets the client timeout flag to True:
            # elif self.__last_ping_status_request_time is not None:
            #     elapsed_time = time.time() - self.__last_ping_status_request_time
            #     if (elapsed_time > MPIEnvironment.mpi_ping_status_request_handler_service_timeout):
            #         casalog.post("Heartbeat from client not received in the last %ss" %
            #                      str(int(round(elapsed_time))),"WARN",casalog_call_origin)
            #         self.__client_timeout = True

            # 3rd: nothing received -> sleep; otherwise answer with the status
            if not request_received:
                time.sleep(
                    MPIEnvironment.
                    mpi_ping_status_request_handler_service_sleep_time)
            else:
                try:
                    self.__communicator.ping_status_response_send(
                        response=self.__status)
                except:
                    casalog.post(
                        "Exception sending back ping status response: %s" %
                        str(traceback.format_exc()), "SEVERE", origin)

            # 4th: a requested final round is consumed at the end of the iteration
            if self.__ping_status_request_handler_service_final_round:
                self.__ping_status_request_handler_service_final_round = False

        # Tell the start/stop methods that the thread is done
        self.__ping_status_request_handler_service_running = False

    def __start_ping_status_request_handler_service(self):
        """
        Launch the service thread and wait until it reports itself running.

        Returns True when the service is running (also when it already was),
        False when the thread could not be started.
        """

        origin = "MPIMonitorServer::start_ping_status_request_handler_service"

        # Nothing to do if the thread is already alive
        if self.__ping_status_request_handler_service_running:
            casalog.post(
                "MPI ping status request handler service is already running",
                "WARN", origin)
            return True

        try:
            # The flag has to be up before the thread evaluates its loop condition
            self.__ping_status_request_handler_service_on = True
            self.__ping_status_request_handler_service_thread = thread.start_new_thread(
                self.__ping_status_request_handler_service, ())
        except Exception as error:
            self.__ping_status_request_handler_service_on = False
            self.__ping_status_request_handler_service_running = False
            casalog.post(
                "Exception starting MPI ping status request handler service: %s"
                % str(error), "SEVERE", origin)
            return False

        # Poll until the thread has raised its running flag
        while not self.__ping_status_request_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI ping status request handler service started",
                     "INFO", origin)
        return True

    def __stop_ping_status_request_handler_service(self):
        """
        Switch the service off and wait until the thread has left its loop.

        Only logs a warning when the service is not switched on.
        """

        origin = "MPIMonitorServer::stop_ping_status_request_handler_service"

        if not self.__ping_status_request_handler_service_on:
            casalog.post(
                "MPI ping status request handler service is not running",
                "WARN", origin)
            return

        # The final-round flag is raised before the on flag is lowered, so the
        # loop condition stays true until the loop reaches the end-of-iteration
        # check that clears the final-round flag
        self.__ping_status_request_handler_service_final_round = True
        self.__ping_status_request_handler_service_on = False

        # Poll until the thread has cleared its running flag
        while self.__ping_status_request_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI ping status request handler service stopped",
                     "INFO", origin)

    ################################################################################################################
    # Public methods ###############################################################################################
    ################################################################################################################

    def start_services(self):
        """Start the ping status request handler service."""
        self.__start_ping_status_request_handler_service()

    def stop_services(self):
        """Stop the ping status request handler service."""
        self.__stop_ping_status_request_handler_service()

    def get_client_timeout(self):
        """Return the client timeout flag."""
        return self.__client_timeout

    def get_status(self, keyword=None):
        """
        Return a copy of the whole status dictionary when no keyword is given,
        otherwise the value stored under keyword.

        An unknown keyword is reported with a warning and yields None.
        """

        origin = "MPIMonitorServer::get_status"

        # No keyword: hand out a copy so callers cannot alter the internal dict
        if keyword is None:
            return dict(self.__status)

        # Known keyword: hand out the mapped value
        if keyword in self.__status:
            return self.__status[keyword]

        casalog.post("Status keyword %s not defined" % str(keyword), "WARN",
                     origin)

    def set_status(self, keyword, value):
        """
        Store value under an existing status keyword.

        An unknown keyword is reported with a warning and nothing is stored.
        """

        origin = "MPIMonitorServer::set_status"

        if keyword not in self.__status:
            casalog.post("Status keyword %s not defined" % str(keyword),
                         "WARN", origin)
            return

        self.__status[keyword] = value
class __MPIMonitorServerImpl:
    """
    Implementation of the MPIMonitorServer singleton interface

    Keeps a status dictionary for this process and runs a background
    service that replies to ping status requests with that dictionary.
    """

    def __init__(self,start_services=True):
        """
        Initialize the status dictionary and the service state.

        start_services -- if True (default) start the ping status request
                          handler service at construction time.
        """

        # Initialize status state dict
        self.__status = {}
        self.__status['rank'] = MPIEnvironment.mpi_processor_rank
        self.__status['processor'] = MPIEnvironment.hostname
        self.__status['pid'] = os.getpid()
        self.__status['busy'] = False
        self.__status['command'] = None
        self.__status['command_start_time'] = None
        self.__status['command_stop_time'] = None

        # Initialize ping status request handler service state
        self.__ping_status_request_handler_service_on = False
        self.__ping_status_request_handler_service_final_round = False
        self.__ping_status_request_handler_service_running = False
        self.__ping_status_request_handler_service_thread = None
        self.__last_ping_status_request_time = None
        self.__client_timeout = False

        # Instantiate MPICommunicator reference
        self.__communicator = MPICommunicator()

        # Automatically start services
        if start_services:
            self.start_services()

    ################################################################################################################
    # Private methods ##############################################################################################
    ################################################################################################################

    def __ping_status_request_handler_service(self):
        """
        Service loop executed in its own thread.

        While the service is on (or a final round has been requested) it
        probes for a ping status request, receives it and sends back the
        status dictionary; when no request was received it sleeps instead.
        """

        casalog_call_origin = "MPIMonitorServer::ping_status_request_handler_service"

        # Mark service as running
        self.__ping_status_request_handler_service_running = True

        while (self.__ping_status_request_handler_service_on or self.__ping_status_request_handler_service_final_round):

            # First check if there is a msg available
            msg_available = False
            try:
                msg_available = self.__communicator.ping_status_request_probe()
            except Exception as instance:
                casalog.post("Exception checking if ping status request msg is available: %s"
                             % str(instance),"SEVERE",casalog_call_origin)
                msg_available = False

            # Then receive ping status request msg
            msg_received = False
            if (msg_available):
                self.__last_ping_status_request_time = time.time()
                try:
                    self.__communicator.ping_status_request_recv()
                    msg_received = True
                except Exception as instance:
                    casalog.post("Exception receiving ping status request msg: %s"
                                 % str(instance),"SEVERE",casalog_call_origin)
                    msg_received = False
            # jagonzal: Intensive activity in the client can cause monitoring client service to be slowed down
            #           This is due to Python's GIL which is acquired by the CASA SWIG components
            #           Using SWIG's thread option it is possible to disable GIL within the SWIG components
            #           (see test_mpi4casa[test1_applycal_fluxscale_gcal_bcal])
            # Check when we last received a ping status request
            # (disabled, so the client timeout flag is never raised by this class)
            # elif self.__last_ping_status_request_time is not None:
            #     elapsed_time = time.time() - self.__last_ping_status_request_time
            #     if (elapsed_time > MPIEnvironment.mpi_ping_status_request_handler_service_timeout):
            #         casalog.post("Heartbeat from client not received in the last %ss" %
            #                      str(int(round(elapsed_time))),"WARN",casalog_call_origin)
            #         self.__client_timeout = True

            # Send back response
            if (msg_received):
                try:
                    self.__communicator.ping_status_response_send(response=self.__status)
                except:
                    # Catch-all: the failure is logged and the loop carries on
                    formatted_traceback = traceback.format_exc()
                    casalog.post("Exception sending back ping status response: %s"
                                 % str(formatted_traceback),"SEVERE",casalog_call_origin)
            else:
                time.sleep(MPIEnvironment.mpi_ping_status_request_handler_service_sleep_time)

            # Check if this was last round
            if (self.__ping_status_request_handler_service_final_round):
                self.__ping_status_request_handler_service_final_round = False

        # Mark service as not running
        self.__ping_status_request_handler_service_running = False

    def __start_ping_status_request_handler_service(self):
        """
        Start the service thread and block until it marks itself as running.

        Returns True if the service is running (including the case where it
        was already running), False if starting the thread raised.
        """

        casalog_call_origin = "MPIMonitorServer::start_ping_status_request_handler_service"

        if self.__ping_status_request_handler_service_running:
            casalog.post("MPI ping status request handler service is already running","WARN",casalog_call_origin)
            return True

        try:
            # The on flag must be set before the thread checks its loop condition
            self.__ping_status_request_handler_service_on = True
            self.__ping_status_request_handler_service_thread = thread.start_new_thread(self.__ping_status_request_handler_service, ())
        except Exception as instance:
            self.__ping_status_request_handler_service_on = False
            self.__ping_status_request_handler_service_running = False
            casalog.post("Exception starting MPI ping status request handler service: %s"
                         % str(instance),"SEVERE",casalog_call_origin)
            return False

        # Wait for the service thread to mark itself as running
        while (not self.__ping_status_request_handler_service_running):
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI ping status request handler service started","INFO",casalog_call_origin)

        return True

    def __stop_ping_status_request_handler_service(self):
        """
        Switch the service off and block until the thread marks itself as not running.

        Logs a warning and returns immediately if the service is not on.
        """

        casalog_call_origin = "MPIMonitorServer::stop_ping_status_request_handler_service"

        if not self.__ping_status_request_handler_service_on:
            casalog.post("MPI ping status request handler service is not running","WARN",casalog_call_origin)
            return

        # Request the final round before switching off, so that the service loop
        # condition holds until the loop itself clears the final round flag
        self.__ping_status_request_handler_service_final_round = True
        self.__ping_status_request_handler_service_on = False

        # Wait for the service thread to mark itself as not running
        while (self.__ping_status_request_handler_service_running):
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI ping status request handler service stopped","INFO",casalog_call_origin)

    ################################################################################################################
    # Public methods ###############################################################################################
    ################################################################################################################

    def start_services(self):
        """Start the ping status request handler service."""
        self.__start_ping_status_request_handler_service()

    def stop_services(self):
        """Stop the ping status request handler service."""
        self.__stop_ping_status_request_handler_service()

    def get_client_timeout(self):
        """Return the client timeout flag."""
        return self.__client_timeout

    def get_status(self,keyword=None):
        """
        Return the status of this server.

        keyword -- None (default) to get a copy of the whole status
                   dictionary, or the name of a single status keyword.

        Returns the copy of the dictionary, the value mapped to keyword, or
        None (after posting a warning) when keyword is not defined.
        """

        casalog_call_origin = "MPIMonitorServer::get_status"

        # If no keyword is provided return a copy of the status dictionary
        if keyword is None:
            return dict(self.__status)
        # If keyword is provided check existence and return the mapped value
        # ('in' rather than dict.has_key, which no longer exists in Python 3)
        elif keyword in self.__status:
            return self.__status[keyword]
        else:
            casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)

    def set_status(self,keyword,value):
        """
        Set the value of an existing status keyword.

        keyword -- name of the status keyword, must already be defined.
        value   -- new value to store.

        When keyword is not defined a warning is posted and nothing is stored.
        """

        casalog_call_origin = "MPIMonitorServer::set_status"

        # Only already defined keywords can be updated
        # ('in' rather than dict.has_key, which no longer exists in Python 3)
        if keyword in self.__status:
            self.__status[keyword] = value
        else:
            casalog.post("Status keyword %s not defined" % str(keyword),"WARN",casalog_call_origin)
class __MPIMonitorClientImpl:
    """Implementation of the MPIMonitorClient singleton interface.

    Keeps one status dictionary per MPI server rank and drives two
    background threads:

    * the *monitor status service*, which periodically sends a ping status
      request to every server that has no request pending and flags servers
      whose response is overdue (``'timeout'``);
    * the *ping status response handler service*, which receives the
      responses and stores the reported command information.

    Both threads are controlled through a pair of flags: ``*_on`` is the
    request set by the start/stop methods, ``*_running`` is set and cleared
    by the thread itself.
    """

    def __init__(self, start_services=True):
        """Build the per-server status table and optionally start the services.

        Args:
            start_services: When true (default), both background services
                are started before the constructor returns.
        """
        # Initialize server status state
        self.__server_status_list = {}
        for rank in MPIEnvironment.mpi_server_rank_list():
            self.__server_status_list[rank] = {
                'rank': rank,
                'processor': None,
                'pid': None,
                'busy': False,
                'command': None,
                'command_start_time': None,
                'pong_pending': False,
                # Number of heartbeats spent waiting for the pending response.
                # Initialised here so the monitor loop never hits a missing
                # key when 'pong_pending' is switched on from outside.
                'pong_checks': 0,
                'ping_time': None,
                'pong_time': None,
                'timeout': False,
            }

        # Initialize monitor service state
        self.__monitor_status_service_on = False
        self.__monitor_status_service_running = False
        self.__monitor_status_service_thread = None

        # Initialize ping status response handler service state
        self.__ping_status_response_handler_service_on = False
        self.__ping_status_response_handler_service_running = False
        self.__ping_status_response_handler_service_thread = None

        # Instantiate MPICommunicator reference
        self.__communicator = MPICommunicator()

        # Automatically start services
        if start_services:
            self.start_services()

    ############################################################################
    # Private methods ##########################################################
    ############################################################################

    def __monitor_status_service(self):
        """Thread body: ping every server once per heartbeat and detect timeouts.

        Runs until ``__monitor_status_service_on`` is cleared.
        """
        casalog_call_origin = "MPIMonitorClient::monitor_status_service"

        # Mark service as running
        self.__monitor_status_service_running = True

        mpi_server_rank_list = MPIEnvironment.mpi_server_rank_list()

        while self.__monitor_status_service_on:
            # Iterate over servers
            for rank in mpi_server_rank_list:
                server_status = self.__server_status_list[rank]

                if not server_status['pong_pending']:
                    # Send ping status request if there is none pending
                    try:
                        self.__communicator.ping_status_request_send(server=rank)
                        server_status['ping_time'] = time.time()
                        server_status['pong_pending'] = True
                        server_status['pong_checks'] = 0
                    except Exception:
                        formatted_traceback = traceback.format_exc()
                        casalog.post("Exception sending ping status request to server %s: %s"
                                     % (str(rank), str(formatted_traceback)),
                                     "SEVERE", casalog_call_origin)
                else:
                    # The waiting time is estimated from the number of
                    # heartbeats elapsed since the request was sent, not from
                    # the wall clock.
                    server_status['pong_checks'] += 1
                    elapsed_time = (MPIEnvironment.mpi_monitor_status_service_heartbeat
                                    * server_status['pong_checks'])

                    # Notify (once) when a server reaches timeout condition
                    if (elapsed_time > MPIEnvironment.mpi_monitor_status_service_timeout
                            and not server_status['timeout']):
                        casalog.post("Ping status response from server %s not received in the last %ss"
                                     % (str(rank), str(int(elapsed_time))),
                                     "SEVERE", casalog_call_origin)
                        server_status['timeout'] = True

            # Sleep before next round
            time.sleep(MPIEnvironment.mpi_monitor_status_service_heartbeat)

        # Mark service as not running
        self.__monitor_status_service_running = False

    def __start_monitor_status_service(self):
        """Start the monitor status thread and wait until it reports running.

        Returns:
            True if the service is running on return (including the case
            where it was already running), False if the thread could not be
            started.
        """
        casalog_call_origin = "MPIMonitorClient::start_monitor_status_service"

        if self.__monitor_status_service_running:
            casalog.post("MPI monitor status service is already running",
                         "WARN", casalog_call_origin)
            return True

        try:
            self.__monitor_status_service_on = True
            self.__monitor_status_service_thread = thread.start_new_thread(
                self.__monitor_status_service, ())
        except Exception:
            formatted_traceback = traceback.format_exc()
            self.__monitor_status_service_on = False
            self.__monitor_status_service_running = False
            casalog.post("Exception starting MPI monitor status service: %s"
                         % str(formatted_traceback),
                         "SEVERE", casalog_call_origin)
            return False

        # Block until the thread has flagged itself as running
        while not self.__monitor_status_service_running:
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI monitor status service started", "INFO", casalog_call_origin)

        return True

    def __stop_monitor_status_service(self):
        """Ask the monitor status thread to stop and wait until it has exited its loop."""
        casalog_call_origin = "MPIMonitorClient::stop_monitor_status_service"

        if not self.__monitor_status_service_running:
            # This message used to name the ping status response handler
            # service, which is a different service.
            casalog.post("MPI monitor status service is not running",
                         "WARN", casalog_call_origin)
            return

        self.__monitor_status_service_on = False

        # Block until the thread has cleared its running flag
        while self.__monitor_status_service_running:
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI monitor status service stopped", "INFO", casalog_call_origin)

    def __ping_status_response_handler_service(self):
        """Thread body: receive ping status responses and update the status table.

        Runs until ``__ping_status_response_handler_service_on`` is cleared.
        """
        casalog_call_origin = "MPIMonitorClient::ping_status_response_handler_service"

        # Mark service as running
        self.__ping_status_response_handler_service_running = True

        while self.__ping_status_response_handler_service_on:

            # First check if there is a ping_status response msg available
            msg_available = False
            try:
                msg_available = self.__communicator.ping_status_response_probe()
            except Exception:
                msg_available = False
                formatted_traceback = traceback.format_exc()
                casalog.post("Exception checking if ping status response msg is available: %s"
                             % str(formatted_traceback),
                             "SEVERE", casalog_call_origin)

            if not msg_available:
                # Nothing to receive: back off before probing again
                time.sleep(MPIEnvironment.mpi_ping_status_response_handler_service_sleep_time)
                continue

            # Then receive, store and post ping_status response msg
            try:
                ping_status_response = self.__communicator.ping_status_response_recv()
                pong_time = time.time()
                rank = ping_status_response['rank']
                server_status = self.__server_status_list[rank]
                server_status['command'] = ping_status_response['command']
                server_status['command_start_time'] = ping_status_response['command_start_time']
                server_status['pong_time'] = pong_time
                server_status['pong_pending'] = False
                elapsed_time = pong_time - server_status['ping_time']
                # Notify if the response has been received after timeout
                if server_status['timeout']:
                    server_status['timeout'] = False
                    casalog.post("Ping status response from server %s finally received after %ss"
                                 % (str(rank), str(int(elapsed_time))),
                                 "WARN", casalog_call_origin)
            except Exception:
                formatted_traceback = traceback.format_exc()
                casalog.post("Exception receiving ping status response msg: %s"
                             % str(formatted_traceback),
                             "SEVERE", casalog_call_origin)

        # Mark service as not running
        self.__ping_status_response_handler_service_running = False

    def __start_ping_status_response_handler_service(self):
        """Start the response handler thread and wait until it reports running.

        Returns:
            True if the service is running on return (including the case
            where it was already running), False if the thread could not be
            started.
        """
        casalog_call_origin = "MPIMonitorClient::start_ping_status_response_handler_service"

        if self.__ping_status_response_handler_service_running:
            casalog.post("MPI ping status response handler service is already running",
                         "WARN", casalog_call_origin)
            return True

        try:
            self.__ping_status_response_handler_service_on = True
            self.__ping_status_response_handler_service_thread = thread.start_new_thread(
                self.__ping_status_response_handler_service, ())
        except Exception:
            formatted_traceback = traceback.format_exc()
            self.__ping_status_response_handler_service_on = False
            self.__ping_status_response_handler_service_running = False
            casalog.post("Exception starting MPI ping status response handler service: %s"
                         % str(formatted_traceback),
                         "SEVERE", casalog_call_origin)
            return False

        # Block until the thread has flagged itself as running
        while not self.__ping_status_response_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_start_service_sleep_time)

        casalog.post("MPI ping status response handler service started",
                     "INFO", casalog_call_origin)

        return True

    def __stop_ping_status_response_handler_service(self):
        """Ask the response handler thread to stop and wait until it has exited its loop."""
        casalog_call_origin = "MPIMonitorClient::stop_ping_status_response_handler_service"

        if not self.__ping_status_response_handler_service_running:
            casalog.post("MPI ping status response handler service is not running",
                         "WARN", casalog_call_origin)
            return

        self.__ping_status_response_handler_service_on = False

        # Block until the thread has cleared its running flag
        while self.__ping_status_response_handler_service_running:
            time.sleep(MPIEnvironment.mpi_check_stop_service_sleep_time)

        casalog.post("MPI ping status response handler service stopped",
                     "INFO", casalog_call_origin)

    ############################################################################
    # Public methods ###########################################################
    ############################################################################

    def start_services(self):
        """Start the response handler first, then the monitor that sends pings."""
        self.__start_ping_status_response_handler_service()
        self.__start_monitor_status_service()

    def stop_services(self):
        """Stop the services in the reverse order of start_services()."""
        self.__stop_monitor_status_service()
        self.__stop_ping_status_response_handler_service()

    def get_server_status(self, server=None):
        """Return status information for one server or for all of them.

        Args:
            server: Rank of the server, or None (default) for all servers.

        Returns:
            With ``server=None``, a shallow copy of the rank -> status
            mapping (the per-server dictionaries themselves are not copied).
            With a known rank, a copy of that server's status dictionary.
            With an unknown rank, a warning is logged and None is returned.
        """
        casalog_call_origin = "MPIMonitorClient::get_server_status"

        if server is None:
            return dict(self.__server_status_list)

        if server in self.__server_status_list:
            return dict(self.__server_status_list[server])

        casalog.post("Server n# %s is out of range" % str(server),
                     "WARN", casalog_call_origin)

    def get_server_status_keyword(self, server, keyword):
        """Return one status value of one server.

        Args:
            server: Rank of the server.
            keyword: Status keyword to look up.

        Returns:
            The stored value; None (after logging a warning) when the rank
            or the keyword is unknown.
        """
        casalog_call_origin = "MPIMonitorClient::get_server_status_keyword"

        if server not in self.__server_status_list:
            casalog.post("Server n# %s is out of range" % str(server),
                         "WARN", casalog_call_origin)
            return

        if keyword not in self.__server_status_list[server]:
            casalog.post("Status keyword %s not defined" % str(keyword),
                         "WARN", casalog_call_origin)
            return

        return self.__server_status_list[server][keyword]

    def set_server_status_keyword(self, server, keyword, value):
        """Overwrite one existing status value of one server.

        Unknown ranks and unknown keywords are rejected with a warning; new
        keywords are never created.

        Args:
            server: Rank of the server.
            keyword: Existing status keyword to overwrite.
            value: New value to store.
        """
        casalog_call_origin = "MPIMonitorClient::set_server_status_keyword"

        if server not in self.__server_status_list:
            casalog.post("Server n# %s is out of range" % str(server),
                         "WARN", casalog_call_origin)
            return

        if keyword not in self.__server_status_list[server]:
            casalog.post("Status keyword %s not defined" % str(keyword),
                         "WARN", casalog_call_origin)
            return

        self.__server_status_list[server][keyword] = value

    def get_server_rank_available(self, verbose=False):
        """Return the ranks of the servers that are neither busy nor in timeout.

        Args:
            verbose: Accepted for interface compatibility; not used here.
        """
        return [rank for rank, status in self.__server_status_list.items()
                if not (status['busy'] or status['timeout'])]

    def get_server_rank_online(self, verbose=False):
        """Return the ranks of the servers that are not in timeout.

        Args:
            verbose: Accepted for interface compatibility; not used here.
        """
        return [rank for rank, status in self.__server_status_list.items()
                if not status['timeout']]

    def get_server_timeout(self):
        """Return the ranks of the servers whose 'timeout' flag is exactly True."""
        return [rank for rank, status in self.__server_status_list.items()
                if status['timeout'] is True]