def _process_msg_pull(self, addr, msg):
    """
    Handles a message received while the controller is in pull mode, i.e. when no fault is being injected

    Unless output is suppressed, the message is recorded through the log writer of the sender host. Task
    status messages and greetings are additionally summarized through the controller logger.

    :param addr: The address of the sender
    :param msg: The message dictionary
    """
    is_status, status = MessageClient.is_status_message(msg)
    keep_output = not self._suppressOutput

    # Connection status notifications only leave a trace in the log of the host
    if is_status and status == MessageClient.CONNECTION_LOST_MSG:
        if keep_output:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time()))
        return
    if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
        if keep_output:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time(), restored=True))
        return

    # Every other message is stored as received, then reported on the terminal according to its type
    if keep_output:
        self._writers[addr].write_entry(msg)
    kind = msg[MessageBuilder.FIELD_TYPE]
    if kind == MessageBuilder.STATUS_START:
        InjectorController.logger.info(
            "Task %s started on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif kind == MessageBuilder.STATUS_RESTART:
        InjectorController.logger.info(
            "Task %s restarted on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif kind == MessageBuilder.STATUS_END:
        InjectorController.logger.info(
            "Task %s terminated successfully on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        # The output of a finished task is stored together with the log
        if keep_output:
            self._write_task_output(addr, msg)
    elif kind == MessageBuilder.STATUS_ERR:
        InjectorController.logger.error(
            "Task %s terminated with error code %s on host %s"
            % (msg[MessageBuilder.FIELD_DATA], str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
        if keep_output:
            self._write_task_output(addr, msg)
    elif kind == MessageBuilder.STATUS_GREET:
        if msg[MessageBuilder.FIELD_ISF]:
            session_state = 'An injection session is in progress'
        else:
            session_state = 'No injection session is in progress'
        InjectorController.logger.info(
            "Greetings. Engine %s is alive with %s currently active tasks. %s"
            % (formatipport(addr), str(msg[MessageBuilder.FIELD_DATA]), session_state))
def _update_session(self, addr, msg):
    """
    Handles a session start or session end command, and replies to the sender with an ack

    In a fault injection session, the master is the only host allowed to issue commands to this server,
    while all other connected hosts can only monitor information. A session end is accepted only from the
    current master. A session start is accepted when there is no master, when the master is no longer among
    the registered hosts, or when the sender is the master itself.

    :param addr: The (ip, port) address of the sender host
    :param msg: The message dictionary
    """
    command = msg[MessageBuilder.FIELD_TYPE]
    accepted = False
    err = None
    if command == MessageBuilder.COMMAND_END_SESSION and addr == self._master:
        # The master has closed its session: the engine goes back to having no master
        self._master = None
        self._session_timestamp = -1
        accepted = True
        InjectorEngine.logger.info('Injection session terminated with controller %s' % formatipport(addr))
    elif command == MessageBuilder.COMMAND_START_SESSION:
        session_ts = msg[MessageBuilder.FIELD_TIME]
        known_hosts = self._server.get_registered_hosts()
        master_is_free = self._master is None or self._master not in known_hosts or self._master == addr
        if not master_is_free:
            InjectorEngine.logger.info('Injection session rejected with controller %s' % formatipport(addr))
        else:
            # A start command resumes the running session only when message re-sending is enabled, the
            # timestamp matches the one of the current session and a master is set. In every other case the
            # session is brand new: the thread pool is restarted so that no orphan task of the previous
            # session keeps running, and the ack carries an error code
            resuming = self._server.reSendMsgs and self._session_timestamp == session_ts \
                and self._master is not None
            if not resuming:
                self._pool.stop(kill_abruptly=True)
                self._pool.start()
                err = -1
            # The sender becomes (or stays) the master of the session
            self._master = addr
            self._session_timestamp = session_ts
            accepted = True
            InjectorEngine.logger.info('Injection session started with controller %s' % formatipport(addr))
    # The outcome, positive or negative, is always notified to the sender host
    self._server.send_msg(addr, MessageBuilder.ack(time(), accepted, err))
def _remove_host(self, address):
    """
    Drops a host from the list of active hosts, closing its connection

    If the address is not registered, an error is logged and nothing else happens.

    :param address: The (ip, port) address corresponding to the host to remove
    """
    if address not in self._registeredHosts:
        MessageEntity.logger.error('Cannot remove host %s, does not exist' % formatipport(address))
        return
    # The connection is closed first, then its entry is discarded and the read set is refreshed
    connection = self._registeredHosts[address]
    connection.close()
    self._registeredHosts.pop(address, None)
    self._update_read_set()
def _end_session(self):
    """
    Closes the injection session with all connected hosts

    A session end command is broadcast, then the input queue is polled until every host that was registered
    at broadcast time has replied with a positive ack, or until the session wait time has elapsed. Finally,
    unless output is suppressed, all of the execution log writers are closed.
    """
    self._client.broadcast_msg(MessageBuilder.command_session(time(), end=True))
    acks_received = 0
    acks_expected = self._client.get_n_registered_hosts()
    started_at = time()
    now = time()
    while now - started_at < self._sessionWait and acks_received < acks_expected:
        if self._client.peek_msg_queue() > 0:
            addr, msg = self._client.pop_msg_queue()
            if msg[MessageBuilder.FIELD_TYPE] != MessageBuilder.ACK_YES:
                # Anything that is not an ack is unexpected once all tasks have terminated
                InjectorController.logger.error(
                    "Ack expected from engine %s, got %s" % (formatipport(addr), msg[MessageBuilder.FIELD_TYPE]))
            else:
                InjectorController.logger.info("Injection session closed with engine %s" % formatipport(addr))
                if not self._suppressOutput:
                    # The end of the session is recorded in the log of the host, with the time of the ack
                    closing_entry = MessageBuilder.command_session(msg[MessageBuilder.FIELD_TIME], end=True)
                    self._writers[addr].write_entry(closing_entry)
                acks_received += 1
        # The queue is polled once per sleep period
        sleep(self._sleepPeriod)
        now = time()
    # The session is over: the execution logs can be finalized
    if not self._suppressOutput:
        for writer in self._writers.values():
            writer.close()
def add_servers(self, addrs):
    """
    Opens a connection to each of the specified servers and registers it among the active hosts

    A connection failure is not fatal: a warning is logged for the server that could not be reached, and
    the remaining addresses are still processed.

    :param addrs: The addresses of servers to which to connect. Either a single (ip, port) tuple, or a
        list/tuple of (ip, port) tuples
    """
    if addrs is None:
        MessageClient.logger.error('You must specify one or more addresses to start the client')
        return
    # A lone (ip, port) pair is itself a tuple: it must be wrapped explicitly, otherwise its two fields
    # would be iterated as if they were addresses
    is_single_addr = isinstance(addrs, tuple) and len(addrs) == 2 and isinstance(addrs[0], str)
    if is_single_addr or not isinstance(addrs, (list, tuple)):
        addrs = [addrs]
    for addr in addrs:
        sock = None
        registered = False
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.connect((socket.gethostbyname(addr[0]), addr[1]))
            self._register_host(sock)
            registered = True
            MessageClient.logger.info('Successfully connected to server %s' % formatipport(addr))
        except OSError:
            # OSError is the common base of connection, timeout and name resolution errors, and also covers
            # unreachable hosts and networks. A socket that was never registered is closed here, so that
            # its file descriptor is not leaked
            if sock is not None and not registered:
                sock.close()
            MessageClient.logger.warning('Could not connect to %s' % formatipport(addr))
def _register_host(self, connection, overwrite=False):
    """
    Adds a host for which connection was successfully established to the list of active hosts

    If the peer address is already registered and overwrite is False, the new connection is closed and an
    error is logged. If overwrite is True, the new connection takes the place of the registered one, and
    the replaced socket is closed so that it is not leaked.

    :param connection: the socket object corresponding to the host
    :param overwrite: if True, connections will be overwritten by new connections to the same host
    """
    addr = connection.getpeername()
    already_registered = addr in self._registeredHosts
    if already_registered and not overwrite:
        connection.close()
        MessageEntity.logger.error('Cannot register host %s, is already registered' % formatipport(addr))
        return
    replaced = self._registeredHosts[addr] if already_registered else None
    self._registeredHosts[addr] = connection
    self._update_read_set()
    # The old socket is closed only after the read set has been refreshed, and never when the very same
    # socket object is being registered again
    if replaced is not None and replaced is not connection:
        replaced.close()
def _restore_dangling_connections(self):
    """
    Tries to re-establish connection with "dangling" hosts

    A "dangling" host is one whose connection has been recently lost, in a time window that falls within
    retry_interval. Connection is attempted again every retry_period: if it could not be established by the
    end of the time window, the host is dropped and a "connection finalized" status message is injected in
    the input queue. A failed attempt is silently ignored, and is repeated at the next retry period.
    """
    if not self._dangling:
        return
    time_now = time()
    to_pop = []
    for addr, time_list in self._dangling.items():
        # time_list[0] is the time of the last connection attempt, time_list[1] the start of the time window
        if time_now - time_list[1] > self.retry_interval:
            # If a dangling host has passed its retry interval, we remove it completely
            self._add_to_input_queue(addr, MessageEntity.CONNECTION_FINALIZED_MSG)
            to_pop.append(addr)
        elif time_now - time_list[0] >= self.retry_period:
            # We retry establishing a connection with the dangling host
            time_list[0] = time_now
            sock = None
            registered = False
            try:
                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                sock.connect((socket.gethostbyname(addr[0]), addr[1]))
                self._register_host(sock, overwrite=True)
                registered = True
                if self.reSendMsgs:
                    self._forward_old_msgs(self._seq_nums[addr][1], addr)
                    self._send_msg(self._seq_nums[addr][0], addr, None)
                to_pop.append(addr)
                # When connection is re-established, we inject a status message for that host in the input queue
                self._add_to_input_queue(addr, MessageEntity.CONNECTION_RESTORED_MSG)
                MessageClient.logger.info('Connection to server %s was successfully restored' % formatipport(addr))
            except OSError:
                # OSError also covers name resolution failures and unreachable hosts, which must not abort
                # the retry loop. A socket that was never registered is closed, otherwise each failed
                # attempt would leak a file descriptor
                if sock is not None and not registered:
                    sock.close()
    # Hosts that were restored or finalized are not dangling anymore. They are removed only now because
    # the dictionary cannot change size while it is being iterated
    for addr in to_pop:
        self._dangling.pop(addr, None)
def _send_msg(self, seq_num, addr, comm):
    """
    Sends a single message to one of the registered hosts

    On the wire, each message is made of a 12-byte header followed by the JSON-encoded content. The header
    contains three unsigned 32-bit integers in network byte order: the length of the content, and the two
    fields of the sequence number.

    :param seq_num: sequence number of the message to be sent in tuple format
    :param addr: address of the target host
    :param comm: content of the message. Must be supplied as a dictionary. If None, an empty message with
        its header only will be sent: this type of messages is used to identify message forwarding
        requests, with seq_num representing the sequence number of the last valid message received from
        the host
    :return: True if the message was successfully sent, False otherwise
    """
    # A message can only be sent to an address that has an open socket
    sock = self._registeredHosts.get(addr)
    if sock is None:
        MessageEntity.logger.error('Cannot send to %s, is not registered' % formatipport(addr))
        return False

    # A forwarding request carries no content, hence its header declares a length of zero
    payload = b'' if comm is None else json.dumps(comm).encode()
    packet = struct.pack('>III', len(payload), seq_num[0], seq_num[1]) + payload

    try:
        sock.sendall(packet)
        # Only messages with actual content take part in sequence number tracking
        if self.reSendMsgs and comm is not None:
            self._update_seq_num(addr, seq_num, received=False)
        return True
    except Exception:
        # If an error is encountered during communication, we suppose the host is dead
        MessageEntity.logger.error('Exception encountered while sending msg to %s' % getipport(sock))
        return False
def listen(self):
    """
    Starts the engine, then serves incoming fault injection requests in an endless loop

    Session commands and greetings are served for any host. Time commands and task start commands are
    served only when they come from the current session master. Termination commands are handed over to
    the termination check. Anything else is reported as an invalid command.
    """
    InjectorEngine.logger.info("FINJ Injection Engine v%s started" % VER_ID)
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, self._signalhandler)
    self._subman.start_subprocesses()
    self._server.start()
    self._pool.start()
    session_commands = (MessageBuilder.COMMAND_START_SESSION, MessageBuilder.COMMAND_END_SESSION)
    while True:
        # Waiting for the next request to arrive
        addr, msg = self._server.pop_msg_queue()
        msg_type = msg[MessageBuilder.FIELD_TYPE]
        if msg_type in session_commands:
            # Session commands are validated and acknowledged separately
            self._update_session(addr, msg)
        elif msg_type == MessageBuilder.COMMAND_SET_TIME and self._master is not None and addr == self._master:
            # The set time is sent by the master after a successful ack, and defines when the 'workload' starts
            self._pool.reset_session(msg[MessageBuilder.FIELD_TIME], time())
        elif msg_type == MessageBuilder.COMMAND_CORRECT_TIME and self._master is not None and addr == self._master:
            # Clock correction request issued by the master
            self._pool.correct_time(msg[MessageBuilder.FIELD_TIME])
        elif msg_type == MessageBuilder.COMMAND_TERMINATE:
            self._check_for_termination(addr, msg)
        elif addr == self._master and msg_type == MessageBuilder.COMMAND_START:
            # A new task issued by the session master is added to the thread pool queue
            self._pool.submit_task(Task.msg_to_task(msg))
        elif msg_type == MessageBuilder.COMMAND_GREET:
            # The reply reports the number of active tasks, and whether a session is in progress
            session_active = self._master is not None
            greeting = MessageBuilder.status_greet(time(), self._pool.active_tasks(), session_active)
            self._server.send_msg(addr, greeting)
        else:
            InjectorEngine.logger.warning('Invalid command sent from non-master host %s', formatipport(addr))
def _process_msg_inject(self, addr, msg):
    """
    Handles a message received from an engine that takes part in the injection session

    Connection status notifications drive the recovery of the session: a lost connection is recorded in the
    log of the host, a restored one causes the session to be proposed again, and a finalized one removes
    the host from the pending tasks, ending the injection when no host is left. All other messages are
    logged (except for acks), reported on the terminal, and used to keep track of the pending tasks.

    :param addr: The address of the sender
    :param msg: The message dictionary
    """
    is_status, status = MessageClient.is_status_message(msg)

    if is_status and status == MessageClient.CONNECTION_LOST_MSG:
        # The loss of connection is only recorded in the log of the host
        if not self._suppressOutput:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time()))
        return

    if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
        # The session start command is sent again, followed by the current session time
        self._client.send_msg(addr, MessageBuilder.command_session(self._session_id))
        self._client.send_msg(addr, MessageBuilder.command_set_time(self._get_timestamp(time())))
        return

    if is_status and status == MessageClient.CONNECTION_FINALIZED_MSG:
        self._pendingTasks.pop(addr, None)
        # If all connections to servers were finalized we assume that the injection can be terminated
        if not self._pendingTasks:
            self._endReached = True
            self._reader.close()
        return

    msg_type = msg[MessageBuilder.FIELD_TYPE]
    # Ack messages are not written to the output log
    if msg_type not in (MessageBuilder.ACK_YES, MessageBuilder.ACK_NO):
        if not self._suppressOutput:
            self._writers[addr].write_entry(msg)

    # The content of the message is reported on the terminal in a pretty form
    if msg_type == MessageBuilder.STATUS_START:
        InjectorController.logger.info(
            "Task %s started on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_RESTART:
        InjectorController.logger.info(
            "Task %s restarted on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_END:
        InjectorController.logger.info(
            "Task %s terminated successfully on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        # A terminated task is not pending anymore for its host
        self._pendingTasks[addr].discard(msg[MessageBuilder.FIELD_SEQNUM])
        if not self._suppressOutput:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.STATUS_ERR:
        InjectorController.logger.error(
            "Task %s terminated with error code %s on host %s"
            % (msg[MessageBuilder.FIELD_DATA], str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
        self._pendingTasks[addr].discard(msg[MessageBuilder.FIELD_SEQNUM])
        if not self._suppressOutput:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.ACK_YES:
        # ACK messages after the initialization phase are received ONLY when a connection is restored,
        # and the session must be resumed
        InjectorController.logger.warning("Session resumed with engine %s" % formatipport(addr))
        if not self._suppressOutput:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time(), restored=True))
        # If the ack msg contains an error, it means all previously running tasks have been lost
        if MessageBuilder.FIELD_ERR in msg:
            self._pendingTasks[addr] = set()
            if not self._suppressOutput:
                self._writers[addr].write_entry(MessageBuilder.status_reset(msg[MessageBuilder.FIELD_TIME]))
    elif msg_type == MessageBuilder.ACK_NO:
        # The engine refused to resume the session, hence it is dropped
        InjectorController.logger.warning("Session cannot be resumed with engine %s" % formatipport(addr))
        self._client.remove_host(addr)
def _init_session(self, workload_name):
    """
    Initializes the injection session for all connected hosts

    A session start command is broadcast, then the input queue is polled until every host that was
    registered at broadcast time has replied with an ack (positive or negative), or until the session wait
    time has elapsed. Hosts that reject the session, or that have not accepted it by the time the timeout
    is reached, are removed from the client.

    :param workload_name: The name of the workload to be injected
    :return: the number of hosts that have accepted the injection start command, and the timestamp ID of
        the session
    """
    session_start_timestamp = time()
    msg_start = MessageBuilder.command_session(session_start_timestamp)
    self._client.broadcast_msg(msg_start)

    # Per-host state of the previous session, if any, is discarded
    self._writers = {}
    self._outputsDirs = {}
    self._pendingTasks = {}
    session_accepted = set()
    session_replied = 0
    session_sent = self._client.get_n_registered_hosts()
    session_check_start = time()
    session_check_now = time()
    while session_check_now - session_check_start < self._sessionWait and session_replied < session_sent:
        # We wait until we receive an ack (positive or negative) from all connected hosts, or we time out
        if self._client.peek_msg_queue() > 0:
            addr, msg = self._client.pop_msg_queue()
            msg_type = msg[MessageBuilder.FIELD_TYPE]
            if msg_type == MessageBuilder.ACK_YES:
                # If a host replies to the injection start command with a positive ack, its log writer is
                # instantiated, together with its entry in the pendingTasks dictionary
                InjectorController.logger.info("Injection session started with engine %s" % formatipport(addr))
                session_accepted.add(addr)
                session_replied += 1
                self._outputsDirs[addr] = format_output_directory(self._resultsDir, addr, workload_name)
                if not self._suppressOutput:
                    # The outputs directory needs to be flushed before starting the new injection session
                    if isdir(self._outputsDirs[addr]):
                        rmtree(self._outputsDirs[addr], ignore_errors=True)
                    self._writers[addr] = ExecutionLogWriter(
                        format_injection_filename(self._resultsDir, addr, workload_name))
                    self._writers[addr].write_entry(
                        MessageBuilder.command_session(msg[MessageBuilder.FIELD_TIME]))
                self._pendingTasks[addr] = set()
            elif msg_type == MessageBuilder.ACK_NO:
                # If a host rejects the injection start command, we discard it
                InjectorController.logger.warning(
                    "Injection session request rejected by engine %s" % formatipport(addr))
                session_replied += 1
                self._client.remove_host(addr)
        sleep(self._sleepPeriod)
        session_check_now = time()

    if session_check_now - session_check_start >= self._sessionWait:
        # If we have reached the time out, it means that not all of the connected hosts have replied. This
        # is highly unlikely, but could still happen. In this case, we remove all hosts that have not replied
        InjectorController.logger.warning("Injection session startup reached the timeout limit")
        # NOTE(review): hosts are removed while looping, hence a snapshot is iterated in case
        # get_registered_hosts returns a live view of the registry rather than a copy - confirm
        for addr in list(self._client.get_registered_hosts()):
            if addr not in session_accepted:
                self._client.remove_host(addr)
    return len(session_accepted), session_start_timestamp