def _process_msg_pull(self, addr, msg):
    """
    Handles one message received from a host while the controller is in pull mode

    Connection lost/restored status messages are turned into connection entries in the execution log
    of the host. Any other message is stored as-is and then reported on the logger according to its
    type. Nothing is written to disk when output is suppressed.

    :param addr: The address of the sender
    :param msg: The message dictionary
    """
    is_status, status = MessageClient.is_status_message(msg)
    store_output = not self._suppressOutput

    # Connection status notifications only produce a connection entry in the execution log
    if is_status and status == MessageClient.CONNECTION_LOST_MSG:
        if store_output:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time()))
        return
    if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
        if store_output:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time(), restored=True))
        return

    # Every other message is stored verbatim before being reported on the terminal
    if store_output:
        self._writers[addr].write_entry(msg)

    logger = InjectorController.logger
    msg_type = msg[MessageBuilder.FIELD_TYPE]
    if msg_type == MessageBuilder.STATUS_START:
        logger.info("Task %s started on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_RESTART:
        logger.info("Task %s restarted on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_END:
        logger.info("Task %s terminated successfully on host %s"
                    % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        # Terminated tasks may carry output data, which is handled by _write_task_output
        if store_output:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.STATUS_ERR:
        logger.error("Task %s terminated with error code %s on host %s"
                     % (msg[MessageBuilder.FIELD_DATA], str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
        if store_output:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.STATUS_GREET:
        # The greet reply tells whether the engine is currently bound to an injection session
        if msg[MessageBuilder.FIELD_ISF]:
            status_string = 'An injection session is in progress'
        else:
            status_string = 'No injection session is in progress'
        logger.info("Greetings. Engine %s is alive with %s currently active tasks. %s"
                    % (formatipport(addr), str(msg[MessageBuilder.FIELD_DATA]), status_string))
def _pull(self):
    """
    Starts the injection server in pull mode: that is, no workload is injected, and the execution
    logs are stored as messages are sent from the connected hosts.

    The method returns immediately if no host is registered; otherwise it never returns.
    """
    self._client.start()
    if self._client.get_n_registered_hosts() == 0:
        InjectorController.logger.warning("No connected hosts for pulling information. Aborting...")
        return

    # A greet command is broadcast to all hosts before starting to collect their messages
    self._client.broadcast_msg(MessageBuilder.command_greet(0))

    hosts = self._client.get_registered_hosts()
    self._writers = {}
    self._outputsDirs = {}
    for host in hosts:
        out_dir = format_output_directory(self._resultsDir, host)
        self._outputsDirs[host] = out_dir
        if self._suppressOutput:
            continue
        # The outputs directory is flushed, and a fresh execution log writer is created for the host
        if isdir(out_dir):
            rmtree(out_dir, ignore_errors=True)
        self._writers[host] = ExecutionLogWriter(format_injection_filename(self._resultsDir, host))

    # The loop does not end; it is up to users to terminate the listening process by killing it
    while True:
        sender, incoming = self._client.pop_msg_queue()
        self._process_msg_pull(sender, incoming)
def _process_result(self, task, timestamp, rcode, outdata=''):
    """
    Method that sends a broadcast message to all connected hosts when a task terminates

    An error status message is built when the return code is non-zero, an end status message
    otherwise. The message is not sent if the calling thread has been asked to terminate.

    :param task: The msg related to the task that has terminated
    :param timestamp: The timestamp related to the termination time
    :param rcode: The return code of the task's execution
    :param outdata: the shell output of the task, if it is a benchmark. None and empty output are
        both treated as "no output"
    """
    task.timestamp = timestamp
    # If output logging is not enabled, the task is a fault, or there is no output at all, the output
    # data is discarded. A truthiness test is used so that a None output does not raise.
    if not self._log_outputs or task.isFault or not outdata:
        outdata = None
    if rcode != 0:
        msg = MessageBuilder.status_error(task, rcode, outdata)
    else:
        msg = MessageBuilder.status_end(task, outdata)
    # NOTE(review): has_to_terminate() is not a standard Thread method; this presumably relies on
    # being called from one of the pool's own worker threads - confirm against the thread pool class
    if msg is not None and not current_thread().has_to_terminate():
        self._server.broadcast_msg(msg)
def _inform_start(self, task, timestamp):
    """
    Broadcasts a start status message to all connected hosts when a task is started

    :param task: The msg related to the task that has been started
    :param timestamp: The timestamp related to the starting time
    """
    task.timestamp = timestamp
    start_msg = MessageBuilder.status_start(task)
    # Nothing is sent if no status message could be built for the task
    if start_msg is None:
        return
    self._server.broadcast_msg(start_msg)
def _inform_restart(self, task, timestamp, rcode):
    """
    Broadcasts a restart status message to all connected hosts when a task is restarted

    :param task: The msg related to the task that has terminated
    :param timestamp: The timestamp related to the termination time
    :param rcode: The return code of the task's execution
    """
    task.timestamp = timestamp
    # A return code of zero is not reported as an error in the restart message
    error = rcode
    if rcode == 0:
        error = None
    restart_msg = MessageBuilder.status_restart(task, error)
    if restart_msg is None:
        return
    self._server.broadcast_msg(restart_msg)
def _end_session(self):
    """
    Terminates the injection session for all connected hosts

    A session-end command is broadcast, then the method waits until as many positive acks as there
    were registered hosts have been received, or until ``self._sessionWait`` seconds have elapsed.
    Each positive ack is recorded in the execution log of its host, unless output is suppressed.
    The execution log writers are closed on exit, even if processing a message raises.
    """
    msg_end = MessageBuilder.command_session(time(), end=True)
    self._client.broadcast_msg(msg_end)

    session_closed = 0
    session_sent = self._client.get_n_registered_hosts()
    session_check_start = time()
    session_check_now = time()
    try:
        while session_check_now - session_check_start < self._sessionWait and session_closed < session_sent:
            # All messages that are already queued are processed at each wake-up, so that the number
            # of hosts that can be acknowledged is not bounded by one message per sleep period
            while session_closed < session_sent and self._client.peek_msg_queue() > 0:
                addr, msg = self._client.pop_msg_queue()
                msg_type = msg[MessageBuilder.FIELD_TYPE]
                if msg_type == MessageBuilder.ACK_YES:
                    InjectorController.logger.info(
                        "Injection session closed with engine %s" % formatipport(addr))
                    if not self._suppressOutput:
                        self._writers[addr].write_entry(
                            MessageBuilder.command_session(msg[MessageBuilder.FIELD_TIME], end=True))
                    session_closed += 1
                else:
                    # If we receive a message that is not an ack after all tasks have terminated,
                    # something is wrong
                    InjectorController.logger.error(
                        "Ack expected from engine %s, got %s" % (formatipport(addr), msg_type))
            # We only sleep if there still are acks to wait for
            if session_closed < session_sent:
                sleep(self._sleepPeriod)
            session_check_now = time()
    finally:
        # All of the execution log writers are closed, and the session finishes
        if not self._suppressOutput:
            for writer in self._writers.values():
                writer.close()
def _update_session(self, addr, msg):
    """
    Checks and updates session-related information

    In a fault injection session, the master is the only host allowed to issue commands to this
    server. All other connected hosts can only monitor information. An ack (positive or negative)
    is always sent back to the sender host.

    :param addr: The (ip, port) address of the sender host
    :param msg: The message dictionary
    """
    command = msg[MessageBuilder.FIELD_TYPE]
    accepted = False
    err_code = None

    if command == MessageBuilder.COMMAND_END_SESSION and addr == self._master:
        # The current master has terminated its session: the session state is cleared
        self._master = None
        self._session_timestamp = -1
        accepted = True
        InjectorEngine.logger.info('Injection session terminated with controller %s' % formatipport(addr))
    elif command == MessageBuilder.COMMAND_START_SESSION:
        requested_ts = msg[MessageBuilder.FIELD_TIME]
        known_hosts = self._server.get_registered_hosts()
        # A request is accepted if there is no master, if the previous master is no longer among the
        # registered hosts, or if the sender is the master itself
        master_slot_free = self._master is None or self._master not in known_hosts or self._master == addr
        if not master_slot_free:
            InjectorEngine.logger.info('Injection session rejected with controller %s' % formatipport(addr))
        else:
            # The only case in which the thread pool is left untouched is when message re-sending is
            # enabled and the command refers to the session that is already in place with a master,
            # which is being restored after a disconnection. In every other case the pool is reset,
            # to prevent orphan tasks from a previous session from running, and the ack carries an
            # error code.
            resumes_session = (self._server.reSendMsgs
                               and self._session_timestamp == requested_ts
                               and self._master is not None)
            if not resumes_session:
                self._pool.stop(kill_abruptly=True)
                self._pool.start()
                err_code = -1
            self._master = addr
            self._session_timestamp = requested_ts
            accepted = True
            InjectorEngine.logger.info('Injection session started with controller %s' % formatipport(addr))

    self._server.send_msg(addr, MessageBuilder.ack(time(), accepted, err_code))
def listen(self):
    """
    Listens for incoming fault injection requests and executes them

    The SIGINT and SIGTERM handlers are installed, then the subprocess manager, the server and the
    thread pool are started. After that, incoming messages are dispatched in an endless loop
    according to their type and to whether they come from the current session master.
    """
    InjectorEngine.logger.info("FINJ Injection Engine v%s started" % VER_ID)
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, self._signalhandler)
    self._subman.start_subprocesses()
    self._server.start()
    self._pool.start()

    session_commands = (MessageBuilder.COMMAND_START_SESSION, MessageBuilder.COMMAND_END_SESSION)
    while True:
        # Waiting for a new request to arrive
        addr, msg = self._server.pop_msg_queue()
        msg_type = msg[MessageBuilder.FIELD_TYPE]
        sender_is_master = addr == self._master
        master_is_set = self._master is not None

        # Session commands are processed separately
        if msg_type in session_commands:
            self._update_session(addr, msg)
            continue
        # The set time is sent by the master after a successful ack and defines when the 'workload'
        # is started
        if msg_type == MessageBuilder.COMMAND_SET_TIME and master_is_set and sender_is_master:
            self._pool.reset_session(msg[MessageBuilder.FIELD_TIME], time())
            continue
        # Clock correction requests are accepted from the master only
        if msg_type == MessageBuilder.COMMAND_CORRECT_TIME and master_is_set and sender_is_master:
            self._pool.correct_time(msg[MessageBuilder.FIELD_TIME])
            continue
        # Processing a termination command
        if msg_type == MessageBuilder.COMMAND_TERMINATE:
            self._check_for_termination(addr, msg)
            continue
        # A new task issued by the current session master is added to the thread pool queue
        if sender_is_master and msg_type == MessageBuilder.COMMAND_START:
            self._pool.submit_task(Task.msg_to_task(msg))
            continue
        # Greet commands are answered with the number of active tasks and the session state
        if msg_type == MessageBuilder.COMMAND_GREET:
            reply = MessageBuilder.status_greet(time(), self._pool.active_tasks(), self._master is not None)
            self._server.send_msg(addr, reply)
            continue
        InjectorEngine.logger.warning('Invalid command sent from non-master host %s', formatipport(addr))
def _process_msg_inject(self, addr, msg):
    """
    Handles one message received from a host that is involved in an injection session

    Connection status messages update the session state (log entry on loss, session re-start on
    restore, removal of the host's pending tasks on finalization). All other messages are reported
    on the logger and, except for acks, stored in the execution log of the host.

    :param addr: The address of the sender
    :param msg: The message dictionary
    """
    is_status, status = MessageClient.is_status_message(msg)

    if is_status and status == MessageClient.CONNECTION_LOST_MSG:
        # A lost connection is only recorded in the execution log of the host
        if not self._suppressOutput:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time()))
        return

    if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
        # If connection has been restored with an host, we send a new session start command,
        # followed by the current virtual timestamp
        self._client.send_msg(addr, MessageBuilder.command_session(self._session_id))
        self._client.send_msg(addr, MessageBuilder.command_set_time(self._get_timestamp(time())))
        return

    if is_status and status == MessageClient.CONNECTION_FINALIZED_MSG:
        self._pendingTasks.pop(addr, None)
        # If all connections to servers were finalized we assume that the injection can be terminated
        if not self._pendingTasks:
            self._endReached = True
            self._reader.close()
        return

    logger = InjectorController.logger
    msg_type = msg[MessageBuilder.FIELD_TYPE]

    # Ack messages are not written to the output log
    if msg_type != MessageBuilder.ACK_YES and msg_type != MessageBuilder.ACK_NO and not self._suppressOutput:
        self._writers[addr].write_entry(msg)

    # We log on the terminal the content of the message in a pretty form
    if msg_type == MessageBuilder.STATUS_START:
        logger.info("Task %s started on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_RESTART:
        logger.info("Task %s restarted on host %s" % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
    elif msg_type == MessageBuilder.STATUS_END:
        logger.info("Task %s terminated successfully on host %s"
                    % (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        # If a task terminates, we remove its sequence number from the set of pending tasks for the host
        self._pendingTasks[addr].discard(msg[MessageBuilder.FIELD_SEQNUM])
        if not self._suppressOutput:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.STATUS_ERR:
        logger.error("Task %s terminated with error code %s on host %s"
                     % (msg[MessageBuilder.FIELD_DATA], str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
        self._pendingTasks[addr].discard(msg[MessageBuilder.FIELD_SEQNUM])
        if not self._suppressOutput:
            self._write_task_output(addr, msg)
    elif msg_type == MessageBuilder.ACK_YES:
        # ACK messages after the initialization phase are received ONLY when a connection is restored,
        # and the session must be resumed
        logger.warning("Session resumed with engine %s" % formatipport(addr))
        if not self._suppressOutput:
            self._writers[addr].write_entry(MessageBuilder.status_connection(time(), restored=True))
        # If the ack msg contains an error, it means all previously running tasks have been lost
        if MessageBuilder.FIELD_ERR in msg:
            self._pendingTasks[addr] = set()
            if not self._suppressOutput:
                self._writers[addr].write_entry(MessageBuilder.status_reset(msg[MessageBuilder.FIELD_TIME]))
    elif msg_type == MessageBuilder.ACK_NO:
        logger.warning("Session cannot be resumed with engine %s" % formatipport(addr))
        self._client.remove_host(addr)
def _init_session(self, workload_name):
    """
    Initializes the injection session for all connected hosts

    A session-start command is broadcast, then the method waits until every registered host has
    replied with an ack (positive or negative), or until ``self._sessionWait`` seconds have elapsed.
    Hosts that reject the session are removed; on timeout, hosts that did not accept are removed too.

    :param workload_name: The name of the workload to be injected
    :return: the number of hosts that have accepted the injection start command, and the timestamp
        ID of the session
    """
    session_start_timestamp = time()
    msg_start = MessageBuilder.command_session(session_start_timestamp)
    self._client.broadcast_msg(msg_start)

    self._writers = {}
    self._outputsDirs = {}
    self._pendingTasks = {}
    session_accepted = set()
    session_replied = 0
    session_sent = self._client.get_n_registered_hosts()
    session_check_start = time()
    session_check_now = time()
    while session_check_now - session_check_start < self._sessionWait and session_replied < session_sent:
        # We wait until we receive an ack (positive or negative) from all connected hosts, or we time out.
        # All replies that are already queued are processed at each wake-up; the loop stops popping as
        # soon as all hosts have replied, so that later messages are left in the queue.
        while session_replied < session_sent and self._client.peek_msg_queue() > 0:
            addr, msg = self._client.pop_msg_queue()
            msg_type = msg[MessageBuilder.FIELD_TYPE]
            if msg_type == MessageBuilder.ACK_YES:
                # If an host replies to the injection start command with a positive ack, its log writer
                # is instantiated, together with its entry in the pendingTasks dictionary
                InjectorController.logger.info("Injection session started with engine %s" % formatipport(addr))
                session_accepted.add(addr)
                session_replied += 1
                self._outputsDirs[addr] = format_output_directory(self._resultsDir, addr, workload_name)
                # The outputs directory needs to be flushed before starting the new injection session
                if not self._suppressOutput:
                    if isdir(self._outputsDirs[addr]):
                        rmtree(self._outputsDirs[addr], ignore_errors=True)
                    self._writers[addr] = ExecutionLogWriter(
                        format_injection_filename(self._resultsDir, addr, workload_name))
                    self._writers[addr].write_entry(
                        MessageBuilder.command_session(msg[MessageBuilder.FIELD_TIME]))
                self._pendingTasks[addr] = set()
            elif msg_type == MessageBuilder.ACK_NO:
                # If an host rejects the injection start command, we discard it
                InjectorController.logger.warning(
                    "Injection session request rejected by engine %s" % formatipport(addr))
                session_replied += 1
                self._client.remove_host(addr)
        # We only sleep if there still are replies to wait for
        if session_replied < session_sent:
            sleep(self._sleepPeriod)
        session_check_now = time()

    if session_check_now - session_check_start >= self._sessionWait:
        # If we have reached the time out, it means that not all of the connected hosts have replied.
        # This is highly unlikely, but could still happen. In this case, we remove all hosts that have
        # not accepted the session
        InjectorController.logger.warning("Injection session startup reached the timeout limit")
        # A snapshot is iterated, since hosts are removed from the client while looping
        for addr in list(self._client.get_registered_hosts()):
            if addr not in session_accepted:
                self._client.remove_host(addr)
    return len(session_accepted), session_start_timestamp
def _inject(self, reader, max_tasks=None):
    """
    Starts the injection process with a given workload, issuing commands to start tasks on remote
    hosts and collecting their result

    The method returns early, after logging a warning, if the workload has no entries or if no host
    accepts the injection session. Otherwise it loops until the end of the workload has been reached
    and no tasks are pending, and then terminates the session.

    :param reader: a valid Reader object
    :param max_tasks: The maximum number of tasks to be processed before terminating. Useful for
        debugging
    """
    # The reader is stored on the instance because _process_msg_inject may close it as well
    self._reader = reader
    assert isinstance(
        reader, Reader), '_inject method only supports Reader objects!'
    # The first entry is read right away: it is needed to detect empty workloads and to compute the
    # starting timestamp
    task = reader.read_entry()
    if task is None:
        InjectorController.logger.warning(
            "Input workload appears to be empty. Aborting...")
        return

    self._client.start()

    # Initializing the injection session; the workload name is the file name of the reader's path,
    # without its extension
    session_accepted, session_id = self._init_session(
        workload_name=splitext(basename(reader.get_path()))[0])
    if session_accepted == 0:
        InjectorController.logger.warning(
            "No valid hosts for injection detected. Aborting...")
        return
    self._session_id = session_id

    # Determines if we have reached the end of the workload
    self._endReached = False
    read_tasks = 0

    # Start timestamp for the workload, computed from its first entry, minus the specified padding value
    self._start_timestamp = task.timestamp - self._workloadPadding
    # Synchronizes the time with all of the connected hosts
    self._client.broadcast_msg(
        MessageBuilder.command_set_time(self._start_timestamp))
    # Absolute timestamp associated to the workload's starting timestamp
    self._start_timestamp_abs = time()
    # Timestamp of the last correction that was applied to the clock of remote hosts
    last_clock_correction = self._start_timestamp_abs

    while not self._endReached or self._tasks_are_pending():
        # We keep looping as long as there are tasks from the workload that still need to be read, or
        # some tasks are still pending
        while self._client.peek_msg_queue() > 0:
            # We process all messages in the input queue, and write their content to the execution log
            # for the given host
            addr, msg = self._client.pop_msg_queue()
            self._process_msg_inject(addr, msg)

        # We compute the new "virtual" timestamp, in function of the workload's starting time
        now_timestamp_abs = time()
        now_timestamp = self._get_timestamp(now_timestamp_abs)

        # We perform periodically a correction of the clock of the remote hosts. This has impact only
        # when there is a very large drift between the clocks, of several minutes
        # If the sliding window for the task injection is disabled (negative pre-send interval) there
        # is no need to perform clock correction
        if now_timestamp_abs - last_clock_correction > self._clockCorrectionPeriod and self._preSendInterval >= 0:
            msg = MessageBuilder.command_correct_time(now_timestamp)
            self._client.broadcast_msg(msg)
            last_clock_correction = now_timestamp_abs

        while not self._endReached and (
                task.timestamp < now_timestamp + self._preSendInterval or self._preSendInterval < 0):
            # We read all entries from the workload that correspond to tasks scheduled to start within
            # the pre-send interval, and issue the related commands. With a negative pre-send interval,
            # all entries are issued at once. This supposes that the workload entries are ordered by
            # their timestamp
            msg = MessageBuilder.command_start(task)
            self._client.broadcast_msg(msg)
            # The task is broadcast, hence it becomes pending for every host in the session
            for s in self._pendingTasks.values():
                s.add(task.seqNum)
            task = reader.read_entry()
            read_tasks += 1
            # The workload ends when there are no more entries, or the task limit has been reached
            if task is None or (max_tasks is not None and read_tasks >= max_tasks):
                self._endReached = True
                reader.close()
        # This is a polling loop, which sleeps for self._sleepPeriod at each iteration
        sleep(self._sleepPeriod)

    self._end_session()