Example #1
0
    def _process_msg_pull(self, addr, msg):
        """
        Handles a message coming from an engine while in pull mode, that is when no fault is being injected

        Connection lost/restored notifications are turned into connection entries of the execution log. Every
        other message is stored in the log as-is (unless output is suppressed) and then reported on the logger
        depending on its type.

        :param addr: The address of the sender
        :param msg: The message dictionary
        """
        is_status, status = MessageClient.is_status_message(msg)
        log_enabled = not self._suppressOutput

        # Connection notifications only leave a trace in the execution log of the host
        if is_status and status == MessageClient.CONNECTION_LOST_MSG:
            if log_enabled:
                self._writers[addr].write_entry(
                    MessageBuilder.status_connection(time()))
            return
        if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
            if log_enabled:
                self._writers[addr].write_entry(
                    MessageBuilder.status_connection(time(), restored=True))
            return

        # Any other message is first stored in the log, then summarized on the terminal
        if log_enabled:
            self._writers[addr].write_entry(msg)
        msg_type = msg[MessageBuilder.FIELD_TYPE]
        logger = InjectorController.logger

        if msg_type == MessageBuilder.STATUS_START:
            logger.info("Task %s started on host %s" %
                        (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        elif msg_type == MessageBuilder.STATUS_RESTART:
            logger.info("Task %s restarted on host %s" %
                        (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        elif msg_type in (MessageBuilder.STATUS_END, MessageBuilder.STATUS_ERR):
            # Both kinds of task termination are logged and followed by the storage of the task output
            if msg_type == MessageBuilder.STATUS_END:
                logger.info("Task %s terminated successfully on host %s" %
                            (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
            else:
                logger.error("Task %s terminated with error code %s on host %s" %
                             (msg[MessageBuilder.FIELD_DATA],
                              str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
            if log_enabled:
                self._write_task_output(addr, msg)
        elif msg_type == MessageBuilder.STATUS_GREET:
            if msg[MessageBuilder.FIELD_ISF]:
                status_string = 'An injection session is in progress'
            else:
                status_string = 'No injection session is in progress'
            logger.info(
                "Greetings. Engine %s is alive with %s currently active tasks. %s"
                % (formatipport(addr), str(msg[MessageBuilder.FIELD_DATA]), status_string))
Example #2
0
 def _update_session(self, addr, msg):
     """
     Handles session start/end commands and replies to the sender with an ack

     During an injection session only the master host is allowed to issue commands to this server, while all
     other connected hosts can just monitor information. A session end command is honoured only when it comes
     from the current master; a session start command is honoured when there is no master, when the master is
     no longer among the registered hosts, or when the sender is the master itself.

     :param addr: The (ip, port) address of the sender host
     :param msg: The message dictionary
     """
     msg_type = msg[MessageBuilder.FIELD_TYPE]
     accepted = False
     err_code = None

     if msg_type == MessageBuilder.COMMAND_END_SESSION and addr == self._master:
         # The current master closed its session: the session state is cleared
         self._master = None
         self._session_timestamp = -1
         accepted = True
         InjectorEngine.logger.info(
             'Injection session terminated with controller %s' %
             formatipport(addr))
     elif msg_type == MessageBuilder.COMMAND_START_SESSION:
         requested_ts = msg[MessageBuilder.FIELD_TIME]
         known_hosts = self._server.get_registered_hosts()
         master_slot_free = (self._master is None
                             or self._master not in known_hosts
                             or self._master == addr)
         if not master_slot_free:
             InjectorEngine.logger.info(
                 'Injection session rejected with controller %s' %
                 formatipport(addr))
         else:
             # A session is merely resumed when message re-sending is enabled and the request carries the
             # timestamp of the session already running with a known master. In every other case the thread
             # pool is restarted, so that orphan tasks of a previous session do not keep running, and the
             # reset is signalled to the controller through the error field of the ack.
             resuming = (self._server.reSendMsgs
                         and self._session_timestamp == requested_ts
                         and self._master is not None)
             if not resuming:
                 self._pool.stop(kill_abruptly=True)
                 self._pool.start()
                 err_code = -1
             # The sender becomes (or stays) the master of the session
             self._master = addr
             self._session_timestamp = requested_ts
             accepted = True
             InjectorEngine.logger.info(
                 'Injection session started with controller %s' %
                 formatipport(addr))

     # Whatever the outcome, a positive or negative ack is returned to the sender
     reply = MessageBuilder.ack(time(), accepted, err_code)
     self._server.send_msg(addr, reply)
Example #3
0
    def _remove_host(self, address):
        """
        Closes the connection of a registered host and drops it from the active hosts

        If the address is not registered, an error is logged and nothing else happens.

        :param address: The (ip, port) address corresponding to the host to remove
        """
        if address not in self._registeredHosts:
            MessageEntity.logger.error(
                'Cannot remove host %s, does not exist' %
                formatipport(address))
            return

        # The socket is closed before its entry disappears from the registry
        connection = self._registeredHosts[address]
        connection.close()
        self._registeredHosts.pop(address, None)
        self._update_read_set()
Example #4
0
    def _end_session(self):
        """
        Closes the injection session on every connected host

        A session end command is broadcast, then the message queue is polled until each host that was registered
        at broadcast time has acknowledged it, or until the session wait time has elapsed. Finally all of the
        execution log writers are closed (unless output is suppressed).
        """
        self._client.broadcast_msg(
            MessageBuilder.command_session(time(), end=True))

        acks_received = 0
        acks_expected = self._client.get_n_registered_hosts()
        wait_start = time()
        now = time()
        while now - wait_start < self._sessionWait and acks_received < acks_expected:
            # One message at most is consumed per polling round
            if self._client.peek_msg_queue() > 0:
                addr, msg = self._client.pop_msg_queue()
                msg_type = msg[MessageBuilder.FIELD_TYPE]
                if msg_type != MessageBuilder.ACK_YES:
                    # Anything other than a positive ack is unexpected at this stage
                    InjectorController.logger.error(
                        "Ack expected from engine %s, got %s" %
                        (formatipport(addr), msg_type))
                else:
                    InjectorController.logger.info(
                        "Injection session closed with engine %s" %
                        formatipport(addr))
                    if not self._suppressOutput:
                        closing_entry = MessageBuilder.command_session(
                            msg[MessageBuilder.FIELD_TIME], end=True)
                        self._writers[addr].write_entry(closing_entry)
                    acks_received += 1
            sleep(self._sleepPeriod)
            now = time()

        # The session is over: every execution log writer is released
        if not self._suppressOutput:
            for log_writer in self._writers.values():
                log_writer.close()
Example #5
0
 def add_servers(self, addrs):
     """
     Opens a connection with each of the specified servers and registers it among the active hosts

     Servers that cannot be reached are skipped after logging a warning, and the socket created for the
     failed attempt is closed so that no file descriptor is left behind.

     :param addrs: The addresses of servers to which to connect, in (ip, port) tuple format. Either a single
         (ip, port) tuple or a list/tuple of such tuples is accepted
     """
     if addrs is None:
         MessageClient.logger.error(
             'You must specify one or more addresses to start the client')
         return
     # A lone (ip, port) pair is itself a tuple, hence it must be told apart from a sequence of addresses:
     # iterating over it would yield the host string and the port number as if they were two addresses
     is_single_pair = (isinstance(addrs, tuple) and len(addrs) == 2
                       and isinstance(addrs[0], str) and isinstance(addrs[1], int))
     if is_single_pair or not isinstance(addrs, (list, tuple)):
         addrs = [addrs]
     for addr in addrs:
         try:
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             try:
                 sock.connect((socket.gethostbyname(addr[0]), addr[1]))
             except Exception:
                 # The socket never got connected: it is released before the error is handled below
                 # (or propagated, for errors that are not connection-related)
                 sock.close()
                 raise
             self._register_host(sock)
             MessageClient.logger.info(
                 'Successfully connected to server %s' % formatipport(addr))
         except (ConnectionError, TimeoutError, socket.gaierror):
             # ConnectionError also covers its ConnectionRefusedError/ConnectionAbortedError subclasses
             MessageClient.logger.warning('Could not connect to %s' %
                                          formatipport(addr))
Example #6
0
    def _register_host(self, connection, overwrite=False):
        """
        Stores a successfully established connection among the active hosts, keyed by its peer address

        When the peer address is already registered and overwriting is not requested, the new connection is
        closed and an error is logged instead.

        :param connection: the socket object corresponding to the host
        :param overwrite: if True, connections will be overwritten by new connections to the same host
        """
        addr = connection.getpeername()
        duplicate = addr in self._registeredHosts
        if duplicate and not overwrite:
            # The existing connection is kept and the new one is discarded
            connection.close()
            MessageEntity.logger.error(
                'Cannot register host %s, is already registered' %
                formatipport(addr))
            return

        self._registeredHosts[addr] = connection
        self._update_read_set()
Example #7
0
    def _restore_dangling_connections(self):
        """
        Tries to re-establish connection with "dangling" hosts

        A "dangling" host is one whose connection has been recently lost, in a time window that falls within
        retry_interval. If the connection could not be established by the end of the time window, the host is
        dropped and a CONNECTION_FINALIZED status message is injected in the input queue for it. Otherwise a new
        connection attempt is made every retry_period; on success the host is registered again, old messages are
        forwarded (if message re-sending is enabled) and a CONNECTION_RESTORED status message is injected.

        Each entry of the dangling dictionary is a two-element list: element 0 is refreshed with the current
        time at every connection attempt, element 1 is the reference for the retry_interval window (presumably
        the time at which the connection was lost - it is set outside of this method).
        """
        if not self._dangling:
            return
        time_now = time()
        to_pop = []
        for addr, time_list in self._dangling.items():
            # If a dangling host has passed its retry interval, we remove it completely
            if time_now - time_list[1] > self.retry_interval:
                self._add_to_input_queue(
                    addr, MessageEntity.CONNECTION_FINALIZED_MSG)
                to_pop.append(addr)
            # We retry establishing a connection with the dangling host
            elif time_now - time_list[0] >= self.retry_period:
                time_list[0] = time_now
                try:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    try:
                        sock.connect((socket.gethostbyname(addr[0]), addr[1]))
                    except Exception:
                        # A failed attempt must not leave an open socket behind: attempts are repeated
                        # periodically, so descriptors would otherwise pile up until garbage collection
                        sock.close()
                        raise
                    self._register_host(sock, overwrite=True)
                    if self.reSendMsgs:
                        self._forward_old_msgs(self._seq_nums[addr][1], addr)
                        # A header-only message asks the host to forward the messages we missed
                        self._send_msg(self._seq_nums[addr][0], addr, None)
                    to_pop.append(addr)
                    # When connection is re-established, we inject a status message for that host in the input queue
                    self._add_to_input_queue(
                        addr, MessageEntity.CONNECTION_RESTORED_MSG)
                    MessageClient.logger.info(
                        'Connection to server %s was successfully restored'
                        % formatipport(addr))
                except (ConnectionError, TimeoutError, socket.gaierror):
                    # Deliberately ignored: the host stays dangling and a new attempt is made after
                    # retry_period. Address resolution failures are treated like any other failed attempt.
                    # ConnectionError also covers ConnectionRefusedError and ConnectionAbortedError.
                    pass
        # Hosts that were either restored or finalized are not dangling anymore. They are removed here, and not
        # in the loop above, because the dictionary cannot change size while it is being iterated
        for addr in to_pop:
            self._dangling.pop(addr, None)
Example #8
0
    def _send_msg(self, seq_num, addr, comm):
        """
        Private method that sends a message to one of the registered hosts

        Each message is made of a 12-byte header, holding three unsigned 32-bit integers in network byte order
        (payload length and the two elements of the sequence number), followed by the JSON-encoded payload.

        :param seq_num: sequence number of the message to be sent in tuple format
        :param addr: address of the target host
        :param comm: content of the message. Must be supplied as a dictionary. If None, an empty message with its header
            only will be sent: this type of messages is used to identify message forwarding requests, with seq_num
            representing the sequence number of the last valid message received from the host
        :return: True if the message was successfully sent, False otherwise
        """
        # Without an open socket registered for the address, the message cannot be sent
        sock = self._registeredHosts.get(addr)
        if sock is None:
            MessageEntity.logger.error('Cannot send to %s, is not registered' %
                                       formatipport(addr))
            return False

        # A forwarding request carries no payload, hence its length field is zero
        payload = b'' if comm is None else json.dumps(comm).encode()
        msg = struct.pack('>III', len(payload), seq_num[0], seq_num[1]) + payload
        try:
            sock.sendall(msg)
            # Forwarding requests are not tracked in the history of sent messages
            if self.reSendMsgs and comm is not None:
                self._update_seq_num(addr, seq_num, received=False)
            return True
        except Exception:
            # If an error is encountered during communication, we suppose the host is dead.
            # The host is identified through the address we already hold, instead of asking the socket that
            # just failed: NOTE(review) the previously used getipport(sock) presumably queries the socket for
            # its peer, which may itself fail on a broken connection and escape this handler - confirm
            MessageEntity.logger.error(
                'Exception encountered while sending msg to %s' %
                formatipport(addr))
            return False
Example #9
0
 def listen(self):
     """
     Main loop of the engine: receives commands from the connected hosts and dispatches them

     After installing the SIGINT/SIGTERM handlers and starting subprocesses, server and thread pool, the
     method loops indefinitely over the messages popped from the server queue.
     """
     InjectorEngine.logger.info("FINJ Injection Engine v%s started" %
                                VER_ID)
     for signum in (signal.SIGINT, signal.SIGTERM):
         signal.signal(signum, self._signalhandler)
     self._subman.start_subprocesses()
     self._server.start()
     self._pool.start()

     session_commands = (MessageBuilder.COMMAND_START_SESSION,
                         MessageBuilder.COMMAND_END_SESSION)
     while True:
         # The next request is fetched from the server queue
         addr, msg = self._server.pop_msg_queue()
         msg_type = msg[MessageBuilder.FIELD_TYPE]
         from_master = self._master is not None and addr == self._master

         if msg_type in session_commands:
             # Session start/end requests are validated and acknowledged separately
             self._update_session(addr, msg)
         elif msg_type == MessageBuilder.COMMAND_SET_TIME and from_master:
             # Sent by the master after a successful ack, it defines when the 'workload' is started
             self._pool.reset_session(msg[MessageBuilder.FIELD_TIME],
                                      time())
         elif msg_type == MessageBuilder.COMMAND_CORRECT_TIME and from_master:
             # Clock correction request issued by the master
             self._pool.correct_time(msg[MessageBuilder.FIELD_TIME])
         elif msg_type == MessageBuilder.COMMAND_TERMINATE:
             self._check_for_termination(addr, msg)
         elif addr == self._master and msg_type == MessageBuilder.COMMAND_START:
             # Tasks issued by the session master are queued on the thread pool
             self._pool.submit_task(Task.msg_to_task(msg))
         elif msg_type == MessageBuilder.COMMAND_GREET:
             # Any host may ask for the current status of the engine
             greeting = MessageBuilder.status_greet(time(),
                                                    self._pool.active_tasks(),
                                                    self._master is not None)
             self._server.send_msg(addr, greeting)
         else:
             InjectorEngine.logger.warning(
                 'Invalid command sent from non-master host %s',
                 formatipport(addr))
Example #10
0
    def _process_msg_inject(self, addr, msg):
        """
        Handles a message coming from an engine that takes part in an injection session

        Connection status notifications drive the session bookkeeping (log entries, session resume commands,
        removal of finalized hosts), while regular messages are stored in the execution log, reported on the
        logger and used to keep the set of pending tasks of each host up to date.

        :param addr: The address of the sender
        :param msg: The message dictionary
        """
        is_status, status = MessageClient.is_status_message(msg)

        if is_status and status == MessageClient.CONNECTION_LOST_MSG:
            # A lost connection is only recorded in the execution log of the host
            if not self._suppressOutput:
                self._writers[addr].write_entry(
                    MessageBuilder.status_connection(time()))
            return

        if is_status and status == MessageClient.CONNECTION_RESTORED_MSG:
            # On a restored connection the session start command is sent again, followed by the set time command
            session_cmd = MessageBuilder.command_session(self._session_id)
            self._client.send_msg(addr, session_cmd)
            time_cmd = MessageBuilder.command_set_time(
                self._get_timestamp(time()))
            self._client.send_msg(addr, time_cmd)
            return

        if is_status and status == MessageClient.CONNECTION_FINALIZED_MSG:
            self._pendingTasks.pop(addr, None)
            # Once no host is left, we assume that the injection can be terminated
            if not self._pendingTasks:
                self._endReached = True
                self._reader.close()
            return

        msg_type = msg[MessageBuilder.FIELD_TYPE]
        is_ack = msg_type in (MessageBuilder.ACK_YES, MessageBuilder.ACK_NO)
        # Everything but acks goes to the output log
        if not is_ack and not self._suppressOutput:
            self._writers[addr].write_entry(msg)

        # The content of the message is then reported on the terminal
        logger = InjectorController.logger
        if msg_type == MessageBuilder.STATUS_START:
            logger.info("Task %s started on host %s" %
                        (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        elif msg_type == MessageBuilder.STATUS_RESTART:
            logger.info("Task %s restarted on host %s" %
                        (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
        elif msg_type in (MessageBuilder.STATUS_END, MessageBuilder.STATUS_ERR):
            if msg_type == MessageBuilder.STATUS_END:
                logger.info("Task %s terminated successfully on host %s" %
                            (msg[MessageBuilder.FIELD_DATA], formatipport(addr)))
            else:
                logger.error("Task %s terminated with error code %s on host %s" %
                             (msg[MessageBuilder.FIELD_DATA],
                              str(msg[MessageBuilder.FIELD_ERR]), formatipport(addr)))
            # A terminated task is not pending anymore for its host, no matter the outcome
            self._pendingTasks[addr].discard(msg[MessageBuilder.FIELD_SEQNUM])
            if not self._suppressOutput:
                self._write_task_output(addr, msg)
        elif msg_type == MessageBuilder.ACK_YES:
            # After the initialization phase, acks are received ONLY when a connection is restored and the
            # session must be resumed
            logger.warning("Session resumed with engine %s" % formatipport(addr))
            if not self._suppressOutput:
                self._writers[addr].write_entry(
                    MessageBuilder.status_connection(time(), restored=True))
            # An error field in the ack means that all previously running tasks have been lost
            if MessageBuilder.FIELD_ERR in msg:
                self._pendingTasks[addr] = set()
                if not self._suppressOutput:
                    self._writers[addr].write_entry(
                        MessageBuilder.status_reset(
                            msg[MessageBuilder.FIELD_TIME]))
        elif msg_type == MessageBuilder.ACK_NO:
            logger.warning("Session cannot be resumed with engine %s" %
                           formatipport(addr))
            self._client.remove_host(addr)
Example #11
0
    def _init_session(self, workload_name):
        """
        Opens the injection session on every connected host

        A session start command is broadcast, then the message queue is polled until every host that was
        registered at broadcast time has replied with an ack (positive or negative), or until the session wait
        time has elapsed. Hosts that reject the session, or that have not accepted it by the timeout, are removed.

        :param workload_name: The name of the workload to be injected
        :return: the number of hosts that have accepted the injection start command, and the timestamp ID of the session
        """
        session_start_timestamp = time()
        self._client.broadcast_msg(
            MessageBuilder.command_session(session_start_timestamp))

        # Per-session state is rebuilt from scratch
        self._writers = {}
        self._outputsDirs = {}
        self._pendingTasks = {}
        accepted_hosts = set()
        n_replies = 0
        n_expected = self._client.get_n_registered_hosts()
        wait_start = time()
        now = time()
        while now - wait_start < self._sessionWait and n_replies < n_expected:
            # One message at most is consumed per polling round
            if self._client.peek_msg_queue() > 0:
                addr, msg = self._client.pop_msg_queue()
                reply_type = msg[MessageBuilder.FIELD_TYPE]
                if reply_type == MessageBuilder.ACK_YES:
                    # A host accepting the session gets its output directory, its log writer (unless output
                    # is suppressed) and its entry in the pendingTasks dictionary
                    InjectorController.logger.info(
                        "Injection session started with engine %s" %
                        formatipport(addr))
                    accepted_hosts.add(addr)
                    n_replies += 1
                    out_dir = format_output_directory(
                        self._resultsDir, addr, workload_name)
                    self._outputsDirs[addr] = out_dir
                    if not self._suppressOutput:
                        # Outputs left by a previous session are flushed before the new one starts
                        if isdir(out_dir):
                            rmtree(out_dir, ignore_errors=True)
                        log_writer = ExecutionLogWriter(
                            format_injection_filename(self._resultsDir, addr,
                                                      workload_name))
                        self._writers[addr] = log_writer
                        log_writer.write_entry(
                            MessageBuilder.command_session(
                                msg[MessageBuilder.FIELD_TIME]))
                    self._pendingTasks[addr] = set()
                elif reply_type == MessageBuilder.ACK_NO:
                    # A host rejecting the session is discarded
                    InjectorController.logger.warning(
                        "Injection session request rejected by engine %s" %
                        formatipport(addr))
                    n_replies += 1
                    self._client.remove_host(addr)
            sleep(self._sleepPeriod)
            now = time()

        timed_out = now - wait_start >= self._sessionWait
        if timed_out:
            # Reaching the timeout is highly unlikely, but could still happen. In this case, every registered
            # host that has not accepted the session is removed
            InjectorController.logger.warning(
                "Injection session startup reached the timeout limit")
            for addr in self._client.get_registered_hosts():
                if addr not in accepted_hosts:
                    self._client.remove_host(addr)

        return len(accepted_hosts), session_start_timestamp