Exemple #1
0
 def setUp(self):
     """Build the list under test and pin ``time.time`` to 0."""
     super(TestExpiringList, self).setUp()
     self.exping_list = ExpiringList(10)

     # Patch the clock so a test can move time forward by changing
     # self.mock_time.return_value; the cleanup hook stops the patch.
     clock_patch = patch("time.time", return_value=0)
     self.addCleanup(clock_patch.stop)
     self.mock_time = clock_patch.start()
    def __init__(self):
        """Initialise empty session/RPC bookkeeping and a purged receive queue."""
        super(AgentRpcMessenger, self).__init__()

        # Lock object for whoever needs to serialise access to the state below
        self._lock = threading.Lock()

        # FQDN -> session ID
        self._sessions = {}

        # session ID -> dict of RPCs belonging to that session
        self._session_rpcs = defaultdict(dict)

        # Container for cancelled RPCs, built with 10 * 60
        # (presumably a lifetime in seconds; confirm against ExpiringList)
        self._cancelled_rpcs = ExpiringList(10 * 60)

        # purge() presumably discards messages queued before start-up;
        # confirm against AgentRxQueue
        rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
        rx_queue.purge()
        self._action_runner_rx_queue = rx_queue
Exemple #3
0
class TestExpiringList(ImlUnitTestCase):
    """Unit tests for ExpiringList, run against a patched ``time.time``."""

    def setUp(self):
        """Create the list under test and pin ``time.time`` to 0."""
        super(TestExpiringList, self).setUp()
        self.exping_list = ExpiringList(10)
        patcher_time = patch("time.time")
        self.addCleanup(patcher_time.stop)
        self.mock_time = patcher_time.start()
        self.mock_time.return_value = 0

    def _append_all(self, values):
        """Append each of ``values`` to the list under test, in order."""
        # An explicit loop is used instead of map(): on Python 3 map() is
        # lazy, so map(self.exping_list.append, ...) would append nothing.
        for value in values:
            self.exping_list.append(value)

    def test_not_expired(self):
        """A value appended at the current time is visible in every way."""
        self.exping_list.append("value")
        self.assertIn("value", self.exping_list)
        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value", self.exping_list[0])

    def test_expired(self):
        """A value is gone once the clock has moved far enough forward."""
        self.exping_list.append("value")
        self.mock_time.return_value = 60 * 10 + 1
        self.assertNotIn("value", self.exping_list)
        self.assertEqual(0, len(self.exping_list))

    def test_deletion(self):
        """Deleting by index removes that entry and keeps the rest."""
        self.exping_list.append("value 1")
        self.exping_list.append("value 2")
        del self.exping_list[0]
        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value 2", self.exping_list[0])

    def test_stringify(self):
        """str() of the list looks like str() of a plain list of its values."""
        self.exping_list.append("value")
        self.assertEqual("['value']", str(self.exping_list))

    def test_multiple_entries(self):
        """Each batch of entries disappears once the clock passes its expiry."""
        self._append_all(range(1, 101))
        self.assertEqual(100, len(self.exping_list))

        # By now the first batch has expired, so only the second is counted
        self.mock_time.return_value = 30
        self._append_all(range(101, 201))
        self.assertEqual(100, len(self.exping_list))

        # Both batches have expired; only the new value remains
        self.mock_time.return_value = 60
        self.exping_list.append("value")
        self.assertIn("value", self.exping_list)
        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value", self.exping_list[0])
class AgentRpcMessenger(object):
    """
    This class consumes AgentRunnerPluginRxQueue, sends
    messages to AgentTxQueue, and maintains state for
    actions in progress.
    """

    # The name of the device plugin on the agent with which
    # this module will communicate
    PLUGIN_NAME = "action_runner"

    # Tag included in the AgentException message so that callers can tell the failure was
    # because the server could not be contacted.
    # TODO: replace this with a dedicated exception type.
    COULD_NOT_CONTACT_TAG = "Could not contact server"

    # If no action_runner session is present when trying to run
    # an action, wait this long for one to show up
    SESSION_WAIT_TIMEOUT = 30

    def __init__(self):
        """Initialise empty session/RPC bookkeeping and a purged receive queue."""
        super(AgentRpcMessenger, self).__init__()

        # Taken by the methods that read or modify the session bookkeeping
        self._lock = threading.Lock()

        # FQDN -> current session ID
        self._sessions = {}

        # session ID -> {RPC ID: RPC awaiting a response}
        self._session_rpcs = defaultdict(dict)

        # IDs of RPCs that were cancelled, so that a late response to one of
        # them can be recognised (presumably retained for 10 minutes; confirm
        # against ExpiringList)
        self._cancelled_rpcs = ExpiringList(10 * 60)

        # purge() presumably discards messages queued before start-up;
        # confirm against AgentRxQueue
        rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
        rx_queue.purge()
        self._action_runner_rx_queue = rx_queue

    def run(self):
        """
        Ask for existing sessions of this plugin to be reset, then serve the
        receive queue with on_rx as the session callback.
        """
        plugin_name = AgentRpcMessenger.PLUGIN_NAME
        try:
            HttpAgentRpc().reset_plugin_sessions(plugin_name)
        except RpcTimeout:
            # Taken to mean that the http_agent service is not running.  That
            # is acceptable: the aim was for no sessions to exist, and none do.
            log.warning("Unable to reset %s sessions" % plugin_name)

        rx_queue = self._action_runner_rx_queue
        rx_queue.serve(session_callback=self.on_rx)
        log.info("AgentRpcMessenger.complete")

    def stop(self):
        """Log the request and call stop() on the receive queue."""
        log.info("AgentRpcMessenger.stop")
        rx_queue = self._action_runner_rx_queue
        rx_queue.stop()

    def complete_all(self):
        """
        Fail every tracked RPC that has not completed yet: record a shutdown
        message as its exception and set its completion event.
        """
        log.info("AgentRpcMessenger.complete_all")
        for session_rpcs in self._session_rpcs.values():
            for pending in session_rpcs.values():
                log.info("AgentRpcMessenger.complete_all: erroring %s" %
                         pending.id)
                if pending.complete.is_set():
                    # Already finished; leave its outcome untouched
                    continue
                pending.exception = "Cancelled due to service shutdown"
                pending.complete.set()

    def remove(self, fqdn):
        with self._lock:
            try:
                del self._sessions[fqdn]
            except KeyError:
                pass

    def _abort_session(self,
                       fqdn,
                       message,
                       old_session_id,
                       new_session_id=None):
        """
        Retire ``old_session_id`` for ``fqdn`` and deal with its outstanding RPCs.

        This method does not take self._lock itself; its callers in on_rx
        already hold it.

        :param fqdn: host whose session is being aborted
        :param message: reason for the abort, used in log and error text
        :param old_session_id: session whose outstanding RPCs are handled
        :param new_session_id: replacement session.  When it is not None it is
            recorded as the session for ``fqdn`` and every outstanding RPC is
            re-issued on it.  When it is None the entry for ``fqdn`` is
            dropped and every outstanding RPC is completed with an error.
        """
        log.warning("AgentRpcMessenger.on_rx: aborting session %s because %s" %
                    (old_session_id, message))

        # One test decides both the session update and the fate of the RPCs,
        # so a falsy-but-not-None session ID is treated the same in both.
        replaced = new_session_id is not None

        # Detach the old session's RPCs before re-issuing anything.  _resend
        # files each RPC under its new session ID; if that equals the old ID,
        # removing the old entry afterwards would discard the re-issued RPCs.
        old_rpcs = self._session_rpcs.pop(old_session_id, {})

        if replaced:
            self._sessions[fqdn] = new_session_id
        else:
            self._sessions.pop(fqdn, None)

        for rpc in old_rpcs.values():
            if replaced:
                log.warning(
                    "AgentRpcMessenger.on_rx: re-issuing RPC %s for session %s (was %s) because %s"
                    % (rpc.id, new_session_id, old_session_id, message))
                rpc.session_id = new_session_id
                self._resend(rpc)
            else:
                rpc.exception = "Communications error with %s because %s" % (
                    fqdn, message)
                rpc.complete.set()

    def get_session_id(self, fqdn):
        with self._lock:
            try:
                return self._sessions[fqdn]
            except KeyError:
                return None

    def await_restart(self, fqdn, timeout, old_session_id=None):
        """
        If there is currently an action_runner session, wait for a different one.  Else
        wait for any action_runner session to start.

        :param fqdn: host whose session is being awaited
        :param timeout: how long to keep polling before giving up; compared
            against the accumulated poll periods (1.0 per poll)
        :param old_session_id: session ID to treat as the old one; when None,
            the session currently recorded for ``fqdn`` (if any) is used
        :return: True once a session other than the old one is recorded for
            ``fqdn``, False if ``timeout`` is reached first
        """

        if old_session_id is None:
            old_session_id = self.get_session_id(fqdn)

        log.info("AgentRpcMessenger.await_restart: awaiting %s (old %s)" %
                 (fqdn, old_session_id))

        # Note: using polling here for simplicity, if efficiency became an issue here
        # we could set up events to be triggered by the new session logic in on_rx, and
        # sleep on them instead of polling.

        duration = 0
        poll_period = 1.0
        while True:
            current_session_id = self.get_session_id(fqdn)

            if current_session_id is not None and current_session_id != old_session_id:
                log.info("AgentRpcMessenger.await_restart: %s new %s" %
                         (fqdn, current_session_id))
                return True

            if duration >= timeout:
                log.info(
                    "AgentRpcMessenger.await_restart: %s timeout after %ss" %
                    (fqdn, duration))
                # Give up here: without leaving the loop the method would
                # keep polling (and logging) forever after the timeout.
                return False

            duration += poll_period
            time.sleep(poll_period)

    def on_rx(self, message):
        """
        Handle one message from the receive queue, holding self._lock throughout.

        Messages of type SESSION_CREATE, SESSION_TERMINATE and
        SESSION_TERMINATE_ALL update the session bookkeeping.  Any other
        message is treated as an RPC response carried in ``message["body"]``;
        only responses of type ACTION_COMPLETE are accepted.

        :param message: dict with "session_id", "fqdn" and "type" keys, plus
            "body" for RPC responses
        """
        with self._lock:
            log.debug("on_rx: %s" % message)
            session_id = message["session_id"]
            fqdn = message["fqdn"]
            log.info("AgentRpcMessenger.on_rx: %s/%s" % (fqdn, session_id))

            message_type = message["type"]

            if message_type == "SESSION_CREATE":
                if fqdn in self._sessions:
                    # Replace the old session and re-issue its RPCs on the new one
                    self._abort_session(fqdn, "new session created",
                                        self._sessions[fqdn], session_id)
                else:
                    self._sessions[fqdn] = session_id
                return

            if message_type == "SESSION_TERMINATE":
                # An agent has timed out or restarted, we're being told its session is now dead
                # NOTE(review): this aborts the session ID named in the message,
                # which is not checked against the one recorded for fqdn;
                # confirm the two always match.
                if fqdn in self._sessions:
                    self._abort_session(fqdn, "session terminated", session_id)
                return

            if message_type == "SESSION_TERMINATE_ALL":
                # The http_agent service has restarted, all sessions are now over.
                # Iterate over a snapshot: _abort_session removes entries from
                # self._sessions, and changing a dict while iterating its
                # items raises RuntimeError on Python 3.
                for host, host_session_id in list(self._sessions.items()):
                    self._abort_session(host, "all sessions terminated",
                                        host_session_id)
                return

            rpc_response = message["body"]
            if rpc_response["type"] != "ACTION_COMPLETE":
                log.error("Unexpected type '%s'" % rpc_response["type"])
                return

            if fqdn not in self._sessions:
                log.info("AgentRpcMessenger.on_rx: unknown session %s/%s" %
                         (fqdn, session_id))
                # A session I never heard of?
                HttpAgentRpc().reset_session(fqdn,
                                             AgentRpcMessenger.PLUGIN_NAME,
                                             session_id)
                return

            if self._sessions[fqdn] != session_id:
                # The response belongs to a session other than the recorded
                # one: abort the recorded session and ask for the sender's
                # session to be reset.
                log.info(
                    "AgentRpcMessenger.on_rx: cancelling session %s/%s (replaced by %s)"
                    % (fqdn, self._sessions[fqdn], session_id))
                self._abort_session(fqdn, "session cancelled",
                                    self._sessions[fqdn])
                HttpAgentRpc().reset_session(fqdn,
                                             AgentRpcMessenger.PLUGIN_NAME,
                                             session_id)
                return

            log.info("AgentRpcMessenger.on_rx: good session %s/%s" %
                     (fqdn, session_id))

            # Find this RPC, stop tracking it and complete it
            rpc_id = rpc_response["id"]
            try:
                rpc = self._session_rpcs[session_id].pop(rpc_id)
            except KeyError:
                if rpc_id in self._cancelled_rpcs:
                    log.debug(
                        "Response received from a cancelled RPC (id: %s)",
                        rpc_id)
                else:
                    log.error(
                        "Response received from UNKNOWN RPC of (id: %s)",
                        rpc_id)
                return

            rpc.exception = rpc_response["exception"]
            rpc.result = rpc_response["result"]
            rpc.subprocesses = rpc_response["subprocesses"]
            log.info("AgentRpcMessenger.on_rx: completing rpc %s" % rpc.id)
            rpc.complete.set()

    def _resend(self, rpc):
        """File ``rpc`` under its current session ID and queue its request for sending."""
        log.debug("AgentRpcMessenger._resend: rpc %s in session %s" %
                  (rpc.id, rpc.session_id))
        session_rpcs = self._session_rpcs[rpc.session_id]
        session_rpcs[rpc.id] = rpc
        request = rpc.get_request()
        AgentTxQueue().put(request)

    def _send_request(self, fqdn, action, args):
        """
        Create an RPC for ``action`` on ``fqdn``, track it and queue it for sending.

        Waits up to SESSION_WAIT_TIMEOUT for a session for ``fqdn`` first.

        :param fqdn: host on which the action is to run
        :param action: action to run
        :param args: arguments for the action
        :return: the ActionInFlight that was queued
        :raises AgentException: when no session for ``fqdn`` is available;
            the message starts with COULD_NOT_CONTACT_TAG
        """
        wait_timeout = AgentRpcMessenger.SESSION_WAIT_TIMEOUT

        if not self.await_session(fqdn, wait_timeout):
            # Report the time actually waited (the full timeout)
            log.error("No %s session for %s after %s seconds" %
                      (AgentRpcMessenger.PLUGIN_NAME, fqdn, wait_timeout))
            raise AgentException(
                fqdn,
                action,
                args,
                "%s %s no session after %s seconds" %
                (self.COULD_NOT_CONTACT_TAG, fqdn, wait_timeout),
            )

        with self._lock:
            try:
                session_id = self._sessions[fqdn]
            except KeyError:
                # This could happen in spite of the earlier check, as that was outside the lock.
                log.warning("AgentRpcMessenger._send: no session for %s" %
                            fqdn)
                raise AgentException(
                    fqdn, action, args,
                    "%s %s" % (self.COULD_NOT_CONTACT_TAG, fqdn))

            log.debug("AgentRpcMessenger._send: using session %s" % session_id)

            rpc = ActionInFlight(session_id, fqdn, action, args)

            # Track the RPC before queueing it so on_rx can find it by ID
            self._session_rpcs[session_id][rpc.id] = rpc
            AgentTxQueue().put(rpc.get_request())
            return rpc

    def _send_cancellation(self, rpc):
        """
        Queue a cancellation for ``rpc`` and stop tracking it, provided it is
        still tracked under its session; otherwise only log a warning.
        """
        with self._lock:
            session_rpcs = self._session_rpcs[rpc.session_id]
            if rpc.id not in session_rpcs:
                log.warning(
                    "Dropping cancellation of RPC %s, it is already complete or aborted"
                    % rpc.id)
                return

            log.warning("Cancelling RPC %s" % rpc.id)
            AgentTxQueue().put(rpc.get_cancellation())
            del session_rpcs[rpc.id]

    def _complete(self, rpc, cancel_event):
        """
        Block until ``rpc`` completes or ``cancel_event`` is set.

        :param rpc: the RPC to wait for
        :param cancel_event: event that requests cancellation when set
        :return: ``rpc.result`` when the RPC completed without an exception
        :raises AgentCancellation: ``cancel_event`` was set; a cancellation is
            sent and the RPC ID is added to self._cancelled_rpcs first
        :raises AgentException: the RPC completed with ``rpc.exception`` set
        """
        log.info("AgentRpcMessenger._complete: starting wait for rpc %s" %
                 rpc.id)

        # Wait on rpc.complete in short slices so that cancel_event is
        # re-checked between waits
        finished = False
        while not finished:
            if cancel_event.is_set():
                self._send_cancellation(rpc)
                self._cancelled_rpcs.append(rpc.id)
                raise AgentCancellation()

            rpc.complete.wait(timeout=1.0)
            finished = rpc.complete.is_set()

        log.info("AgentRpcMessenger._complete: completed wait for rpc %s" %
                 rpc.id)
        if not rpc.exception:
            return rpc.result

        raise AgentException(rpc.fqdn,
                             rpc.action,
                             rpc.args,
                             rpc.exception,
                             subprocesses=rpc.subprocesses)

    def call(self, fqdn, action, args, cancel_event):
        """
        Send ``action`` to ``fqdn`` and wait for it to finish.

        :return: a ``(result, rpc)`` tuple
        """
        log.debug("AgentRpcMessenger.call: %s %s" % (fqdn, action))
        rpc = self._send_request(fqdn, action, args)
        result = self._complete(rpc, cancel_event)
        return result, rpc

    def await_session(self, fqdn, timeout):
        """
        Wait for the agent to connect back to the manager and hence be ready to accept commands
        :param fqdn: fqdn of the agent we are waiting for
        :param timeout: how long to wait before quiting.
        :return: timeout remaining 0=failed, !0 is pass and useful for debug.
        """
        while self.get_session_id(fqdn) == None and timeout > 0:
            # Allow a short wait for a session to show up, for example
            # when running setup actions on a host we've just added its
            # session may not yet have been fully established
            log.info(
                "AgentRpcMessenger._send: no session yet for %s, %s seconds remain"
                % (fqdn, timeout))
            timeout -= 1
            time.sleep(1)

        return timeout