def setUp(self):
    """Build a fresh ExpiringList(10) and freeze ``time.time`` at 0.

    The ``time.time`` patch is registered for cleanup before it is started,
    and the started mock is kept on ``self.mock_time`` so individual tests
    can move the clock by changing its ``return_value``.
    """
    super(TestExpiringList, self).setUp()

    self.exping_list = ExpiringList(10)

    time_patcher = patch("time.time")
    self.addCleanup(time_patcher.stop)

    clock = time_patcher.start()
    clock.return_value = 0
    self.mock_time = clock
def __init__(self):
    """Set up the session/RPC bookkeeping and purge the receive queue."""
    super(AgentRpcMessenger, self).__init__()

    # Guards the session bookkeeping below.
    self._lock = threading.Lock()

    # FQDN to session
    self._sessions = {}

    # session to list of RPC IDs
    self._session_rpcs = defaultdict(dict)

    # IDs of cancelled RPCs; the ExpiringList argument is 10 * 60
    # (presumably a lifetime in seconds -- confirm against ExpiringList).
    self._cancelled_rpcs = ExpiringList(10 * 60)

    # Receive queue for the action_runner plugin, purged on construction.
    self._action_runner_rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
    self._action_runner_rx_queue.purge()
class TestExpiringList(ImlUnitTestCase):
    """Unit tests for ExpiringList, run against a mocked ``time.time``."""

    def setUp(self):
        """Create an ExpiringList(10) and pin ``time.time`` to 0."""
        super(TestExpiringList, self).setUp()

        self.exping_list = ExpiringList(10)

        patcher_time = patch("time.time")
        self.addCleanup(patcher_time.stop)
        self.mock_time = patcher_time.start()
        self.mock_time.return_value = 0

    def test_not_expired(self):
        """A value appended at the current time is visible in the list."""
        self.exping_list.append("value")

        self.assertIn("value", self.exping_list)
        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value", self.exping_list[0])

    def test_expired(self):
        """A value is no longer visible once the clock has moved far ahead."""
        self.exping_list.append("value")
        self.mock_time.return_value = 60 * 10 + 1

        self.assertNotIn("value", self.exping_list)
        self.assertEqual(0, len(self.exping_list))

    def test_deletion(self):
        """Deleting by index removes only that entry."""
        self.exping_list.append("value 1")
        self.exping_list.append("value 2")

        del self.exping_list[0]

        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value 2", self.exping_list[0])

    def test_stringify(self):
        """str() of the list matches the str() of a plain list of its values."""
        self.exping_list.append("value")

        self.assertEqual("['value']", str(self.exping_list))

    def test_multiple_entries(self):
        """Batches appended at different times expire independently."""
        # Explicit loops rather than map(): map() is lazy on Python 3, so
        # using it for its side effects would append nothing at all.
        for value in range(1, 101):
            self.exping_list.append(value)
        self.assertEqual(100, len(self.exping_list))

        self.mock_time.return_value = 30
        for value in range(101, 201):
            self.exping_list.append(value)
        self.assertEqual(100, len(self.exping_list))

        self.mock_time.return_value = 60
        self.exping_list.append("value")

        self.assertIn("value", self.exping_list)
        self.assertEqual(1, len(self.exping_list))
        self.assertEqual("value", self.exping_list[0])
class AgentRpcMessenger(object):
    """
    This class consumes the AgentRxQueue of the action_runner plugin, sends
    messages to AgentTxQueue, and maintains state for actions in progress.
    """

    # The name of the device plugin on the agent with which
    # this module will communicate
    PLUGIN_NAME = "action_runner"

    # A bit rubbish, but a tag so callers can know the failure was because the server could not be contacted.
    # Improve with a different Exception one day.
    COULD_NOT_CONTACT_TAG = "Could not contact server"

    # If no action_runner session is present when trying to run
    # an action, wait this long for one to show up
    SESSION_WAIT_TIMEOUT = 30

    def __init__(self):
        """Set up the session/RPC bookkeeping and purge the receive queue."""
        super(AgentRpcMessenger, self).__init__()

        # session to list of RPC IDs
        self._session_rpcs = defaultdict(dict)

        # FQDN to session
        self._sessions = {}

        self._cancelled_rpcs = ExpiringList(10 * 60)

        self._action_runner_rx_queue = AgentRxQueue(AgentRpcMessenger.PLUGIN_NAME)
        self._action_runner_rx_queue.purge()

        self._lock = threading.Lock()

    def run(self):
        """Reset the plugin's sessions, then serve the receive queue via on_rx."""
        try:
            HttpAgentRpc().reset_plugin_sessions(AgentRpcMessenger.PLUGIN_NAME)
        except RpcTimeout:
            # Assume this means that the http_agent service isn't running: this
            # is acceptable, as our goal of there not being any sessions is
            # already the case.
            log.warning("Unable to reset %s sessions" % AgentRpcMessenger.PLUGIN_NAME)

        self._action_runner_rx_queue.serve(session_callback=self.on_rx)
        log.info("AgentRpcMessenger.complete")

    def stop(self):
        """Stop the receive queue."""
        log.info("AgentRpcMessenger.stop")
        self._action_runner_rx_queue.stop()

    def complete_all(self):
        """Mark every RPC that is still pending as failed due to shutdown."""
        log.info("AgentRpcMessenger.complete_all")
        for rpc_id_to_rpc in self._session_rpcs.values():
            for rpc_state in rpc_id_to_rpc.values():
                log.info("AgentRpcMessenger.complete_all: erroring %s" % rpc_state.id)
                if not rpc_state.complete.is_set():
                    rpc_state.exception = "Cancelled due to service shutdown"
                    rpc_state.complete.set()

    def remove(self, fqdn):
        """Forget the session recorded for ``fqdn``; a no-op if there is none."""
        with self._lock:
            self._sessions.pop(fqdn, None)

    def _abort_session(self, fqdn, message, old_session_id, new_session_id=None):
        """
        Tear down ``old_session_id`` and deal with the RPCs registered on it.

        :param fqdn: host whose session is being aborted
        :param message: reason, used in log lines and RPC error text
        :param old_session_id: the session being aborted
        :param new_session_id: replacement session, if any. When given, the
                               RPCs are re-issued on it; otherwise each RPC is
                               completed with a communications error.
        """
        log.warning("AgentRpcMessenger.on_rx: aborting session %s because %s" % (old_session_id, message))
        old_rpcs = self._session_rpcs[old_session_id]

        if new_session_id is not None:
            self._sessions[fqdn] = new_session_id
        else:
            self._sessions.pop(fqdn, None)

        for rpc in old_rpcs.values():
            if new_session_id:
                log.warning(
                    "AgentRpcMessenger.on_rx: re-issuing RPC %s for session %s (was %s) because %s"
                    % (rpc.id, new_session_id, old_session_id, message)
                )
                rpc.session_id = new_session_id
                self._resend(rpc)
            else:
                rpc.exception = "Communications error with %s because %s" % (fqdn, message)
                rpc.complete.set()

        del self._session_rpcs[old_session_id]

    def get_session_id(self, fqdn):
        """Return the session recorded for ``fqdn``, or None if there is none."""
        with self._lock:
            return self._sessions.get(fqdn)

    def await_restart(self, fqdn, timeout, old_session_id=None):
        """
        If there is currently an action_runner session, wait for a different one.
        Else wait for any action_runner session to start.

        :param fqdn: host to wait for
        :param timeout: maximum time to wait, in seconds
        :param old_session_id: session to wait to be replaced; defaults to the
                               session currently recorded for ``fqdn``
        :return: None, both when a new session appears and on timeout. Callers
                 that need to tell the two apart can compare get_session_id()
                 against the old session afterwards.
        """
        if old_session_id is None:
            old_session_id = self.get_session_id(fqdn)

        log.info("AgentRpcMessenger.await_restart: awaiting %s (old %s)" % (fqdn, old_session_id))

        # Note: using polling here for simplicity, if efficiency became an issue here
        # we could set up events to be triggered by the new session logic in on_rx, and
        # sleep on them instead of polling.
        duration = 0
        poll_period = 1.0
        while True:
            current_session_id = self.get_session_id(fqdn)
            if current_session_id is not None and current_session_id != old_session_id:
                log.info("AgentRpcMessenger.await_restart: %s new %s" % (fqdn, current_session_id))
                break

            if duration >= timeout:
                log.info("AgentRpcMessenger.await_restart: %s timeout after %ss" % (fqdn, duration))
                # Stop waiting: without this the loop would poll forever and
                # the timeout argument would have no effect.
                break

            duration += poll_period
            time.sleep(poll_period)

    def on_rx(self, message):
        """
        Handle one message from the receive queue while holding the lock.

        Session messages (SESSION_CREATE, SESSION_TERMINATE,
        SESSION_TERMINATE_ALL) update the FQDN-to-session map; anything else
        is treated as an RPC response and must be of type ACTION_COMPLETE.

        :param message: dict with "type", "fqdn" and "session_id" keys, plus
                        a "body" for RPC responses
        """
        with self._lock:
            log.debug("on_rx: %s" % message)
            session_id = message["session_id"]
            fqdn = message["fqdn"]
            log.info("AgentRpcMessenger.on_rx: %s/%s" % (fqdn, session_id))

            if message["type"] == "SESSION_CREATE":
                if fqdn in self._sessions:
                    old_session_id = self._sessions[fqdn]
                    self._abort_session(fqdn, "new session created", old_session_id, session_id)
                else:
                    self._sessions[fqdn] = session_id
            elif message["type"] == "SESSION_TERMINATE":
                # An agent has timed out or restarted, we're being told its session is now dead
                if message["fqdn"] in self._sessions:
                    self._abort_session(fqdn, "session terminated", message["session_id"])
            elif message["type"] == "SESSION_TERMINATE_ALL":
                # The http_agent service has restarted, all sessions are now over.
                # Iterate over a snapshot: _abort_session removes entries from
                # self._sessions, which must not happen while iterating it.
                for session_fqdn, session in list(self._sessions.items()):
                    self._abort_session(session_fqdn, "all sessions terminated", session)
            else:
                rpc_response = message["body"]
                if rpc_response["type"] != "ACTION_COMPLETE":
                    log.error("Unexpected type '%s'" % rpc_response["type"])
                    return

                if fqdn in self._sessions and self._sessions[fqdn] != session_id:
                    log.info(
                        "AgentRpcMessenger.on_rx: cancelling session %s/%s (replaced by %s)"
                        % (fqdn, self._sessions[fqdn], session_id)
                    )
                    self._abort_session(fqdn, "session cancelled", self._sessions[fqdn])
                    HttpAgentRpc().reset_session(fqdn, AgentRpcMessenger.PLUGIN_NAME, session_id)
                elif fqdn in self._sessions:
                    log.info("AgentRpcMessenger.on_rx: good session %s/%s" % (fqdn, session_id))
                    # Find this RPC and complete it
                    try:
                        rpc = self._session_rpcs[session_id][rpc_response["id"]]
                    except KeyError:
                        if rpc_response["id"] in self._cancelled_rpcs:
                            log.debug("Response received from a cancelled RPC (id: %s)", rpc_response["id"])
                        else:
                            log.error("Response received from UNKNOWN RPC of (id: %s)", rpc_response["id"])
                    else:
                        del self._session_rpcs[session_id][rpc_response["id"]]
                        rpc.exception = rpc_response["exception"]
                        rpc.result = rpc_response["result"]
                        rpc.subprocesses = rpc_response["subprocesses"]
                        log.info("AgentRpcMessenger.on_rx: completing rpc %s" % rpc.id)
                        rpc.complete.set()
                else:
                    log.info("AgentRpcMessenger.on_rx: unknown session %s/%s" % (fqdn, session_id))
                    # A session I never heard of?
                    HttpAgentRpc().reset_session(fqdn, AgentRpcMessenger.PLUGIN_NAME, session_id)

    def _resend(self, rpc):
        """Register ``rpc`` under its current session and queue its request again."""
        log.debug("AgentRpcMessenger._resend: rpc %s in session %s" % (rpc.id, rpc.session_id))
        self._session_rpcs[rpc.session_id][rpc.id] = rpc
        AgentTxQueue().put(rpc.get_request())

    def _send_request(self, fqdn, action, args):
        """
        Wait for a session to ``fqdn``, then register and queue a new RPC.

        :param fqdn: host to run the action on
        :param action: action name
        :param args: action arguments
        :return: the ActionInFlight tracking the request
        :raises AgentException: if no session shows up within
                                SESSION_WAIT_TIMEOUT, or if the session has
                                gone by the time the lock is taken
        """
        if not self.await_session(fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT):
            log.error(
                "No %s session for %s after %s seconds"
                % (AgentRpcMessenger.PLUGIN_NAME, fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT)
            )
            raise AgentException(
                fqdn,
                action,
                args,
                "%s %s no session after %s seconds"
                % (self.COULD_NOT_CONTACT_TAG, fqdn, AgentRpcMessenger.SESSION_WAIT_TIMEOUT),
            )

        with self._lock:
            try:
                session_id = self._sessions[fqdn]
            except KeyError:
                # This could happen in spite of the earlier check, as that was outside the lock.
                log.warning("AgentRpcMessenger._send: no session for %s" % fqdn)
                raise AgentException(fqdn, action, args, "%s %s" % (self.COULD_NOT_CONTACT_TAG, fqdn))

            log.debug("AgentRpcMessenger._send: using session %s" % session_id)

            rpc = ActionInFlight(session_id, fqdn, action, args)

            self._session_rpcs[session_id][rpc.id] = rpc
            AgentTxQueue().put(rpc.get_request())
            return rpc

    def _send_cancellation(self, rpc):
        """Queue a cancellation for ``rpc`` if it is still registered, then drop it."""
        with self._lock:
            try:
                self._session_rpcs[rpc.session_id][rpc.id]
            except KeyError:
                log.warning("Dropping cancellation of RPC %s, it is already complete or aborted" % rpc.id)
            else:
                log.warning("Cancelling RPC %s" % rpc.id)
                AgentTxQueue().put(rpc.get_cancellation())
                del self._session_rpcs[rpc.session_id][rpc.id]

    def _complete(self, rpc, cancel_event):
        """
        Block until ``rpc`` completes or ``cancel_event`` is set.

        :param rpc: the in-flight RPC to wait for
        :param cancel_event: event checked on each wake-up
        :return: the RPC's result
        :raises AgentCancellation: if ``cancel_event`` is set before completion
        :raises AgentException: if the RPC completed with an exception
        """
        log.info("AgentRpcMessenger._complete: starting wait for rpc %s" % rpc.id)
        # Wait for rpc.complete, waking up every second to
        # check cancel_event
        while True:
            if cancel_event.is_set():
                self._send_cancellation(rpc)
                self._cancelled_rpcs.append(rpc.id)
                raise AgentCancellation()
            else:
                rpc.complete.wait(timeout=1.0)
                if rpc.complete.is_set():
                    break

        log.info("AgentRpcMessenger._complete: completed wait for rpc %s" % rpc.id)

        if rpc.exception:
            raise AgentException(rpc.fqdn, rpc.action, rpc.args, rpc.exception, subprocesses=rpc.subprocesses)
        else:
            return rpc.result

    def call(self, fqdn, action, args, cancel_event):
        """
        Send ``action`` to ``fqdn`` and wait for it to finish.

        :return: a ``(result, rpc)`` tuple
        """
        log.debug("AgentRpcMessenger.call: %s %s" % (fqdn, action))
        rpc = self._send_request(fqdn, action, args)
        return self._complete(rpc, cancel_event), rpc

    def await_session(self, fqdn, timeout):
        """
        Wait for the agent to connect back to the manager and hence be ready to accept commands
        :param fqdn: fqdn of the agent we are waiting for
        :param timeout: how long to wait before quitting.
        :return: timeout remaining 0=failed, !0 is pass and useful for debug.
        """
        while self.get_session_id(fqdn) is None and timeout > 0:
            # Allow a short wait for a session to show up, for example
            # when running setup actions on a host we've just added its
            # session may not yet have been fully established
            log.info("AgentRpcMessenger._send: no session yet for %s, %s seconds remain" % (fqdn, timeout))
            timeout -= 1
            time.sleep(1)

        return timeout