class TestAggregateStat(TestCase): def setUp(self): super(TestAggregateStat, self).setUp() self.stat = AggregateStat("foo", "ms") def test_string_no_data(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.return_value = 1234 self.stat.reset() self.assertEqual( str(self.stat), "foo: 0 in 0.0s (0.000/s) " "min=0.000ms mean=0.000ms max=0.000ms") def test_string_with_data_zero_time(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.return_value = 1234 self.stat.reset() self.stat.store_reading(123) self.stat.store_reading(124) self.assertEqual( str(self.stat), "foo: 2 in 0.0s (0.000/s) " "min=123.000ms mean=123.500ms max=124.000ms") def test_string_with_data_and_time(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.side_effect = iter([1234, 1235, 1235]) self.stat.reset() self.stat.store_reading(123) self.stat.store_reading(124) self.assertEqual( str(self.stat), "foo: 2 in 1.0s (2.000/s) " "min=123.000ms mean=123.500ms max=124.000ms")
class TestAggregateStat(TestCase): def setUp(self): super(TestAggregateStat, self).setUp() self.stat = AggregateStat("foo", "ms") def test_string_no_data(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.return_value = 1234 self.stat.reset() self.assertEqual( str(self.stat), "foo: 0 in 0.0s (0.000/s) " "min=0.000ms mean=0.000ms max=0.000ms" ) def test_string_with_data_zero_time(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.return_value = 1234 self.stat.reset() self.stat.store_reading(123) self.stat.store_reading(124) self.assertEqual( str(self.stat), "foo: 2 in 0.0s (0.000/s) " "min=123.000ms mean=123.500ms max=124.000ms" ) def test_string_with_data_and_time(self): with patch("calico.stats.monotonic_time", autospec=True) as m_time: m_time.side_effect = iter([1234, 1235, 1235]) self.stat.reset() self.stat.store_reading(123) self.stat.store_reading(124) self.assertEqual( str(self.stat), "foo: 2 in 1.0s (2.000/s) " "min=123.000ms mean=123.500ms max=124.000ms" )
def watch_etcd(self, next_index, event_queue, stop_event): """ Thread: etcd watcher thread. Watches etcd for changes and sends them over the queue to the resync thread, which owns the socket to Felix. Dies if it receives an error from etcd. Note: it is important that we pass the index, queue and event as parameters to ensure that each watcher thread only touches the versions of those values that were created for it as opposed to a later-created watcher thread. :param int next_index: The etcd index to start watching from. :param Queue event_queue: Queue of updates back to the resync thread. :param Event stop_event: Event used to stop this thread when it is no longer needed. """ _log.info("Watcher thread started with next index %s", next_index) last_log_time = monotonic_time() req_end_time = None non_req_time_stat = AggregateStat("processing time", "ms") etcd_response_time = None etcd_response_time_stat = AggregateStat("etcd response time", "ms") stats = [etcd_response_time_stat, non_req_time_stat] http = None try: while not self._stop_event.is_set() and not stop_event.is_set(): if not http: _log.info("No HTTP pool, creating one...") http = self.get_etcd_connection() req_start_time = monotonic_time() if req_end_time is not None: # Calculate the time since the end of the previous request, # i.e. the time we spent processing the response. Note: # start and end are flipped because we just read the start # time but we have the end time from the last loop. non_req_time = req_start_time - req_end_time non_req_time_stat.store_reading(non_req_time * 1000) _log.debug("Waiting on etcd index %s", next_index) try: try: resp = self._etcd_request(http, VERSION_DIR, recursive=True, wait_index=next_index, timeout=90) finally: # Make sure the time is available to both exception and # mainline code paths. req_end_time = monotonic_time() etcd_response_time = req_end_time - req_start_time if resp.status != 200: _log.warning("etcd watch returned bad HTTP status to" "poll on index %s: %s", next_index, resp.status) self._check_cluster_id(resp) resp_body = resp.data # Force read inside try block. except urllib3.exceptions.ReadTimeoutError: # 100% expected when there are no events. _log.debug("Watch read timed out, restarting watch at " "index %s", next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: # Not so expected but still possible to recover: etcd # being restarted, for example. assert etcd_response_time is not None if etcd_response_time < REQ_TIGHT_LOOP_THRESH: # Failed fast, make sure we don't tight loop. delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s after " "delay %.3f", e, next_index, delay) time.sleep(delay) else: _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s " "immediately", e, next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue # If we get to this point, we've got an etcd response to # process; try to parse it. try: etcd_resp = json.loads(resp_body) if "errorCode" in etcd_resp: _log.error("Error from etcd for index %s: %s; " "triggering a resync.", next_index, etcd_resp) break node = etcd_resp["node"] key = node["key"] action = ACTION_MAPPING[etcd_resp["action"]] is_dir = node.get("dir", False) value = node.get("value") dir_creation = False if is_dir: if action == "delete": if key.rstrip("/") in (VERSION_DIR, ROOT_DIR): # Special case: if the whole keyspace is # deleted, that implies the ready flag is gone # too. Break out of the loop to trigger a # resync. This avoids queuing up a bunch of # events that would be discarded by the # resync thread. _log.warning("Whole %s deleted, resyncing", VERSION_DIR) break else: # Just ignore sets to directories, we only track # leaves. _log.debug("Skipping non-delete to dir %s", key) dir_creation = True modified_index = node["modifiedIndex"] except (KeyError, TypeError, ValueError): _log.exception("Unexpected format for etcd response to" "index %s: %r; triggering a resync.", next_index, resp_body) break else: # We successfully parsed the response, hand it off to the # resync thread. Now we know that we got a response, # we record that in the stat. etcd_response_time_stat.store_reading(etcd_response_time * 1000) if not dir_creation: # The resync thread doesn't need to know about # directory creations so we skip them. (It does need # to know about deletions in order to clean up # sub-keys.) event_queue.put((modified_index, key, value)) next_index = modified_index + 1 # Opportunistically log stats. now = monotonic_time() if now - last_log_time > STATS_LOG_INTERVAL: for stat in stats: _log.info("STAT: Watcher %s", stat) stat.reset() _log.info("STAT: Watcher queue length: %s", event_queue.qsize()) last_log_time = now except DriverShutdown: _log.warning("Watcher thread stopping due to driver shutdown.") except: _log.exception("Exception finishing watcher thread.") raise finally: # Signal to the resync thread that we've exited. event_queue.put(None) # Make sure we get some stats output from the watcher. for stat in stats: _log.info("STAT: Final watcher %s", stat) _log.info("Watcher thread finished. Signalled to resync thread. " "Was at index %s. Queue length is %s.", next_index, event_queue.qsize())
def watch_etcd(self, next_index, event_queue, stop_event): """ Thread: etcd watcher thread. Watches etcd for changes and sends them over the queue to the resync thread, which owns the socket to Felix. Dies if it receives an error from etcd. Note: it is important that we pass the index, queue and event as parameters to ensure that each watcher thread only touches the versions of those values that were created for it as opposed to a later-created watcher thread. :param int next_index: The etcd index to start watching from. :param Queue event_queue: Queue of updates back to the resync thread. :param Event stop_event: Event used to stop this thread when it is no longer needed. """ _log.info("Watcher thread started with next index %s", next_index) last_log_time = monotonic_time() req_end_time = None non_req_time_stat = AggregateStat("processing time", "ms") etcd_response_time = None etcd_response_time_stat = AggregateStat("etcd response time", "ms") stats = [etcd_response_time_stat, non_req_time_stat] http = None try: while not self._stop_event.is_set() and not stop_event.is_set(): if not http: _log.info("No HTTP pool, creating one...") http = self.get_etcd_connection() req_start_time = monotonic_time() if req_end_time is not None: # Calculate the time since the end of the previous request, # i.e. the time we spent processing the response. Note: # start and end are flipped because we just read the start # time but we have the end time from the last loop. non_req_time = req_start_time - req_end_time non_req_time_stat.store_reading(non_req_time * 1000) _log.debug("Waiting on etcd index %s", next_index) try: try: resp = self._etcd_request(http, VERSION_DIR, recursive=True, wait_index=next_index, timeout=90) finally: # Make sure the time is available to both exception and # mainline code paths. req_end_time = monotonic_time() etcd_response_time = req_end_time - req_start_time if resp.status != 200: _log.warning("etcd watch returned bad HTTP status to" "poll on index %s: %s", next_index, resp.status) self._check_cluster_id(resp) resp_body = resp.data # Force read inside try block. except urllib3.exceptions.ReadTimeoutError: # 100% expected when there are no events. _log.debug("Watch read timed out, restarting watch at " "index %s", next_index) # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue except (urllib3.exceptions.HTTPError, httplib.HTTPException, socket.error) as e: # Not so expected but still possible to recover: etcd # being restarted, for example. assert etcd_response_time is not None if etcd_response_time < REQ_TIGHT_LOOP_THRESH: # Failed fast, make sure we don't tight loop. delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s after " "delay %.3f", e, next_index, delay) time.sleep(delay) else: _log.warning("Connection to etcd failed with %r, " "restarting watch at index %s " "immediately", e, next_index) # If available, connect to a different etcd URL in case # only the previous one has failed. self._rotate_etcd_url() # Workaround urllib3 bug #718. After a ReadTimeout, the # connection is incorrectly recycled. http = None continue # If we get to this point, we've got an etcd response to # process; try to parse it. try: etcd_resp = json.loads(resp_body) if "errorCode" in etcd_resp: _log.error("Error from etcd for index %s: %s; " "triggering a resync.", next_index, etcd_resp) break node = etcd_resp["node"] key = node["key"] action = ACTION_MAPPING[etcd_resp["action"]] is_dir = node.get("dir", False) value = node.get("value") dir_creation = False if is_dir: if action == "delete": if key.rstrip("/") in (VERSION_DIR, ROOT_DIR): # Special case: if the whole keyspace is # deleted, that implies the ready flag is gone # too. Break out of the loop to trigger a # resync. This avoids queuing up a bunch of # events that would be discarded by the # resync thread. _log.warning("Whole %s deleted, resyncing", VERSION_DIR) break else: # Just ignore sets to directories, we only track # leaves. _log.debug("Skipping non-delete to dir %s", key) dir_creation = True modified_index = node["modifiedIndex"] except (KeyError, TypeError, ValueError): _log.exception("Unexpected format for etcd response to" "index %s: %r; triggering a resync.", next_index, resp_body) break else: # We successfully parsed the response, hand it off to the # resync thread. Now we know that we got a response, # we record that in the stat. etcd_response_time_stat.store_reading(etcd_response_time * 1000) if not dir_creation: # The resync thread doesn't need to know about # directory creations so we skip them. (It does need # to know about deletions in order to clean up # sub-keys.) event_queue.put((modified_index, key, value)) next_index = modified_index + 1 # Opportunistically log stats. now = monotonic_time() if now - last_log_time > STATS_LOG_INTERVAL: for stat in stats: _log.info("STAT: Watcher %s", stat) stat.reset() _log.info("STAT: Watcher queue length: %s", event_queue.qsize()) last_log_time = now except DriverShutdown: _log.warning("Watcher thread stopping due to driver shutdown.") except: _log.exception("Exception finishing watcher thread.") raise finally: # Signal to the resync thread that we've exited. event_queue.put(None) # Make sure we get some stats output from the watcher. for stat in stats: _log.info("STAT: Final watcher %s", stat) _log.info("Watcher thread finished. Signalled to resync thread. " "Was at index %s. Queue length is %s.", next_index, event_queue.qsize())
def setUp(self): super(TestAggregateStat, self).setUp() self.stat = AggregateStat("foo", "ms")