Example #1
 def wrapped(*args, **kwargs):
     start_time = monotonic_time()
     rc = fn(*args, **kwargs)
     end_time = monotonic_time()
     time_taken = end_time - start_time
     assert_true(time_taken < time_limit, "Test took too long")
     return rc
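
This fragment is the inner wrapper of a timing decorator: it times the wrapped call with a monotonic clock and fails the test if the call ran too long. A minimal self-contained sketch of what the full decorator could look like, assuming monotonic_time() behaves like time.monotonic() and substituting a plain assert for the test helper assert_true (the outer function name here is hypothetical, not part of the original code):

    import time

    def fail_if_slower_than(time_limit):
        # Hypothetical decorator factory built around the fragment above.
        def decorator(fn):
            def wrapped(*args, **kwargs):
                start_time = time.monotonic()
                rc = fn(*args, **kwargs)
                time_taken = time.monotonic() - start_time
                assert time_taken < time_limit, "Test took too long"
                return rc
            return wrapped
        return decorator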
Example #2
 def test_mainline(self):
     a = monotonic_time()
     time.sleep(0.01)
     b = monotonic_time()
     self.assertTrue(b >= a + 0.01,
                     msg="Monotonic time did not increase as "
                     "expected: %s, %s" % (a, b))
Example #3
    def _on_update_from_driver(self, msg):
        """
        Called when the driver sends us a key/value pair update.

        After the initial handshake, the stream of events consists
        entirely of updates unless something happens to change the
        state of the driver.

        :param dict msg: The message received from the driver.
        """
        assert self.configured.is_set(), "Received update before config"
        # The driver starts polling immediately, make sure we block until
        # everyone else is ready to receive updates.
        self.begin_polling.wait()
        # Unpack the message.
        key = msg[MSG_KEY_KEY]
        value = msg[MSG_KEY_VALUE]
        _log.debug("Update from driver: %s -> %s", key, value)
        # Output some very coarse stats.
        self.read_count += 1
        if self.read_count % 1000 == 0:
            now = monotonic_time()
            delta = now - self.last_rate_log_time
            _log.info("Processed %s updates from driver "
                      "%.1f/s", self.read_count, 1000.0 / delta)
            self.last_rate_log_time = now
        # Wrap the update in an EtcdEvent object so we can dispatch it via the
        # PathDispatcher.
        n = EtcdEvent("set" if value is not None else "delete", key, value)
        self.dispatcher.handle_event(n)
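
This handler, and the IP set delta handler further down, share the same coarse throughput-logging idiom: count messages and, on every 1000th one, derive a rate from the monotonic gap since the last log line. A small illustrative helper that captures just that idiom (the class name is hypothetical, not part of the project):

    import logging
    import time

    _log = logging.getLogger(__name__)

    class ThroughputLogger(object):
        def __init__(self, name, every=1000):
            self.name = name
            self.every = every
            self.count = 0
            self.last_log_time = time.monotonic()

        def record(self):
            self.count += 1
            if self.count % self.every == 0:
                now = time.monotonic()
                delta = now - self.last_log_time
                rate = self.every / delta if delta else float("inf")
                _log.info("Processed %s %s, %.1f/s", self.count, self.name, rate)
                self.last_log_time = now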
Example #4
    def maybe_schedule(self, caller):
        """
        Schedule this Actor to run iff it is not already running/scheduled.

        :param str caller: A (debug) tag to pass to the greenlet to identify
               the caller
        """
        # This method uses very low-level gevent APIs for performance, it is
        # partially cribbed from gevent's Waiter class, which is a simple
        # application of the low-level APIs.

        _log.debug("Checking whether we need to schedule %s", self)
        # We use the lack of yield points in this code to ensure that no-one
        # changes self._scheduled between the next two lines.  Do not add
        # logging between the test and set of self._scheduled.
        if not self._scheduled:
            self._scheduled = True
            # Calculate the scheduling delay.  If this Actor hasn't been
            # scheduled for a long time, we'll schedule it straight away,
            # otherwise we back off to the batch_delay to encourage work to be
            # batched up when we're under load.
            now = monotonic_time()
            if self._last_scheduled is not None:
                time_since_last_schedule = now - self._last_scheduled
                delay = max(self.batch_delay - time_since_last_schedule,
                            MIN_DELAY)
            else:
                delay = MIN_DELAY
            # We can't switch directly to the Actor's greenlet because that
            # prevents gevent from doing its scheduling.  Instead, we ask the
            # gevent event loop to switch to the greenlet.
            t = self._gevent_loop.timer(delay)
            t.start(self._switch, caller)
            self._last_scheduled = now
            _log.debug("Scheduled %s", self)
Example #5
    def __init__(self, config, hosts_ipset):
        super(EtcdAPI, self).__init__(config.ETCD_ADDRS,
                                      etcd_scheme=config.ETCD_SCHEME,
                                      etcd_key=config.ETCD_KEY_FILE,
                                      etcd_cert=config.ETCD_CERT_FILE,
                                      etcd_ca=config.ETCD_CA_FILE)
        self._config = config

        # Timestamp storing when the EtcdAPI started. This info is needed
        # in order to report uptime to etcd.
        self._start_time = monotonic_time()

        # Create an Actor to report per-endpoint status into etcd.
        self.status_reporter = EtcdStatusReporter(config)

        # Start up the main etcd-watching greenlet.  It will wait for an
        # event from us before doing anything.
        self._watcher = _FelixEtcdWatcher(config, self, self.status_reporter,
                                          hosts_ipset)
        self._watcher.link(self._on_worker_died)
        self._watcher.start()

        # Start up a greenlet to trigger periodic resyncs.
        self._resync_greenlet = gevent.spawn(self._periodically_resync)
        self._resync_greenlet.link_exception(self._on_worker_died)

        # Start up greenlet to report felix's liveness into etcd.
        self.done_first_status_report = False
        self._status_reporting_greenlet = gevent.spawn(
            self._periodically_report_status)
        self._status_reporting_greenlet.link_exception(self._on_worker_died)

        # Start the status reporter.
        self.status_reporter.start()
        self.status_reporter.greenlet.link(self._on_worker_died)
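
A recurring pattern in this constructor is supervision: every worker greenlet is linked to _on_worker_died so that an unexpected exit can be noticed and handled. A minimal, self-contained gevent sketch of that pattern (the worker function here is made up for illustration):

    import gevent

    def _on_worker_died(greenlet):
        print("Worker %s died, shutting down" % greenlet)

    def flaky_worker():
        gevent.sleep(0.1)
        raise RuntimeError("boom")

    g = gevent.spawn(flaky_worker)
    g.link_exception(_on_worker_died)  # callback fires only if the greenlet raises
    gevent.sleep(0.2)                  # give the worker time to fail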
Example #6
    def _update_felix_status(self, ttl):
        """
        Writes two keys to etcd:

        * uptime in secs
        * felix status in JSON - containing current time in ISO 8601 Zulu
          format

        :param int ttl: time to live in seconds - lifetime of the status report
        """
        time_formatted = iso_utc_timestamp()
        uptime = monotonic_time() - self._start_time
        status = {
            "time": time_formatted,
            "uptime": uptime,
            "first_update": not self.done_first_status_report,
        }

        status_value = json.dumps(status)

        _log.debug("Reporting felix status/uptime (%.1fs) using hostname %s",
                   uptime, self._config.HOSTNAME)
        status_key = key_for_last_status(self._config.HOSTNAME)
        self.client.set(status_key, status_value)
        status_key = key_for_status(self._config.HOSTNAME)
        self.client.set(status_key, status_value, ttl=ttl)
        self.done_first_status_report = True
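
The status report pairs a wall-clock timestamp (human-readable, comparable across hosts) with a monotonic uptime (immune to clock steps). A sketch of just the payload construction, assuming iso_utc_timestamp() produces an ISO 8601 UTC string; the original helper names are not redefined here:

    import json
    import time
    from datetime import datetime, timezone

    _start_time = time.monotonic()  # captured once at startup

    def build_status(first_update):
        status = {
            "time": datetime.now(timezone.utc).isoformat(),  # illustrative ISO 8601 UTC
            "uptime": time.monotonic() - _start_time,        # seconds since start
            "first_update": first_update,
        }
        return json.dumps(status)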
Example #7
    def __init__(self, config, hosts_ipset):
        super(EtcdAPI, self).__init__(config.ETCD_ADDR)
        self._config = config

        # Timestamp storing when the EtcdAPI started. This info is needed
        # in order to report uptime to etcd.
        self._start_time = monotonic_time()

        # Create an Actor to report per-endpoint status into etcd.
        self.status_reporter = EtcdStatusReporter(config)

        # Start up the main etcd-watching greenlet.  It will wait for an
        # event from us before doing anything.
        self._watcher = _FelixEtcdWatcher(config, self, self.status_reporter,
                                          hosts_ipset)
        self._watcher.link(self._on_worker_died)
        self._watcher.start()

        # Start up a greenlet to trigger periodic resyncs.
        self._resync_greenlet = gevent.spawn(self._periodically_resync)
        self._resync_greenlet.link_exception(self._on_worker_died)

        # Start up greenlet to report felix's liveness into etcd.
        self.done_first_status_report = False
        self._status_reporting_greenlet = gevent.spawn(
            self._periodically_report_status)
        self._status_reporting_greenlet.link_exception(self._on_worker_died)

        # Start the status reporter.
        self.status_reporter.start()
        self.status_reporter.greenlet.link(self._on_worker_died)
Example #8
 def _maybe_log_resync_thread_stats(self):
     now = monotonic_time()
     if now - self._last_resync_stat_log_time > STATS_LOG_INTERVAL:
         for stat in self._resync_stats:
             _log.info("STAT: Resync thread %s", stat)
             stat.reset()
         self._last_resync_stat_log_time = now
Example #9
    def __init__(self, config, hosts_ipset):
        super(EtcdAPI, self).__init__(config.ETCD_ADDRS,
                                      etcd_scheme=config.ETCD_SCHEME,
                                      etcd_key=config.ETCD_KEY_FILE,
                                      etcd_cert=config.ETCD_CERT_FILE,
                                      etcd_ca=config.ETCD_CA_FILE)
        self._config = config

        # Timestamp storing when the EtcdAPI started. This info is needed
        # in order to report uptime to etcd.
        self._start_time = monotonic_time()

        # Create an Actor to report per-endpoint status into etcd.  We defer
        # startup of this and our other workers until we get started.
        self.status_reporter = EtcdStatusReporter(config)

        # Create the main etcd-watching greenlet.
        self._watcher = _FelixEtcdWatcher(config,
                                          self,
                                          self.status_reporter,
                                          hosts_ipset)
        self._watcher.link(self._on_worker_died)

        # Create a greenlet to trigger periodic resyncs.
        self._resync_greenlet = gevent.Greenlet(self._periodically_resync)
        self._resync_greenlet.link_exception(self._on_worker_died)

        # Create a greenlet to report felix's liveness into etcd.
        self.done_first_status_report = False
        self._status_reporting_greenlet = gevent.Greenlet(
            self._periodically_report_status
        )
        self._status_reporting_greenlet.link_exception(self._on_worker_died)

        self.status_reporter.greenlet.link(self._on_worker_died)
Example #10
    def __init__(self, felix_sck):
        # Wrap the socket with our protocol reader/writer objects.
        self._msg_reader = MessageReader(felix_sck)
        self._msg_writer = MessageWriter(felix_sck)

        # Global stop event used to signal to all threads to stop.
        self._stop_event = Event()

        # Threads to own the connection from/to Felix.  The resync thread
        # is responsible for doing resyncs and merging updates from the
        # watcher thread (which it manages).
        self._reader_thread = Thread(target=self._read_from_socket,
                                     name="reader-thread")
        self._reader_thread.daemon = True
        self._resync_thread = Thread(target=self._resync_and_merge,
                                     name="resync-thread")
        self._resync_thread.daemon = True
        self._watcher_thread = None  # Created on demand
        self._watcher_stop_event = None
        self._watcher_start_index = None

        # High-water mark cache.  Owned by resync thread.
        self._hwms = HighWaterTracker()
        self._first_resync = True
        self._resync_http_pool = None
        self._cluster_id = None

        # Resync thread stats.
        self._snap_keys_processed = RateStat("snapshot keys processed")
        self._event_keys_processed = RateStat("event keys processed")
        self._felix_updates_sent = RateStat("felix updates sent")
        self._resync_stats = [
            self._snap_keys_processed,
            self._event_keys_processed,
            self._felix_updates_sent,
        ]
        self._last_resync_stat_log_time = monotonic_time()

        # Set by the reader thread once the init message has been received
        # from Felix.
        self._init_received = Event()
        # Initial config, received in the init message.
        self._etcd_base_url = None
        self._etcd_other_urls = []
        # Lock for the etcd url fields: this is the only lock, and no thread
        # ever recursively acquires it, so it cannot deadlock.  Must be locked
        # to access the _etcd_base_url and _etcd_other_urls fields (after they
        # are initialized).
        self._etcd_url_lock = Lock()
        self._hostname = None
        # Set by the reader thread once the logging config has been received
        # from Felix.  Triggers the first resync.
        self._config_received = Event()

        # Flag to request a resync.  Set by the reader thread, polled by the
        # resync and merge thread.
        self._resync_requested = False
Example #11
 def update_felix_status(self):
     """Sends Felix's status to the backend driver."""
     time_formatted = iso_utc_timestamp()
     uptime = monotonic_time() - self._start_time
     envelope = felixbackend_pb2.FromDataplane()
     payload = envelope.process_status_update
     payload.iso_timestamp = time_formatted
     payload.uptime = uptime
     self._writer.send_message(envelope)
Example #12
 def __init__(self, config, message_writer):
     super(DatastoreWriter, self).__init__()
     self._config = config
     self._start_time = monotonic_time()
     self._writer = message_writer
     self._endpoint_status = {IPV4: {}, IPV6: {}}
     self.config_resolved = False
     self._dirty_endpoints = set()
     self._reporting_allowed = True
     self._status_reporting_greenlet = None
Example #13
 def _on_ipset_delta_msg_from_driver(self, msg):
     _log.debug("IP set delta updates: %s", msg)
     # Output some very coarse stats.
     self.ip_upd_count += 1
     if self.ip_upd_count % 1000 == 0:
         now = monotonic_time()
         delta = now - self.last_ip_upd_log_time
         _log.info("Processed %s IP updates from driver "
                   "%.1f/s", self.ip_upd_count, 1000.0 / delta)
         self.last_ip_upd_log_time = now
     self.splitter.on_ipset_delta_update(msg.id,
                                         msg.added_members or [],
                                         msg.removed_members or [])
Example #14
 def __init__(self, config, msg_reader, datastore_writer, hosts_ipset):
     super(DatastoreReader, self).__init__()
     self._config = config
     self.hosts_ipset = hosts_ipset
     self._msg_reader = msg_reader
     self._datastore_writer = datastore_writer
     # Whether we've been in sync with etcd at some point.
     self._been_in_sync = False
     # Keep track of the config loaded from etcd so we can spot if it
     # changes.
     self.last_global_config = None
     self.last_host_config = None
     # Events triggered by the DatastoreAPI Actor to tell us to load the
     # config and start polling.  These are one-way flags.
     self.load_config = Event()
     self.begin_polling = Event()
     # Event that we trigger once the config is loaded.
     self.configured = Event()
     # Polling state initialized at poll start time.
     self.splitter = None
     # Next-hop IP addresses of our hosts, if populated in etcd.
     self.ipv4_by_hostname = {}
     # Forces a resync after the current poll if set.  Safe to set from
     # another thread.  Automatically reset to False after the resync is
     # triggered.
     self.resync_requested = False
     # True if we've been shut down.
     self.killed = False
     # Stats.
     self.read_count = 0
     self.ip_upd_count = 0
     self.ip_remove_count = 0
     self.msgs_processed = 0
     self.last_rate_log_time = monotonic_time()
     self.last_ip_upd_log_time = monotonic_time()
     self.last_ip_remove_log_time = monotonic_time()
Example #15
    def __init__(self, config, pipe_from_parent, pipe_to_parent, hosts_ipset):
        super(DatastoreAPI, self).__init__()
        self._config = config
        self.pipe_from_parent = pipe_from_parent
        self.pipe_to_parent = pipe_to_parent
        self.hosts_ipset = hosts_ipset

        # Timestamp storing when the DatastoreAPI started. This info is needed
        # in order to report uptime to etcd.
        self._start_time = monotonic_time()

        # The main etcd-watching greenlet.
        self._reader = None

        # One-way flag indicating we're being shut down.
        self.killed = False
Example #16
 def __init__(self, config, etcd_api, status_reporter, hosts_ipset):
     super(_FelixEtcdWatcher, self).__init__()
     self._config = config
     self._etcd_api = etcd_api
     self._status_reporter = status_reporter
     self.hosts_ipset = hosts_ipset
     # Whether we've been in sync with etcd at some point.
     self._been_in_sync = False
     # Keep track of the config loaded from etcd so we can spot if it
     # changes.
     self.last_global_config = None
     self.last_host_config = None
     self.my_config_dir = dir_for_per_host_config(self._config.HOSTNAME)
     # Events triggered by the EtcdAPI Actor to tell us to load the config
     # and start polling.  These are one-way flags.
     self.load_config = Event()
     self.begin_polling = Event()
     # Event that we trigger once the config is loaded.
     self.configured = Event()
     # Polling state initialized at poll start time.
     self.splitter = None
     # Next-hop IP addresses of our hosts, if populated in etcd.
     self.ipv4_by_hostname = {}
     # Forces a resync after the current poll if set.  Safe to set from
     # another thread.  Automatically reset to False after the resync is
     # triggered.
     self.resync_requested = False
     self.dispatcher = PathDispatcher()
     # The Popen object for the driver.
     self._driver_process = None
     # Stats.
     self.read_count = 0
     self.msgs_processed = 0
     self.last_rate_log_time = monotonic_time()
     # Register for events when values change.
     self._register_paths()
     self._usage_report_greenlet = gevent.Greenlet(
         self._periodically_usage_report)
Example #17
 def __init__(self, config, etcd_api, status_reporter, hosts_ipset):
     super(_FelixEtcdWatcher, self).__init__()
     self._config = config
     self._etcd_api = etcd_api
     self._status_reporter = status_reporter
     self.hosts_ipset = hosts_ipset
     # Whether we've been in sync with etcd at some point.
     self._been_in_sync = False
     # Keep track of the config loaded from etcd so we can spot if it
     # changes.
     self.last_global_config = None
     self.last_host_config = None
     self.my_config_dir = dir_for_per_host_config(self._config.HOSTNAME)
     # Events triggered by the EtcdAPI Actor to tell us to load the config
     # and start polling.  These are one-way flags.
     self.load_config = Event()
     self.begin_polling = Event()
     # Event that we trigger once the config is loaded.
     self.configured = Event()
     # Polling state initialized at poll start time.
     self.splitter = None
     # Next-hop IP addresses of our hosts, if populated in etcd.
     self.ipv4_by_hostname = {}
     # Forces a resync after the current poll if set.  Safe to set from
     # another thread.  Automatically reset to False after the resync is
     # triggered.
     self.resync_requested = False
     self.dispatcher = PathDispatcher()
     # The Popen object for the driver.
     self._driver_process = None
     # Stats.
     self.read_count = 0
     self.msgs_processed = 0
     self.last_rate_log_time = monotonic_time()
     # Register for events when values change.
     self._register_paths()
Example #18
 def time_since_start(self):
     now = monotonic_time()
     time_since_start = now - self.start_time
     return time_since_start
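
This method, together with the reset() method that appears later in the list, reads like a fragment of a small statistics class that tracks a count against a monotonic start time. A minimal illustrative version (not the project's RateStat/AggregateStat, whose internals are not shown here):

    import time

    class MinimalRateStat(object):
        def __init__(self, name):
            self.name = name
            self.reset()

        def reset(self):
            self.start_time = time.monotonic()
            self.count = 0

        def time_since_start(self):
            return time.monotonic() - self.start_time

        def __str__(self):
            elapsed = self.time_since_start()
            rate = self.count / elapsed if elapsed else 0.0
            return "%s: %s in %.1fs (%.1f/s)" % (self.name, self.count, elapsed, rate)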
Example #19
    def watch_etcd(self, next_index, event_queue, stop_event):
        """
        Thread: etcd watcher thread.  Watches etcd for changes and
        sends them over the queue to the resync thread, which owns
        the socket to Felix.

        Dies if it receives an error from etcd.

        Note: it is important that we pass the index, queue and event
        as parameters to ensure that each watcher thread only touches
        the versions of those values that were created for it as
        opposed to a later-created watcher thread.

        :param int next_index: The etcd index to start watching from.
        :param Queue event_queue: Queue of updates back to the resync thread.
        :param Event stop_event: Event used to stop this thread when it is no
               longer needed.
        """
        _log.info("Watcher thread started with next index %s", next_index)
        last_log_time = monotonic_time()
        req_end_time = None
        non_req_time_stat = AggregateStat("processing time", "ms")
        etcd_response_time = None
        etcd_response_time_stat = AggregateStat("etcd response time", "ms")
        stats = [etcd_response_time_stat,
                 non_req_time_stat]
        http = None
        try:
            while not self._stop_event.is_set() and not stop_event.is_set():
                if not http:
                    _log.info("No HTTP pool, creating one...")
                    http = self.get_etcd_connection()
                req_start_time = monotonic_time()
                if req_end_time is not None:
                    # Calculate the time since the end of the previous request,
                    # i.e. the time we spent processing the response.  Note:
                    # start and end are flipped because we just read the start
                    # time but we have the end time from the last loop.
                    non_req_time = req_start_time - req_end_time
                    non_req_time_stat.store_reading(non_req_time * 1000)
                _log.debug("Waiting on etcd index %s", next_index)
                try:
                    try:
                        resp = self._etcd_request(http,
                                                  VERSION_DIR,
                                                  recursive=True,
                                                  wait_index=next_index,
                                                  timeout=90)
                    finally:
                        # Make sure the time is available to both exception and
                        # mainline code paths.
                        req_end_time = monotonic_time()
                        etcd_response_time = req_end_time - req_start_time
                    if resp.status != 200:
                        _log.warning("etcd watch returned bad HTTP status to"
                                     "poll on index %s: %s", next_index,
                                     resp.status)
                    self._check_cluster_id(resp)
                    resp_body = resp.data  # Force read inside try block.
                except urllib3.exceptions.ReadTimeoutError:
                    # 100% expected when there are no events.
                    _log.debug("Watch read timed out, restarting watch at "
                               "index %s", next_index)
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                except (urllib3.exceptions.HTTPError,
                        httplib.HTTPException,
                        socket.error) as e:
                    # Not so expected but still possible to recover:  etcd
                    # being restarted, for example.
                    assert etcd_response_time is not None
                    if etcd_response_time < REQ_TIGHT_LOOP_THRESH:
                        # Failed fast, make sure we don't tight loop.
                        delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s after "
                                     "delay %.3f", e, next_index, delay)
                        time.sleep(delay)
                    else:
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s "
                                     "immediately", e, next_index)
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                # If we get to this point, we've got an etcd response to
                # process; try to parse it.
                try:
                    etcd_resp = json.loads(resp_body)
                    if "errorCode" in etcd_resp:
                        _log.error("Error from etcd for index %s: %s; "
                                   "triggering a resync.",
                                   next_index, etcd_resp)
                        break
                    node = etcd_resp["node"]
                    key = node["key"]
                    action = ACTION_MAPPING[etcd_resp["action"]]
                    is_dir = node.get("dir", False)
                    value = node.get("value")
                    dir_creation = False
                    if is_dir:
                        if action == "delete":
                            if key.rstrip("/") in (VERSION_DIR, ROOT_DIR):
                                # Special case: if the whole keyspace is
                                # deleted, that implies the ready flag is gone
                                # too.  Break out of the loop to trigger a
                                # resync.  This avoids queuing up a bunch of
                                # events that would be discarded by the
                                # resync thread.
                                _log.warning("Whole %s deleted, resyncing",
                                             VERSION_DIR)
                                break
                        else:
                            # Just ignore sets to directories, we only track
                            # leaves.
                            _log.debug("Skipping non-delete to dir %s", key)
                            dir_creation = True
                    modified_index = node["modifiedIndex"]
                except (KeyError, TypeError, ValueError):
                    _log.exception("Unexpected format for etcd response to"
                                   "index %s: %r; triggering a resync.",
                                   next_index, resp_body)
                    break
                else:
                    # We successfully parsed the response, hand it off to the
                    # resync thread.  Now we know that we got a response,
                    # we record that in the stat.
                    etcd_response_time_stat.store_reading(etcd_response_time *
                                                          1000)
                    if not dir_creation:
                        # The resync thread doesn't need to know about
                        # directory creations so we skip them.  (It does need
                        # to know about deletions in order to clean up
                        # sub-keys.)
                        event_queue.put((modified_index, key, value))
                    next_index = modified_index + 1

                    # Opportunistically log stats.
                    now = monotonic_time()
                    if now - last_log_time > STATS_LOG_INTERVAL:
                        for stat in stats:
                            _log.info("STAT: Watcher %s", stat)
                            stat.reset()
                        _log.info("STAT: Watcher queue length: %s",
                                  event_queue.qsize())
                        last_log_time = now
        except DriverShutdown:
            _log.warning("Watcher thread stopping due to driver shutdown.")
        except:
            _log.exception("Exception finishing watcher thread.")
            raise
        finally:
            # Signal to the resync thread that we've exited.
            event_queue.put(None)
            # Make sure we get some stats output from the watcher.
            for stat in stats:
                _log.info("STAT: Final watcher %s", stat)
            _log.info("Watcher thread finished. Signalled to resync thread. "
                      "Was at index %s.  Queue length is %s.", next_index,
                      event_queue.qsize())
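
One subtlety in this watcher is how it separates time spent waiting on etcd from time spent processing responses: it remembers the previous request's end time, so the gap up to the next request's start time is pure processing time. The measurement pattern in isolation, as a rough sketch:

    import time

    def watch_loop(wait_for_event, handle_event):
        req_end_time = None
        while True:
            req_start_time = time.monotonic()
            if req_end_time is not None:
                # Time between the last response arriving and this request starting,
                # i.e. roughly how long handle_event() took.
                processing_ms = (req_start_time - req_end_time) * 1000
                print("spent %.1f ms processing the previous event" % processing_ms)
            event = wait_for_event()       # blocks, like the long-poll to etcd
            req_end_time = time.monotonic()
            handle_event(event)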
Example #20
 def test_mainline(self):
     a = monotonic_time()
     time.sleep(0.01)
     b = monotonic_time()
     self.assertTrue(b >= a+0.01, msg="Monotonic time did not increase as "
                                      "expected: %s, %s" % (a, b))
Example #21
 def _reset_resync_thread_stats(self):
     for stat in self._resync_stats:
         stat.reset()
     self._last_resync_stat_log_time = monotonic_time()
Example #22
    def _resync_and_merge(self):
        """
        Thread: Resync-and-merge thread.  Loads the etcd snapshot, merges
        it with the events going on concurrently and sends the event stream
        to Felix.
        """
        _log.info("Resync thread started, waiting for config to be loaded...")
        self._init_received.wait()
        _log.info("Config loaded; continuing.")

        while not self._stop_event.is_set():
            _log.info("Stop event not set, starting new resync...")
            self._reset_resync_thread_stats()
            loop_start = monotonic_time()
            try:
                # Start with a fresh HTTP pool just in case it got into a bad
                # state.
                self._resync_http_pool = self.get_etcd_connection()
                # Before we get to the snapshot, Felix needs the configuration.
                self._send_status(STATUS_WAIT_FOR_READY)
                self._wait_for_ready()
                self._preload_config()
                # Wait for config if we have not already received it.
                self._wait_for_config()
                # Kick off the snapshot request as far as the headers.
                self._send_status(STATUS_RESYNC)
                resp, snapshot_index = self._start_snapshot_request()
                # Before reading from the snapshot, start the watcher thread.
                self._ensure_watcher_running(snapshot_index)
                # Incrementally process the snapshot, merging in events from
                # the queue.
                self._process_snapshot_and_events(resp, snapshot_index)
                # We're now in-sync.  Tell Felix.
                self._send_status(STATUS_IN_SYNC)
                # Then switch to processing events only.
                self._process_events_only()
            except WriteFailed:
                _log.exception("Write to Felix failed; shutting down.")
                self.stop()
            except WatcherDied:
                _log.warning("Watcher died; resyncing.")
                self._stop_watcher()  # Clean up the event
            except (urllib3.exceptions.HTTPError,
                    httplib.HTTPException,
                    socket.error) as e:
                _log.error("Request to etcd failed: %r; resyncing.", e)
                self._stop_watcher()
                if monotonic_time() - loop_start < 1:
                    _log.warning("May be tight looping, sleeping...")
                    time.sleep(1)
            except ResyncRequested:
                _log.info("Resync requested, looping to start a new resync. "
                          "Leaving watcher running if possible.")
            except ResyncRequired:
                _log.warn("Detected inconsistency requiring a full resync, "
                          "stopping watcher")
                self._stop_watcher()
            except DriverShutdown:
                _log.info("Driver shut down.")
                return
            except:
                _log.exception("Unexpected exception; shutting down.")
                self.stop()
                raise
            finally:
                self._first_resync = False
                self._resync_requested = False
        _log.info("Stop event set, exiting resync loop.")
Example #23
    def watch_etcd(self, next_index, event_queue, stop_event):
        """
        Thread: etcd watcher thread.  Watches etcd for changes and
        sends them over the queue to the resync thread, which owns
        the socket to Felix.

        Dies if it receives an error from etcd.

        Note: it is important that we pass the index, queue and event
        as parameters to ensure that each watcher thread only touches
        the versions of those values that were created for it as
        opposed to a later-created watcher thread.

        :param next_index: The etcd index to start watching from.
        :param event_queue: Queue of updates back to the resync thread.
        :param stop_event: Event used to stop this thread when it is no
               longer needed.
        """
        _log.info("Watcher thread started")
        http = None
        try:
            while not self._stop_event.is_set() and not stop_event.is_set():
                if not http:
                    _log.info("No HTTP pool, creating one...")
                    http = self.get_etcd_connection()
                req_start_time = monotonic_time()
                try:
                    _log.debug("Waiting on etcd index %s", next_index)
                    resp = self._etcd_request(http,
                                              VERSION_DIR,
                                              recursive=True,
                                              wait_index=next_index,
                                              timeout=90)
                    if resp.status != 200:
                        _log.warning("etcd watch returned bad HTTP status: %s",
                                     resp.status)
                    self._check_cluster_id(resp)
                    resp_body = resp.data  # Force read inside try block.
                except urllib3.exceptions.ReadTimeoutError:
                    # 100% expected when there are no events.
                    _log.debug("Watch read timed out, restarting watch at "
                               "index %s", next_index)
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                except (urllib3.exceptions.HTTPError,
                        httplib.HTTPException,
                        socket.error) as e:
                    # Not so expected but still possible to recover:  etcd
                    # being restarted, for example.
                    req_end_time = monotonic_time()
                    req_time = req_end_time - req_start_time
                    if req_time < REQ_TIGHT_LOOP_THRESH:
                        # Failed fast, make sure we don't tight loop.
                        delay = REQ_TIGHT_LOOP_THRESH - req_time
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s after "
                                     "delay %.3f", e, next_index, delay)
                        time.sleep(delay)
                    else:
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s "
                                     "immediately", e, next_index)
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                try:
                    etcd_resp = json.loads(resp_body)
                    if "errorCode" in etcd_resp:
                        _log.error("Error from etcd: %s; triggering a resync.",
                                   etcd_resp)
                        break
                    node = etcd_resp["node"]
                    key = node["key"]
                    action = ACTION_MAPPING[etcd_resp["action"]]
                    is_dir = node.get("dir", False)
                    value = node.get("value")
                    dir_creation = False
                    if is_dir:
                        if action == "delete":
                            if key.rstrip("/") in (VERSION_DIR, ROOT_DIR):
                                # Special case: if the whole keyspace is
                                # deleted, that implies the ready flag is gone
                                # too.  Break out of the loop to trigger a
                                # resync.  This avoids queuing up a bunch of
                                # events that would be discarded by the
                                # resync thread.
                                _log.warning("Whole %s deleted, resyncing",
                                             VERSION_DIR)
                                break
                        else:
                            # Just ignore sets to directories, we only track
                            # leaves.
                            _log.debug("Skipping non-delete to dir %s", key)
                            dir_creation = True
                    modified_index = node["modifiedIndex"]
                except (KeyError, TypeError, ValueError):
                    _log.exception("Unexpected format for etcd response: %r;"
                                   "triggering a resync.",
                                   resp_body)
                    break
                else:
                    if not dir_creation:
                        # The resync thread doesn't need to know about
                        # directory creations so we skip them.  (It does need
                        # to know about deletions in order to clean up
                        # sub-keys.)
                        event_queue.put((modified_index, key, value))
                    next_index = modified_index + 1
        except DriverShutdown:
            _log.warning("Watcher thread stopping due to driver shutdown.")
        except:
            _log.exception("Exception finishing watcher thread.")
            raise
        finally:
            # Signal to the resync thread that we've exited.
            _log.info("Watcher thread finished. Signalling to resync thread.")
            event_queue.put(None)
Example #24
    def watch_etcd(self, next_index, event_queue, stop_event):
        """
        Thread: etcd watcher thread.  Watches etcd for changes and
        sends them over the queue to the resync thread, which owns
        the socket to Felix.

        Dies if it receives an error from etcd.

        Note: it is important that we pass the index, queue and event
        as parameters to ensure that each watcher thread only touches
        the versions of those values that were created for it as
        opposed to a later-created watcher thread.

        :param int next_index: The etcd index to start watching from.
        :param Queue event_queue: Queue of updates back to the resync thread.
        :param Event stop_event: Event used to stop this thread when it is no
               longer needed.
        """
        _log.info("Watcher thread started with next index %s", next_index)
        last_log_time = monotonic_time()
        req_end_time = None
        non_req_time_stat = AggregateStat("processing time", "ms")
        etcd_response_time = None
        etcd_response_time_stat = AggregateStat("etcd response time", "ms")
        stats = [etcd_response_time_stat,
                 non_req_time_stat]
        http = None
        try:
            while not self._stop_event.is_set() and not stop_event.is_set():
                if not http:
                    _log.info("No HTTP pool, creating one...")
                    http = self.get_etcd_connection()
                req_start_time = monotonic_time()
                if req_end_time is not None:
                    # Calculate the time since the end of the previous request,
                    # i.e. the time we spent processing the response.  Note:
                    # start and end are flipped because we just read the start
                    # time but we have the end time from the last loop.
                    non_req_time = req_start_time - req_end_time
                    non_req_time_stat.store_reading(non_req_time * 1000)
                _log.debug("Waiting on etcd index %s", next_index)
                try:
                    try:
                        resp = self._etcd_request(http,
                                                  VERSION_DIR,
                                                  recursive=True,
                                                  wait_index=next_index,
                                                  timeout=90)
                    finally:
                        # Make sure the time is available to both exception and
                        # mainline code paths.
                        req_end_time = monotonic_time()
                        etcd_response_time = req_end_time - req_start_time
                    if resp.status != 200:
                        _log.warning("etcd watch returned bad HTTP status to"
                                     "poll on index %s: %s", next_index,
                                     resp.status)
                    self._check_cluster_id(resp)
                    resp_body = resp.data  # Force read inside try block.
                except urllib3.exceptions.ReadTimeoutError:
                    # 100% expected when there are no events.
                    _log.debug("Watch read timed out, restarting watch at "
                               "index %s", next_index)
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                except (urllib3.exceptions.HTTPError,
                        httplib.HTTPException,
                        socket.error) as e:
                    # Not so expected but still possible to recover:  etcd
                    # being restarted, for example.
                    assert etcd_response_time is not None
                    if etcd_response_time < REQ_TIGHT_LOOP_THRESH:
                        # Failed fast, make sure we don't tight loop.
                        delay = REQ_TIGHT_LOOP_THRESH - etcd_response_time
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s after "
                                     "delay %.3f", e, next_index, delay)
                        time.sleep(delay)
                    else:
                        _log.warning("Connection to etcd failed with %r, "
                                     "restarting watch at index %s "
                                     "immediately", e, next_index)
                    # If available, connect to a different etcd URL in case
                    # only the previous one has failed.
                    self._rotate_etcd_url()
                    # Workaround urllib3 bug #718.  After a ReadTimeout, the
                    # connection is incorrectly recycled.
                    http = None
                    continue
                # If we get to this point, we've got an etcd response to
                # process; try to parse it.
                try:
                    etcd_resp = json.loads(resp_body)
                    if "errorCode" in etcd_resp:
                        _log.error("Error from etcd for index %s: %s; "
                                   "triggering a resync.",
                                   next_index, etcd_resp)
                        break
                    node = etcd_resp["node"]
                    key = node["key"]
                    action = ACTION_MAPPING[etcd_resp["action"]]
                    is_dir = node.get("dir", False)
                    value = node.get("value")
                    dir_creation = False
                    if is_dir:
                        if action == "delete":
                            if key.rstrip("/") in (VERSION_DIR, ROOT_DIR):
                                # Special case: if the whole keyspace is
                                # deleted, that implies the ready flag is gone
                                # too.  Break out of the loop to trigger a
                                # resync.  This avoids queuing up a bunch of
                                # events that would be discarded by the
                                # resync thread.
                                _log.warning("Whole %s deleted, resyncing",
                                             VERSION_DIR)
                                break
                        else:
                            # Just ignore sets to directories, we only track
                            # leaves.
                            _log.debug("Skipping non-delete to dir %s", key)
                            dir_creation = True
                    modified_index = node["modifiedIndex"]
                except (KeyError, TypeError, ValueError):
                    _log.exception("Unexpected format for etcd response to"
                                   "index %s: %r; triggering a resync.",
                                   next_index, resp_body)
                    break
                else:
                    # We successfully parsed the response, hand it off to the
                    # resync thread.  Now we know that we got a response,
                    # we record that in the stat.
                    etcd_response_time_stat.store_reading(etcd_response_time *
                                                          1000)
                    if not dir_creation:
                        # The resync thread doesn't need to know about
                        # directory creations so we skip them.  (It does need
                        # to know about deletions in order to clean up
                        # sub-keys.)
                        event_queue.put((modified_index, key, value))
                    next_index = modified_index + 1

                    # Opportunistically log stats.
                    now = monotonic_time()
                    if now - last_log_time > STATS_LOG_INTERVAL:
                        for stat in stats:
                            _log.info("STAT: Watcher %s", stat)
                            stat.reset()
                        _log.info("STAT: Watcher queue length: %s",
                                  event_queue.qsize())
                        last_log_time = now
        except DriverShutdown:
            _log.warning("Watcher thread stopping due to driver shutdown.")
        except:
            _log.exception("Exception finishing watcher thread.")
            raise
        finally:
            # Signal to the resync thread that we've exited.
            event_queue.put(None)
            # Make sure we get some stats output from the watcher.
            for stat in stats:
                _log.info("STAT: Final watcher %s", stat)
            _log.info("Watcher thread finished. Signalled to resync thread. "
                      "Was at index %s.  Queue length is %s.", next_index,
                      event_queue.qsize())
Example #25
 def reset(self):
     self.start_time = monotonic_time()
     self.count = 0
Example #26
    def _resync_and_merge(self):
        """
        Thread: Resync-and-merge thread.  Loads the etcd snapshot, merges
        it with the events going on concurrently and sends the event stream
        to Felix.
        """
        _log.info("Resync thread started, waiting for config to be loaded...")
        self._init_received.wait()
        _log.info("Config loaded; continuing.")

        while not self._stop_event.is_set():
            _log.info("Stop event not set, starting new resync...")
            self._reset_resync_thread_stats()
            loop_start = monotonic_time()
            try:
                # Start with a fresh HTTP pool just in case it got into a bad
                # state.
                self._resync_http_pool = self.get_etcd_connection()
                # Before we get to the snapshot, Felix needs the configuration.
                self._send_status(STATUS_WAIT_FOR_READY)
                self._wait_for_ready()
                self._preload_config()
                # Wait for config if we have not already received it.
                self._wait_for_config()
                # Kick off the snapshot request as far as the headers.
                self._send_status(STATUS_RESYNC)
                resp, snapshot_index = self._start_snapshot_request()
                # Before reading from the snapshot, start the watcher thread.
                self._ensure_watcher_running(snapshot_index)
                # Incrementally process the snapshot, merging in events from
                # the queue.
                self._process_snapshot_and_events(resp, snapshot_index)
                # We're now in-sync.  Tell Felix.
                self._send_status(STATUS_IN_SYNC)
                # Then switch to processing events only.
                self._process_events_only()
            except WriteFailed:
                _log.exception("Write to Felix failed; shutting down.")
                self.stop()
            except WatcherDied:
                _log.warning("Watcher died; resyncing.")
                self._stop_watcher()  # Clean up the event
            except (urllib3.exceptions.HTTPError,
                    httplib.HTTPException,
                    socket.error) as e:
                _log.error("Request to etcd failed: %r; resyncing.", e)
                self._stop_watcher()
                self._rotate_etcd_url()  # Try a different etcd URL if possible
                if monotonic_time() - loop_start < 1:
                    _log.warning("May be tight looping, sleeping...")
                    time.sleep(1)
            except ResyncRequested:
                _log.info("Resync requested, looping to start a new resync. "
                          "Leaving watcher running if possible.")
            except ResyncRequired:
                _log.warn("Detected inconsistency requiring a full resync, "
                          "stopping watcher")
                self._stop_watcher()
            except DriverShutdown:
                _log.info("Driver shut down.")
                return
            except:
                _log.exception("Unexpected exception; shutting down.")
                self.stop()
                raise
            finally:
                self._first_resync = False
                self._resync_requested = False
        _log.info("Stop event set, exiting resync loop.")