class Coordinator:
    """
    Class that coordinates the configuration, state and status reports
    for a single LVS instance.

    It owns the host -> Server mapping, merges configuration updates,
    receives up/down results from monitors and pools/depools servers in
    the LVSService accordingly, never letting the number of up servers
    drop below the configured depool threshold. It also keeps the
    per-service Prometheus metrics in sync.
    """

    # Default configuration URL; replaced per instance by the configUrl
    # constructor argument.
    serverConfigUrl = 'file:///etc/pybal/squids'

    # Interval (in seconds) at which the server list is (re)loaded.
    intvLoadServers = 60

    # Keyword arguments shared by all service-level metrics; each metric
    # carries a single 'service' label.
    metric_keywords = {
        'labelnames': ('service', ),
        'namespace': 'pybal',
        'subsystem': 'service'
    }
    # Class-level metrics shared by all Coordinator instances; instances
    # are distinguished by the 'service' label (see metric_labels).
    metrics = {
        'servers':
        Gauge('servers', 'Amount of servers', **metric_keywords),
        'servers_enabled':
        Gauge('servers_enabled', 'Amount of enabled servers',
              **metric_keywords),
        'servers_up':
        Gauge('servers_up', 'Amount of up servers', **metric_keywords),
        'servers_pooled':
        Gauge('servers_pooled', 'Amount of pooled servers',
              **metric_keywords),
        'can_depool':
        Gauge('can_depool', 'Can depool more servers', **metric_keywords),
        'pooled_down_servers':
        Gauge('pooled_down_servers',
              'Amount of down servers pooled because too many down',
              **metric_keywords),
        'could_not_depool_total':
        Counter('could_not_depool_total',
                'Pybal could not depool a server because too many down',
                **metric_keywords),
        'depool_threshold':
        Gauge(
            'depool_threshold',
            "Threshold of up servers vs total servers below which pybal can't depool any more",
            **metric_keywords),
    }

    def __init__(self, lvsservice, configUrl):
        """
        Constructor.

        lvsservice: the LVSService instance this coordinator manages.
        configUrl:  URL of the server list configuration to observe.
        """

        self.servers = {}               # host -> Server
        self.lvsservice = lvsservice
        self.metric_labels = {'service': self.lvsservice.name}
        self.pooledDownServers = set()  # down servers kept pooled (threshold)
        self.configHash = None
        self.serverConfigUrl = configUrl
        self.serverInitDeferredList = defer.Deferred()
        # Start watching the configuration; onConfigUpdate will be called
        # with parsed configuration data on every change.
        self.configObserver = config.ConfigurationObserver.fromUrl(
            self, configUrl)
        self.configObserver.startObserving()
        self.metrics['depool_threshold'].labels(**self.metric_labels).set(
            self.lvsservice.getDepoolThreshold())

    def __str__(self):
        return "[%s]" % self.lvsservice.name

    def assignServers(self):
        """
        Takes a new set of servers (as a host->Server dict) and
        hands them over to LVSService
        """

        # Hand over enabled servers to LVSService
        self.lvsservice.assignServers(
            set([
                server for server in self.servers.itervalues()
                if server.pooled
            ]))

    def refreshModifiedServers(self):
        """
        Calculates the status of every server that existed before the
        config change.
        """

        for server in self.servers.itervalues():
            if not server.modified:
                continue
            # Recompute up state from the monitors; only servers that are
            # both enabled and up stay pooled.
            server.up = server.calcStatus()
            server.pooled = server.enabled and server.up

    def resultDown(self, monitor, reason=None):
        """
        Accepts a 'down' notification status result from a single
        monitoring instance and acts accordingly.
        """

        server = monitor.server
        data = {
            'service': self,
            'monitor': monitor.name(),
            'host': server.host,
            'status': server.textStatus(),
            'reason': (reason or '(reason unknown)')
        }
        msg = "Monitoring instance {monitor} " \
              "reports server {host} ({status}) down: {reason}"
        log.error(msg.format(**data), system=self.lvsservice.name)

        # A single down monitor takes the server down; only depool it if
        # it is currently pooled.
        if server.up:
            server.up = False
            if server.pooled:
                self.depool(server)

    def resultUp(self, monitor):
        """
        Accepts a 'up' notification status result from a single
        monitoring instance and acts accordingly.
        """

        server = monitor.server
        # The server only goes up once ALL its monitors agree
        # (server.calcStatus()).
        if not server.up and server.calcStatus():
            log.info("Server {} ({}) is up".format(server.host,
                                                   server.textStatus()),
                     system=self.lvsservice.name)
            server.up = True
            if server.enabled and server.ready:
                self.repool(server)

    def depool(self, server):
        """Depools a single Server, if possible"""

        assert server.pooled

        if self.canDepool():
            self.lvsservice.removeServer(server)
            self.pooledDownServers.discard(server)
            self.metrics['servers_pooled'].labels(**self.metric_labels).dec()
        else:
            # Too many servers are down already: keep this (down) server
            # pooled and remember it for a later depool attempt in repool().
            self.pooledDownServers.add(server)
            msg = "Could not depool server " \
                  "{} because of too many down!".format(server.host)
            log.error(msg, system=self.lvsservice.name)
            self.metrics['could_not_depool_total'].labels(
                **self.metric_labels).inc()
        self._updatePooledDownMetrics()

    def repool(self, server):
        """
        Repools a single server. Also depools previously downed Servers
        that could not be depooled then because of too many hosts down.
        """

        assert server.enabled and server.ready

        if not server.pooled:
            self.lvsservice.addServer(server)
            self.metrics['servers_pooled'].labels(**self.metric_labels).inc()
        else:
            msg = "Leaving previously pooled but down server {} pooled"
            log.info(msg.format(server.host), system=self.lvsservice.name)

        # If it had been pooled in down state before, remove it from the list
        self.pooledDownServers.discard(server)
        self._updatePooledDownMetrics()

        # See if we can depool any servers that could not be depooled before
        while len(self.pooledDownServers) > 0 and self.canDepool():
            self.depool(self.pooledDownServers.pop())

    def canDepool(self):
        """Returns a boolean denoting whether another server can be depooled"""

        # Construct a list of servers that have status 'down'
        downServers = [
            server for server in self.servers.itervalues() if not server.up
        ]

        # The total amount of pooled servers may never drop below a configured threshold
        return len(self.servers) - len(downServers) >= len(
            self.servers) * self.lvsservice.getDepoolThreshold()

    def onConfigUpdate(self, config):
        """Parses the server list and changes the state accordingly."""

        # Servers not seen in the new configuration get removed at the end.
        delServers = self.servers.copy()  # Shallow copy

        initList = []
        for hostName, hostConfig in config.items():
            if hostName in self.servers:
                # Existing server. merge
                server = delServers.pop(hostName)
                server.merge(hostConfig)
                data = {
                    'status': (server.enabled and "enabled" or "disabled"),
                    'host': hostName,
                    'weight': server.weight
                }
                log.info(
                    "Merged {status} server {host}, weight {weight}".format(
                        **data),
                    system=self.lvsservice.name)
            else:
                # New server
                server = Server.buildServer(hostName, hostConfig,
                                            self.lvsservice)
                data = {
                    'status': (server.enabled and "enabled" or "disabled"),
                    'host': hostName,
                    'weight': server.weight
                }
                # Initialize with LVS service specific configuration
                self.lvsservice.initServer(server)
                self.servers[hostName] = server
                initList.append(server.initialize(self))
                log.info("New {status} server {host}, weight {weight}".format(
                    **data),
                         system=self.lvsservice.name)

        # Remove old servers
        for hostName, server in delServers.iteritems():
            log.info(
                "{} Removing server {} (no longer found in new configuration)".
                format(self, hostName))
            server.destroy()
            del self.servers[hostName]

        # Calculate up status for previously existing, modified servers
        self.refreshModifiedServers()

        # Wait for all new servers to finish initializing
        self.serverInitDeferredList = defer.DeferredList(initList).addCallback(
            self._serverInitDone)

        # Update metrics
        self._updateServerMetrics()
        self._updatePooledDownMetrics()

    def _serverInitDone(self, result):
        """Called when all (new) servers have finished initializing"""

        log.info("{} Initialization complete".format(self))

        # Assign the updated list of enabled servers to the LVSService instance
        self.assignServers()
        self.metrics['servers_pooled'].labels(**self.metric_labels).set(
            len([s for s in self.servers.itervalues() if s.pooled]))
        self._updatePooledDownMetrics()

    def _updateServerMetrics(self):
        """Update gauge metrics for servers on config change"""
        self.metrics['servers'].labels(**self.metric_labels).set(
            len(self.servers))
        self.metrics['servers_enabled'].labels(**self.metric_labels).set(
            len([s for s in self.servers.itervalues() if s.enabled]))
        self.metrics['servers_up'].labels(**self.metric_labels).set(
            len([s for s in self.servers.itervalues() if s.up]))

    def _updatePooledDownMetrics(self):
        """Update gauge metrics for pooled-but-down servers"""
        self.metrics['pooled_down_servers'].labels(**self.metric_labels).set(
            len(self.pooledDownServers))
        self.metrics['can_depool'].labels(**self.metric_labels).set(
            self.canDepool() and 1 or 0)
class RunCommandMonitoringProtocol(monitor.LoopingCheckMonitoringProtocol):
    """
    Monitor that checks server health by periodically running an external
    command: a clean exit reports the server up, termination by non-zero
    exit code or signal (including the kill on timeout) reports it down.
    """

    __name__ = 'RunCommand'

    INTV_CHECK = 60    # default check interval (seconds)

    TIMEOUT_RUN = 20   # default command run timeout (seconds)

    metric_labelnames = ('service', 'host', 'monitor')
    metric_keywords = {
        'namespace': 'pybal',
        'subsystem': 'monitor_' + __name__.lower()
    }

    runcommand_metrics = {
        'run_duration_seconds':
        Gauge('run_duration_seconds',
              'Command duration',
              labelnames=metric_labelnames + ('result', 'exitcode'),
              **metric_keywords)
    }

    def __init__(self, coordinator, server, configuration={}, reactor=None):
        """Constructor"""

        # Call ancestor constructor
        super(RunCommandMonitoringProtocol,
              self).__init__(coordinator, server, configuration, reactor)

        # Names available when eval()ing the 'arguments' config value.
        # (Named eval_locals rather than 'locals' to avoid shadowing the
        # builtin.)
        eval_locals = {'server': server}

        self.timeout = self._getConfigInt('timeout', self.TIMEOUT_RUN)
        self.command = self._getConfigString('command')
        try:
            self.arguments = self._getConfigStringList('arguments',
                                                       locals=eval_locals)
        except (KeyError, ValueError):
            # Default to empty stringlist if runcommand.arguments has not been
            # specified or if it is an empty list
            self.arguments = [""]
        self.logOutput = self._getConfigBool('log-output', True)

        self.runningProcess = None          # process of the in-flight check
        self.runningProcessDeferred = None  # fires when that process ends

    def stop(self):
        """Stop all running and/or upcoming checks"""

        super(RunCommandMonitoringProtocol, self).stop()

        # Try to kill any running check
        if self.runningProcess is not None:
            try:
                self.runningProcess.signalProcess(signal.SIGKILL)
            except error.ProcessExitedAlready:
                pass

    def runCommand(self):
        """Periodically called method that does a single uptime check."""

        self.checkStartTime = seconds()
        self.runningProcess = self._spawnProcess(
            self,
            self.command, [self.command] + self.arguments,
            sessionLeader=True,
            timeout=(self.timeout or None))
        self.runningProcessDeferred = defer.Deferred()
        return self.runningProcessDeferred

    check = runCommand

    def makeConnection(self, process):
        pass

    def childDataReceived(self, childFD, data):
        """Logs output the child process writes to its file descriptors."""

        if not self.logOutput:
            return

        # Escape control chars so the output stays on a single log line.
        # (Named 'escapes' rather than 'map' to avoid shadowing the builtin.)
        escapes = {'\n': r'\n', '\r': r'\r', '\t': r'\t'}
        for char, subst in escapes.iteritems():
            data = data.replace(char, subst)

        self.report("Cmd stdout: " + data)

    def childConnectionLost(self, childFD):
        pass

    def processEnded(self, reason):
        """
        Called when the process has ended. Reports the result (up on clean
        exit, down on termination), records the run duration metric and
        fires the check deferred.
        """

        duration = seconds() - self.checkStartTime
        if reason.check(error.ProcessDone):
            self._resultUp()
            result = 'successful'
            exitcode = 0
        elif reason.check(error.ProcessTerminated):
            self._resultDown(reason.getErrorMessage())
            result = 'failed'
            exitcode = reason.value.exitCode
        else:
            # Unexpected termination reason: record it unlabeled and let
            # reason.trap() below re-raise it.
            result = None
            exitcode = None

        self.runcommand_metrics['run_duration_seconds'].labels(
            result=result, exitcode=exitcode,
            **self.metric_labels).set(duration)

        self.runningProcessDeferred.callback(reason.type)
        reason.trap(error.ProcessDone, error.ProcessTerminated)

    def leftoverProcesses(self, allKilled):
        """
        Called when the child terminated cleanly, but left some of
        its child processes behind
        """

        if allKilled:
            msg = "Command %s %s left child processes behind, which have been killed!"
        else:
            msg = "Command %s %s left child processes behind, and not all could be killed!"
        self.report(msg % (self.command, str(self.arguments)),
                    level=logging.WARN)

    def _spawnProcess(self,
                      processProtocol,
                      executable,
                      args=(),
                      env={},
                      path=None,
                      uid=None,
                      gid=None,
                      childFDs=None,
                      sessionLeader=False,
                      timeout=None):
        """
        Replacement for posixbase.PosixReactorBase.spawnProcess with added
        process group / session and timeout support, and support for
        non-POSIX platforms and PTYs removed.
        """

        # Use the default reactor instead of self.reactor as not all (testing)
        # reactors provide _checkProcessArgs, and it's harmless anyway.
        args, env = twisted.internet.reactor._checkProcessArgs(args, env)
        return ProcessGroupProcess(self.reactor, executable, args, env, path,
                                   processProtocol, uid, gid, childFDs,
                                   sessionLeader, timeout)
class DNSQueryMonitoringProtocol(monitor.LoopingCheckMonitoringProtocol):
    """
    Monitor that checks a DNS server by doing repeated DNS queries
    """

    __name__ = 'DNSQuery'

    TIMEOUT_QUERY = 5  # default query timeout (seconds)

    # Failure types that count as a (cleanly handled) failed check.
    catchList = (defer.TimeoutError, error.DomainError,
                 error.AuthoritativeDomainError, error.DNSFormatError,
                 error.DNSNameError, error.DNSQueryRefusedError,
                 error.DNSQueryTimeoutError, error.DNSServerError,
                 error.DNSUnknownError)

    metric_labelnames = ('service', 'host', 'monitor')
    metric_keywords = {
        'namespace': 'pybal',
        'subsystem': 'monitor_' + __name__.lower()
    }

    dnsquery_metrics = {
        'request_duration_seconds':
        Gauge(
            'request_duration_seconds',
            'DNS query duration',
            labelnames=metric_labelnames + ('result',),
            **metric_keywords)
    }

    def __init__(self, coordinator, server, configuration, reactor=None):
        """Constructor"""

        # Call ancestor constructor
        super(DNSQueryMonitoringProtocol, self).__init__(
            coordinator, server, configuration, reactor=reactor)

        self.toQuery = self._getConfigInt('timeout', self.TIMEOUT_QUERY)
        self.hostnames = self._getConfigStringList('hostnames')
        self.failOnNXDOMAIN = self._getConfigBool('fail-on-nxdomain', False)

        self.resolver = None          # created in run()
        self.DNSQueryDeferred = None  # in-flight query, if any
        self.checkStartTime = None

    def run(self):
        """Start the monitoring"""

        super(DNSQueryMonitoringProtocol, self).run()

        # Create a resolver. Use the DNS server IPv4 addresses instead of
        # self.server.ip as Twisted's createResolver (< 17.1.0) does not
        # support querying a nameserver over IPv6.
        self.resolver = client.createResolver(
            [(ip, 53) for ip in self.server.ip4_addresses])

    def stop(self):
        """Stop the monitoring"""

        super(DNSQueryMonitoringProtocol, self).stop()

        if self.DNSQueryDeferred is not None:
            self.DNSQueryDeferred.cancel()

    def check(self):
        """Periodically called method that does a single uptime check."""

        # Pick a random hostname and randomly query for either an A or an
        # AAAA record.
        hostname = random.choice(self.hostnames)
        query = dns.Query(hostname, type=random.choice([dns.A, dns.AAAA]))

        self.checkStartTime = runtime.seconds()

        if query.type == dns.A:
            self.DNSQueryDeferred = self.resolver.lookupAddress(
                hostname, timeout=[self.toQuery])
        elif query.type == dns.AAAA:
            self.DNSQueryDeferred = self.resolver.lookupIPV6Address(
                hostname, timeout=[self.toQuery])

        self.DNSQueryDeferred.addCallback(
            self._querySuccessful, query).addErrback(
                self._queryFailed, query).addBoth(self._checkFinished)
        return self.DNSQueryDeferred

    # NOTE: tuple parameter unpacking in the signature is Python 2-only
    # syntax.
    def _querySuccessful(self, (answers, authority, additional), query):
        """Called when the DNS query finished successfully."""

        if query.type in (dns.A, dns.AAAA):
            # Format the returned addresses of the queried type for the
            # check report.
            addressFamily = query.type == dns.A and socket.AF_INET or socket.AF_INET6
            addresses = " ".join([
                socket.inet_ntop(addressFamily, r.payload.address)
                for r in answers if r.type == query.type
            ])
            resultStr = "%s %s %s" % (query.name,
                                      dns.QUERY_TYPES[query.type], addresses)
        else:
            resultStr = None
        duration = runtime.seconds() - self.checkStartTime
        self.report('DNS query successful, %.3f s' % (duration) +
                    (resultStr and (': ' + resultStr) or ""))
        self._resultUp()
        self.dnsquery_metrics['request_duration_seconds'].labels(
            result='successful', **self.metric_labels).set(duration)

        # Pass the original resolver result on down the callback chain.
        return answers, authority, additional
class ProxyFetchMonitoringProtocol(monitor.LoopingCheckMonitoringProtocol):
    """
    Monitor that checks server uptime by repeatedly fetching a certain URL
    """

    TIMEOUT_GET = 5    # default fetch timeout (seconds)

    HTTP_STATUS = 200  # default expected HTTP status code

    __name__ = 'ProxyFetch'

    from twisted.internet import error
    from twisted.web import error as weberror

    # Failure types that count as a (cleanly handled) failed check.
    catchList = (defer.TimeoutError, weberror.Error, error.ConnectError,
                 error.DNSLookupError)

    metric_labelnames = ('service', 'host', 'monitor')
    metric_keywords = {
        'namespace': 'pybal',
        'subsystem': 'monitor_' + __name__.lower()
    }

    proxyfetch_metrics = {
        'request_duration_seconds':
        Gauge(
            'request_duration_seconds',
            'HTTP(S) request duration',
            labelnames=metric_labelnames + ('result', ),  # TODO: statuscode
            **metric_keywords)
    }

    def __init__(self, coordinator, server, configuration={}, reactor=None):
        """Constructor"""

        # Call ancestor constructor
        super(ProxyFetchMonitoringProtocol,
              self).__init__(coordinator, server, configuration,
                             reactor=reactor)

        self.toGET = self._getConfigInt('timeout', self.TIMEOUT_GET)
        self.expectedStatus = self._getConfigInt('http_status',
                                                 self.HTTP_STATUS)

        self.getPageDeferred = None  # in-flight fetch, if any

        self.checkStartTime = None
        # List of candidate URLs; check() picks one at random per check.
        self.URL = self._getConfigStringList('url')

    def stop(self):
        """Stop all running and/or upcoming checks"""

        super(ProxyFetchMonitoringProtocol, self).stop()

        if self.getPageDeferred is not None:
            self.getPageDeferred.cancel()

    def check(self):
        """Periodically called method that does a single uptime check."""

        if not self.active:
            log.warn(
                "ProxyFetchMonitoringProtocol.check() called while active == False"
            )
            return

        # FIXME: Use GET as a workaround for a Twisted bug with HEAD/Content-length
        # where it expects a body and throws a PartialDownload failure

        url = random.choice(self.URL)

        self.checkStartTime = seconds()
        self.getPageDeferred = self.getProxyPage(
            url,
            method='GET',
            host=self.server.ip,
            port=self.server.port,
            status=self.expectedStatus,
            timeout=self.toGET,
            followRedirect=False,
            reactor=self.reactor).addCallbacks(
                self._fetchSuccessful,
                self._fetchFailed).addBoth(self._checkFinished)
        return self.getPageDeferred

    def _fetchSuccessful(self, result):
        """Called when getProxyPage is finished successfully."""

        duration = seconds() - self.checkStartTime
        self.report('Fetch successful, %.3f s' % (duration))
        self._resultUp()

        self.proxyfetch_metrics['request_duration_seconds'].labels(
            result='successful', **self.metric_labels).set(duration)

        return result

    def _fetchFailed(self, failure):
        """Called when getProxyPage finished with a failure."""

        # Don't act as if the check failed if we cancelled it
        if failure.check(defer.CancelledError):
            return None

        duration = seconds() - self.checkStartTime
        self.report('Fetch failed, %.3f s' % (duration), level=logging.WARN)

        self._resultDown(failure.getErrorMessage())

        self.proxyfetch_metrics['request_duration_seconds'].labels(
            result='failed', **self.metric_labels).set(duration)

        # Re-raise anything that is not an expected failure type.
        failure.trap(*self.catchList)

    def _checkFinished(self, result):
        """
        Called when getProxyPage finished with either success or failure,
        to do after-check cleanups.
        """

        self.checkStartTime = None

        return result

    @staticmethod
    def getProxyPage(url,
                     contextFactory=None,
                     host=None,
                     port=None,
                     status=None,
                     reactor=twisted.internet.reactor,
                     *args,
                     **kwargs):
        """Download a web page as a string. (modified from twisted.web.client.getPage)

        Download a page. Return a deferred, which will callback with a
        page (as a string) or errback with a description of the error.

        See HTTPClientFactory to see what extra args can be passed.
        """

        # A redirect-accepting factory is used only when the caller expects
        # a 301-303 status; NOTE(review): 300 and 304+ deliberately fall
        # through to the plain factory -- confirm that excluding 300/304 is
        # intended.
        if status > 300 and status < 304:
            factory = RedirHTTPClientFactory(url, *args, **kwargs)
        else:
            factory = client.HTTPClientFactory(url, *args, **kwargs)

        host = host or factory.host
        port = port or factory.port

        if factory.scheme == 'https':
            from twisted.internet import ssl
            if contextFactory is None:
                contextFactory = ssl.ClientContextFactory()
            reactor.connectSSL(host, port, factory, contextFactory)
        else:
            reactor.connectTCP(host, port, factory)
        return factory.deferred
class MonitoringProtocol(object):
    """
    Base class for all monitoring protocols. Declares a few obligatory
    abstract methods, and some commonly useful functions.
    """

    # Printable monitor name; overridden by subclasses.
    __name__ = ''

    metric_labelnames = ('service', 'host', 'monitor')
    metric_keywords = {
        'labelnames': metric_labelnames,
        'namespace': 'pybal',
        'subsystem': 'monitor'
    }

    # Class-level metrics shared by all monitors, distinguished by the
    # (service, host, monitor) labels set per instance in __init__.
    metrics = {
        'up_transitions_total':
        Counter('up_transitions_total', 'Monitor up transition count',
                **metric_keywords),
        'down_transitions_total':
        Counter('down_transitions_total', 'Monitor down transition count',
                **metric_keywords),
        'up_results_total':
        Counter('up_results_total', 'Monitor up result count',
                **metric_keywords),
        'down_results_total':
        Counter('down_results_total', 'Monitor down result count',
                **metric_keywords),
        'status':
        Gauge('status', 'Monitor up status', **metric_keywords)
    }

    def __init__(self, coordinator, server, configuration={}, reactor=None):
        """Constructor"""

        self.coordinator = coordinator
        self.server = server
        self.configuration = configuration
        self.up = None  # None, False (Down) or True (Up)
        self.reactor = reactor or twisted.internet.reactor

        self.active = False     # True while run() ... stop()
        self.firstCheck = True  # no result has been recorded yet

        self._shutdownTriggerID = None

        self.metric_labels = {
            'service': self.server.lvsservice.name,
            'host': self.server.host,
            'monitor': self.name()
        }

    def run(self):
        """Start the monitoring"""

        assert self.active is False
        self.active = True

        # Install cleanup handler
        self._shutdownTriggerID = self.reactor.addSystemEventTrigger(
            'before', 'shutdown', self.stop)

    def stop(self):
        """Stop the monitoring; cancel any running or upcoming checks"""

        self.active = False

        if self._shutdownTriggerID is not None:
            # Remove cleanup handler
            self.reactor.removeSystemEventTrigger(self._shutdownTriggerID)
            self._shutdownTriggerID = None

    def name(self):
        """Returns a printable name for this monitor"""

        return self.__name__

    def _resultUp(self):
        """
        Sets own monitoring state to Up and notifies the coordinator
        if this implies a state change.
        """

        self.metrics['up_results_total'].labels(**self.metric_labels).inc()

        # NOTE(review): parses as (self.active and self.up is False) or
        # self.firstCheck -- the very first result is always treated as a
        # transition, even when inactive; confirm that is intended.
        if self.active and self.up is False or self.firstCheck:
            self.up = True
            self.firstCheck = False
            if self.coordinator:
                self.coordinator.resultUp(self)

            self.metrics['up_transitions_total'].labels(
                **self.metric_labels).inc()
            self.metrics['status'].labels(**self.metric_labels).set(1)

    def _resultDown(self, reason=None):
        """
        Sets own monitoring state to Down and notifies the
        coordinator if this implies a state change.
        """

        self.metrics['down_results_total'].labels(**self.metric_labels).inc()

        # NOTE(review): same precedence as in _resultUp above.
        if self.active and self.up is True or self.firstCheck:
            self.up = False
            self.firstCheck = False
            if self.coordinator:
                self.coordinator.resultDown(self, reason)

            self.metrics['down_transitions_total'].labels(
                **self.metric_labels).inc()
            self.metrics['status'].labels(**self.metric_labels).set(0)

    def report(self, text, level=logging.DEBUG):
        """Common method for reporting/logging check results."""

        msg = "%s (%s): %s" % (self.server.host, self.server.textStatus(),
                               text)
        s = "%s %s" % (self.server.lvsservice.name, self.__name__)
        _log(msg, level, s)

    def _getConfigBool(self, optionname, default=None):
        # Option keys are namespaced by the lowercased monitor name,
        # e.g. 'proxyfetch.timeout'.
        return self.configuration.getboolean(
            '%s.%s' % (self.__name__.lower(), optionname), default)

    def _getConfigInt(self, optionname, default=None):
        return self.configuration.getint(
            '%s.%s' % (self.__name__.lower(), optionname), default)

    def _getConfigString(self, optionname):
        val = self.configuration[self.__name__.lower() + '.' + optionname]
        if type(val) == str:
            return val
        else:
            raise ValueError("Value of %s is not a string" % optionname)

    def _getConfigStringList(self, optionname, locals=None, globals=None):
        """
        Takes a (string) value, eval()s it and checks whether it
        consists of either a single string, or a single list of strings.
        """

        key = self.__name__.lower() + '.' + optionname
        # SECURITY NOTE: eval() of a configuration value -- only safe as
        # long as the configuration source is trusted.
        val = eval(self.configuration[key], locals, globals)
        if type(val) == str:
            return val
        elif (isinstance(val, list)
              and all(isinstance(x, basestring) for x in val) and val):
            # Checked that each list member is a string and that list is not
            # empty.
            return val
        else:
            raise ValueError(
                "Value of %s is not a string or stringlist" % optionname)
class BGPPeering(BGPFactory):
    """Class managing a BGP session with a peer"""

    implements(IBGPPeering, interfaces.IPushProducer)

    metric_labelnames = {'local_asn', 'peer'}
    metric_keywords = {
        'labelnames': metric_labelnames,
        'namespace': 'pybal',
        'subsystem': 'bgp'
    }
    metrics = {
        'bgp_session_established':
        Gauge('session_established', 'BGP session established',
              **metric_keywords)
    }

    def __init__(self, myASN=None, peerAddr=None):
        self.myASN = myASN
        self.peerAddr = peerAddr
        self.peerId = None
        self.fsm = BGPFactory.FSM(self)
        # NOTE(review): this builds a set of the two constants themselves,
        # not a set containing the (AFI, SAFI) pair -- verify whether
        # set([(AFI_INET, SAFI_UNICAST)]) was intended.
        self.addressFamilies = set((AFI_INET, SAFI_UNICAST))
        self.inConnections = []   # server-side protocol instances
        self.outConnections = []  # client-side protocol instances
        self.estabProtocol = None  # reference to the BGPProtocol instance in ESTAB state
        self.consumers = set()

        self.metric_labels = {'local_asn': self.myASN, 'peer': self.peerAddr}
        self.metrics = BGPPeering.metrics
        self.metrics['bgp_session_established'].labels(
            **self.metric_labels).set(0)

    def __setattr__(self, name, value):
        # Intercept (re)assignments of estabProtocol to log and export
        # session established/gone transitions; only fires when the
        # attribute already exists and actually changes.
        if name == 'estabProtocol' and name in self.__dict__ and getattr(
                self, name) != value:
            if value:
                msg = 'established'
                metric_value = 1
            else:
                msg = 'gone'
                metric_value = 0
            self.log(
                "BGP session %s for ASN %s peer %s" %
                (msg, self.myASN, self.peerAddr), logging.INFO)
            self.metrics['bgp_session_established'].labels(
                **self.metric_labels).set(metric_value)
        # old style class, super().__setattr__() doesn't work
        # https://docs.python.org/2/reference/datamodel.html#customizing-attribute-access
        self.__dict__[name] = value

    def buildProtocol(self, addr):
        """Builds a BGP protocol instance"""

        self.log("Building a new BGP protocol instance")

        p = BGPFactory.buildProtocol(self, addr)
        if p is not None:
            self._initProtocol(p, addr)
            self.outConnections.append(p)

        return p

    def takeServerConnection(self, addr):
        """Builds a BGP protocol instance for a server connection"""

        p = BGPFactory.buildProtocol(self, addr)
        if p is not None:
            self._initProtocol(p, addr)
            self.inConnections.append(p)

        return p

    def _initProtocol(self, protocol, addr):
        """Initializes a BGPProtocol instance"""

        protocol.bgpPeering = self

        # Hand over the FSM
        protocol.fsm = self.fsm
        protocol.fsm.protocol = protocol

        # Create a new fsm for internal use for now
        self.fsm = BGPFactory.FSM(self)
        self.fsm.state = protocol.fsm.state

        # Connections to the well-known BGP port are outgoing (Connect);
        # everything else is an incoming connection (Active).
        if addr.port == PORT:
            protocol.fsm.state = ST_CONNECT
        else:
            protocol.fsm.state = ST_ACTIVE

        # Set up callback and error handlers
        protocol.deferred.addCallbacks(self.sessionEstablished,
                                       self.protocolError)

    def clientConnectionFailed(self, connector, reason):
        """Called when the outgoing connection failed."""

        self.log("Client connection failed: %s" % reason.getErrorMessage(),
                 logging.INFO)

        # There is no protocol instance yet at this point.
        # Catch a possible NotificationException
        try:
            self.fsm.connectionFailed()
        except NotificationSent, e:
            # TODO: error handling
            pass
class FSM(object): class BGPTimer(object): """ Timer class with a slightly different Timer interface than the Twisted DelayedCall interface """ def __init__(self, callable): self.delayedCall = None self.callable = callable def cancel(self): """Cancels the timer if it was running, does nothing otherwise""" try: self.delayedCall.cancel() except (AttributeError, error.AlreadyCalled, error.AlreadyCancelled): pass def reset(self, secondsFromNow): """Resets an already running timer, or starts it if it wasn't running.""" try: self.delayedCall.reset(secondsFromNow) except (AttributeError, error.AlreadyCalled, error.AlreadyCancelled): self.delayedCall = reactor.callLater(secondsFromNow, self.callable) def active(self): """Returns True if the timer was running, False otherwise.""" try: return self.delayedCall.active() except AttributeError: return False protocol = None state = ST_IDLE largeHoldTime = 4 * 60 sendNotificationWithoutOpen = True # No bullshit eventMethods = { 1: 'manualStart', 2: 'manualStop', 3: 'automaticStart', 9: 'connectRetryTimeEvent', 10: 'holdTimeEvent', 11: 'keepAliveEvent', 12: 'delayOpenEvent', 13: 'idleHoldTimeEvent', 17: 'connectionMade', 18: 'connectionFailed', 19: 'openReceived', 20: 'openReceived', 21: 'headerError', 22: 'openMessageError', 23: 'openCollisionDump', 24: 'versionError', 25: 'notificationReceived', 26: 'keepAliveReceived', 27: 'updateReceived', 28: 'updateError' } bgpTimers = { 'connectRetryTimer', 'holdTimer', 'keepAliveTimer', 'delayOpenTimer', 'idleHoldTimer' } metric_labelnames = {'local_asn', 'state', 'local_ip', 'remote_ip', 'side'} metric_keywords = { 'labelnames': metric_labelnames, 'namespace': 'pybal', 'subsystem': 'bgp' } metrics = { 'bgp_session_state_count': Gauge('session_state_count', 'Number of sessions in the specified state', **metric_keywords) } def __init__(self, bgpPeering=None, protocol=None): self.bgpPeering = bgpPeering self.protocol = protocol self.connectRetryCounter = 0 self.connectRetryTime = 30 
self.connectRetryTimer = FSM.BGPTimer(self.connectRetryTimeEvent) self.holdTime = 3 * 60 self.holdTimer = FSM.BGPTimer(self.holdTimeEvent) self.keepAliveTime = self.holdTime / 3 self.keepAliveTimer = FSM.BGPTimer(self.keepAliveEvent) self.allowAutomaticStart = True self.allowAutomaticStop = False self.delayOpen = False self.delayOpenTime = 30 self.delayOpenTimer = FSM.BGPTimer(self.delayOpenEvent) self.dampPeerOscillations = True self.idleHoldTime = 30 self.idleHoldTimer = FSM.BGPTimer(self.idleHoldTimeEvent) self.metric_labels = { 'state': stateDescr[self.state], 'local_asn': None, 'local_ip': None, 'remote_ip': None, 'side': None } if self.bgpPeering: self.metric_labels['local_asn'] = self.bgpPeering.myASN self.initial_idle_state = True def log(self, msg, lvl=logging.DEBUG): s = "bgp.FSM@{}".format(hex(id(self))) if self.protocol is not None: s += " peer {}".format(self.protocol.peerAddrStr()) elif self.bgpPeering is not None: s += " peer {}".format(self.bgpPeering.peerAddr) _log(msg, lvl, s) def __setattr__(self, name, value): if name == 'state' and value != getattr(self, name): self.log("State is now: %s" % stateDescr[value], logging.INFO) self.__update_metrics(value) super(FSM, self).__setattr__(name, value) def __update_metrics(self, new_state): if self.metric_labels['local_ip'] and self.metric_labels['remote_ip']: if not self.initial_idle_state: self.metrics['bgp_session_state_count'].labels( **self.metric_labels).dec() else: self.initial_idle_state = False self.metric_labels['state'] = stateDescr[new_state] self.metrics['bgp_session_state_count'].labels( **self.metric_labels).inc() def manualStart(self): """ Should be called when a BGP ManualStart event (event 1) is requested. Note that a protocol instance does not yet exist at this point, so this method requires some support from BGPPeering.manualStart(). 
""" if self.state == ST_IDLE: self.connectRetryCounter = 0 self.connectRetryTimer.reset(self.connectRetryTime) def manualStop(self): """Should be called when a BGP ManualStop event (event 2) is requested.""" if self.state != ST_IDLE: self.protocol.sendNotification(ERR_CEASE, 0) # Stop all timers for timer in (self.connectRetryTimer, self.holdTimer, self.keepAliveTimer, self.delayOpenTimer, self.idleHoldTimer): timer.cancel() if self.bgpPeering is not None: self.bgpPeering.releaseResources(self.protocol) self._closeConnection() self.connectRetryCounter = 0 self.state = ST_IDLE raise NotificationSent(self.protocol, ERR_CEASE, 0) def automaticStart(self, idleHold=False): """ Should be called when a BGP Automatic Start event (event 3) is requested. Returns True or False to indicate BGPPeering whether a connection attempt should be initiated. """ if self.state == ST_IDLE: if idleHold: self.idleHoldTimer.reset(self.idleHoldTime) return False else: self.connectRetryCounter = 0 self.connectRetryTimer.reset(self.connectRetryTime) return True def connectionMade(self): """Should be called when a TCP connection has successfully been established with the peer. (events 16, 17) """ if self.state in (ST_CONNECT, ST_ACTIVE): # State Connect, Event 16 or 17 if self.delayOpen: self.connectRetryTimer.cancel() self.delayOpenTimer.reset(self.delayOpenTime) else: self.connectRetryTimer.cancel() if self.bgpPeering: self.bgpPeering.completeInit(self.protocol) self.protocol.sendOpen() self.holdTimer.reset(self.largeHoldTime) self.state = ST_OPENSENT def connectionFailed(self): """Should be called when the associated TCP connection failed, or was lost. 
(event 18)""" if self.state == ST_CONNECT: # State Connect, event 18 if self.delayOpenTimer.active(): self.connectRetryTimer.reset(self.connectRetryTime) self.delayOpenTimer.cancel() self.state = ST_ACTIVE else: self.connectRetryTimer.cancel() self._closeConnection() if self.bgpPeering: self.bgpPeering.releaseResources(self.protocol) self.state = ST_IDLE elif self.state == ST_ACTIVE: # State Active, event 18 self.connectRetryTimer.reset(self.connectRetryTime) self.delayOpenTimer.cancel() if self.bgpPeering: self.bgpPeering.releaseResources(self.protocol) self.connectRetryCounter += 1 # TODO: osc damping self.state = ST_IDLE elif self.state == ST_OPENSENT: # State OpenSent, event 18 if self.bgpPeering: self.bgpPeering.releaseResources(self.protocol) self._closeConnection() self.connectRetryTimer.reset(self.connectRetryTime) self.state = ST_ACTIVE elif self.state in (ST_OPENCONFIRM, ST_ESTABLISHED): self._errorClose() def openReceived(self): """Should be called when a BGP Open message was received from the peer. 
(events 19, 20) """ if self.state in (ST_CONNECT, ST_ACTIVE): if self.delayOpenTimer.active(): # State Connect, event 20 self.connectRetryTimer.cancel() if self.bgpPeering: self.bgpPeering.completeInit(self.protocol) self.delayOpenTimer.cancel() self.protocol.sendOpen() self.protocol.sendKeepAlive() if self.holdTime != 0: self.keepAliveTimer.reset(self.keepAliveTime) self.holdTimer.reset(self.holdTime) else: # holdTime == 0 self.keepAliveTimer.cancel() self.holdTimer.cancel() self.state = ST_OPENCONFIRM else: # State Connect, event 19 self._errorClose() elif self.state == ST_OPENSENT: if not self.delayOpen: # State OpenSent, event 19 self.delayOpenTimer.cancel() self.connectRetryTimer.cancel() self.protocol.sendKeepAlive() if self.holdTime > 0: self.keepAliveTimer.reset(self.keepAliveTime) self.holdTimer.reset(self.holdTime) self.state = ST_OPENCONFIRM else: # State OpenSent, event 20 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) elif self.state == ST_OPENCONFIRM: if not self.delayOpen: # State OpenConfirm, events 19 self.log("Running collision detection") # Perform collision detection self.protocol.collisionDetect() else: # State OpenConfirm, event 20 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) elif self.state == ST_ESTABLISHED: # State Established, event 19 or 20 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def headerError(self, suberror, data=''): """ Should be called when an invalid BGP message header was received. (event 21) """ if self.state != ST_IDLE: self.protocol.sendNotification(ERR_MSG_HDR, suberror, data) # Note: RFC4271 states that we should send ERR_FSM in the # Established state, which contradicts earlier statements. 
self._errorClose() raise NotificationSent(self.protocol, ERR_MSG_HDR, suberror, data) def openMessageError(self, suberror, data=''): """ Should be called when an invalid BGP Open message was received. (event 22) """ if self.state != ST_IDLE: self.protocol.sendNotification(ERR_MSG_OPEN, suberror, data) # Note: RFC4271 states that we should send ERR_FSM in the # Established state, which contradicts earlier statements. self._errorClose() raise NotificationSent(self.protocol, ERR_MSG_OPEN, suberror, data) def keepAliveReceived(self): """ Should be called when a BGP KeepAlive packet was received from the peer. (event 26) """ if self.state == ST_OPENCONFIRM: # State OpenSent, event 26 self.holdTimer.reset(self.holdTime) self.state = ST_ESTABLISHED self.protocol.deferred.callback(self.protocol) elif self.state == ST_ESTABLISHED: # State Established, event 26 self.holdTimer.reset(self.holdTime) elif self.state in (ST_CONNECT, ST_ACTIVE): # States Connect, Active, event 26 self._errorClose() elif self.state == ST_OPENSENT: # State OpenSent, event 26 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def versionError(self): """ Should be called when a BGP Notification Open Version Error message was received from the peer. (event 24) """ if self.state in (ST_OPENSENT, ST_OPENCONFIRM): # State OpenSent/OpenConfirm, event 24 self.connectRetryTimer.cancel() if self.bgpPeering: self.bgpPeering.releaseResources(self.protocol) self._closeConnection() self.state = ST_IDLE elif self.state in (ST_CONNECT, ST_ACTIVE, ST_ESTABLISHED): # State Connect/Active/Established, event 24 self._errorClose() def notificationReceived(self, error, suberror): """ Should be called when a BGP Notification message was received from the peer. 
(events 24, 25) """ if error == ERR_MSG_OPEN and suberror == 1: # Event 24 self.versionError() else: if self.state != ST_IDLE: # State != Idle, events 24, 25 if self.state == ST_OPENSENT: self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() def updateReceived(self, update): """Called when a valid BGP Update message was received. (event 27)""" if self.state == ST_ESTABLISHED: # State Established, event 27 if self.holdTime != 0: self.holdTimer.reset(self.holdTime) self.bgpPeering.update(update) elif self.state in (ST_ACTIVE, ST_CONNECT): # States Active, Connect, event 27 self._errorClose() elif self.state in (ST_OPENSENT, ST_OPENCONFIRM): # States OpenSent, OpenConfirm, event 27 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def updateError(self, suberror, data=''): """Called when an invalid BGP Update message was received. (event 28)""" if self.state == ST_ESTABLISHED: # State Established, event 28 self.protocol.sendNotification(ERR_MSG_UPDATE, suberror, data) self._errorClose() raise NotificationSent(self.protocol, ERR_MSG_UPDATE, suberror, data) elif self.state in (ST_ACTIVE, ST_CONNECT): # States Active, Connect, event 28 self._errorClose() elif self.state in (ST_OPENSENT, ST_OPENCONFIRM): # States OpenSent, OpenConfirm, event 28 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def openCollisionDump(self): """ Called when the collision detection algorithm determined that the associated connection should be dumped. (event 23) """ self.log("Collided, closing") if self.state == ST_IDLE: return elif self.state in (ST_OPENSENT, ST_OPENCONFIRM, ST_ESTABLISHED): self.protocol.sendNotification(ERR_CEASE, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_CEASE, 0) def delayOpenEvent(self): """Called when the DelayOpenTimer expires. 
(event 12)""" assert (self.delayOpen) self.log("Delay Open event") if self.state == ST_CONNECT: # State Connect, event 12 self.protocol.sendOpen() self.holdTimer.reset(self.largeHoldTime) self.state = ST_OPENSENT elif self.state == ST_ACTIVE: # State Active, event 12 self.connectRetryTimer.cancel() self.delayOpenTimer.cancel() if self.bgpPeering: self.bgpPeering.completeInit(self.protocol) self.protocol.sendOpen() self.holdTimer.reset(self.largeHoldTime) self.state = ST_OPENSENT elif self.state != ST_IDLE: # State OpenSent, OpenConfirm, Established, event 12 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def keepAliveEvent(self): """Called when the KeepAliveTimer expires. (event 11)""" if self.state in (ST_OPENCONFIRM, ST_ESTABLISHED): # State OpenConfirm, Established, event 11 self.protocol.sendKeepAlive() if self.holdTime > 0: self.keepAliveTimer.reset(self.keepAliveTime) elif self.state in (ST_CONNECT, ST_ACTIVE): self._errorClose() elif self.state == ST_OPENSENT: self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def holdTimeEvent(self): """Called when the HoldTimer expires. (event 10)""" if self.state in (ST_OPENSENT, ST_OPENCONFIRM, ST_ESTABLISHED): # States OpenSent, OpenConfirm, Established, event 10 self.protocol.sendNotification(ERR_HOLD_TIMER_EXPIRED, 0) self._errorClose() # TODO: peer osc damping elif self.state in (ST_CONNECT, ST_ACTIVE): self._errorClose() def connectRetryTimeEvent(self): """Called when the ConnectRetryTimer expires. 
(event 9)""" if self.state == ST_CONNECT: # State Connect, event 9 self._closeConnection() self.connectRetryTimer.reset(self.connectRetryTime) self.delayOpenTimer.cancel() # Initiate TCP connection if self.bgpPeering: self.bgpPeering.connectRetryEvent(self.protocol) elif self.state == ST_ACTIVE: # State Active, event 9 self.connectRetryTimer.reset(self.connectRetryTime) # Initiate TCP connection if self.bgpPeering: self.bgpPeering.connectRetryEvent(self.protocol) self.state = ST_CONNECT elif self.state != ST_IDLE: # State OpenSent, OpenConfirm, Established, event 12 self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() raise NotificationSent(self.protocol, ERR_FSM, 0) def idleHoldTimeEvent(self): """Called when the IdleHoldTimer expires. (event 13)""" if self.state == ST_IDLE: if self.bgpPeering: self.bgpPeering.automaticStart(idleHold=False) else: fsmError = False if self.state in (ST_OPENSENT, ST_OPENCONFIRM, ST_ESTABLISHED): fsmError = True self.protocol.sendNotification(ERR_FSM, 0) self._errorClose() if fsmError: raise NotificationSent(self.protocol, ERR_FSM, 0) def updateSent(self): """Called by the protocol instance when it just sent an Update message.""" if self.holdTime > 0: self.keepAliveTimer.reset(self.keepAliveTime) def _errorClose(self): """Internal method that closes a connection and returns the state to IDLE. """ # Stop the timers for timer in (self.connectRetryTimer, self.delayOpenTimer, self.holdTimer, self.keepAliveTimer): timer.cancel() # Release BGP resources (routes, etc) if self.bgpPeering: self.bgpPeering.releaseResources(self.protocol) self._closeConnection() self.connectRetryCounter += 1 self.state = ST_IDLE def _closeConnection(self): """Internal method that close the connection if a valid BGP protocol instance exists. """ if self.protocol is not None: self.protocol.closeConnection() # Remove from connections list if self.bgpPeering: self.bgpPeering.connectionClosed(self.protocol)
class BGPFailover:
    """Class for maintaining BGP sessions to routers for IP address failover"""

    # Class-level registries shared by all instances; services register
    # themselves through the associateService classmethod before setup().
    prefixes = {}    # (afi, safi) -> set of prefixes to advertise
    peerings = {}    # peer address -> NaiveBGPPeering instance
    ipServices = {}  # prefix -> list of {'lvsservice', 'af', 'med'} dicts

    metric_keywords = {'namespace': 'pybal', 'subsystem': 'bgp'}
    metrics = {'enabled': Gauge('enabled', 'BGP Enabled', **metric_keywords)}

    def __init__(self, globalConfig):
        """Constructor.

        Reads the 'bgp' switch from globalConfig; when disabled, only the
        'enabled' metric is set and no further configuration is parsed.
        """
        # Store globalconfig so setup() can check whether BGP is enabled.
        self.globalConfig = globalConfig
        if not globalConfig.getboolean('bgp', False):
            self.metrics['enabled'].set(0)
            return

        self.metrics['enabled'].set(1)
        self._parseConfig()

    def _parseConfig(self):
        """Parse the BGP-related global configuration variables.

        Raises:
            ValueError: when a next-hop is missing for a configured
                address family, or 'bgp-peer-address' is not a list.
        """
        log.info("parsing BGP config", system="bgp")
        self.myASN = self.globalConfig.getint('bgp-local-asn')
        self.asPath = self.globalConfig.get('bgp-as-path', str(self.myASN))
        self.asPath = [int(asn) for asn in self.asPath.split()]
        self.defaultMED = self.globalConfig.getint('bgp-med', 0)

        # A next-hop is only mandatory for address families that actually
        # have prefixes registered (EAFP: try the lookup, complain only if
        # the family is in use).
        try:
            self.nexthopIPv4 = self.globalConfig['bgp-nexthop-ipv4']
        except KeyError:
            if (bgp.AFI_INET, bgp.SAFI_UNICAST) in BGPFailover.prefixes:
                raise ValueError(
                    "IPv4 BGP NextHop (global configuration variable 'bgp-nexthop-ipv4') not set"
                )

        try:
            self.nexthopIPv6 = self.globalConfig['bgp-nexthop-ipv6']
        except KeyError:
            if (bgp.AFI_INET6, bgp.SAFI_UNICAST) in BGPFailover.prefixes:
                raise ValueError(
                    "IPv6 BGP NextHop (global configuration variable 'bgp-nexthop-ipv6') not set"
                )

        bgpPeerAddress = self.globalConfig.get('bgp-peer-address', '').strip()
        # A bare address is wrapped into a single-element list literal.
        if not bgpPeerAddress.startswith('['):
            bgpPeerAddress = "[ \"{}\" ]".format(bgpPeerAddress)

        # SECURITY NOTE: eval() executes arbitrary code from the config
        # file; this is only safe because the config is operator-trusted.
        # ast.literal_eval would be a safer drop-in for list literals.
        self.peerAddresses = eval(bgpPeerAddress)
        # Explicit validation rather than assert, which is stripped
        # when Python runs with -O.
        if not isinstance(self.peerAddresses, list):
            raise ValueError(
                "'bgp-peer-address' must evaluate to a list, got {!r}".format(
                    self.peerAddresses))

    def setup(self):
        """Start BGP peerings and listen for incoming BGP connections.

        No-op when BGP is disabled in the global configuration.

        Raises:
            Exception: re-raised when peering setup fails.
            CannotListenError: re-raised when a listening socket cannot
                be bound.
        """
        if not self.globalConfig.getboolean('bgp', False):
            return

        try:
            advertisements = self.buildAdvertisements()

            for peerAddr in self.peerAddresses:
                peering = bgp.NaiveBGPPeering(self.myASN, peerAddr)
                peering.setEnabledAddressFamilies(set(self.prefixes.keys()))
                peering.setAdvertisements(advertisements)

                log.info("Starting BGP session with peer {}".format(peerAddr),
                         system="bgp")
                peering.automaticStart()

                self.peerings[peerAddr] = peering
                # Withdraw announcements cleanly when pybal shuts down.
                reactor.addSystemEventTrigger('before', 'shutdown',
                                              self.closeSession, peering)
        except Exception:
            log.critical("Could not set up BGP peering instances.",
                         system="bgp")
            raise
        else:
            # Bind on the IPs listed in 'bgp_local_ips'. Default to
            # localhost v4 and v6 if no IPs have been specified in the
            # configuration.
            # SECURITY NOTE: eval() on a config value — see _parseConfig.
            bgp_local_ips = eval(self.globalConfig.get('bgp-local-ips', '[""]'))
            bgp_local_port = self.globalConfig.getint('bgp-local-port', bgp.PORT)
            # Try to listen on the BGP port, not fatal if fails
            for ip in bgp_local_ips:
                try:
                    reactor.listenTCP(bgp_local_port,
                                      bgp.BGPServerFactory(self.peerings),
                                      interface=ip)
                except CannotListenError as e:
                    log.critical("Could not listen for BGP connections: "
                                 + str(e), system="bgp")
                    raise

    def closeSession(self, peering):
        """Withdraw all announcements and stop the given peering.

        Returns the deferred/result of the peering's manualStop().
        """
        log.info("Clearing session to {}".format(peering.peerAddr),
                 system="bgp")
        # Withdraw all announcements
        peering.setAdvertisements(set())
        return peering.manualStop()

    def buildAdvertisements(self):
        """Build the set of BGP Advertisements for all registered prefixes.

        Returns:
            set of bgp.Advertisement, one per (prefix, address family),
            carrying origin, AS path, next-hop and MED attributes.

        Raises:
            ValueError: for an unsupported address family.
        """
        baseAttrs = attrs.AttributeDict(
            [attrs.OriginAttribute(),
             attrs.ASPathAttribute(self.asPath)])

        advertisements = set()
        for af in self.prefixes:
            afAttrs = bgp.AttributeDict(baseAttrs)
            # IPv4 uses a plain NextHop attribute; IPv6 carries the
            # next-hop inside MP-Reach-NLRI (RFC 4760).
            if af[0] == bgp.AFI_INET:
                afAttrs[attrs.NextHopAttribute] = attrs.NextHopAttribute(
                    self.nexthopIPv4)
            elif af[0] == bgp.AFI_INET6:
                afAttrs[
                    attrs.MPReachNLRIAttribute] = attrs.MPReachNLRIAttribute(
                        (af[0], af[1], IPv6IP(self.nexthopIPv6), []))
            else:
                raise ValueError("Unsupported address family {}".format(af))

            for prefix in self.prefixes[af]:
                attributes = bgp.AttributeDict(afAttrs)
                # This service IP may use a non-default MED
                med = self.ipServices[prefix][0][
                    'med']  # Guaranteed to exist, may be None
                if med is None:
                    attributes[attrs.MEDAttribute] = attrs.MEDAttribute(
                        self.defaultMED)
                else:
                    attributes[attrs.MEDAttribute] = attrs.MEDAttribute(med)
                attributes = attrs.FrozenAttributeDict(attributes)
                advertisements.add(bgp.Advertisement(prefix, attributes, af))

        return advertisements

    @classmethod
    def associateService(cls, ip, lvsservice, med):
        """Register an LVS service IP for BGP advertisement.

        Args:
            ip: service IP as a string; ':' marks it as IPv6.
            lvsservice: the LVSService instance owning this IP.
            med: MED value for this IP, or None for the default.

        Raises:
            ValueError: when another service already registered this IP
                with a different MED.
        """
        if ':' not in ip:
            af = (bgp.AFI_INET, bgp.SAFI_UNICAST)
            prefix = IPv4IP(ip)
        else:
            af = (bgp.AFI_INET6, bgp.SAFI_UNICAST)
            prefix = IPv6IP(ip)

        # All services need to agree on the same MED for this IP
        if prefix in cls.ipServices and med != cls.ipServices[prefix][0][
                'med']:
            raise ValueError(
                "LVS service {} MED value {} differs from other MED values for IP {}"
                .format(lvsservice.name, med, ip))

        service_state = {'lvsservice': lvsservice, 'af': af, 'med': med}
        cls.ipServices.setdefault(prefix, []).append(service_state)
        cls.prefixes.setdefault(af, set()).add(prefix)