def on_agent_option(self, name, value):
    self.tracer.trace3("on_agent_option: key={}, value={}".format(name, value))
    if name == "insert_vrf_routes":
        if value == "ip":
            self.insert_routes_vrf(0, INITIAL_ROUTES, "blue")
        elif value == "ipv6":
            self.insert_v6_routes_vrf(0, INITIAL_ROUTES, "blue")
    elif name == "delete_vrf_routes":
        if value == "ip":
            self.delete_routes_vrf(0, INITIAL_ROUTES, "blue")
        elif value == "ipv6":
            self.delete_v6_routes_vrf(0, INITIAL_ROUTES, "blue")
    elif ((name == "cleanup" or name == "cleanup_v6" or
           name == "cleanup_new_api" or name == "cleanup_v6_new_api") and
          value == "start"):
        self.clear_routes()
    elif ((name == "cleanup" or name == "cleanup_new_api") and value == "done"):
        self.isV6Phase = True
        self.phase = 0
        self.timeout_time_is(eossdk.now() + 5)
    elif name == "cleanup_v6" and value == "done":
        self.isV6Phase = False
        self.isNewApi = True
        self.phase = 0
        self.timeout_time_is(eossdk.now() + 5)

def handle_phase(self):
    self.tracer.trace0("Starting phase {}".format(self.phase))
    if self.phase == 0:
        self.clear_routes()
        self.timeout_time_is(eossdk.now() + 20)
    elif self.phase == 1:
        self.insert_initial_routes()
        self.timeout_time_is(eossdk.now() + 65)
    elif self.phase == 2:
        self.re_insert_routes()
    self.tracer.trace0("Finished phase {}".format(self.phase))
    self.phase += 1

def stop(self):
    assert not self.shutdown_in_progress_
    self.shutdown_in_progress_ = True
    debug("Telling tcollector to die")
    self.module_.ALIVE = False

    def do_stop():
        debug("Joining main thread")
        self.main_thread_.join()
        debug("Joining ReaderThread thread")
        self.reader_thread_.join()
        debug("Joining SenderThread thread")
        self.sender_thread_.join()
        debug("Killing all remaining collectors")
        for col in list(self.module_.all_living_collectors()):
            col.shutdown()
        # Unregister the collectors...
        self.module_.COLLECTORS.clear()
        debug("Shutdown complete, updating running status")
        self.tcollector_running_ = False
        # Notify that shutdown is complete
        self.shutdown_in_progress_ = False

    # AFAIK we can't join the threads asynchronously, and each thread may
    # take several seconds to join, so join the threads with another thread...
    # Kind of a kludge really.
    threading.Thread(target=do_stop, name="stopTcollector").start()
    # Set up a timeout handler to poll for stopTcollector thread completion.
    self.timeout_time_is(eossdk.now() + 1)

def on_initialized(self):
    self.tracer.trace0("Initialized")
    syslog.syslog("IPCheck Initialized")
    self.agentMgr.status_set("Status:", "Administratively Up")

    IPv4 = self.agentMgr.agent_option("IPv4")
    if not IPv4:
        # No IPv4 list of IPs initially set
        self.agentMgr.status_set("IPv4 Ping List:", "None")
    else:
        # Handle the initial state
        self.on_agent_option("IPv4", IPv4)

    IPv6 = self.agentMgr.agent_option("IPv6")
    if not IPv6:
        # No IPv6 list of IPs initially set
        self.agentMgr.status_set("IPv6 Ping List:", "None")
    else:
        # Handle the initial state
        self.on_agent_option("IPv6", IPv6)

    # Let's check the extra parameters and see if we should override the defaults.
    TESTINTERVAL = self.agentMgr.agent_option("CHECKINTERVAL")
    if TESTINTERVAL:
        global CHECKINTERVAL
        CHECKINTERVAL = TESTINTERVAL
    PINGS = self.agentMgr.agent_option("PINGCOUNT")
    if PINGS:
        global PINGCOUNT
        PINGCOUNT = PINGS

    # Start our handler now.
    self.timeout_time_is(eossdk.now())

def stop(self):
    assert not self.shutdown_in_progress_
    self.shutdown_in_progress_ = True
    debug("Telling tcollector to die")
    self.module_.ALIVE = False

    def do_stop():
        debug("Joining main thread")
        self.main_thread_.join()
        debug("Joining ReaderThread thread")
        self.reader_thread_.join()
        debug("Joining SenderThread thread")
        self.sender_thread_.join()
        debug("Killing all remaining collectors")
        for col in list(self.module_.all_living_collectors()):
            col.shutdown()
        # Unregister the collectors...
        self.module_.COLLECTORS.clear()
        debug("Shutdown complete, updating running status")
        self.tcollector_running_ = False
        # Notify that shutdown is complete
        self.shutdown_in_progress_ = False

    # AFAIK we can't join the threads asynchronously, and each thread may
    # take several seconds to join, so join the threads with another thread...
    # Kind of a kludge really.
    threading.Thread(target=do_stop, name="stopTcollector").start()
    # Set up a timeout handler to poll for stopTcollector thread completion.
    self.next_timeout_is(eossdk.now() + 1)

def resolve_config(self):
    self.tracer.trace2("Resolving all of our configured tunnels")
    for host in self.remote_switches.itervalues():
        for tunnel in host.egress_tunnels.itervalues():
            tunnel.last_update_time = time.time() + STARTUP_GRACEPERIOD
            self.resolve_egress_tunnel(tunnel)
    self.timeout_time_is(eossdk.now() + POLL_TIME)

def on_timeout(self):
    """ Called when we've tried to shutdown the tcollector process
    and need to wait for it to finish. Since we can't get notified
    asynchronously, this is done out of a timer callback. """
    if self.shutdown_in_progress_:
        # Not yet complete, check again in a second.
        self.next_timeout_is(eossdk.now() + 1)
    else:
        # tcollector shutdown complete. Check to make sure
        # we weren't re-enabled while shutting down.
        self._maybe_connect()

def on_timeout(self):
    """ Called when we've tried to shutdown the tcollector process
    and need to wait for it to finish. Since we can't get notified
    asynchronously, this is done out of a timer callback. """
    if self.shutdown_in_progress_:
        # Not yet complete, check again in a second.
        self.timeout_time_is(eossdk.now() + 1)
    else:
        # tcollector shutdown complete. Check to make sure
        # we weren't re-enabled while shutting down.
        self._maybe_connect()

def __init__(self, sdk, config_file, policy_handler, poll_interval=0.5):
    self.config_file_ = config_file
    self.sdk_ = sdk
    self.policy_handler_ = policy_handler
    self.poll_interval_ = poll_interval
    self.wm_ = pyinotify.WatchManager()
    mask = pyinotify.IN_MODIFY | pyinotify.IN_CREATE | pyinotify.IN_DELETE
    handler = functools.partial(InotifyHandler, handler=policy_handler)
    # Allow coalescing, so that delete/recreate (as opposed to modify) doesn't
    # cause us to delete the policy.
    self.notifier_ = pyinotify.Notifier(self.wm_, handler, timeout=10)
    self.notifier_.coalesce_events()
    self.watch_ = self.wm_.watch_transient_file(self.config_file_, mask, handler)
    eossdk.TimeoutHandler.__init__(self, self.sdk_.get_timeout_mgr())
    self.timeout_time_is(eossdk.now())

def on_timeout(self):
    # Keep global lists of IPs that are dead. We'll use those to suppress repeat notifications.
    global DEADIPV4
    global DEADIPV6

    IPv4 = self.agentMgr.agent_option("IPv4")
    if IPv4:
        EachAddress = IPv4.split(',')
        for host in EachAddress:
            pingstatus = pingDUT(4, str(host), PINGCOUNT)
            # After ping status, let's go over all the various test cases below.
            if pingstatus == True:
                # It's alive.
                # Check to see if it was in our dead list.
                if host in DEADIPV4:
                    # Notify that it's back up.
                    syslog.syslog('Next HOP %s is back up' % str(host))
                    DEADIPV4.remove(host)
            else:
                # It's not alive.
                if host not in DEADIPV4:
                    syslog.syslog('Next HOP %s is down' % str(host))
                    DEADIPV4.append(host)

    # Do the IPv6 section now.
    IPv6 = self.agentMgr.agent_option("IPv6")
    if IPv6:
        EachAddress = IPv6.split(',')
        for host in EachAddress:
            pingstatus = pingDUT(6, str(host), PINGCOUNT)
            # After ping status, let's go over all the various test cases below.
            if pingstatus == True:
                # It's alive.
                # Check to see if it was in our dead list.
                if host in DEADIPV6:
                    # Notify that it's back up.
                    syslog.syslog('Next HOP %s is back up' % str(host))
                    DEADIPV6.remove(host)
            else:
                # It's not alive.
                if host not in DEADIPV6:
                    syslog.syslog('Next HOP %s is down' % str(host))
                    DEADIPV6.append(host)

    self.timeout_time_is(eossdk.now() + int(CHECKINTERVAL))

def on_initialized(self):
    self.tracer.trace0("Initialized")
    syslog.syslog("toptalkers Initialized")
    self.agentMgr.status_set("Status:", "Administratively Up")

    # Set up all our options.
    global CHECKINTERVAL
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        self.on_agent_option("CHECKINTERVAL", self.agentMgr.agent_option("CHECKINTERVAL"))
    else:
        # We'll just use the default time specified by the global variable.
        self.agentMgr.status_set("CHECKINTERVAL:", "%s" % CHECKINTERVAL)

    global MAXFILESIZE
    if self.agentMgr.agent_option("MAXFILESIZE"):
        self.on_agent_option("MAXFILESIZE", self.agentMgr.agent_option("MAXFILESIZE"))
    else:
        # We'll just use the default MAXFILESIZE specified by the global variable.
        self.agentMgr.status_set("MAXFILESIZE:", "%s" % MAXFILESIZE)

    global HOURSOLD
    if self.agentMgr.agent_option("HOURSOLD"):
        self.on_agent_option("HOURSOLD", self.agentMgr.agent_option("HOURSOLD"))
    else:
        # We'll just use the default HOURSOLD specified by the global variable.
        self.agentMgr.status_set("HOURSOLD:", "%s" % HOURSOLD)

    # If the DB file does not exist, then create the DB.
    if not os.path.exists(SQLDBFILE):
        syslog.syslog("DB File does not exist. Creating.")
        self.create_db()

    # Start sfacctd.
    self.start_sfacctd()

    # Start our handler now.
    self.timeout_time_is(eossdk.now())

def on_timeout(self):
    """ Time to send some packets to our neighbors! Our poller fired,
    so we should send out our heartbeat packets. We also check if we
    haven't heard about any of our tunnels recently, and if so, mark
    them as dead. """
    cur_time = time.time()
    for host in self.remote_switches.itervalues():
        liveness_dict = host.liveness_dict(cur_time)
        host.last_tx_msg_id += 1
        if host.last_tx_msg_id > MAX_INT:
            host.last_tx_msg_id = 1
        for key, tunnel in host.egress_tunnels.iteritems():
            msg = Message(self.pid, key, host.last_tx_msg_id, liveness_dict)
            self.send_packet(host.destination_ip, tunnel, msg)
            if tunnel.is_alive and (
                    time.time() - tunnel.last_update_time > TIMEOUT_TIME):
                # There have been no updates to this tunnel at all
                # within our timeout period.
                tunnel.is_alive = False
                self.handle_tunnel_death(host.destination_ip, key, tunnel)
    # Schedule us to be called again in the future
    self.timeout_time_is(eossdk.now() + POLL_TIME)

def on_timeout(self):
    '''
    This is the function/method where we do the exciting stuff :-)
    '''
    # If CONFIGCHECK is not 1 (i.e. ok), then we won't do anything. It means we have a config error.
    if self.CONFIGCHECK == 1:
        # Let's check our HTTP address & REGEX and see if it's up or down.
        _web_check = self.web_check()
        if _web_check == 1:
            # Now we have to do all our health checking logic here...
            # If we are here, then we are up.
            self.agentMgr.status_set("HealthStatus:", "UP")
            if self.CURRENTSTATUS == 0:
                # We were down but are now up, so change the configuration and set CURRENTSTATUS to 1.
                # Run CONF_RECOVER ********
                syslog.syslog("HTTP host back up. Changing Configuration.")
                self.change_config('RECOVER')
                self.CURRENTSTATUS = 1
                self.FAILITERATION = 0
            elif self.FAILITERATION > 0:
                # This means we had at least one miss but did not change the config; just log and reset the counter to 0.
                syslog.syslog("HTTP host back up. Clearing FAILITERATION semaphore.")
                self.agentMgr.status_set("HealthStatus:", "UP")
                self.FAILITERATION = 0
        elif _web_check == 0:
            # We are down.
            self.FAILITERATION += 1
            if self.CURRENTSTATUS == 0:
                # This means we've already changed the config. Do nothing.
                pass
            else:
                # These are strings, force them to ints.
                if self.agentMgr.agent_option("FAILCOUNT"):
                    MAXFAILCOUNT = self.agentMgr.agent_option("FAILCOUNT")
                else:
                    # Else we'll use the default value of FAILCOUNT.
                    MAXFAILCOUNT = self.FAILCOUNT
                if int(self.FAILITERATION) >= int(MAXFAILCOUNT):
                    # Host is definitely down. Change config.
                    # RUN CONF_FAIL
                    syslog.syslog("HTTP HOST is down. Changing configuration.")
                    self.change_config('FAIL')
                    self.agentMgr.status_set("HealthStatus:", "FAIL")
                    self.CURRENTSTATUS = 0
        else:
            # We get here if we had some weird exception.
            syslog.syslog("TCPCheck - An exception occurred. Skipping to next interval")

    # Wait for CHECKINTERVAL.
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        self.timeout_time_is(eossdk.now() + int(self.agentMgr.agent_option("CHECKINTERVAL")))
    else:
        self.timeout_time_is(eossdk.now() + int(self.CHECKINTERVAL))

def on_initialized(self):
    global __version__
    self.tracer.trace0("Initialized")
    syslog.syslog("TCPCheck Version %s Initialized" % __version__)
    self.agentMgr.status_set("Status:", "Administratively Up")

    # Let's check and set our state for each option during initialization,
    # i.e. after you do a 'no shut' on the daemon, we'll check each of these
    # and set status.
    # We'll pass this on to on_agent_option to process each of these.
    self.on_agent_option("IPv4", self.agentMgr.agent_option("IPv4"))
    self.on_agent_option("PROTOCOL", self.agentMgr.agent_option("PROTOCOL"))
    self.on_agent_option("TCPPORT", self.agentMgr.agent_option("TCPPORT"))
    self.on_agent_option("CONF_FAIL", self.agentMgr.agent_option("CONF_FAIL"))
    self.on_agent_option("CONF_RECOVER", self.agentMgr.agent_option("CONF_RECOVER"))
    self.on_agent_option("USERNAME", self.agentMgr.agent_option("USERNAME"))
    self.on_agent_option("PASSWORD", self.agentMgr.agent_option("PASSWORD"))
    self.on_agent_option("REGEX", self.agentMgr.agent_option("REGEX"))
    self.on_agent_option("URLPATH", self.agentMgr.agent_option("URLPATH"))
    self.on_agent_option("VRF", self.agentMgr.agent_option("VRF"))

    # Let's check the CHECKINTERVAL, FAILCOUNT and HTTPTIMEOUT parameters and see if we should override the defaults.
    # Note these are the only variables that we have defaults for if the user does not
    # override the value. Everything else, we'll reference the values directly
    # with agentMgr.agent_option("xyz").
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        self.on_agent_option("CHECKINTERVAL", self.agentMgr.agent_option("CHECKINTERVAL"))
    else:
        # We'll just use the default time specified by the class default.
        self.agentMgr.status_set("CHECKINTERVAL:", "%s" % self.CHECKINTERVAL)
    if self.agentMgr.agent_option("FAILCOUNT"):
        self.on_agent_option("FAILCOUNT", self.agentMgr.agent_option("FAILCOUNT"))
    else:
        # We'll just use the default failcount specified by the class default.
        self.agentMgr.status_set("FAILCOUNT: ", "%s" % self.FAILCOUNT)
    # TODO - Perhaps add independent socket & HTTP timeouts?
    if self.agentMgr.agent_option("HTTPTIMEOUT"):
        self.on_agent_option("HTTPTIMEOUT", self.agentMgr.agent_option("HTTPTIMEOUT"))
    else:
        # Since agent_option is not set, we'll just use the default HTTPTIMEOUT specified by the class default.
        self.agentMgr.status_set("HTTPTIMEOUT:", "%s" % self.HTTPTIMEOUT)

    # Some basic mandatory variable checks. We'll check this when we have a
    # 'no shut' on the daemon. Add some notes in comments and Readme.md to recommend
    # a shut and no shut every time you make parameter changes...
    if self.check_vars() == 1:
        self.CONFIGCHECK = 1
    else:
        self.CONFIGCHECK = 0

    # Start our handler now.
    self.agentMgr.status_set("HealthStatus:", "Unknown")
    self.timeout_time_is(eossdk.now())

def __init__(self, sdk, macHandler, poll_interval):
    self._sdk = sdk
    self._poll_interval = poll_interval
    self._macHandler = macHandler
    eossdk.TimeoutHandler.__init__(self, self._sdk.get_timeout_mgr())
    self.timeout_time_is(eossdk.now() + self._poll_interval)

def on_timeout(self):
    # We need our globals here for reference in case they are not set in the configuration.
    global MAXFILESIZE
    global HOURSOLD
    global CHECKINTERVAL

    # Set Last Check so we can see it from the EOS CLI.
    self.agentMgr.status_set("Last Check At:", time.asctime(time.localtime(time.time())))

    # CHECK SIZE...because we don't know if someone did a shutdown/no shut and we have an
    # old file here. To be safe, always check on each iteration, even the first time through.
    # We don't automatically delete the db on startup, because perhaps the user wants to keep historical data.
    try:
        DBSIZE = os.path.getsize(SQLDBFILE)
    except Exception as e:
        # If we get an issue, let's log it, e.g. someone deleted the file?
        syslog.syslog("%s" % e)
        syslog.syslog("db file appears to be unaccessible. Did someone delete %s" % SQLDBFILE)
        # If we get here, we can restart everything.
        self.kill_sfacctd()
        # Sleep for 5 seconds just to let things stabilize before we create a new db.
        time.sleep(5)
        self.create_db()
        self.start_sfacctd()
        DBSIZE = 0

    self.agentMgr.status_set("DB SIZE (Bytes):", "{:,}".format(DBSIZE))

    # Generally, we check the /tmp/sampling.db file size. If it exceeds the defined
    # threshold, log and then delete old entries and vacuum.
    if self.agentMgr.agent_option("MAXFILESIZE"):
        MAXSIZE = self.agentMgr.agent_option("MAXFILESIZE")
    else:
        # Else we'll use the default value of MAXFILESIZE. Always do this, because the user could change
        # this at any time. Best to always check and then use the default if it is not set.
        MAXSIZE = MAXFILESIZE

    # Force to int for the compare because we have a lot of strings here...
    if int(DBSIZE) > int(MAXSIZE):
        syslog.syslog("toptalker DB at %s bytes, running cleanup." % "{:,}".format(DBSIZE))
        # How old do we want the db entries we delete to be?
        if self.agentMgr.agent_option("HOURSOLD"):
            MAXHOURS = self.agentMgr.agent_option("HOURSOLD")
        else:
            # Else we'll use the default value of HOURSOLD.
            MAXHOURS = HOURSOLD
        # TODO: would be a little cleaner to have DELETE and VACUUM as a function.
        try:
            conn = sqlite3.connect(SQLDBFILE)
            conn.row_factory = sqlite3.Row
            db = conn.cursor()
            rows = db.execute(
                "DELETE from acct_v5 where stamp_updated <= datetime('now', '-%s hour');"
                % str(MAXHOURS))
            db.execute("VACUUM")
            conn.commit()
        except sqlite3.Error as e:
            syslog.syslog("%s" % e)
            syslog.syslog("Either db is corrupted or your sampling rate is too high. Deleting and creating a new db")
            # If we get here, then we have a serious issue with the database.
            # It could be corrupted, or we ran out of disk space. As a fail-safe
            # method of dealing with this, we'll kill the sfacctd process, create a new
            # blank db file and then restart. We'll provide a detailed syslog message of the issue.
            # If the filesystem is full (since we need more space for the VACUUM), then notify the user
            # so they know they need to back off on their sflow sampling rate and be more conservative
            # with the db size and retention.
            self.kill_sfacctd()
            # Sleep for 5 seconds just to let things stabilize before we create a new db.
            time.sleep(5)
            try:
                os.remove(SQLDBFILE)
            except:
                # We could use subprocess and use sudo as a sledgehammer,
                # but if we get here, it's because somebody is manually tweaking files.
                # If that is the case, it's better to just error-disable.
                syslog.syslog("Unable to delete old db file. Shutting down agent.")
                self.on_agent_enabled(enabled=False, reason='error disabled')
            # Create a new db and restart.
            self.create_db()
            self.start_sfacctd()
        finally:
            conn.close()
        syslog.syslog("Toptalker db cleanup task complete.")

    if self.agentMgr.agent_option("CHECKINTERVAL"):
        self.timeout_time_is(eossdk.now() + int(self.agentMgr.agent_option("CHECKINTERVAL")))
    else:
        self.timeout_time_is(eossdk.now() + int(CHECKINTERVAL))

def on_timeout(self):
    for intf_id in self.intf_mgr_.intf_iter():
        if intf_id.intf_type() in self.intf_types:
            self.printIntfCounters(intf_id)
    sys.stdout.flush()
    self.timeout_time_is(eossdk.now() + self.interval_)

def on_timeout(self):
    '''
    This is the function/method where we do the exciting stuff :-)
    '''
    # Global variables are needed.
    global CHECKINTERVAL
    global CURRENTSTATUS
    global PINGCOUNT
    global ITERATION

    # Time-stamp the start of this iteration so we can compensate for run time
    # when scheduling the next run below.
    startTime = eossdk.now()

    # Just in case someone changes the options while the daemon is running,
    # we should go ahead and check our parameters on each iteration.
    # If it's not a 1, then we fail the check and will just wait till the next iteration
    # and will show this via the status.
    if self.check_vars() == 1:
        # Here we do all the fun work and testing.
        IPv4 = self.agentMgr.agent_option("IPv4")
        if self.agentMgr.agent_option("PINGCOUNT"):
            PINGS2SEND = self.agentMgr.agent_option("PINGCOUNT")
        else:
            # Else we'll use the default value of PINGCOUNT.
            PINGS2SEND = PINGCOUNT

        # Check state: are we in the UP or FAILED state?
        # If up, let's check each of our addresses.
        # For this particular use case, it's a logical OR for our addresses:
        # if any are up, then we mark this as good; if ALL are down, then we mark it as bad.
        # We also need to track the iteration number, which is important for our holddown.
        #
        # We could just react to a single failure or recovery, but this is not as versatile.
        # What happens if the remote rate-limits pings so we get a false positive? This is why
        # we need to make sure that all the hosts in our list are down before we consider
        # this an issue.

        # Let's test each host in the list and then populate the DEAD or GOOD global list.
        # Then it is easier to do our logic or change it after all the checks.
        global DEADIPV4
        global GOODIPV4
        if IPv4:
            EachAddress = IPv4.split(',')
            for host in EachAddress:
                if SOURCEINTFADDR:
                    pingstatus = self.pingDUTeAPI(4, str(host), PINGS2SEND, SOURCEINTFADDR)
                else:
                    pingstatus = self.pingDUTeAPI(4, str(host), PINGS2SEND)
                # After ping status, let's go over all the various test cases below.
                if pingstatus == True:
                    # It's alive - UP
                    # Check to see if it was in our dead list.
                    if host in DEADIPV4:
                        # Notify that it's back up.
                        syslog.syslog('PingCheck host %s is back up' % str(host))
                        DEADIPV4.remove(host)
                    if host not in GOODIPV4:
                        GOODIPV4.append(host)
                else:
                    # It's not alive - DOWN
                    if host not in DEADIPV4:
                        syslog.syslog('PingCheck host %s is down' % str(host))
                        DEADIPV4.append(host)
                    if host in GOODIPV4:
                        # Need to remove it from our GOOD list.
                        GOODIPV4.remove(host)

        # We need to have some local variables to use for HOLDUP and HOLDDOWN because the admin
        # might change the values from the default. So let's just check this on each iteration.
        # But if the admin changes this in the middle of an iteration check, we should make sure ITERATION
        # is greater than or equal to the HOLDDOWN or HOLDUP values so we don't get stuck.
        if self.agentMgr.agent_option("HOLDDOWN"):
            HOLDDOWNLOCAL = self.agentMgr.agent_option("HOLDDOWN")
        else:
            HOLDDOWNLOCAL = HOLDDOWN
        if self.agentMgr.agent_option("HOLDUP"):
            HOLDUPLOCAL = self.agentMgr.agent_option("HOLDUP")
        else:
            HOLDUPLOCAL = HOLDUP

        # Now we have all the ping state for each host. Let's do our additional logic here.
        # The current implementation is a logical OR, so all we need is at least one host in the GOODIPV4 list and we pass.
        if len(GOODIPV4) > 0:
            # We have some life here...now we need to determine whether to recover or not based on our HOLDDOWN.
            if CURRENTSTATUS == 0:
                # We were down; now determine if we should recover yet.
                if ITERATION >= int(HOLDDOWNLOCAL):
                    # Recover.
                    CURRENTSTATUS = 1
                    ITERATION = 0
                    syslog.syslog("PingCheck Recovering. Changing configure for recovered state.")
                    # RUN CONFIG change.
                    self.change_config('RECOVER')
                else:
                    # We need to wait till we hit our HOLDDOWN counter so we dampen a flapping condition if one exists.
                    ITERATION += 1
        else:
            # We get here when everything is down...nothing in the GOODIPV4 list.
            # Determine: are we already down? If so, noop. If not, then we need to determine if we are at HOLDDOWN.
            if CURRENTSTATUS == 1:
                # Determine if we need to do something.
                if ITERATION >= int(HOLDUPLOCAL):
                    syslog.syslog("PingCheck Failure State. Changing configuration for failed state")
                    # Run the config change for the failure.
                    self.change_config('FAIL')
                    # Set CURRENTSTATUS to 0; we're now in the failed state.
                    CURRENTSTATUS = 0
                    # Reset ITERATION.
                    ITERATION = 0
                else:
                    ITERATION += 1

        # Set current state via HealthStatus with agentMgr.
        if CURRENTSTATUS == 1:
            self.agentMgr.status_set("Health Status:", "GOOD")
        else:
            self.agentMgr.status_set("Health Status:", "FAIL")
    else:
        # If we failed the config check, then we land here, skip any other processing
        # and set Health status to INACTIVE.
        # Once the config checks out, then we'll change it above with either GOOD or FAIL
        # dependent on our ping checks.
        self.agentMgr.status_set("Health Status:", "INACTIVE")

    # Wait for CHECKINTERVAL, compensating for how long this iteration took.
    runTime = eossdk.now() - startTime
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        if runTime > int(self.agentMgr.agent_option("CHECKINTERVAL")):
            # Run now if CHECKINTERVAL is shorter than the run time.
            self.timeout_time_is(eossdk.now())
        else:
            nextRun = int(self.agentMgr.agent_option("CHECKINTERVAL")) - runTime
            self.timeout_time_is(eossdk.now() + nextRun)
    else:
        if runTime > int(CHECKINTERVAL):
            self.timeout_time_is(eossdk.now())
        else:
            nextRun = int(CHECKINTERVAL) - runTime
            self.timeout_time_is(eossdk.now() + nextRun)

def sleep(self):
    """Go to sleep for 'refresh_interval' seconds."""
    self.status = "sleeping"
    self.timeout_time_is(eossdk.now() + self.refresh_interval)

def on_initialized(self):
    self.tracer.trace0("Initialized")
    syslog.syslog("PingCheck Initialized")
    self.agentMgr.status_set("Status:", "Administratively Up")

    # We'll pass these on to on_agent_option to process each of them.
    self.on_agent_option("CONF_FAIL", self.agentMgr.agent_option("CONF_FAIL"))
    self.on_agent_option("CONF_RECOVER", self.agentMgr.agent_option("CONF_RECOVER"))

    IPv4 = self.agentMgr.agent_option("IPv4")
    if not IPv4:
        # No IPv4 list of IPs initially set
        self.agentMgr.status_set("IPv4 Ping List:", "None")
    else:
        # Handle the initial state
        self.on_agent_option("IPv4", IPv4)

    # Let's check the extra parameters and see if we should override the defaults.
    # This is mostly for the status message.
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        self.on_agent_option("CHECKINTERVAL", self.agentMgr.agent_option("CHECKINTERVAL"))
    else:
        # We'll just use the default time specified by the class default.
        self.agentMgr.status_set("CHECKINTERVAL:", "%s" % self.CHECKINTERVAL)
    if self.agentMgr.agent_option("PINGCOUNT"):
        self.on_agent_option("PINGCOUNT", self.agentMgr.agent_option("PINGCOUNT"))
    else:
        # We'll just use the default ping count specified by the class default.
        self.agentMgr.status_set("PINGCOUNT:", "%s" % self.PINGCOUNT)
    if self.agentMgr.agent_option("HOLDDOWN"):
        self.on_agent_option("HOLDDOWN", self.agentMgr.agent_option("HOLDDOWN"))
    else:
        # We'll just use the default holddown specified by the class default.
        self.agentMgr.status_set("HOLDDOWN:", "%s" % self.HOLDDOWN)
    if self.agentMgr.agent_option("HOLDUP"):
        self.on_agent_option("HOLDUP", self.agentMgr.agent_option("HOLDUP"))
    else:
        # We'll just use the default holdup specified by the class default.
        self.agentMgr.status_set("HOLDUP:", "%s" % self.HOLDUP)
    if self.agentMgr.agent_option("PINGTIMEOUT"):
        self.on_agent_option("PINGTIMEOUT", self.agentMgr.agent_option("PINGTIMEOUT"))
    else:
        # We'll just use the default ping timeout specified by the class default.
        self.agentMgr.status_set("PINGTIMEOUT:", "%s" % self.PINGTIMEOUT)

    # Some basic mandatory variable checks. We'll check this when we have a
    # 'no shut' on the daemon. Add some notes in comments and Readme.md to recommend
    # a shut and no shut every time you make parameter changes...
    self.agentMgr.status_set("Health Status:", "Unknown")

    # Start our handler now.
    self.timeout_time_is(eossdk.now())

def on_timeout(self):
    self.poll()
    self.timeout_time_is(eossdk.now() + self.poll_interval_)

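# --- Illustrative sketch (assumption, not part of the snippets above) ---
# One plausible shape for the poll() helper called by the preceding on_timeout():
# drain any pending inotify events without blocking, then let the Notifier built
# in the earlier __init__ dispatch them to its InotifyHandler. It uses only
# pyinotify's documented check_events()/read_events()/process_events() calls;
# the method body itself is an assumption for illustration.
def poll(self):
    # check_events() returns quickly because the Notifier was created with timeout=10 (ms).
    if self.notifier_.check_events():
        # Pull the queued events off the inotify file descriptor.
        self.notifier_.read_events()
    # Dispatch anything read above to the registered handler callbacks.
    self.notifier_.process_events()
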
def on_timeout(self):
    '''
    This is the function/method where we do the exciting stuff :-)
    '''
    # Create a time stamp of when we begin. Depending on the ping counts,
    # the number of IPs to check, and the ping timeout, we may need to
    # compensate when we start our next iteration. This will be even more
    # pronounced if the CHECKINTERVAL is very short.
    # This can cause our reaction time to a failed state to drift significantly.
    startTime = eossdk.now()

    # Just in case someone changes the options while the daemon is running,
    # we should go ahead and check our parameters on each iteration.
    # If it's not a 1, then we fail the check and will just wait till the next iteration
    # and will show this via the status.
    if self.check_vars() == 1:
        # Here we do all the fun work and testing.
        #
        # Check state: are we in the UP or FAILED state?
        # If up, let's check each of our addresses.
        # For this particular use case, it's a logical OR for our addresses:
        # if any are up, then we mark this as good; if ALL are down, then we mark it as bad.
        # We also need to track the iteration number, which is important for our holddown.
        #
        # We could just react to a single failure or recovery, but this is not as versatile.
        # What happens if the remote rate-limits pings so we get a false positive? This is why
        # we need to make sure that all the hosts in our list are down before we consider
        # this an issue.

        # Let's test each host in the list and then populate the DEAD or GOOD list.
        # Then it is easier to do our logic or change it after all the checks.
        IPv4 = self.agentMgr.agent_option("IPv4")
        if IPv4:
            EachAddress = IPv4.split(',')
            for host in EachAddress:
                pingstatus = self.pingDUT(str(host))
                # After ping status, let's go over all the various test cases below.
                if pingstatus == True:
                    # It's alive - UP
                    # Check to see if it was in our dead list.
                    if host in self.DEADIPV4:
                        # Notify that it's back up.
                        syslog.syslog('PingCheck host %s is back up' % str(host))
                        self.DEADIPV4.remove(host)
                    if host not in self.GOODIPV4:
                        self.GOODIPV4.append(host)
                else:
                    # It's not alive - DOWN
                    if host not in self.DEADIPV4:
                        syslog.syslog('PingCheck host %s is down' % str(host))
                        self.DEADIPV4.append(host)
                    if host in self.GOODIPV4:
                        # Need to remove it from our GOOD list.
                        self.GOODIPV4.remove(host)

        # We need to have some local variables to use for HOLDUP and HOLDDOWN because the admin
        # might change the values from the default. So let's just check this on each iteration.
        # But if the admin changes this in the middle of an iteration check, we should make sure ITERATION
        # is greater than or equal to the HOLDDOWN or HOLDUP values so we don't get stuck.
        if self.agentMgr.agent_option("HOLDDOWN"):
            HOLDDOWNLOCAL = self.agentMgr.agent_option("HOLDDOWN")
        else:
            HOLDDOWNLOCAL = self.HOLDDOWN
        if self.agentMgr.agent_option("HOLDUP"):
            HOLDUPLOCAL = self.agentMgr.agent_option("HOLDUP")
        else:
            HOLDUPLOCAL = self.HOLDUP

        # Now we have all the ping state for each host. Let's do our additional logic here.
        # The current implementation is a logical OR, so all we need is at least one host in the GOODIPV4 list and we pass.
        if len(self.GOODIPV4) > 0:
            # We have some life here...now we need to determine whether to recover or not based on our HOLDDOWN.
            if self.CURRENTSTATUS == 0:
                # We were down; now determine if we should recover yet.
                if self.ITERATION >= int(HOLDDOWNLOCAL):
                    # Recover.
                    self.CURRENTSTATUS = 1
                    self.ITERATION = 0
                    syslog.syslog("PingCheck Recovering. Changing configure for recovered state.")
                    # RUN CONFIG change.
                    self.change_config('RECOVER')
                else:
                    # We need to wait till we hit our HOLDDOWN counter so we dampen a flapping condition if one exists.
                    self.ITERATION += 1
        else:
            # We get here when everything is down...nothing in the GOODIPV4 list.
            # Determine: are we already down? If so, noop. If not, then we need to determine if we are at HOLDDOWN.
            if self.CURRENTSTATUS == 1:
                # Determine if we need to do something.
                if self.ITERATION >= int(HOLDUPLOCAL):
                    syslog.syslog("PingCheck Failure State. Changing configuration for failed state")
                    # Run the config change for the failure.
                    self.change_config('FAIL')
                    # Set CURRENTSTATUS to 0; we're now in the failed state.
                    self.CURRENTSTATUS = 0
                    # Reset ITERATION.
                    self.ITERATION = 0
                else:
                    self.ITERATION += 1

        # Set current state via HealthStatus with agentMgr.
        if self.CURRENTSTATUS == 1:
            self.agentMgr.status_set("Health Status:", "GOOD")
        else:
            self.agentMgr.status_set("Health Status:", "FAIL")
    else:
        # If we failed the config check, then we land here, skip any other processing
        # and set Health status to INACTIVE.
        # Once the config checks out, then we'll change it above with either GOOD or FAIL
        # dependent on our ping checks.
        self.agentMgr.status_set("Health Status:", "INACTIVE")

    # Wait for CHECKINTERVAL. If memory serves, I think I added this to deal with
    # time drift, especially if many of the pings time out and PINGTIMEOUT is set to a
    # high value. That can make our reaction time too slow
    # and push out a reaction significantly.
    # If the delta between the time we started our iteration and this point of
    # execution exceeds CHECKINTERVAL, then we need to go through our checks again immediately.
    # If all is good, runTime ends up being pretty close to zero for the most part.
    runTime = eossdk.now() - startTime
    if self.agentMgr.agent_option("CHECKINTERVAL"):
        if runTime > int(self.agentMgr.agent_option("CHECKINTERVAL")):
            # Run now if CHECKINTERVAL is shorter than the run time.
            self.timeout_time_is(eossdk.now())
        else:
            nextRun = int(self.agentMgr.agent_option("CHECKINTERVAL")) - runTime
            self.timeout_time_is(eossdk.now() + nextRun)
    else:
        if runTime > int(self.CHECKINTERVAL):
            self.timeout_time_is(eossdk.now())
        else:
            nextRun = int(self.CHECKINTERVAL) - runTime
            self.timeout_time_is(eossdk.now() + nextRun)

def on_initialized(self):
    # Schedule ourselves to run immediately
    self.timeout_time_is(eossdk.now())

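# --- Illustrative sketch (assumption, not part of the snippets above) ---
# A minimal agent that ties the recurring pattern together: register with the
# timeout manager, arm the timer in on_initialized(), do the periodic work in
# on_timeout(), then re-arm the timer. The class name PollingAgent and the
# POLL_INTERVAL value are made up for illustration; the timer calls
# (eossdk.TimeoutHandler, get_timeout_mgr(), timeout_time_is(), eossdk.now())
# are the ones used throughout the snippets above, while the Sdk()/main_loop()
# entry point follows the usual EOS SDK agent boilerplate.
import sys

import eossdk

POLL_INTERVAL = 10  # seconds; assumed default for this sketch


class PollingAgent(eossdk.AgentHandler, eossdk.TimeoutHandler):
    def __init__(self, sdk):
        eossdk.AgentHandler.__init__(self, sdk.get_agent_mgr())
        eossdk.TimeoutHandler.__init__(self, sdk.get_timeout_mgr())
        self.tracer = eossdk.Tracer("PollingAgent")

    def on_initialized(self):
        # Schedule the first run immediately, as in the examples above.
        self.timeout_time_is(eossdk.now())

    def on_timeout(self):
        # Periodic work goes here.
        self.tracer.trace0("Polling")
        # Re-arm the timer so on_timeout() fires again after POLL_INTERVAL.
        self.timeout_time_is(eossdk.now() + POLL_INTERVAL)


if __name__ == "__main__":
    sdk = eossdk.Sdk()
    _ = PollingAgent(sdk)
    sdk.main_loop(sys.argv)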