Example #1
0
    def on_agent_option(self, name, value):
        self.tracer.trace3("on_agent_option: key={}, value={}".format(
            name, value))
        if name == "insert_vrf_routes":
            if value == "ip":
                self.insert_routes_vrf(0, INITIAL_ROUTES, "blue")
            elif value == "ipv6":
                self.insert_v6_routes_vrf(0, INITIAL_ROUTES, "blue")

        elif name == "delete_vrf_routes":
            if value == "ip":
                self.delete_routes_vrf(0, INITIAL_ROUTES, "blue")
            elif value == "ipv6":
                self.delete_v6_routes_vrf(0, INITIAL_ROUTES, "blue")

        elif ((name == "cleanup" or name == "cleanup_v6"
               or name == "cleanup_new_api" or name == "cleanup_v6_new_api")
              and value == "start"):
            self.clear_routes()

        elif ((name == "cleanup" or name == "cleanup_new_api")
              and value == "done"):
            self.isV6Phase = True
            self.phase = 0
            self.timeout_time_is(eossdk.now() + 5)

        elif name == "cleanup_v6" and value == "done":
            self.isV6Phase = False
            self.isNewApi = True
            self.phase = 0
            self.timeout_time_is(eossdk.now() + 5)
Example #2
0
    def handle_phase(self):
        self.tracer.trace0("Starting phase {}".format(self.phase))

        if self.phase == 0:
            self.clear_routes()
            self.timeout_time_is(eossdk.now() + 20)
        elif self.phase == 1:
            self.insert_initial_routes()
            self.timeout_time_is(eossdk.now() + 65)
        elif self.phase == 2:
            self.re_insert_routes()

        self.tracer.trace0("Finished phase {}".format(self.phase))
        self.phase += 1
Example #3
0
    def stop(self):
        assert not self.shutdown_in_progress_
        self.shutdown_in_progress_ = True

        debug("Telling tcollector to die")
        self.module_.ALIVE = False

        def do_stop():
            debug("Joining main thread")
            self.main_thread_.join()
            debug("Joining ReaderThread thread")
            self.reader_thread_.join()
            debug("Joining SenderThread thread")
            self.sender_thread_.join()
            debug("Killing all remaining collectors")
            for col in list(self.module_.all_living_collectors()):
                col.shutdown()
            # Unregister the collectors...
            self.module_.COLLECTORS.clear()
            debug("Shutdown complete, updating running status")
            self.tcollector_running_ = False
            # Notify that shutdown is complete
            self.shutdown_in_progress_ = False

        # AFAIK we can't join the threads asynchronously, and each thread may
        # take several seconds to join, so we join them from a separate thread...
        # Kind of a kludge, really.
        threading.Thread(target=do_stop, name="stopTcollector").start()

        # Set up a timeout handler to poll for stopTcollector thread completion
        self.timeout_time_is(eossdk.now() + 1)
Example #4
0
    def on_initialized(self):
        self.tracer.trace0("Initialized")
        syslog.syslog("IPCheck Initialized")
        self.agentMgr.status_set("Status:", "Administratively Up")
        IPv4 = self.agentMgr.agent_option("IPv4")
        if not IPv4:
            # No IPv4 list of IPs initially set
            self.agentMgr.status_set("IPv4 Ping List:", "None")
        else:
            # Handle the initial state
            self.on_agent_option("IPv4", IPv4)
        IPv6 = self.agentMgr.agent_option("IPv6")
        if not IPv6:
            # No IPv6 list of IPs initially set
            self.agentMgr.status_set("IPv6 Ping List:", "None")
        else:
            # Handle the initial state
            self.on_agent_option("IPv6", IPv6)
        # Let's check the extra parameters and see if we should override the defaults
        TESTINTERVAL = self.agentMgr.agent_option("CHECKINTERVAL")
        if TESTINTERVAL:
            global CHECKINTERVAL
            CHECKINTERVAL = TESTINTERVAL
        PINGS = self.agentMgr.agent_option("PINGCOUNT")
        if PINGS:
            global PINGCOUNT
            PINGCOUNT = PINGS

        #Start our handler now.
        self.timeout_time_is(eossdk.now())
Example #5
0
   def stop(self):
      assert not self.shutdown_in_progress_
      self.shutdown_in_progress_ = True

      debug("Telling tcollector to die")
      self.module_.ALIVE = False

      def do_stop():
         debug("Joining main thread")
         self.main_thread_.join()
         debug("Joining ReaderThread thread")
         self.reader_thread_.join()
         debug("Joining SenderThread thread")
         self.sender_thread_.join()
         debug("Killing all remaining collectors")
         for col in list(self.module_.all_living_collectors()):
            col.shutdown()
         # Unregister the collectors...
         self.module_.COLLECTORS.clear()
         debug("Shutdown complete, updating running status")
         self.tcollector_running_ = False
         # Notify that shutdown is complete
         self.shutdown_in_progress_ = False

      # AFAIK we can't join the threads asynchronously, and each thread may
      # take several seconds to join, so we join them from a separate thread...
      # Kind of a kludge, really.
      threading.Thread(target=do_stop, name="stopTcollector").start()

      # Set up a timeout handler to poll for stopTcollector thread completion
      self.timeout_time_is(eossdk.now() + 1)
Example #6
0
 def resolve_config(self):
    self.tracer.trace2("Resolving all of our configured tunnels")
    for host in self.remote_switches.itervalues():
       for tunnel in host.egress_tunnels.itervalues():
          tunnel.last_update_time = time.time() + STARTUP_GRACEPERIOD
          self.resolve_egress_tunnel(tunnel)
    self.timeout_time_is(eossdk.now() + POLL_TIME)
Example #7
0
 def resolve_config(self):
     self.tracer.trace2("Resolving all of our configured tunnels")
     for host in self.remote_switches.itervalues():
         for tunnel in host.egress_tunnels.itervalues():
             tunnel.last_update_time = time.time() + STARTUP_GRACEPERIOD
             self.resolve_egress_tunnel(tunnel)
     self.timeout_time_is(eossdk.now() + POLL_TIME)
Example #8
0
 def on_timeout(self):
    """ Called when we've tried to shutdown the tcollector process
    and need to wait for it to finish. Since we can't get notified
    asynchronously, this is done out of a timer callback. """
    if self.shutdown_in_progress_:
       # Not yet complete, check again in a second.
       self.timeout_time_is(eossdk.now() + 1)
    else:
       # tcollector shutdown complete. Check to make sure
       # we weren't re-enabled while shutting down.
       self._maybe_connect()
Example #9
0
 def on_timeout(self):
     """ Called when we've tried to shutdown the tcollector process
   and need to wait for it to finish. Since we can't get notified
   asynchronously, this is done out of a timer callback. """
     if self.shutdown_in_progress_:
         # Not yet complete, check again in a second.
         self.timeout_time_is(eossdk.now() + 1)
     else:
         # tcollector shutdown complete. Check to make sure
         # we weren't re-enabled while shutting down.
         self._maybe_connect()
Example #10
0
 def __init__(self, sdk, config_file, policy_handler, poll_interval=0.5):
    self.config_file_ = config_file
    self.sdk_ = sdk
    self.policy_handler_ = policy_handler
    self.poll_interval_ = poll_interval
    self.wm_ = pyinotify.WatchManager()
    mask = pyinotify.IN_MODIFY | pyinotify.IN_CREATE | pyinotify.IN_DELETE
    handler = functools.partial(InotifyHandler, handler=policy_handler)
    # Allow coalescing, so that delete/recreate (as opposed to modify) doesn't
    # cause us to delete the policy.
    self.notifier_ = pyinotify.Notifier(self.wm_, handler, timeout=10)
    self.notifier_.coalesce_events()
    self.watch_ = self.wm_.watch_transient_file(self.config_file_, mask, handler)
    eossdk.TimeoutHandler.__init__(self, self.sdk_.get_timeout_mgr())
    self.timeout_time_is(eossdk.now())
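A note on the watcher above: __init__ only registers the pyinotify watch and arms the first timeout; the events still have to be drained from a timer callback. A minimal sketch of that on_timeout half follows. It is not part of the original example and assumes pyinotify's documented process_events/check_events/read_events calls plus the poll_interval_ field set in __init__.

 def on_timeout(self):
    # Sketch only: drain any pending inotify events without blocking, letting
    # the InotifyHandler installed in __init__ run for each one, then re-arm
    # the poll timer.
    self.notifier_.process_events()
    while self.notifier_.check_events():
       self.notifier_.read_events()
       self.notifier_.process_events()
    self.timeout_time_is(eossdk.now() + self.poll_interval_)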
Example #11
0
    def on_timeout(self):
        #Track which IPs are currently dead. We'll use these lists to suppress repeat notifications.
        global DEADIPV4
        global DEADIPV6
        IPv4 = self.agentMgr.agent_option("IPv4")
        if IPv4:
            EachAddress = IPv4.split(',')
            for host in EachAddress:
                pingstatus = pingDUT(4, str(host), PINGCOUNT)
                #After ping status, let's go over all the various test cases below
                if pingstatus == True:
                    #It's alive
                    #Check to see if it was in our dead list
                    if host in DEADIPV4:
                        #Notify that it's back up.
                        syslog.syslog('Next HOP %s is back up' % str(host))
                        DEADIPV4.remove(host)
                else:
                    #It's not alive
                    if host not in DEADIPV4:
                        syslog.syslog('Next HOP %s is down' % str(host))
                        DEADIPV4.append(host)

        #Do an IPv6 section now
        IPv6 = self.agentMgr.agent_option("IPv6")
        if IPv6:
            EachAddress = IPv6.split(',')
            for host in EachAddress:
                pingstatus = pingDUT(6, str(host), PINGCOUNT)
                #After ping status, let's go over all the various test cases below
                if pingstatus == True:
                    #It's alive
                    #Check to see if it was in our dead list
                    if host in DEADIPV6:
                        #Notify that it's back up.
                        syslog.syslog('Next HOP %s is back up' % str(host))
                        DEADIPV6.remove(host)
                else:
                    #It's not alive
                    if host not in DEADIPV6:
                        syslog.syslog('Next HOP %s is down' % str(host))
                        DEADIPV6.append(host)

        self.timeout_time_is(eossdk.now() + int(CHECKINTERVAL))
Example #12
0
    def on_initialized(self):
        self.tracer.trace0("Initialized")
        syslog.syslog("toptalkers Initialized")
        self.agentMgr.status_set("Status:", "Administratively Up")
        #

        #Set up all our options.
        global CHECKINTERVAL
        if self.agentMgr.agent_option("CHECKINTERVAL"):
            self.on_agent_option("CHECKINTERVAL",
                                 self.agentMgr.agent_option("CHECKINTERVAL"))
        else:
            #global CHECKINTERVAL
            #We'll just use the default time specified by global variable
            self.agentMgr.status_set("CHECKINTERVAL:", "%s" % CHECKINTERVAL)

        global MAXFILESIZE
        if self.agentMgr.agent_option("MAXFILESIZE"):
            self.on_agent_option("MAXFILESIZE",
                                 self.agentMgr.agent_option("MAXFILESIZE"))
        else:
            #We'll just use the default MAXFILESIZE specified by global variable
            self.agentMgr.status_set("MAXFILESIZE:", "%s" % MAXFILESIZE)

        global HOURSOLD
        if self.agentMgr.agent_option("HOURSOLD"):
            self.on_agent_option("HOURSOLD",
                                 self.agentMgr.agent_option("HOURSOLD"))
        else:
            #We'll just use the default HOURSOLD specified by global variable
            self.agentMgr.status_set("HOURSOLD:", "%s" % HOURSOLD)

        #IF DB file does not exist, then create DB
        #Call DB Create function.
        if not os.path.exists(SQLDBFILE):
            syslog.syslog("DB File does not exist. Creating.")
            self.create_db()

        ##START SFACCTD
        self.start_sfacctd()

        #Start our handler now.
        self.timeout_time_is(eossdk.now())
Example #13
0
 def on_timeout(self):
    """ Time to send some packets to our neighbors! Our poller
    fired, so we should send out our heartbeat packets. We also
    check if we haven't heard about any of our tunnels recently, and
    if so, mark them as dead. """
    cur_time = time.time()
    for host in self.remote_switches.itervalues():
       liveness_dict = host.liveness_dict(cur_time)
       host.last_tx_msg_id += 1
       if host.last_tx_msg_id > MAX_INT:
          host.last_tx_msg_id = 1
       for key, tunnel in host.egress_tunnels.iteritems():
          msg = Message(self.pid, key, host.last_tx_msg_id, liveness_dict)
          self.send_packet(host.destination_ip, tunnel, msg)
          if tunnel.is_alive and (
             time.time() - tunnel.last_update_time > TIMEOUT_TIME):
             # There have been no updates to this tunnel at all
             # within our timeout period.
             tunnel.is_alive = False
             self.handle_tunnel_death(host.destination_ip, key, tunnel)
    # Schedule us to be called again in the future
    self.timeout_time_is(eossdk.now() + POLL_TIME)
Example #14
0
 def on_timeout(self):
     """ Time to send some packets to our neighbors! Our poller
   fired, so we should send out our heartbeat packets. We also
   check if we haven't heard about any of our tunnels recently, and
   if so, mark them as dead. """
     cur_time = time.time()
     for host in self.remote_switches.itervalues():
         liveness_dict = host.liveness_dict(cur_time)
         host.last_tx_msg_id += 1
         if host.last_tx_msg_id > MAX_INT:
             host.last_tx_msg_id = 1
         for key, tunnel in host.egress_tunnels.iteritems():
             msg = Message(self.pid, key, host.last_tx_msg_id,
                           liveness_dict)
             self.send_packet(host.destination_ip, tunnel, msg)
             if tunnel.is_alive and (time.time() - tunnel.last_update_time >
                                     TIMEOUT_TIME):
                  # There have been no updates to this tunnel at all
                  # within our timeout period.
                 tunnel.is_alive = False
                 self.handle_tunnel_death(host.destination_ip, key, tunnel)
     # Schedule us to be called again in the future
     self.timeout_time_is(eossdk.now() + POLL_TIME)
Example #15
0
    def on_timeout(self):
        '''
         This is the function/method where we do the exciting stuff :-)
        '''

        # If CONFIGCHECK is not 1 a.k.a. ok, then we won't do anything. It means we have a config error.
        if self.CONFIGCHECK == 1:
            # Let's check our HTTP Address & REGEX and see if it's up or down.
            _web_check = self.web_check()
            if _web_check == 1:
                # Now we have to do all our health checking logic here...
                # If we are here, then we are up
                self.agentMgr.status_set("HealthStatus:", "UP")
                if self.CURRENTSTATUS == 0:
                    # We were down but now we're up, so let's change the configuration and set CURRENTSTATUS to 1
                    # Run CONF_RECOVER ********
                    syslog.syslog("HTTP host back up. Changing Configuration.")
                    self.change_config('RECOVER')
                    self.CURRENTSTATUS = 1
                    self.FAILITERATION = 0
                elif self.FAILITERATION > 0:
                    # This means we had at least one miss but we did not change config; just log and reset the variable to 0
                    syslog.syslog(
                        "HTTP host back up. Clearing FAILITERATION semaphore.")
                    self.agentMgr.status_set("HealthStatus:", "UP")
                    self.FAILITERATION = 0
            elif _web_check == 0:
                # We are down
                self.FAILITERATION += 1
                if self.CURRENTSTATUS == 0:
                    # This means we've already changed config. Do nothing.
                    pass
                else:
                    # agent_option() returns strings; they are cast to int below for the compare
                    if self.agentMgr.agent_option("FAILCOUNT"):
                        MAXFAILCOUNT = self.agentMgr.agent_option("FAILCOUNT")
                    else:
                        # Else we'll use the default value of FAILCOUNT
                        MAXFAILCOUNT = self.FAILCOUNT
                    if int(self.FAILITERATION) >= int(MAXFAILCOUNT):
                        # Host is definitely down. Change config.
                        # RUN CONF_FAIL
                        syslog.syslog(
                            "HTTP HOST is down. Changing configuration.")
                        self.change_config('FAIL')
                        self.agentMgr.status_set("HealthStatus:", "FAIL")
                        self.CURRENTSTATUS = 0

            else:
                # We get here if we had some weird exception
                syslog.syslog(
                    "TCPCheck - An exception occurred. Skipping to next interval"
                )

        # Wait for CHECKINTERVAL
        if self.agentMgr.agent_option("CHECKINTERVAL"):
            self.timeout_time_is(
                eossdk.now() +
                int(self.agentMgr.agent_option("CHECKINTERVAL")))
        else:
            self.timeout_time_is(eossdk.now() + int(self.CHECKINTERVAL))
Example #16
0
    def on_initialized(self):
        global __version__
        self.tracer.trace0("Initialized")
        syslog.syslog("TCPCheck Version %s Initialized" % __version__)
        self.agentMgr.status_set("Status:", "Administratively Up")
        # Let's check and set our state for each option during initialization.
        # i.e. after you do a 'no shut' on the daemon, we'll check each of these
        # and set status.

        # We'll pass this on to on_agent_option to process each of these.
        self.on_agent_option("IPv4", self.agentMgr.agent_option("IPv4"))
        self.on_agent_option("PROTOCOL",
                             self.agentMgr.agent_option("PROTOCOL"))
        self.on_agent_option("TCPPORT", self.agentMgr.agent_option("TCPPORT"))
        self.on_agent_option("CONF_FAIL",
                             self.agentMgr.agent_option("CONF_FAIL"))
        self.on_agent_option("CONF_RECOVER",
                             self.agentMgr.agent_option("CONF_RECOVER"))
        self.on_agent_option("USERNAME",
                             self.agentMgr.agent_option("USERNAME"))
        self.on_agent_option("PASSWORD",
                             self.agentMgr.agent_option("PASSWORD"))
        self.on_agent_option("REGEX", self.agentMgr.agent_option("REGEX"))
        self.on_agent_option("URLPATH", self.agentMgr.agent_option("URLPATH"))
        self.on_agent_option("VRF", self.agentMgr.agent_option("VRF"))

        # Let's check the CHECKINTERVAL, FAILCOUNT and HTTPTIMEOUT parameters and see if we should override the defaults.
        # Note these are the only variables that we have defaults for if the user does not
        # override the value. Everything else, we'll reference the values directly
        # with agentMgr.agent_option("xyz")

        if self.agentMgr.agent_option("CHECKINTERVAL"):
            self.on_agent_option("CHECKINTERVAL",
                                 self.agentMgr.agent_option("CHECKINTERVAL"))
        else:
            # global CHECKINTERVAL
            # We'll just use the default time specified by global variable
            self.agentMgr.status_set("CHECKINTERVAL:",
                                     "%s" % self.CHECKINTERVAL)

        if self.agentMgr.agent_option("FAILCOUNT"):
            self.on_agent_option("FAILCOUNT",
                                 self.agentMgr.agent_option("FAILCOUNT"))
        else:
            # We'll just use the default failcount specified by global variable
            self.agentMgr.status_set("FAILCOUNT: ", "%s" % self.FAILCOUNT)

        # TODO - Perhaps add independent socket & HTTP timeout?

        if self.agentMgr.agent_option("HTTPTIMEOUT"):
            self.on_agent_option("HTTPTIMEOUT",
                                 self.agentMgr.agent_option("HTTPTIMEOUT"))
        else:
            # Since agent_option is not set, we'll just use the default HTTPTIMEOUT specified by global variable
            self.agentMgr.status_set("HTTPTIMEOUT:", "%s" % self.HTTPTIMEOUT)

        # Some basic mandatory variable checks. We'll check this when we have a
        # no shut on the daemon. Add some notes in comment and Readme.md to recommend
        # a shut and no shut every time you make parameter changes...
        if self.check_vars() == 1:
            self.CONFIGCHECK = 1
        else:
            self.CONFIGCHECK = 0
        #Start our handler now.
        self.agentMgr.status_set("HealthStatus:", "Unknown")
        self.timeout_time_is(eossdk.now())
Example #17
0
 def __init__(self, sdk, macHandler, poll_interval):
     self._sdk = sdk
     self._poll_interval = poll_interval
     self._macHandler = macHandler
     eossdk.TimeoutHandler.__init__(self, self._sdk.get_timeout_mgr())
     self.timeout_time_is(eossdk.now() + self._poll_interval)
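For context, the registration in Example #17 is the standard eossdk timer pattern: inherit from eossdk.TimeoutHandler, pass the SDK's timeout manager to its constructor, and re-arm the timer from on_timeout. A minimal self-contained sketch of a polling agent built that way follows; the agent name "PollerAgent" and the 10-second interval are made up for illustration, and the eossdk calls follow the publicly documented Python API.

#!/usr/bin/env python
import sys

import eossdk


class PollerAgent(eossdk.AgentHandler, eossdk.TimeoutHandler):
    # Illustrative sketch, not taken from the examples above.
    def __init__(self, sdk):
        self.agentMgr = sdk.get_agent_mgr()
        eossdk.AgentHandler.__init__(self, self.agentMgr)
        eossdk.TimeoutHandler.__init__(self, sdk.get_timeout_mgr())
        self.tracer = eossdk.Tracer("PollerAgent")

    def on_initialized(self):
        # Schedule the first poll immediately; on_timeout re-arms itself.
        self.timeout_time_is(eossdk.now())

    def on_timeout(self):
        self.tracer.trace0("polling")
        # ... do one unit of work here ...
        self.timeout_time_is(eossdk.now() + 10)


if __name__ == "__main__":
    sdk = eossdk.Sdk()
    agent = PollerAgent(sdk)
    sdk.main_loop(sys.argv)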
Example #18
0
    def on_timeout(self):
        #We need our globals here for reference in case they are not set in the configuration
        global MAXFILESIZE
        global HOURSOLD
        global CHECKINTERVAL

        # Set Last Check so we can see it from the EOS CLI.
        self.agentMgr.status_set("Last Check At:",
                                 time.asctime(time.localtime(time.time())))

        # CHECK SIZE...because we don't know if someone did a shutdown/no shut and we have an
        # old file here. To be safe, always check on each iteration, even the first time through.
        # We don't automatically delete the db on startup, because perhaps the user wants to keep
        # historical data.
        try:
            DBSIZE = os.path.getsize(SQLDBFILE)
        except Exception as e:
            #If we get an issue, let's log it - e.g. someone deleted the file.
            syslog.syslog("%s" % e)
            syslog.syslog(
                "db file appears to be inaccessible. Did someone delete %s?" %
                SQLDBFILE)
            #If we get here, we can restart everything.
            self.kill_sfacctd()
            #sleep for 5 seconds just to let things stabilize before we create a new db
            time.sleep(5)
            self.create_db()
            self.start_sfacctd()
            DBSIZE = 0

        self.agentMgr.status_set("DB SIZE (Bytes):", "{:,}".format(DBSIZE))

        #generally, we check the /tmp/sampling.db file size. If it exceeds the defined
        #threshold, log and then delete old entries and vacuum.
        if self.agentMgr.agent_option("MAXFILESIZE"):
            MAXSIZE = self.agentMgr.agent_option("MAXFILESIZE")
        else:
            #Else we'll use the default value of MAXFILESIZE. Always do this, because user could change
            #this at any time. Best to always check and then use default if it is not set.
            MAXSIZE = MAXFILESIZE

        #force to int for compare because we have a lot of strings here...
        if (int(DBSIZE) > int(MAXSIZE)):
            syslog.syslog("toptalker DB at %s bytes, running cleanup." %
                          "{:,}".format(DBSIZE))

            #How old do we want to delete our db entries?
            if self.agentMgr.agent_option("HOURSOLD"):
                MAXHOURS = self.agentMgr.agent_option("HOURSOLD")
            else:
                #Else we'll use the default value of HOURSOLD
                MAXHOURS = HOURSOLD

            #TODO would be a little cleaner to have DELETE and VACUUM as a function.
            #
            conn = None
            try:
                conn = sqlite3.connect(SQLDBFILE)
                conn.row_factory = sqlite3.Row
                db = conn.cursor()
                rows = db.execute(
                    "DELETE from acct_v5 where stamp_updated <= datetime('now', '-%s hour');"
                    % str(MAXHOURS))
                db.execute("VACUUM")
                conn.commit()
            except sqlite3.Error as e:
                syslog.syslog("%s" % e)
                syslog.syslog(
                    "Either db is corrupted or your sampling rate is too high. Deleting and creating a new db"
                )
                #If we get here, then we have a serious issue with the database.
                #It could be corrupted, or we ran out of disk space. As a fail safe
                #method of dealing with this, we'll kill the sfacctd process, create a new
                #blank db file and then restart. We'll provide a detailed syslog message of the issue.
                #If filesystem is full (since we need more space for the VACUUM), then notify user
                #so they know they need to back off on their sflow sampling rate and be more conservative
                #with the db size and retention.
                self.kill_sfacctd()
                #sleep for 5 seconds just to let things stabilize before we create a new db
                time.sleep(5)
                try:
                    os.remove(SQLDBFILE)
                except OSError:
                    #We could use subprocess and use sudo as a sledgehammer,
                    #but if we get here, it's because somebody is manually tweaking files.
                    #If that is the case, it's better to just error disable.
                    syslog.syslog(
                        "Unable to delete old db file. Shutting down agent.")
                    self.on_agent_enabled(enabled=False,
                                          reason='error disabled')

                #Create new db and restart
                self.create_db()
                self.start_sfacctd()
            finally:
                if conn:
                    conn.close()

            syslog.syslog("Toptalker db cleanup task complete.")

        if self.agentMgr.agent_option("CHECKINTERVAL"):
            self.timeout_time_is(
                eossdk.now() +
                int(self.agentMgr.agent_option("CHECKINTERVAL")))
        else:
            self.timeout_time_is(eossdk.now() + int(CHECKINTERVAL))
Example #19
0
 def on_timeout(self):
     for intf_id in self.intf_mgr_.intf_iter():
         if intf_id.intf_type() in self.intf_types:
             self.printIntfCounters(intf_id)
     sys.stdout.flush()
     self.timeout_time_is(eossdk.now() + self.interval_)
Example #20
0
    def on_timeout(self):
        '''
         This is the function/method where we do the exciting stuff :-)
        '''
        #Global variables are needed
        global CHECKINTERVAL
        global CURRENTSTATUS
        global PINGCOUNT
        global ITERATION

        # Just in case someone changes the options while the daemon is running,
        # we go ahead and check our parameters on each iteration. If check_vars()
        # does not return 1, we fail the check, skip this iteration, and show
        # that via the status.
        # Record when this pass started; runTime below needs startTime even when
        # the checks are skipped.
        startTime = eossdk.now()

        if self.check_vars() == 1:

            #Here we do all the fun work and testing
            IPv4 = self.agentMgr.agent_option("IPv4")
            if self.agentMgr.agent_option("PINGCOUNT"):
                PINGS2SEND = self.agentMgr.agent_option("PINGCOUNT")
            else:
                #Else we'll use the default value of PINGCOUNT
                PINGS2SEND = PINGCOUNT

            #Check state: are we UP or in a FAILED state?
            #If up, let's check each of our addresses.
            #For this particular use case, it's a logical OR for our addresses.
            #If any are up, then we mark this as good
            #If ALL are down, then we mark as bad
            #We also need to mark the iteration number which is important
            # for our holddown number.
            #

            #We could just react to a single failure or recovery. But this is not as versatile.
            #What happens if the remote rate-limits pings and we get a false positive? This is why
            # we need to make sure that all our hosts in our list are down before we consider
            #this an issue.
            #Let's test each host in the list and then populate the DEAD or GOOD global list.
            #Then it is easier to do our logic or change it after all the checks.
            global DEADIPV4
            global GOODIPV4
            if IPv4:
                EachAddress = IPv4.split(',')
                for host in EachAddress:
                    if SOURCEINTFADDR:
                        pingstatus = self.pingDUTeAPI(4, str(host), PINGS2SEND,
                                                      SOURCEINTFADDR)
                    else:
                        pingstatus = self.pingDUTeAPI(4, str(host), PINGS2SEND)
                    #After ping status, let's go over all the various test cases below
                    if pingstatus == True:
                        #It's alive - UP
                        #Check to see if it was in our dead list
                        if host in DEADIPV4:
                            #Notify that it's back up.
                            syslog.syslog('PingCheck host %s is back up' %
                                          str(host))
                            DEADIPV4.remove(host)
                        if host not in GOODIPV4:
                            GOODIPV4.append(host)
                    else:
                        #It's not alive - DOWN
                        if host not in DEADIPV4:
                            syslog.syslog('PingCheck host %s is down' %
                                          str(host))
                            DEADIPV4.append(host)
                        if host in GOODIPV4:
                            #need to remove it from our GOOD list.
                            GOODIPV4.remove(host)

            #We need to have some local variables to use for HOLDUP and HOLDDOWN because the admin
            #might change the values from the default. So let's just check this on each iteration.
            #But if the admin changes this in the middle of an iteration check, we should make sure ITERATION
            # is greater than or equal to the HOLDDOWN or HOLDUP values so we don't get stuck.

            if self.agentMgr.agent_option("HOLDDOWN"):
                HOLDDOWNLOCAL = self.agentMgr.agent_option("HOLDDOWN")
            else:
                HOLDDOWNLOCAL = HOLDDOWN
            if self.agentMgr.agent_option("HOLDUP"):
                HOLDUPLOCAL = self.agentMgr.agent_option("HOLDUP")
            else:
                HOLDUPLOCAL = HOLDUP

            # Now we have all the ping state for each host. Let's do our additional logic here.
            # Current implementation is logical OR. So all we need is at least one host in the GOODIPV4 list and we pass.
            if len(GOODIPV4) > 0:
                # We have some life here...now we need to determine whether to recover or not based on our HOLDDOWN.
                if CURRENTSTATUS == 0:
                    #We were down, now determine if we should recover yet.
                    if ITERATION >= int(HOLDDOWNLOCAL):
                        # Recover
                        CURRENTSTATUS = 1
                        ITERATION = 0
                        syslog.syslog(
                            "PingCheck Recovering. Changing configuration for recovered state."
                        )
                        #RUN CONFIG Change
                        self.change_config('RECOVER')
                    else:
                        ITERATION += 1
                    #We need to wait till we hit our HOLDDOWN counter so we dampen a flapping condition if one exists
            else:
                #We get here when everything is down...nothing in the GOODIPV4 list
                #Determine: are we already down? If so, noop. If not, then we need to determine if we have reached HOLDUP.
                if CURRENTSTATUS == 1:
                    #Determine if we need to do something
                    if ITERATION >= int(HOLDUPLOCAL):
                        syslog.syslog(
                            "PingCheck Failure State. Changing configuration for failed state"
                        )
                        # run config change failure
                        self.change_config('FAIL')
                        #Set Currentstatus to 0, we're now in failed state
                        CURRENTSTATUS = 0
                        #Reset ITERATION
                        ITERATION = 0
                    else:
                        ITERATION += 1

            #Set current state via HealthStatus with agentMgr.
            if CURRENTSTATUS == 1:
                self.agentMgr.status_set("Health Status:", "GOOD")
            else:
                self.agentMgr.status_set("Health Status:", "FAIL")

        else:
            #If we failed the config check, then we land here and just skip any other processing
            #and set Health status to INACTIVE.
            #Once the config checks out, then we'll change it above with either GOOD or FAIL
            #dependent on our ping checks.
            self.agentMgr.status_set("Health Status:", "INACTIVE")

        #Wait for CHECKINTERVAL
        runTime = eossdk.now() - startTime
        if self.agentMgr.agent_option("CHECKINTERVAL"):
            if runTime > int(self.agentMgr.agent_option("CHECKINTERVAL")):
                self.timeout_time_is(eossdk.now(
                ))  # Run now if Checkinterval shorter than run time.
            else:
                nextRun = int(
                    self.agentMgr.agent_option("CHECKINTERVAL")) - runTime
                self.timeout_time_is(eossdk.now() + nextRun)
        else:
            if runTime > int(CHECKINTERVAL):
                self.timeout_time_is(eossdk.now())
            else:
                nextRun = int(CHECKINTERVAL) - runTime
                self.timeout_time_is(eossdk.now() + nextRun)
Example #21
0
 def sleep(self):
     """Go to sleep for 'refresh_interval' seconds."""
     self.status = "sleeping"
     self.timeout_time_is(eossdk.now() + self.refresh_interval)
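The wake-up half of this helper is an on_timeout that flips the status back and resumes work before sleeping again. A small sketch follows; it is not from the original source, and refresh() is a hypothetical stand-in for whatever the agent actually does each cycle.

 def on_timeout(self):
     """Sketch of the wake-up side of sleep(); not from the original source.
     refresh() is a hypothetical stand-in for the agent's per-cycle work."""
     self.status = "running"
     self.refresh()
     self.sleep()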
Example #22
0
 def on_timeout(self):
    for intf_id in self.intf_mgr_.intf_iter():
       if intf_id.intf_type() in self.intf_types:
          self.printIntfCounters(intf_id)
    sys.stdout.flush()
    self.timeout_time_is(eossdk.now() + self.interval_)
Example #23
0
    def on_initialized(self):
        self.tracer.trace0("Initialized")
        syslog.syslog("PingCheck Initialized")
        self.agentMgr.status_set("Status:", "Administratively Up")

        #We'll pass this on to on_agent_option to process each of these.
        self.on_agent_option("CONF_FAIL",
                             self.agentMgr.agent_option("CONF_FAIL"))
        self.on_agent_option("CONF_RECOVER",
                             self.agentMgr.agent_option("CONF_RECOVER"))
        IPv4 = self.agentMgr.agent_option("IPv4")
        if not IPv4:
            # No IPv4 list of IPs initially set
            self.agentMgr.status_set("IPv4 Ping List:", "None")
        else:
            # Handle the initial state
            self.on_agent_option("IPv4", IPv4)

        # Let's check the extra parameters and see if we should override the defaults
        #This is mostly for the status message.
        if self.agentMgr.agent_option("CHECKINTERVAL"):
            self.on_agent_option("CHECKINTERVAL",
                                 self.agentMgr.agent_option("CHECKINTERVAL"))
        else:
            #We'll just use the default time specified by global variable
            self.agentMgr.status_set("CHECKINTERVAL:",
                                     "%s" % self.CHECKINTERVAL)

        if self.agentMgr.agent_option("PINGCOUNT"):
            self.on_agent_option("PINGCOUNT",
                                 self.agentMgr.agent_option("PINGCOUNT"))
        else:
            #We'll just use the default pingcount specified by global variable
            self.agentMgr.status_set("PINGCOUNT:", "%s" % self.PINGCOUNT)

        if self.agentMgr.agent_option("HOLDDOWN"):
            self.on_agent_option("HOLDDOWN",
                                 self.agentMgr.agent_option("HOLDDOWN"))
        else:
            #We'll just use the default holddown specified by global variable
            self.agentMgr.status_set("HOLDDOWN:", "%s" % self.HOLDDOWN)

        if self.agentMgr.agent_option("HOLDUP"):
            self.on_agent_option("HOLDUP",
                                 self.agentMgr.agent_option("HOLDUP"))
        else:
            # We'll just use the default holdup specified by the instance variable
            self.agentMgr.status_set("HOLDUP:", "%s" % self.HOLDUP)

        if self.agentMgr.agent_option("PINGTIMEOUT"):
            self.on_agent_option("PINGTIMEOUT",
                                 self.agentMgr.agent_option("PINGTIMEOUT"))
        else:
            # We'll just use the default PINGTIMEOUT specified by the instance variable
            self.agentMgr.status_set("PINGTIMEOUT:", "%s" % self.PINGTIMEOUT)

        #Some basic mandatory variable checks. We'll check this when we have a
        #no shut on the daemon. Add some notes in comment and Readme.md to recommend
        #a shut and no shut every time you make parameter changes...

        self.agentMgr.status_set("Health Status:", "Unknown")

        #Start our handler now.
        self.timeout_time_is(eossdk.now())
Example #24
0
 def on_timeout(self):
    self.poll()
    self.timeout_time_is(eossdk.now() + self.poll_interval_)
Example #25
0
    def on_timeout(self):
        '''
         This is the function/method where we do the exciting stuff :-)
        '''

        # Create a time stamp of when we begin. Depending on the ping counts,
        # the number of IPs to check, and the ping timeout - we may need to
        # compensate when we start our next iteration. This will be even more
        # pronounced if the CHECKINTERVAL is very short.
        # This can cause our reaction time to a failed state to drift significantly.
        startTime = eossdk.now()

        # Just in case someone changes the options while the daemon is running,
        # we go ahead and check our parameters on each iteration.
        # If check_vars() does not return 1, we fail the check, skip this
        # iteration, and show that via the status.
        if self.check_vars() == 1:

            #Here we do all the fun work and testing

            #Check state: are we UP or in a FAILED state?
            #If up, let's check each of our addresses.
            #For this particular use case, it's a logical OR for our addresses.
            #If any are up, then we mark this as good
            #If ALL are down, then we mark as bad
            #We also need to mark the iteration number which is important
            # for our holddown number.
            #

            #We could just react to a single failure or recovery. But this is not as versatile.
            #What happens if the remote rate-limits pings and we get a false positive? This is why
            # we need to make sure that all our hosts in our list are down before we consider
            #this an issue.
            #Let's test each host in the list and then populate the DEAD or GOOD global list.
            #Then it is easier to do our logic or change it after all the checks.

            IPv4 = self.agentMgr.agent_option("IPv4")
            if IPv4:
                EachAddress = IPv4.split(',')
                for host in EachAddress:
                    pingstatus = self.pingDUT(str(host))
                    #After ping status, let's go over all the various test cases below
                    if pingstatus == True:
                        #It's alive - UP
                        #Check to see if it was in our dead list
                        if host in self.DEADIPV4:
                            #Notify that it's back up.
                            syslog.syslog('PingCheck host %s is back up' %
                                          str(host))
                            self.DEADIPV4.remove(host)
                        if host not in self.GOODIPV4:
                            self.GOODIPV4.append(host)
                    else:
                        #It's not alive - DOWN
                        if host not in self.DEADIPV4:
                            syslog.syslog('PingCheck host %s is down' %
                                          str(host))
                            self.DEADIPV4.append(host)
                        if host in self.GOODIPV4:
                            #need to remove it from our GOOD list.
                            self.GOODIPV4.remove(host)

            # We need to have some local variables to use for HOLDUP and HOLDDOWN because the admin
            # might change the values from the default. So let's just check this on each iteration.
            # But if the admin changes this in the middle of an iteration check, we should make sure ITERATION
            # is greater than or equal to the HOLDDOWN or HOLDUP values so we don't get stuck.

            if self.agentMgr.agent_option("HOLDDOWN"):
                HOLDDOWNLOCAL = self.agentMgr.agent_option("HOLDDOWN")
            else:
                HOLDDOWNLOCAL = self.HOLDDOWN
            if self.agentMgr.agent_option("HOLDUP"):
                HOLDUPLOCAL = self.agentMgr.agent_option("HOLDUP")
            else:
                HOLDUPLOCAL = self.HOLDUP

            # Now we have all the ping state for each host. Let's do our additional logic here.
            # Current implementation is logical OR. So all we need is at least one host in the GOODIPV4 list and we pass.
            if len(self.GOODIPV4) > 0:
                # We have some life here...now we need to determine whether to recover or not based on our HOLDDOWN.
                if self.CURRENTSTATUS == 0:
                    #We were down, now determine if we should recover yet.
                    if self.ITERATION >= int(HOLDDOWNLOCAL):
                        # Recover
                        self.CURRENTSTATUS = 1
                        self.ITERATION = 0
                        syslog.syslog(
                            "PingCheck Recovering. Changing configuration for recovered state."
                        )
                        # RUN CONFIG Change
                        self.change_config('RECOVER')
                    else:
                        self.ITERATION += 1
                    # We need to wait till we hit our HOLDDOWN counter so we dampen a flapping condition if one exists
            else:
                # We get here when everything is down...nothing in the GOODIPV4 list
                # Determine: are we already down? If so, noop. If not, then we need to determine if we have reached HOLDUP.
                if self.CURRENTSTATUS == 1:
                    # Determine if we need to do something
                    if self.ITERATION >= int(HOLDUPLOCAL):
                        syslog.syslog(
                            "PingCheck Failure State. Changing configuration for failed state"
                        )
                        # run config change failure
                        self.change_config('FAIL')
                        #Set Currentstatus to 0, we're now in failed state
                        self.CURRENTSTATUS = 0
                        #Reset ITERATION
                        self.ITERATION = 0
                    else:
                        self.ITERATION += 1

            # Set current state via HealthStatus with agentMgr.
            if self.CURRENTSTATUS == 1:
                self.agentMgr.status_set("Health Status:", "GOOD")
            else:
                self.agentMgr.status_set("Health Status:", "FAIL")

        else:
            # If we failed the config check, then we land here and just skip any other processing
            # and set Health status to INACTIVE.
            # Once the config checks out, then we'll change it above with either GOOD or FAIL
            # dependent on our ping checks.
            self.agentMgr.status_set("Health Status:", "INACTIVE")

        # Wait for CHECKINTERVAL - if memory serves, I added this to deal with
        # time drift, especially if many of the pings time out and PINGTIMEOUT is set to a
        # high value. That can make our reaction time too slow
        # and push out a reaction significantly.
        # If the delta between the time we started our iteration and this point of
        # execution exceeds CHECKINTERVAL, we go through our checks again immediately.
        # If all is good, runTime ends up being pretty close to zero for the most part.
        runTime = eossdk.now() - startTime
        if self.agentMgr.agent_option("CHECKINTERVAL"):
            if runTime > int(self.agentMgr.agent_option("CHECKINTERVAL")):
                self.timeout_time_is(eossdk.now(
                ))  # Run now if Checkinterval shorter than run time.
            else:
                nextRun = int(
                    self.agentMgr.agent_option("CHECKINTERVAL")) - runTime
                self.timeout_time_is(eossdk.now() + nextRun)
        else:
            if runTime > int(self.CHECKINTERVAL):
                self.timeout_time_is(eossdk.now())
            else:
                nextRun = int(self.CHECKINTERVAL) - runTime
                self.timeout_time_is(eossdk.now() + nextRun)
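The rescheduling arithmetic at the end of this example (subtract the elapsed run time from CHECKINTERVAL and never wait a negative amount) can be factored into a small helper. The sketch below is not part of the original agent; next_interval() is a hypothetical name and assumes interval values are whole seconds.

    def next_interval(self, interval, start_time):
        """Hypothetical helper: how long to wait before the next check,
        compensating for the time this pass took. Clamped at zero so a slow
        pass triggers an immediate re-run instead of a negative delay."""
        elapsed = eossdk.now() - start_time
        return max(0, int(interval) - elapsed)

With such a helper, the tail of on_timeout reduces to a single self.timeout_time_is(eossdk.now() + self.next_interval(interval, startTime)) call for both the configured and default interval cases.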
Example #26
0
 def on_initialized(self):
    # Schedule ourselves to run immediately
    self.timeout_time_is(eossdk.now())
Example #27
0
 def on_initialized(self):
     # Schedule ourselves to run immediately
     self.timeout_time_is(eossdk.now())