Exemple #1
0
class Worker(threading.Thread):
    """Background thread that samples a Qumulo cluster into SQLite.

    The thread sleeps ``POLL_INTERVAL`` seconds, then asks the cluster
    client for its IOPS entries and for the capacity of every configured
    path, storing one ``Iops`` / ``Capacity`` row per result in
    ``qactivity.sqlite``.
    """

    # Seconds slept before each collection cycle.
    POLL_INTERVAL = 10

    def __init__(self, cfg):
        """Create the cluster client and open the database session.

        cfg -- configuration object; this class reads ``cfg.paths`` and
               ``cfg.cluster.hostname`` and passes ``cfg`` to
               ``QumuloClient``.
        """
        threading.Thread.__init__(self)
        self.cfg = cfg
        self.daemon = True
        self.client = QumuloClient(cfg)  # only one cluster for now

        engine = create_engine('sqlite:///qactivity.sqlite')
        self.session = sessionmaker(bind=engine)()

    def get_iops(self, ts):
        """Store one ``Iops`` row for every entry the client reports.

        ts -- timestamp written to every row of this sample.
        """
        hostname = self.cfg.cluster.hostname
        for entry in self.client.get_iops():
            self.session.add(Iops(ts=ts,
                                  cluster=hostname,
                                  path=entry['path'],
                                  ip=entry['ip'],
                                  # keep the full entry as a JSON blob
                                  iops=json.dumps(entry)))
        # Commit the whole sample at once rather than once per row.
        self.session.commit()

    def get_capacity(self, ts):
        """Store one ``Capacity`` row for every path in ``cfg.paths``.

        ts -- timestamp written to every row of this sample.
        """
        hostname = self.cfg.cluster.hostname
        for path in self.cfg.paths:
            capacity = self.client.get_capacity(path)
            # int() promotes to long automatically on Python 2 when needed.
            self.session.add(Capacity(ts=ts,
                                      cluster=hostname,
                                      path=path,
                                      size=int(capacity['total_capacity'])))
        self.session.commit()

    def get_cluster_metrics(self):
        """Run one collection cycle under a single shared timestamp.

        ``get_capacity`` already iterates over every configured path, so
        both collectors are invoked exactly once per cycle; looping over
        the paths here as well would store each sample once per path.
        """
        # NOTE(review): datetime.now() is naive local time, so this value is
        # shifted from the true Unix epoch by the local UTC offset - confirm
        # that consumers of ``ts`` expect that.
        ts = int((datetime.now() - datetime(1970, 1, 1)).total_seconds())
        self.get_iops(ts)
        self.get_capacity(ts)

    def run(self):
        """Collect metrics forever, sleeping before each cycle."""
        try:
            while True:
                print("Getting data....")
                time.sleep(self.POLL_INTERVAL)
                self.get_cluster_metrics()
        except KeyboardInterrupt:
            # NOTE(review): CPython delivers KeyboardInterrupt to the main
            # thread, so this handler is unlikely to fire in a worker thread.
            print("Shutting down")
Exemple #2
0
    def __init__(self, cfg):
        """Prepare the daemon thread, the cluster client and the DB session.

        ``cfg`` is kept on the instance and is also handed to
        ``QumuloClient``.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.cfg = cfg

        # A single cluster is supported for the time being.
        self.client = QumuloClient(cfg)

        # Sessions are bound to the local SQLite database file.
        session_factory = sessionmaker(
            bind=create_engine('sqlite:///qactivity.sqlite'))
        self.session = session_factory()
Exemple #3
0
    def __init__(self, agent, mib, cfg):
        """Initialise the daemon worker from the agent, MIB and configuration.

        Stores the three arguments, builds a client for the first configured
        cluster, clears every "already notified" marker and caches the
        SNMP / email / IPMI enable switches.  When email is enabled, the
        account and password are read from the environment.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.logger = logging.getLogger('agent.Worker')
        self._agent, self._mib, self._cfg = agent, mib, cfg

        # Only the first cluster in the configuration is handled for now.
        self.client = QumuloClient(cfg.clusters[0])

        self.notified_offline = self.notified_dead_drives = False
        # One PS1/PS2 flag pair per configured IPMI server, so the notify
        # state of each power supply is tracked per node.
        self.notified_power_supply_failure = [
            dict(PS1=False, PS2=False)
            for _ in cfg.clusters[0].ipmi.ipmi_servers
        ]

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.clusters[0].ipmi.enabled

        if self.email_enabled:
            env = os.environ
            self.email_acct = env.get('SNMP_AGENT_EMAIL_ACCT')
            self.email_pwd = env.get('SNMP_AGENT_EMAIL_PWD')
Exemple #4
0
    def __init__(self, agent, mib, cfg):
        """Set up the daemon worker thread.

        Keeps ``agent``, ``mib`` and ``cfg`` on the instance, creates the
        client for the first configured cluster, clears the three
        notification flags and caches the SNMP / email / IPMI switches from
        ``cfg``.  Email credentials come from the environment and are only
        read when email is enabled.
        """
        threading.Thread.__init__(self)
        self._agent = agent
        self._mib = mib
        self._cfg = cfg
        self.daemon = True

        first_cluster = cfg.clusters[0]  # only one cluster for now
        self.client = QumuloClient(first_cluster)

        # Nothing has been reported yet.
        for flag in ('notified_offline',
                     'notified_dead_drives',
                     'notified_power_supply_failure'):
            setattr(self, flag, False)

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.ipmi.enabled

        if self.email_enabled:
            self.email_acct, self.email_pwd = [
                os.getenv(name)
                for name in ('SNMP_AGENT_EMAIL_ACCT', 'SNMP_AGENT_EMAIL_PWD')
            ]
import time

from config import Config
from qumulo_client import QumuloClient

if __name__ == '__main__':

    # See if we can read the config.  The file is closed as soon as Config
    # has been built from it.
    # NOTE(review): assumes Config parses the stream in its constructor and
    # does not read from it later - confirm against the config module.
    with open('snmp_agent.cfg') as f:
        cfg = Config(f)
    client = QumuloClient(cfg.clusters[0])  # only one cluster for now
    notified_offline = False

    # Poll the cluster every 3 seconds and report offline nodes on stdout.
    while True:
        time.sleep(3)
        client.get_cluster_state()
        offline_nodes = client.offline_nodes
        if offline_nodes:
            notified_offline = True
            print("There are currently " + str(len(offline_nodes)) +
                  " nodes offline:")
            for n in offline_nodes:
                print("\tNode " + n["node_name"] + " is currently offline.")
        elif notified_offline:
            # Announce the recovery once, then stay quiet until the next
            # outage.
            notified_offline = False
            print("All nodes back online.")
Exemple #6
0
class Worker(threading.Thread):
    """Watch a Qumulo cluster and send SNMP traps / emails on state changes.

    Each ``check_*`` method compares the current state reported by the
    client with a ``notified_*`` flag, so a failure and its recovery are
    each reported once instead of on every call.
    """

    def __init__(self, agent, mib, cfg):
        """Store the collaborators and read the notification switches.

        agent -- object whose ``sendTrap`` is called when SNMP is enabled.
        mib   -- stored as ``self._mib``; not used by the methods of this
                 class.
        cfg   -- configuration; reads ``clusters[0]``, ``snmp.enabled``,
                 ``email.enabled`` and ``ipmi.enabled``.
        """
        threading.Thread.__init__(self)
        self._agent = agent
        self._mib = mib
        self._cfg = cfg
        self.daemon = True

        self.client = QumuloClient(cfg.clusters[0])  # only one cluster for now
        self.notified_offline = False
        self.notified_dead_drives = False
        self.notified_power_supply_failure = False

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.ipmi.enabled

        if self.email_enabled:
            # SMTP credentials come from the environment, not the config.
            self.email_acct = os.getenv('SNMP_AGENT_EMAIL_ACCT')
            self.email_pwd = os.getenv('SNMP_AGENT_EMAIL_PWD')

    def check_nodes(self):
        """Notify once when nodes go offline and once when all are back."""
        self.client.get_cluster_state()
        offline_nodes = self.client.offline_nodes
        if offline_nodes:
            if not self.notified_offline:
                msg = ("There are currently " + str(len(offline_nodes)) +
                       " nodes offline:")
                msg += "".join(
                    "\tNode " + n["node_name"] + " is currently offline."
                    for n in offline_nodes)
                self.notify("Qumulo Nodes Offline", msg, "nodeDownTrap")
                self.notified_offline = True
        elif self.notified_offline:
            self.notified_offline = False
            self.notify("Qumulo Nodes Back Online", "All nodes back online",
                        "nodesClearTrap")

    def check_drives(self):
        """Notify once when drives are dead and once when none are."""
        self.client.get_drive_states()
        dead_drives = self.client.dead_drives
        if dead_drives:
            if not self.notified_dead_drives:
                msg = ("There are currently " + str(len(dead_drives)) +
                       " drives offline:")
                # %-formatting so a numeric drive id does not raise.
                msg += "".join(
                    "\t%s Drive%s is offline." % (d["disk_type"], d["id"])
                    for d in dead_drives)
                self.notify("Qumulo Drives Offline", msg, "driveFailureTrap")
                self.notified_dead_drives = True
        elif self.notified_dead_drives:
            self.notified_dead_drives = False
            self.notify("Qumulo Drives Back Online", "All drives back online",
                        "nodesClearTrap")

    def check_power(self, ipmi_server):
        """Notify once on a power-supply failure and once on recovery.

        ipmi_server -- passed to the client's ``get_power_state``.  The
                       first element of the result is searched for the
                       text "Failure".
        """
        power_state = self.client.get_power_state(ipmi_server)

        if "Failure" in power_state[0]:
            if not self.notified_power_supply_failure:
                self.notify("Qumulo Power Supply Failure", power_state[0],
                            "powerSupplyFailureTrap")
                self.notified_power_supply_failure = True
        elif self.notified_power_supply_failure:  # we're back to normal
            self.notified_power_supply_failure = False
            self.notify("Qumulo Cluster Back Online",
                        "Qumulo Cluster power back to normal",
                        "nodesClearTrap")

    def notify(self, subject, message, snmp_trap_name=None):
        """Print ``message`` and forward it over each enabled channel.

        subject        -- email subject line (used only for email).
        message        -- text printed, sent in the trap and used as the
                          email body.
        snmp_trap_name -- trap name handed to the agent's ``sendTrap``.
        """
        print(message)

        if self.snmp_enabled:
            print("Sending trap")
            self._agent.sendTrap(message, snmp_trap_name, ())

        if self.email_enabled:
            print("Sending email")
            self.send_email(subject, message)

    def check_cluster_status(self):
        """Run every enabled check; retry the login while disconnected."""
        # Check IPMI
        if self.ipmi_enabled:
            self.check_power(self._cfg.clusters[0].ipmi.ipmi_server)

        if self.client.credentials is not None:
            self.check_nodes()
            self.check_drives()
        elif not self.notified_offline:  # we're offline
            print("Error connecting to Qumulo Cluster REST Server")
            self.notify("Qumulo Cluster offline",
                        "Error connecting to Qumulo Cluster REST Server",
                        "nodeDownTrap")
            self.notified_offline = True
        else:  # already reported: retry login
            self.client.login()

    def send_email(self, subject, body):
        """Send a plain-text email using the SMTP settings in the config.

        Failures are printed rather than raised so that a mail problem
        does not stop the caller.
        """
        try:
            # Create a text/plain message
            msg = MIMEMultipart()
            msg['From'] = self._cfg.email.address_from
            msg['To'] = self._cfg.email.address_to
            msg['Subject'] = subject
            msg.attach(MIMEText(body, 'plain'))

            server = smtplib.SMTP(self._cfg.email.server,
                                  self._cfg.email.tls_port)
            try:
                server.starttls()
                server.login(self.email_acct, self.email_pwd)
                server.sendmail(msg['From'], msg['To'], msg.as_string())
                server.quit()
            finally:
                # Release the socket even when a step above failed;
                # close() is harmless after a successful quit().
                server.close()

        except Exception as excpt:
            print("Failed to send email (Subject: %s) (%s)" %
                  (subject, excpt))
Exemple #7
0
class Worker(threading.Thread):
    """Watch a Qumulo cluster and report node, drive and power-supply events.

    Each ``check_*`` method compares the state reported by the client with
    a ``notified_*`` marker, so a failure and its recovery are each
    reported once.  Notifications go through ``self.notify``, which is not
    defined in this part of the class.
    """

    def __init__(self, agent, mib, cfg):
        """Store the collaborators and read the notification switches.

        agent -- stored as ``self._agent``.
        mib   -- stored as ``self._mib``.
        cfg   -- configuration; reads ``clusters[0]`` (including its
                 ``ipmi`` section), ``snmp.enabled`` and ``email.enabled``.
        """
        threading.Thread.__init__(self)
        self.logger = logging.getLogger('agent.Worker')
        self._agent = agent
        self._mib = mib
        self._cfg = cfg
        self.daemon = True

        self.client = QumuloClient(cfg.clusters[0])  # only one cluster for now
        self.notified_offline = False
        self.notified_dead_drives = False
        # Use an array of dictionaries to track per-node PS notify states
        self.notified_power_supply_failure = [
            {'PS1': False, 'PS2': False}
            for _ in cfg.clusters[0].ipmi.ipmi_servers
        ]

        self.snmp_enabled = cfg.snmp.enabled
        self.email_enabled = cfg.email.enabled
        self.ipmi_enabled = cfg.clusters[0].ipmi.enabled

        if self.email_enabled:
            # SMTP credentials come from the environment, not the config.
            self.email_acct = os.getenv('SNMP_AGENT_EMAIL_ACCT')
            self.email_pwd = os.getenv('SNMP_AGENT_EMAIL_PWD')

    def check_nodes(self):
        """Notify once when nodes go offline and once when all are back."""
        self.client.get_cluster_state()
        offline_nodes = self.client.offline_nodes
        if offline_nodes:
            if not self.notified_offline:
                msg = ("There are currently " + str(len(offline_nodes)) +
                       " nodes offline:")
                msg += "".join(
                    "\tNode " + n["node_name"] + " is currently offline."
                    for n in offline_nodes)
                self.notify("Qumulo Nodes Offline", msg, "nodeDownTrap")
                self.notified_offline = True
        elif self.notified_offline:
            self.notified_offline = False
            self.notify("Qumulo Nodes Back Online", "All nodes back online",
                        "nodesClearTrap")

    def check_drives(self):
        """Notify once when drives are dead and once when none are."""
        self.client.get_drive_states()
        dead_drives = self.client.dead_drives
        if dead_drives:
            if not self.notified_dead_drives:
                msg = ("There are currently " + str(len(dead_drives)) +
                       " drives offline:")
                # %-formatting so a numeric drive id does not raise.
                msg += "".join(
                    "\t%s Drive%s is offline." % (d["disk_type"], d["id"])
                    for d in dead_drives)
                self.notify("Qumulo Drives Offline", msg, "driveFailureTrap")
                self.notified_dead_drives = True
        elif self.notified_dead_drives:
            self.notified_dead_drives = False
            self.notify("Qumulo Drives Back Online", "All drives back online",
                        "nodesClearTrap")

    def check_power(self, ipmi_server, node_id):
        """Report power-supply failures and recoveries for one node.

        ipmi_server -- passed to the client's ``get_power_state``; the
                       result is indexed with ``'FAIL'`` and ``'GOOD'`` to
                       obtain power-supply names.
        node_id     -- zero-based index into
                       ``self.notified_power_supply_failure``; the node is
                       named "<cluster name>-<node_id + 1>" in messages.

        A ``TypeError`` raised while processing either list (for example
        when the client returned ``None``) is logged as a warning instead
        of propagating.
        """
        power_states = self.client.get_power_state(ipmi_server)

        cluster_name = self._cfg.clusters[0].name
        node_name = cluster_name + '-' + str(node_id + 1)
        notified = self.notified_power_supply_failure[node_id]

        # notify on every failed PS we find and set notified state to True
        try:
            for PS in power_states['FAIL']:
                if notified[PS]:
                    continue
                message = PS + " in " + node_name + " failed"
                subject = "[ALERT] Qumulo Power Supply Failure " + node_name
                # The node and power-supply names travel with the trap as
                # two OID/value pairs.
                self.notify(subject,
                            message,
                            "powerSupplyFailureTrap",
                            [(rfc1902.ObjectName('1.3.6.1.4.1.47017.8'),
                              rfc1902.OctetString(node_name)),
                             (rfc1902.ObjectName('1.3.6.1.4.1.47017.11'),
                              rfc1902.OctetString(PS))
                             ]
                            )
                notified[PS] = True
        except TypeError as err:
            self.logger.warning(
                "WARNING: IPMI Exception, please verify IPMI config. (%s)"
                % str(err))

        # notify on every good PS we find and set those notified states to False
        try:
            for PS in power_states['GOOD']:
                if notified[PS]:
                    message = PS + " in " + node_name + " power back to normal"
                    self.notify("Qumulo Power Supply Normal", message,
                                "nodesClearTrap")
                    notified[PS] = False
        except TypeError as err:
            self.logger.warning(
                "WARNING: IPMI Exception, please verify IPMI config. (%s)"
                % str(err))
import time

from config import Config
from qumulo_client import QumuloClient

if __name__ == '__main__':

    # See if we can read the config.  The file is closed as soon as Config
    # has been built from it.
    # NOTE(review): assumes Config parses the stream in its constructor and
    # does not read from it later - confirm against the config module.
    with open('snmp_agent.cfg') as f:
        cfg = Config(f)
    client = QumuloClient(cfg.clusters[0])  # only one cluster for now
    notified_offline = False

    # Poll the cluster every 3 seconds and report offline nodes on stdout.
    while True:
        time.sleep(3)
        client.get_cluster_state()
        offline_nodes = client.offline_nodes
        if offline_nodes:
            notified_offline = True
            print("There are currently " + str(len(offline_nodes)) +
                  " nodes offline:")
            for n in offline_nodes:
                print("\tNode " + n["node_name"] + " is currently offline.")
        elif notified_offline:
            # Announce the recovery once, then stay quiet until the next
            # outage.
            notified_offline = False
            print("All nodes back online.")