class EmergencyChecker(Daemon):
    r"""\brief Implements the EmergencyD external interface

    This class contains the methods called when requests are received by the
    Daemon (inherited). It polls the monitor daemon for the current sensor
    readings and, depending on the configured run level, logs, emails and/or
    attempts to power down nodes whose sensors report a non-OK status.

    Run levels as used by the handlers below:
        0  - dry run: log only
        1  - log and send email
        2+ - log, send email and (for CRITICAL readings) attempt a powerdown
    """
    # NOTE(review): this file defines a class named EmergencyChecker twice;
    # the later definition rebinds the name.
    # NOTE(review): __parseConfig, __registerMethods, __validRunLevel and
    # __parseXMLReading are called below but are not defined in the visible
    # part of this class - confirm they exist.

    __version = "Emergency Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __config = None
    __config_path = None
    __warning_email_addresses = None
    __critical_email_addresses = None
    __run_level = None

    def __init__(self, config=CONFIG_FILE):
        r"""\brief Parses the configuration, registers the remote methods and
        creates the checker lock. The update timer is not started here; see
        startUpdateTimer().
        \p config - path of the configuration file to parse
        """
        Daemon.__init__(self)
        self.__config_path = config
        self.__parseConfig(self.__config_path)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()

    def testMethod(self, prot, seq, ln, payload):
        r"""\brief Runs the critical-status handler against a hard-coded
        sample reading for node "computer32", then acknowledges the request.
        """
        log.debug("testMethod() called.")
        sdata = (2, "cpu.w00t", "temperature", 1168342852, 32.5, 32.6)
        self.__handleCriticalStatus("computer32", sdata)
        prot.sendReply(200, seq, "Test method called.")

    def manualRunChecker(self, prot, seq, ln, payload):
        r"""\brief Acknowledges the request, then runs one checker pass."""
        log.debug("manualRunChecker() called.")
        prot.sendReply(200, seq, "Running runChecker.")
        self.runChecker()

    def manualStopUpdateTimer(self, prot, seq, ln, payload):
        r"""\brief Stops the update timer; replies 400 if it is not running.
        """
        log.debug("manualStopUpdateTimer() called.")
        if not self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer isn't running!")
        else:
            self.stopUpdateTimer()
            prot.sendReply(200, seq, "Update Timer stopped.")

    def manualStartUpdateTimer(self, prot, seq, ln, payload):
        r"""\brief Starts the update timer; replies 400 if it is already
        running.
        """
        log.debug("manualStartUpdateTimer() called.")
        if self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer is already running!")
        else:
            self.startUpdateTimer()
            prot.sendReply(200, seq, "Update Timer started.")

    def setRunLevel(self, prot, seq, ln, payload):
        r"""\brief Sets the run level to the integer given in the payload.

        Replies 400 if the payload is not an integer or is rejected by
        __validRunLevel(); otherwise replies 200 with a description of the
        change.
        \p payload - the new run level, as text
        """
        log.debug("setRunLevel() called.")
        # A non-numeric payload used to raise out of the handler without any
        # reply being sent; treat it like any other invalid run level.
        try:
            new_run_level = int(payload)
        except (TypeError, ValueError):
            log.debug("Invalid run_level (%r) given" % (payload,))
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if not self.__validRunLevel(new_run_level):
            log.debug("Invalid run_level (%d) given" % new_run_level)
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if new_run_level < self.__run_level:
            payload = "Lowering run_level from %d to %d." % \
                (self.__run_level, new_run_level)
        elif new_run_level > self.__run_level:
            payload = "Raising run_level from %d to %d." % \
                (self.__run_level, new_run_level)
        else:
            payload = "No change in run_level."
        log.info(payload)
        prot.sendReply(200, seq, payload)
        self.__run_level = new_run_level

    def getRunLevel(self, prot, seq, ln, payload):
        r"""\brief Replies with the current run level as a string."""
        log.debug("getRunLevel() called.")
        prot.sendReply(200, seq, str(self.__run_level))

    def reloadConfig(self, prot, seq, ln, payload):
        r"""\brief Re-parses the configuration file given at construction."""
        log.debug("reloadConfig() called.")
        # The original passed an undefined name `config` here (NameError);
        # the path remembered by __init__ is what has to be re-read.
        self.__parseConfig(self.__config_path)
        prot.sendReply(200, seq, "Reload of config file completed.")

    def killDaemon(self, prot, seq, ln, payload):
        r"""\brief Acknowledges the request, then aborts the process via
        os.abort() without any cleanup.
        """
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        r"""\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        r"""\brief Stops the daemon and all threads

        Acknowledges the request, stops the checker timer (if one was ever
        started), stops accepting connections and finally stops the daemon
        itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        # The timer only exists once startUpdateTimer() has been called.
        if self.__checker_timer is not None:
            self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Emergency Daemon (self)")
        Daemon.stop(self)

    def startUpdateTimer(self):
        r"""\brief Creates and starts a GracefulTimer that calls runChecker()
        with an interval of CHECKER_INTERVAL.
        """
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL,
                                             self.runChecker, True)
        self.__checker_timer.start()

    def stopUpdateTimer(self):
        r"""\brief Stops the update timer; a no-op if none was created."""
        if self.__checker_timer is not None:
            self.__checker_timer.stop()

    def updateTimerIsRunning(self):
        r"""\brief Returns True if the update timer exists and is alive,
        False otherwise.
        """
        if self.__checker_timer and self.__checker_timer.isAlive():
            return True
        return False

    def runChecker(self):
        r"""\brief Performs one checker pass.

        While holding the checker lock, asks the monitor daemon (if it is
        online) for the current sensor readings; the reply is processed by
        nodeStatusHandler().
        """
        log.debug("runChecker() called.")
        log.debug("Acquiring checker lock.")
        self.__checker_lock.acquire()
        # Release in a finally block: an exception while talking to the
        # monitor daemon must not leave the lock held, or every later pass
        # would block forever.
        try:
            p = Protocol(None)
            if DaemonStatus().monitorDaemonIsOnline(5):
                p.open(MONITORD_HOST, MONITORD_PORT)
                p.sendRequest("get_currentsensorreadings", "",
                              self.nodeStatusHandler)
                p.readAndProcess()
            else:
                log.info("Monitor Daemon is not online!")
                # TODO: Email
        finally:
            self.__checker_lock.release()
            log.debug("Released checker lock.")

    def nodeStatusHandler(self, code, seq, size, payload):
        r"""\brief Reply handler for "get_currentsensorreadings".

        Parses the XML payload and checks every <nodereading> element.
        \p code - reply code; anything other than 200 is treated as an error
        \p payload - XML document containing the node readings
        """
        if (code != 200) or not payload:
            # TODO: Warn someone that monitord isn't working properly
            log.critical("Incorrect payload received from monitor daemon!")
            # Nothing usable to parse; the original fell through and tried
            # to parse the bad payload anyway.
            return
        sensor_dom = xml.dom.minidom.parseString(payload)
        try:
            for nodereading in sensor_dom.getElementsByTagName("nodereading"):
                self.checkNodeReadings(nodereading)
        finally:
            sensor_dom.unlink()

    def checkNodeReadings(self, nodereading):
        r"""\brief Checks all <reading> elements of one <nodereading> element
        and dispatches each non-OK reading to the matching status handler.
        \p nodereading - DOM element with "id" and "overallstatus" attributes
        """
        nodeid = nodereading.attributes["id"].value
        overallstatus = nodereading.attributes["overallstatus"].value
        # DOM attribute values are strings, so the original comparison with
        # the integer 0 could never match and this shortcut never fired.
        if overallstatus in (0, "0"):
            return
        for reading in nodereading.getElementsByTagName("reading"):
            # (status, sensorid, sensortype, timeinsecs, sensorvalue,
            #  sensormaxvalue)
            sdata = self.__parseXMLReading(reading)
            status = sdata[0]
            if status == 0:
                continue
            elif status == 1:
                # The format string is parenthesised as a whole: in the
                # original, % bound only to the second literal, which has two
                # conversions for four arguments (TypeError).
                log.critical(("ALERT: [%s][%s] has WARNING status with "
                              "curval=[%f], highval=[%f]") %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleWarningStatus(nodeid, sdata)
            elif status >= 2:
                log.critical(("ALERT: [%s][%s] has CRITICAL status with "
                              "curval=[%f], highval=[%f]") %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleCriticalStatus(nodeid, sdata)
            else:
                log.critical("WARNING: [%s][%s] has UNKNOWN status %d!"
                             % (nodeid, sdata[1], status))
                self.__handleUnknownStatus(nodeid, sdata)

    def __makeEmailNodeMessage(self, status, nodeid, sdata, action):
        r"""\brief Fills in the EMAIL_NODE_DETAILS template for one reading.
        \p status - status label, e.g. "WARNING"
        \p nodeid - id of the affected node
        \p sdata - reading tuple (status, sensorid, sensortype, timeinsecs,
                   sensorvalue, sensormaxvalue)
        \p action - description of the action taken
        """
        reading_time = datetime.datetime.fromtimestamp(float(sdata[3]))
        return (EMAIL_NODE_DETAILS %
                (status, nodeid, sdata[1], sdata[2], status,
                 reading_time.strftime("%Y-%m-%d-%H:%M:%S"),
                 sdata[4], sdata[5], self.__run_level, action))

    def __handleWarningStatus(self, nodeid, sdata):
        r"""\brief Handles a sensor warning status, by sending out a warning
        email to the warning_email_addresses recipients (run level 1 and
        above; run level 0 only logs).
        """
        # The original built a message here before `action` was assigned,
        # which raised UnboundLocalError on every call.
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
            self.__sendEmail(
                self.__warning_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in WARNING state!"
                % (nodeid, sdata[1]), message)

    def __handleCriticalStatus(self, nodeid, sdata):
        r"""\brief Handles a critical warning status, by attempting to shut
        down the node (run levels other than 0 and 1), and then sending out
        an email to the critical_email_addresses recipients, with the results
        and details. Run level 0 only logs; run level 1 logs and emails.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
            return

        if self.__run_level == 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
        else:
            action = "Email warning, and attempted powerdown of node."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            powerdown_status = self.__attemptPowerDown(nodeid)
            message += "Output from powerdown attempt:\n" + powerdown_status
        log.critical(message)
        self.__sendEmail(
            self.__critical_email_addresses,
            "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"
            % (nodeid, sdata[1]), message)

    def __handleUnknownStatus(self, nodeid, sdata):
        r"""\brief Handles an unknown sensor status, by sending out an email
        to the critical_email_addresses recipients with details (run level 1
        and above; run level 0 only logs).
        """
        # The two literals were joined without a space ("systemerror").
        unknown_state_message = ("A node in an UNKNOWN state indicates a "
                                 "system error. Please notify the author "
                                 "immediately.")
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
            self.__sendEmail(
                self.__critical_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in UNKNOWN state!"
                % (nodeid, sdata[1]), message)

    def __attemptPowerDown(self, nodeid):
        r"""\brief Attempts to power down the node specified by nodeid, and
        returns a string of the results
        \p nodeid - id of node that needs powering down
        """
        # NOTE(review): nodeid originates from the monitor daemon's XML and
        # is interpolated into a shell command line unquoted - confirm the
        # source is trusted or quote the argument.
        (status, output) = commands.getstatusoutput(
            "%s %s" % (POWERDOWN_COMMAND, nodeid))
        return output

    def __sendEmail(self, recipients, subject, message):
        r"""\brief Sends an email with the provided message to the provided
        list of recipients, via the SMTP server on localhost. Failures are
        logged, never raised.
        \p recipients - list of recipients.
        \p subject - message subject
        \p message - string message to be sent.
        """
        # `not recipients` also covers None, which is the class-level default
        # of both address lists.
        if not recipients:
            log.critical("__sendEmail(): Error: No recipients given.")
            return
        message = ("To: %s\r\nSubject: %s\r\n\r\n"
                   % (", ".join(recipients), subject)) + message
        try:
            server = smtplib.SMTP('localhost')
            try:
                server.sendmail("", recipients, message)
            finally:
                # Close the connection even when sendmail() fails.
                server.quit()
        except Exception as e:
            # Best effort: a mail failure must not stop the checker.
            log.critical("__sendEmail() exception: %s" % str(e))
class HenStatusChecker(Daemon):
    r"""\brief Implements the StatusDaemon external interface

    This class contains the methods called when requests are received by the
    Daemon (inherited). A periodic check (checkHenStatus) starts one
    DaemonStatusChecker thread per known daemon and sorts the daemons into
    running and stopped lists, which get_henstatus reports as XML.
    """
    __version = "Hen Status Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __stoppedDaemons = None
    __runningDaemons = None
    __checkerThreads = None
    __doneList = None
    __cli_commands_xml = None
    __cli_commands = None

    def __init__(self):
        r"""\brief Registers remote methods and initialises the checker lock
        and the result containers. The checker timer is not started here;
        see startCheckerTimer().
        """
        Daemon.__init__(self)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()
        self.__stoppedDaemons = []
        self.__runningDaemons = []
        self.__checkerThreads = {}
        self.__doneList = []
        self.__cli_commands = {}

    def getCLICommandXML(self, prot, seq, ln, payload):
        r"""\brief Returns the complete XML interpretation of the CLI commands
        available from all the running daemons, plus the standard CLI
        functions such as "exit" and "help".

        Replies 500 if no command XML has been generated yet.
        """
        if not self.__cli_commands_xml:
            # This should never happen
            prot.sendReply(500, seq, "No commands found by daemon!")
            return
        # The original sent no reply at all on this path, leaving the caller
        # without an answer.
        prot.sendReply(200, seq, self.__cli_commands_xml)

    def getHenStatus(self, prot, seq, ln, payload):
        r"""\brief Replies with an XML document (preceded by HTTP-style
        headers) listing the daemons found running and stopped by the most
        recent check.
        """
        log.debug("getHenStatus() called.")
        process_line = "\t\t<process name=\"%s\" />\n"
        self.__checker_lock.acquire()
        # try/finally so an error while formatting cannot leave the lock
        # held and block every later check.
        try:
            parts = ["Content-type: text/xml\n",
                     "Cache-Control: no-store, no-cache, must-revalidate\n\n",
                     "<processmanagement>\n",
                     "\t<running>\n"]
            for daemon in self.__runningDaemons:
                parts.append(process_line % str(daemon))
            parts.append("\t</running>\n")
            parts.append("\t<stopped>\n")
            for daemon in self.__stoppedDaemons:
                parts.append(process_line % str(daemon))
            parts.append("\t</stopped>\n")
            parts.append("</processmanagement>\n")
            results = "".join(parts)
        finally:
            self.__checker_lock.release()
        prot.sendReply(200, seq, results)

    def killDaemon(self, prot, seq, ln, payload):
        r"""\brief Acknowledges the request, then aborts the process via
        os.abort() without any cleanup.
        """
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        r"""\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        r"""\brief Stops the daemon and all threads

        Acknowledges the request, stops the checker timer (if one was ever
        started), stops accepting connections and finally stops the daemon
        itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        # The timer only exists once startCheckerTimer() has been called.
        if self.__checker_timer is not None:
            self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Hen Status Daemon (self)")
        Daemon.stop(self)

    def startCheckerTimer(self):
        r"""\brief Creates and starts a GracefulTimer that calls
        checkHenStatus() with an interval of CHECKER_INTERVAL.
        """
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL,
                                             self.checkHenStatus, True)
        self.__checker_timer.start()

    def stopCheckerTimer(self):
        r"""\brief Stops the checker timer; a no-op if none was created."""
        if self.__checker_timer is not None:
            self.__checker_timer.stop()

    def checkerTimerIsRunning(self):
        r"""\brief Returns True if the checker timer exists and is alive,
        False otherwise.
        """
        if self.__checker_timer and self.__checker_timer.isAlive():
            return True
        return False

    def __registerMethods(self):
        r"""\brief Registers the remote method handlers with the daemon.
        The stop_daemon and kill_daemon handlers are deliberately left
        unregistered (commented out).
        """
        log.debug("Registering method handlers...")
        self.registerMethodHandler("get_version", self.getVersion)
        #self.registerMethodHandler("stop_daemon", self.stopDaemon)
        #self.registerMethodHandler("kill_daemon", self.killDaemon)
        self.registerMethodHandler("get_henstatus", self.getHenStatus)
        self.registerMethodHandler("get_cli_command_xml",
                                   self.getCLICommandXML)

    def __createStatusThreads(self):
        r"""\brief Starts one DaemonStatusChecker thread per daemon reported
        by DaemonStatus().getAllDaemonStatusMethods(), and records the event
        each thread is given to signal completion.
        """
        for (daemon, method) in DaemonStatus().getAllDaemonStatusMethods():
            doneEvent = threading.Event()
            checker = DaemonStatusChecker(method, doneEvent, STATUS_TIMEOUT)
            self.__checkerThreads[daemon] = checker
            checker.start()
            self.__doneList.append(doneEvent)

    def __waitForResults(self):
        r"""\brief Blocks until every recorded done-event is set, polling
        every two seconds.
        """
        while not all(doneEvent.isSet() for doneEvent in self.__doneList):
            time.sleep(2)

    def __collectResults(self):
        r"""\brief Sorts every checked daemon into the running or the stopped
        list, according to its checker thread's isOnline() result.
        """
        for daemon in self.__checkerThreads:
            if self.__checkerThreads[daemon].isOnline():
                self.__runningDaemons.append(daemon)
            else:
                self.__stoppedDaemons.append(daemon)

    def __generateCommandXML(self):
        r"""\brief Generates the CLI command XML. Currently a stub: only the
        opening <testbedcommands> tag is produced and it is never closed.
        """
        self.__cli_commands_xml = "<testbedcommands>"
        # TODO: !!!
        for daemon in self.__runningDaemons:
            pass

    def checkHenStatus(self):
        r"""\brief Performs one full status check.

        While holding the checker lock, resets the previous results, starts
        the per-daemon checker threads, waits for all of them to finish,
        collects the results and regenerates the command XML.
        """
        log.debug("checkHenStatus() called.")
        self.__checker_lock.acquire()
        # try/finally so a failing check cannot leave the lock held, which
        # would block getHenStatus() and every later check.
        try:
            self.__stoppedDaemons = []
            self.__runningDaemons = []
            self.__checkerThreads = {}
            self.__doneList = []
            self.__createStatusThreads()
            self.__waitForResults()
            self.__collectResults()
            self.__generateCommandXML()
        finally:
            self.__checker_lock.release()
class EmergencyChecker(Daemon):
    r"""\brief Implements the EmergencyD external interface

    This class contains the methods called when requests are received by the
    Daemon (inherited). It polls the monitor daemon for the current sensor
    readings and, depending on the configured run level, logs, emails and/or
    attempts to power down nodes whose sensors report a non-OK status.

    Run levels as used by the handlers below:
        0  - dry run: log only
        1  - log and send email
        2+ - log, send email and (for CRITICAL readings) attempt a powerdown
    """
    # NOTE(review): this file defines a class named EmergencyChecker twice;
    # this later definition rebinds the name.
    # NOTE(review): __parseConfig, __registerMethods, __validRunLevel and
    # __parseXMLReading are called below but are not defined in the visible
    # part of this class - confirm they exist.

    __version = "Emergency Daemon v0.1"
    __checker_timer = None
    __checker_lock = None
    __config = None
    __config_path = None
    __warning_email_addresses = None
    __critical_email_addresses = None
    __run_level = None

    def __init__(self, config=CONFIG_FILE):
        r"""\brief Parses the configuration, registers the remote methods and
        creates the checker lock. The update timer is not started here; see
        startUpdateTimer().
        \p config - path of the configuration file to parse
        """
        Daemon.__init__(self)
        self.__config_path = config
        self.__parseConfig(self.__config_path)
        self.__registerMethods()
        self.__checker_lock = threading.Lock()

    def testMethod(self, prot, seq, ln, payload):
        r"""\brief Runs the critical-status handler against a hard-coded
        sample reading for node "computer32", then acknowledges the request.
        """
        log.debug("testMethod() called.")
        sdata = (2, "cpu.w00t", "temperature", 1168342852, 32.5, 32.6)
        self.__handleCriticalStatus("computer32", sdata)
        prot.sendReply(200, seq, "Test method called.")

    def manualRunChecker(self, prot, seq, ln, payload):
        r"""\brief Acknowledges the request, then runs one checker pass."""
        log.debug("manualRunChecker() called.")
        prot.sendReply(200, seq, "Running runChecker.")
        self.runChecker()

    def manualStopUpdateTimer(self, prot, seq, ln, payload):
        r"""\brief Stops the update timer; replies 400 if it is not running.
        """
        log.debug("manualStopUpdateTimer() called.")
        if not self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer isn't running!")
        else:
            self.stopUpdateTimer()
            prot.sendReply(200, seq, "Update Timer stopped.")

    def manualStartUpdateTimer(self, prot, seq, ln, payload):
        r"""\brief Starts the update timer; replies 400 if it is already
        running.
        """
        log.debug("manualStartUpdateTimer() called.")
        if self.updateTimerIsRunning():
            prot.sendReply(400, seq, "The update timer is already running!")
        else:
            self.startUpdateTimer()
            prot.sendReply(200, seq, "Update Timer started.")

    def setRunLevel(self, prot, seq, ln, payload):
        r"""\brief Sets the run level to the integer given in the payload.

        Replies 400 if the payload is not an integer or is rejected by
        __validRunLevel(); otherwise replies 200 with a description of the
        change.
        \p payload - the new run level, as text
        """
        log.debug("setRunLevel() called.")
        # A non-numeric payload used to raise out of the handler without any
        # reply being sent; treat it like any other invalid run level.
        try:
            new_run_level = int(payload)
        except (TypeError, ValueError):
            log.debug("Invalid run_level (%r) given" % (payload,))
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if not self.__validRunLevel(new_run_level):
            log.debug("Invalid run_level (%d) given" % new_run_level)
            prot.sendReply(400, seq, "Invalid run_level given.")
            return
        if new_run_level < self.__run_level:
            payload = "Lowering run_level from %d to %d." % \
                (self.__run_level, new_run_level)
        elif new_run_level > self.__run_level:
            payload = "Raising run_level from %d to %d." % \
                (self.__run_level, new_run_level)
        else:
            payload = "No change in run_level."
        log.info(payload)
        prot.sendReply(200, seq, payload)
        self.__run_level = new_run_level

    def getRunLevel(self, prot, seq, ln, payload):
        r"""\brief Replies with the current run level as a string."""
        log.debug("getRunLevel() called.")
        prot.sendReply(200, seq, str(self.__run_level))

    def reloadConfig(self, prot, seq, ln, payload):
        r"""\brief Re-parses the configuration file given at construction."""
        log.debug("reloadConfig() called.")
        # The original passed an undefined name `config` here (NameError);
        # the path remembered by __init__ is what has to be re-read.
        self.__parseConfig(self.__config_path)
        prot.sendReply(200, seq, "Reload of config file completed.")

    def killDaemon(self, prot, seq, ln, payload):
        r"""\brief Acknowledges the request, then aborts the process via
        os.abort() without any cleanup.
        """
        prot.sendReply(200, seq, "Killing Daemon!")
        os.abort()

    def getVersion(self, prot, seq, ln, payload):
        r"""\brief Returns version"""
        payload = self.__version
        prot.sendReply(200, seq, payload)

    def stopDaemon(self, prot, seq, ln, payload):
        r"""\brief Stops the daemon and all threads

        Acknowledges the request, stops the checker timer (if one was ever
        started), stops accepting connections and finally stops the daemon
        itself.
        """
        log.debug("stopDaemon called.")
        prot.sendReply(200, seq, "Accepted stop request.")
        log.debug("Stopping Checker Timer")
        # The timer only exists once startUpdateTimer() has been called.
        if self.__checker_timer is not None:
            self.__checker_timer.stop()
        self.acceptConnections(False)
        log.debug("Stopping Emergency Daemon (self)")
        Daemon.stop(self)

    def startUpdateTimer(self):
        r"""\brief Creates and starts a GracefulTimer that calls runChecker()
        with an interval of CHECKER_INTERVAL.
        """
        self.__checker_timer = GracefulTimer(CHECKER_INTERVAL,
                                             self.runChecker, True)
        self.__checker_timer.start()

    def stopUpdateTimer(self):
        r"""\brief Stops the update timer; a no-op if none was created."""
        if self.__checker_timer is not None:
            self.__checker_timer.stop()

    def updateTimerIsRunning(self):
        r"""\brief Returns True if the update timer exists and is alive,
        False otherwise.
        """
        if self.__checker_timer and self.__checker_timer.isAlive():
            return True
        return False

    def runChecker(self):
        r"""\brief Performs one checker pass.

        While holding the checker lock, asks the monitor daemon (if it is
        online) for the current sensor readings; the reply is processed by
        nodeStatusHandler().
        """
        log.debug("runChecker() called.")
        log.debug("Acquiring checker lock.")
        self.__checker_lock.acquire()
        # Release in a finally block: an exception while talking to the
        # monitor daemon must not leave the lock held, or every later pass
        # would block forever.
        try:
            p = Protocol(None)
            if DaemonStatus().monitorDaemonIsOnline(5):
                p.open(MONITORD_HOST, MONITORD_PORT)
                p.sendRequest("get_currentsensorreadings", "",
                              self.nodeStatusHandler)
                p.readAndProcess()
            else:
                log.info("Monitor Daemon is not online!")
                # TODO: Email
        finally:
            self.__checker_lock.release()
            log.debug("Released checker lock.")

    def nodeStatusHandler(self, code, seq, size, payload):
        r"""\brief Reply handler for "get_currentsensorreadings".

        Parses the XML payload and checks every <nodereading> element.
        \p code - reply code; anything other than 200 is treated as an error
        \p payload - XML document containing the node readings
        """
        if (code != 200) or not payload:
            # TODO: Warn someone that monitord isn't working properly
            log.critical("Incorrect payload received from monitor daemon!")
            # Nothing usable to parse; the original fell through and tried
            # to parse the bad payload anyway.
            return
        sensor_dom = xml.dom.minidom.parseString(payload)
        try:
            for nodereading in sensor_dom.getElementsByTagName("nodereading"):
                self.checkNodeReadings(nodereading)
        finally:
            sensor_dom.unlink()

    def checkNodeReadings(self, nodereading):
        r"""\brief Checks all <reading> elements of one <nodereading> element
        and dispatches each non-OK reading to the matching status handler.
        \p nodereading - DOM element with "id" and "overallstatus" attributes
        """
        nodeid = nodereading.attributes["id"].value
        overallstatus = nodereading.attributes["overallstatus"].value
        # DOM attribute values are strings, so the original comparison with
        # the integer 0 could never match and this shortcut never fired.
        if overallstatus in (0, "0"):
            return
        for reading in nodereading.getElementsByTagName("reading"):
            # (status, sensorid, sensortype, timeinsecs, sensorvalue,
            #  sensormaxvalue)
            sdata = self.__parseXMLReading(reading)
            status = sdata[0]
            if status == 0:
                continue
            elif status == 1:
                # The format string is parenthesised as a whole: in the
                # original, % bound only to the second literal, which has two
                # conversions for four arguments (TypeError).
                log.critical(("ALERT: [%s][%s] has WARNING status with "
                              "curval=[%f], highval=[%f]") %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleWarningStatus(nodeid, sdata)
            elif status >= 2:
                log.critical(("ALERT: [%s][%s] has CRITICAL status with "
                              "curval=[%f], highval=[%f]") %
                             (nodeid, sdata[1], sdata[4], sdata[5]))
                self.__handleCriticalStatus(nodeid, sdata)
            else:
                log.critical("WARNING: [%s][%s] has UNKNOWN status %d!"
                             % (nodeid, sdata[1], status))
                self.__handleUnknownStatus(nodeid, sdata)

    def __makeEmailNodeMessage(self, status, nodeid, sdata, action):
        r"""\brief Fills in the EMAIL_NODE_DETAILS template for one reading.
        \p status - status label, e.g. "WARNING"
        \p nodeid - id of the affected node
        \p sdata - reading tuple (status, sensorid, sensortype, timeinsecs,
                   sensorvalue, sensormaxvalue)
        \p action - description of the action taken
        """
        reading_time = datetime.datetime.fromtimestamp(float(sdata[3]))
        return (EMAIL_NODE_DETAILS %
                (status, nodeid, sdata[1], sdata[2], status,
                 reading_time.strftime("%Y-%m-%d-%H:%M:%S"),
                 sdata[4], sdata[5], self.__run_level, action))

    def __handleWarningStatus(self, nodeid, sdata):
        r"""\brief Handles a sensor warning status, by sending out a warning
        email to the warning_email_addresses recipients (run level 1 and
        above; run level 0 only logs).
        """
        # The original built a message here before `action` was assigned,
        # which raised UnboundLocalError on every call.
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("WARNING", nodeid, sdata,
                                                  action)
            log.warning(message)
            self.__sendEmail(
                self.__warning_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in WARNING state!"
                % (nodeid, sdata[1]), message)

    def __handleCriticalStatus(self, nodeid, sdata):
        r"""\brief Handles a critical warning status, by attempting to shut
        down the node (run levels other than 0 and 1), and then sending out
        an email to the critical_email_addresses recipients, with the results
        and details. Run level 0 only logs; run level 1 logs and emails.
        """
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            log.critical(message)
            return

        if self.__run_level == 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
        else:
            action = "Email warning, and attempted powerdown of node."
            message = self.__makeEmailNodeMessage("CRITICAL", nodeid, sdata,
                                                  action)
            powerdown_status = self.__attemptPowerDown(nodeid)
            message += "Output from powerdown attempt:\n" + powerdown_status
        log.critical(message)
        self.__sendEmail(
            self.__critical_email_addresses,
            "EMERGENCYD: Node Sensor [%s][%s] in CRITICAL state!"
            % (nodeid, sdata[1]), message)

    def __handleUnknownStatus(self, nodeid, sdata):
        r"""\brief Handles an unknown sensor status, by sending out an email
        to the critical_email_addresses recipients with details (run level 1
        and above; run level 0 only logs).
        """
        # The two literals were joined without a space ("systemerror").
        unknown_state_message = ("A node in an UNKNOWN state indicates a "
                                 "system error. Please notify the author "
                                 "immediately.")
        if self.__run_level == 0:
            action = "No action - currently running in dry-run mode."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
        elif self.__run_level >= 1:
            action = "Email warning, but no direct action."
            message = self.__makeEmailNodeMessage("UNKNOWN", nodeid, sdata,
                                                  action)
            message += unknown_state_message
            log.critical(message)
            self.__sendEmail(
                self.__critical_email_addresses,
                "EMERGENCYD: Node Sensor [%s][%s] in UNKNOWN state!"
                % (nodeid, sdata[1]), message)

    def __attemptPowerDown(self, nodeid):
        r"""\brief Attempts to power down the node specified by nodeid, and
        returns a string of the results
        \p nodeid - id of node that needs powering down
        """
        # NOTE(review): nodeid originates from the monitor daemon's XML and
        # is interpolated into a shell command line unquoted - confirm the
        # source is trusted or quote the argument.
        (status, output) = commands.getstatusoutput(
            "%s %s" % (POWERDOWN_COMMAND, nodeid))
        return output

    def __sendEmail(self, recipients, subject, message):
        r"""\brief Sends an email with the provided message to the provided
        list of recipients, via the SMTP server on localhost. Failures are
        logged, never raised.
        \p recipients - list of recipients.
        \p subject - message subject
        \p message - string message to be sent.
        """
        # `not recipients` also covers None, which is the class-level default
        # of both address lists.
        if not recipients:
            log.critical("__sendEmail(): Error: No recipients given.")
            return
        message = ("To: %s\r\nSubject: %s\r\n\r\n"
                   % (", ".join(recipients), subject)) + message
        try:
            server = smtplib.SMTP('localhost')
            try:
                server.sendmail("", recipients, message)
            finally:
                # Close the connection even when sendmail() fails.
                server.quit()
        except Exception as e:
            # Best effort: a mail failure must not stop the checker.
            log.critical("__sendEmail() exception: %s" % str(e))