def on_error(ws, error):
    """
         Websocket error handler for the telemetry listener.
         Drops the node's 'telemlistener' reference when the failing websocket
         is the one recorded for the node, then logs the error at debug level.
         @param ws: the websocket object that reported the error
         @param error: the error details passed to the handler
    """
    node = getNode()
    ownsSocket = node['websocket'] == ws
    if ownsSocket:
        # Clearing the listener marks the node as having no active listener thread
        node['telemlistener'] = None
    details = "Websocket error for {bmc}, details: {err}".format(
        bmc=node['bmcHostname'], err=error)
    config.errorLogger(syslog.LOG_DEBUG, details)
def initialize():
    """
         Prepares the logstash plugin: creates its entry in config.pluginVars,
         reads host/port from config.pluginConfigs (falling back to
         127.0.0.1:10522 when a key is missing) and opens the socket.
         @return: True when the socket connection was established, False otherwise
    """
    config.pluginVars['logstash'] = {}
    try:
        logstashSettings = config.pluginConfigs['logstash']
        host = logstashSettings['host']
        port = int(logstashSettings['port'])
    except KeyError:
        config.errorLogger(
            syslog.LOG_ERR,
            "Host and port configurations missing for logstash plugin. Defaulting to 127.0.0.1:10522"
        )
        host = "127.0.0.1"
        port = 10522

    # The socket is stored before connecting so it is available even when the
    # first connection attempt fails.
    logSocket = socket.socket()
    config.pluginVars['logstash']['logstashSocket'] = logSocket
    if connectToSocket(logSocket, host, port, False):
        return True
    return False
def createArgString(cerEvent):
    """
         Builds the comma separated "name=value" string for an event.
         Argument names are taken from the $(name) placeholders in the policy
         table message (or the event's own message when the event ID is not in
         the policy table); values are the comma separated entries of
         cerEvent['compInstance'], matched by position.
         @param cerEvent: dict, the event to build the argument string for
         @return: the argument string, or "" when the message has no placeholders
                  or has more placeholders than the event supplies values
    """
    try:
        template = config.pluginPolicies['csmPolicy'][
            cerEvent['CerID']]['Message']
    except KeyError:
        config.errorLogger(
            syslog.LOG_ERR,
            "Event ID {cerID} missing in CSM Policy Table.".format(
                cerID=cerEvent['CerID']))
        template = cerEvent['message']

    pairs = []
    cursor = 0
    position = 0
    while True:
        marker = template.find('$(', cursor)
        if marker == -1:
            break
        # Skip past the "$(" and read up to the closing parenthesis
        cursor = marker + 2
        name = template[cursor:template.find(')', cursor)]
        try:
            value = str(cerEvent['compInstance']).split(',')[position]
        except IndexError:
            config.errorLogger(
                syslog.LOG_ERR,
                "CSM Policy table has more arguments than provided by the alert."
            )
            return ""
        pairs.append(name + '=' + value)
        position += 1
    return ','.join(pairs)
def on_close(ws):
    """
         Websocket close handler for the telemetry listener.
         Drops the node's 'telemlistener' reference when the closed websocket
         is the one recorded for the node, then logs the closure at debug level.
         @param ws: the websocket object that was closed
    """
    node = getNode()
    ownsSocket = node['websocket'] == ws
    if ownsSocket:
        node['telemlistener'] = None
    closedText = "Websocket closed for {bmc}".format(bmc=node['bmcHostname'])
    config.errorLogger(syslog.LOG_DEBUG, closedText)
def pollNodes(interval):
    """
         Used as timer for the polling interval. While the service is not
         shutting down, re-arms itself to run again after `interval` seconds and
         then walks the node list: ipmi nodes are queued for polling, openbmcRest
         nodes get their listener thread restarted when it has died (only when
         telemetry is not in use).
         The historical note for this function mentions a 25 second minimum
         interval; nothing in this function enforces it.
         @param interval: seconds to wait before the next run
         @return: Does not return a specific value but loads the global queue with nodes that get polled
    """
    global killNow
    if killNow:
        return

    timer = threading.Timer(interval, pollNodes, [interval])
    timer.daemon = True
    timer.start()

    for node in mynodelist:
        if node['accessType'] == 'ipmi':
            # load nodes that are using polling into the queue
            nodes2poll.put(node)
        elif node['accessType'] == 'openbmcRest':
            if config.useTelem:
                continue
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling on every Python 3 release.
            if 'listener' in node and not node['listener'].is_alive():
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Main process opening new connection to {bmc}".format(
                        bmc=node['bmcHostname']))
                listener = threading.Thread(
                    target=notificationlistener.openSocket,
                    args=[
                        node['bmcHostname'], node['username'],
                        node['password']
                    ])
                node['listener'] = listener
                listener.daemon = True
                listener.start()
def on_close(ws):
    """
        websocket close event handler; logs which BMC's websocket closed
        @param ws: the websocket object that was closed
    """
    closedNode = getNode()
    closedText = "{bmc} websocket closed.".format(
        bmc=closedNode['bmcHostname'])
    config.errorLogger(syslog.LOG_INFO, closedText)
def on_error(ws, wserror):
    """
        websocket error handler; logs the error together with the BMC hostname
        @param ws: the websocket object that reported the error
        @param wserror: the error details passed to the handler
    """
    failedNode = getNode()
    errorText = "Websocket error: {bmc}: {err}".format(
        bmc=failedNode['bmcHostname'], err=wserror)
    config.errorLogger(syslog.LOG_ERR, errorText)
def setupNotifications():
    """
         Loads the information from the configuration file and sets up
         notification to monitoring entities: fills notifyList from the
         'notify' section, copies plugin specific sections into
         config.pluginConfigs, builds the node list, then loads every plugin,
         runs its initialize() when present and resolves the configured
         notification function names to the plugin's callables.
         Exits the process when the configuration file is missing, when the
         'notify' section (or a '<name>function' entry) is missing, or when a
         plugin fails to initialize.
         @return the configuration parser object
    """
    # read the config file
    confParser = configparser.ConfigParser()
    getConfigPaths()
    try:
        # check for dynamic config file
        if os.path.exists(config.configFileName):
            confParser.read(config.configFileName)
            notifySettings = dict(confParser.items('notify'))
            for key in notifySettings:
                if notifySettings[key] == 'True':
                    notifyList[key] = {
                        "function": notifySettings[key + 'function'],
                        "receiveEntityDown": False,
                        "failedFirstTry": False,
                        "successfullyReported": True
                    }
                    if confParser.has_section(key):
                        pluginConfSettings = {key: dict(confParser.items(key))}
                        config.pluginConfigs.update(pluginConfSettings)
        else:
            errorLogger(syslog.LOG_CRIT,
                        "Configuration file not found. Exiting.")
            sys.exit()
    except (KeyError, configparser.NoSectionError):
        # ConfigParser.items() raises NoSectionError (not a KeyError) when the
        # section is absent, so both must be caught for this message to appear.
        errorLogger(
            syslog.LOG_ERR,
            "No section: notify in file ibm-crassd.config. Alerts will not be forwarded. Terminating"
        )
        sys.exit()

    # get the nodes to push alerts to
    createNodeList(confParser)

    for i in getPlugins():
        config.errorLogger(syslog.LOG_DEBUG, "Loading Plugin " + i["name"])
        plugin = loadPlugins(i)
        for key in notifyList:
            if key in i["name"]:
                if hasattr(plugin, 'initialize'):
                    if not plugin.initialize():
                        errorLogger(
                            syslog.LOG_CRIT, 'Plugin: ' + i['name'] +
                            ' failed to initialize. Aborting now.')
                        sys.exit()
        # Replace function names that are still strings with the callable
        # provided by this plugin, when the plugin defines it.
        for entity in notifyList:
            if isString(notifyList[entity]['function']):
                if hasattr(plugin, notifyList[entity]["function"]):
                    notifyList[entity]["function"] = getattr(
                        plugin, notifyList[entity]["function"])
    return confParser
def updateTimesforLastReports(signum, frame):
    """
         Updates BMC last reports file to current time.
         Reads the file named by config.updateNodeTimesfile; each section is an
         entity and each option a node name with the new last-log time. For
         every listed node found in config.mynodelist the new time is queued
         for the last-reports writer and applied to notifyList, clearing the
         node's duplicate ID list. The file is removed afterwards.
         @param signum: integer, the signal number received (not used)
         @param frame: contextual frame (not used)
    """
    filename = config.updateNodeTimesfile
    if os.path.exists(filename):
        Updatesconfparser = configparser.ConfigParser()
        parsedFiles = Updatesconfparser.read(filename)
        updatedNodes = []
        if filename in parsedFiles:
            try:
                for section in Updatesconfparser.sections():
                    nodes = dict(Updatesconfparser.items(section))
                    for node in config.mynodelist:
                        for markedNode in Updatesconfparser[section]:
                            if node['xcatNodeName'] == str(markedNode):
                                bmcHostname = node['bmcHostname']
                                updateNotifyTimesData = {
                                    'entity': section,
                                    'bmchostname': node['bmcHostname'],
                                    'lastLogTime': nodes[markedNode],
                                    'dupTimeIDList': []
                                }
                                updateConfFile.put(updateNotifyTimesData)
                                updatedNodes.append(markedNode)
                                with lock:
                                    notifyList[section][bmcHostname][
                                        'lastLogTime'] = nodes[markedNode]
                                    # Empty the list in place so other holders
                                    # of the same list object see the change
                                    del notifyList[section][bmcHostname][
                                        'dupTimeIDList'][:]
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                # Severity is qualified with the syslog module like every
                # other logger call in this function.
                config.errorLogger(
                    syslog.LOG_ERR,
                    "exception: {etype} {fname} {lineNum}".format(
                        etype=exc_type, fname=fname,
                        lineNum=exc_tb.tb_lineno))
                config.errorLogger(syslog.LOG_ERR,
                                   "{excDetails}".format(excDetails=e))
            try:
                os.remove(config.updateNodeTimesfile)
                for section in Updatesconfparser.sections():
                    errorLogger(
                        syslog.LOG_INFO,
                        "Updated {entity} BMC reporting times for: {bmcList}".
                        format(bmcList=", ".join(updatedNodes),
                               entity=section))
            except Exception:
                errorLogger(
                    syslog.LOG_ERR, 'Unable to delete file {filename}'.format(
                        filename=config.updateNodeTimesfile))
        else:
            errorLogger(syslog.LOG_ERR,
                        "Unable to parse updateNodes.ini file.")
def updateBMCLastReports():
    """
         update the bmc ini file to record last log reported.
         Loads the existing file named by config.bmclastreports (when present),
         then loops taking update records from the updateConfFile queue and
         rewriting the file after each one, until killNow is set. Each record
         is a dict with the keys 'entity', 'bmchostname', 'lastLogTime' and
         'dupTimeIDList'.
    """
    global killNow
    confParser = configparser.ConfigParser()
    if os.path.exists(config.bmclastreports):
        try:
            confParser.read(config.bmclastreports)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print("exception: ", exc_type, fname, exc_tb.tb_lineno)
    while True:
        if killNow:
            break
        # node contains {entity: entName, bmchostname: bmchostname,
        # lastLogTime: timestamp, dupTimeIDList: [ID1, ID2]}
        node = updateConfFile.get()
        if len(node['dupTimeIDList']) >= 1:
            node['dupTimeIDList'] = [
                str(cerid) for cerid in node['dupTimeIDList']
            ]
        data2write = {
            'lastLogTime': str(node['lastLogTime']),
            'dupTimeIDList': node['dupTimeIDList'],
            'hrTime': datetime.datetime.fromtimestamp(
                int(node['lastLogTime'])).strftime("%Y-%m-%d %H:%M:%S")
        }
        statistics = statistics2Write()
        if len(statistics) > 0:
            confParser['statistics'] = statistics
        try:
            sectionName = node['entity'] + '_bmcs'
            if sectionName not in confParser:
                confParser[sectionName] = {}
            confParser[sectionName][node['bmchostname']] = str(data2write)
            with open(config.bmclastreports, 'w') as configfile:
                confParser.write(configfile)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # The logger is called with (severity, message) everywhere else,
            # so the details are folded into a single message string.
            config.errorLogger(
                syslog.LOG_DEBUG,
                "exception: {etype} {fname} {lineno}".format(
                    etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
            traceback.print_tb(e.__traceback__)
            config.errorLogger(syslog.LOG_DEBUG, str(e))
        # Always balance the get() above, even when the write failed, so the
        # queue's unfinished-task count cannot grow without bound.
        updateConfFile.task_done()
def openWebSocketsThreads(node):
    """
         Logs in to a node's BMC and opens the telemetry websocket for it.
         A string returned by login() is treated as a login failure and logged.
         On success the node's activity timer is refreshed, the sensors are
         initialized, the websocket is created and the retry counter is reset.
         @param node: dict describing the node (bmcHostname, xcatNodeName,
                      username, password)
    """
    bmcIP = node['bmcHostname']
    systemName = node['xcatNodeName']
    mysession = login(bmcIP, node['username'], node['password'], True)

    if isinstance(mysession, str):
        config.errorLogger(syslog.LOG_CRIT,
                           "Failed to login to bmc {bmc}".format(bmc=bmcIP))
        config.errorLogger(syslog.LOG_ERR, mysession)
        return

    try:
        node['activeTimer'] = time.time()
        sescookie = mysession.cookies.get_dict()
        initSensors(bmcIP, mysession, systemName)
        createWebsocket(sescookie, bmcIP, node)
        node['retryCount'] = 0
    except Exception as e:
        config.errorLogger(
            syslog.LOG_CRIT,
            "Failed to open the websocket with bmc {bmc}".format(bmc=bmcIP))
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        debugText = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
            err=e, etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno)
        config.errorLogger(syslog.LOG_DEBUG, debugText)
        traceback.print_tb(e.__traceback__)
def on_open(ws):
    """
         Websocket open handler: subscribes to the sensor paths and the
         Sensor.Value / Logging.Entry interfaces, then queues the node on
         sendQueue and logs that the websocket opened.
         @param ws: the websocket that was just opened
    """
    node = getNode()
    subscription = {
        "paths": sensorList,
        "interfaces": [
            "xyz.openbmc_project.Sensor.Value",
            "xyz.openbmc_project.Logging.Entry"
        ]
    }
    ws.send(json.dumps(subscription))
    sendQueue.put(node)
    openedText = "Websocket opened for {bmc}".format(bmc=node['bmcHostname'])
    config.errorLogger(syslog.LOG_DEBUG, openedText)
def writeToSocket(logSocket, alert2Send):
    """
         Sends one alert to logstash as a single line of compact JSON.
         When the first send fails with a socket error the connection is
         re-opened and the send is retried once; if that also fails the
         logstash entity is flagged as down.
         @param logSocket: the socket connected to logstash
         @param alert2Send: dict holding 'logEntry' (the alert) and 'entityAttr'
                            (the per entity status flags)
         @return: True when the alert is considered sent, False otherwise
    """
    payload = (json.dumps(alert2Send['logEntry'],
                          indent=0,
                          separators=(',', ':')).replace('\n', '') +
               "\n").encode()

    delivered = True
    try:
        logSocket.sendall(payload)
    except socket.error:
        delivered = False
    except Exception as e:
        # Unexpected errors are only printed; the alert still counts as sent
        traceback.print_tb(e.__traceback__)
        print(e)

    if not delivered:
        with config.lock:
            host = config.pluginConfigs['logstash']['host']
            port = int(config.pluginConfigs['logstash']['port'])
            alert2Send['entityAttr']['logstash']['failedFirstTry'] = True

        reconnected = connectToSocket(
            logSocket, host, port,
            alert2Send['entityAttr']['logstash']['receiveEntityDown'])
        if reconnected:
            try:
                logSocket.sendall(payload)
                delivered = True
            except socket.error:
                pass

        if not delivered:
            # Either the reconnect or the retry failed: mark logstash as down
            with config.lock:
                alert2Send['entityAttr']['logstash']['failedFirstTry'] = False
                alert2Send['entityAttr']['logstash'][
                    'receiveEntityDown'] = True

    if delivered:
        config.errorLogger(
            syslog.LOG_INFO,
            "Sent to logstash: {analert}".format(analert=payload))
    return delivered
def createNodeList(confParser):
    """
         Gets the list of nodes and loads mynodeList dictionary.
         Every option of the 'nodes' section is parsed as a JSON object (single
         quotes are converted to double quotes first), given default
         credentials when none are configured, and registered with every
         entity in notifyList. Exits the process when the section cannot be
         read. The notificationlistener module is imported only when at least
         one node uses the openbmcRest access type.
         @confParser: The configuration parser object with the nodes entity
         @return: The high level list of nodes
    """
    needWebsocket = False
    try:
        nodes = dict(confParser.items('nodes'))
        for key in nodes:
            mynodelist.append(json.loads(nodes[key].replace("'", '"')))
            node = mynodelist[-1]
            if 'username' not in node:
                if node['accessType'] == "ipmi":
                    node['username'] = "******"
                    node['password'] = "******"
                elif node['accessType'] == 'openbmcRest':
                    node['username'] = "******"
                    node['password'] = "******"
            if node['accessType'] == 'openbmcRest':
                needWebsocket = True
            node['dupTimeIDList'] = []
            node['lastLogTime'] = '0'
            node['pollFailedCount'] = 0
            for entity in notifyList:
                notifyList[entity][node['bmcHostname']] = {
                    'lastLogTime': node['lastLogTime'],
                    'dupTimeIDList': node['dupTimeIDList']
                }
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # Named fields need keyword arguments; passing them positionally
        # raised a KeyError from inside this handler.
        config.errorLogger(
            syslog.LOG_DEBUG, "exception: {type} {fname} {lineno}".format(
                type=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
        config.errorLogger(syslog.LOG_DEBUG, str(e))
        errorLogger(syslog.LOG_CRIT,
                    "Unable to read node list from configuration file")
        sys.exit()
    # load optional module for monitoring openbmc systems
    if needWebsocket:
        global notificationlistener
        import notificationlistener
def connectToSocket(logSocket, host, port, logstashDown):
    """
         Opens a connection to the logstash instance.
         The socket is first probed with a short timed receive: data or a
         timeout means the connection is still up, an empty read or any other
         error means it is not. A dead connection is re-opened with up to
         three connect attempts.
         @param logSocket: the socket object to use
         @param host: IP or hostname to connect to
         @param port: port number for logstash service
         @param logstashDown: boolean, True is logstash is down, preventing log flooding of error messages on reconnect
         @return: Connection status after attempting to connect. True when successful.
    """
    lastError = ""

    # check if socket is active by receiving and checking for eof
    try:
        logSocket.settimeout(0.1)
        connected = bool(logSocket.recv(4096))
    except socket.timeout:
        connected = True
    except Exception as e:
        connected = False
        lastError = e

    # Try 3 times to open the socket connection
    if not connected:
        for _ in range(3):
            try:
                logSocket.connect((host, port))
            except socket.error as erString:
                lastError = erString
            else:
                connected = True
                break

    if not connected and not logstashDown:
        config.errorLogger(
            syslog.LOG_ERR,
            "Logstash connection failure: {}".format(lastError))
    return connected
def telemReceive():
    """
         Merges sensor updates taken from telemUpdateQueue into the global
         sensorData dictionary until killNow is set. Errors are logged at debug
         level and the loop keeps running.
    """
    global killNow
    global sensorData
    while not killNow:
        try:
            update = telemUpdateQueue.get()
            sensorData.update(update)
        except Exception as e:
            config.errorLogger(
                syslog.LOG_DEBUG,
                "Error updating sensor data with new readings.")
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            debugText = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
                err=e, etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno)
            config.errorLogger(syslog.LOG_DEBUG, debugText)
            traceback.print_tb(e.__traceback__)
def socket_server(servsocket):
    """
         Runs the telemetry streaming server.
         Starts the data consolidation and kill-queue checker threads, binds
         the socket to serverhostname/config.telemPort and then accepts
         clients until killNow is set, handing each one to its own
         on_new_client thread. The data consolidation thread is restarted when
         it is found dead. The server socket is closed on exit.
         @param servsocket: the (unbound) socket object to serve on
    """
    global serverhostname
    dataUpdaterThread = threading.Thread(target=telemReceive)
    dataUpdaterThread.daemon = True
    dataUpdaterThread.start()
    killQueueThread = threading.Thread(target=killQueueChecker)
    killQueueThread.daemon = True
    killQueueThread.start()
    servsocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    servsocket.bind((serverhostname, config.telemPort))
    servsocket.listen(15)
    read_list = [servsocket]
    global killNow
    while True:
        if killNow:
            break
        try:
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling on every Python 3 release.
            if not dataUpdaterThread.is_alive():
                dataUpdaterThread = threading.Thread(target=telemReceive)
                dataUpdaterThread.daemon = True
                dataUpdaterThread.start()
                config.errorLogger(syslog.LOG_DEBUG,
                                   "Restarted the data consolidation thread")
            # The 30 second timeout lets the loop re-check killNow and the
            # updater thread even when no client connects.
            readable, _, _ = select.select(read_list, [], [], 30)
            for s in readable:
                if s is servsocket:
                    c, addr = servsocket.accept()
                    t = threading.Thread(target=on_new_client, args=[c, addr])
                    t.daemon = True
                    t.start()
                else:
                    data = s.recv(1024)
                    if data:
                        pass
                    else:
                        s.close()
                        read_list.remove(s)
        except Exception as e:
            config.errorLogger(
                syslog.LOG_ERR,
                "Failed to open a telemetry server connection with a client.")
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            config.errorLogger(
                syslog.LOG_DEBUG,
                "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".
                format(err=e,
                       etype=exc_type,
                       fname=fname,
                       lineno=exc_tb.tb_lineno))
            traceback.print_tb(e.__traceback__)
    servsocket.close()
def processMessages():
    """
         Consumes BMC websocket messages from messageQueue until killNow is set.
         Sensor messages update the matching 'value' entry in sensorData; any
         other message queues the originating node on sendQueue. Messages whose
         path contains 'logging' are additionally noted in the debug log.
         Failures are logged and the message is still marked as done.
    """
    global killNow
    while not killNow:
        queued = messageQueue.get()
        try:
            payload = json.loads(queued['msg'])
            path = payload['path']
            if 'logging' in path:
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Event notification received for {bmc}.".format(
                        bmc=queued['node']['bmcHostname']))
            if 'sensors' not in path:
                sendQueue.put(queued['node'])
            else:
                sensorName = path.split('/')[-1]
                properties = payload['properties']
                if 'Value' in properties:
                    nodeSensors = sensorData[queued['node']['xcatNodeName']]
                    nodeSensors[sensorName]['value'] = properties['Value']
        except Exception as e:
            config.errorLogger(
                syslog.LOG_WARNING,
                "Error encountered processing BMC message from {bmc}".format(
                    bmc=queued['node']['bmcHostname']))
            config.errorLogger(
                syslog.LOG_DEBUG,
                "BMC message was: {msg}".format(msg=queued['msg']))
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            debugText = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
                err=e, etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno)
            config.errorLogger(syslog.LOG_DEBUG, debugText)
            traceback.print_tb(e.__traceback__)
        messageQueue.task_done()
def sigHandler(signum, frame):
    """
         Used to handle kill signals from the operating system.
         SIGTERM and SIGINT request termination, SIGUSR1 logs the size of the
         polling queue, any other signal is only logged.
         @param signum: integer, the signal number received from the os
         @param frame; contextual frame
         @return: set global kill now to true and lead to termination
    """
    global killNow
    if signum in (signal.SIGTERM, signal.SIGINT):
        config.errorLogger(
            syslog.LOG_DEBUG,
            "Termination Signal received: {sigNum}".format(sigNum=signum))
        killNow = True
        config.killNow = True
        return
    if signum == signal.SIGUSR1:
        queueText = "Queue size: " + str(nodes2poll.qsize())
        config.errorLogger(syslog.LOG_DEBUG, queueText)
        return
    config.errorLogger(syslog.LOG_DEBUG,
                       "Signal received: {sigNum}".format(sigNum=signum))
def notifyCSM(cerEvent, impactedNode, entityAttr):
    """
         sends alert to CSM.
         Events whose policy entry has CSMEnabled set to False are treated as
         reported without contacting CSM. When the previous attempt for this
         entity failed (failedFirstTry), a generic firmware/software failure
         event carrying the original details as raw data is sent instead of
         the event specific message ID.
         @param cerEvent: dict, the cerEvent to send
         @param impactedNode; the node that had the alert
         @param entityAttr: dictionary, contains the list of known attributes for the entity to report to
         @return: True if notification was successful, false if it was unable to send the alert
    """
    try:
        host = config.pluginConfigs['csm']['host']
        port = config.pluginConfigs['csm']['port']
    except KeyError:
        errorLogger(
            syslog.LOG_ERR,
            "Host and port configurations missing for CSM plugin. Defaulting to 127.0.0.1:4213"
        )
        host = "127.0.0.1"
        port = "4213"
    httpHeader = {'Content-Type': 'application/json'}
    with config.lock:
        failedFirstFlag = entityAttr['csm']['failedFirstTry']
        csmDown = entityAttr['csm']['receiveEntityDown']
    try:
        if (config.pluginPolicies['csmPolicy'][cerEvent['CerID']]['CSMEnabled']
                == False):
            return True
    except KeyError:
        # Report the alert is missing and forward the event to CSM by default.
        config.errorLogger(
            syslog.LOG_ERR,
            "Event ID {cerID} missing in CSM Policy Table. Forwarding to CSM".
            format(cerID=cerEvent['CerID']))
    if (failedFirstFlag == False):
        msgID = "bmc." + "".join(
            cerEvent['eventType'].split()) + "." + cerEvent['CerID']
        argString = createArgString(cerEvent)
        eventTime = datetime.datetime.fromtimestamp(int(
            cerEvent['timestamp'])).strftime("%Y-%m-%d %H:%M:%S")
        eventEntry = {
            'msg_id': msgID,
            'location_name': impactedNode,
            'time_stamp': eventTime,
            "raw_data": "serviceable:" + cerEvent['serviceable'] +
                        " || subsystem: " + cerEvent['subSystem']
        }
        if argString != "":
            eventEntry['kvcsv'] = argString
    else:
        msgID = "bmc.Firmware/SoftwareFailure.FQPSPEM0003G"
        # Events missing from the policy table are forwarded by design (see
        # above), so fall back to the event's own message text instead of
        # failing with a KeyError here.
        try:
            policyMessage = config.pluginPolicies['csmPolicy'][
                cerEvent['CerID']]['Message']
        except KeyError:
            policyMessage = cerEvent.get('message', '')
        eventEntry = {
            'msg_id': msgID,
            'location_name': impactedNode,
            'time_stamp': time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime()),
            "raw_data": (cerEvent['CerID'] + "|| " + policyMessage +
                         "|| serviceable:" + cerEvent['serviceable'] +
                         "|| severity: " + cerEvent['severity'])
        }
    if ("additionalDetails" in cerEvent):
        eventEntry['raw_data'] = eventEntry['raw_data'] + cerEvent[
            'sensor'] + " || " + cerEvent['state'] + " || " + cerEvent[
                'additionalDetails']
    try:
        csmurl = 'http://{host}:{port}/csmi/V1.0/ras/event/create'.format(
            host=host, port=port)
        r = requests.post(csmurl,
                          headers=httpHeader,
                          data=json.dumps(eventEntry),
                          timeout=30)
        if (r.status_code != 200):
            # The service answered, so it is reachable even though it
            # rejected this event.
            with config.lock:
                entityAttr['csm']['receiveEntityDown'] = False
            return False
        else:
            errorLogger(
                syslog.LOG_INFO,
                "Successfully reported to CSM: {id} for {system}".format(
                    id=msgID, system=impactedNode))
            if csmDown == True:
                with config.lock:
                    entityAttr['csm']['receiveEntityDown'] = False
            return True
    except (requests.exceptions.Timeout):
        # Only log on the transition to "down" to avoid flooding the log
        if csmDown == False:
            errorLogger(
                syslog.LOG_ERR,
                "Connection Timed out connecting to csmrestd system service. Ensure the service is running"
            )
        with config.lock:
            entityAttr['csm']['receiveEntityDown'] = True
        return False
    except (requests.exceptions.ConnectionError) as err:
        if csmDown == False:
            errorLogger(
                syslog.LOG_ERR,
                "Encountered an error connecting to csmrestd system service. Ensure the service is running. Error: "
                + str(err))
        with config.lock:
            entityAttr['csm']['receiveEntityDown'] = True
        return False
    except IndexError:
        traceback.print_stack()
        # The alert was not delivered; report failure instead of falling
        # through and returning None.
        return False
def startMonitoringProcess(nodeList, mngedNodeList):
    """
         Starts and supervises the telemetry monitoring threads.
         For every openbmcRest node a websocket thread and a message
         processing thread are started. The supervision loop then runs until
         killNow is set: it moves nodes reported on sendQueue to the managed
         node list, restarts dead processing/listener threads, reconnects to
         BMCs that have been silent for more than 600 seconds and publishes
         the current sensor data on telemUpdateQueue.
         @param nodeList: list of node dictionaries to monitor
         @param mngedNodeList: shared list that receives nodes needing attention
    """
    killQueueThread = threading.Thread(target=killQueueChecker)
    killQueueThread.daemon = True
    killQueueThread.start()
    global activeThreads
    global lock
    for node in nodeList:
        if node['accessType'] == 'openbmcRest':
            node['activeTimer'] = time.time()
            node['retryCount'] = 0
            node['down'] = False
            ws = threading.Thread(target=openWebSocketsThreads, args=[node])
            ws.daemon = True
            ws.start()
            node['telemlistener'] = ws
            pm = threading.Thread(target=processMessages)
            pm.daemon = True
            pm.start()
            activeThreads.append(ws)
            activeThreads.append(pm)
    time.sleep(10)
    global killNow
    while True:
        if killNow:
            break
        if not sendQueue.empty():
            pollNode = sendQueue.get()
            if 'xcatNodeName' not in pollNode:
                # NOTE(review): this skips task_done() and the sleep below
                # for this iteration - confirm that is intended.
                continue
            else:
                mngedNodeList.append(
                    nodeReferenceDict[pollNode['xcatNodeName']])
            sendQueue.task_done()
        for node in nodeList:
            msgtimer = time.time() - node['activeTimer']
            if node['accessType'] == 'openbmcRest':
                # Thread.isAlive() was removed in Python 3.9; is_alive() is
                # the supported spelling on every Python 3 release.
                if not pm.is_alive():
                    try:
                        pm = threading.Thread(target=processMessages)
                        pm.daemon = True
                        pm.start()
                    except Exception as e:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "Failed to restart the thread for processing BMC telemetry notifications. "
                        )
                        exc_type, exc_obj, exc_tb = sys.exc_info()
                        fname = os.path.split(
                            exc_tb.tb_frame.f_code.co_filename)[1]
                        config.errorLogger(
                            syslog.LOG_DEBUG,
                            "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}"
                            .format(err=e,
                                    etype=exc_type,
                                    fname=fname,
                                    lineno=exc_tb.tb_lineno))
                        traceback.print_tb(e.__traceback__)
                if node['telemlistener'] is None:
                    # The websocket handlers clear 'telemlistener' on
                    # close/error, so None means the listener must be restarted.
                    try:
                        ws = threading.Thread(target=openWebSocketsThreads,
                                              args=[node])
                        ws.daemon = True
                        ws.start()
                        node['telemlistener'] = ws
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "No thread found for monitoring {bmc} telemetry data. A new thread has been started."
                            .format(bmc=node['bmcHostname']))
                    except Exception as e:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "Error trying to restart a thread for monitoring bmc telemetry data."
                        )
                        exc_type, exc_obj, exc_tb = sys.exc_info()
                        fname = os.path.split(
                            exc_tb.tb_frame.f_code.co_filename)[1]
                        config.errorLogger(
                            syslog.LOG_DEBUG,
                            "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}"
                            .format(err=e,
                                    etype=exc_type,
                                    fname=fname,
                                    lineno=exc_tb.tb_lineno))
                        traceback.print_tb(e.__traceback__)
                elif msgtimer > 600:
                    # No activity recorded for this BMC for over ten minutes:
                    # drop the old websocket and reconnect, up to the retry limit.
                    try:
                        if node['retryCount'] <= 3:
                            oldws = node['websocket']
                            oldws.close()
                            ws = threading.Thread(
                                target=openWebSocketsThreads, args=[node])
                            ws.daemon = True
                            ws.start()
                            node['telemlistener'] = ws
                            node['retryCount'] += 1
                        if node['retryCount'] > 3 and msgtimer >= 300:
                            if not node['down']:
                                config.errorLogger(
                                    syslog.LOG_CRIT,
                                    "ibm-crassd has failed to reconnect to BMC, {bmc}, more than three times."
                                    .format(bmc=node['bmcHostname']))
                                node['down'] = True
                            node['retryCount'] = 0
                    except Exception as e:
                        if not node['down']:
                            config.errorLogger(
                                syslog.LOG_CRIT,
                                "The BMC, {bmc}, stopped sending telemetry data and ibm-crassd failed to reconnect to it."
                                .format(bmc=node['bmcHostname']))
                            node['down'] = True
        time.sleep(0.9)
        telemUpdateQueue.put(sensorData)
def process_data(filterData, addr):
    """
         Processes the filter data received from a client. In the case of errors, defaults are used.
         For invalid names and types, they are removed from the list. The full path of the sensor name
         must be included.
         A frequency that cannot be converted to an integer, or that is smaller
         than 1, is replaced by 1 so the streaming interval always advances.
         @param filterData: The raw data received from the client. Must be in a JSON formatted string.
         @param addr: The address of the client as a string.
         @return: dict with the validated filters ('frequency', 'sensornames',
                  'sensortypes' when present and valid); an empty dict when the
                  message could not be processed
    """
    try:
        filterDict = json.loads(filterData.decode())
        if 'frequency' in filterDict:
            if not isinstance(filterDict['frequency'], int):
                try:
                    filterDict['frequency'] = int(filterDict['frequency'])
                except (TypeError, ValueError, OverflowError):
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "{value} is not a valid frequency".format(
                            value=filterDict['frequency']))
                    filterDict['frequency'] = 1
            if filterDict['frequency'] < 1:
                # Zero or negative intervals would stall the sender loop
                config.errorLogger(
                    syslog.LOG_ERR, "{value} is not a valid frequency".format(
                        value=filterDict['frequency']))
                filterDict['frequency'] = 1
        if 'sensornames' in filterDict:
            if not isinstance(filterDict['sensornames'], list):
                config.errorLogger(
                    syslog.LOG_ERR,
                    "{value} is not a valid list of names".format(
                        value=filterDict['sensornames']))
                filterDict.pop('sensornames', None)
            else:
                fullpathnames = []
                for sname in filterDict['sensornames']:
                    # The first known sensor path containing the given name wins
                    match = next((fullSensName for fullSensName in sensorList
                                  if sname in fullSensName), None)
                    if match is None:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "{value} is not a valid sensor name".format(
                                value=sname))
                    else:
                        fullpathnames.append(match)
                filterDict['sensornames'] = fullpathnames
                if len(filterDict['sensornames']) <= 0:
                    filterDict.pop('sensornames', None)
        if 'sensortypes' in filterDict:
            if not isinstance(filterDict['sensortypes'], list):
                config.errorLogger(
                    syslog.LOG_ERR,
                    "{value} is not a valid list of types".format(
                        value=filterDict['sensortypes']))
                # Drop the invalid types entry itself (not the names filter)
                filterDict.pop('sensortypes', None)
            else:
                validSensorTypes = [
                    'current', 'power', 'voltage', 'temperature', 'fan_tach'
                ]
                # Build a new list rather than removing while iterating,
                # which would skip the element following each removal.
                keptTypes = []
                for stype in filterDict['sensortypes']:
                    if stype in validSensorTypes:
                        keptTypes.append(stype)
                    else:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "{value} is not a valid sensor type".format(
                                value=stype))
                filterDict['sensortypes'] = keptTypes
                if len(filterDict['sensortypes']) <= 0:
                    filterDict.pop('sensortypes', None)
        return filterDict
    except Exception as e:
        config.errorLogger(
            syslog.LOG_CRIT,
            "Unable to process message from client {addr}. Error details: {err}"
            .format(addr=addr, err=e))
        # Fall back to "no filters" so callers can keep using the result
        return {}
def on_new_client(clientsocket, addr):
    """
         Run in a thread,under a subprocess, sends telemetry data to a subscribed client.
         The filtered sensor data is sent as a 4 byte big-endian length followed
         by one line of compact JSON, once per subscription interval. Between
         sends the client may submit a length-prefixed filter message that
         changes the filters and the interval. The loop ends when killNow is
         set, when the client closes the connection or when sending fails; the
         socket is always closed and the client removed from clientList.
         @param clientsocket: the socket opened with the subscriber
         @param addr: The address of the subscriber
    """
    global killNow
    now = get_millis()
    next_run = now
    clientSubRate = update_every
    clientsocket.settimeout(0.1)
    config.errorLogger(
        syslog.LOG_INFO,
        "Telemetry streaming connected to {address}".format(address=addr))
    filterInfo = {}
    try:
        while True:
            if killNow:
                break
            if next_run <= now:
                # Skip any intervals that were missed entirely
                while next_run <= now:
                    next_run += clientSubRate
                filteredSensors = getFilteredData(filterInfo, sensorData)
                data2send = (json.dumps(
                    filteredSensors, indent=0,
                    separators=(',', ':')).replace('\n', '') + "\n").encode()
                msg = struct.pack('>I', len(data2send)) + data2send
                try:
                    clientsocket.sendall(msg)
                except socket.error as sendError:
                    config.errorLogger(
                        syslog.LOG_INFO,
                        "Telemetry streaming to {address} ended: {err}".format(
                            address=addr, err=sendError))
                    break
            time.sleep(0.3)  # wait 1/3 of a second and check for new
            now = get_millis()
            try:
                raw_msglen = recvall(clientsocket, 4)
                if not raw_msglen:
                    break
                msglen = struct.unpack('>I', raw_msglen)[0]
                data = recvall(clientsocket, msglen)
                if not data:
                    break
                # Keep a usable (empty) filter when the message was rejected
                filterInfo = process_data(data, addr) or {}
                if 'frequency' in filterInfo:
                    requestedRate = filterInfo['frequency'] * 1000
                    # A non-positive rate would never advance next_run
                    if requestedRate > 0:
                        clientSubRate = requestedRate
            except socket.timeout:
                pass
            except Exception as e:
                config.errorLogger(
                    syslog.LOG_ERR,
                    "Error processing message filters from client at: {caddress}."
                    .format(caddress=addr))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}"
                    .format(err=e,
                            etype=exc_type,
                            fname=fname,
                            lineno=exc_tb.tb_lineno))
                traceback.print_tb(e.__traceback__)
    finally:
        # Rebuild in place so the shared list object is kept while every
        # entry for this client is dropped.
        clientList[:] = [item for item in clientList if not addr == item]
        clientsocket.close()
def main(): global telemUpdateQueue telemUpdateQueue = multiprocessing.Queue() global killQueue killQueue = multiprocessing.Queue() global messageQueue messageQueue = queue.Queue() global clientList clientList = [] global outputData outputData = {} global sensorData sensorData = {} global sensorList sensorList = [ "/xyz/openbmc_project/sensors/current/ps0_output_current", "/xyz/openbmc_project/sensors/current/ps1_output_current", "/xyz/openbmc_project/sensors/fan_tach/fan0_0", "/xyz/openbmc_project/sensors/fan_tach/fan0_1", "/xyz/openbmc_project/sensors/fan_tach/fan1_0", "/xyz/openbmc_project/sensors/fan_tach/fan1_1", "/xyz/openbmc_project/sensors/fan_tach/fan2_0", "/xyz/openbmc_project/sensors/fan_tach/fan2_1", "/xyz/openbmc_project/sensors/fan_tach/fan3_0", "/xyz/openbmc_project/sensors/fan_tach/fan3_1", "/xyz/openbmc_project/sensors/power/fan_disk_power", "/xyz/openbmc_project/sensors/power/io_power", "/xyz/openbmc_project/sensors/power/p0_gpu0_power", "/xyz/openbmc_project/sensors/power/p0_gpu1_power", "/xyz/openbmc_project/sensors/power/p0_gpu2_power", "/xyz/openbmc_project/sensors/power/p0_io_power", "/xyz/openbmc_project/sensors/power/p0_mem_power", "/xyz/openbmc_project/sensors/power/p0_power", "/xyz/openbmc_project/sensors/power/p1_gpu0_power", "/xyz/openbmc_project/sensors/power/p1_gpu1_power", "/xyz/openbmc_project/sensors/power/p1_gpu2_power", "/xyz/openbmc_project/sensors/power/p1_io_power", "/xyz/openbmc_project/sensors/power/p1_mem_power", "/xyz/openbmc_project/sensors/power/p1_power", "/xyz/openbmc_project/sensors/power/ps0_input_power", "/xyz/openbmc_project/sensors/power/ps1_input_power", "/xyz/openbmc_project/sensors/power/total_power", "/xyz/openbmc_project/sensors/temperature/ambient", "/xyz/openbmc_project/sensors/temperature/dimm0_temp", "/xyz/openbmc_project/sensors/temperature/dimm1_temp", "/xyz/openbmc_project/sensors/temperature/dimm10_temp", "/xyz/openbmc_project/sensors/temperature/dimm11_temp", 
"/xyz/openbmc_project/sensors/temperature/dimm12_temp", "/xyz/openbmc_project/sensors/temperature/dimm13_temp", "/xyz/openbmc_project/sensors/temperature/dimm14_temp", "/xyz/openbmc_project/sensors/temperature/dimm15_temp", "/xyz/openbmc_project/sensors/temperature/dimm2_temp", "/xyz/openbmc_project/sensors/temperature/dimm3_temp", "/xyz/openbmc_project/sensors/temperature/dimm4_temp", "/xyz/openbmc_project/sensors/temperature/dimm5_temp", "/xyz/openbmc_project/sensors/temperature/dimm6_temp", "/xyz/openbmc_project/sensors/temperature/dimm7_temp", "/xyz/openbmc_project/sensors/temperature/dimm8_temp", "/xyz/openbmc_project/sensors/temperature/dimm9_temp", "/xyz/openbmc_project/sensors/temperature/gpu0_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu0_mem_temp", "/xyz/openbmc_project/sensors/temperature/gpu1_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu1_mem_temp", "/xyz/openbmc_project/sensors/temperature/gpu2_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu2_mem_temp", "/xyz/openbmc_project/sensors/temperature/gpu3_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu3_mem_temp", "/xyz/openbmc_project/sensors/temperature/gpu4_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu4_mem_temp", "/xyz/openbmc_project/sensors/temperature/gpu5_core_temp", "/xyz/openbmc_project/sensors/temperature/gpu5_mem_temp", "/xyz/openbmc_project/sensors/temperature/p0_core0_temp", "/xyz/openbmc_project/sensors/temperature/p0_core1_temp", "/xyz/openbmc_project/sensors/temperature/p0_core10_temp", "/xyz/openbmc_project/sensors/temperature/p0_core11_temp", "/xyz/openbmc_project/sensors/temperature/p0_core12_temp", "/xyz/openbmc_project/sensors/temperature/p0_core13_temp", "/xyz/openbmc_project/sensors/temperature/p0_core14_temp", "/xyz/openbmc_project/sensors/temperature/p0_core15_temp", "/xyz/openbmc_project/sensors/temperature/p0_core18_temp", "/xyz/openbmc_project/sensors/temperature/p0_core19_temp", 
"/xyz/openbmc_project/sensors/temperature/p0_core2_temp", "/xyz/openbmc_project/sensors/temperature/p0_core20_temp", "/xyz/openbmc_project/sensors/temperature/p0_core21_temp", "/xyz/openbmc_project/sensors/temperature/p0_core22_temp", "/xyz/openbmc_project/sensors/temperature/p0_core23_temp", "/xyz/openbmc_project/sensors/temperature/p0_core3_temp", "/xyz/openbmc_project/sensors/temperature/p0_core4_temp", "/xyz/openbmc_project/sensors/temperature/p0_core5_temp", "/xyz/openbmc_project/sensors/temperature/p0_core6_temp", "/xyz/openbmc_project/sensors/temperature/p0_core7_temp", "/xyz/openbmc_project/sensors/temperature/p0_core8_temp", "/xyz/openbmc_project/sensors/temperature/p0_core9_temp", "/xyz/openbmc_project/sensors/temperature/p0_vcs_temp", "/xyz/openbmc_project/sensors/temperature/p0_vdd_temp", "/xyz/openbmc_project/sensors/temperature/p0_vddr_temp", "/xyz/openbmc_project/sensors/temperature/p0_vdn_temp", "/xyz/openbmc_project/sensors/temperature/p1_core0_temp", "/xyz/openbmc_project/sensors/temperature/p1_core1_temp", "/xyz/openbmc_project/sensors/temperature/p1_core10_temp", "/xyz/openbmc_project/sensors/temperature/p1_core11_temp", "/xyz/openbmc_project/sensors/temperature/p1_core12_temp", "/xyz/openbmc_project/sensors/temperature/p1_core13_temp", "/xyz/openbmc_project/sensors/temperature/p1_core14_temp", "/xyz/openbmc_project/sensors/temperature/p1_core16_temp", "/xyz/openbmc_project/sensors/temperature/p1_core17_temp", "/xyz/openbmc_project/sensors/temperature/p1_core18_temp", "/xyz/openbmc_project/sensors/temperature/p1_core19_temp", "/xyz/openbmc_project/sensors/temperature/p1_core2_temp", "/xyz/openbmc_project/sensors/temperature/p1_core20_temp", "/xyz/openbmc_project/sensors/temperature/p1_core22_temp", "/xyz/openbmc_project/sensors/temperature/p1_core23_temp", "/xyz/openbmc_project/sensors/temperature/p1_core3_temp", "/xyz/openbmc_project/sensors/temperature/p1_core4_temp", "/xyz/openbmc_project/sensors/temperature/p1_core5_temp", 
"/xyz/openbmc_project/sensors/temperature/p1_core6_temp", "/xyz/openbmc_project/sensors/temperature/p1_core7_temp", "/xyz/openbmc_project/sensors/temperature/p1_core8_temp", "/xyz/openbmc_project/sensors/temperature/p1_core9_temp", "/xyz/openbmc_project/sensors/temperature/p1_vcs_temp", "/xyz/openbmc_project/sensors/temperature/p1_vdd_temp", "/xyz/openbmc_project/sensors/temperature/p1_vddr_temp", "/xyz/openbmc_project/sensors/temperature/p1_vdn_temp", "/xyz/openbmc_project/sensors/temperature/pcie", "/xyz/openbmc_project/sensors/voltage/ps0_input_voltage", "/xyz/openbmc_project/sensors/voltage/ps0_output_voltage", "/xyz/openbmc_project/sensors/voltage/ps1_input_voltage", "/xyz/openbmc_project/sensors/voltage/ps1_output_voltage", "/xyz/openbmc_project/logging" ] requests.packages.urllib3.disable_warnings( requests.packages.urllib3.exceptions.InsecureRequestWarning) global wsClosed wsClosed = False global pmClosed pmClosed = False global lock lock = threading.Lock() global activeThreads activeThreads = [] global get_millis get_millis = lambda: int(round(time.time() * 1000)) global sendQueue sendQueue = queue.Queue() nodeListManager = multiprocessing.Manager() mngedNodeList = nodeListManager.list() global serversocket serversocket = socket.socket() global serverhostname serverhostname = '' global port port = config.telemPort global update_every update_every = 1 update_every = update_every * 1000 global killNow killNow = config.killNow global gathererProcs gathererProcs = [] global nodeReferenceDict nodeReferenceDict = {} for node in config.mynodelist: nodeReferenceDict[node['xcatNodeName']] = node.copy() init(mngedNodeList) sockServProcess = multiprocessing.Process(target=socket_server, args=[serversocket]) sockServProcess.daemon = True sockServProcess.start() config.errorLogger(syslog.LOG_INFO, 'Started Telemetry Streaming') while not config.killNow: time.sleep(1) try: while len(mngedNodeList) > 0: # node = config.alertMessageQueue.get() node = mngedNodeList.pop(0) 
config.nodes2poll.put(node) # config.alertMessageQueue.task_done() except Exception as e: config.errorLogger(syslog.LOG_ERR, "Error processing an alert message.") exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] config.errorLogger( syslog.LOG_DEBUG, "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}". format(err=e, etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno)) traceback.print_tb(e.__traceback__) for i in range(1 + len(gathererProcs)): killQueue.put(True) sockServProcess.terminate() for aproc in gathererProcs: aproc.terminate()
#check for dead push notification
if __name__ == '__main__':
    # Main thread for the application: name the process, install the signal
    # handlers, start everything through initialize() and then idle until a
    # shutdown is requested.
    set_procname('ibm-crassd'.encode('utf-8'))

    # SIGTERM, SIGINT and SIGUSR1 are routed to sigHandler; SIGUSR2 is routed
    # to updateTimesforLastReports.
    signal.signal(signal.SIGTERM, sigHandler)
    signal.signal(signal.SIGINT, sigHandler)
    signal.signal(signal.SIGUSR1, sigHandler)
    signal.signal(signal.SIGUSR2, updateTimesforLastReports)

    global telemThread
    try:
        initialize()
        config.errorLogger(syslog.LOG_DEBUG, str(os.getpid()))

        # Sleep in half-second steps; killNow is checked after every sleep.
        time.sleep(0.5)
        while not killNow:
            time.sleep(0.5)

        if telemThread is not None:
            config.errorLogger(syslog.LOG_DEBUG,
                               "Waiting on the telemetry server to stop.")
            telemThread.join()
        errorLogger(syslog.LOG_ERR, "The ibm-crassd service has been stopped")
        sys.exit()
    except KeyboardInterrupt:
        config.errorLogger(syslog.LOG_DEBUG, "Terminating")
        sys.exit()
def initialize():
    """
    Initializes the application by loading the nodes to monitor, getting the
    plugins needed, and setting up the forwarding through the specified
    plugins. Spools up the threads that will process the bmc alerts when a
    queue entry is created. This also configures push notifications, or
    starts the telemetry server instead when telemetry is enabled.

    Sets the module globals csmDown, killNow, networkErrorList and
    telemThread (None unless the telemetry server thread is started).
    """
    global csmDown
    global killNow
    global networkErrorList
    global telemThread
    csmDown = False
    killNow = False
    # Always bind telemThread so code that tests "telemThread is not None"
    # works even when telemetry is never enabled.
    telemThread = None
    #The following list indicates failure to communicate to the BMC and retrieve information
    networkErrorList = config.networkErrorList

    #Setup Notifications for entities to push alerts to
    confParser = setupNotifications()

    #validate all of the needed plugins loaded
    validatePluginNotifications(confParser)
    errorLogger(syslog.LOG_INFO,
                "Node Count: {count}".format(count=len(mynodelist)))

    #check the node count to see if nodes were specified
    if len(mynodelist) < 1:
        #The node list seems short, attempt to scan for nodes that report to this service node
        autoConfigureNodes(confParser)
        updateMaxThreads(confParser)
        errorLogger(
            syslog.LOG_INFO,
            "Auto-configuration Node Count: {count}".format(
                count=len(mynodelist)))

    #Check for analysis scripts
    getIDstoAnalyze(confParser)

    #load last reported times from storage file to prevent duplicate entries
    loadBMCLastReports()

    # Look the base section up once. A missing section is treated like an
    # empty one so the defaults below apply instead of a KeyError escaping.
    try:
        baseConfig = confParser['base_configuration']
    except KeyError:
        baseConfig = {}

    #Enable debug messages if needed
    if 'enableDebugMsgs' in baseConfig:
        config.enableDebug = 'True' in baseConfig['enableDebugMsgs']

    #Determine the maximum number of worker threads
    maxThreads = 1
    try:
        maxThreads = int(baseConfig['maxThreads'])
    except KeyError:
        errorLogger(
            syslog.LOG_ERR,
            "No section: base configuration in file ibm-crassd.config. Defaulting to one thread for polling"
        )
    except ValueError:
        errorLogger(
            syslog.LOG_ERR,
            "Invalid maxThreads value in file ibm-crassd.config. Defaulting to one thread for polling"
        )
    # No more threads than nodes, but always at least one.
    maxThreads = max(1, min(maxThreads, len(mynodelist)))
    minPollingInterval = getMinimumPollingInterval(maxThreads)

    #Create the worker threads
    for i in range(maxThreads):
        config.errorLogger(syslog.LOG_DEBUG, "Creating thread " + str(i))
        worker = threading.Thread(target=BMCEventProcessor)
        worker.daemon = True
        worker.start()

    reportWriter = threading.Thread(target=updateBMCLastReports)
    reportWriter.daemon = True
    reportWriter.start()

    #start TelemetryServer if enabled
    if 'telemetry_configuration' in confParser:
        telemConfig = confParser['telemetry_configuration']
        if 'nodesPerGathererProcess' in telemConfig:
            config.nodespercore = int(telemConfig['nodesPerGathererProcess'])

    telemetryEnabled = ('enableTelemetry' in baseConfig
                        and baseConfig['enableTelemetry'] == 'True')
    if telemetryEnabled:
        if 'telemetryPort' in baseConfig:
            config.telemPort = int(baseConfig['telemetryPort'])
        config.useTelem = True
        telemThread = threading.Thread(target=telemetryServer.main)
        telemThread.daemon = True
        telemThread.start()
    else:
        #subscribe to events only if no telemetry
        configurePushNotifications()

    #Setup polling interval
    pollNodes(minPollingInterval)
def getBMCAlerts(node):
    """
    Gets alerts from the node's BMC and puts them into a dictionary with a
    common format.

    @param node: A dictionary containing properties about a node. The keys
                 read here are 'bmcHostname', 'xcatNodeName', 'username',
                 'password' and 'accessType'.
    @return: dictionary with common format containing alerts. Whenever the
             alerts cannot be retrieved or parsed, the dictionary is
             {'numAlerts': 0, 'failedPoll': True}.
    """
    eventsDict = {}
    bmcHostname = node['bmcHostname']
    impactednode = node['xcatNodeName']
    username = node['username']
    password = node['password']
    try:
        #get the alerts from the bmc and place in a common format
        if node['accessType'] == "openbmcRest":
            #use openbmctool for openbmc rest interface
            # NOTE(review): the password is passed as a command-line argument
            # to the child process here and in the ipmi branch below.
            try:
                eventBytes = subprocess.check_output([
                    config.pyString, '/opt/ibm/ras/bin/openbmctool.py', '-H',
                    bmcHostname, '-U', username, '-P', password, '-j', '-t',
                    '/opt/ibm/ras/lib/policyTable.json', 'sel', 'print'
                ])
                eventList = eventBytes.decode('utf-8')
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:
                    # CalledProcessError has no .message attribute on
                    # Python 3; str(e) carries the command and return code.
                    errorLogger(
                        syslog.LOG_ERR,
                        "An unknown error has occurred when retrieving bmc alerts from {hostname}. Error Details: {msg}"
                        .format(hostname=impactednode, msg=str(e)))
                    return {'numAlerts': 0, 'failedPoll': True}
                # Return code 1 still carries output; parse it below.
                eventList = e.output.decode('utf-8')
            if not isString(eventList):
                eventList = eventList.decode('utf-8')
            jsonStart = eventList.find('{')
            if jsonStart != -1:  #check for valid response
                eventsDict = json.loads(eventList[jsonStart:])
            else:
                errorLogger(
                    syslog.LOG_ERR,
                    "An invalid response was received from bmc when requesting alerts for {hostname}"
                    .format(hostname=impactednode))
                eventsDict = {'numAlerts': 0, 'failedPoll': True}
            eventsDict = updateEventDictionary(eventsDict)
        elif node['accessType'] == "ipmi":
            #use java sel parser and ipmitool to get alerts from ipmi node
            eventList = subprocess.check_output([
                'java', '-jar', '/opt/ibm/ras/lib/crassd.jar', bmcHostname,
                username, password
            ]).decode('utf-8')
            jsonStart = eventList.find('{')
            if jsonStart != -1:  #check for valid response
                eventsDict = json.loads(eventList[jsonStart:])
            else:
                errorLogger(
                    syslog.LOG_ERR,
                    "An invalid response was received when retrieving bmc alerts from {hostname}. Response Details: {msg}"
                    .format(hostname=impactednode, msg=eventList))
                eventsDict = {'numAlerts': 0, 'failedPoll': True}
        else:
            #use redfish
            errorLogger(syslog.LOG_ERR, "redfish not supported")
            # Report a failed poll; returning the empty dictionary would
            # leave the caller without a 'numAlerts' key.
            eventsDict = {'numAlerts': 0, 'failedPoll': True}
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # Format the details into one message, matching the two-argument
        # errorLogger calls used everywhere else in this file.
        config.errorLogger(
            syslog.LOG_DEBUG,
            "exception: {etype} {fname} {lineno}".format(
                etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
        config.errorLogger(syslog.LOG_DEBUG, str(e))
        traceback.print_tb(exc_tb)
        eventsDict = {'numAlerts': 0, 'failedPoll': True}
    return eventsDict
def BMCEventProcessor():
    """
    processes alerts and is run in child threads

    Loops until the global killNow flag is set. Each pass takes one node
    from nodes2poll, fetches its alerts with getBMCAlerts and hands every
    alert that should be forwarded to processAlert. Every node taken from
    the queue is marked done exactly once, whatever the outcome.
    """
    eventsDict = {}
    while True:
        if killNow:
            break
        nodeCommsLost = False
        node = nodes2poll.get()
        name = threading.current_thread().name
        bmcHostname = node['bmcHostname']
        impactednode = node['xcatNodeName']
        username = node['username']
        password = node['password']
        resetFailedNotify(bmcHostname)
        try:
            config.errorLogger(syslog.LOG_DEBUG,
                               str(name + ": " + bmcHostname))
            #get the alerts from the bmc and place in a common format
            eventsDict = getBMCAlerts(node)
            #process the alerts
            # NOTE(review): the failure dictionaries built in getBMCAlerts
            # carry numAlerts == 0, so they appear to take this first branch
            # and reset the counter; confirm whether 'failedPoll' should be
            # tested first.
            if eventsDict['numAlerts'] == 0:
                #node poll was successful and no alerts to process
                node['pollFailedCount'] = 0
            elif 'failedPoll' in eventsDict:
                node['pollFailedCount'] += 1
                # NOTE(review): this logs for every count except 2, which
                # does not match the "three or more times" wording; confirm
                # the intended threshold.
                if node['pollFailedCount'] != 2:
                    #create a log entry for failing to process sel entries
                    errorLogger(
                        syslog.LOG_ERR,
                        "Failed to process BMC alerts for {host} three or more times"
                        .format(host=impactednode))
            else:
                #process the received alerts
                # Every key except one is expected to be "event<N>".
                for i in range(len(eventsDict) - 1):
                    if killNow:
                        break
                    bmcEvent = eventsDict["event" + str(i)]
                    if "error" in bmcEvent:
                        node['pollFailedCount'] = 0
                        # The missing key is the text that follows the last
                        # ": " of the error string.
                        begIndex = bmcEvent['error'].rfind(":") + 2
                        missingKey = bmcEvent['error'][begIndex:]
                        if missingKey not in missingEvents:
                            with lock:
                                missingEvents[missingKey] = True
                                errorLogger(
                                    syslog.LOG_ERR,
                                    "Event not found in lookup table for node {node}: {alert}"
                                    .format(alert=missingKey,
                                            node=impactednode))
                    else:
                        #check for failure to poll the bmc
                        if bmcEvent['CerID'] in networkErrorList:
                            if not nodeCommsLost:
                                # Count a communication loss once per poll.
                                nodeCommsLost = True
                                node['pollFailedCount'] += 1
                            if node['pollFailedCount'] != 2:
                                #forward the network connection failure at 3 consecutive failures.
                                continue
                        #process the alerts
                        processAlert(bmcEvent, bmcHostname, impactednode,
                                     username, password, node['accessType'])
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # The placeholders are named, so the values must be passed by
            # keyword; positional arguments raise KeyError here.
            config.errorLogger(
                syslog.LOG_DEBUG,
                "exception: {type} {fname} {lineNo}".format(
                    type=exc_type, fname=fname, lineNo=exc_tb.tb_lineno))
            config.errorLogger(syslog.LOG_DEBUG, str(e))
        finally:
            # One task_done() per get(), including the no-alert, failed-poll
            # and exception paths.
            nodes2poll.task_done()
            eventsDict.clear()