Example #1
0
def on_error(ws, error):
    """
        Websocket error callback for the telemetry listener.

        @param ws: the websocket object the error was reported on
        @param error: the error passed to the callback; included in the log
    """
    currentNode = getNode()
    # Only drop the listener reference when the failing socket is the one
    # currently registered on the node.
    socketIsCurrent = currentNode['websocket'] == ws
    if socketIsCurrent:
        currentNode['telemlistener'] = None
    logMessage = "Websocket error for {bmc}, details: {err}".format(
        bmc=currentNode['bmcHostname'], err=error)
    config.errorLogger(syslog.LOG_DEBUG, logMessage)
Example #2
0
def initialize():
    """
        Prepares the logstash plugin: resets its entry in config.pluginVars,
        reads host/port from config.pluginConfigs (falling back to
        127.0.0.1:10522 when either key is missing), creates the socket and
        tries to connect it.

        @return: True when connectToSocket reports a connection, False otherwise
    """
    pluginState = {}
    config.pluginVars['logstash'] = pluginState
    try:
        settings = config.pluginConfigs['logstash']
        host = settings['host']
        port = int(settings['port'])
    except KeyError:
        config.errorLogger(
            syslog.LOG_ERR,
            "Host and port configurations missing for logstash plugin. Defaulting to 127.0.0.1:10522"
        )
        host, port = "127.0.0.1", 10522

    logstashSocket = socket.socket()
    pluginState['logstashSocket'] = logstashSocket
    # logstashDown=False so that a connection failure is logged.
    return bool(connectToSocket(logstashSocket, host, port, False))
Example #3
0
def createArgString(cerEvent):
    """
        Builds a comma separated "name=value" string for every "$(name)"
        placeholder found in the event's message template.

        The template comes from the CSM policy table entry for the event's
        CerID, or from cerEvent['message'] when the ID is not in the table.
        Values are taken, in order, from the comma separated
        cerEvent['compInstance'].

        @param cerEvent: dict describing the event
        @return: the argument string, or "" when there are no placeholders or
                 the template has more placeholders than values
    """
    try:
        template = config.pluginPolicies['csmPolicy'][
            cerEvent['CerID']]['Message']
    except KeyError:
        config.errorLogger(
            syslog.LOG_ERR,
            "Event ID {cerID} missing in CSM Policy Table.".format(
                cerID=cerEvent['CerID']))
        template = cerEvent['message']

    pairs = []
    cursor = template.find('$(')
    while cursor != -1:
        cursor += 2
        name = template[cursor:template.find(')', cursor)]
        # Looked up per placeholder so 'compInstance' is only required when
        # the template actually contains a placeholder.
        values = str(cerEvent['compInstance']).split(',')
        if len(pairs) >= len(values):
            config.errorLogger(
                syslog.LOG_ERR,
                "CSM Policy table has more arguments than provided by the alert."
            )
            return ""
        pairs.append(name + '=' + values[len(pairs)])
        cursor = template.find('$(', cursor)
    return ','.join(pairs)
Example #4
0
def on_close(ws):
    """
        Websocket close callback for the telemetry listener.

        @param ws: the websocket object that was closed
    """
    currentNode = getNode()
    # Only drop the listener reference when the closed socket is the one
    # currently registered on the node.
    socketIsCurrent = currentNode['websocket'] == ws
    if socketIsCurrent:
        currentNode['telemlistener'] = None
    logMessage = "Websocket closed for {bmc}".format(
        bmc=currentNode['bmcHostname'])
    config.errorLogger(syslog.LOG_DEBUG, logMessage)
Example #5
0
def pollNodes(interval):
    """
         Periodic poll driver. Unless killNow is set it re-arms itself with a
         daemon threading.Timer that fires again after `interval` seconds.

         On every run, ipmi nodes are placed on the nodes2poll queue. For
         openbmcRest nodes (only when config.useTelem is off) a listener
         thread that has died is replaced by a new one.

         @param interval: seconds until the next run
         @return: Does not return a specific value but loads the global queue with nodes that get polled
    """
    global killNow
    if not killNow:
        timer = threading.Timer(interval, pollNodes, [interval])
        timer.daemon = True
        timer.start()

    for node in mynodelist:
        accessType = node['accessType']
        if accessType == 'ipmi':
            # load nodes that are using polling into the queue
            nodes2poll.put(node)
        elif accessType == 'openbmcRest' and not config.useTelem:
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling on every Python 3 release.
            if 'listener' in node and not node['listener'].is_alive():
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Main process opening new connection to {bmc}".format(
                        bmc=node['bmcHostname']))
                listener = threading.Thread(
                    target=notificationlistener.openSocket,
                    args=[
                        node['bmcHostname'], node['username'],
                        node['password']
                    ])
                node['listener'] = listener
                listener.daemon = True
                listener.start()
Example #6
0
def on_close(ws):
    """
        websocket close event handler; logs which BMC's websocket closed

        @param ws: the websocket that closed (not used)
    """
    bmcName = getNode()['bmcHostname']
    closedMessage = "{bmc} websocket closed.".format(bmc=bmcName)
    config.errorLogger(syslog.LOG_INFO, closedMessage)
Example #7
0
def on_error(ws, wserror):
    """
        websocket error handler; logs the error together with the BMC name

        @param ws: the websocket the error was reported on (not used)
        @param wserror: the error passed to the callback
    """
    bmcName = getNode()['bmcHostname']
    errorMessage = "Websocket error: {bmc}: {err}".format(bmc=bmcName,
                                                          err=wserror)
    config.errorLogger(syslog.LOG_ERR, errorMessage)
Example #8
0
def setupNotifications():
    """
        Loads the information from the configuration file and setup notification to monitoring entities.

        Every option in the [notify] section whose value is 'True' names an
        entity to notify; the companion option '<entity>function' names the
        plugin function to call. A section named after the entity, when
        present, is copied into config.pluginConfigs. Afterwards the node list
        is built and each plugin is loaded, initialized when it matches an
        entity, and searched for the entity functions.

        The process exits when the configuration file is missing, when the
        [notify] section or an '<entity>function' option is missing, or when a
        plugin fails to initialize.

        @return the configuration parser object
    """
    # read the config file
    confParser = configparser.ConfigParser()
    getConfigPaths()
    if not os.path.exists(config.configFileName):
        errorLogger(syslog.LOG_CRIT, "Configuration file not found. Exiting.")
        sys.exit()

    try:
        confParser.read(config.configFileName)
        notifySettings = dict(confParser.items('notify'))
    except configparser.NoSectionError:
        # ConfigParser.items() signals a missing section with NoSectionError,
        # which is not a KeyError.
        errorLogger(
            syslog.LOG_ERR,
            "No section: notify in file ibm-crassd.config. Alerts will not be forwarded. Terminating"
        )
        sys.exit()

    for key in notifySettings:
        if notifySettings[key] != 'True':
            continue
        try:
            functionName = notifySettings[key + 'function']
        except KeyError:
            errorLogger(
                syslog.LOG_ERR,
                "Missing option: {opt} in section notify of file ibm-crassd.config. Alerts will not be forwarded. Terminating"
                .format(opt=key + 'function'))
            sys.exit()
        notifyList[key] = {
            "function": functionName,
            "receiveEntityDown": False,
            "failedFirstTry": False,
            "successfullyReported": True
        }
        if confParser.has_section(key):
            config.pluginConfigs.update({key: dict(confParser.items(key))})

    # get the nodes to push alerts to
    createNodeList(confParser)

    for i in getPlugins():
        config.errorLogger(syslog.LOG_DEBUG, "Loading Plugin " + i["name"])
        plugin = loadPlugins(i)
        for key in notifyList:
            if key in i["name"] and hasattr(plugin, 'initialize'):
                if not plugin.initialize():
                    errorLogger(
                        syslog.LOG_CRIT, 'Plugin: ' + i['name'] +
                        ' failed to initialize. Aborting now.')
                    sys.exit()
        for entity in notifyList:
            # Still a string means no earlier plugin provided the function.
            functionRef = notifyList[entity]['function']
            if isString(functionRef) and hasattr(plugin, functionRef):
                notifyList[entity]["function"] = getattr(plugin, functionRef)
    return confParser
Example #9
0
def updateTimesforLastReports(signum, frame):
    """
        Updates BMC last reports to the times listed in the file
        config.updateNodeTimesfile, then deletes that file.

        Each section of the file names a notification entity; each option in
        it is matched against the 'xcatNodeName' of the nodes in
        config.mynodelist and its value becomes the node's new lastLogTime.
        For every match an update record is queued on updateConfFile and the
        in-memory notifyList entry is updated under the lock.

        @param signum: signal number (not used)
        @param frame: contextual frame (not used)
    """
    filename = config.updateNodeTimesfile
    if not os.path.exists(filename):
        return

    Updatesconfparser = configparser.ConfigParser()
    parsedFiles = Updatesconfparser.read(filename)
    if filename not in parsedFiles:
        errorLogger(syslog.LOG_ERR, "Unable to parse updateNodes.ini file.")
        return

    updatedNodes = []
    try:
        for section in Updatesconfparser.sections():
            nodes = dict(Updatesconfparser.items(section))
            for node in config.mynodelist:
                for markedNode in Updatesconfparser[section]:
                    if node['xcatNodeName'] != str(markedNode):
                        continue
                    bmcHostname = node['bmcHostname']
                    updateNotifyTimesData = {
                        'entity': section,
                        'bmchostname': bmcHostname,
                        'lastLogTime': nodes[markedNode],
                        'dupTimeIDList': []
                    }
                    updateConfFile.put(updateNotifyTimesData)
                    updatedNodes.append(markedNode)
                    with lock:
                        entry = notifyList[section][bmcHostname]
                        entry['lastLogTime'] = nodes[markedNode]
                        # cleared in place so other references see it
                        del entry['dupTimeIDList'][:]
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # The level constant must be qualified with the syslog module, as it
        # is everywhere else in this function.
        config.errorLogger(
            syslog.LOG_ERR, "exception: {etype} {fname} {lineNum}".format(
                etype=exc_type, fname=fname, lineNum=exc_tb.tb_lineno))
        config.errorLogger(syslog.LOG_ERR,
                           "{excDetails}".format(excDetails=e))

    try:
        os.remove(config.updateNodeTimesfile)
        for section in Updatesconfparser.sections():
            errorLogger(
                syslog.LOG_INFO,
                "Updated {entity} BMC reporting times for: {bmcList}".format(
                    bmcList=", ".join(updatedNodes), entity=section))
    except Exception:
        errorLogger(
            syslog.LOG_ERR, 'Unable to delete file {filename}'.format(
                filename=config.updateNodeTimesfile))
Example #10
0
def updateBMCLastReports():
    """
         update the bmc ini file to record last log reported

         Loops until killNow is set, taking one record at a time from the
         updateConfFile queue and rewriting config.bmclastreports with it.
         Each record is a dict with the keys 'entity', 'bmchostname',
         'lastLogTime' and 'dupTimeIDList'. Every record taken from the queue
         is marked done, whether or not the write succeeded.
    """
    global killNow
    confParser = configparser.ConfigParser()
    if os.path.exists(config.bmclastreports):
        try:
            confParser.read(config.bmclastreports)
        except Exception:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print("exception: ", exc_type, fname, exc_tb.tb_lineno)
    while True:
        if killNow:
            break
        # node contains {entity: entName, bmchostname: bmchostname, lastlogtime: timestamp, dupTimeIDList: [ID1, ID2]
        node = updateConfFile.get()

        if len(node['dupTimeIDList']) >= 1:
            node['dupTimeIDList'] = [
                str(cerid) for cerid in node['dupTimeIDList']
            ]
        data2write = {
            'lastLogTime':
            str(node['lastLogTime']),
            'dupTimeIDList':
            node['dupTimeIDList'],
            'hrTime':
            datetime.datetime.fromtimestamp(int(
                node['lastLogTime'])).strftime("%Y-%m-%d %H:%M:%S")
        }
        statistics = statistics2Write()
        if len(statistics) > 0:
            confParser['statistics'] = statistics
        try:
            sectionName = node['entity'] + '_bmcs'
            if sectionName not in confParser:
                confParser[sectionName] = {}
            confParser[sectionName][node['bmchostname']] = str(data2write)
            with open(config.bmclastreports, 'w') as configfile:
                confParser.write(configfile)
        except Exception as e:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            # errorLogger is called with (level, message) everywhere else, so
            # the details are folded into a single message string.
            config.errorLogger(
                syslog.LOG_DEBUG, "exception: {etype} {fname} {lineno}".format(
                    etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
            traceback.print_tb(e.__traceback__)
            config.errorLogger(syslog.LOG_DEBUG, str(e))
        finally:
            # Always balance the get(), otherwise a join() on the queue would
            # never return after a failed write.
            updateConfFile.task_done()
0
def openWebSocketsThreads(node):
    """
        Logs in to the node's BMC and, on success, initializes the sensors
        and creates the websocket for it.

        A str returned by login() is treated as a login failure and logged.
        Any exception raised while setting up the websocket is logged and
        swallowed.

        @param node: dict describing the node; 'activeTimer' and 'retryCount'
                     are updated on it
    """
    bmcIP = node['bmcHostname']
    systemName = node['xcatNodeName']
    mysession = login(bmcIP, node['username'], node['password'], True)

    if isinstance(mysession, str):
        config.errorLogger(syslog.LOG_CRIT,
                           "Failed to login to bmc {bmc}".format(bmc=bmcIP))
        config.errorLogger(syslog.LOG_ERR, mysession)
        return

    try:
        node['activeTimer'] = time.time()
        sessionCookies = mysession.cookies.get_dict()
        initSensors(bmcIP, mysession, systemName)
        createWebsocket(sessionCookies, bmcIP, node)
        node['retryCount'] = 0
    except Exception as e:
        config.errorLogger(
            syslog.LOG_CRIT,
            "Failed to open the websocket with bmc {bmc}".format(bmc=bmcIP))
        tb = e.__traceback__
        sourceFile = os.path.split(tb.tb_frame.f_code.co_filename)[1]
        details = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
            err=e, etype=type(e), fname=sourceFile, lineno=tb.tb_lineno)
        config.errorLogger(syslog.LOG_DEBUG, details)
        traceback.print_tb(tb)
Example #12
0
def on_open(ws):
    """
        Websocket open callback: sends the subscription request for the
        sensor paths and queues the current node on sendQueue.

        @param ws: the websocket that was opened
    """
    node = getNode()
    subscription = dict(
        paths=sensorList,
        interfaces=[
            "xyz.openbmc_project.Sensor.Value",
            "xyz.openbmc_project.Logging.Entry"
        ])
    ws.send(json.dumps(subscription))
    sendQueue.put(node)
    openedMessage = "Websocket opened for {bmc}".format(
        bmc=node['bmcHostname'])
    config.errorLogger(syslog.LOG_DEBUG, openedMessage)
Example #13
0
def writeToSocket(logSocket, alert2Send):
    """
        Sends one alert to logstash as a single line of JSON.

        When the first send fails with a socket error, the connection is
        re-established through connectToSocket and the send is retried once.
        The 'failedFirstTry' and 'receiveEntityDown' flags under
        alert2Send['entityAttr']['logstash'] are updated under config.lock.

        @param logSocket: the socket connected to logstash
        @param alert2Send: dict with the keys 'logEntry' (the data to send)
                           and 'entityAttr' (the per entity state flags)
        @return: True when the alert was sent, False otherwise
    """
    payload = json.dumps(alert2Send['logEntry'],
                         indent=0,
                         separators=(',', ':')).replace('\n', '') + "\n"
    data2send = payload.encode()
    try:
        logSocket.sendall(data2send)
    except socket.error:
        sendFailed = True
    except Exception as e:
        # Not a connection problem, so reconnecting will not help; report the
        # failure instead of logging the alert as sent.
        traceback.print_tb(e.__traceback__)
        print(e)
        return False
    else:
        sendFailed = False

    if sendFailed:
        logstashAttr = alert2Send['entityAttr']['logstash']
        with config.lock:
            host = config.pluginConfigs['logstash']['host']
            port = int(config.pluginConfigs['logstash']['port'])
            logstashAttr['failedFirstTry'] = True
        if connectToSocket(logSocket, host, port,
                           logstashAttr['receiveEntityDown']):
            try:
                logSocket.sendall(data2send)
                sendFailed = False
            except socket.error:
                sendFailed = True
        with config.lock:
            logstashAttr['failedFirstTry'] = False
            # Mark logstash as down only when the retry did not get through,
            # so a later outage is logged again after a successful recovery.
            logstashAttr['receiveEntityDown'] = sendFailed

    if not sendFailed:
        config.errorLogger(
            syslog.LOG_INFO,
            "Sent to logstash: {analert}".format(analert=data2send))
    return not sendFailed
Example #14
0
def createNodeList(confParser):
    """
        Gets the list of nodes and loads the global mynodelist

        Every option in the [nodes] section holds one node description that
        is parsed as JSON after single quotes are replaced by double quotes.
        Each node gets default credentials when it has no 'username', fresh
        tracking fields, and an entry under every entity in notifyList. When
        at least one node uses 'openbmcRest' access the notificationlistener
        module is imported. The process exits when the section cannot be
        read or parsed.

        @confParser: The configuration parser object with the nodes entity
        @return: nothing; the nodes are appended to the global mynodelist
    """
    needWebsocket = False
    try:
        nodes = dict(confParser.items('nodes'))
        for key in nodes:
            node = json.loads(nodes[key].replace("'", '"'))
            mynodelist.append(node)
            if 'username' not in node:
                if node['accessType'] == "ipmi":
                    node['username'] = "******"
                    node['password'] = "******"
                elif node['accessType'] == 'openbmcRest':
                    node['username'] = "******"
                    node['password'] = "******"
            if node['accessType'] == 'openbmcRest':
                needWebsocket = True
            node['dupTimeIDList'] = []
            node['lastLogTime'] = '0'
            node['pollFailedCount'] = 0
            for entity in notifyList:
                notifyList[entity][node['bmcHostname']] = {
                    'lastLogTime': node['lastLogTime'],
                    'dupTimeIDList': node['dupTimeIDList']
                }

    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        # Named placeholders need keyword arguments; positional ones raise
        # KeyError here and would hide the original error.
        config.errorLogger(
            syslog.LOG_DEBUG, "exception: {type} {fname} {lineno}".format(
                type=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
        config.errorLogger(syslog.LOG_DEBUG, str(e))
        errorLogger(syslog.LOG_CRIT,
                    "Unable to read node list from configuration file")
        sys.exit()

    # load optional module for monitoring openbmc systems
    if needWebsocket:
        global notificationlistener
        import notificationlistener
Example #15
0
def connectToSocket(logSocket, host, port, logstashDown):
    """
        Makes sure the socket to the logstash instance is connected.

        The socket is first probed with a short receive: data or a timeout
        counts as connected, an empty read or any other error as disconnected.
        A disconnected socket gets up to three connect attempts.

        @param logSocket: the socket object to use
        @param host: IP or hostname to connect to
        @param port: port number for logstash service
        @param logstashDown: boolean, when True the failure message is not
                             logged, preventing log flooding on reconnect

        @return: Connection status after attempting to connect. True when successful.
    """
    lastError = ""

    # Probe the socket: an empty read means the peer closed the connection.
    try:
        logSocket.settimeout(0.1)
        isConnected = bool(logSocket.recv(4096))
    except socket.timeout:
        isConnected = True
    except Exception as probeError:
        isConnected = False
        lastError = probeError

    attemptsLeft = 0 if isConnected else 3
    while attemptsLeft > 0:
        attemptsLeft -= 1
        try:
            logSocket.connect((host, port))
        except socket.error as connectError:
            lastError = connectError
        else:
            isConnected = True
            break

    if not (isConnected or logstashDown):
        config.errorLogger(
            syslog.LOG_ERR,
            "Logstash connection failure: {}".format(lastError))
    return isConnected
Example #16
0
def telemReceive():
    """
        Merges every item taken from telemUpdateQueue into the global
        sensorData until killNow is set. Errors are logged and the loop
        continues.
    """
    global killNow
    global sensorData
    while not killNow:
        try:
            sensorData.update(telemUpdateQueue.get())
        except Exception as e:
            config.errorLogger(
                syslog.LOG_DEBUG,
                "Error updating sensor data with new readings.")
            tb = e.__traceback__
            sourceFile = os.path.split(tb.tb_frame.f_code.co_filename)[1]
            details = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
                err=e, etype=type(e), fname=sourceFile, lineno=tb.tb_lineno)
            config.errorLogger(syslog.LOG_DEBUG, details)
            traceback.print_tb(tb)
Example #17
0
def socket_server(servsocket):
    """
        Runs the telemetry server loop on the given socket until killNow is
        set.

        Starts the data updater (telemReceive) and kill queue checker threads,
        binds the socket to (serverhostname, config.telemPort) and hands every
        accepted client to a new daemon thread running on_new_client. The data
        updater thread is restarted when it is found dead.

        @param servsocket: the unbound server socket to listen on; it is
                           closed when the loop ends
    """
    global serverhostname
    global killNow
    dataUpdaterThread = threading.Thread(target=telemReceive)
    dataUpdaterThread.daemon = True
    dataUpdaterThread.start()

    killQueueThread = threading.Thread(target=killQueueChecker)
    killQueueThread.daemon = True
    killQueueThread.start()

    servsocket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    servsocket.bind((serverhostname, config.telemPort))
    servsocket.listen(15)
    read_list = [servsocket]
    while True:
        if killNow:
            break
        try:
            # Thread.isAlive() was removed in Python 3.9; is_alive() is the
            # supported spelling on every Python 3 release.
            if not dataUpdaterThread.is_alive():
                dataUpdaterThread = threading.Thread(target=telemReceive)
                dataUpdaterThread.daemon = True
                dataUpdaterThread.start()
                config.errorLogger(syslog.LOG_DEBUG,
                                   "Restarted the data consolidation thread")
            # The 30 second timeout bounds how long a kill request can go
            # unnoticed while no client connects.
            readable, writeable, errored = select.select(read_list, [], [], 30)
            for s in readable:
                if s is servsocket:
                    c, addr = servsocket.accept()
                    t = threading.Thread(target=on_new_client, args=[c, addr])
                    t.daemon = True
                    t.start()
                else:
                    data = s.recv(1024)
                    if not data:
                        s.close()
                        read_list.remove(s)
        except Exception as e:
            config.errorLogger(
                syslog.LOG_ERR,
                "Failed to open a telemetry server connection with a client.")
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            config.errorLogger(
                syslog.LOG_DEBUG,
                "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".
                format(err=e,
                       etype=exc_type,
                       fname=fname,
                       lineno=exc_tb.tb_lineno))
            traceback.print_tb(e.__traceback__)

    servsocket.close()
Example #18
0
def processMessages():
    """
        Consumes BMC messages from messageQueue until killNow is set.

        Each queue item is a dict with the raw JSON text under 'msg' and the
        originating node under 'node'. Messages whose path contains 'sensors'
        update the stored sensor value when a 'Value' property is present; for
        all other messages the node is put on sendQueue. Failures are logged
        and the item is still marked done.
    """
    global killNow
    while not killNow:
        text = messageQueue.get()
        try:
            message = json.loads(text['msg'])
            path = message['path']
            if 'logging' in path:
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Event notification received for {bmc}.".format(
                        bmc=text['node']['bmcHostname']))
            if 'sensors' not in path:
                sendQueue.put(text['node'])
            else:
                sensorName = path.split('/')[-1]
                properties = message['properties']
                if 'Value' in properties:
                    newValue = properties['Value']
                    nodeSensors = sensorData[text['node']['xcatNodeName']]
                    nodeSensors[sensorName]['value'] = newValue
        except Exception as e:
            config.errorLogger(
                syslog.LOG_WARNING,
                "Error encountered processing BMC message from {bmc}".format(
                    bmc=text['node']['bmcHostname']))
            config.errorLogger(
                syslog.LOG_DEBUG,
                "BMC message was: {msg}".format(msg=text['msg']))
            tb = e.__traceback__
            sourceFile = os.path.split(tb.tb_frame.f_code.co_filename)[1]
            details = "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
                err=e, etype=type(e), fname=sourceFile, lineno=tb.tb_lineno)
            config.errorLogger(syslog.LOG_DEBUG, details)
            traceback.print_tb(tb)

        messageQueue.task_done()
Example #19
0
def sigHandler(signum, frame):
    """
         Handles signals from the operating system: SIGTERM and SIGINT
         request termination, SIGUSR1 logs the size of the nodes2poll queue,
         and any other signal is only logged.

         @param signum: integer, the signal number received from the os
         @param frame: contextual frame (not used)
         @return: nothing; on a termination signal the global killNow and
                  config.killNow are set to True
    """
    global killNow
    if signum in (signal.SIGTERM, signal.SIGINT):
        config.errorLogger(
            syslog.LOG_DEBUG,
            "Termination Signal received: {sigNum}".format(sigNum=signum))
        killNow = True
        config.killNow = True
        return

    if signum == signal.SIGUSR1:
        message = "Queue size: " + str(nodes2poll.qsize())
    else:
        message = "Signal received: {sigNum}".format(sigNum=signum)
    config.errorLogger(syslog.LOG_DEBUG, message)
Example #20
0
def notifyCSM(cerEvent, impactedNode, entityAttr):
    """
         sends alert to CSM

         The event is posted as JSON to the csmrestd RAS event endpoint at the
         host and port configured for the 'csm' plugin (127.0.0.1:4213 when
         not configured). Events whose policy table entry has CSMEnabled set
         to False are not sent and count as successfully handled. Events that
         are missing from the policy table are forwarded anyway.

         @param cerEvent: dict, the cerEvent to send
         @param impactedNode; the node that had the alert
         @param entityAttr: dictionary, contains the list of known attributes for the entity to report to
         @return: True if notification was successful, false if it was unable to send the alert
    """
    try:
        host = config.pluginConfigs['csm']['host']
        port = config.pluginConfigs['csm']['port']
    except KeyError:
        errorLogger(
            syslog.LOG_ERR,
            "Host and port configurations missing for CSM plugin. Defaulting to 127.0.0.1:4213"
        )
        host = "127.0.0.1"
        port = "4213"
    httpHeader = {'Content-Type': 'application/json'}
    with config.lock:
        failedFirstFlag = entityAttr['csm']['failedFirstTry']
        csmDown = entityAttr['csm']['receiveEntityDown']
    try:
        if (config.pluginPolicies['csmPolicy'][cerEvent['CerID']]['CSMEnabled']
                == False):
            return True
    except KeyError:
        # Report the alert is missing and forward the event to CSM by default.
        config.errorLogger(
            syslog.LOG_ERR,
            "Event ID {cerID} missing in CSM Policy Table. Forwarding to CSM".
            format(cerID=cerEvent['CerID']))
    if (failedFirstFlag == False):
        msgID = "bmc." + "".join(
            cerEvent['eventType'].split()) + "." + cerEvent['CerID']
        argString = createArgString(cerEvent)
        eventTime = datetime.datetime.fromtimestamp(int(
            cerEvent['timestamp'])).strftime("%Y-%m-%d %H:%M:%S")
        eventEntry = {
            'msg_id':
            msgID,
            'location_name':
            impactedNode,
            'time_stamp':
            eventTime,
            "raw_data":
            "serviceable:" + cerEvent['serviceable'] + " || subsystem: " +
            cerEvent['subSystem']
        }
        if argString != "":
            eventEntry['kvcsv'] = argString
    else:
        # Generic message ID used after a failed first attempt.
        msgID = "bmc.Firmware/SoftwareFailure.FQPSPEM0003G"
        # An event missing from the policy table is still forwarded, so the
        # lookup must not fail here; fall back to the event's own message.
        try:
            policyMessage = config.pluginPolicies['csmPolicy'][
                cerEvent['CerID']]['Message']
        except KeyError:
            policyMessage = cerEvent['message'] if 'message' in cerEvent else ""
        eventEntry = {
            'msg_id':
            msgID,
            'location_name':
            impactedNode,
            'time_stamp':
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "raw_data":
            (cerEvent['CerID'] + "|| " + policyMessage + "|| serviceable:" +
             cerEvent['serviceable'] + "|| severity: " + cerEvent['severity'])
        }
    if ("additionalDetails" in cerEvent):
        eventEntry['raw_data'] = eventEntry['raw_data'] + cerEvent[
            'sensor'] + " || " + cerEvent['state'] + " || " + cerEvent[
                'additionalDetails']
    try:
        csmurl = 'http://{host}:{port}/csmi/V1.0/ras/event/create'.format(
            host=host, port=port)
        r = requests.post(csmurl,
                          headers=httpHeader,
                          data=json.dumps(eventEntry),
                          timeout=30)
        if (r.status_code != 200):
            # The service answered, so it is not marked as down.
            with config.lock:
                entityAttr['csm']['receiveEntityDown'] = False
            return False
        else:
            errorLogger(
                syslog.LOG_INFO,
                "Successfully reported to CSM: {id} for {system}".format(
                    id=msgID, system=impactedNode))
            if csmDown == True:
                with config.lock:
                    entityAttr['csm']['receiveEntityDown'] = False
            return True
    except (requests.exceptions.Timeout):
        # Log only on the transition to "down" to avoid flooding the log.
        if csmDown == False:
            errorLogger(
                syslog.LOG_ERR,
                "Connection Timed out connecting to csmrestd system service. Ensure the service is running"
            )
            with config.lock:
                entityAttr['csm']['receiveEntityDown'] = True
        return False
    except (requests.exceptions.ConnectionError) as err:
        if csmDown == False:
            errorLogger(
                syslog.LOG_ERR,
                "Encountered an error connecting to csmrestd system service. Ensure the service is running. Error: "
                + str(err))
            with config.lock:
                entityAttr['csm']['receiveEntityDown'] = True
        return False
    except IndexError:
        traceback.print_stack()
        # The alert was not delivered; report that explicitly instead of
        # falling off the end with None.
        return False
Example #21
0
def _logMonitorException(err):
    """Logs where err was caught at debug level and prints its traceback."""
    exc_tb = err.__traceback__
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    config.errorLogger(
        syslog.LOG_DEBUG,
        "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".format(
            err=err, etype=type(err), fname=fname, lineno=exc_tb.tb_lineno))
    traceback.print_tb(exc_tb)


def startMonitoringProcess(nodeList, mngedNodeList):
    """
        Starts and supervises the telemetry monitoring threads until killNow
        is set.

        A websocket thread is started for every node with 'openbmcRest'
        access, plus one thread running processMessages. The supervision loop
        then, about once per second:
          - moves at most one node from sendQueue to mngedNodeList,
          - restarts the message processing thread when it has died,
          - starts a new websocket thread for nodes without a listener,
          - reconnects nodes whose activeTimer is older than 600 seconds and
            reports a node as down after more than three retries,
          - puts sensorData on telemUpdateQueue.

        @param nodeList: list of node dicts to monitor
        @param mngedNodeList: list that receives the entries from
                              nodeReferenceDict for nodes found on sendQueue
    """
    global activeThreads
    global lock
    global killNow
    killQueueThread = threading.Thread(target=killQueueChecker)
    killQueueThread.daemon = True
    killQueueThread.start()
    # Stays None when the list holds no openbmcRest node.
    ws = None
    for node in nodeList:
        if node['accessType'] == 'openbmcRest':
            node['activeTimer'] = time.time()
            node['retryCount'] = 0
            node['down'] = False
            ws = threading.Thread(target=openWebSocketsThreads, args=[node])
            ws.daemon = True
            ws.start()
            node['telemlistener'] = ws
    pm = threading.Thread(target=processMessages)
    pm.daemon = True
    pm.start()
    if ws is not None:
        activeThreads.append(ws)
    activeThreads.append(pm)
    time.sleep(10)
    while True:
        if killNow:
            break
        if not sendQueue.empty():
            pollNode = sendQueue.get()
            if 'xcatNodeName' not in pollNode:
                # Balance the get() before skipping the malformed entry.
                sendQueue.task_done()
                continue
            mngedNodeList.append(nodeReferenceDict[pollNode['xcatNodeName']])
            sendQueue.task_done()
        for node in nodeList:
            if node['accessType'] != 'openbmcRest':
                continue
            # Only openbmcRest nodes carry 'activeTimer', so it is read after
            # the access type check.
            msgtimer = time.time() - node['activeTimer']
            # Thread.isAlive() was removed in Python 3.9.
            if not pm.is_alive():
                try:
                    pm = threading.Thread(target=processMessages)
                    pm.daemon = True
                    pm.start()
                except Exception as e:
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "Failed to restart the thread for processing BMC telemetry notifications. "
                    )
                    _logMonitorException(e)
            if node['telemlistener'] is None:
                try:
                    ws = threading.Thread(target=openWebSocketsThreads,
                                          args=[node])
                    ws.daemon = True
                    ws.start()
                    node['telemlistener'] = ws
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "No thread found for monitoring {bmc} telemetry data. A new thread has been started."
                        .format(bmc=node['bmcHostname']))
                except Exception as e:
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "Error trying to restart a thread for monitoring bmc telemetry data."
                    )
                    _logMonitorException(e)
            elif msgtimer > 600:
                try:
                    if node['retryCount'] <= 3:
                        oldws = node['websocket']
                        oldws.close()
                        ws = threading.Thread(target=openWebSocketsThreads,
                                              args=[node])
                        ws.daemon = True
                        ws.start()
                        node['telemlistener'] = ws
                        node['retryCount'] += 1
                    if node['retryCount'] > 3 and msgtimer >= 300:
                        if not node['down']:
                            config.errorLogger(
                                syslog.LOG_CRIT,
                                "ibm-crassd has failed to reconnect to BMC, {bmc}, more than three times."
                                .format(bmc=node['bmcHostname']))
                            node['down'] = True
                        node['retryCount'] = 0
                except Exception:
                    if not node['down']:
                        config.errorLogger(
                            syslog.LOG_CRIT,
                            "The BMC, {bmc}, stopped sending telemetry data and ibm-crassd failed to reconnect to it."
                            .format(bmc=node['bmcHostname']))
                        node['down'] = True
        time.sleep(0.9)
        telemUpdateQueue.put(sensorData)
Example #22
0
def process_data(filterData, addr):
    """
        Processes the filter data received from a client. In the case of errors, defaults are used.
        Invalid sensor names and types are logged and removed from the list. A requested sensor name
        matches the first known sensor path that contains it.

        @param filterData: The raw data received from the client. Must be a JSON formatted byte string.
        @param addr: The address of the client as a string. Only used in log messages.
        @return: The validated filter dictionary, or None when the message could not be processed.
    """
    validSensorTypes = [
        'current', 'power', 'voltage', 'temperature', 'fan_tach'
    ]
    try:
        filterDict = json.loads(filterData.decode())
        if not isinstance(filterDict, dict):
            # Valid JSON, but not an object: there is nothing to look keys up in.
            raise ValueError("filter message must be a JSON object")

        if 'frequency' in filterDict:
            frequency = filterDict['frequency']
            if not isinstance(frequency, int):
                try:
                    frequency = int(frequency)
                except (TypeError, ValueError, OverflowError):
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "{value} is not a valid frequency".format(
                            value=filterDict['frequency']))
                    frequency = 1
            if frequency < 1:
                # A zero or negative rate would stall the sender's scheduling loop.
                config.errorLogger(
                    syslog.LOG_ERR,
                    "{value} is not a valid frequency".format(
                        value=filterDict['frequency']))
                frequency = 1
            filterDict['frequency'] = frequency

        if 'sensornames' in filterDict:
            if not isinstance(filterDict['sensornames'], list):
                config.errorLogger(
                    syslog.LOG_ERR,
                    "{value} is not a valid list of names".format(
                        value=filterDict['sensornames']))
                filterDict.pop('sensornames', None)
            else:
                fullpathnames = []
                for sname in filterDict['sensornames']:
                    # Resolve the requested name to the first full path containing it.
                    match = next((fullSensName for fullSensName in sensorList
                                  if sname in fullSensName), None)
                    if match is None:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "{value} is not a valid sensor name".format(
                                value=sname))
                    else:
                        fullpathnames.append(match)
                if fullpathnames:
                    filterDict['sensornames'] = fullpathnames
                else:
                    filterDict.pop('sensornames', None)

        if 'sensortypes' in filterDict:
            if not isinstance(filterDict['sensortypes'], list):
                config.errorLogger(
                    syslog.LOG_ERR,
                    "{value} is not a valid list of types".format(
                        value=filterDict['sensortypes']))
                # Drop the offending key itself; the sensor name filter stays intact.
                filterDict.pop('sensortypes', None)
            else:
                # Build a new list instead of removing entries while iterating,
                # which would skip the element following each removal.
                keptTypes = []
                for stype in filterDict['sensortypes']:
                    if stype in validSensorTypes:
                        keptTypes.append(stype)
                    else:
                        config.errorLogger(
                            syslog.LOG_ERR,
                            "{value} is not a valid sensor type".format(
                                value=stype))
                if keptTypes:
                    filterDict['sensortypes'] = keptTypes
                else:
                    filterDict.pop('sensortypes', None)
        return filterDict
    except Exception as e:
        config.errorLogger(
            syslog.LOG_CRIT,
            "Unable to process message from client {addr}. Error details: {err}"
            .format(addr=addr, err=e))
        return None
Example #23
0
def on_new_client(clientsocket, addr):
    """
         Run in a thread, under a subprocess, sends telemetry data to a subscribed client.
         Each message is framed as a 4-byte big-endian length followed by a JSON document
         terminated by a newline. Between sends the client is polled for a new filter
         message using the same framing. The socket is always closed and the client is
         always removed from the client list when the function exits.

         @param clientsocket: the socket opened with the subscriber
         @param addr: The address of the subscriber
    """
    global killNow
    next_run = now = get_millis()
    clientSubRate = update_every
    clientsocket.settimeout(0.1)
    config.errorLogger(
        syslog.LOG_INFO,
        "Telemetry streaming connected to {address}".format(address=addr))
    filterInfo = {}
    try:
        while not killNow:
            if next_run <= now:
                # Skip any intervals that were missed so only one message is sent.
                while next_run <= now:
                    next_run += clientSubRate

                filteredSensors = getFilteredData(filterInfo, sensorData)
                data2send = (
                    json.dumps(filteredSensors, indent=0,
                               separators=(',', ':')).replace('\n', '') +
                    "\n").encode()
                msg = struct.pack('>I', len(data2send)) + data2send
                try:
                    clientsocket.sendall(msg)
                except socket.error as e:
                    # A failed or partial send leaves the framed stream unusable,
                    # so stop serving this client instead of dying without cleanup.
                    config.errorLogger(
                        syslog.LOG_ERR,
                        "Failed to send telemetry data to client at: {caddress}. Error details: {err}"
                        .format(caddress=addr, err=e))
                    break
            time.sleep(0.3)  #wait 1/3 of a second and check for new
            now = get_millis()

            try:
                raw_msglen = recvall(clientsocket, 4)
                if not raw_msglen:
                    break
                msglen = struct.unpack('>I', raw_msglen)[0]
                data = recvall(clientsocket, msglen)
                if not data:
                    break
                newFilters = process_data(data, addr)
                # Keep the previous filters when the new message was rejected.
                if isinstance(newFilters, dict):
                    filterInfo = newFilters
                    if 'frequency' in filterInfo:
                        newRate = filterInfo['frequency'] * 1000
                        # A non-positive rate would make the catch-up loop above spin forever.
                        if newRate > 0:
                            clientSubRate = newRate
            except socket.timeout:
                # No filter update pending from the client.
                pass
            except Exception as e:
                config.errorLogger(
                    syslog.LOG_ERR,
                    "Error processing message filters from client at: {caddress}.".
                    format(caddress=addr))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".
                    format(err=e,
                           etype=exc_type,
                           fname=fname,
                           lineno=exc_tb.tb_lineno))
                traceback.print_tb(e.__traceback__)
    finally:
        # Remove by value rather than mutating the list while iterating over it.
        while addr in clientList:
            clientList.remove(addr)
        clientsocket.close()
Example #24
0
def main():
    """
         Sets up the module level state used for telemetry streaming, starts the socket server
         process and then, once a second, moves nodes from the managed node list onto
         config.nodes2poll until config.killNow is set. On exit one kill message per gatherer
         process plus one extra is queued and the child processes are terminated.
    """
    # Queues shared with child processes
    global telemUpdateQueue
    telemUpdateQueue = multiprocessing.Queue()
    global killQueue
    killQueue = multiprocessing.Queue()
    # In-process (thread) queue
    global messageQueue
    messageQueue = queue.Queue()
    global clientList
    clientList = []
    global outputData
    outputData = {}
    global sensorData
    sensorData = {}
    # Known sensor paths; client supplied sensor names are matched against this list
    global sensorList
    sensorList = [
        "/xyz/openbmc_project/sensors/current/ps0_output_current",
        "/xyz/openbmc_project/sensors/current/ps1_output_current",
        "/xyz/openbmc_project/sensors/fan_tach/fan0_0",
        "/xyz/openbmc_project/sensors/fan_tach/fan0_1",
        "/xyz/openbmc_project/sensors/fan_tach/fan1_0",
        "/xyz/openbmc_project/sensors/fan_tach/fan1_1",
        "/xyz/openbmc_project/sensors/fan_tach/fan2_0",
        "/xyz/openbmc_project/sensors/fan_tach/fan2_1",
        "/xyz/openbmc_project/sensors/fan_tach/fan3_0",
        "/xyz/openbmc_project/sensors/fan_tach/fan3_1",
        "/xyz/openbmc_project/sensors/power/fan_disk_power",
        "/xyz/openbmc_project/sensors/power/io_power",
        "/xyz/openbmc_project/sensors/power/p0_gpu0_power",
        "/xyz/openbmc_project/sensors/power/p0_gpu1_power",
        "/xyz/openbmc_project/sensors/power/p0_gpu2_power",
        "/xyz/openbmc_project/sensors/power/p0_io_power",
        "/xyz/openbmc_project/sensors/power/p0_mem_power",
        "/xyz/openbmc_project/sensors/power/p0_power",
        "/xyz/openbmc_project/sensors/power/p1_gpu0_power",
        "/xyz/openbmc_project/sensors/power/p1_gpu1_power",
        "/xyz/openbmc_project/sensors/power/p1_gpu2_power",
        "/xyz/openbmc_project/sensors/power/p1_io_power",
        "/xyz/openbmc_project/sensors/power/p1_mem_power",
        "/xyz/openbmc_project/sensors/power/p1_power",
        "/xyz/openbmc_project/sensors/power/ps0_input_power",
        "/xyz/openbmc_project/sensors/power/ps1_input_power",
        "/xyz/openbmc_project/sensors/power/total_power",
        "/xyz/openbmc_project/sensors/temperature/ambient",
        "/xyz/openbmc_project/sensors/temperature/dimm0_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm1_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm10_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm11_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm12_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm13_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm14_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm15_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm2_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm3_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm4_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm5_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm6_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm7_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm8_temp",
        "/xyz/openbmc_project/sensors/temperature/dimm9_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu0_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu0_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu1_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu1_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu2_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu2_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu3_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu3_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu4_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu4_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu5_core_temp",
        "/xyz/openbmc_project/sensors/temperature/gpu5_mem_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core0_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core1_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core10_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core11_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core12_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core13_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core14_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core15_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core18_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core19_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core2_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core20_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core21_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core22_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core23_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core3_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core4_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core5_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core6_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core7_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core8_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_core9_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_vcs_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_vdd_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_vddr_temp",
        "/xyz/openbmc_project/sensors/temperature/p0_vdn_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core0_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core1_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core10_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core11_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core12_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core13_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core14_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core16_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core17_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core18_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core19_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core2_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core20_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core22_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core23_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core3_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core4_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core5_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core6_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core7_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core8_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_core9_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_vcs_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_vdd_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_vddr_temp",
        "/xyz/openbmc_project/sensors/temperature/p1_vdn_temp",
        "/xyz/openbmc_project/sensors/temperature/pcie",
        "/xyz/openbmc_project/sensors/voltage/ps0_input_voltage",
        "/xyz/openbmc_project/sensors/voltage/ps0_output_voltage",
        "/xyz/openbmc_project/sensors/voltage/ps1_input_voltage",
        "/xyz/openbmc_project/sensors/voltage/ps1_output_voltage",
        "/xyz/openbmc_project/logging"
    ]
    # Silence urllib3's InsecureRequestWarning
    requests.packages.urllib3.disable_warnings(
        requests.packages.urllib3.exceptions.InsecureRequestWarning)
    global wsClosed
    wsClosed = False
    global pmClosed
    pmClosed = False
    global lock
    lock = threading.Lock()
    global activeThreads
    activeThreads = []
    # Helper returning the current time in whole milliseconds
    global get_millis
    get_millis = lambda: int(round(time.time() * 1000))
    global sendQueue
    sendQueue = queue.Queue()
    # Manager backed list that is handed to init() and drained in the loop below
    nodeListManager = multiprocessing.Manager()
    mngedNodeList = nodeListManager.list()
    global serversocket
    serversocket = socket.socket()
    global serverhostname
    serverhostname = ''
    global port
    port = config.telemPort
    # Default update interval: 1 second, stored in milliseconds to match get_millis()
    global update_every
    update_every = 1
    update_every = update_every * 1000
    # NOTE(review): this copies the value of config.killNow once at startup; the
    # loop below reads config.killNow directly instead - confirm the copy is refreshed elsewhere
    global killNow
    killNow = config.killNow
    global gathererProcs
    gathererProcs = []
    # Private copy of every configured node, keyed by its xCAT node name
    global nodeReferenceDict
    nodeReferenceDict = {}
    for node in config.mynodelist:
        nodeReferenceDict[node['xcatNodeName']] = node.copy()
    init(mngedNodeList)

    # Serve telemetry subscribers from a separate daemon process
    sockServProcess = multiprocessing.Process(target=socket_server,
                                              args=[serversocket])
    sockServProcess.daemon = True
    sockServProcess.start()
    config.errorLogger(syslog.LOG_INFO, 'Started Telemetry Streaming')

    # Once a second, hand every node queued in mngedNodeList over to config.nodes2poll
    while not config.killNow:
        time.sleep(1)
        try:
            while len(mngedNodeList) > 0:
                #                 node = config.alertMessageQueue.get()
                node = mngedNodeList.pop(0)
                config.nodes2poll.put(node)


#                 config.alertMessageQueue.task_done()
        except Exception as e:
            config.errorLogger(syslog.LOG_ERR,
                               "Error processing an alert message.")
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            config.errorLogger(
                syslog.LOG_DEBUG,
                "Exception: Error: {err}, Details: {etype}, {fname}, {lineno}".
                format(err=e,
                       etype=exc_type,
                       fname=fname,
                       lineno=exc_tb.tb_lineno))
            traceback.print_tb(e.__traceback__)
    # Shutdown: queue one kill message per gatherer process plus one extra
    # (presumably for the socket server - TODO confirm), then stop the children
    for i in range(1 + len(gathererProcs)):
        killQueue.put(True)

    sockServProcess.terminate()
    for aproc in gathererProcs:
        aproc.terminate()
Example #25
0
    #check for dead push notification


if __name__ == '__main__':
    # Main thread for the application: installs the signal handlers, initializes
    # the service and then idles until killNow is set, waiting for the telemetry
    # server thread (when one was started) before exiting.
    set_procname('ibm-crassd'.encode('utf-8'))
    #setup the interrupt to handle SIGTERM, SIGINT
    signal.signal(signal.SIGTERM, sigHandler)
    signal.signal(signal.SIGINT, sigHandler)
    signal.signal(signal.SIGUSR1, sigHandler)
    signal.signal(signal.SIGUSR2, updateTimesforLastReports)
    # initialize() only binds telemThread when telemetry is enabled, so give it a
    # default here; otherwise the shutdown check below can raise NameError.
    # (A 'global' statement is a no-op at module level, so none is needed.)
    telemThread = None
    try:
        initialize()
        config.errorLogger(syslog.LOG_DEBUG, str(os.getpid()))
        while True:
            time.sleep(0.5)
            if killNow:
                break
        if telemThread is not None:
            config.errorLogger(syslog.LOG_DEBUG,
                               "Waiting on the telemetry server to stop.")
            telemThread.join()
        errorLogger(syslog.LOG_ERR, "The ibm-crassd service has been stopped")
        sys.exit()
    except KeyboardInterrupt:
        config.errorLogger(syslog.LOG_DEBUG, "Terminating")
        sys.exit()
Example #26
0
def initialize():
    """
        Initializes the application by loading the nodes to monitor, getting the plugins needed, and setting up
        the forwarding through the specified plugins. Spools up the threads that will process the bmc alerts
        when a queue entry is created. Starts the telemetry server when it is enabled in the configuration,
        otherwise configures push notifications, and finally starts polling the nodes.
    """
    global csmDown
    global killNow
    global networkErrorList
    global telemThread
    csmDown = False
    killNow = False

    #The following list indicates failure to communicate to the BMC and retrieve information
    networkErrorList = config.networkErrorList

    #Setup Notifications for entities to push alerts to
    confParser = setupNotifications()

    #validate all of the needed plugins loaded
    validatePluginNotifications(confParser)
    errorLogger(syslog.LOG_INFO,
                "Node Count: {count}".format(count=len(mynodelist)))
    #check the node count to see if nodes were specified
    if len(mynodelist) < 1:
        #The node list seems short, attempt to scan for nodes that report to this service node
        autoConfigureNodes(confParser)
        updateMaxThreads(confParser)
        errorLogger(
            syslog.LOG_INFO, "Auto-configuration Node Count: {count}".format(
                count=len(mynodelist)))
    #Check for analysis scripts
    getIDstoAnalyze(confParser)
    #load last reported times from storage file to prevent duplicate entries
    loadBMCLastReports()

    baseConf = confParser['base_configuration']

    #Enable debug messages if needed
    if 'enableDebugMsgs' in baseConf:
        config.enableDebug = 'True' in baseConf['enableDebugMsgs']

    #Determine the maximum number of worker threads, defaulting to one
    maxThreads = 1
    try:
        maxThreads = int(baseConf['maxThreads'])
    except KeyError:
        errorLogger(
            syslog.LOG_ERR,
            "No section: base configuration in file ibm-crassd.config. Defaulting to one thread for polling"
        )
    except ValueError:
        #A non-numeric setting gets the same one thread fallback as a missing one
        errorLogger(
            syslog.LOG_ERR,
            "Invalid maxThreads value in file ibm-crassd.config. Defaulting to one thread for polling"
        )

    #Never use more threads than there are nodes, and always use at least one
    maxThreads = max(1, min(maxThreads, len(mynodelist)))
    minPollingInterval = getMinimumPollingInterval(maxThreads)

    #Create the worker threads
    for i in range(maxThreads):
        config.errorLogger(syslog.LOG_DEBUG, "Creating thread " + str(i))

        worker = threading.Thread(target=BMCEventProcessor)
        worker.daemon = True
        worker.start()

    reportWriter = threading.Thread(target=updateBMCLastReports)
    reportWriter.daemon = True
    reportWriter.start()

    #start TelemetryServer if enabled
    if 'telemetry_configuration' in confParser:
        if 'nodesPerGathererProcess' in confParser['telemetry_configuration']:
            config.nodespercore = int(confParser['telemetry_configuration']
                                      ['nodesPerGathererProcess'])

    telemetryEnabled = ('enableTelemetry' in baseConf
                        and baseConf['enableTelemetry'] == 'True')
    if telemetryEnabled:
        if 'telemetryPort' in baseConf:
            config.telemPort = int(baseConf['telemetryPort'])
            config.useTelem = True
        telemThread = threading.Thread(target=telemetryServer.main)
        telemThread.daemon = True
        telemThread.start()
    else:
        #subscribe to events only if no telemetry
        configurePushNotifications()

    #Setup polling interval
    pollNodes(minPollingInterval)
Example #27
0
def getBMCAlerts(node):
    """
        Gets alerts from the node's BMC and puts them into a dictionary with a common format.
        Any failure to retrieve or parse the alerts is reported as a failed poll rather than raised.

        @param node: A dictionary containing properties about a node. The keys bmcHostname,
                     xcatNodeName, username, password and accessType are read.
        @return: dictionary with common format containing alerts. On failure the dictionary is
                 {'numAlerts': 0, 'failedPoll': True}
    """
    eventsDict = {}
    bmcHostname = node['bmcHostname']
    impactednode = node['xcatNodeName']
    username = node['username']
    password = node['password']
    try:
        #get the alerts from the bmc and place in a common format
        if (node['accessType'] == "openbmcRest"):
            #use openbmctool for openbmc rest interface
            # NOTE(review): the BMC password is passed as a command line argument, which
            # is typically visible to other local users in the process list - confirm this is acceptable
            try:
                eventList = subprocess.check_output([
                    config.pyString, '/opt/ibm/ras/bin/openbmctool.py', '-H',
                    bmcHostname, '-U', username, '-P', password, '-j', '-t',
                    '/opt/ibm/ras/lib/policyTable.json', 'sel', 'print'
                ]).decode('utf-8')
            except subprocess.CalledProcessError as e:
                if e.returncode != 1:
                    # Report only the exit status: str(e) would include the full
                    # command line, and with it the password.
                    errorLogger(
                        syslog.LOG_ERR,
                        "An unknown error has occurred when retrieving bmc alerts from {hostname}. Error Details: {msg}"
                        .format(hostname=impactednode,
                                msg="exit status {code}".format(
                                    code=e.returncode)))
                    return {'numAlerts': 0, 'failedPoll': True}
                #an exit status of 1 still carries output that is parsed below
                eventList = e.output.decode('utf-8')
            jsonStart = eventList.find('{')
            if jsonStart != -1:  #check for valid response
                eventsDict = json.loads(eventList[jsonStart:])
            else:
                errorLogger(
                    syslog.LOG_ERR,
                    "An invalid response was received from bmc when requesting alerts for {hostname}"
                    .format(hostname=impactednode))
                eventsDict = {'numAlerts': 0, 'failedPoll': True}
            eventsDict = updateEventDictionary(eventsDict)
        elif (node['accessType'] == "ipmi"):
            #use java sel parser and ipmitool to get alerts from ipmi node
            eventList = subprocess.check_output([
                'java', '-jar', '/opt/ibm/ras/lib/crassd.jar', bmcHostname,
                username, password
            ]).decode('utf-8')
            jsonStart = eventList.find('{')
            if jsonStart != -1:  #check for valid response
                eventsDict = json.loads(eventList[jsonStart:])
            else:
                errorLogger(
                    syslog.LOG_ERR,
                    "An invalid response was received when retrieving bmc alerts from {hostname}. Response Details: {msg}"
                    .format(hostname=impactednode, msg=eventList))
                eventsDict = {'numAlerts': 0, 'failedPoll': True}
        else:
            #use redfish
            errorLogger(syslog.LOG_ERR, "redfish not supported")
            # Return the failure in the result itself so callers always find 'numAlerts'.
            eventsDict = {'numAlerts': 0, 'failedPoll': True}
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        config.errorLogger(
            syslog.LOG_DEBUG,
            "exception: {etype} {fname} {lineno}".format(
                etype=exc_type, fname=fname, lineno=exc_tb.tb_lineno))
        config.errorLogger(syslog.LOG_DEBUG, str(e))
        traceback.print_tb(e.__traceback__)
        eventsDict = {'numAlerts': 0, 'failedPoll': True}

    return eventsDict
Example #28
0
def BMCEventProcessor():
    """
         processes alerts and is run in child threads. Each pass takes one node from
         nodes2poll, retrieves its alerts with getBMCAlerts and forwards them through
         processAlert. The loop ends once killNow is set.
    """
    global notifyList
    global killNow
    global networkErrorList
    eventsDict = {}
    while True:
        nodeCommsLost = False
        if killNow:
            break
        else:
            node = nodes2poll.get()
            name = threading.currentThread().getName()
            bmcHostname = node['bmcHostname']
            impactednode = node['xcatNodeName']
            username = node['username']
            password = node['password']
            resetFailedNotify(bmcHostname)
            try:
                config.errorLogger(syslog.LOG_DEBUG,
                                   str(name + ": " + bmcHostname))
                #get the alerts from the bmc and place in a common format
                eventsDict = getBMCAlerts(node)

                #process the alerts
                # NOTE(review): a result with numAlerts == 0 is handled by the first branch
                # even when it also carries 'failedPoll', and the "!= 2" comparisons below do
                # not obviously match the "three or more times" wording - confirm the intent
                if (eventsDict['numAlerts'] == 0):
                    #node poll was successful and no alerts to process
                    node['pollFailedCount'] = 0
                    continue
                elif ('failedPoll' in eventsDict):
                    node['pollFailedCount'] += 1
                    if (node['pollFailedCount'] != 2):
                        #create a log entry for failing to process sel entries
                        errorLogger(
                            syslog.LOG_ERR,
                            "Failed to process BMC alerts for {host} three or more times"
                            .format(host=impactednode))
                        continue
                else:
                    #process the received alerts
                    #every key except one ('numAlerts') is expected to be an "event<i>" entry
                    for i in range(len(eventsDict) - 1):
                        if (killNow):
                            break
                        event = "event" + str(i)
                        if "error" in eventsDict[event]:
                            node['pollFailedCount'] = 0
                            #the missing key is the text following the last ": " of the error
                            begIndex = eventsDict[event]['error'].rfind(
                                ":") + 2
                            missingKey = eventsDict[event]['error'][begIndex:]
                            if (missingKey not in missingEvents.keys()):
                                with lock:
                                    missingEvents[missingKey] = True
                                errorLogger(
                                    syslog.LOG_ERR,
                                    "Event not found in lookup table for node {node}: {alert}"
                                    .format(alert=missingKey,
                                            node=impactednode))
                        else:
                            #check for failure to poll the bmc
                            if (eventsDict[event]['CerID']
                                    in networkErrorList):
                                if (nodeCommsLost == False):
                                    nodeCommsLost = True
                                    node['pollFailedCount'] += 1
                                if (node['pollFailedCount'] != 2):
                                    #forward the network connection failure at 3 consecutive failures.
                                    continue

                            #process the alerts
                            processAlert(eventsDict[event], bmcHostname,
                                         impactednode, username, password,
                                         node['accessType'])
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                #the placeholders are named, so the values must be passed by keyword
                config.errorLogger(
                    syslog.LOG_DEBUG,
                    "exception: {type} {fname} {lineNo}".format(
                        type=exc_type, fname=fname, lineNo=exc_tb.tb_lineno))
                config.errorLogger(syslog.LOG_DEBUG, str(e))
            finally:
                #every get() is matched by exactly one task_done(), including the
                #early 'continue' and the exception paths
                nodes2poll.task_done()
            eventsDict.clear()