def read_xml(self, xml_constants_element, trace_dict):
    '''Read the GEAR control XML.

    Populates this (dict-like) control object from the sub-elements of the
    gear control element.  Parse problems are reported through
    self.ruleset.parse_error.
    '''
    self.trace_id = trace_dict[xml_constants_element]
    for xml_entry in xml_constants_element:
        # Strip any XML namespace from the tag
        entry_name = xml_entry.tag.split('}')[-1]
        get_logger().debug('Processing {0}'.format(entry_name))
        ## Dual support
        if entry_name == GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS:
            # supported by both, so don't need to check
            if 'class' not in xml_entry.attrib:
                self.ruleset.parse_error(self.trace_id[0], 'default_create_alert_init_class element requires a \'class\' attribute')
            tmp_class = xml_entry.attrib['class']
            # Dynamically load the module containing the init class
            try:
                module_name, class_name = tmp_class.rsplit('.', 1)
                module = __import__(module_name, globals(), locals(), [class_name])
            except ImportError as ie:
                get_logger().error(ie)
                self.ruleset.parse_error(self.trace_id[0], 'gear control unable to load specified create alert init class: {0}'.format(tmp_class))
                raise
            self[GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS] = getattr(module, class_name)
        ## Event analysis support
        elif entry_name == GCTL_DEFAULT_EVENT_COMP:
            if not self.ruleset.event_input:
                self.ruleset.parse_error(self.trace_id[0], 'gear control element \'{0}\' is not supported for this analyzer'.format(GCTL_DEFAULT_EVENT_COMP))
            # FIX: use attrib.get() so a missing 'value' attribute is reported
            # via parse_error (as the message below intends) instead of
            # escaping as an unhandled KeyError
            self[GCTL_DEFAULT_EVENT_COMP] = xml_entry.attrib.get('value', '').strip()
            if len(self[GCTL_DEFAULT_EVENT_COMP]) == 0:
                self.ruleset.parse_error(self.trace_id[0], 'gear_control element is missing \'value\' attribute')
def parse_event(errm_env):
    '''Handle a PNSD event that was directly monitored by the node.'''
    # Only process firings from our own sensor
    rsrc_name = errm_env.get('ERRM_RSRC_NAME')
    if rsrc_name != PNSD_STAT_SENSOR:
        get_logger().warn('Unknown resource name: {0}'.format(rsrc_name))
        return
    event_id = PNSD_RETRANSMIT_THRESHOLD
    # RMC reports time as "sec,usec"; only the seconds part is used
    sec_usec = errm_env.get('ERRM_TIME', None)
    sec = long(sec_usec.split(',')[0]) if sec_usec is not None else None
    time_occurred = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(sec))
    # Build the source location; there is no reporting location for PNSD
    src = ('PNSD', 'A', '{0}##{1}'.format(errm_env.get('ERRM_NODE_NAME'), rsrc_name))
    rpt = (None, None, None)
    # The retransmit percentage travels as the raw error data, unformatted
    raw_data_fmt = 0
    raw_data = errm_env.get('ERRM_VALUE')
    log_event(event_id, time_occurred, src, rpt, raw_data_fmt, raw_data)
def match(self, event_id, src_comp, src_loc, rpt_loc, scope):
    '''Check whether this event matches the passed-in values.

    Any criterion passed as None is treated as a wildcard (always matches).
    Returns True only when every supplied criterion matches.
    '''
    try:
        # Check each criterion in turn; the first mismatch rejects the event
        if event_id is not None and event_id != self.get_event_id():
            return False
        if src_comp is not None and src_comp != self.get_src_comp():
            return False
        if src_loc is not None and not src_loc.match(self.get_src_loc(), scope):
            return False
        if rpt_loc is not None:
            # The event must actually have a reporting location to compare against
            my_rpt_loc = self.get_rpt_loc()
            if my_rpt_loc is None or not rpt_loc.match(my_rpt_loc, scope):
                return False
    except BaseException:
        # Any failure during comparison counts as "no match"
        get_logger().exception('Event {0}: Match failed'.format(self.brief_str()))
        return False
    return True
def __init__(self, name, inQueue, outQueue, config_dict=None, number=0):
    '''Construct the GEAR event analyzer and its rules engine.'''
    get_logger().debug('Creating GEAR event analyzer named {0}'.format(name))
    # Build the engine first so its checkpoint can be handed to the base class
    self.engine = engine_factory(name, config_dict, event_input=True,
                                 number=number, send_alert=self.send_alert)
    EventAnalyzer.__init__(self, name, inQueue, outQueue, config_dict, number,
                           checkpoint=self.engine.checkpoint)
    return
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
    '''Construct the analyzer for jobs ended by fatal RAS events.'''
    EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
    self.severity = "F"
    self.recommendation = "Diagnose the fatal RAS event that ended the job. "
    # Ring buffer remembering jobs that ended due to fatal errors
    self.ring = [None] * 1024
    # RAS events carrying the END_JOB control action
    self.msgIDs, self.msgidService = get_eventList()
    for msgID in self.msgIDs:
        registry.get_logger().debug('msgId = ' + msgID)
    return
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
    '''Construct the threshold-exceeded analyzer.'''
    EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
    self.severity = "W"
    self.recommendation = "Diagnose the problem that caused the threshold to be reached or exceeded. "
    self.alert_id = 'THRESH01'
    # RAS events that carry threshold counts
    self.msgIDs, self.msgidService, self.msgidCount, self.msgidPeriod = get_eventList()
    for msgID in self.msgIDs:
        registry.get_logger().debug('msgId = ' + msgID)
    eventTable = self.appendSchema('tbgqeventlog')
    # Queries for the count when no period is specified
    self.count_query = "select count(*) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
    self.count_query2 = "select sum(bigint(count)) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
    # Queries for the count when a period is specified
    self.period_query = "select count(*) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ? and event_time > (timestamp('MYTIME') - PERIOD)"
    self.period_query2 = "select sum(bigint(count)) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ? and event_time > (timestamp('MYTIME') - PERIOD)"
    # Number of consecutive periods the threshold must be exceeded, per msg id
    self.msgidConsecutivePeriods = {'0008002F': 7, '00080030': 3}
    return
def process_alert(self, alert):
    '''Send an alert via email to the requested recipients.'''
    alert_dict = alert.write_to_dictionary()
    # Build the message from the configured subject/body templates
    msg = MIMEText(self.body_template.safe_substitute(alert_dict))
    msg['Subject'] = self.subj_template.safe_substitute(alert_dict)
    msg['Date'] = alert_dict[ALERT_ATTR_CREATION_TIME].strftime('%a, %d %b %Y %H:%M:%S -0000')
    msg['From'] = self.sender
    msg['To'] = self.receivers
    try:
        # Connect to the configured SMTP server
        server = smtplib.SMTP()
        server.set_debuglevel(self.debug_level)
        server.connect(self.server)
        # Authenticate only when a user id was configured
        if self.uid:
            server.login(self.uid, self.password)
        # Deliver to every intended recipient, then disconnect
        server.sendmail(self.sender, self.receivers.split(','), msg.as_string())
        server.quit()
    except Exception as e:
        get_logger().warn("Failed to send alert({0}) via SMTP: {1}".format(alert_dict[ALERT_ATTR_REC_ID], e))
def notifyFromEventQ(self, item):
    '''Process an incoming item from the event queue and return its result.'''
    get_logger().debug('Notify called with {0}'.format(str(item)))
    # NOTE: a notify_lock acquire/release around this call was previously
    # present but disabled
    return item.process(self, None)
def _read_from_dictionary(self, in_dict): '''Set the attributes of Alert from information from a dictionary''' try: self.supresses.clear() self.condition_events.clear() for key in in_dict: value = in_dict[key] if key == ALERT_ATTR_ALERT_ID: # Alert id is set on the allocate call only # If in dictionary ignore it pass elif key == ALERT_ATTR_CREATION_TIME: self.creation_time = value elif key == ALERT_ATTR_SEVERITY: self.severity = value elif key == ALERT_ATTR_URGENCY: self.urgency = value elif key == ALERT_ATTR_EVENT_LOC: try: if value is not None: self.event_loc = Location(in_dict[ALERT_ATTR_EVENT_LOC_TYPE],value) except BaseException, e: get_logger().warning('Error processing location: value is %s', (value,)) raise elif key == ALERT_ATTR_EVENT_LOC_TYPE: pass # Used to build Location above elif key == ALERT_ATTR_EVENT_LOC_OBJECT: self.event_loc = value
def create_teal_alert(alert_id, reason, raw_data, src_name='TEAL', severity='I',
                      urgency='N', loc_instance=None,
                      recommendation='Contact next level of support', disable_dup=False):
    '''Create a TEAL alert.

    Uses the parameters to (1) build the alert initialization dictionary,
    (2) allocate the alert, (3) commit it, and (4) put it on the delivery
    queue -- steps 2-4 via the alert manager service.
    '''
    get_logger().debug('Creating {0} alert'.format(src_name))
    # Build the alert directly from the supplied information
    alert_dict = {
        ALERT_ATTR_SEVERITY: severity,
        ALERT_ATTR_URGENCY: urgency,
        ALERT_ATTR_RECOMMENDATION: recommendation,
        ALERT_ATTR_REASON: reason,
        ALERT_ATTR_RAW_DATA: raw_data,
        ALERT_ATTR_SRC_NAME: src_name,
    }
    loc_service = registry.get_service(SERVICE_LOCATION)
    alert_dict[ALERT_ATTR_EVENT_LOC_OBJECT] = loc_service.get_teal_location(loc_instance)
    registry.get_service(SERVICE_ALERT_MGR).create_and_deliver_alert(alert_id, alert_dict, disable_dup=disable_dup)
    return
def __init__(self, location_id, data):
    '''Construct a Location from its id and string representation.

    On any validation failure the instance is turned into an
    "unprocessable" location: its public methods are rebound to the
    *_UNPROCESSABLE variants and the original exception is recorded.
    '''
    if not isinstance(data, (str, unicode)):
        raise TypeError("Invalid type of Location data: {0}.".format(type(data)))
    loc_service = registry.get_service(SERVICE_LOCATION)
    try:
        self.location_info = loc_service[location_id]
        self.location_code = data.split(self.location_info.separator)
        # Location code is initialized, now validate it
        self._validate_location_code()
    except:
        # TEAL_LOCATION_VALIDATION chooses the failure policy:
        # LOG (default) records the problem and continues; IMMEDIATE re-raises
        tmp_env = os.environ.get(TEAL_LOCATION_VALIDATION, 'LOG').upper()
        if tmp_env == 'LOG':
            get_logger().exception('LOGGING Location creation failure and continuing processing')
        elif tmp_env == 'IMMEDIATE':
            raise
        # Remember what failed so it can be reported later
        self.loc_id = location_id
        self.data = data
        self.ex_type, self.ex_value = sys.exc_info()[:2]
        # Rebind the interface to the unprocessable handlers
        self.is_unprocessable = self.is_unprocessable_UNPROCESSABLE
        self.new_location_by_scope = self._UNPROCESSABLE
        self.get_comp_value = self._UNPROCESSABLE
        self.get_substitution_dict = self._UNPROCESSABLE
        self.get_location = self.get_location_UNPROCESSABLE
        self.str_impl = self.str_impl_UNPROCESSABLE
        self.match = self.match_UNPROCESSABLE
        self.get_id = self.get_id_UNPROCESSABLE
    return
def delete(self):
    '''Mark the checkpoint as deleted.

    Unregisters the checkpoint, switches the in-memory state to DELETED
    (rebinding the setters to their DELETED variants), and removes the
    row from the checkpoint table when a database is in use.
    '''
    if self.checkpoint_mgr.use_db == True:
        with self.lock:
            self.changed = False  # Stop any updates
            self.checkpoint_mgr.unregister_event_checkpoint(self)
    else:
        self.checkpoint_mgr.unregister_event_checkpoint(self)
    # Change in memory values
    self.status = CHECKPOINT_STATUS_DELETED
    self.start_rec_id = None
    self.data = None
    self.starting_cb = None
    self.set_status = self.set_status_DELETED
    self.set_checkpoint = self.set_checkpoint_DELETED
    # Delete from DB
    if self.checkpoint_mgr.use_db == True:
        try:
            dbi = get_service(SERVICE_DB_INTERFACE)
            cnxn = dbi.get_connection()
            cursor = cnxn.cursor()
            # NOTE(review): the '$' prefix in the where clause is assumed to be
            # a placeholder the dbi layer substitutes via where_fields -- confirm
            dbi.delete(cursor, db_interface.TABLE_CHECKPOINT,
                       where='${0} = ?'.format(EVENT_CPF_NAME),
                       where_fields=[EVENT_CPF_NAME],
                       # FIX: (self.name) is just a string; a one-element tuple
                       # is required so the name binds as a single parameter
                       parms=(self.name,))
            cnxn.commit()
            cnxn.close()
        except:
            get_logger().exception('Unable to delete event checkpoint named {0}'.format(self.name))
            raise
    return
def perform_sparing(self, rec_id, location, rawdata):
    '''Log a RAS event requesting that the BQL lane be spared.'''
    # Extract the mask from the message details (default when absent)
    mask = '0x000'
    mindex = rawdata.find('Mask=')
    if mindex >= 0:
        mindex += len('Mask=')
        mask = rawdata[mindex:mindex + 5]
    # Extract the register from the message details (default when absent)
    register = 'C23'
    rindex = rawdata.find('Register=')
    if rindex >= 0:
        rindex += len('Register=')
        register = rawdata[rindex:rindex + 3]
    # Build the mc_server_log_ras invocation that requests the sparing
    command = [
        '/bgsys/drivers/ppcfloor/sbin/mc_server_log_ras',
        '--location', location,
        '--message-id', '0x0009020D',
        '--action', 'BQL_SPARE',
        '--detail', 'Mask=' + mask,
        '--detail', 'Register=' + register,
        '--detail', 'Submitter=TEAL',
        '--detail', 'Associated_Rec_Id=' + str(rec_id),
    ]
    registry.get_logger().debug(command)
    subprocess.call(command)
    return
def get_generator(self, config_dict):
    """Return the appropriate SQL generator for the xCAT configuration."""
    # Location of the xCAT DB configuration; a test prefix may rename the file
    DB_CONF_PATH = "{0}/xcat".format(get_service(TEAL_CONF_DIR))
    prefix = os.environ.get(TEAL_TEST_XCAT_CFGLOG_PREFIX, "")
    DB_CONF_FILE = "{0}cfgloc".format(prefix)
    # Point the shared db_interface module at the xCAT table names
    db_interface.TABLE_EVENT_LOG = "x_tealeventlog"
    db_interface.TABLE_CHECKPOINT = "x_tealcheckpoint"
    db_interface.TABLE_ALERT_LOG = "x_tealalertlog"
    db_interface.TABLE_ALERT2ALERT = "x_tealalert2alert"
    db_interface.TABLE_ALERT2EVENT = "x_tealalert2event"
    db_interface.TABLE_TEMPLATE = "x_{0}"
    # Well-known path to the information
    ds_file = "{0}/{1}".format(DB_CONF_PATH, DB_CONF_FILE)
    get_logger().debug("DB Configuration: {0}".format(ds_file))
    try:
        conf_file = open(ds_file, "r")
    except IOError as e:
        get_logger().error("Unable to open DB configuration file. {0}".format(e))
        raise
def __init__(self, config_dict, ruleset):
    '''Initialize pool control values from the configuration dictionary.'''
    dict.__init__(self)
    self.ruleset = ruleset
    self.trace_id = (0, 'poolcontrol')
    # Initial pool duration (None when not configured)
    if config_dict is None or 'initial_pool_duration' not in config_dict:
        self[GPCL_INIT_DURATION] = None
    else:
        self[GPCL_INIT_DURATION] = int(config_dict['initial_pool_duration'])
    # Maximum pool duration (None when not configured)
    if config_dict is None or 'max_pool_duration' not in config_dict:
        self[GPCL_MAX_DURATION] = None
    else:
        self[GPCL_MAX_DURATION] = int(config_dict['max_pool_duration'])
    # An environment variable overrides any configured arrival-rate extension
    env_to_check = ENV_ARRIVAL_RATE_EXTENSION.format(self.ruleset.name.upper())
    env_value = os.environ.get(env_to_check, None)
    if env_value is not None:
        # Value is 'window_min,window_max,arrival_rate,extension'
        tarel = [int(v) for v in env_value.split(',')]
        self[GPCL_ARRIVAL_RATE_EXTENSION] = ArrivalCheckCtl(window_min=tarel[0], window_max=tarel[1], arrival_rate=tarel[2], extension=tarel[3])
        get_logger().warning('Arrival Rate extension overridden using environment variable {0} with value {1}'.format(env_to_check, str(self[GPCL_ARRIVAL_RATE_EXTENSION])))
    elif config_dict is None or CFG_ARRIVAL_RATE_EXTENSION not in config_dict:
        self[GPCL_ARRIVAL_RATE_EXTENSION] = None
    else:
        tarel = [int(v) for v in config_dict[CFG_ARRIVAL_RATE_EXTENSION].split(',')]
        self[GPCL_ARRIVAL_RATE_EXTENSION] = ArrivalCheckCtl(window_min=tarel[0], window_max=tarel[1], arrival_rate=tarel[2], extension=tarel[3])
        get_logger().debug('Arrival Rate extension overridden in config with value {0}'.format(str(self[GPCL_ARRIVAL_RATE_EXTENSION])))
    return
def handle_batch_event(errm_env, remote):
    '''Process batch events recorded on a remote system.

    Local (non-remote) invocations are a no-op.
    '''
    if not remote:
        return
    try:
        # ERRM_VALUE looks like '[..,<filename>,..]'; pull out the file name
        remote_details = errm_env['ERRM_VALUE'].strip()
        m = re.match(r'\[.*,(.*),.*\]', remote_details)
        rmt_filename = m.group(1).strip()
        # Copy the event log over so it can be handled locally
        rmt_host = errm_env['ERRM_NODE_NAME']
        rmt_file = 'hscroot@{0}:{1}'.format(rmt_host, rmt_filename)
        lcl_file = '/tmp/{0}_{1}'.format(rmt_host, os.path.basename(rmt_filename))
        subprocess.check_call(['/usr/bin/scp', rmt_file, lcl_file])
        # Parse the events that were saved on the remote system
        parse_batch_file(lcl_file)
        # All done processing events, so remove the local copy
        os.remove(lcl_file)
    except subprocess.CalledProcessError as cpe:
        get_logger().error("Failed to copy batch file: {0}".format(cpe))
    except OSError as ose:
        get_logger().error("Failed to process batch file: {0}".format(ose))
def perform_sparing(self, rec_id, location, rawdata):
    '''Log a RAS event requesting that the BQL lane be spared.'''
    # Extract the mask from the message details (default when absent)
    mask = '0x000'
    mindex = rawdata.find('Mask=')
    if mindex >= 0:
        mindex += len('Mask=')
        mask = rawdata[mindex:mindex + 5]
    # Extract the register from the message details (default when absent)
    register = 'C23'
    rindex = rawdata.find('Register=')
    if rindex >= 0:
        rindex += len('Register=')
        register = rawdata[rindex:rindex + 3]
    # Build the mc_server_log_ras invocation that requests the sparing
    command = [
        '/bgsys/drivers/ppcfloor/sbin/mc_server_log_ras',
        '--location', location,
        '--message-id', '0x0009020D',
        '--action', 'BQL_SPARE',
        '--detail', 'Mask=' + mask,
        '--detail', 'Register=' + register,
        '--detail', 'Submitter=TEAL',
        '--detail', 'Associated_Rec_Id=' + str(rec_id),
    ]
    registry.get_logger().debug(command)
    subprocess.call(command)
    return
def close_event(errm_env, event_data):
    '''Find the alert associated with a closed event and close it.

    Only alerts for which the closed event is the one and only associated
    event are closed; duplicates of those alerts are closed as well.
    '''
    # Find the matching event in the event log
    event_rec_id = find_logged_event(errm_env, event_data)
    if event_rec_id is None:
        # No event was found, so there is nothing more to do
        return
    # Find alerts for which this event is the only associated event
    alert_recids = find_logged_alerts(event_rec_id)
    if len(alert_recids) == 0:
        # Either already closed, or never logged (this connector may not
        # have been listening when the event originally arrived)
        return
    # Close this alert and any alerts that were duplicates of it
    a_mgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    for alert_recid in alert_recids:
        try:
            a_mgr.close(alert_recid)
        except alert_mgr.AlertMgrError as ame:
            get_logger().warn('Failed to close alert({0}) associated to event ({1}): {2}'.format(alert_recid, event_rec_id, ame))
def __init__(self, config_dict):
    '''Construct the realtime monitor from its configuration.'''
    # This monitor only makes sense in realtime mode
    if config_dict['enabled'] != 'realtime':
        raise ConfigurationError('Realtime monitor can only enabled for realtime use. Unsupported value specified: {0}'.format(config_dict['enabled']))
    # Checkpoint update frequency may be overridden via the environment
    temp_frequency = os.environ.get(TEAL_UPDATE_CHECKPOINT_FREQUENCY, None)
    if temp_frequency is None:
        self.update_checkpoint_frequency = DEFAULT_UPDATE_CHECKPOINT_FREQUENCY
    else:
        try:
            self.update_checkpoint_frequency = long(temp_frequency)
        except:
            get_logger().warning('Environment variable \'{0}\' was invalid: \'{1}\'. Default value used'.format(TEAL_UPDATE_CHECKPOINT_FREQUENCY, str(temp_frequency)))
            self.update_checkpoint_frequency = DEFAULT_UPDATE_CHECKPOINT_FREQUENCY
    # The notifier class comes from a test env var or the configuration
    cfg_notifier = os.environ.get(TEAL_TEST_NOTIFIER_CONFIG, None)
    if cfg_notifier is None:
        if CFG_KEY_NOTIFIER not in config_dict:
            raise ConfigurationError('RealtimeMonitor requires notifier be specified in the configuration file or as an environment variable')
        cfg_notifier = config_dict[CFG_KEY_NOTIFIER]
    # Dynamically load the notifier class's module
    try:
        module_name, class_name = cfg_notifier.rsplit('.', 1)
        module = __import__(module_name, globals(), locals(), [class_name])
    except ImportError as ie:
        get_logger().error(ie)
        raise  # throw the ImportError up the chain
def _log_event(self, db, cursor, ll_event):
    '''Log an LL event into the TEAL event log.

    The LL event combines common event data and LL-specific data; the
    common part goes to the event log and the rest to the extended table.
    '''
    # Translate the event into TEAL format and insert it
    teal_event = self._translate_event(ll_event)
    db.insert(cursor, LL_TEAL_COLS, db_interface.TABLE_EVENT_LOG, teal_event)
    # Rules assume a detail is provided, so substitute an empty string
    detail = ll_event[LL_EVENT_COL_DETAIL]
    if detail is None:
        detail = ''
    # Now add the LL extended data keyed to the logged event
    ll_extended_data = [ll_event[LL_EVENT_COL_TIME_OCCURRED],
                        ll_event[LL_EVENT_COL_TIME_LOGGED],
                        ll_event[LL_EVENT_COL_MSG_TYPE],
                        ll_event[LL_EVENT_COL_MESSAGE],
                        detail]
    db.insert_dependent(cursor, LL_TEAL_EXTENDED_PK, LL_TEAL_EXTENDED_COLS, LL_TEAL_EXTDATA_TABLE, ll_extended_data)
    registry.get_logger().debug("Logged event [{0},{1},{2}]".format(ll_event[LL_EVENT_COL_TIME_OCCURRED], ll_event[LL_EVENT_COL_EVENT_ID], ll_event[LL_EVENT_COL_NODE].strip()))
def resolve_and_validate(self): '''Resolve and validate the evaluatable''' self.src_name = self.ruleset.name try: # OK to set Rule to None, since not using any of the support the requires resolve_and_validate_rule_values(self, self.ruleset, None, GEHD_HANDLER_ALERT, rule_part=GRUL_PART_ACTION) except XMLParsingError as e: self.ruleset.parse_error(self.trace_id[0], '\'on_error\' element {0}'.format(e.msg)) self.trace_id = (self.trace_id[0], self.trace_id[1] + '-' + self.type.get_value()) # Process init class if self.init_class.is_set() == False: self.init_class_callable = self.ruleset['gear_control'][GCTL_DEFAULT_CREATE_ALERT_INIT_CLASS] else: try: module_name, class_name = self.init_class.get_value().rsplit('.', 1) module = __import__(module_name, globals(), locals(), [class_name]) except ImportError, ie: get_logger().error(ie) self.ruleset.parse_error(self.trace_id[0], 'gear create alert unable to load specified init class: {0}'.format(self.init_class)) raise ie self.init_class_callable = getattr(module, class_name) tmp_instance = self.init_class_callable() if isinstance(tmp_instance, ExtInitAlert) == False:
def flush(self, flush_time):
    '''Flush the ruleset's event pool.'''
    try:
        self.event_pool.flush(flush_time)
    except IncidentPoolStateTransitionError:
        # The pool was already closed; nothing to flush
        get_logger().info('Tried to Flush a closed pool')
    return
def get_active_sections(self, area, runmode=None, name_required=True, singleton=False):
    '''Return a sorted list of (section_key, name) tuples for active
    sections in the specified area.
    '''
    results = []
    for section in self.sections():
        # A section key is '<config_area>' or '<config_area>.<entry_name>'
        parts = section.split('.', 1)
        if parts[0] != area:
            continue
        # Honor the 'enabled' option relative to the current run mode
        if runmode is not None and self.has_option(section, 'enabled'):
            enabled_val = self.get(section, 'enabled')
            if enabled_val != 'all':
                if (enabled_val == 'false'
                        or (enabled_val == 'realtime' and runmode != RUN_MODE_REALTIME)
                        or (enabled_val == 'historic' and runmode != RUN_MODE_HISTORIC)):
                    get_logger().debug('Skipping section \'{0}\' with enabled set to \'{1}\''.format(section, enabled_val))
                    continue
                elif enabled_val not in ['realtime', 'historic']:
                    raise ConfigurationError('Configuration section \'{0}\' has an unrecognized value for enabled keyword: \'{1}\''.format(section, enabled_val))
        # A bare area key has no name; allow it only when names are optional
        if len(parts) == 1:
            if name_required == True:
                raise ConfigurationError('Configuration sections for \'{0}\' must have a name, but none was specified'.format(area))
            parts.append(None)
        if singleton == True and len(results) == 1:
            raise ConfigurationError('There can only be one section called \'{0}\''.format(area))
        results.append((section, parts[1]))
    results.sort()
    return results
def _connect_to_hmc(self):
    '''Connect to the HMC, preferring the address that was used last.'''
    # Try the currently-preferred address first, then fall back to the other
    if self.hmc_using_addr == self.hmc_primary_addr:
        candidates = [(self.hmc_primary_addr, 'primary'),
                      (self.hmc_backup_addr, 'secondary')]
    else:
        candidates = [(self.hmc_backup_addr, 'secondary'),
                      (self.hmc_primary_addr, 'primary')]
    for addr, label in candidates:
        if self._try_to_connect(addr) == True:
            self.hmc_connected = True
            self.hmc_using_addr = addr
            get_logger().info('Connected to {0} HMC'.format(label))
            return
    # Neither address worked; start with the primary next time
    get_logger().info('Unable to connect to HMC')
    self.hmc_using_addr = self.hmc_primary_addr
    return
def process_alert(self, alert):
    '''Queue the alert for conversion to a service focal point log.'''
    get_logger().debug('In cnm_alert_listener')
    # Blocking put; a consumer forwards the alert to the service focal point
    self.queue.put(alert, True)
    return
def read_xml(self, xml_element, trace_dict):
    '''Read the pool control XML element.'''
    self.trace_id = trace_dict[xml_element]
    entry_found = False
    for xml_entry in xml_element:
        # Strip any XML namespace from the tag
        entry_name = xml_entry.tag.split('}')[-1]
        get_logger().debug('Processing {0}'.format(entry_name))
        if entry_name == GPCL_INIT_DURATION:
            entry_found = True
            try:
                self[GPCL_INIT_DURATION] = self._process_duration_xml(xml_entry, self[GPCL_INIT_DURATION])
            except XMLParsingError as e:
                self.ruleset.parse_error(self.trace_id[0], 'pool control initial duration error: {0}'.format(e.msg))
        elif entry_name == GPCL_MAX_DURATION:
            entry_found = True
            try:
                self[GPCL_MAX_DURATION] = self._process_duration_xml(xml_entry, self[GPCL_MAX_DURATION])
            except XMLParsingError as e:
                self.ruleset.parse_error(self.trace_id[0], 'pool control max duration error: {0}'.format(e.msg))
        elif entry_name == GPCL_ARRIVAL_RATE_EXTENSION:
            entry_found = True
            # An already-set value (environment override) takes precedence
            if self[GPCL_ARRIVAL_RATE_EXTENSION] is None:
                self._process_arrival_rate_extension(xml_entry)
        else:
            self.ruleset.parse_error(self.trace_id[0], 'pool control encountered an unexpected element \'{0}\''.format(entry_name))
    if not entry_found:
        self.ruleset.parse_error(self.trace_id[0], 'pool control element must have at least one sub-element specified')
    return
def read_from_xml(self, xml_templates_element, trace_dict):
    '''Add template info defined in an XML templates element.'''
    self.trace_id = trace_dict[xml_templates_element]
    for template_entry in xml_templates_element:
        template_type = template_entry.tag.split('}')[-1]
        # Condition templates are the only kind currently supported
        if template_type != GTPL_CONDITION:
            self.context.parse_error(self.trace_id[0], '\'templates\' element does not support the sub-element \'{0}\''.format(template_type))
        # Process the attributes; only 'name' is recognized
        name = None
        for att_key in template_entry.attrib:
            att_value = template_entry.attrib[att_key]
            if att_key == 'name':
                name = att_value.strip()
            else:
                # NOTE(review): other errors here go through self.context;
                # this one uses self.ruleset -- confirm which is intended
                self.ruleset.parse_error(self.trace_id[0], '\'condition_template\' element encountered an unexpected attribute \'{0}\''.format(att_key))
        # Name was required
        if name is None:
            self.context.parse_error(self.trace_id[0], '\'condition_template\' element requires \'name\' attribute')
        get_logger().debug('Condition template defined with name {0}'.format(name))
        # A template must wrap exactly one contained element
        if len(template_entry) < 1:
            self.context.parse_error(self.trace_id[0], 'template must contain one and only one sub-element')
        # Put in the template dictionary
        self[GTPL_CONDITION][name] = template_entry[0]
    return
def alert_not_analyzed_callback(self, alert):
    '''Route an alert that no analyzer handled on to the delivery queue.'''
    if isinstance(alert, Alert):
        get_logger().debug('Alert {0} was not analyzed in Alert Analysis Queue -- put in Delivery Queue'.format(alert.brief_str()))
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put_nowait(alert)
    else:
        # Non-Alert items are commands already handled by the queue itself
        get_logger().debug('Command {0} was processed by the Alert Analysis Queue'.format(alert.brief_str()))
def location_error(self, event, location):
    '''Handle a location error for an event.'''
    if self.loc_handler is None:
        # With no handler configured, let the location raise its recorded
        # failure (presumably _UNPROCESSABLE re-raises -- so the call below
        # is never reached on this path)
        get_logger().debug('No location error handler. Raise the exception')
        location._UNPROCESSABLE()
    self.loc_handler.create_alert(event, location)
def handle_event(errm_env):
    '''Parse and log an event retrieved from the HMC.'''
    event_data = parse_event(errm_env['ERRM_VALUE'])
    if event_filtered(errm_env, event_data):
        return
    event_type = event_data[EVENT_TYPE]
    if event_type == EVENT_TYPE_OPEN:
        # New event -- tell TEAL about it
        log_event(errm_env, event_data)
    elif event_type == EVENT_TYPE_CLOSED:
        get_logger().info('{0}:{1} - {2}'.format(event_data[EVENT_PROB_NUM], event_data[EVENT_REFCODE], event_type))
        close_event(errm_env, event_data)
    elif event_type == EVENT_TYPE_CHANGED:
        # Only changes that carry a valid problem number are of interest
        if EVENT_PROB_NUM in event_data:
            # If this event has not been logged, the initial logging was
            # missed or no open event was ever seen (which can occur)
            if find_logged_event(errm_env, event_data) is None:
                log_event(errm_env, event_data)
    else:
        # Other event status changes are not operated on
        get_logger().warn('{0}:{1} - {2}'.format(event_data[EVENT_PROB_NUM], event_data[EVENT_REFCODE], event_type))
def get_generator(self, config_dict):
    '''Return the appropriate SQL generator based on the configuration
    retrieved from the bg.properties file.
    '''
    # Locate the properties file: caller-specified, then environment
    # variables, then the well-known default
    properties_file = config_dict.get('bgproperties', None)
    if not properties_file:
        properties_file = os.environ.get('PROPERTIES_FILE', None)
    if not properties_file:
        properties_file = os.environ.get('BG_PROPERTIES_FILE', None)
    if not properties_file:
        properties_file = BG_PROPERTIES_FILE
    cfg = ConfigParser.ConfigParser()
    cfg.readfp(BgqPropertiesFile(properties_file))
    # RAS filter info (with a floor default when unspecified)
    try:
        rasFilter = cfg.get('ras', 'filter')
    except ConfigParser.NoOptionError:
        rasFilter = '/bgsys/drivers/ppcfloor/ras/etc/ras_environment_filter.xml'
    registry.get_logger().debug('RAS Environment filter file: ' + rasFilter)
    registry.register_service('BGQ_RAS_FILTER', rasFilter)
    config_service = ConfigService()
    registry.register_service('BGQ_CONFIG_SERVICE', config_service)
    # Database connection info; schema is optional
    db = cfg.get('database', 'name')
    try:
        usr_schema = cfg.get('database', 'schema_name') + '.'
    except ConfigParser.NoOptionError:
        usr_schema = ''
    # Credentials are optional -- absence means passwordless connect
    pwless = False
    try:
        usr = cfg.get('database', 'user')
        pw = cfg.get('database', 'password')
    except ConfigParser.NoOptionError:
        registry.get_logger().debug('Database user and/or password is not specified.')
        pwless = True
    # Point the shared db_interface module at the (schema-qualified) tables
    db_interface.TABLE_EVENT_LOG = usr_schema + 'x_tealeventlog'
    db_interface.TABLE_EVENT_LOG_EXT = usr_schema + 'x_tealeventlogext'
    db_interface.TABLE_BG_EVENT_LOG = usr_schema + 'tbgqeventlog'
    db_interface.TABLE_CHECKPOINT = usr_schema + 'x_tealcheckpoint'
    db_interface.TABLE_ALERT_LOG = usr_schema + 'x_tealalertlog'
    db_interface.TABLE_ALERT2ALERT = usr_schema + 'x_tealalert2alert'
    db_interface.TABLE_ALERT2EVENT = usr_schema + 'x_tealalert2event'
    db_interface.TABLE_TEMPLATE = usr_schema + 'x_{0}'
    if pwless:
        return SQLGeneratorDB2({'dsn': db})
    else:
        return SQLGeneratorDB2({'dsn': db, 'uid': usr, 'pwd': pw})
def cancel(self):
    '''Cancel the timer and drop the callback reference.'''
    get_logger().debug('Canceling timer')
    if self.my_timer is not None:
        self.my_timer.cancel()
    # Release the callback so it cannot fire and cannot keep objects alive
    self.callback = None
    return
def _periodic_monitor(self):
    '''Run one polling iteration of the monitor thread.'''
    registry.get_logger().debug("in periodic monitor " + str(self.last_processed_event))
    # Wait for the next polling iteration
    time.sleep(self.poll_interval)
    # Refresh the high-water mark, then log anything newer
    self._get_last_processed_event()
    self._query_and_log_event(">", self.last_processed_event)
def will_analyze_event(self, event):
    '''Return True when this analyzer handles the given BQL event.'''
    event_id = event.get_event_id()
    matched = event_id in self.msgIDs
    if matched:
        registry.get_logger().debug('matched event id ' + event_id + ' ' + str(event.get_rec_id()))
    else:
        registry.get_logger().debug('not matched ' + event_id + ' ' + str(event.get_rec_id()))
    return matched
def get_excludeMsgIDs():
    '''Get the list of excluded message IDs.

    Returns the configured exclude-ids string, or '' when the option is
    missing or unreadable.
    '''
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    excludeMsgIDs = ''
    try:
        excludeMsgIDs = cfg.get(BGQ_TEAL_THRESHOLD_ANALYZER, BGQ_TEAL_THRESHOLD_EXCLUDE_IDS)
        registry.get_logger().debug('Exclude List = ' + excludeMsgIDs)
    except Exception as e:
        # Missing option is not an error; just note it and use the default
        registry.get_logger().debug(e)
    # FIX: the computed value was never returned despite the getter name
    return excludeMsgIDs
def _configure(self):
    '''Set the polling interval from the BGQ connector configuration.

    Falls back to BGQ_DEFAULT_POLL_INTERVAL when the option is missing,
    non-numeric, or not greater than zero.
    '''
    cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
    try:
        value = cfg.get(BGQ_TEAL_CONFIG, BGQ_TEAL_CONFIG_POLL_INTERVAL)
        self.poll_interval = int(value)
        if self.poll_interval <= 0:
            registry.get_logger().error('The value ' + str(self.poll_interval) + ' specified in the poll interval is not valid. The value must be greater than zero.')
            # FIX: was a bare 'raise' with no active exception (which only
            # "worked" because the handler swallowed the resulting error);
            # raise a real exception to trigger the fallback deliberately
            raise ValueError('poll interval must be greater than zero')
    except Exception:
        # FIX: narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed
        registry.get_logger().warn('Configuring poll interval to default {0} seconds'.format(BGQ_DEFAULT_POLL_INTERVAL))
        self.poll_interval = BGQ_DEFAULT_POLL_INTERVAL
def rt_callback(self, recid, msgid):
    '''Realtime callback for a newly logged RAS event.'''
    # On the first callback, catch up on any RAS events that occurred
    # before the real-time server could be started.  last_processed_event
    # is assumed current from the previous call to query_and_log_event.
    if self.first_realtime_event:
        self._query_and_log_event(">", self.last_processed_event, recid)
        self.first_realtime_event = False
    registry.get_logger().debug("in rt_callback " + str(recid) + " " + msgid)
    # Process exactly the event that triggered this callback
    self._query_and_log_event("=", recid)
    return
def _query_and_log_event(self, query_sign, recid, max_recid=0):
    ''' Query the BG event log for new events and log into TEAL '''
    # query_sign: SQL comparison operator ('>' or '=') applied to RECID.
    # recid: record id the comparison is made against.
    # max_recid: exclusive upper bound on recids to process; 0 means no bound.
    registry.get_logger().debug("in _query_and_log_event")
    event_logged = False
    db = registry.get_service(registry.SERVICE_DB_INTERFACE)
    cnxn = db.get_connection()
    bgq_cursor = cnxn.cursor()
    teal_cursor = cnxn.cursor()
    # Query the BG event log for new events
    bgEvent_query = "SELECT RECID, CATEGORY, COMPONENT, JOBID, BLOCK, LOCATION, MSG_ID FROM " + db_interface.TABLE_BG_EVENT_LOG + " WHERE RECID " + query_sign + " ? ORDER BY RECID ASC"
    bgq_cursor.execute(bgEvent_query, recid)
    commit_count = 0
    for bg_event in next_row(bgq_cursor):
        # Don't process events with recids >= max_recid, if it is nonzero
        if max_recid > 0 and bg_event[0] >= max_recid:
            break
        # Log only events we are interested in (column 6 is MSG_ID)
        if bg_event[6] in self.msgIDs:
            event_logged = True
            # Log the event into TEAL
            self._log_event(bg_event, teal_cursor)
            # Commit every so often to limit the transaction size
            commit_count += 1
            if commit_count == COMMIT_LIMIT:
                cnxn.commit()
                commit_count = 0
        else:
            registry.get_logger().debug('ignore msgid ' + bg_event[6])
        # Update the 'cursor' into the BGQ database (column 0 is RECID)
        self.last_processed_event = bg_event[0]
    # Notify TEAL that events have been inserted
    if (event_logged):
        registry.get_logger().debug("event to log " + str(event_logged))
        cnxn.commit()
        if self.notifier:
            self.notifier.post()
        else:
            registry.get_logger().warn('TEAL notifier not configured.')
    cnxn.close()
    registry.get_logger().debug("exit _query_and_log_event")
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
    '''The constructor. '''
    EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
    self.severity = "F"
    self.recommendation = "Diagnose hardware that has been placed in an error state. "
    # Get the list of ras events with 'hardware in error' control actions
    self.msgIDs, self.msgidService = get_eventList()
    log = registry.get_logger()
    for msg_id in self.msgIDs:
        log.debug('msgId = ' + msg_id)
    return
def has_duplicate(self, alert_time, query, cursor):
    ''' Query alerts for the same location. '''
    # Substitute the alert time and window into the query template.
    filled_query = query.replace('ALERT_TIME', alert_time).replace('WINDOW', self.window_time)
    registry.get_logger().info('Duplicate query: %s', filled_query)
    cursor.execute(filled_query)
    # Any returned row means a duplicate alert exists.
    row = cursor.fetchone()
    return bool(row)
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None): '''The constructor. ''' EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint) self.severity = "W" self.recommendation = '''Schedule service to isolate the BQL issue. Possible causes are environmental, cable, or a board. Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly. Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable. The cable and board can be cleaned with an optics cleaning tool. Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.''' self.alert_id = 'BQL01' # Get the exclude list of message IDs cfg = registry.get_service(registry.SERVICE_CONFIGURATION) excludeList = '' try: excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs) registry.get_logger().debug('Exclude List = ' + excludeList) except Exception, e: registry.get_logger().debug(e)
def get_eventList():
    '''Get the list of RAS events with threshold count.

    Returns a 4-tuple (msgIDs, msgidService, msgidCount, msgidPeriod):
    msgIDs       -- list of message ids (stripped) that carry a threshold count
    msgidService -- msg_id -> service action / diagnostics text
    msgidCount   -- msg_id -> threshold count
    msgidPeriod  -- msg_id -> threshold period string
    '''
    # Get the exclude list of message IDs from the configuration file
    excludeMsgList = get_excludeMsgIDs()
    # Search the tbgqmsgtypes for ras events that have threshold counts
    # Note: do not include ras events already handled by HardwareInError and JobFatal analyzers.
    schema = str(db_interface.TABLE_TEMPLATE).split('.')
    msgtypesTable = schema[0] + '.' + 'tbgqmsgtypes'
    count_query = "select msg_id, thresholdcount, svcaction, relevantdiags from " + msgtypesTable + " where thresholdcount is not NULL and (ctlaction is NULL or (ctlaction not like '%END_JOB%' and (ctlaction not like '%_IN_ERROR%' or ctlaction like '%SOFTWARE_IN_ERROR%')))"
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    cursor.execute(count_query)
    rows = cursor.fetchall()
    msgIDs = list()
    msgidCount = dict()
    msgidService = dict()
    for r in rows:
        # CHAR columns are blank-padded, so always strip the key.
        msgid = r[0].strip()
        if excludeMsgList.find(msgid) >= 0:
            registry.get_logger().debug(' excluding ' + msgid)
            continue
        msgIDs.append(msgid)
        msgidCount[msgid] = r[1]
        sa = 'Service action: '
        if r[2]:
            sa += r[2].strip()
        else:
            sa += "None."
        if r[3]:
            sa += ' Relevant diagnostic bucket(s): ' + r[3].strip()
        msgidService[msgid] = sa
    # Search the tbgqmsgtypes for ras events that have threshold period
    period_query = "select msg_id, thresholdperiod from " + msgtypesTable + " where thresholdperiod is not NULL"
    cursor.execute(period_query)
    rows = cursor.fetchall()
    msgidPeriod = dict()
    for r in rows:
        # BUG FIX: strip the msg_id key so it matches the stripped keys used
        # for msgidCount/msgidService above; unstripped blank-padded keys
        # made later msgidPeriod[msg_id] lookups miss.
        msgidPeriod[r[0].strip()] = r[1]
    return msgIDs, msgidService, msgidCount, msgidPeriod
def has_matching_blockId(self, block, alert_time, cursor):
    ''' Query alerts for the associated event with the same block id '''
    # Build the query from the template: {0}=alert time, {1}=window, {2}=block id.
    query = self.same_block_query_str.format(alert_time, self.window_time, block)
    registry.get_logger().info('Same block query: %s', query)
    cursor.execute(query)
    result_row = cursor.fetchone()
    # A nonzero count in the first column means at least one alert shares
    # this block id.
    return result_row is not None and len(result_row) > 0 and result_row[0] > 0
def _get_last_processed_event(self):
    ''' Record the highest event recid already injected into TEAL. '''
    dbi = registry.get_service(registry.SERVICE_DB_INTERFACE)
    cnxn = dbi.get_connection()
    cursor = cnxn.cursor()
    # Find the last event injected into TEAL so everything after it can
    # be injected by the caller.
    maxEvent_query = "SELECT MAX(REC_ID) FROM " + db_interface.TABLE_EVENT_LOG_EXT
    cursor.execute(maxEvent_query)
    max_id = cursor.fetchone()[0]
    # MAX() is NULL on an empty table; treat that as record 0.
    self.last_processed_event = 0 if max_id is None else max_id
    registry.get_logger().info('Last Processed Event = ' + str(self.last_processed_event))
    cnxn.close()
def delta_period(self, period):
    '''Convert a period string such as '7 DAYS' into a datetime.timedelta.

    period -- string of the form '<digits> <units>' (e.g. '30 SECONDS').
    Units are matched case-insensitively, singular or plural.
    Returns the equivalent timedelta, or None (after logging an error)
    when the string cannot be converted.
    '''
    pindex = period.find(' ')
    if pindex == -1:
        registry.get_logger().error("No timedelta possible for " + period)
        return None
    pdigit = period[0:pindex]
    # Normalize the unit: any case, with or without a trailing 's'.
    punits = period[(pindex + 1):].strip().lower()
    if punits.endswith('s'):
        punits = punits[:-1]
    unit_map = {
        # BUG FIX: timedelta() has no 'months' keyword -- the original
        # timedelta(months=...) call raised TypeError.  A month is
        # approximated as 30 days here.
        'month': ('days', 30),
        'week': ('days', 7),
        'day': ('days', 1),
        'hour': ('hours', 1),
        'minute': ('minutes', 1),
        'second': ('seconds', 1),
        'microsecond': ('microseconds', 1),
    }
    try:
        keyword, scale = unit_map[punits]
        return timedelta(**{keyword: int(pdigit) * scale})
    except (KeyError, ValueError):
        # Unknown unit or non-numeric digits.
        registry.get_logger().error("No timedelta possible for " + period)
        return None
def send_common_alert(self, loc, cur_alert_recid, event, alert_time, dup_query, cursor): ''' Send an alert for the common location. ''' # Close current alert prior to creating a new common alert registry.get_logger().info('Closing current alert recid %d prior to creating a common mode alert', cur_alert_recid) registry.get_service(SERVICE_ALERT_MGR).close(cur_alert_recid) # Get the location loc_name = self.get_loc_name(loc) loc_type = loc.get_id() loc_parent, loc_parent_list = self.get_loc_parent(loc) loc_parent_object = Location(loc_type, loc_parent) # Removed the duplicate check that was here -- this has already been determined # Fill in alert info reason = self.reason.replace('LOC_NAME', loc_name) reason = reason.replace('LOC_PARENT', loc_parent) recommendation = self.recommendation.replace('LOC_PARENT', loc_parent) alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity, alert.ALERT_ATTR_URGENCY:'I', alert.ALERT_ATTR_EVENT_LOC_OBJECT:loc_parent_object, alert.ALERT_ATTR_RECOMMENDATION:recommendation, alert.ALERT_ATTR_REASON:reason, alert.ALERT_ATTR_RAW_DATA:'No raw data', alert.ALERT_ATTR_SRC_NAME:self.get_name(), alert.ALERT_ATTR_CONDITION_EVENTS:set((event,)) } # Get the alert manager to create/allocate/commit the alert alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR) bg_alert = alertMgr.allocate(self.alertId, in_dict=alert_dict) alertMgr.commit(bg_alert, disable_dup=False) # Now the alert is created, need to put it in the queue so that it can be analyzed # by alert analyzer (instead of sending it - send_alert, which will get reported # through the pipeline right away) registry.get_logger().info("Put alertId = %s with event recid = %d on the alert analyzer queue", self.alertId, event.get_rec_id()) registry.get_service(SERVICE_ALERT_ANALYZER_Q).put(bg_alert) return
def has_common_location(self, loc, alert_time, query, cursor):
    ''' Query alerts for the common location to indicate whether or not to send an alert. '''
    locParent, locParent_list = self.get_loc_parent(loc)
    # Query for the number of alerts for the same parent's location
    query = query.replace('LOCATION',loc.get_location())
    query = query.replace('PLOC',locParent)
    query = query.replace('ALERT_TIME', alert_time)
    query = query.replace('WINDOW', self.window_time)
    cursor.execute(query)
    registry.get_logger().info("Trying to match parent: %s from query: %s", locParent, query)
    # Send a common alert if ther are alerts with common location
    # Start counting from the current alert (include current alert)
    loc_type = loc.get_id()
    count = 1
    for record in cursor.fetchall():
        record_loc = record[0].strip()
        record_parent, _parents = self.get_loc_parent(Location(loc_type, record_loc))
        if record_parent != locParent:
            registry.get_logger().info("No match: %s with rec: %s", locParent, record_parent)
            continue
        registry.get_logger().info("Match: %s with rec: %s", locParent, record_parent)
        count += 1
        if count >= self.threshold:
            return True
    return False
def will_analyze_alert(self, alert):
    '''Whether or not to analyze an alert '''
    rec_id = alert.get_rec_id()
    # Will not analyze duplicate alert
    if alert.dup_alert_recid is not None:
        if alert.dup_alert_recid > 0:
            registry.get_logger().debug('Duplicate alert rec id %d is not analyzed', rec_id)
            return False
    elif self.alertMgr.is_duplicate(rec_id):
        # dup_alert_recid was never set, so fall back to asking the alert
        # manager whether this alert duplicates an earlier one.
        registry.get_logger().debug('Duplicate alert id %d is not analyzed.', rec_id)
        return False
    # Will not analyze alert without hardware location (C: Compute; I: I/O)
    loc_type = alert.event_loc.get_id()
    if loc_type not in ('C', 'I'):
        registry.get_logger().debug('Alert id %d with location type %s is not analyzed.', rec_id, loc_type)
        return False
    # Will analyze all other alerts
    return True
def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
    '''The constructor. '''
    EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
    self.severity = "W"
    self.recommendation = "Diagnose the problem that caused the threshold to be reached or exceeded. "
    self.alert_id = 'THRESH01'
    # Get the list of ras events that have threshold counts
    self.msgIDs, self.msgidService, self.msgidCount, self.msgidPeriod = get_eventList( )
    for msgID in self.msgIDs:
        registry.get_logger().debug('msgId = ' + msgID)
    # define query for count with no period specified
    # NOTE(review): the LOC and SN tokens are placeholders -- presumably
    # substituted with comparison clauses before execution; confirm against
    # the code that runs these queries.
    eventTable = self.appendSchema('tbgqeventlog')
    self.count_query = "select count(*) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
    self.count_query2 = "select sum(bigint(count)) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ?"
    # define query for count exceeded with period specified
    # (MYTIME and PERIOD are likewise substituted at query-build time)
    self.period_query = "select count(*) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ? and event_time > (timestamp('MYTIME') - PERIOD)"
    self.period_query2 = "select sum(bigint(count)) from " + eventTable + " where msg_id = ? and location LOC and serialnumber SN and event_time <= ? and event_time > (timestamp('MYTIME') - PERIOD)"
    # the number of consecutive periods that the threshold has to exceed
    self.msgidConsecutivePeriods = dict()
    self.msgidConsecutivePeriods['0008002F'] = 7
    self.msgidConsecutivePeriods['00080030'] = 3
    return
def analyze_event(self, event):
    '''Analyze a RAS event and determine whether the BQL threshold of errors has been reached or exceeded. '''
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))
    # Source location; the first 3 characters are dropped -- presumably a
    # fixed prefix such as the location type, confirm against event format.
    location = str(event.get_src_loc())
    location = location[3:].strip()
    severity = event.raw_data['severity'].strip()
    serialnumber = event.raw_data['serialnumber']
    ecid = event.raw_data['ecid']
    event_time = event.get_time_logged()
    block = event.raw_data['block'].strip()
    jobid = event.raw_data['jobid']
    msgText = event.raw_data['message'].strip()
    rawdata = event.raw_data['rawdata'].strip()
    count = event.get_event_cnt()
    # Set threshold value
    threshold = self.msgidCount[msg_id]
    tmsg = "BQL error threshold of " + str( threshold) + " has been reached or exceeded, total count is "
    # check if thresholds have been reached or exceeded for events
    xmsg = ""
    xmsg = " in a period of " + self.msgidPeriod[msg_id].strip()
    query = self.period_query.replace('PERIOD', self.msgidPeriod[msg_id].strip())
    query = query.replace('MYTIME', str(event_time))
    # search for events associated with this location's midplane or I/O board
    # (first 6 characters of the location plus a SQL wildcard)
    qryloc = location.strip()[0:6] + '%'
    registry.get_logger().debug(query + " xmsgId=" + msg_id + " loc=" + qryloc + " ev_time=" + str(event_time))
    msgCount = 0
    # Retry the query up to 5 times, re-establishing the DB connection on
    # each failure; give up with an exception on the final attempt.
    for x in range(5):
        try:
            self.cursor.execute(query, qryloc)
            row = self.cursor.fetchone()
            msgCount = row[0]
            break
        except Exception, e:
            registry.get_logger().debug(e)
            if x < 4:
                dbi = registry.get_service(SERVICE_DB_INTERFACE)
                self.dbConn = dbi.get_connection()
                self.cursor = self.dbConn.cursor()
            else:
                raise Exception( 'Error: bgq_BqlEventAnalyzer could not connect to the database' )
    # NOTE(review): the method appears truncated at this point in the file;
    # the threshold comparison / sparing logic that uses msgCount is not
    # contiguous with this definition -- confirm against source control.
def get_threshold(self): ''' Get the window time config info for alert analyzer ''' # Get the threshold for a hardware location try: threshold = self.cfg.get(BGQ_TEAL_ALERT_ANALYZER, BGQ_TEAL_ALERT_ANALYZER_THRESHOLD) if int(threshold) <= 0: registry.get_logger().error('The value %s specified in the threshold is not valid. The value must be greater than zero.', threshold) raise registry.get_logger().debug('threshold = %s', threshold) except Exception, e: threshold = str(BGQ_DEFAULT_THRESHOLD) registry.get_logger().warn('Configuring the threshold to default %s due to exception: %s', threshold, e)
def get_window_time(self): ''' Get the window time config info for alert analyzer ''' # Get the window time (in seconds) windowTime = 1 try: windowTime = self.cfg.get(BGQ_TEAL_ALERT_ANALYZER, BGQ_TEAL_ALERT_ANALYZER_WINDOW_TIME) if int(windowTime) <= 0: registry.get_logger().error('The value %s specified in the window time is not valid. The value must be greater than zero.', windowTime) raise window_time = windowTime + ' SECONDS' registry.get_logger().debug('windowTime = %s', window_time) except Exception, e: registry.get_logger().warn('Configuring window time to default %s seconds due to exception: %s', window_time, e) window_time = str(BGQ_DEFAULT_WINDOW_TIME) + ' SECONDS'
def analyze_event(self, event):
    '''Analyze a RAS event and send an alert. '''
    msg_id = event.get_event_id()
    rec_id = event.get_rec_id()
    registry.get_logger().info("Analyzing msgid = " + msg_id + " recid = " + str(rec_id))
    # Exclude event logged from DIAG run
    if event.raw_data['diags'] == 'T':
        registry.get_logger().debug('RAS Event generated by Diagnostics, skip creating an alert')
        return
    # Fill in alert with appropriate data
    # NOTE(review): "The hardware been put" is missing "has"; left untouched
    # because downstream consumers may match on this exact text -- confirm
    # before correcting.
    reason = "The hardware been put in an error state. \nRAS event details:" \
             " message id = " + msg_id + \
             ", recid = " + str(rec_id) + \
             ", timestamp = " + str(event.get_time_occurred()) + \
             ", serial number = " + str(event.raw_data['serialnumber']) + \
             ", ecid = " + self.ecidString(event.raw_data['ecid']) + \
             ", jobid = " + str(event.raw_data['jobid']) + \
             ", block = " + str(event.raw_data['block'])
    raw_data = "RAS Message: " + event.raw_data['message']
    # Per-message service action text was built by get_eventList().
    recommendation = self.recommendation + " " + self.msgidService[msg_id]
    alert_dict = {alert.ALERT_ATTR_SEVERITY:self.severity,
                  alert.ALERT_ATTR_URGENCY:'I',
                  alert.ALERT_ATTR_EVENT_LOC_OBJECT:event.get_src_loc(),
                  alert.ALERT_ATTR_RECOMMENDATION:recommendation,
                  alert.ALERT_ATTR_REASON:reason,
                  alert.ALERT_ATTR_RAW_DATA:raw_data,
                  alert.ALERT_ATTR_SRC_NAME:self.get_name(),
                  alert.ALERT_ATTR_CONDITION_EVENTS:set((event,))
                  }
    # Get the alert manager to create/allocate/commit the alert
    alertMgr = registry.get_service(registry.SERVICE_ALERT_MGR)
    alert_id = 'HWERR01'
    bg_alert = alertMgr.allocate(alert_id, in_dict=alert_dict)
    alertMgr.commit(bg_alert, disable_dup=False)
    # Now the alert is created and can be reported through the pipeline
    registry.get_logger().info("Sending alert for msgid = " + msg_id + " recid = " + str(rec_id))
    self.send_alert(bg_alert)
    return
def run(self):
    ''' Runs the monitor thread waiting for new events to occur '''
    # Catch up on anything logged while we were down.
    self._get_last_processed_event()
    self._query_and_log_event(">", self.last_processed_event)
    while self.running:
        # Start real-time client
        log = registry.get_logger()
        log.info("starting real-time monitor")
        log.debug("RAS event ids to filter: " + self.filter)
        self.first_realtime_event = True
        rt_args = (self.filter, self.rt_callback, self.rt_term_callback)
        self.t = Thread(None, pyrealtime.ras_init, 'pyrealtime', rt_args)
        self.t.start()
        # If the real-time client thread terminates for whatever reason,
        # fall back to the periodic monitor before retrying.
        self.t.join()
        registry.get_logger().info("real_time server is ended, starting periodic monitor")
        self._periodic_monitor()
    return
class bgqBqlEventAnalyzer(bgqBaseAnalyzer):
    '''The BqlEventAnalyzer class determines what action to take for BQL RAS events of interest. '''

    def __init__(self, name, inEventQueue, outQueue, config_dict=None, number=0, checkpoint=None):
        '''The constructor. '''
        EventAnalyzer.__init__(self, name, inEventQueue, outQueue, config_dict, number, checkpoint)
        self.severity = "W"
        self.recommendation = '''Schedule service to isolate the BQL issue. Possible causes are environmental, cable, or a board. Multiple BQLs reporting issues on multiple boards in the same midplane may be a side effect of an environmental issue like neighboring boards being powered off unexpectedly. Issues with a single cable may be due to poor seating of the cable into the connector or debris on the cable. The cable and board can be cleaned with an optics cleaning tool. Low voltage on all or most lanes on a single optical module may be an issue with the board at this location (the receiver) or its neighboring board (the transmitter) at the other end of the cable.'''
        self.alert_id = 'BQL01'
        # Get the exclude list of message IDs
        cfg = registry.get_service(registry.SERVICE_CONFIGURATION)
        excludeList = ''
        try:
            # NOTE(review): 'excludeMsgIDs' is passed as the option name --
            # presumably a module constant; also excludeList is never used
            # after this point. Confirm both.
            excludeList = cfg.get(BGQ_TEAL_BQL_ANALYZER, excludeMsgIDs)
            registry.get_logger().debug('Exclude List = ' + excludeList)
        except Exception, e:
            registry.get_logger().debug(e)
        # ras events that have BQL_SPARE detail data
        self.msgIDs = get_eventList()
        for msgid in self.msgIDs:
            registry.get_logger().debug('msgId = ' + msgid)
        # set the threshold
        self.msgidCount = dict()
        self.msgidCount['00090200'] = 2
        self.msgidCount['00090201'] = 1
        self.msgidCount['00090202'] = 1
        self.msgidCount['00090210'] = 4
        self.msgidCount['00090211'] = 4
        # set the window = 2 X the period
        self.msgidPeriod = dict()
        self.msgidPeriod['00090200'] = '11 seconds'
        self.msgidPeriod['00090201'] = '11 seconds'
        self.msgidPeriod['00090202'] = '11 seconds'
        self.msgidPeriod['00090210'] = '11 seconds'
        self.msgidPeriod['00090211'] = '11 seconds'
        # BQL related ras events
        self.bqlIDs = list()
        # define query for count of recent events at this location
        # within a window (plus and minus the event time)
        # parameter 1 = location
        # parameter 2 = event time
        eventTable = self.appendSchema('tbgqeventlog')
        self.period_query = "select count(*) from " + eventTable + " where location like ? and category='BQL' and event_time <= (timestamp('MYTIME') + PERIOD) and event_time > (timestamp('MYTIME') - PERIOD)"
        # define query for count of open alerts at this location
        # within a day from the event time
        # parameter 1 = location
        # parameter 2 = event time
        alertTable = self.appendSchema('x_tealalertlog')
        self.alert_period = '1 day'
        self.alert_query = "select count(*) from " + alertTable + " where \"alert_id\"='BQL01' and \"event_loc\"= ? and \"creation_time\" >= (timestamp('MYTIME') - PERIOD) and \"state\"=1"
        # database connection and cursor
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        self.dbConn = dbi.get_connection()
        self.cursor = self.dbConn.cursor()
        return
def handle_control_msg(self, control_msg):
    ''' Handle any control messages that have been sent. No special action required '''
    message = '...Control message received: {0}'.format(control_msg)
    registry.get_logger().debug(message)
def analyze_alert(self, alert):
    '''Analyze an alert '''
    # Decides, per alert id, whether to deliver the alert as-is, close it
    # as redundant, or replace it with a common-mode alert on the parent
    # location.
    alert_recId = alert.get_rec_id()
    alert_id = alert.get_incident_id()
    loc_type = alert.event_loc.get_id()
    location = alert.event_loc.get_location()
    #alert_msgId = alert.get_incident_id()
    registry.get_logger().info('Analyzing alert id %d loc_type: %s: %s', alert_recId, loc_type, location)
    # There should only be one condition event associated with the alert.
    events = alert.condition_events
    if len(events) == 0:
        registry.get_logger().error('No event associated with the alert recid %d', alert_recId)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    event = events.pop()
    if (alert_id == 'BQL01'):
        # No need to analyze BQL01 alerts, just pass it to the delivery queue
        registry.get_logger().info('Nothing to analyze for alert id %s ', alert_id)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # Get the location
    loc = Location(loc_type, location)
    locName = self.get_loc_name(loc)
    # No need to analyze alert with rack location
    alert_time = str(alert.get_time_occurred())
    if locName == 'rack':
        registry.get_logger().info('Nothing to analyze for alert recid %d with rack location', alert_recId)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # Find out if there are other alerts with the same block id (for ENDJOB01 and THRES01)
    dup_qry = ''
    if (alert_id == 'ENDJOB01' or alert_id == 'THRES01'):
        if event.raw_data['block'] is None:
            event_block = None
        else:
            event_block = event.raw_data['block'].strip()
        if event_block is None or event_block == BGQ_EVENT_NULL_BLOCK:
            # Found no prior alert with the same block id, pass current alert to the delivery queue
            registry.get_logger().info('No block id for alert id %d, no common alert generated for block: %s', alert_recId, event_block)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
            return
        same_block = False
        # Get db connection needed for query
        dbi = registry.get_service(SERVICE_DB_INTERFACE)
        dbConn = dbi.get_connection()
        cursor = dbConn.cursor()
        if (alert_id == 'ENDJOB01'):
            # For ENDJOB01, look for alert id HWERR01 or COMMON01 with the same block id
            same_block = self.has_matching_blockId(event_block, alert_time, cursor)
        else:
            # For THRES0101, look for alert id HWERR01 or COMMON01 or ENDJOB01 with the same block id
            same_block = self.has_matching_blockId(event_block, alert_time, cursor)
        if same_block:
            # Found prior alert with the same block id, close current alert
            registry.get_logger().info('Closing current alert recid %d due to prior alert with the same block id', alert_recId)
            registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
        else:
            # Found no prior alert with the same block id, pass current alert to the delivery queue
            registry.get_logger().info('No common block id found for alert id %d within the last %s', alert_recId, self.window_time)
            registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    elif (alert_id == 'BQL01'):
        # NOTE(review): unreachable -- BQL01 alerts already returned near
        # the top of this method; candidate for removal.
        # No need to analyze BQL01 alerts, just pass it to the delivery queue
        registry.get_logger().info('Nothing to analyze for alert id %s.', alert_id)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
        return
    # The following will handle the rest of the alert ids (HWERR01 or COMMON01).
    # Find out if there is common mode alert already exist for the same location or higher hierarchy
    loc_parent, loc_parent_list = self.get_loc_parent(loc)
    # Build an OR'd "event_loc like ..." clause over all parent locations.
    loc_qry = '('
    idx = 0
    for pLoc in loc_parent_list:
        if idx != 0:
            loc_qry += " or "
        loc_qry += " \"event_loc\" like '" + pLoc + "'"
        idx += 1
    # dup_qry2 matches parents only; dup_qry also matches this location.
    dup_qry2 = self.dup_query + loc_qry + ")"
    loc_qry += " or \"event_loc\" like '" + location + "')"
    dup_qry = self.dup_query + loc_qry
    dbi = registry.get_service(SERVICE_DB_INTERFACE)
    dbConn = dbi.get_connection()
    cursor = dbConn.cursor()
    dup = self.has_duplicate(alert_time, dup_qry, cursor)
    if dup:
        # Found prior alert with the same block id, close current alert
        registry.get_logger().info('Closing current alert recid %d due to prior alert with same common location', alert_recId)
        registry.get_service(SERVICE_ALERT_MGR).close(alert_recId)
        return
    # Look for a common hardware problem if there are multiple alerts for different location
    # on the same hardware.
    sendAlert = self.has_common_location(loc, alert_time, self.query, cursor)
    if sendAlert:
        # Send commmon alert
        self.send_common_alert(loc, alert_recId, event, alert_time, dup_qry2, cursor)
    else:
        # Pass current alert to the delivery queue
        registry.get_logger().info('No common location for %s found for alert id: %d within the last %s ', location, alert_recId, self.window_time)
        registry.get_service(SERVICE_ALERT_DELIVERY_Q).put(alert)
    return
break except Exception, e: registry.get_logger().debug(e) if x < 4: dbi = registry.get_service(SERVICE_DB_INTERFACE) self.dbConn = dbi.get_connection() self.cursor = self.dbConn.cursor() else: raise Exception( 'Error: bgq_BqlEventAnalyzer could not connect to the database' ) if msgCount < threshold: if msg_id == '00090200': registry.get_logger().info( "The optical lane will be spared since only " + str(msgCount) + " BQL event(s) were logged during the window.") # perform the BQL sparing action self.perform_sparing(rec_id, location, rawdata) if msg_id == '00090210' or msg_id == '00090211': registry.get_logger().info( "The failing optical lane is spared automatically by the control system for message id " + msg_id + ". No administrator action is required. Retry booting the block." ) return aquery = self.alert_query.replace('PERIOD', self.alert_period) aquery = aquery.replace('MYTIME', str(event_time)) registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" + location.strip() + " ev_time=" +
self.cursor.execute(query, qryloc) row = self.cursor.fetchone() msgCount = row[0] break except Exception, e: registry.get_logger().debug(e) if x < 4: dbi = registry.get_service(SERVICE_DB_INTERFACE) self.dbConn = dbi.get_connection() self.cursor = self.dbConn.cursor() else: raise Exception('Error: bgq_BqlEventAnalyzer could not connect to the database') if msgCount < threshold: if msg_id == '00090200': registry.get_logger().info("The optical lane will be spared since only " + str(msgCount) + " BQL event(s) were logged during the window.") # perform the BQL sparing action self.perform_sparing(rec_id, location, rawdata) return aquery = self.alert_query.replace('PERIOD',self.alert_period) aquery = aquery.replace('MYTIME', str(event_time)) registry.get_logger().debug(aquery + " xmsgId=" + msg_id + " loc=" + location.strip() + " ev_time=" + str(event_time)) msgCount = 0 for x in range(5): try: self.cursor.execute(aquery,location.strip()) row = self.cursor.fetchone() msgCount = row[0] break