def port_flap(host, yang_message, error, tag, current_case=None):
    '''
    Workflow for fixing port flapping, similarly to the ifdown function.

    Cycles (shutdown/no-shutdown) the flapping interface and verifies the
    result with a ping to the interface neighbor. Progress is tracked in a
    psql case and reported to Slack.

    :param host: the host from which the event originated.
    :param yang_message: the yang data.
    :param error: the events error.
    :param tag: the events tag.
    :param current_case: the current oats case id.
    :return: dict with error, tag, comment, changes and success (bool).
    '''
    conf = 'No changes'
    success = False
    interface = oatsdbhelpers.get_interface(error, yang_message)
    comment = 'Port Flapping on ' + host + ' detected. '
    if current_case is None or current_case == 'None':
        current_case = oatspsql.create_case(
            error, host, solution='Case created in salt: `tshoot.port_flap`.')
    interface_neighbor = oatsnb.get_interface_neighbor(host, interface,
                                                       case=current_case)
    # NOTE(review): the neighbor connectivity pre-check (see ifdown) is
    # disabled here, so the neighbor's reachability is never verified before
    # cycling the interface.
    #neighbors = oatsnb.get_neighbors(interface_neighbor, case=current_case)
    #device_up = oatssalthelpers.check_device_connectivity(neighbors, interface_neighbor, case=current_case)

    # Cycle the flapping interface, then verify with a ping to the neighbor.
    oatssalthelpers.if_shutdown(host, interface, case=current_case)
    conf = oatssalthelpers.if_noshutdown(host, interface, case=current_case)
    success = oatssalthelpers.ping(host, interface_neighbor,
                                   check_connectivity=True, case=current_case)
    if success:
        comment += ('Resolved Port Flapping on Interface`' + interface + '`.')
        oatssalthelpers.post_slack(comment, case=current_case)
        oatspsql.close_case(current_case)
    else:
        # Fix: the original failure path also executed a leftover
        # "Device ... is unreachable" update + Slack post copied from ifdown,
        # although the connectivity check above is commented out — removed.
        oatspsql.update_case(
            current_case,
            solution='Port flapping could not get resolved. Technician needed.',
            status=oatspsql.Status.ONHOLD.value)
        comment = ('Could not fix port flapping status of `' + interface +
                   '` on host ' + host + '.')
        oatssalthelpers.post_slack(comment, case=current_case)
    return {
        'error': error,
        'tag': tag,
        'comment': comment,
        'changes': conf,
        'success': success
    }
def test_update_case():
    '''A freshly created case can be updated and then deleted.'''
    case_id = oatspsql.create_case(error='test', host='test',
                                   description='Test', status='new')
    update_result = oatspsql.update_case(case_id, solution='New stuff')
    delete_result = oatspsql.delete_case(case_id)
    assert case_id is not None
    assert update_result is not None
    assert delete_result is True
# Fix: the lock must be shared across calls. The original created a fresh
# threading.Lock() inside every call, so concurrent events never actually
# contended and the "first thread initializes / later threads increment"
# protocol was unsynchronized.
_COMPRESS_LOCK = threading.Lock()


def compress(data, host, timestamp, severity, error, sensor_type, event_name,
             correlate_for=10, use_oats_case=False):
    '''
    Takes events of the same kind and compresses them. Once the first
    event reaches this function it will stop the propagation of the same kind
    of event for the given amount of time. Once the time is passed it will
    send the first event to the salt master.

    :param data: the data of the event.
    :param host: the host from which the event originated.
    :param timestamp: the events timestamp.
    :param severity: the events severity.
    :param error: the events error eg. OSPF_NEIGHBOR_DOWN.
    :param sensor_type: the sensor which detected the event eg. syslog.
    :param event_name: the events event name.
    :param correlate_for: the amount of time to compress for.
    :param use_oats_case: if set will generate an oats case in psql.
    :return: None
    '''
    oatsinflux.write_event(host, timestamp, sensor_type, event_name, severity,
                           data)
    cache_id = 'compress' + event_name
    current_case = None
    # `with` guarantees the lock is released even if cache access raises
    # (the original leaked the lock on an exception between acquire/release).
    with _COMPRESS_LOCK:
        if cache is None or cache_id not in cache or error not in cache[cache_id]:
            # first thread initializes and populates dict
            logger.debug('Starting compression of [{0}] events...'
                         .format(event_name))
            __init_cache(error, cache_id, correlate_for)
        else:
            # later threads increment counter and stop propagation
            cache[cache_id][error]['counter'] += 1
            return
    if use_oats_case:
        current_case = __create_db_case(error, host, 'compress')
        oatspsql.update_case(
            current_case,
            solution='Waiting for {0} seconds to compress {1} events.'
            .format(correlate_for, event_name))
    # compress events
    time.sleep(correlate_for)
    logger.debug('Compression finished, amount of compressed [{0}] events: {1}.'
                 .format(event_name, cache[cache_id][error]['counter']))
    if use_oats_case:
        __update_db_case(current_case, cache[cache_id][error]['counter'],
                         event_name)
    EventProcessor.process_event(data=data, host=host, timestamp=timestamp,
                                 sensor_type=sensor_type,
                                 event_name=event_name, severity=severity,
                                 case=current_case, influx_write=False)
:param case: ID to update the case information in the psql database :return: a list of hostnames ''' # custom field on ip address to poll ospf area nb = connect() host = str(host) ospf_nb = [] logger.debug('Trying to get ospf_neighbors for host {0}'.format(host)) try: neighborip = nb.ipam.ip_addresses.filter(device=host) for nbip in neighborip: if nbip.custom_fields["OSPF_area"] is not None: ospf_nb.append(nbip.custom_fields["OSPF_area"]) if case: sol = 'Got OSPF neighbors of ' + host oatspsql.update_case(case_id=case, solution=sol) except Exception as e: logger.exception( 'Exception in oatsnb.get_ospf_neighbors for host {0}'.format(host)) logger.debug('Got ospf neighbors for host {0}'.format(host)) return ospf_nb def get_vrf_ip(host): ''' Function to get a custom field on the device in netbox which specifies the management ip address :param host: hostname :return: ip address of the management ip address ''' # custom field on device to poll salt master nb = connect()
def out_discards_exceeded(data, host, timestamp, current_case):
    '''
    Function that loads a policy onto a device affected by a high amount
    of discarded packets. The policy throttles the traffic from one IP to
    another with a certain port number. The source-, destination-IP and port
    number are gathered by evaluating netflow data. After a certain amount of
    time the policy is removed again.

    :param data: contains the affected interface and the value of discarded packets.
    :param host: the affected host.
    :param timestamp: the timestamp of the event.
    :param current_case: the current oats case-id
    :return: error, comment, changes, slack-post-status(bool), success(bool)
    '''
    if current_case is None or current_case == 'None':
        current_case = oatspsql.create_case(
            "OUT_DISCARDS_EXCEEDED", host,
            solution='Case created in salt: `tshoot.out_discards_exceeded`.')
    src_flow = None
    # Fix: initialize so the return dict's bool(dst_flow_port) does not raise
    # UnboundLocalError when no flow is found before the timeout.
    dst_flow_port = None
    # time out the polling loop after 60 seconds (comment said 20s, code is 60)
    timeout = time.time() + 60
    comment = ''
    while src_flow is None:
        flows = oatsinflux.get_type_data('netflow', timestamp,
                                         'netflow/*/data', 30, host=host)
        src_flow = oatssalthelpers.get_src_flow(flows, 15000)
        time.sleep(1)
        if time.time() > timeout:
            break
    if src_flow is not None:
        # netflow field ids: 8 = src IP, 12 = dst IP, 11 = dst port
        dst_flow_port = src_flow['11']
        interface = data['name']
        src_ip_address = src_flow['8']
        dst_ip_address = src_flow['12']
        oatspsql.update_case(
            current_case,
            solution=
            'Found responsible flow: src_ip = `{0}`, dst_ip = `{1}`, port_number = `{2}`'
            .format(src_ip_address, dst_ip_address, dst_flow_port))
        minion = oatsnb.get_hostname(host)
        oatssalthelpers.apply_policy(minion, 8000, interface, src_ip_address,
                                     dst_ip_address, dst_flow_port)
        oatssalthelpers.remove_policy(minion, interface, src_ip_address,
                                      dst_ip_address, dst_flow_port)
        comment = "Discarded packets on host {0} on egress interface `{1}` exceeded threshhold. " \
                  "Destination port of traffic: `{2}`.\n".format(host, data['name'], dst_flow_port)
        comment += "Applied traffic throttlinc policy for 120 seconds.\n"
    else:
        comment += 'Could not determine source of traffic, possible DDoS attack detected' \
                   ' because traffic source port is `port 0`.'
    slack_status = oatssalthelpers.post_slack(comment, case=current_case)
    ret = {
        'error': 'OUT_DISCARDS',
        'comment': comment,
        'changes': 'conf',
        'slack-post-status:': slack_status,
        'success': bool(dst_flow_port)
    }
    return ret
def ifdown(host, yang_message, error, tag, current_case=None):
    '''
    Function that gathers data in the network and executes a workflow
    according to the data. Is triggered by the salt system once an
    INTERFACE_DOWN event arrives in the salt event bus. Will try to fix
    the error or send a notification if it is unable to do so.

    :param host: the host that started this workflow.
    :param yang_message: the yang data.
    :param error: the events error.
    :param tag: the events tag.
    :param current_case: the current oats case id.
    :return: error, tag, a comment, configuration changes, success (bool).
    '''
    conf = 'No changes'
    success = False
    interface = oatsdbhelpers.get_interface(error, yang_message)
    comment = 'Interface down status on host ' + host + ' detected. '
    if current_case is None or current_case == 'None':
        current_case = oatspsql.create_case(
            error, host, solution='Case created in salt: `tshoot.ifdown`.')
    interface_neighbor = oatsnb.get_interface_neighbor(host, interface,
                                                       case=current_case)
    neighbors = oatsnb.get_neighbors(interface_neighbor, case=current_case)
    device_up = oatssalthelpers.check_device_connectivity(
        neighbors, interface_neighbor, case=current_case)
    if device_up:
        # cycle affected interface
        oatssalthelpers.if_shutdown(host, interface, case=current_case)
        conf = oatssalthelpers.if_noshutdown(host, interface,
                                             case=current_case)
        # check if cycle was successful
        success = oatssalthelpers.ping(host, interface_neighbor,
                                       check_connectivity=True,
                                       case=current_case)
        if success:
            # Fix: dropped the redundant `success = True` re-assignment.
            comment += ('Config for Interface `' + interface +
                        '` automatically changed from down to up')
            # TODO: remove, only useful for debugging
            oatssalthelpers.post_slack(comment, case=current_case)
            oatspsql.close_case(current_case)
        else:
            # Fix: added the missing separator space between `error` and the
            # rest of the message, and around 'on host'.
            oatspsql.update_case(
                current_case,
                solution=error + ' could not get resolved. Technician needed.',
                status=oatspsql.Status.ONHOLD.value)
            comment = ('Could not fix down status of `' + interface +
                       '` on host ' + host + '.')
            oatssalthelpers.post_slack(comment, case=current_case)
    if not device_up:
        # TODO: powercycle, check power consumation
        success = False
        oatspsql.update_case(current_case,
                             solution='Device ' + interface_neighbor +
                             ' is unreachable. Technician needed.',
                             status=oatspsql.Status.ONHOLD.value)
        comment += ('Interface `' + interface + '` on host ' + host +
                    ' down. Neighbor ' + interface_neighbor + ' is down.')
        oatssalthelpers.post_slack(comment, case=current_case)
        comment += ' Could not restore connectivity - Slack Message sent.'
    return {
        'error': error,
        'tag': tag,
        'comment': comment,
        'changes': conf,
        'success': success
    }
case=current_case) n_of_neighbors = len( oatsnb.get_ospf_neighbors(interface_neighbor, case=current_case)) oatssalthelpers.ospf_shutdown(interface_neighbor, process_number, case=current_case) async_result = pool.apply_async( oatssalthelpers.wait_for_event, ('syslog/*/OSPF_NEIGHBOR_UP/ospf_nbrs_up', 120, current_case)) conf = oatssalthelpers.ospf_noshutdown(interface_neighbor, process_number, case=current_case) success = async_result.get() if success: oatspsql.update_case( current_case, 'Successfully restarted OSPF process on host {0}.'.format( interface_neighbor), oatspsql.Status.DONE.value) comment += ' OSPF process restarted successfully.' else: oatspsql.update_case( current_case, 'Unable to restart OSPF process on host {0}. Host might be offline.' '. Technician needed.'.format(interface_neighbor), oatspsql.Status.ONHOLD.value) slack_post = oatssalthelpers.post_slack(comment, case=current_case) ret = { 'error': error, 'tag': tag, 'comment': comment, 'changes': conf,
# Fix: shared module-level lock — the original created a new Lock per call,
# which synchronizes nothing between concurrent events.
_AGGREGATE_DISTINCT_LOCK = threading.Lock()


def aggregate_distinct(data, host, timestamp, severity, error, sensor_type,
                       event_name, distinct_events,
                       aggregation_event_name=None, correlate_for=None,
                       use_oats_case=False):
    '''
    Takes distinct events and aggregates them. The first event will start
    the aggregation for the given amount of time. Each time an additional
    event that is given by distinct_events reaches this function a counter
    will be incremented. Once the time passed, the counter is evaluated. If
    the counter is the same as the number stated in distinct_events an event
    with the event name "aggregation_event_name" is generated. Else an event
    with the event name "event_name" is generated.
    Note: the events do not have to be distinct, but for aggregation of
    identical events the use of aggregate_identical is suggested.

    :param data: the data of the event.
    :param host: the host from which the event originated.
    :param timestamp: the events timestamp.
    :param severity: the events severity.
    :param error: the events error eg. OSPF_NEIGHBOR_DOWN.
    :param sensor_type: the sensor which detected the event eg. syslog.
    :param event_name: the events event name.
    :param distinct_events: dict of the form { event_name: x_amount_of_events,
                            event_name2: y_amount_of_events } eg.
                            { 'syslog/*/INTERFACE_CHANGED/down': 2,
                            'syslog/*/INTERFACE_CHANGED/up': 2}
    :param aggregation_event_name: the event name to use if the aggregation
                                   is successful
    :param correlate_for: the amount of time to aggregate for.
    :param use_oats_case: if set will generate an oats case in psql.
    :return: None
    '''
    oatsinflux.write_event(host, timestamp, sensor_type, event_name, severity,
                           data)
    cache_id = 'aggregate_distinct' + error
    # Fix: the original guarded this with `if not 'event_names' in locals():`,
    # which is always true for a fresh local scope — dead code removed.
    event_names = []
    current_case = None
    with _AGGREGATE_DISTINCT_LOCK:
        if cache is None or cache_id not in cache or host + event_name not in cache[cache_id]:
            logger.debug('Starting aggregation of distinct events...')
            # first thread initializes and populates dict
            __init_cache(host + event_name, cache_id, correlate_for, host=host,
                         additional_events=distinct_events.keys())
            event_names.append(host + event_name)
        else:
            logger.debug('Additional (distinct) event detected, incrementing counter...')
            # later threads increment counter and stop propagation
            cache[cache_id][host + event_name]['counter'] += 1
            event_names.append(host + event_name)
            return
    if use_oats_case:
        current_case = __create_db_case(error, host, 'aggregate')
        oatspsql.update_case(
            current_case,
            solution='Waiting for {0} seconds to aggregate distinct events.'
            .format(correlate_for))
    # wait for additional events
    time.sleep(correlate_for)
    success = True
    for event in event_names:
        # Cache keys are host + event_name; strip the host prefix to look up
        # the required count. Fix: was event[3:], which silently assumed
        # every hostname is exactly 3 characters long.
        if not cache[cache_id][event]['counter'] >= distinct_events[event[len(host):]]:
            success = False
            break
    if success:
        if use_oats_case:
            oatspsql.update_case(
                current_case,
                solution='Aggregation successful: sending `{0}` event to salt master.'
                .format(aggregation_event_name))
        logger.debug('Aggregation successful.'
                     .format(aggregation_event_name))
        EventProcessor.process_event(data=data, host=host, timestamp=timestamp,
                                     sensor_type=sensor_type,
                                     event_name=aggregation_event_name,
                                     severity=severity, case=current_case,
                                     influx_write=False)
    else:
        if use_oats_case:
            oatspsql.update_case(
                current_case,
                solution='Aggregation not successful: sending `{0}` event to salt master.'
                .format(event_name))
        logger.debug('Aggregation not successful.'
                     .format(event_name))
        EventProcessor.process_event(data=data, host=host, timestamp=timestamp,
                                     sensor_type=sensor_type,
                                     event_name=event_name, severity=severity,
                                     case=current_case, influx_write=False)
def __update_db_case(current_case, counter, event_name):
    # Note in the psql case that the correlation window elapsed and the
    # event (with its final counter value) is being forwarded to salt.
    solution_text = ('Time passed: `{0}` event counter is {1}. Sending `{0}`'
                     ' event to salt master'.format(event_name, counter))
    oatspsql.update_case(current_case, solution=solution_text)
# Fix: shared module-level lock — the original created a new Lock inside each
# call, so concurrent events were never actually synchronized.
_AGGREGATE_IDENTICAL_LOCK = threading.Lock()


def aggregate_identical(data, host, timestamp, severity, error, sensor_type,
                        event_name, n_of_events=None,
                        alternative_event_name=None, correlate_for=None,
                        use_oats_case=False):
    '''
    Takes identical events and aggregates them. The first event will start
    the aggregation for the given amount of time. Each time an additional
    event of the same kind reaches this function a counter will be
    incremented. Once the time passed, the counter is evaluated. If the
    counter is the same as the number stated in n_of_events an event with the
    event name "event_name" is generated. Else an event with the event name
    "alternative_event_name" is generated.

    :param data: the data of the event.
    :param host: the host from which the event originated.
    :param timestamp: the events timestamp.
    :param severity: the events severity.
    :param error: the events error eg. OSPF_NEIGHBOR_DOWN.
    :param sensor_type: the sensor which detected the event eg. syslog.
    :param event_name: the event name that is used when aggregation is successful.
    :param n_of_events: the needed amount of events for the aggregation to be
                        successful.
    :param alternative_event_name: the alternative name if the aggregation is
                                   not successful.
    :param correlate_for: the amount of time to aggregate for.
    :param use_oats_case: if set will generate an oats case in psql.
    :return: None
    '''
    oatsinflux.write_event(host, timestamp, sensor_type, event_name, severity,
                           data)
    cache_id = 'aggregate' + event_name
    current_case = None
    # `with` also guarantees release if a cache access raises.
    with _AGGREGATE_IDENTICAL_LOCK:
        if cache is None or cache_id not in cache or error not in cache[cache_id]:
            # first thread initializes and populates dict
            logger.debug('Starting aggregation of [{0}] events'
                         .format(event_name))
            __init_cache(error, cache_id, correlate_for)
        else:
            logger.debug('Additional [{0}] event detected. Incrementing counter...'
                         .format(event_name))
            # later threads increment counter and stop propagation
            cache[cache_id][error]['counter'] += 1
            return
    if use_oats_case:
        current_case = __create_db_case(error, host, 'aggregate')
        oatspsql.update_case(
            current_case,
            solution='Waiting for {0} seconds to aggregate events.'
            ' Required amount of events: {1}'.format(correlate_for,
                                                     n_of_events))
    # wait for additional events
    time.sleep(correlate_for)
    logger.debug('Aggregation finished. Event counter for event [{0}] is: {1}.'
                 .format(event_name, cache[cache_id][error]['counter']))
    if cache[cache_id][error]['counter'] == n_of_events:
        if use_oats_case:
            __update_db_case(current_case, cache[cache_id][error]['counter'],
                             event_name)
        logger.debug('Aggregation successful.'
                     .format(event_name))
        EventProcessor.process_event(data=data, host=host, timestamp=timestamp,
                                     sensor_type=sensor_type,
                                     event_name=event_name, severity=severity,
                                     case=current_case, influx_write=False)
    else:
        if use_oats_case:
            __update_db_case(current_case, cache[cache_id][error]['counter'],
                             event_name)
        logger.debug('Aggregation not successful.'
                     .format(alternative_event_name))
        EventProcessor.process_event(data=data, host=host, timestamp=timestamp,
                                     sensor_type=sensor_type,
                                     event_name=alternative_event_name,
                                     severity=severity, case=current_case,
                                     influx_write=False)