def __init__(self, parent_pid):
    super(Aggregator, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.parent_pid = parent_pid
    self.daemon = True
    self.current_pid = getpid()
def __init__(self, parent_pid):
    """
    Initialize the SNAB_flux_load_test
    """
    super(SNAB_flux_load_test, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
def __init__(self, parent_pid):
    """
    Initialize Rolling
    """
    super(RollingThunder, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
def __init__(self, parent_pid):
    """
    Initialize RelatedMetrics
    """
    super(RelatedMetrics, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
def __init__(self, parent_pid):
    """
    Initialize Cloudbursts
    """
    super(Cloudbursts, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean

    """
    function_str = 'functions.thunder.thunder_sent_event'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    submitted = 0
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        submitted = redis_conn.sadd('thunder.events', str(event))
        if submitted:
            return True
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s' % (
                function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the event_file
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' % (
                event_file, e))
        submitted = False
    return submitted
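
# Illustrative usage sketch only (not part of the module): how a calling app
# might submit a thunder event. The event dict shape and the 'flux' caller
# here are assumptions for the example; real callers define their own
# payloads and a running Redis (or writable THUNDER_EVENTS_DIR) is required.
if __name__ == '__main__':
    example_event = {
        'level': 'alert',
        'event_type': 'worker.up',
        'app': 'flux',
        'timestamp': time(),
        'data': {'status': 'example only'},
    }
    sent = thunder_send_event('flux', example_event, log=True)
    print('thunder event submitted: %s' % str(sent))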
def __init__(self, parent_pid):
    """
    Initialize Luminosity

    Create the :obj:`redis_conn` a Redis client object
    Create the :obj:`correlations` list
    Create the :obj:`mysql_conn` MySQLConnection object
    Create the :obj:`memcache_client` a constructor that does not make a
    connection to memcached. The first call to a method on the object will do
    that.

    """
    super(Luminosity, self).__init__()
    # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
    # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when
    # the bytes types need to be decoded as utf-8 to str
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    # Added a single functions to deal with Redis connection and the
    # charset='utf-8', decode_responses=True arguments required in py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
    #                      Task #3032: Debug number of Python processes and memory use
    #                      Branch #3002: docker
    # Reduce amount of Manager instances that are used as each requires a
    # copy of entire memory to be copied into each subprocess so this
    # results in a python process per Manager instance, using as much
    # memory as the parent. OK on a server, not so much in a container.
    # Disabled all the Manager().list() below and replaced with Redis sets
    # self.correlations = Manager().list()
    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
    # self.metrics_checked_for_correlation = Manager().list()
    # self.runtimes = Manager().list()
    self.mysql_conn = mysql.connector.connect(**config)
    if settings.MEMCACHE_ENABLED:
        self.memcache_client = pymemcache_Client(
            (settings.MEMCACHED_SERVER_IP, settings.MEMCACHED_SERVER_PORT),
            connect_timeout=0.1, timeout=0.2)
    else:
        self.memcache_client = None
def __init__(self, queue, parent_pid):
    super(Worker, self).__init__()
    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.q = queue
    self.parent_pid = parent_pid
    self.daemon = True
def __init__(self, parent_pid):
    super(Worker, self).__init__()
    self.parent_pid = parent_pid
    self.daemon = True
    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191111 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    # Added a single functions to deal with Redis connection and the
    # charset='utf-8', decode_responses=True arguments required in py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
def is_anomalously_anomalous(metric_name, ensemble, datapoint):
    """
    This method runs a meta-analysis on the metric to determine whether the
    metric has a past history of triggering. TODO: weight intervals based on
    datapoint
    """
    # We want the datapoint to avoid triggering twice on the same data
    new_trigger = [time(), datapoint]

    # Get the old history
    # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Use get_redis_conn
    from skyline_functions import get_redis_conn
    redis_conn = get_redis_conn(skyline_app)
    raw_trigger_history = redis_conn.get('trigger_history.' + metric_name)
    if not raw_trigger_history:
        redis_conn.set('trigger_history.' + metric_name, packb([(time(), datapoint)]))
        return True

    trigger_history = unpackb(raw_trigger_history)

    # Are we (probably) triggering on the same data?
    if (new_trigger[1] == trigger_history[-1][1] and
            new_trigger[0] - trigger_history[-1][0] <= 300):
        return False

    # Update the history
    trigger_history.append(new_trigger)
    redis_conn.set('trigger_history.' + metric_name, packb(trigger_history))

    # Should we surface the anomaly?
    trigger_times = [x[0] for x in trigger_history]
    intervals = [
        trigger_times[i + 1] - trigger_times[i]
        for i, v in enumerate(trigger_times)
        if (i + 1) < len(trigger_times)
    ]

    series = pandas.Series(intervals)
    mean = series.mean()
    stdDev = series.std()

    return abs(intervals[-1] - mean) > 3 * stdDev
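
# Minimal standalone sketch of the meta-analysis above: the intervals between
# past trigger timestamps are compared against their own mean and standard
# deviation, and the latest interval only stands out if it deviates by more
# than 3 standard deviations. The sample trigger times are illustrative only.
import pandas


def _interval_is_unusual(trigger_times):
    intervals = [t2 - t1 for t1, t2 in zip(trigger_times, trigger_times[1:])]
    series = pandas.Series(intervals)
    return abs(intervals[-1] - series.mean()) > 3 * series.std()


# Twenty regular 60 second intervals followed by a 3000 second gap - the long
# final gap should be flagged as unusual.
# _interval_is_unusual([i * 60 for i in range(21)] + [1200 + 3000])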
def __init__(self, parent_pid, skip_mini):
    super(Roomba, self).__init__()
    # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
    # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    # Added a single functions to deal with Redis connection and the
    # charset='utf-8', decode_responses=True arguments required in py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.skip_mini = skip_mini
def get_metric_timeseries(current_skyline_app, metric_name, log=True):
    """
    Return a metric time series as a list e.g.
    [[ts, value], [ts, value], ..., [ts, value]]

    :param current_skyline_app: the app calling the function
    :param metric_name: the full Redis metric name
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type metric_name: str
    :type log: boolean
    :return: timeseries
    :rtype: list

    """
    function_str = 'functions.redis.get_metric_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    timeseries = []

    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to fetch time series for %s - %s' % (
                function_str, metric_name, e))

    if metric_name.startswith(FULL_NAMESPACE):
        metric_name = str(metric_name)
    else:
        metric_name = '%s%s' % (FULL_NAMESPACE, str(metric_name))

    raw_series = None
    try:
        raw_series = redis_conn.get(metric_name)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: failed to get %s from Redis - %s' % (
            metric_name, e))
        raw_series = None

    if not raw_series:
        return timeseries

    try:
        unpacker = Unpacker(use_list=False)
        unpacker.feed(raw_series)
        timeseries = list(unpacker)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: failed to unpack %s time series from Redis data - %s' % (
                metric_name, e))
        timeseries = []

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to get derivative_metrics - %s' % (
                function_str, e))

    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(redis_conn_decoded.smembers('aet.metrics_manager.derivative_metrics'))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis for smembers of derivative_metrics - %s' % (
                function_str, e))
        derivative_metrics = []

    if metric_name in derivative_metrics:
        if len(timeseries) > 3:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(current_skyline_app_logger)
                current_logger.error(
                    'error :: %s :: nonNegativeDerivative failed - %s' % (
                        function_str, e))

    return timeseries
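
# Illustrative usage sketch only: how a caller might fetch a time series.
# The 'webapp' caller and 'test.metric' name are assumptions for the example
# and a Redis instance populated by Horizon is required for it to return data.
if __name__ == '__main__':
    example_timeseries = get_metric_timeseries('webapp', 'test.metric', log=False)
    if example_timeseries:
        first_ts, first_value = example_timeseries[0]
        last_ts, last_value = example_timeseries[-1]
        print('%s data points from %s to %s' % (
            str(len(example_timeseries)), str(first_ts), str(last_ts)))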
def run(self):
    """
    Called when the process initializes.
    """
    logger.info('aggregator :: starting aggregator')

    # Determine a primary aggregator
    aggregator_pid = getpid()
    main_process_pid = 0
    try:
        main_process_pid = int(self.redis_conn_decoded.get('flux.main_process_pid'))
        if main_process_pid:
            logger.info('aggregator :: main_process_pid found in Redis key - %s' % str(main_process_pid))
    except:
        main_process_pid = 0
    if not main_process_pid:
        logger.error('error :: aggregator :: no main_process_pid known, exiting')
        sys.exit(1)

    primary_aggregator_key = 'flux.primary_aggregator_pid.%s' % str(main_process_pid)
    logger.info('aggregator :: starting primary_aggregator election using primary_aggregator_key: %s' % primary_aggregator_key)
    sleep_for = random.uniform(0.1, 1.5)
    logger.info('aggregator :: starting primary_aggregator election - sleeping for %s' % str(sleep_for))
    sleep(sleep_for)
    primary_aggregator_pid = 0
    try:
        primary_aggregator_pid = int(self.redis_conn_decoded.get(primary_aggregator_key))
        if primary_aggregator_pid:
            logger.info('aggregator :: primary_aggregator_pid found in Redis key - %s' % str(primary_aggregator_pid))
    except:
        primary_aggregator_pid = 0
    if not primary_aggregator_pid:
        try:
            self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid)
            primary_aggregator_pid = int(self.redis_conn_decoded.get(primary_aggregator_key))
            logger.info('aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid))
        except:
            primary_aggregator_pid = 0
    primary_aggregator = False
    if primary_aggregator_pid == aggregator_pid:
        primary_aggregator = True
    logger.info('aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s' % (
        str(primary_aggregator_pid), str(primary_aggregator)))

    last_flush = int(time()) - 59
    remove_from_flux_queue_redis_set = []

    # Populate API keys and tokens in memcache
    # python-2.x and python3.x handle while 1 and while True differently
    # while 1:
    running = True
    while running:

        # Make sure Redis is up
        redis_up = False
        while not redis_up:
            try:
                redis_up = self.redis_conn.ping()
            except:
                logger.error('aggregator :: cannot connect to redis at socket path %s' % (
                    settings.REDIS_SOCKET_PATH))
                sleep(2)
                try:
                    self.redis_conn = get_redis_conn(skyline_app)
                except Exception as e:
                    logger.error('error :: aggregator :: could not get_redis_conn - %s' % str(e))
                try:
                    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
                except Exception as e:
                    logger.error('error :: aggregator :: could not get_redis_conn_decoded - %s' % str(e))

        try:
            time_now = int(time())
            while (time_now - last_flush) <= 59:
                sleep(1)
                remove_from_flux_queue_redis_set = []
                time_now = int(time())

            primary_aggregator_pid = 0
            try:
                primary_aggregator_pid = int(self.redis_conn_decoded.get(primary_aggregator_key))
                if primary_aggregator_pid:
                    logger.info('aggregator :: primary_aggregator_pid found in Redis key - %s' % str(primary_aggregator_pid))
            except:
                primary_aggregator_pid = 0
            if not primary_aggregator_pid:
                try:
                    self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid)
                    primary_aggregator_pid = int(self.redis_conn_decoded.get(primary_aggregator_key))
                    logger.info('aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid))
                except:
                    primary_aggregator_pid = 0
            primary_aggregator = False
            if primary_aggregator_pid == aggregator_pid:
                primary_aggregator = True
            logger.info('aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s' % (
                str(primary_aggregator_pid), str(primary_aggregator)))

            flux_aggregator_queue = []
            if primary_aggregator:
                logger.info('aggregator :: checking for data to aggregate')
                try:
                    flux_aggregator_queue = self.redis_conn_decoded.smembers('flux.aggregator.queue')
                    logger.info('aggregator :: %s entries in flux.aggregator.queue to process' % str(len(flux_aggregator_queue)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not get the flux.aggregator.queue set from Redis')
            else:
                logger.info('aggregator :: not primary, in standby to take over should the primary_aggregator fail')

            flux_aggregator_queue_items = []
            all_metrics = []
            if flux_aggregator_queue:
                for flux_aggregator_queue_item_str in flux_aggregator_queue:
                    try:
                        flux_aggregator_queue_item = literal_eval(flux_aggregator_queue_item_str)
                        all_metrics.append(flux_aggregator_queue_item[0])
                        flux_aggregator_queue_items.append([flux_aggregator_queue_item, flux_aggregator_queue_item_str])
                        # self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item_str)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to evaluate item from flux.aggregator.queue Redis set')
            metrics = list(set(all_metrics))
            for metric in metrics:
                last_metric_flush = last_flush
                last_metric_flush_str = None
                try:
                    last_metric_flush_str = self.redis_conn_decoded.hget('flux.aggregate_metrics.last_flush', metric)
                    # Handle new metric without throwing an error if they do
                    # not have an entry in the hash
                    if last_metric_flush_str:
                        last_metric_flush = int(last_metric_flush_str)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to convert last_metric_flush_str value to an int from flux.aggregate_metrics.last_flush Redis hash for %s' % metric)
                if not last_metric_flush:
                    # Handle new metric without throwing an error if they do
                    # not have an entry in the hash
                    logger.info('aggregator :: probable new metric - no last_metric_flush found in flux.aggregate_metrics.last_flush Redis hash for %s using last_flush' % metric)
                    last_metric_flush = last_flush
                metric_aggregation_settings = {}
                try:
                    metric_aggregation_settings_str = self.redis_conn_decoded.hget('metrics_manager.flux.aggregate_namespaces.settings', metric)
                    # @modified 20210718
                    if metric_aggregation_settings_str:
                        metric_aggregation_settings = literal_eval(metric_aggregation_settings_str)
                    else:
                        metric_aggregation_settings = {}
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to determine aggregation_settings from metrics_manager.flux.aggregate_namespaces.settings Redis hash for %s' % metric)
                # @added 20210718
                # Handle newly added metrics that have not been added to
                # metrics_manager.flux.aggregate_namespaces.settings due to
                # the chicken or the egg problem
                if not metric_aggregation_settings:
                    logger.info('aggregator :: probable new metric - %s not found in metrics_manager.flux.aggregate_namespaces.settings Redis hash' % metric)
                    aggregate_namespaces = list(settings.FLUX_AGGREGATE_NAMESPACES.keys())
                    pattern_match, metric_matched_by = matched_or_regexed_in_list('flux', metric, aggregate_namespaces)
                    if pattern_match:
                        matched_namespace = metric_matched_by['matched_namespace']
                        metric_aggregation_settings = settings.FLUX_AGGREGATE_NAMESPACES[matched_namespace]
                        logger.info('aggregator :: new metric - %s determined metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES - %s' % (
                            metric, str(metric_aggregation_settings)))
                    else:
                        logger.error('error :: aggregator :: new metric - %s could not determine metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES' % (metric))
                interval = 60
                try:
                    interval = int(metric_aggregation_settings['interval'])
                except:
                    # logger.error(traceback.format_exc())
                    logger.error('error :: failed to get interval from metric_aggregation_settings for %s, setting to default 60' % metric)
                    interval = 60
                if (time_now - last_metric_flush) < interval:
                    continue
                metric_values = []
                for flux_aggregator_queue_item in flux_aggregator_queue_items:
                    if flux_aggregator_queue_item[0][0] != metric:
                        continue
                    # Discard any values older than the last metric flush
                    if int(flux_aggregator_queue_item[0][2]) > last_metric_flush:
                        metric_values.append(flux_aggregator_queue_item[0][1])
                    try:
                        self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item[1])
                        remove_from_flux_queue_redis_set.append(flux_aggregator_queue_item[1])
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to remove item from flux.aggregator.queue Redis set - %s' % str(flux_aggregator_queue_item[1]))
                if not metric_aggregation_settings:
                    logger.error('error :: no aggregation settings known for %s, discarding data' % metric)
                    continue
                if metric_values:
                    methods = []
                    try:
                        methods = metric_aggregation_settings['method']
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to determine aggregation methods from metric_aggregation_settings - %s' % str(metric_aggregation_settings))
                        methods = []
                    for method in methods:
                        try:
                            metric_namespace = metric
                            if metric_aggregation_settings['method_suffix']:
                                metric_namespace = '%s.%s' % (metric, method)
                            else:
                                # @added 20220126 - Feature #4400: flux - quota
                                # If method_suffix is not set but multiple
                                # methods are being used, method_suffix
                                # must be applied, otherwise the metric will
                                # have all the method values submitted to a
                                # single metric name.
                                if len(methods) > 1:
                                    metric_namespace = '%s.%s' % (metric, method)
                            aggregate_value = None
                            if method == 'avg':
                                if len(metric_values) > 1:
                                    aggregate_value = sum(metric_values) / len(metric_values)
                                else:
                                    aggregate_value = metric_values[0]
                            if method == 'sum':
                                aggregate_value = sum(metric_values)
                            if method == 'max':
                                aggregate_value = max(metric_values)
                            if method == 'min':
                                aggregate_value = min(metric_values)
                            if aggregate_value is not None:
                                try:
                                    backfill = False
                                    metric_data = [metric_namespace, aggregate_value, (time_now - interval), backfill]
                                    flux.httpMetricDataQueue.put(metric_data, block=False)
                                    logger.info('aggregator :: added %s' % (str(metric_data)))
                                    try:
                                        self.redis_conn.hset('flux.aggregate_metrics.last_flush', metric, time_now)
                                    except:
                                        logger.error(traceback.format_exc())
                                        logger.error('error :: aggregator :: failed to set last metric flush time in Redis hash flux.aggregate_metrics.last_flush')
                                except:
                                    logger.error(traceback.format_exc())
                                    logger.error('error :: aggregator :: failed to add aggregator data to flux.httpMetricDataQueue - %s' % str(metric_data))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: aggregator :: failed to aggregate metric_values by a method for %s' % str(metric))

            last_flush = time_now

            # flux_zero_fill_metrics = list(self.redis_conn_decoded.smembers('flux.zero_fill_metrics'))

            if FLUX_PERSIST_QUEUE:
                redis_set_size = 0
                try:
                    redis_set_size = self.redis_conn.scard('flux.queue')
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: aggregator :: failed to determine size of flux.queue Redis set')
                logger.info('aggregator :: flux.queue Redis set size of %s before removal of %s items' % (
                    str(redis_set_size), str(len(remove_from_flux_queue_redis_set))))
                if remove_from_flux_queue_redis_set:
                    try:
                        self.redis_conn.srem('flux.queue', *set(remove_from_flux_queue_redis_set))
                        remove_from_flux_queue_redis_set = []
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: aggregator :: failed to remove multiple items from flux.queue Redis set')
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: aggregator :: failed to determine size of flux.queue Redis set')
                    logger.info('aggregator :: flux.queue Redis set size of %s after the removal of items' % (
                        str(redis_set_size)))
                    remove_from_flux_queue_redis_set = []

            if primary_aggregator:
                try:
                    self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid)
                    primary_aggregator_pid = int(self.redis_conn_decoded.get(primary_aggregator_key))
                    logger.info('aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid))
                    logger.info('aggregator :: set Redis primary_aggregator_key key to self pid to primary_aggregator - %s' % str(primary_aggregator_pid))
                except Exception as e:
                    logger.error('error :: aggregator :: failed to set Redis primary_aggregator_key key to self pid - %s' % (str(e)))

        except NotImplementedError:
            pass
        except KeyboardInterrupt:
            logger.info('aggregator :: server has been issued a user signal to terminate - KeyboardInterrupt')
        except SystemExit:
            logger.info('aggregator :: server was interrupted - SystemExit')
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error('error :: aggregator :: %s' % (str(e)))
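
# Minimal standalone sketch of the aggregation step above: the values queued
# for a metric in one flush window are reduced with the configured methods
# (avg, sum, max, min). The sample values are illustrative only.
def _aggregate(values, method):
    if method == 'avg':
        return sum(values) / len(values) if len(values) > 1 else values[0]
    if method == 'sum':
        return sum(values)
    if method == 'max':
        return max(values)
    if method == 'min':
        return min(values)
    return None


# e.g. values collected for one metric in a single 60 second window:
# _aggregate([1.0, 3.0, 2.0], 'avg') -> 2.0, 'sum' -> 6.0, 'max' -> 3.0, 'min' -> 1.0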
def run(self):
    """
    Called when process initializes.
    """
    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    if os.path.isfile(skyline_app_logwait):
        try:
            os_remove(skyline_app_logwait)
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
            pass

    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('starting %s run' % skyline_app)
    if os.path.isfile(skyline_app_loglock):
        logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os_remove(skyline_app_loglock)
            logger.info('log lock file removed')
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('bin/%s.d log management done' % skyline_app)

    logger.info('%s :: started roomba' % skyline_app)

    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except:
            logger.error('%s :: roomba can\'t connect to redis at socket path %s' % (
                skyline_app, settings.REDIS_SOCKET_PATH))
            sleep(10)
            # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
            # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
            #                      Branch #3262: py3
            if settings.REDIS_PASSWORD:
                self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
            else:
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
            #                   Branch #3262: py3
            self.redis_conn = get_redis_conn(skyline_app)
            self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
            continue

        # Spawn processes
        pids = []
        for i in range(1, settings.ROOMBA_PROCESSES + 1):
            if not self.skip_mini:
                logger.info('%s :: starting vacuum process on mini namespace' % skyline_app)
                p = Process(target=self.vacuum, args=(i, settings.MINI_NAMESPACE, settings.MINI_DURATION + settings.ROOMBA_GRACE_TIME))
                pids.append(p)
                p.start()

            logger.info('%s :: starting vacuum process' % skyline_app)
            p = Process(target=self.vacuum, args=(i, settings.FULL_NAMESPACE, settings.FULL_DURATION + settings.ROOMBA_GRACE_TIME))
            pids.append(p)
            p.start()

        # Send wait signal to zombie processes
        # for p in pids:
        #     p.join()
        # deroomba - kill any lingering vacuum processes
        # Changed to manage Roomba processes as edge cases related to I/O
        # wait have been experienced that resulted in Roomba stalling so a
        # ROOMBA_TIMEOUT setting was added and here we use the pattern
        # described by http://stackoverflow.com/users/2073595/dano at
        # http://stackoverflow.com/a/26064238 to monitor and kill any
        # stalled processes rather than using p.join(TIMEOUT) - 20160505
        # @earthgecko ref 1342
        logger.info('%s :: allowing vacuum process/es %s seconds to run' % (
            skyline_app, str(settings.ROOMBA_TIMEOUT)))
        start = time()
        while time() - start <= settings.ROOMBA_TIMEOUT:
            if any(p.is_alive() for p in pids):
                # Just to avoid hogging the CPU
                sleep(.1)
            else:
                # All the processes are done, break now.
                time_to_run = time() - start
                logger.info('%s :: vacuum processes completed in %.2f' % (
                    skyline_app, time_to_run))
                break
        else:
            # We only enter this if we didn't 'break' above.
            logger.info('%s :: timed out, killing all Roomba processes' % (skyline_app))
            for p in pids:
                p.terminate()
                p.join()

        # sleeping in the main process is more CPU efficient than sleeping
        # in the vacuum def, also roomba is quite CPU intensive so we only
        # want to run roomba once every minute
        process_runtime = time() - now
        roomba_optimum_run_duration = 60
        if process_runtime < roomba_optimum_run_duration:
            sleep_for = (roomba_optimum_run_duration - process_runtime)
            logger.info('%s :: sleeping %.2f due to low run time' % (
                skyline_app, sleep_for))
            sleep(sleep_for)
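
# Standalone sketch of the multiprocessing timeout pattern used above:
# monitor child processes with is_alive() and terminate any that are still
# running when the timeout expires, rather than relying on p.join(TIMEOUT).
# The sleep-based worker and the 5 second timeout are illustrative only.
from multiprocessing import Process
from time import sleep, time as _time


def _example_worker(seconds):
    sleep(seconds)


if __name__ == '__main__':
    procs = [Process(target=_example_worker, args=(2,)) for _ in range(2)]
    for p in procs:
        p.start()
    timeout = 5
    start = _time()
    while _time() - start <= timeout:
        if any(p.is_alive() for p in procs):
            # Just to avoid hogging the CPU
            sleep(.1)
        else:
            break
    else:
        # Only reached if the while condition expired without a break
        for p in procs:
            p.terminate()
            p.join()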
# Consolidate flux logging
# logger = set_up_logging('listen')
logger = set_up_logging(None)

LOCAL_DEBUG = False

ALLOWED_CHARS = ['+', '-', '%', '.', '_', '/', '=']
for char in string.ascii_lowercase:
    ALLOWED_CHARS.append(char)
for char in string.ascii_uppercase:
    ALLOWED_CHARS.append(char)
for char in string.digits:
    ALLOWED_CHARS.append(char)

# @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
redis_conn = get_redis_conn('flux')


def validate_key(caller, apikey):

    # @added 20200818 - Feature #3694: flux - POST multiple metrics
    # Added metric_namespace_prefix which is declared via the FLUX_API_KEYS
    metric_namespace_prefix = None

    try:
        isAlNum = False
        isAlNum = apikey.isalnum()
        if isAlNum:
            keyLength = len(apikey)
            if keyLength == 32:
                # Check to determine if it is a valid API key
def run(self):
    """
    - Called when the process initializes.

    - Determine if Redis is up

    - Spawn a process_metric process to do analysis

    - Wait for the process to finish.

    - run_every 300 seconds
    """
    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('related_metrics :: starting')

    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error('error :: related_metrics cannot connect to redis at socket path %s - %s' % (
                settings.REDIS_SOCKET_PATH, e))
            sleep(10)
            try:
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
            except Exception as e:
                logger.info(traceback.format_exc())
                logger.error('error :: related_metrics cannot connect to get_redis_conn - %s' % e)
            continue

        # Report app up
        try:
            self.redis_conn.setex('luminosity.related_metrics', 120, now)
            logger.info('related_metrics :: set luminosity.related_metrics Redis key')
        except Exception as err:
            logger.error(traceback.format_exc())
            logger.error('error :: related_metrics :: could not update the Redis luminosity.related_metrics key - %s' % str(err))

        now_timestamp = int(time())

        # Spawn process
        pids = []
        spawned_pids = []
        pid_count = 0
        for i in range(1, 1 + 1):
            try:
                p = Process(target=self.find_related, args=(i,))
                pids.append(p)
                pid_count += 1
                logger.info('related_metrics starting %s of 1 find_related processes' % (str(pid_count)))
                p.start()
                spawned_pids.append(p.pid)
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error('error :: related_metrics :: failed to spawn find_related_metrics process - %s' % e)

        # Self monitor processes and terminate if any find_related
        # has run for longer than run_every - 10
        p_starts = time()
        while time() - p_starts <= (120 - 10):
            if any(p.is_alive() for p in pids):
                # Just to avoid hogging the CPU
                sleep(.1)
            else:
                # All the processes are done, break now.
                time_to_run = time() - p_starts
                logger.info('related_metrics :: find_related process completed in %.2f seconds' % (time_to_run))
                break
        else:
            # We only enter this if we didn't 'break' above.
            logger.info('related_metrics :: timed out, killing find_related process')
            for p in pids:
                logger.info('related_metrics :: killing find_related process')
                p.terminate()
                logger.info('related_metrics :: killed find_related process')

        for p in pids:
            if p.is_alive():
                try:
                    logger.info('related_metrics :: stopping find_related - %s' % (str(p.is_alive())))
                    p.terminate()
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error('error :: related_metrics :: failed to stop find_related - %s' % e)

        run_every = 60
        process_runtime = time() - now
        if process_runtime < run_every:
            sleep_for = (run_every - process_runtime)
            process_runtime_now = time() - now
            sleep_for = (run_every - process_runtime_now)
            logger.info('related_metrics :: sleeping for %.2f seconds due to low run time...' % sleep_for)
            sleep(sleep_for)
        try:
            del sleep_for
        except Exception as e:
            logger.error('error :: related_metrics :: failed to del sleep_for - %s' % e)
        try:
            del process_runtime
        except Exception as e:
            logger.error('error :: related_metrics :: failed to del process_runtime - %s' % e)
def get_redis_metrics_timeseries(current_skyline_app, metrics, log=False):
    """
    Return a dict of metrics timeseries as lists e.g.

    {
        'base_name.1': [[ts, value], [ts, value], ..., [ts, value]],
        'base_name.2': [[ts, value], [ts, value], ..., [ts, value]]
    }

    :param current_skyline_app: the app calling the function
    :param metrics: a list of base_names or full Redis metric names
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type metrics: list
    :type log: boolean
    :return: metrics_timeseries
    :rtype: dict

    """
    function_str = 'functions.redis.get_metrics_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    metrics_timeseries = {}

    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: %s :: %s :: get_redis_conn failed - %s' % (
            current_skyline_app, function_str, str(err)))
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
            current_skyline_app, function_str, str(err)))

    assigned_metrics = []
    base_names = []
    for metric in metrics:
        if metric.startswith(FULL_NAMESPACE):
            metric_name = str(metric)
            base_name = metric.replace(FULL_NAMESPACE, '')
        else:
            metric_name = '%s%s' % (FULL_NAMESPACE, str(metric))
            base_name = str(metric)
        assigned_metrics.append(metric_name)
        base_names.append(base_name)
        metrics_timeseries[base_name] = {}

    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(redis_conn_decoded.smembers('aet.metrics_manager.derivative_metrics'))
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to get derivative_metrics from Redis - %s' % (
            current_skyline_app, function_str, str(err)))

    raw_assigned = {}
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to get raw_assigned from Redis - %s' % (
            current_skyline_app, function_str, str(err)))

    if raw_assigned:
        for index, metric_name in enumerate(assigned_metrics):
            timeseries = []
            try:
                raw_series = raw_assigned[index]
                if raw_series:
                    unpacker = Unpacker(use_list=False)
                    unpacker.feed(raw_series)
                    timeseries = list(unpacker)
            except Exception as err:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(current_skyline_app_logger)
                current_logger.error('error :: %s :: %s :: failed to unpack %s timeseries - %s' % (
                    current_skyline_app, function_str, metric_name, str(err)))
                timeseries = []
            if timeseries:
                # Convert Redis ts floats to ints
                timeseries = [[int(ts), value] for ts, value in timeseries]
            if timeseries:
                # To ensure that there are no unordered timestamps in the time
                # series which are artefacts of the collector or carbon-relay, sort
                # all time series by timestamp before analysis.
                original_timeseries = timeseries
                if original_timeseries:
                    timeseries = sort_timeseries(original_timeseries)
                    del original_timeseries
            if metric_name in derivative_metrics:
                if len(timeseries) > 3:
                    try:
                        derivative_timeseries = nonNegativeDerivative(timeseries)
                        timeseries = derivative_timeseries
                    except Exception as err:
                        if not log:
                            current_skyline_app_logger = current_skyline_app + 'Log'
                            current_logger = logging.getLogger(current_skyline_app_logger)
                        current_logger.error(traceback.format_exc())
                        current_logger.error('error :: %s :: %s :: nonNegativeDerivative failed on timeseries for %s - %s' % (
                            current_skyline_app, function_str, metric_name, str(err)))
            if timeseries:
                base_name = base_names[index]
                metrics_timeseries[base_name] = timeseries

    return metrics_timeseries
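
# Illustrative usage sketch only: fetching multiple metrics in one call.
# The 'webapp' caller and the metric names are assumptions for the example
# and a populated Redis instance is required for it to return data.
if __name__ == '__main__':
    example_metrics = ['test.metric.1', 'test.metric.2']
    result = get_redis_metrics_timeseries('webapp', example_metrics, log=False)
    for example_base_name, example_timeseries in result.items():
        print('%s: %s data points' % (example_base_name, str(len(example_timeseries))))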
def run(self):
    """
    Called when the process initializes.
    """
    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    # In Vista the log management is handled by fetcher, the worker just
    # waits for the fetcher to do the log management
    now = int(time())
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = int(time())
        else:
            now = log_wait_for + 1

    logger.info('worker :: starting log management')
    if os.path.isfile(skyline_app_loglock):
        logger.error('error :: worker :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os_remove(skyline_app_loglock)
            logger.info('worker :: log lock file removed')
        except OSError:
            logger.error('error :: worker :: failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('worker :: bin/%s.d log management done' % skyline_app)

    logger.info('worker :: starting worker')

    try:
        VISTA_ENABLED = settings.VISTA_ENABLED
        logger.info('worker :: VISTA_ENABLED is set to %s' % str(VISTA_ENABLED))
    except:
        VISTA_ENABLED = False
        logger.info('worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False')

    last_sent_to_graphite = int(time())
    metrics_sent_to_flux = 0

    # python-2.x and python3.x handle while 1 and while True differently
    # while 1:
    running = True
    while running:

        # Make sure Redis is up
        redis_up = False
        while not redis_up:
            try:
                redis_up = self.redis_conn.ping()
                if LOCAL_DEBUG:
                    logger.info('worker :: redis is up')
            except:
                logger.error('worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH))
                sleep(2)
                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # if settings.REDIS_PASSWORD:
                #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                # else:
                #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        metrics_data = []
        redis_set = 'vista.fetcher.metrics.json'
        try:
            # Get a metric to validate from the Redis set
            # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
            #                      Branch #3262: py3
            # metrics_data = self.redis_conn.smembers(redis_set)
            metrics_data = self.redis_conn_decoded.smembers(redis_set)
            if LOCAL_DEBUG:
                logger.info('worker :: got redis set data - %s' % redis_set)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: worker :: retrieving Redis set %s data' % str(redis_set))

        if not metrics_data:
            if LOCAL_DEBUG:
                logger.info('worker :: no data from Redis set %s' % str(redis_set))
            sleep(5)

        for str_metric_data in metrics_data:
            delete_set_record = False
            remote_host_type = None
            try:
                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # Rather using get_redis_conn_decoded
                # if python_version == 3:
                #     str_metric_data = str_metric_data.decode('UTF-8')
                metric_data = literal_eval(str_metric_data)
                remote_host_type = str(metric_data[0]['remote_host_type'])
                if LOCAL_DEBUG:
                    logger.info('worker :: got data from Redis set for remote_host_type %s' % str(remote_host_type))
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: worker :: failed to determine remote_host_type from %s' % str(str_metric_data))
                delete_set_record = True
            if not delete_set_record:
                try:
                    remote_target = str(metric_data[0]['remote_target'])
                    if LOCAL_DEBUG:
                        logger.info('worker :: got data from Redis set for target %s' % str(remote_target))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine target from %s' % str(str_metric_data))
                    delete_set_record = True
            metric = None
            if not delete_set_record:
                try:
                    metric = str(metric_data[0]['metric'])
                    if LOCAL_DEBUG:
                        logger.info('worker :: got data from Redis set for metric %s' % str(metric))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine metric from %s' % str(str_metric_data))
                    delete_set_record = True
            namespace_prefix = ''
            if not delete_set_record:
                try:
                    namespace_prefix = str(metric_data[0]['namespace_prefix'])
                    namespace_prefix = '%s.' % namespace_prefix
                    if not namespace_prefix:
                        namespace_prefix = ''
                    if LOCAL_DEBUG:
                        logger.info('worker :: got data from Redis set for namespace_prefix %s' % str(namespace_prefix))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine namespace_prefix from %s' % str(str_metric_data))
                    delete_set_record = True
            have_data = False
            if not delete_set_record:
                last_flux_metric_data = None
                cache_key = 'flux.last.%s' % (metric)
                try:
                    if python_version == 3:
                        redis_last_flux_metric_data = self.redis_conn.get(cache_key).decode('UTF-8')
                    else:
                        redis_last_flux_metric_data = self.redis_conn.get(cache_key)
                    redis_last_flux_metric_data = redis_last_flux_metric_data
                    last_flux_metric_data = literal_eval(redis_last_flux_metric_data)
                    if LOCAL_DEBUG:
                        logger.info('worker :: got last_flux_metric_data from Redis')
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: retrieving Redis key %s data' % str(cache_key))
                    last_flux_metric_data = False

                last_flux_timestamp = None
                if last_flux_metric_data:
                    try:
                        last_flux_timestamp = int(last_flux_metric_data[0])
                        if LOCAL_DEBUG:
                            logger.info('worker :: got last_flux_timestamp - %s' % str(last_flux_timestamp))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed determining last_flux_timestamp')
                        last_flux_timestamp = False

                # Determine the timestamp of the current minute to apply
                # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                time_now = int(time())
                # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M')
                current_minute_hour = int(datetime.datetime.utcfromtimestamp(time_now).strftime('%H'))
                current_minute_minute = int(datetime.datetime.utcfromtimestamp(time_now).strftime('%M'))
                current_datetime = datetime.datetime.utcfromtimestamp(time_now).replace(
                    hour=current_minute_hour, minute=current_minute_minute, second=0, microsecond=0)
                current_minute_timestamp_start = int(current_datetime.strftime('%s'))
                datapoint = None
                last_timestamp_with_data = None
                timeseries = []

                # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                metric_resolution = 60
                metric_resolution_determined = False

                try:
                    if python_version == 3:
                        datapoints_str = literal_eval(metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                    else:
                        # metric_datapoints = metric_data[0]['datapoints']
                        datapoints_str = literal_eval(metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                    # for value, timestamp in metric_data[0]['datapoints']:
                    if LOCAL_DEBUG:
                        len_metric_datapoints = len(metric_datapoints)
                        logger.info('worker :: got %s metric_datapoints - %s' % (
                            str(len_metric_datapoints), str(metric_datapoints)))

                    # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                    # Determine resolution
                    resolution_timestamps = []
                    for metric_datapoint in metric_datapoints:
                        timestamp = int(metric_datapoint[0])
                        resolution_timestamps.append(timestamp)
                    timestamp_resolutions = []
                    if resolution_timestamps:
                        last_timestamp = None
                        for timestamp in resolution_timestamps:
                            if last_timestamp:
                                resolution = timestamp - last_timestamp
                                timestamp_resolutions.append(resolution)
                                last_timestamp = timestamp
                            else:
                                last_timestamp = timestamp
                    if timestamp_resolutions:
                        try:
                            timestamp_resolutions_count = Counter(timestamp_resolutions)
                            ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                            metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                            if metric_resolution > 0:
                                metric_resolution_determined = True
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: worker :: failed to determine metric_resolution from %s' % (str(metric_data)))
                    if metric_resolution_determined:
                        cache_key = 'vista.last.resolution.%s' % metric
                        try:
                            # Update Redis key
                            self.redis_conn.setex(cache_key, 3600, metric_resolution)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: fetcher :: failed to set Redis key - %s' % (cache_key))

                    for metric_datapoint in metric_datapoints:
                        # @20191010 - Branch #3140: vista
                        # fetcher passes through preformatted data points that
                        # are in the same format/order for both graphite and
                        # prometheus
                        # if remote_host_type == 'graphite':
                        #     value = float(metric_datapoint[0])
                        #     timestamp = int(metric_datapoint[1])
                        # if remote_host_type == 'prometheus':
                        #     value = float(metric_datapoint[1])
                        #     timestamp = int(metric_datapoint[0])
                        timestamp = int(metric_datapoint[0])
                        value = float(metric_datapoint[1])
                        append_to_timeseries = False
                        if last_flux_timestamp:
                            if int(timestamp) > last_flux_timestamp:
                                # timeseries.append([timestamp, value])
                                append_to_timeseries = True
                        else:
                            # timeseries.append([timestamp, value])
                            append_to_timeseries = True

                        # Here if the timestamp of the data point falls
                        # within the current minute, it is discarded and not
                        # sent to flux, to ensure that high frequency metrics
                        # can have their minutely bins fully populated before
                        # they are submitted to Graphite
                        if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                            if int(timestamp) >= current_minute_timestamp_start:
                                append_to_timeseries = False
                        if append_to_timeseries:
                            timeseries.append([timestamp, value])

                    last_timestamp_with_data = 0
                    for timestamp, value in timeseries[::-1]:
                        has_value = False
                        if value == 0.0:
                            has_value = True
                        if value:
                            has_value = True
                        if has_value:
                            last_timestamp_with_data = int(timestamp)
                            datapoint = value
                            break
                    if last_timestamp_with_data:
                        have_data = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine datapoints from %s' % (str(metric_data)))
                    delete_set_record = True
                if not timeseries:
                    logger.info('worker :: after processing, there were no valid data points in %s' % (str(metric_data)))
                    delete_set_record = True
                if not have_data and timeseries:
                    logger.error('error :: worker :: failed to determine last_timestamp_with_data from %s' % (str(metric_data)))
                    delete_set_record = True
            if delete_set_record:
                try:
                    redis_set = 'vista.fetcher.metrics.json'
                    self.redis_conn.srem(redis_set, str_metric_data)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                        str(redis_set), str(str_metric_data)))
                continue

            if not metric:
                continue

            valid_data = True
            if last_flux_timestamp and last_timestamp_with_data:
                if int(last_timestamp_with_data) <= last_flux_timestamp:
                    valid_data = False
            if not valid_data:
                redis_set = 'vista.fetcher.metrics.json'
                logger.info('worker :: no valid data in fetched data removing from Redis set %s - data - %s' % (
                    redis_set, str(str_metric_data)))
                try:
                    self.redis_conn.srem(redis_set, str_metric_data)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                        redis_set, str(str_metric_data)))
                continue

            if valid_data:
                flux_host = 'http://%s:%s' % (settings.FLUX_IP, settings.FLUX_PORT)

                # Resample
                resample_at = None
                if resample_at == 'none' or resample_at == '0Min':
                    resample_at = False
                if resample_at == 'None' or resample_at == '0min':
                    resample_at = False
                if resample_at is None or resample_at == '0' or resample_at == 0:
                    resample_at = False
                if resample_at:
                    try:
                        df = pd.DataFrame(timeseries)
                        df.columns = ['timestamp', 'value']
                        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
                        df = df.set_index('timestamp')
                        resampled_df = df.resample(resample_at).sum()
                        resampled_timeseries = []
                        for index, row in resampled_df.iterrows():
                            timestamp = int(index.strftime('%s'))
                            resampled_timeseries.append([timestamp, row[0]])
                        timeseries = resampled_timeseries
                        timeseries_length = len(timeseries)
                        logger.info('worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (
                            str(resample_at), str(timeseries_length)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to resample time series at %s for %s with time series %s' % (
                            str(resample_at), str(metric), str(timeseries)))

                for timestamp, value in timeseries:
                    flux_url = '%s/metric_data?metric=%s&value=%s&timestamp=%s&key=%s' % (
                        flux_host, metric, str(value), str(timestamp), settings.FLUX_SELF_API_KEY)
                    success = False
                    try:
                        response = requests.get(flux_url)
                        if response.status_code == 200:
                            success = True
                        elif response.status_code == 204:
                            success = True
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to request %s' % str(flux_url))
                    if not success:
                        logger.error('error :: worker :: http status code - %s, reason - %s' % (
                            str(response.status_code), str(response.reason)))

                if success:
                    metrics_sent_to_flux += 1
                    redis_set = 'vista.fetcher.metrics.json'
                    # @added 20191011 - Task #3258: Reduce vista logging
                    timeseries_length = len(timeseries)
                    # @modified 20191011 - Task #3258: Reduce vista logging
                    # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % (
                    #     redis_set))
                    logger.info('worker :: %s data points submitted to flux OK for %s' % (
                        str(timeseries_length), metric))
                    try:
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                            redis_set, str(str_metric_data)))
                    redis_set = 'vista.fetcher.unique_metrics'
                    try:
                        self.redis_conn.sadd(redis_set, remote_target)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to add %s to Redis set %s' % (
                            remote_target, redis_set))

        time_now = int(time())
        if (time_now - last_sent_to_graphite) >= 60:
            logger.info('worker :: metrics sent_to_flux in last 60 seconds - %s' % str(metrics_sent_to_flux))
            send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace
            try:
                send_graphite_metric(parent_skyline_app, send_metric_name, str(metrics_sent_to_flux))
                last_sent_to_graphite = int(time())
                metrics_sent_to_flux = 0
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: worker :: failed to send_graphite_metric %s with %s' % (
                    send_metric_name, str(metrics_sent_to_flux)))
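
# Standalone sketch of the pandas resampling step used above: a
# [timestamp, value] time series is summed into fixed-size bins. The 60
# second ('60S') rule and the sample data are illustrative only; it mirrors
# the worker's resample block, including the platform-dependent '%s'
# strftime epoch conversion.
import pandas as pd


def _resample_sum(timeseries, rule='60S'):
    df = pd.DataFrame(timeseries, columns=['timestamp', 'value'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
    df = df.set_index('timestamp')
    resampled_df = df.resample(rule).sum()
    return [[int(index.strftime('%s')), row[0]] for index, row in resampled_df.iterrows()]


# _resample_sum([[0, 1.0], [15, 1.0], [30, 1.0], [60, 2.0]]) sums the first
# three points into the first 60 second bin and the last point into the next.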
def run_selected_algorithm(timeseries, metric_name, airgapped_metrics, airgapped_metrics_filled, run_negatives_present, check_for_airgaps_only):
    """
    Filter timeseries and run selected algorithm.
    """

    # @added 20180807 - Feature #2492: alert on stale metrics
    # Determine if a metric has stopped sending data and if so add to the
    # analyzer.alert_on_stale_metrics Redis set
    add_to_alert_on_stale_metrics = False
    if ALERT_ON_STALE_METRICS:
        # @modified 20180816 - Feature #2492: alert on stale metrics
        # Added try and except to prevent some errors that are encountered between
        # 00:14 and 00:17 on some days
        # Traceback (most recent call last):
        # File "/opt/skyline/github/skyline/skyline/analyzer/analyzer.py", line 394, in spin_process
        #   anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)
        # File "/opt/skyline/github/skyline/skyline/analyzer/algorithms.py", line 530, in run_selected_algorithm
        #   if int(time()) - int(timeseries[-1][0]) >= ALERT_ON_STALE_PERIOD:
        # IndexError: list index out of range
        try:
            if int(time()) - int(timeseries[-1][0]) >= ALERT_ON_STALE_PERIOD:
                add_to_alert_on_stale_metrics = True
        except:
            # @modified 20180816 - Feature #2492: alert on stale metrics
            add_to_alert_on_stale_metrics = False
        try:
            if int(time()) - int(timeseries[-1][0]) >= STALE_PERIOD:
                add_to_alert_on_stale_metrics = False
        except:
            add_to_alert_on_stale_metrics = False

        if add_to_alert_on_stale_metrics:
            try:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
                redis_conn.sadd('analyzer.alert_on_stale_metrics', metric_name)
            except:
                pass

    # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Check to see if this is a batch processing metric that has been sent
    # through Analyzer to check for airgaps only and if so do not check the
    # timeseries for exceptions
    check_for_timeseries_exceptions = True
    check_airgap_only = None
    if BATCH_PROCESSING and check_for_airgaps_only:
        check_airgap_only_key = 'analyzer.check_airgap_only.%s' % metric_name
        try:
            if not add_to_alert_on_stale_metrics:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
            check_airgap_only = redis_conn.get(check_airgap_only_key)
        except:
            check_airgap_only = None
        if check_airgap_only:
            check_for_timeseries_exceptions = False

    # @modified 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Wrapped in check_for_timeseries_exceptions as if it is a check_airgap_only
    # metric then the time series should not be checked for exceptions
    if check_for_timeseries_exceptions:
        # Get rid of short series
        if len(timeseries) < MIN_TOLERABLE_LENGTH:
            raise TooShort()

        # Get rid of stale series
        if time() - timeseries[-1][0] > STALE_PERIOD:
            raise Stale()

        # Get rid of boring series
        if len(set(item[1] for item in timeseries[-MAX_TOLERABLE_BOREDOM:])) == BOREDOM_SET_SIZE:
            raise Boring()

    # @added 20200423 - Feature #3508: ionosphere.untrainable_metrics
    # Added run_negatives_present
    negatives_found = False

    # @added 20200117 - Feature #3400: Identify air gaps in the metric data
    # @modified 20200214 - Bug #3448: Repeated airgapped_metrics
    #                      Feature #3400: Identify air gaps in the metric data
    # if IDENTIFY_AIRGAPS:
    if IDENTIFY_AIRGAPS or IDENTIFY_UNORDERED_TIMESERIES:
        # airgaps = identify_airgaps(metric_name, timeseries, airgapped_metrics)
        # if airgaps:
        process_metric = True
        if IDENTIFY_AIRGAPS:
            if CHECK_AIRGAPS:
                process_metric = False
                # @added 20200423 - Feature #3504: Handle airgaps in batch metrics
                #                   Feature #3400: Identify air gaps in the metric data
                # Replaced code block below to determine if a metric is a check
                # with a skyline_functions definition of that block as
                # the check_metric_for_airgaps function
                check_metric_for_airgaps = False
                try:
                    check_metric_for_airgaps = is_check_airgap_metric(metric_name)
                except:
                    check_metric_for_airgaps = False
                    try:
                        logger.error('failed to determine if %s is an airgap metric: %s' % (
                            str(metric_name), traceback.format_exc()))
                    except:
                        logger.error('failed to determine if the metric is an airgap metric')
                if check_metric_for_airgaps:
                    process_metric = True
        else:
            # If IDENTIFY_AIRGAPS is not enabled and
            # IDENTIFY_UNORDERED_TIMESERIES is enabled process the metric
            if IDENTIFY_UNORDERED_TIMESERIES:
                process_metric = True
        airgaps = None
        unordered_timeseries = False
        if process_metric:
            # @modified 20200501 - Feature #3400: Identify air gaps in the metric data
            # Added airgapped_metrics_filled
            # airgaps, unordered_timeseries = identify_airgaps(metric_name, timeseries, airgapped_metrics)
            airgaps, unordered_timeseries = identify_airgaps(metric_name, timeseries, airgapped_metrics, airgapped_metrics_filled)
        if airgaps or unordered_timeseries:
            try:
                redis_conn.ping()
            except:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
        if airgaps:
            for i in airgaps:
                try:
                    redis_conn.sadd('analyzer.airgapped_metrics', str(i))
                    logger.info('adding airgap %s' % str(i))
                    # TODO: learn_airgapped_metrics
                except:
                    pass
            del airgaps

        # @added 20200214 - Bug #3448: Repeated airgapped_metrics
        #                   Feature #3400: Identify air gaps in the metric data
        # Also add unordered time series to the analyzer.unordered_timeseries
        # Redis set
        if unordered_timeseries:
            try:
                redis_conn.sadd('analyzer.unordered_timeseries', metric_name)
                del unordered_timeseries
            except:
                pass

    # @added 20200423 - Feature #3504: Handle airgaps in batch metrics
    #                   Feature #3480: batch_processing
    #                   Feature #3486: analyzer_batch
    #                   Feature #3400: Identify air gaps in the metric data
    # Check to see if this is a batch processing metric that has been sent to
    # analyzer_batch for processing but sent through Analyzer to check for
    # airgaps only and if so return as it should not be run through algorithms
    if BATCH_PROCESSING:
        if check_airgap_only:
            try:
                redis_conn.delete(check_airgap_only_key)
            except:
                try:
                    logger.error('failed to delete Redis key %s: %s' % (
                        str(check_airgap_only_key), traceback.format_exc()))
                except:
                    logger.error('failed to log failure regarding deleting the check_airgap_only_key Redis key')
            # @modified 20200430 - Feature #3480: batch_processing
            # Tidy up and reduce logging, only log if debug enabled
            if BATCH_PROCESSING_DEBUG:
                logger.info('algorithms :: batch processing - batch metric %s checked for airgaps only, not analysing' % (str(metric_name)))
            # TODO: the only worry here is that this metric then gets added to
            # the not_anomalous Redis set? Not sure if that is a problem, I do
            # not think it is. Unless it is in the end of anomaly_end_timestamp
            # context?
            # @modified 20200424 - Feature #3508: ionosphere.untrainable_metrics
            # Added negatives_found
            return False, [], 1, negatives_found

    # RUN_OPTIMIZED_WORKFLOW - replaces the original ensemble method:
    # ensemble = [globals()[algorithm](timeseries) for algorithm in ALGORITHMS]
    # which runs all timeseries through all ALGORITHMS
    final_ensemble = []
    number_of_algorithms_triggered = 0
    number_of_algorithms_run = 0
    number_of_algorithms = len(ALGORITHMS)
    maximum_false_count = number_of_algorithms - CONSENSUS + 1
    # logger.info('the maximum_false_count is %s, above which CONSENSUS cannot be achieved' % (str(maximum_false_count)))
    consensus_possible = True

    # DEVELOPMENT: this is for a development version of analyzer only
    if skyline_app == 'analyzer_dev':
        time_all_algorithms = True
    else:
        time_all_algorithms = False

    algorithm_tmp_file_prefix = '%s/%s.' % (SKYLINE_TMP_DIR, skyline_app)

    for algorithm in ALGORITHMS:
        if consensus_possible:
            if send_algorithm_run_metrics:
                algorithm_count_file = '%s%s.count' % (algorithm_tmp_file_prefix, algorithm)
                algorithm_timings_file = '%s%s.timings' % (algorithm_tmp_file_prefix, algorithm)

            run_algorithm = []
            run_algorithm.append(algorithm)
            number_of_algorithms_run += 1
            if send_algorithm_run_metrics:
                start = timer()
            try:
                algorithm_result = [globals()[test_algorithm](timeseries) for test_algorithm in run_algorithm]
            except:
                # logger.error('%s failed' % (algorithm))
                algorithm_result = [None]

            if send_algorithm_run_metrics:
                end = timer()
                with open(algorithm_count_file, 'a') as f:
                    f.write('1\n')
                with open(algorithm_timings_file, 'a') as f:
                    f.write('%.6f\n' % (end - start))
        else:
            algorithm_result = [False]
            # logger.info('CONSENSUS NOT ACHIEVABLE - skipping %s' % (str(algorithm)))

        if algorithm_result.count(True) == 1:
            result = True
            number_of_algorithms_triggered += 1
            # logger.info('algorithm %s triggered' % (str(algorithm)))
        elif algorithm_result.count(False) == 1:
            result = False
        elif algorithm_result.count(None) == 1:
            result = None
        else:
            result = False

        final_ensemble.append(result)

        if not RUN_OPTIMIZED_WORKFLOW:
            continue

        if time_all_algorithms:
            continue

        if ENABLE_ALL_ALGORITHMS_RUN_METRICS:
            continue

        # true_count = final_ensemble.count(True)
        # false_count = final_ensemble.count(False)
        # logger.info('current false_count %s' % (str(false_count)))

        if final_ensemble.count(False) >= maximum_false_count:
            consensus_possible = False
            # logger.info('CONSENSUS cannot be reached as %s algorithms have already not been triggered' % (str(false_count)))
            # skip_algorithms_count = number_of_algorithms - number_of_algorithms_run
            # logger.info('skipping %s algorithms' % (str(skip_algorithms_count)))

    # logger.info('final_ensemble: %s' % (str(final_ensemble)))

    try:
        # ensemble = [globals()[algorithm](timeseries) for algorithm in ALGORITHMS]
        ensemble = final_ensemble

        threshold = len(ensemble) - CONSENSUS
        if ensemble.count(False) <= threshold:
            # @added 20200425 - Feature #3508: ionosphere.untrainable_metrics
            # Only run a negatives_present check if it is anomalous, there
            # is no need to check unless it is related to an anomaly
            if run_negatives_present:
                try:
                    negatives_found = negatives_present(timeseries)
                except:
                    logger.error('Algorithm error: negatives_present :: %s' % traceback.format_exc())
                    negatives_found = False

            if ENABLE_SECOND_ORDER:
                if is_anomalously_anomalous(metric_name, ensemble, timeseries[-1][1]):
                    # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics
                    # Added negatives_found
                    return True, ensemble, timeseries[-1][1], negatives_found
            else:
                return True, ensemble, timeseries[-1][1], negatives_found
negatives_found # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics # Added negatives_found return False, ensemble, timeseries[-1][1], negatives_found except: logger.error('Algorithm error: %s' % traceback.format_exc()) # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics # Added negatives_found return False, [], 1, negatives_found
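# Illustrative sketch (not Skyline's algorithms.py) of the CONSENSUS early-exit
# loop used by run_selected_algorithm above: once the number of algorithms that
# have returned False reaches maximum_false_count, CONSENSUS can no longer be
# achieved, so the remaining algorithms are skipped and recorded as False. The
# algorithm functions and the EXAMPLE_CONSENSUS value below are hypothetical
# stand-ins, not real detection algorithms.

EXAMPLE_CONSENSUS = 2

def always_true(timeseries):
    # stand-in for a real detection algorithm
    return True

def always_false(timeseries):
    # stand-in for a real detection algorithm
    return False

EXAMPLE_ALGORITHMS = [always_false, always_false, always_true, always_true]

def consensus_with_early_exit(timeseries):
    ensemble = []
    number_of_algorithms = len(EXAMPLE_ALGORITHMS)
    # above this many False results CONSENSUS cannot be achieved
    maximum_false_count = number_of_algorithms - EXAMPLE_CONSENSUS + 1
    consensus_possible = True
    for algorithm in EXAMPLE_ALGORITHMS:
        if consensus_possible:
            result = algorithm(timeseries)
        else:
            # CONSENSUS is already unreachable, skip running the algorithm
            result = False
        ensemble.append(result)
        if ensemble.count(False) >= maximum_false_count:
            consensus_possible = False
    # anomalous if no more than (number_of_algorithms - CONSENSUS) returned False
    anomalous = ensemble.count(False) <= (number_of_algorithms - EXAMPLE_CONSENSUS)
    return anomalous, ensemble

# Example: with 2 of the 4 stand-in algorithms triggering and a CONSENSUS of 2,
# consensus_with_early_exit([(1, 1.0), (2, 1.0)]) returns (True, [False, False, True, True])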
def run(self): """ Called when the process intializes. """ def pickle_data_to_graphite(data): message = None try: payload = pickle.dumps(data, protocol=2) header = struct.pack("!L", len(payload)) message = header + payload except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to pickle to send to Graphite' ) return False if message: try: sock = socket.socket() sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT)) sock.sendall(message) sock.close() except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send pickle data to Graphite' ) return False else: logger.error( 'error :: populate_metric_worker :: failed to pickle metric data into message' ) return False return True logger.info('populate_metric_worker :: starting worker') # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'populate_metric_worker :: cannot connect to Redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) # @added 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) metricDict = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. 
# metric_json = [metricName, metricValue, metricTimestamp] metricDict = self.q.get(True, 1) logger.info('populate_metric_worker :: processing queue item') except Empty: logger.info( 'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds' ) sleep(30) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info( 'populate_metric_worker :: server was interrupted - SystemExit' ) except Exception as e: logger.error('error :: populate_metric_worker :: %s' % (str(e))) if not metricDict: continue try: remote_host_type = str(metricDict['remote_host_type']) remote_target = str(metricDict['remote_target']) metric = str(metricDict['metric']) namespace_prefix = str(metricDict['namespace_prefix']) if not namespace_prefix: namespace_prefix = '' if namespace_prefix == 'None': namespace_prefix = '' key = str(metricDict['key']) token = str(metricDict['token']) user = str(metricDict['user']) password = str(metricDict['password']) if metricDict['fetch_resolution_urls'] == 'None': logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue if metricDict['fetch_resolution_urls'] == '()' or metricDict[ 'fetch_resolution_urls'] == (): logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue fetch_resolution_urls_str = literal_eval( metricDict['fetch_resolution_urls']) fetch_resolution_urls = literal_eval(fetch_resolution_urls_str) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) if LOCAL_DEBUG: try: logger.info( 'populate_metric_worker :: remote_target from metricData set to %s' % remote_target) logger.info( 'populate_metric_worker :: metric from metricData set to %s' % metric) logger.info( 'populate_metric_worker :: namespace_prefix from metricData set to %s' % namespace_prefix) logger.info( 'populate_metric_worker :: key from metricData set to %s' % key) logger.info( 'populate_metric_worker :: token from metricData set to %s' % token) logger.info( 'populate_metric_worker :: user from metricData set to %s' % user) logger.info( 'populate_metric_worker :: password from metricData set to %s' % password) logger.info( 'populate_metric_worker :: fetch_resolution_urls from metricData set to %s' % str(fetch_resolution_urls)) if fetch_resolution_urls: for fetch_url in fetch_resolution_urls: logger.info( 'populate_metric_worker :: a fetch_url from metricData is set to %s' % str(fetch_url)) logger.info( 'populate_metric_worker :: metric is set to %s' % metric) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) # Best effort to de-duplicate the data sent to Graphite cache_key = 'flux.last.%s' % metric last_flux_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8') redis_last_metric_data = self.redis_conn_decoded.get(cache_key) last_metric_data = literal_eval(redis_last_metric_data) last_flux_timestamp = int(last_metric_data[0]) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s' % cache_key) last_flux_timestamp = False recent_last_flux_timestamp_present = False if last_flux_timestamp: now = 
int(time()) if (now - last_flux_timestamp) < 600: recent_last_flux_timestamp_present = True # Skyline has the metric so adding it to the vista.fetcher # Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s' % (metric, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) # continue if not last_flux_timestamp: # Check Graphite does not have the data or determine what the # last data Graphite has is logger.info( 'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data' % (metric)) check_graphite_from = [ '-50mins', '-6hours', '-24hours', '-7days', '-30days', '-90days' ] timeseries = [] for graphite_from in check_graphite_from: if last_flux_timestamp: break logger.info( 'populate_metric_worker :: checking %s in Graphite from %s' % (metric, graphite_from)) got_data = False try: # We use absolute time so that if there is a lag in mirage the correct # timeseries data is still surfaced relevant to the anomalous datapoint # timestamp if settings.GRAPHITE_PORT != '': url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, str(settings.GRAPHITE_PORT), settings.GRAPHITE_RENDER_URI, graphite_from, metric) else: url = '%s://%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_RENDER_URI, graphite_from, metric) logger.info( 'populate_metric_worker :: using Graphite URL - %s' % (url)) r = requests.get(url) if r.status_code == 200: js = [] try: js = r.json() except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue if not js: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue got_data = True logger.info( 'populate_metric_worker :: %s present in Graphite from %s' % (metric, graphite_from)) else: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue datapoints = [] if got_data: try: js = r.json() datapoints = js[0]['datapoints'] logger.info( 'populate_metric_worker :: %s data points are present in the Graphite %s data' % (str(len(datapoints)), str(graphite_from))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) for datapoint in datapoints: try: value = float(datapoint[0]) timestamp = int(datapoint[1]) new_datapoint = [timestamp, value] timeseries.append(new_datapoint) except: # nosec continue last_timestamp_with_data = None for timestamp, value in timeseries[::-1]: has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if has_value: last_timestamp_with_data = int(timestamp) datapoint = value break if last_timestamp_with_data: # Here we set this as the missing last_flux_timestamp last_flux_timestamp = last_timestamp_with_data recent_last_flux_timestamp_present = True logger.info( 'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp' % 
(metric, str(graphite_from), str(last_flux_timestamp))) timeseries = [] start_populating = int(time()) datapoints_added_to_timeseries = 0 datapoints_already_populated = 0 datapoints_with_no_value = 0 timestamp = None value = None # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key is the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista raw_timeseries = [] for fetch_url in fetch_resolution_urls: # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': # This was for the query query and resample method and not for # the query_range query if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) success = False try: logger.info( 'populate_metric_worker :: getting data from %s' % str(fetch_url)) response = requests.get(fetch_url) if response.status_code == 200: success = True except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: http status code - %s, reason - %s' % (str(response.status_code), str(response.reason))) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) if not success: continue datapoints = None try: js = response.json() if remote_host_type == 'graphite': datapoints = js[0]['datapoints'] if remote_host_type == 'prometheus': datapoints = js['data']['result'][0]['values'] datapoints_fetched = len(datapoints) logger.info( 'populate_metric_worker :: retrieved %s data points from %s' % (str(datapoints_fetched), str(fetch_url))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) # Example # datapoints[0] # [7.3, 1556817000] # Add each data point and timestamp to the timeseries list so # they can be sent to Graphite if not datapoints: logger.info( 'populate_metric_worker :: failed to get any data from %s' % str(fetch_url)) continue # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints valid_datapoints = [] for datapoint 
in datapoints: value = None timestamp = None if remote_host_type == 'graphite': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[1], datapoint[0]]) try: raw_value = datapoint[0] if raw_value is None: datapoints_with_no_value += 1 continue value = float(datapoint[0]) timestamp = int(datapoint[1]) valid_datapoints.append([value, timestamp]) except: continue if remote_host_type == 'prometheus': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[0], datapoint[1]]) try: raw_value = datapoint[1] if raw_value is None: datapoints_with_no_value += 1 continue timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue valid_datapoints.append([timestamp, value]) datapoints = valid_datapoints # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data datapoints.sort() # Determine the timestamp of the current minute to apply # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE time_now = int(time()) current_minute_hour = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%H')) current_minute_minute = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%M')) current_datetime = datetime.datetime.utcfromtimestamp( time_now).replace(hour=current_minute_hour, minute=current_minute_minute, second=0, microsecond=0) current_minute_timestamp_start = int( current_datetime.strftime('%s')) datapoints_in_current_minute = 0 last_error = None value = None timestamp = None for datapoint in datapoints: try: if remote_host_type == 'graphite': try: raw_value = datapoint[0] if raw_value is None: continue value = float(datapoint[0]) timestamp = int(datapoint[1]) except: continue if remote_host_type == 'prometheus': # timestamp = int(datapoint[0]) try: timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue submit_data = True if last_flux_timestamp: if timestamp <= last_flux_timestamp: submit_data = False datapoints_already_populated += 1 # Here if the timestamp of the data point falls # within the current minute, it is discarded and not # sent to flux, to ensure that high frequency metrics # can have their minutely bins fully populated before # they are submitted to Graphite if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE: if timestamp >= current_minute_timestamp_start: submit_data = False datapoints_in_current_minute += 1 if submit_data: new_datapoint = [timestamp, value] timeseries.append(new_datapoint) datapoints_added_to_timeseries += 1 # nosec to exclude from bandit tests except: # nosec last_error = traceback.format_exc() datapoints_with_no_value += 1 continue if last_error: logger.error(last_error) logger.error( 'error :: populate_metric_worker :: the above is the last_error encountered processing %s' % (str(metric))) if datapoints_with_no_value: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they had value None' % (str(datapoints_with_no_value))) if datapoints_in_current_minute: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute' % (str(datapoints_in_current_minute))) logger.info( 'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp' % (str(datapoints_already_populated))) logger.info( 'populate_metric_worker :: added %s data points to the time series to submit to Graphite' % (str(datapoints_added_to_timeseries))) 
end_fetching = int(time()) seconds_to_fetch = end_fetching - start_populating if timestamp: logger.info( 'populate_metric_worker :: last fetched value - %s, timestamp %s' % (str(value), str(timestamp))) logger.info( 'populate_metric_worker :: %s data points fetched for %s in %s seconds' % (str(datapoints_added_to_timeseries), remote_target, str(seconds_to_fetch))) # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key if the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista if not timeseries: set_flux_key = False try: sorted_raw_timeseries = sorted(raw_timeseries, key=lambda x: x[0]) last_ts = sorted_raw_timeseries[-1][0] if int(last_ts) > (end_fetching - 120): if sorted_raw_timeseries[-1][1] is None: set_flux_key = True except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine if last value was null' ) if set_flux_key: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(last_ts), None] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: even though no data points so as to not loop round on this metric, set the metric Redis key - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: even though no data points, added %s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s' % (str(data), str(redis_set))) if not timeseries: logger.info( 'populate_metric_worker :: no data in the timeseries list for the time series for %s' % metric) continue # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data timeseries.sort() timeseries_length = len(timeseries) # Resample resample_at = '1Min' if resample_at: try: df = pd.DataFrame(timeseries) df.columns = ['timestamp', 'value'] df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix') df = df.set_index('timestamp') # resampled_df = df.resample(resample_at).sum() # Use the mean as Prometheus uses the average in the # query_range API method resampled_df = df.resample(resample_at).mean() resampled_timeseries = [] for index, row in resampled_df.iterrows(): timestamp = int(index.strftime('%s')) resampled_timeseries.append([timestamp, row[0]]) timeseries = resampled_timeseries timeseries_length = len(timeseries) logger.info( 'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (str(resample_at), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to resample time series for %s' % str(metric)) logger.info( 'populate_metric_worker :: %s data points to send to Graphite' % (str(timeseries_length))) timestamp = None value = None sent_to_graphite = 0 # use_pickle = False use_pickle = True if not use_pickle: for timestamp, value in timeseries: try:
graphyte.send(metric, float(value), int(timestamp)) sent_to_graphite += 1 if sent_to_graphite % 1000 == 0: logger.info( 'populate_metric_worker :: submitted %s of %s data points to Graphite so far' % (str(sent_to_graphite), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(metric)) else: listOfMetricTuples = [] try: for timestamp, value in timeseries: tuple_data = (metric, (int(timestamp), float(value))) listOfMetricTuples.append(tuple_data) sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s' % str(metric)) if listOfMetricTuples: data_points_sent = 0 smallListOfMetricTuples = [] tuples_added = 0 for data in listOfMetricTuples: smallListOfMetricTuples.append(data) tuples_added += 1 if tuples_added >= 1000: pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_added logger.info( 'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_added), str(data_points_sent), str(timeseries_length), metric)) sent_to_graphite += len( smallListOfMetricTuples) smallListOfMetricTuples = [] tuples_added = 0 else: logger.error( 'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s' % (str(tuples_added), metric)) if smallListOfMetricTuples: tuples_to_send = len(smallListOfMetricTuples) pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_to_send logger.info( 'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_to_send), str(data_points_sent), str(timeseries_length), metric)) else: logger.error( 'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s' % (str(tuples_to_send), metric)) logger.info( 'populate_metric_worker :: sent %s data points to Graphite for %s' % (str(sent_to_graphite), metric)) try: skyline_metric = '%s.datapoints_sent_to_graphite' % ( skyline_app_graphite_namespace) # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, float(sent_to_graphite), int(time())) send_graphite_metric(skyline_app, skyline_metric, float(sent_to_graphite)) logger.info( 'populate_metric_worker :: submitted %s to Graphite for %s' % (str(float(sent_to_graphite)), skyline_metric)) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(skyline_metric)) has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if timestamp and has_value: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(timestamp), float(value)] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: set the metric Redis key - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: added 
%s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) end_populating = int(time()) seconds_to_run = end_populating - start_populating logger.info( 'populate_metric_worker :: %s populated to Graphite in %s seconds' % (metric, str(seconds_to_run)))
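# Minimal sketch of the Graphite pickle submission that pickle_data_to_graphite
# above wraps. The carbon pickle receiver expects a pickled list of
# (metric, (timestamp, value)) tuples prefixed with a 4 byte big-endian length
# header. The host and port values in the usage comment are placeholders, not
# Skyline's CARBON_HOST or FLUX_CARBON_PICKLE_PORT settings.
import pickle
import socket
import struct
import time

def send_tuples_via_pickle(graphite_host, pickle_port, metric_tuples):
    # metric_tuples e.g. [('test.metric', (int(time.time()), 1.0))]
    payload = pickle.dumps(metric_tuples, protocol=2)
    header = struct.pack('!L', len(payload))
    message = header + payload
    sock = socket.socket()
    try:
        sock.connect((graphite_host, pickle_port))
        sock.sendall(message)
    finally:
        sock.close()
    return True

# Example usage with placeholder values:
# send_tuples_via_pickle('127.0.0.1', 2004, [('test.metric', (int(time.time()), 1.0))])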
# @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes # types need to be decoded as utf-8 to str # if settings.REDIS_PASSWORD: # redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # @added 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Added a single functions to deal with Redis connection and the # charset='utf-8', decode_responses=True arguments required in py3 redis_conn = get_redis_conn(skyline_app) redis_conn_decoded = get_redis_conn_decoded(skyline_app) def get_anomaly(request_type): """ Query the database for the anomaly details """ logger = logging.getLogger(skyline_app_logger) if isinstance(request_type, int): latest = False else: latest = True
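# Sketch of why two Redis connections are kept above. Under py3 the default
# redis client returns bytes, so set members and key values have to be decoded
# before being compared to str metric names; the decoded connection does this
# transparently. The socket path below is a placeholder, not
# settings.REDIS_SOCKET_PATH, and Skyline's get_redis_conn helpers also handle
# REDIS_PASSWORD.
from redis import StrictRedis

example_redis_conn = StrictRedis(unix_socket_path='/tmp/redis.sock')
example_redis_conn_decoded = StrictRedis(
    unix_socket_path='/tmp/redis.sock', decode_responses=True)

example_redis_conn.sadd('example.unique_metrics', 'stats.timer.metric')

# bytes members, e.g. {b'stats.timer.metric'}
raw_members = example_redis_conn.smembers('example.unique_metrics')
# str members, e.g. {'stats.timer.metric'}
decoded_members = example_redis_conn_decoded.smembers('example.unique_metrics')

print('stats.timer.metric' in raw_members)      # False - bytes vs str
print('stats.timer.metric' in decoded_members)  # True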
def run(self): """ - Called when the process intializes. - Determine if Redis is up - Spawn a rolling process to do checks - Wait for the process to finish. - run_every 60 seconds """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('thunder/rolling :: starting %s/rolling' % skyline_app) try: SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME if SERVER_METRIC_PATH == '.': SERVER_METRIC_PATH = '' except Exception as e: SERVER_METRIC_PATH = '' logger.warning( 'warning :: thunder/rolling :: settings.SERVER_METRICS_NAME is not declared in settings.py, defaults to \'\' - %s' % e) run_every = 60 while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling cannot connect to redis at socket path %s - %s' % (settings.REDIS_SOCKET_PATH, e)) sleep(10) try: self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) except Exception as e: logger.info(traceback.format_exc()) logger.error( 'error :: thunder/rolling cannot connect to get_redis_conn - %s' % e) continue # Report app up try: self.redis_conn.setex('thunder.rolling', 120, now) except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: could not update the Redis analyzer.thunder/rolling key - %s' % e) # Spawn processes pids = [] spawned_pids = [] pid_count = 0 try: p = Process(target=self.rolling_process, args=(0, )) pids.append(p) pid_count += 1 logger.info('thunder/rolling :: starting rolling_process') p.start() spawned_pids.append(p.pid) except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: failed to spawn process - %s' % e) # Self monitor processes and terminate if any rolling_process that # has run for longer than 180 seconds p_starts = time() while time() - p_starts <= run_every: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info( 'thunder/rolling :: rolling_process completed in %.2f seconds' % (time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info( 'thunder/rolling :: timed out, killing rolling_process process' ) for p in pids: logger.info( 'thunder/rolling :: killing rolling_process process') p.terminate() logger.info( 'thunder/rolling :: killed rolling_process process') for p in pids: if p.is_alive(): try: logger.info( 'thunder/rolling :: stopping rolling_process - %s' % (str(p.is_alive()))) p.terminate() except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: failed to stop rolling_process - %s' % e) process_runtime = time() - now if process_runtime < run_every: sleep_for = (run_every - process_runtime) process_runtime_now = time() - now sleep_for = (run_every - process_runtime_now) logger.info( 'thunder/rolling :: sleeping for %.2f seconds due to low run time...' % sleep_for) sleep(sleep_for) try: del sleep_for except Exception as e: logger.error( 'error :: thunder/rolling :: failed to del sleep_for - %s' % e) try: del process_runtime except Exception as e: logger.error( 'error :: thunder/rolling :: failed to del process_runtime - %s' % e)
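# Sketch of the spawn-and-watchdog pattern used by thunder/rolling above: start
# a worker Process, poll is_alive() until the run_every budget is exhausted and
# terminate the worker if it overruns. example_worker is a stand-in target, not
# Skyline's rolling_process.
from multiprocessing import Process
from time import sleep, time

def example_worker(assigned_id):
    # stand-in for rolling_process, pretend to do a few seconds of work
    sleep(2)

def run_with_timeout(run_every=60):
    pids = []
    p = Process(target=example_worker, args=(0,))
    pids.append(p)
    p.start()
    p_starts = time()
    while time() - p_starts <= run_every:
        if any(proc.is_alive() for proc in pids):
            sleep(0.1)  # avoid hogging the CPU while waiting
        else:
            # all processes completed within the budget
            return time() - p_starts
    # only reached if the loop expired without the workers finishing
    for proc in pids:
        if proc.is_alive():
            proc.terminate()
    return None

if __name__ == '__main__':
    runtime = run_with_timeout(run_every=10)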
def run(self): """ - Called when the process intializes. - Determine if Redis is up and discover checks to run. - Divide and assign each process a metric check to analyse and add results to source Redis set. - Wait for the processes to finish. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os.remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('starting SNAB_flux_load_test') while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() logger.info('pinged Redis via get_redis_conn') except: logger.error(traceback.format_exc()) logger.error( 'error :: cannot connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) try: self.redis_conn = get_redis_conn(skyline_app) logger.info('connected via get_redis_conn') except: logger.error(traceback.format_exc()) logger.error('error :: not connected via get_redis_conn') continue try: self.redis_conn_decoded.ping() logger.info('pinged Redis via get_redis_conn_decoded') except: logger.error(traceback.format_exc()) logger.error( 'error :: not connected via get_redis_conn_decoded') sleep(10) try: self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) logger.info('connected via get_redis_conn_decoded') except: logger.error(traceback.format_exc()) logger.error( 'error :: cannot connect to get_redis_conn_decoded') continue """ Run load test """ while True: current_timestamp = int(time()) logger.info('snab_flux_load_test - running load test') # Spawn processes pids = [] spawned_pids = [] pid_count = 0 p = Process(target=self.spin_snab_flux_load_test_process, args=(current_timestamp, )) pids.append(p) pid_count += 1 logger.info('starting 1 of %s spin_snab_process' % (str(pid_count))) p.start() spawned_pids.append(p.pid) # Send wait signal to zombie processes # for p in pids: # p.join() # Self monitor processes and terminate if any spin_snab_process # that has run for longer than 58 seconds p_starts = time() while time() - p_starts <= 58: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info( '1 spin_snab_flux_load_test_process completed in %.2f seconds' % (time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info( 'timed out, killing spin_snab_flux_load_test_process process' ) for p in pids: p.terminate() # p.join() for p in pids: if p.is_alive(): logger.info( 'stopping spin_snab_flux_load_test_process - %s' % (str(p.is_alive()))) p.join() process_runtime = time() - current_timestamp if process_runtime < 60: sleep_for = (60 - process_runtime) logger.info('sleeping for %.2f seconds' % sleep_for) sleep(sleep_for) try: del sleep_for except: pass
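# Sketch of the fixed-cadence loop used above: each iteration measures its own
# runtime and sleeps only for the remainder of the 60 second window, so an
# iteration that took 12 seconds is followed by a roughly 48 second sleep.
# do_one_iteration is a stand-in for spin_snab_flux_load_test_process.
from time import sleep, time

def do_one_iteration():
    sleep(1)  # pretend work

def run_on_a_cadence(run_every=60, iterations=3):
    for _ in range(iterations):
        current_timestamp = int(time())
        do_one_iteration()
        process_runtime = time() - current_timestamp
        if process_runtime < run_every:
            sleep_for = run_every - process_runtime
            sleep(sleep_for)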
def run(self): """ Called when the process intializes. """ logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) if LOCAL_DEBUG: try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: debug :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. # metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: logger.info('worker :: queue is empty and timed out') sleep(1) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = False if metric_data: try: metric = str(metric_data[0]) value = float(metric_data[1]) timestamp = int(metric_data[2]) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = int(metric_data[3]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: queue item found - %s' % str(metric_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data)) continue if settings.FLUX_SEND_TO_CARBON: # Best effort de-duplicate the data valid_data = True # @added 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately redis_last_metric_data = None # @modified 20200206 - Feature #3444: Allow flux to backfill # Only check flux.last key if this is not backfill if not backfill: cache_key = 'flux.last.%s' % metric last_metric_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key) redis_last_metric_data = self.redis_conn_decoded.get( cache_key) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) redis_last_metric_data = None # @modified 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately, only # literal_eval if Redis had data for the key if redis_last_metric_data: try: last_metric_data = literal_eval( 
redis_last_metric_data) last_metric_timestamp = int( last_metric_data[0]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: last_metric_timestamp for %s from %s is %s' % (metric, str(cache_key), str(last_metric_timestamp))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) last_metric_timestamp = False if last_metric_timestamp: if timestamp <= last_metric_timestamp: valid_data = False if LOCAL_DEBUG: logger.info( 'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s' % (str(timestamp), str(last_metric_timestamp), metric)) if valid_data: submittedToGraphite = False try: graphyte.send(metric, value, timestamp) submittedToGraphite = True logger.info( 'worker :: sent %s, %s, %s to Graphite' % (str(metric), str(value), str(timestamp))) metrics_sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send metric data to Graphite for %s' % str(metric)) metric = None if submittedToGraphite: # Update the metric Redis flux key # @modified 20200206 - Feature #3444: Allow flux to backfill # Only update the flux.last key if this is not backfill if not backfill: metric_data = [timestamp, value] self.redis_conn.set(cache_key, str(metric_data)) # @added 20200213 - Bug #3448: Repeated airgapped_metrics else: # @added 20200213 - Bug #3448: Repeated airgapped_metrics # Add a flux.filled key to Redis with a expiry # set to FULL_DURATION so that Analyzer knows to # sort and deduplicate the Redis time series # data as carbon-relay will send it to Horizon # and the datapoints will be out of order in the # Redis key try: flux_filled_key = 'flux.filled.%s' % str( metric) self.redis_conn.setex( flux_filled_key, settings.FULL_DURATION, int(time())) logger.info('worker :: set Redis key %s' % (str(flux_filled_key))) except Exception as e: logger.error( 'error :: failed to could not set Redis flux.filled key: %s' % e) else: logger.info( 'worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite' % (str(metric), str(value), str(timestamp), str(timestamp))) else: logger.info( 'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s' % (str(settings.FLUX_SEND_TO_CARBON), str(metric), str(value), str(timestamp))) if settings.FLUX_SEND_TO_STATSD: statsd_conn.incr(metric, value, timestamp) logger.info('worker sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp))) time_now = int(time()) if (time_now - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite)))
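# Sketch of the flux.last de-duplication the worker above applies before sending
# a data point to Graphite: flux.last.<metric> holds str([timestamp, value]) of
# the last submitted data point, any data point with a timestamp at or before
# that is discarded, otherwise it is sent and the key is updated. The plain
# StrictRedis client and socket path are placeholders, not Skyline's
# get_redis_conn_decoded helper or settings.
from ast import literal_eval
from redis import StrictRedis

example_redis_conn_decoded = StrictRedis(
    unix_socket_path='/tmp/redis.sock', decode_responses=True)

def deduplicate_and_record(metric, value, timestamp):
    cache_key = 'flux.last.%s' % metric
    last_metric_timestamp = None
    redis_last_metric_data = example_redis_conn_decoded.get(cache_key)
    if redis_last_metric_data:
        last_metric_data = literal_eval(redis_last_metric_data)
        last_metric_timestamp = int(last_metric_data[0])
    if last_metric_timestamp and timestamp <= last_metric_timestamp:
        # a data point for this timestamp (or a newer one) was already submitted
        return False
    # the data point is new - the caller would submit it to Graphite here, then
    # the key is updated so later duplicates are discarded
    example_redis_conn_decoded.set(cache_key, str([int(timestamp), float(value)]))
    return True

# Example usage with a hypothetical metric and epoch timestamp:
# deduplicate_and_record('test.metric', 1.0, 1609459200)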
def update_redis_set( current_skyline_app, redis_set, original_data_str, update_data_str, log=True): """ Manage data in a Redis set. :param current_skyline_app: the app calling the function :param redis_set: the Redis key name of the set :param original_data_str: the data in the set on which to take action :param update_data_str: the updated data or the string 'remove' to remove the data from the set. :param log: whether to log or not, optional, defaults to True :type current_skyline_app: str :type redis_set: str :type original_data_str: str :type update_data_str: str :type log: boolean """ function_str = 'functions.redis.update_set' if log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) else: current_logger = None try: redis_conn = get_redis_conn(current_skyline_app) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error('error :: %s :: failed to connect to Redis to manage data in Redis set %s - %s' % ( function_str, redis_set, e)) try: redis_conn.srem(redis_set, str(original_data_str)) # @added 20220110 - Bug #4364: Prune old thunder.events # Branch #1444: thunder if log: current_logger.info('removed item from Redis set %s - %s' % ( redis_set, str(original_data_str))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error('error :: %s :: failed to remove item from Redis set %s - %s' % ( function_str, redis_set, e)) if update_data_str != 'remove': try: redis_conn.sadd(redis_set, str(update_data_str)) # @added 20220110 - Bug #4364: Prune old thunder.events # Branch #1444: thunder if log: current_logger.info('added updated item to Redis set %s - %s' % ( redis_set, str(update_data_str))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error('error :: %s :: failed to update item in Redis set %s - %s' % ( function_str, redis_set, e)) return
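# Usage sketch for update_redis_set above. The app name, set name and item
# strings are hypothetical examples, not Skyline data.
example_item = str({'metric': 'test.metric', 'status': 'pending'})
updated_item = str({'metric': 'test.metric', 'status': 'done'})

# Replace the item in the set with an updated version
update_redis_set('analyzer', 'example.items', example_item, updated_item)

# Remove the updated item from the set entirely
update_redis_set('analyzer', 'example.items', updated_item, 'remove')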
def run(self): """ Called when the process intializes. """ def pickle_data_to_graphite(data): message = None try: payload = pickle.dumps(data, protocol=2) header = struct.pack("!L", len(payload)) message = header + payload except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to pickle to send to Graphite') return False if message: try: sock = socket.socket() sock.connect( (CARBON_HOST, settings.FLUX_CARBON_PICKLE_PORT)) sock.sendall(message) sock.close() except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send pickle data to Graphite' ) return False else: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to pickle metric data into message' ) return False return True def submit_pickle_data_to_graphite(pickle_data): # @modified 20201207 - Task #3864: flux - try except everything try: number_of_datapoints = len(pickle_data) except Exception as e: logger.error( 'error :: worker :: could not determine number_of_datapoints from len(pickle_data) - %s' % str(e)) return False data_points_sent = 0 smallListOfMetricTuples = [] tuples_added = 0 for data in pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: smallListOfMetricTuples.append(data) tuples_added += 1 if tuples_added >= 480: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) except Exception as e: logger.error( 'error :: worker :: pickle_data_to_graphite error - %s' % str(e)) pickle_data_sent = False # Reduce the speed of submissions to Graphite # if there are lots of data points if number_of_datapoints > 4000: sleep(0.3) if pickle_data_sent: data_points_sent += tuples_added logger.info( 'worker :: sent %s/%s of %s data points to Graphite via pickle' % (str(tuples_added), str(data_points_sent), str(number_of_datapoints))) smallListOfMetricTuples = [] tuples_added = 0 else: logger.error( 'error :: worker :: failed to send %s data points to Graphite via pickle' % (str(tuples_added))) return False except Exception as e: logger.error( 'error :: worker :: error handling data in pickle_data - %s' % str(e)) return False if smallListOfMetricTuples: # @modified 20201207 - Task #3864: flux - try except everything try: tuples_to_send = len(smallListOfMetricTuples) pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_to_send logger.info( 'worker :: sent the last %s/%s of %s data points to Graphite via pickle' % (str(tuples_to_send), str(data_points_sent), str(number_of_datapoints))) else: logger.error( 'error :: failed to send the last %s data points to Graphite via pickle' % (str(tuples_to_send))) return False except Exception as e: logger.error( 'error :: worker :: error in smallListOfMetricTuples pickle_data_to_graphite - %s' % str(e)) return False return True logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES last_zero_fill_to_graphite = 0 metrics_sent = [] remove_from_flux_queue_redis_set = [] # @added 20201019 - Feature #3790: flux - pickle to Graphite pickle_data = [] # send_to_reciever = 'line' send_to_reciever = 'pickle' # @modified 20201207 - Task #3864: flux - try except everything try: metric_data_queue_size = self.q.qsize() except Exception as e: logger.error( 'error :: worker :: could not determine metric_data_queue_size - %s' % str(e)) 
metric_data_queue_size = 0 if metric_data_queue_size > 10: send_to_reciever = 'pickle' # @added 202011120 - Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent = [] # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Even if flux.last Redis keys are disabled in flux they are used in # Vista vista_metrics = [] if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED: try: vista_metrics = list( self.redis_conn_decoded.sscan_iter('vista.metrics', match='*')) except: vista_metrics = [] # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # @modified 20201207 - Task #3864: flux - try except everything try: self.redis_conn = get_redis_conn(skyline_app) except Exception as e: logger.error( 'error :: worker :: could not get_redis_conn - %s' % str(e)) try: self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) except Exception as e: logger.error( 'error :: worker :: could not get_redis_conn_decoded - %s' % str(e)) if LOCAL_DEBUG: try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: debug :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. 
# metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: if pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_submitted = submit_pickle_data_to_graphite( pickle_data) except Exception as e: logger.error( 'error :: worker :: queue Empty failed to submit_pickle_data_to_graphite - %s' % str(e)) pickle_data_submitted = False if pickle_data_submitted: pickle_data = [] logger.info('worker :: queue is empty and timed out') sleep(1) # @added 20201017 - Feature #3788: snab_flux_load_test # Send to Graphite even if worker gets no metrics if (int(time()) - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) metric_data_queue_size = 0 try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace try: send_graphite_metric(skyline_app, skyline_metric, metric_data_queue_size) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) # @added 20201019 - Feature #3790: flux - pickle to Graphite if metric_data_queue_size > 10: send_to_reciever = 'pickle' else: send_to_reciever = 'line' send_to_reciever = 'pickle' # @added 202011120 - Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent_strs = [] for item in metrics_data_sent: metrics_data_sent_strs.append(str(item)) if metrics_data_sent_strs: try: self.redis_conn.sadd('flux.metrics_data_sent', *set(metrics_data_sent_strs)) logger.info( 'worker :: added %s items to the flux.metrics_data_sent Redis set' % str(len(metrics_data_sent))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to add items to the flux.metrics_data_sent Redis set' ) metrics_data_sent = [] try: new_set = 'aet.flux.metrics_data_sent.%s' % str( self.current_pid) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine current_pid for aet.flux.metrics_data_sent Redis set name' ) new_set = 'aet.flux.metrics_data_sent' try: self.redis_conn.rename('flux.metrics_data_sent', new_set) logger.info( 'worker :: renamed flux.metrics_data_sent Redis set to %s' % new_set) # @added 20201128 - Feature #3820: HORIZON_SHARDS # With metrics that come in at a frequency of less # than 60 seconds, it is possible that this key will # not exist as flux has not been sent metric data # so this operation will error with no such key except Exception as e: traceback_str = traceback.format_exc() if 'no such key' in str(e): logger.warn( 'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - 
flux has not recieved data in 60 seconds - %s' % (new_set, e)) else: logger.error(traceback_str) logger.error( 'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set' % new_set) try: self.redis_conn.expire(new_set, 600) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to set 600 seconds TTL on %s Redis set' % new_set) # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE if FLUX_PERSIST_QUEUE: redis_set_size = 0 try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s before removal of %s items' % (str(redis_set_size), str(len(remove_from_flux_queue_redis_set)))) if remove_from_flux_queue_redis_set: try: self.redis_conn.srem( 'flux.queue', *set(remove_from_flux_queue_redis_set)) remove_from_flux_queue_redis_set = [] except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to remove multiple items from flux.queue Redis set' ) try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s after the removal of items' % (str(redis_set_size))) remove_from_flux_queue_redis_set = [] # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Even if flux.last Redis keys are disabled in flux they are used in # Vista vista_metrics = [] if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED: try: vista_metrics = list( self.redis_conn_decoded.sscan_iter( 'vista.metrics', match='*')) except: vista_metrics = [] except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = False # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE if metric_data and FLUX_PERSIST_QUEUE: try: # Do not remove each individual metrics from the flux.queue # Redis set, add to a list that is removed in one srem *set # operation each 60 seconds. This is a more perfomant # method and requires a single blocking call for a batch of # metrics, rather than a blocking call for every metric. 
                    # self.redis_conn.srem('flux.queue', str(metric_data))
                    remove_from_flux_queue_redis_set.append(str(metric_data))
                except:
                    pass

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info('worker :: debug :: queue item found - %s' % str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data))
                    continue

                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Only check the flux.last key if this is not backfill and
                # FLUX_CHECK_LAST_TIMESTAMP is enabled or the metric is a Vista metric
                cache_key = None
                # if FLUX_CHECK_LAST_TIMESTAMP:
                cache_key = 'flux.last.%s' % metric
                check_flux_last_key = False
                if not backfill and FLUX_CHECK_LAST_TIMESTAMP:
                    check_flux_last_key = True
                if VISTA_ENABLED:
                    if metric in vista_metrics:
                        check_flux_last_key = True

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check the flux.last key if this is not backfill
                    # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                    # Use the check_flux_last_key value determined above
                    # if not backfill:
                    if check_flux_last_key:
                        # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                        # Set cache_key outside the conditional block
                        # cache_key = 'flux.last.%s' % metric
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            # Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(redis_last_metric_data)
                                last_metric_timestamp = int(last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info('worker :: debug :: last_metric_timestamp for %s from %s is %s' % (metric, str(cache_key), str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error('error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info('worker :: debug :: not valid data - the queue data timestamp %s is <= the last_metric_timestamp %s for %s' % (str(timestamp), str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        if send_to_reciever == 'line':
                            try:
                                graphyte.send(metric, value, timestamp)
                                submittedToGraphite = True
                                # @modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info('worker :: sent %s, %s, %s to Graphite - via graphyte' % (str(metric), str(value), str(timestamp)))
                                metrics_sent_to_graphite += 1
                                # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append([metric, value, timestamp])
                            except:
                                logger.error(traceback.format_exc())
                                logger.error('error :: worker :: failed to send metric data to Graphite for %s' % str(metric))
                                metric = None
                        if send_to_reciever == 'pickle':
                            # @modified 20201212 - Task #3864: flux - try except everything
                            try:
                                tuple_data = (metric, (int(timestamp), float(value)))
                                pickle_data.append(tuple_data)
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info('worker :: sending %s, %s, %s to Graphite - via pickle' % (str(metric), str(value), str(timestamp)))
                                submittedToGraphite = True
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append([metric, value, timestamp])
                            except Exception as e:
                                logger.error('error :: worker :: failed to append to pickle_data - %s' % str(e))

                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                            # Use the check_flux_last_key value determined above
                            # if not backfill:
                            if check_flux_last_key:
                                metric_data = [timestamp, value]
                                # @modified 20201207 - Task #3864: flux - try except everything
                                try:
                                    self.redis_conn.set(cache_key, str(metric_data))
                                except Exception as e:
                                    logger.error('error :: worker :: failed to set check_flux_last_key Redis key - %s' % str(e))
                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20201120 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                                # Feature #3400: Identify air gaps in the metric data
                                # Only execute if IDENTIFY_AIRGAPS is enabled
                                if IDENTIFY_AIRGAPS:
                                    # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                    # Add a flux.filled key to Redis with an expiry
                                    # set to FULL_DURATION so that Analyzer knows to
                                    # sort and deduplicate the Redis time series
                                    # data as carbon-relay will send it to Horizon
                                    # and the datapoints will be out of order in the
                                    # Redis key
                                    try:
                                        flux_filled_key = 'flux.filled.%s' % str(metric)
                                        self.redis_conn.setex(flux_filled_key, settings.FULL_DURATION, int(time()))
                                        logger.info('worker :: set Redis key %s' % (str(flux_filled_key)))
                                    except Exception as e:
                                        logger.error('error :: failed to set Redis flux.filled key: %s' % e)
                    else:
                        # @modified 20201016 - Feature #3788: snab_flux_load_test
                        if FLUX_VERBOSE_LOGGING:
                            logger.info('worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite' % (str(metric), str(value), str(timestamp), str(timestamp)))
                else:
                    logger.info('worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s' % (str(settings.FLUX_SEND_TO_CARBON), str(metric), str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    # @modified 20201016 - Feature #3788: snab_flux_load_test
                    if FLUX_VERBOSE_LOGGING:
                        logger.info('worker :: sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp)))
                    # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                    metrics_sent.append(metric)

            submit_pickle_data = False
            if pickle_data:
                number_of_datapoints = len(pickle_data)
                if number_of_datapoints >= 1000:
                    submit_pickle_data = True
                else:
                    try:
                        metric_data_queue_size = self.q.qsize()
                    except:
                        metric_data_queue_size = 0
                    if metric_data_queue_size == 0:
                        submit_pickle_data = True
            if submit_pickle_data:
                # @modified 20201207 - Task #3864: flux - try except everything
                try:
                    pickle_data_submitted = submit_pickle_data_to_graphite(pickle_data)
                except Exception as e:
                    logger.error('error :: worker :: submit_pickle_data_to_graphite failed - %s' % str(e))
                    pickle_data_submitted = False
                if pickle_data_submitted:
                    pickle_data = []

            time_now = int(time())

            # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
            # Send 0 for any metric in the flux.zero_fill_metrics Redis set that
            # has not submitted data in the last 60 seconds. The flux.last
            # Redis key is not updated for these sent 0 values so if the source
            # sends data for a timestamp in the period later (due to a lag, etc),
            # it will be valid and sent to Graphite.
            if FLUX_ZERO_FILL_NAMESPACES:
                if not last_zero_fill_to_graphite:
                    last_zero_fill_to_graphite = time_now - 60
                if (time_now - last_sent_to_graphite) >= 60:
                    # Initialise to avoid a NameError if the Redis query fails
                    flux_zero_fill_metrics = []
                    try:
                        flux_zero_fill_metrics = list(self.redis_conn_decoded.smembers('flux.zero_fill_metrics'))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error('error :: failed to generate a list from flux.zero_fill_metrics Redis set')
                    for flux_zero_fill_metric in flux_zero_fill_metrics:
                        if flux_zero_fill_metric not in metrics_sent:
                            try:
                                graphyte.send(flux_zero_fill_metric, 0.0, time_now)
                                # @modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info('worker :: zero fill - sent %s, %s, %s to Graphite' % (str(flux_zero_fill_metric), str(0.0), str(time_now)))
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(flux_zero_fill_metric)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error('error :: worker :: zero fill - failed to send metric data to Graphite for %s' % str(flux_zero_fill_metric))
                                metric = None
                    last_zero_fill_to_graphite = time_now
                    metrics_sent = []

            if (time_now - last_sent_to_graphite) >= 60:
                if pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(pickle_data)
                    except Exception as e:
                        logger.error('error :: worker :: submit_pickle_data_to_graphite failed last_sent_to_graphite >= 60 - %s' % str(e))
                        pickle_data_submitted = False
                    if pickle_data_submitted:
                        pickle_data = []
                logger.info('worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite)))
                metric_data_queue_size = 0
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info('worker :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine size of queue flux.httpMetricDataQueue')
                skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(skyline_app, skyline_metric, metric_data_queue_size)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metric_data_queue_size)))
                # @added 20201019 - Feature #3790: flux - pickle to Graphite
                if metric_data_queue_size > 10:
                    send_to_reciever = 'pickle'
                else:
                    send_to_reciever = 'line'

                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                # Debug Redis set
                metrics_data_sent_strs = []
                for item in metrics_data_sent:
                    metrics_data_sent_strs.append(str(item))
                if metrics_data_sent_strs:
                    try:
                        self.redis_conn.sadd('flux.metrics_data_sent', *set(metrics_data_sent_strs))
                        logger.info('worker :: added %s items to the flux.metrics_data_sent Redis set' % str(len(metrics_data_sent)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to add items to the flux.metrics_data_sent Redis set')
                    metrics_data_sent = []
                try:
                    new_set = 'aet.flux.metrics_data_sent.%s' % str(self.current_pid)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to determine current_pid for the aet.flux.metrics_data_sent Redis set name')
                    new_set = 'aet.flux.metrics_data_sent'
                try:
                    self.redis_conn.rename('flux.metrics_data_sent', new_set)
                    logger.info('worker :: renamed flux.metrics_data_sent Redis set to %s' % new_set)
                # @modified 20201128 - Feature #3820: HORIZON_SHARDS
                # With metrics that come in at a frequency of less
                # than 60 seconds, it is possible that this key will
                # not exist as flux has not been sent metric data
                # so this operation will error with no such key
                except Exception as e:
                    traceback_str = traceback.format_exc()
                    if 'no such key' in str(e):
                        logger.warn('warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s' % (new_set, e))
                    else:
                        logger.error(traceback_str)
                        logger.error('error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set' % new_set)
                try:
                    self.redis_conn.expire(new_set, 600)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to set 600 seconds TTL on %s Redis set' % new_set)

                # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
                if FLUX_PERSIST_QUEUE:
                    redis_set_size = 0
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed to determine size of flux.queue Redis set')
                    logger.info('worker - flux.queue Redis set size %s before removal of %s items' % (str(redis_set_size), str(len(remove_from_flux_queue_redis_set))))
                    if remove_from_flux_queue_redis_set:
                        try:
                            self.redis_conn.srem('flux.queue', *set(remove_from_flux_queue_redis_set))
                            remove_from_flux_queue_redis_set = []
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: worker :: failed to remove multiple items from flux.queue Redis set')
                        try:
                            redis_set_size = self.redis_conn.scard('flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: worker :: failed to determine size of flux.queue Redis set')
                        logger.info('worker - flux.queue Redis set size of %s after the removal of items' % (str(redis_set_size)))
                        remove_from_flux_queue_redis_set = []

                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Even if flux.last Redis keys are disabled in flux they are used in
                # Vista
                vista_metrics = []
                if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
                    try:
                        vista_metrics = list(self.redis_conn_decoded.sscan_iter('vista.metrics', match='*'))
                    except:
                        vista_metrics = []
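# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the worker code above: the flux.last.<metric>
# de-duplication the worker applies before sending to carbon can be reasoned
# about in isolation. is_new_datapoint() is a hypothetical helper written under
# the assumption of a plain redis client and the same str([timestamp, value])
# format the worker stores with self.redis_conn.set(cache_key, str(metric_data)).
# ---------------------------------------------------------------------------
from ast import literal_eval


def is_new_datapoint(redis_conn, metric, timestamp):
    """Return True if timestamp is newer than the last submitted data point."""
    cache_key = 'flux.last.%s' % metric
    last_metric_data = None
    try:
        raw = redis_conn.get(cache_key)
        if raw:
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8')
            # The key holds a string like '[1609459200, 1.0]'
            last_metric_data = literal_eval(raw)
    except Exception:
        last_metric_data = None
    if not last_metric_data:
        # No flux.last key yet, so the data point cannot be a duplicate
        return True
    last_metric_timestamp = int(last_metric_data[0])
    return int(timestamp) > last_metric_timestamp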
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)
                pass
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1
        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
        # If Luminosity is not enabled, do nothing
        luminosity_enabled = True
        try:
            luminosity_enabled = settings.LUMINOSITY_ENABLED
            logger.info('LUMINOSITY_ENABLED is set to %s' % str(luminosity_enabled))
        except:
            logger.info('warning :: LUMINOSITY_ENABLED is not declared in settings.py, defaults to True')

        # @added 20190417 - Feature #2950: Report defaulted settings to log
        # Added all the globally declared settings to enable reporting in the
        # log the state of each setting.
        try:
            ENABLE_LUMINOSITY_DEBUG = settings.ENABLE_LUMINOSITY_DEBUG
            logger.info('ENABLE_LUMINOSITY_DEBUG is set from settings.py to %s' % str(ENABLE_LUMINOSITY_DEBUG))
        except:
            logger.info('warning :: ENABLE_LUMINOSITY_DEBUG is not declared in settings.py, defaults to False')
            ENABLE_LUMINOSITY_DEBUG = False
        try:
            SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME
            if SERVER_METRIC_PATH == '.':
                SERVER_METRIC_PATH = ''
            logger.info('SERVER_METRIC_PATH is set from settings.py to %s' % str(SERVER_METRIC_PATH))
        except:
            SERVER_METRIC_PATH = ''
            logger.info('warning :: SERVER_METRIC_PATH is not declared in settings.py, defaults to \'\'')
        try:
            LUMINOSITY_PROCESSES = settings.LUMINOSITY_PROCESSES
            logger.info('LUMINOSITY_PROCESSES is set from settings.py to %s' % str(LUMINOSITY_PROCESSES))
        except:
            # @modified 20180110 - Task #2266: Evaluate luminol for the luminosity branch
            # It is fast and lightweight
            # luminosity_processes = 2
            LUMINOSITY_PROCESSES = 1
            logger.info('warning :: cannot determine LUMINOSITY_PROCESSES from settings.py, defaults to %s' % str(LUMINOSITY_PROCESSES))

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_LUMINOSITY_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error('error :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH))
                sleep(30)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                # Branch #3262: py3
                # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
                # types need to be decoded as utf-8 to str
                # if settings.REDIS_PASSWORD:
                #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                # else:
                #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
                # Branch #3262: py3
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' % skyline_app)

            # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
            # If Luminosity is not enabled, do nothing
            if not luminosity_enabled:
                logger.info('luminosity is not enabled, LUMINOSITY_ENABLED is set to %s, sleeping for 20 seconds' % str(settings.LUMINOSITY_ENABLED))
                sleep(20)
                continue

            """
            Determine if any new anomalies have been added
            """
            while True:
                process_anomaly_id = None
                last_processed_anomaly_id = None
                memcache_last_processed_anomaly_id_data = False
                # Check memcached before MySQL
                memcache_key = '%s.last.processed.anomaly.id' % skyline_app
                if settings.MEMCACHE_ENABLED:
                    try:
                        # @modified 20191029 - Task #3304: py3 - handle pymemcache bytes not str
                        # last_processed_anomaly_id = self.memcache_client.get(memcache_key)
                        if python_version == 2:
                            last_processed_anomaly_id = self.memcache_client.get(memcache_key)
                        else:
                            last_processed_anomaly_id = self.memcache_client.get(memcache_key).decode('utf-8')
                        # if memcache does not have the key the response to the
                        # client is None, it does not raise an exception
                    except:
                        # @modified 20200507 - stop reporting this as an error
                        # it can be expected to happen from time to time
                        # logger.error('error :: failed to get %s from memcache' % memcache_key)
                        logger.info('failed to get %s from memcache, will query DB' % memcache_key)
                    try:
                        self.memcache_client.close()
                    except:
                        logger.error('error :: failed to close memcache_client')
                if last_processed_anomaly_id:
                    logger.info('last_processed_anomaly_id found in memcache - %s' % str(last_processed_anomaly_id))
                    memcache_last_processed_anomaly_id_data = True
                else:
                    # @modified 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                    # Branch #3002: docker
                    # Log appropriate to whether memcache is enabled or not
                    if settings.MEMCACHE_ENABLED:
                        logger.info('last_processed_anomaly_id key was NOT found in memcache - %s' % str(last_processed_anomaly_id))
                    else:
                        logger.info('memcache not enabled, not checking for the last_processed_anomaly_id key')

                if not last_processed_anomaly_id:
                    query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1'
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        try:
                            last_processed_anomaly_id = int(results[0][0])
                            logger.info('last_processed_anomaly_id found from DB - %s' % str(last_processed_anomaly_id))
                        except:
                            logger.error(traceback.format_exc())
                    if last_processed_anomaly_id and settings.MEMCACHE_ENABLED:
                        if not memcache_last_processed_anomaly_id_data:
                            logger.info('populating memcache with DB result - %s' % str(last_processed_anomaly_id))
                            try:
                                self.memcache_client.set(memcache_key, int(last_processed_anomaly_id))
                                logger.info('populated memcache key %s with %s' % (memcache_key, str(last_processed_anomaly_id)))
                            except:
                                logger.error('error :: failed to set the memcache key - %s - %s' % (memcache_key, str(last_processed_anomaly_id)))
                            try:
                                self.memcache_client.close()
                            except:
                                logger.error('error :: failed to close memcache_client')

                if not last_processed_anomaly_id:
                    # Check MySQL
                    now = int(time())
                    after = now - 600
                    query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str(after)  # nosec
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        process_anomaly_id = int(results[0][0])
                        logger.info('found new anomaly id to process from the DB - %s' % str(process_anomaly_id))
                        # Handle the first one
                        last_processed_anomaly_id = process_anomaly_id - 1
                    else:
                        logger.info('no new anomalies in the anomalies table')

                # @added 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                # Branch #3002: docker
                # When Skyline is first installed, if luminosity is enabled it
                # reports errors as there are no anomaly ids
                if str(last_processed_anomaly_id) == 'None':
                    last_processed_anomaly_id = 0

                query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str(last_processed_anomaly_id)  # nosec
                results = None
                try:
                    results = mysql_select(skyline_app, query)
                except:
                    logger.error('error :: MySQL query failed - %s' % query)
                if results:
                    try:
                        process_anomaly_id = int(results[0][0])
                        logger.info('found the next new anomaly id to process from the DB - %s' % str(process_anomaly_id))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: from query - %s' % query)
                else:
                    logger.info('no new anomalies in the anomalies table')

                if process_anomaly_id and last_processed_anomaly_id:
                    if isinstance(last_processed_anomaly_id, int):
                        if isinstance(process_anomaly_id, int):
                            if last_processed_anomaly_id == process_anomaly_id:
                                logger.info('anomaly id already processed - %s' % str(process_anomaly_id))
                                process_anomaly_id = None

                if not process_anomaly_id:
                    logger.info('sleeping 20 seconds, no anomalies to correlate - last processed anomaly id - %s' % str(last_processed_anomaly_id))
                    sleep(20)
                    up_now = time()
                    # Report app up
                    try:
                        self.redis_conn.setex(skyline_app, 120, up_now)
                        logger.info('updated Redis key for %s up' % skyline_app)
                    except:
                        logger.error('error :: failed to update Redis key for %s up' % skyline_app)

                cache_key = '%s.sent_graphite_metrics' % skyline_app
                redis_sent_graphite_metrics = False
                try:
                    redis_sent_graphite_metrics = self.redis_conn.get(cache_key)
                except Exception as e:
                    logger.error('error :: could not query Redis for key %s: %s' % (cache_key, e))

                # Flush metrics to Graphite
                if not redis_sent_graphite_metrics:
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # correlations = str(len(self.correlations))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        # Branch #3262: py3
                        # correlations = str(len(list(self.redis_conn.smembers('luminosity.correlations'))))
                        correlations = str(len(list(self.redis_conn_decoded.smembers('luminosity.correlations'))))
                    except:
                        correlations = '0'
                    logger.info('correlations :: %s' % correlations)
                    send_metric_name = '%s.correlations' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, correlations)

                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    try:
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        # Branch #3262: py3
                        # runtimes = list(self.redis_conn.smembers('luminosity.runtimes'))
                        # The decoded set members are strings, cast to float so they can be summed
                        runtimes = [float(item) for item in self.redis_conn_decoded.smembers('luminosity.runtimes')]
                    except:
                        runtimes = []

                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    # Branch #2270: luminosity
                    # runtime metric to monitor the time it takes to process
                    # correlations
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # if len(self.runtimes) > 1:
                        #     avg_runtime = sum(self.runtimes) / len(self.runtimes)
                        # else:
                        #     avg_runtime = sum(self.runtimes)
                        if len(runtimes) > 1:
                            avg_runtime = sum(runtimes) / len(runtimes)
                        else:
                            avg_runtime = sum(runtimes)
                    except:
                        avg_runtime = '0'
                    logger.info('avg_runtime :: %s' % str(avg_runtime))
                    send_metric_name = '%s.avg_runtime' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, str(avg_runtime))

                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # metrics_checked_for_correlation = str(sum(self.metrics_checked_for_correlation))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        # Branch #3262: py3
                        # metrics_checked_for_correlation = str(len(list(self.redis_conn.smembers('luminosity.metrics_checked_for_correlation'))))
                        metrics_checked_for_correlation = str(len(list(self.redis_conn_decoded.smembers('luminosity.metrics_checked_for_correlation'))))
                    except:
                        metrics_checked_for_correlation = '0'
                    logger.info('metrics_checked_for_correlation :: %s' % metrics_checked_for_correlation)
                    send_metric_name = '%s.metrics_checked_for_correlation' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, metrics_checked_for_correlation)

                    sent_graphite_metrics_now = int(time())
                    try:
                        self.redis_conn.setex(cache_key, 59, sent_graphite_metrics_now)
                        logger.info('updated Redis key - %s' % cache_key)
                    except:
                        logger.error('error :: failed to update Redis key - %s' % cache_key)

                    # Reset lists
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.correlations[:] = []
                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.runtimes[:] = []
                    # self.metrics_checked_for_correlation[:] = []
                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # Use Redis sets instead of Manager().list()
                    delete_redis_sets = [
                        'luminosity.correlations',
                        'luminosity.runtimes',
                        'luminosity.metrics_checked_for_correlation'
                    ]
                    for i_redis_set in delete_redis_sets:
                        redis_set_to_delete = i_redis_set
                        try:
                            self.redis_conn.delete(redis_set_to_delete)
                            logger.info('deleted Redis set - %s' % redis_set_to_delete)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: failed to delete Redis set - %s' % redis_set_to_delete)

                # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                # Feature #2464: luminosity_remote_data
                # Added the ability to add a Redis key to override the memcached
                # key luminosity.last.processed.anomaly.id so it does not have
                # to be changed via telnet to memcache.
                if not process_anomaly_id or not redis_sent_graphite_metrics:
                    cache_key = '%s.last.processed.anomaly.id' % skyline_app
                    redis_last_processed_anomaly_id_redis_key = False
                    try:
                        redis_last_processed_anomaly_id_redis_key = self.redis_conn.get(cache_key)
                    except Exception as e:
                        logger.error('error :: could not query Redis for key %s: %s' % (cache_key, e))
                    if redis_last_processed_anomaly_id_redis_key:
                        logger.info('found Redis %s key to override the memcache key, setting process_anomaly_id to %s' % (cache_key, str(redis_last_processed_anomaly_id_redis_key)))
                        try:
                            process_anomaly_id = int(redis_last_processed_anomaly_id_redis_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: failed to set process_anomaly_id from the Redis override key value')
                        # And remove the Redis override key as it is only meant
                        # to override once to allow for a replay for debug
                        # purposes only.
                        try:
                            self.redis_conn.setex(cache_key, 1, int(redis_last_processed_anomaly_id_redis_key))
                            logger.info('updated Redis key - %s' % cache_key)
                        except:
                            logger.error('error :: failed to update Redis key - %s with a 1 second expiry to delete it' % cache_key)

                if process_anomaly_id:
                    break

            # Spawn process
            logger.info('spawning processes to correlate anomaly id %s' % str(process_anomaly_id))
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
            for i in range(1, LUMINOSITY_PROCESSES + 1):
                try:
                    p = Process(target=self.spin_process, args=(i, process_anomaly_id))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(LUMINOSITY_PROCESSES)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to start spin_process')
                    continue

            # Self monitor processes and terminate if any spin_process has run
            # for too long
            p_starts = time()
            while time() - p_starts <= 60:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s spin_process completed in %.2f seconds' % (str(LUMINOSITY_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we did not 'break' above.
                logger.info('timed out, killing all spin_process processes')
                for p in pids:
                    try:
                        p.terminate()
                        # p.join()
                        logger.info('killed spin_process process')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to kill spin_process process')

            for p in pids:
                if p.is_alive():
                    logger.info('stopping spin_process - %s' % (str(p.is_alive())))
                    p.join()

            process_runtime = time() - now
            if process_runtime < 10:
                sleep_for = (10 - process_runtime)
                logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except:
                    logger.error('error :: failed to del sleep_for')
            try:
                del process_runtime
            except:
                logger.error('error :: failed to del process_runtime')
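# ---------------------------------------------------------------------------
# Illustrative sketch, not Luminosity itself: the spawn / monitor / terminate
# pattern used in run() above, reduced to a minimal standalone example. The
# names do_work, run_workers, worker_count and timeout are hypothetical and
# only stand in for spin_process and LUMINOSITY_PROCESSES.
# ---------------------------------------------------------------------------
from multiprocessing import Process
from time import sleep, time


def do_work(worker_number):
    # Placeholder for spin_process style work
    sleep(worker_number)


def run_workers(worker_count=2, timeout=60):
    pids = []
    for i in range(1, worker_count + 1):
        p = Process(target=do_work, args=(i,))
        pids.append(p)
        p.start()
    p_starts = time()
    while time() - p_starts <= timeout:
        if any(p.is_alive() for p in pids):
            # Just to avoid hogging the CPU while polling
            sleep(0.1)
        else:
            break
    else:
        # Only reached if the while loop did not break - terminate stragglers
        for p in pids:
            if p.is_alive():
                p.terminate()
    for p in pids:
        p.join()


if __name__ == '__main__':
    run_workers()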