def spawn_alerter_process(self, alert, metric, context): """ Spawn a process to trigger an alert. This is used by the smtp alerter so that matplotlib objects are cleared down and the alerter cannot create a memory leak in this manner, as plt.savefig keeps the object in memory until the process terminates. Seeing as data is surfaced and processed in the alert_smtp context, multiprocessing the alert creation and handling prevents any memory leaks in the parent. Added 20160814 relating to: * Bug #1558: Memory leak in Analyzer * Issue #21 Memory leak in Analyzer see https://github.com/earthgecko/skyline/issues/21 Parameters as per :py:func:`skyline.analyzer.alerters.trigger_alert <analyzer.alerters.trigger_alert>` """ trigger_alert(alert, metric, context)
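# A minimal, self-contained sketch (hypothetical names, not taken from this
# file) of the pattern the docstring above describes: run the alerter in a
# short-lived child process so any matplotlib state is reclaimed when the
# child exits, rather than accumulating in the long-running parent.
from multiprocessing import Process

def alert_in_child(alert, metric, context):
    # In the real code path this would call trigger_alert(alert, metric, context)
    print('alerting on %s via %s in context %s' % (metric[1], alert[1], context))

if __name__ == '__main__':
    example_alert = ('skyline', 'smtp', 1800)              # assumed (namespace, alerter, expiration)
    example_metric = (1.0, 'skyline.horizon.queue_size')   # assumed (datapoint, metric name)
    p = Process(target=alert_in_child, args=(example_alert, example_metric, 'Analyzer'))
    p.start()
    p.join()  # the parent waits; the child's memory is released on exit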
parser = OptionParser() parser.add_option("-t", "--trigger", dest="trigger", default=False, help="Actually trigger the appropriate alerts (default is False)") parser.add_option("-m", "--metric", dest="metric", default='skyline.horizon.queue_size', help="Pass the metric to test (default is skyline.horizon.queue_size)") (options, args) = parser.parse_args() try: alerts_enabled = settings.ENABLE_ALERTS alerts = settings.ALERTS except: print "Exception: Check your settings file for the existence of ENABLE_ALERTS and ALERTS" sys.exit() print 'Verifying alerts for: "' + options.metric + '"' # Send alerts if alerts_enabled: for alert in alerts: if alert[0] in options.metric: print ' Testing against "' + alert[0] + '" to send via ' + alert[1] + "...triggered" if options.trigger: trigger_alert(alert, options.metric) else: print ' Testing against "' + alert[0] + '" to send via ' + alert[1] + "..." else: print 'Alerts are disabled'
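# Illustrative only: a hedged example of the settings.ALERTS tuples the test
# script above iterates, inferred from how alert[0] (namespace substring),
# alert[1] (alerter name) and, elsewhere, alert[2] (expiration seconds) are
# used. The actual values in any settings.py will differ.
EXAMPLE_ALERTS = (
    ('skyline', 'smtp', 1800),
    ('horizon', 'pagerduty', 3600),
)

for example_alert in EXAMPLE_ALERTS:
    if example_alert[0] in 'skyline.horizon.queue_size':
        print('would alert via %s and suppress re-alerts for %s seconds'
              % (example_alert[1], example_alert[2]))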
def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( 'cloudbrain can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list( self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) pretty_unique_metrics = [] # metric without the FULL_NAMESPACE for metric in unique_metrics: pretty_unique_metrics.append( metric.replace(settings.FULL_NAMESPACE, "")) # Write unique metrics to static webapp directory filename = path.abspath( path.join(path.dirname(__file__), '..', settings.UNIQUE_METRICS)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function fh.write(json.dumps(pretty_unique_metrics)) if len(unique_metrics) == 0: logger.info( 'no metrics in redis. try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.ANALYZER_PROCESSES + 1): if i > len(unique_metrics): logger.info( 'WARNING: cloudbrain is set for more cores than needed.' ) break p = Process(target=self.spin_process, args=(i, unique_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.ENABLE_ALERTS: for alert in settings.ALERTS: for metric in self.anomalous_metrics: if alert[0] in metric[1]: cache_key = 'last_alert.%s.%s' % (alert[1], metric[1]) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: self.redis_conn.setex( cache_key, alert[2], packb(metric[0])) trigger_alert(alert, metric) except Exception as e: logger.error("couldn't send alert: %s" % e) ''' # Write anomalous_metrics to static webapp directory filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) ''' # read unique metrics filename = path.abspath( path.join(path.dirname(__file__), '..', settings.UNIQUE_METRICS)) with open(filename, 'r') as f: json_data = f.read() unique_metrics = json.loads(json_data) anomalous_metrics_names = [ metric[1] for metric in self.anomalous_metrics ] for metric in unique_metrics: if not metric in anomalous_metrics_names: self.anomalous_metrics.append([0, str(metric)]) # Write anomalous_metrics to static webapp directory filename = path.abspath( path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % repr(anomalous_metrics)) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(unique_metrics)) logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) 
logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite self.send_graphite_metric('cloudbrain.analyzer.run_time', '%.2f' % (time() - now)) self.send_graphite_metric( 'cloudbrain.analyzer.total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values()))) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) self.send_graphite_metric('cloudbrain.analyzer.duration', '%.2f' % time_human) self.send_graphite_metric('cloudbrain.analyzer.projected', '%.2f' % projected) # Reset counters self.anomalous_metrics[:] = [] # Sleep if it went too fast if time() - now < 5: logger.info('sleeping due to low run time...') sleep(10)
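# A worked example (assumed numbers) of the canary calculation above:
# time_human is the span of the canary timeseries in hours, and projected
# scales this run's duration to a notional 24 hours of data.
first_timestamp, last_timestamp = 1470000000, 1470000000 + 86400    # 24 hours of data
run_duration = 12.0                                                  # seconds this run took
time_human = (last_timestamp - first_timestamp) / 3600.0             # = 24.0 hours
projected = 24 * run_duration / time_human                           # = 12.0 seconds
print('canary duration %.2f hours, projected %.2f seconds' % (time_human, projected))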
syslog_enabled = settings.SYSLOG_ENABLED except: syslog_enabled = False print('Verifying alerts for: "' + options.metric + '"') # Send alerts context = 'Analyzer' if alerts_enabled: print('Testing Analyzer alerting on ' + options.metric) for alert in alerts: if alert[0] == options.metric: if options.trigger in alert[1]: print(' Testing Analyzer alerting - against "' + alert[0] + '" to send via ' + alert[1] + "...triggered") metric = (0, options.metric, 12345) trigger_alert(alert, metric, context) if syslog_enabled: print(' Testing Analyzer alerting - against "' + alert[0] + '" to send via syslog ' + "...triggered") alert = (alert[0], 'syslog') trigger_alert(alert, metric, context) else: print('Alerts are disabled') # Mirage alerts try: mirage_enabled = settings.ENABLE_MIRAGE mirage_alerts_enabled = settings.MIRAGE_ENABLE_ALERTS except: mirage_alerts_enabled = False if mirage_alerts_enabled:
def run(self): """ - Called when the process initializes. - Determine if Redis is up and discover the number of `unique metrics`. - Divide the `unique_metrics` between the number of `ANALYZER_PROCESSES` and assign each process a set of metrics to analyse for anomalies. - Wait for the processes to finish. - Determine whether any anomalous metrics require: - Alerting on (and set `EXPIRATION_TIME` key in Redis for alert). - Feed to another module e.g. mirage. - Alert to syslog. - Populate the webapp json with the anomalous_metrics details. - Log the details about the run to the skyline analyzer log. - Send skyline.analyzer metrics to `GRAPHITE_HOST` """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os.remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) if not os.path.exists(settings.SKYLINE_TMP_DIR): if python_version == 2: mode_arg = int('0755', 8) if python_version == 3: mode_arg = 0o755 os.makedirs(settings.SKYLINE_TMP_DIR, mode_arg) # Initiate the algorithm timings if Analyzer is configured to send the # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.' algorithms_to_time = [] if send_algorithm_run_metrics: algorithms_to_time = settings.ALGORITHMS while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Report app up self.redis_conn.setex(skyline_app, 120, now) # Discover unique metrics unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info('no metrics in redis.
try adding some - see README') sleep(10) continue # Using count files rather that multiprocessing.Value to enable metrics for # metrics for algorithm run times, etc for algorithm in algorithms_to_time: algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count' algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings' # with open(algorithm_count_file, 'a') as f: with open(algorithm_count_file, 'w') as f: pass with open(algorithm_timings_file, 'w') as f: pass # Remove any existing algorithm.error files from any previous runs # that did not cleanup for any reason pattern = '%s.*.algorithm.error' % skyline_app try: for f in os.listdir(settings.SKYLINE_TMP_DIR): if re.search(pattern, f): try: os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f)) logger.info('cleaning up old error file - %s' % (str(f))) except OSError: pass except: logger.error('failed to cleanup algorithm.error files ' + traceback.format_exc()) # Spawn processes pids = [] spawned_pids = [] pid_count = 0 for i in range(1, settings.ANALYZER_PROCESSES + 1): if i > len(unique_metrics): logger.info('WARNING: skyline is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, unique_metrics)) pids.append(p) pid_count += 1 logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES))) p.start() spawned_pids.append(p.pid) # Send wait signal to zombie processes # for p in pids: # p.join() # Self monitor processes and terminate if any spin_process has run # for longer than 180 seconds - 20160512 @earthgecko p_starts = time() while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run)) break else: # We only enter this if we didn't 'break' above. 
logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app)) for p in pids: p.terminate() p.join() # Log the last reported error by any algorithms that errored in the # spawned processes from algorithms.py for completed_pid in spawned_pids: logger.info('spin_process with pid %s completed' % (str(completed_pid))) for algorithm in settings.ALGORITHMS: algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % ( settings.SKYLINE_TMP_DIR, skyline_app, str(completed_pid), algorithm) if os.path.isfile(algorithm_error_file): logger.info( 'error - spin_process with pid %s has reported an error with the %s algorithm' % ( str(completed_pid), algorithm)) try: with open(algorithm_error_file, 'r') as f: error_string = f.read() logger.error('%s' % str(error_string)) except: logger.error('failed to read %s error file' % algorithm) try: os.remove(algorithm_error_file) except OSError: pass # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.ENABLE_ALERTS: for alert in settings.ALERTS: for metric in self.anomalous_metrics: ALERT_MATCH_PATTERN = alert[0] METRIC_PATTERN = metric[1] alert_match_pattern = re.compile(ALERT_MATCH_PATTERN) pattern_match = alert_match_pattern.match(METRIC_PATTERN) if pattern_match: cache_key = 'last_alert.%s.%s' % (alert[1], metric[1]) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: try: SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3] logger.info('mirage check :: %s' % (metric[1])) # Write anomalous metric to test at second # order resolution by mirage to the check # file metric_timestamp = int(time()) anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1]) with open(anomaly_check_file, 'w') as fh: # metric_name, anomalous datapoint, hours to resolve, timestamp fh.write('metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (metric[1], metric[0], alert[3], metric_timestamp)) if python_version == 2: mode_arg = int('0644', 8) if python_version == 3: mode_arg = 0o644 os.chmod(anomaly_check_file, mode_arg) logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3])) # Add to the mirage_metrics list base_name = METRIC_PATTERN.replace(settings.FULL_NAMESPACE, '', 1) metric = [metric[0], base_name] self.mirage_metrics.append(metric) # Alert for analyzer if enabled if settings.ENABLE_FULL_DURATION_ALERTS: self.redis_conn.setex(cache_key, alert[2], packb(metric[0])) trigger_alert(alert, metric) except: self.redis_conn.setex(cache_key, alert[2], packb(metric[0])) trigger_alert(alert, metric) except Exception as e: logger.error('error :: could not send alert: %s' % e) # Push to crucible # if len(self.crucible_anomalous_metrics) > 0: # logger.info('to do - push to crucible') # Write anomalous_metrics to static webapp directory if len(self.anomalous_metrics) > 0: filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) 
fh.write('handle_data(%s)' % anomalous_metrics) # Using count files rather that multiprocessing.Value to enable metrics for # metrics for algorithm run times, etc for algorithm in algorithms_to_time: algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count' algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings' try: algorithm_count_array = [] with open(algorithm_count_file, 'r') as f: for line in f: value_string = line.replace('\n', '') unquoted_value_string = value_string.replace("'", '') float_value = float(unquoted_value_string) algorithm_count_array.append(float_value) except: algorithm_count_array = False if not algorithm_count_array: continue number_of_times_algorithm_run = len(algorithm_count_array) logger.info( 'algorithm run count - %s run %s times' % ( algorithm, str(number_of_times_algorithm_run))) if number_of_times_algorithm_run == 0: continue try: algorithm_timings_array = [] with open(algorithm_timings_file, 'r') as f: for line in f: value_string = line.replace('\n', '') unquoted_value_string = value_string.replace("'", '') float_value = float(unquoted_value_string) algorithm_timings_array.append(float_value) except: algorithm_timings_array = False if not algorithm_timings_array: continue number_of_algorithm_timings = len(algorithm_timings_array) logger.info( 'algorithm timings count - %s has %s timings' % ( algorithm, str(number_of_algorithm_timings))) if number_of_algorithm_timings == 0: continue try: _sum_of_algorithm_timings = sum(algorithm_timings_array) except: logger.error("sum error: " + traceback.format_exc()) _sum_of_algorithm_timings = round(0.0, 6) logger.error('error - sum_of_algorithm_timings - %s' % (algorithm)) continue sum_of_algorithm_timings = round(_sum_of_algorithm_timings, 6) # logger.info('sum_of_algorithm_timings - %s - %.16f seconds' % (algorithm, sum_of_algorithm_timings)) try: _median_algorithm_timing = determine_median(algorithm_timings_array) except: _median_algorithm_timing = round(0.0, 6) logger.error('error - _median_algorithm_timing - %s' % (algorithm)) continue median_algorithm_timing = round(_median_algorithm_timing, 6) # logger.info('median_algorithm_timing - %s - %.16f seconds' % (algorithm, median_algorithm_timing)) logger.info( 'algorithm timing - %s - total: %.6f - median: %.6f' % ( algorithm, sum_of_algorithm_timings, median_algorithm_timing)) use_namespace = skyline_app_graphite_namespace + '.algorithm_breakdown.' 
+ algorithm send_metric_name = use_namespace + '.timing.times_run' send_graphite_metric(skyline_app, send_metric_name, str(number_of_algorithm_timings)) send_metric_name = use_namespace + '.timing.total_time' send_graphite_metric(skyline_app, send_metric_name, str(sum_of_algorithm_timings)) send_metric_name = use_namespace + '.timing.median_time' send_graphite_metric(skyline_app, send_metric_name, str(median_algorithm_timing)) run_time = time() - now total_metrics = str(len(unique_metrics)) total_analyzed = str(len(unique_metrics) - sum(exceptions.values())) total_anomalies = str(len(self.anomalous_metrics)) # Log progress logger.info('seconds to run :: %.2f' % run_time) logger.info('total metrics :: %s' % total_metrics) logger.info('total analyzed :: %s' % total_analyzed) logger.info('total anomalies :: %s' % total_anomalies) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite graphite_run_time = '%.2f' % run_time send_metric_name = skyline_app_graphite_namespace + '.run_time' send_graphite_metric(skyline_app, send_metric_name, graphite_run_time) send_metric_name = skyline_app_graphite_namespace + '.total_analyzed' send_graphite_metric(skyline_app, send_metric_name, total_analyzed) send_metric_name = skyline_app_graphite_namespace + '.total_anomalies' send_graphite_metric(skyline_app, send_metric_name, total_anomalies) send_metric_name = skyline_app_graphite_namespace + '.total_metrics' send_graphite_metric(skyline_app, send_metric_name, total_metrics) for key, value in exceptions.items(): send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key) send_graphite_metric(skyline_app, send_metric_name, str(value)) for key, value in anomaly_breakdown.items(): send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key) send_graphite_metric(skyline_app, send_metric_name, str(value)) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) send_metric_name = skyline_app_graphite_namespace + '.duration' send_graphite_metric(skyline_app, send_metric_name, str(time_human)) send_metric_name = skyline_app_graphite_namespace + '.projected' send_graphite_metric(skyline_app, send_metric_name, str(projected)) # Reset counters self.anomalous_metrics[:] = [] # Sleep if it went too fast # if time() - now < 5: # logger.info('sleeping due to low run time...') # sleep(10) # @modified 20160504 - @earthgecko - development internal ref #1338, #1340) # Etsy's original for this was a value of 5 seconds which does # not make skyline Analyzer very efficient in terms of installations # where 100s of 1000s of metrics are being analyzed. This lead to # Analyzer running over several metrics multiple time in a minute # and always working. Therefore this was changed from if you took # less than 5 seconds to run only then sleep. This behaviour # resulted in Analyzer analysing a few 1000 metrics in 9 seconds and # then doing it again and again in a single minute. Therefore the # ANALYZER_OPTIMUM_RUN_DURATION setting was added to allow this to # self optimise in cases where skyline is NOT deployed to analyze # 100s of 1000s of metrics. 
This relates to optimising performance # for any deployments in the few 1000s and 60 second resolution # area, e.g. smaller and local deployments. process_runtime = time() - now analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION if process_runtime < analyzer_optimum_run_duration: sleep_for = (analyzer_optimum_run_duration - process_runtime) logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for) sleep(sleep_for)
def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info('no metrics in redis. try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.ANALYZER_PROCESSES + 1): if i > len(unique_metrics): logger.info('WARNING: skyline is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, unique_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.ENABLE_ALERTS: for alert in settings.ALERTS: for metric in self.anomalous_metrics: if alert[0] in metric[1]: cache_key = 'last_alert.%s.%s' % (alert[1], metric[1]) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: self.redis_conn.setex(cache_key, alert[2], packb(metric[0])) trigger_alert(alert, metric) except Exception as e: logger.error("couldn't send alert: %s" % e) # Write anomalous_metrics to static webapp directory filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(unique_metrics)) logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite self.send_graphite_metric('skyline.analyzer.run_time', '%.2f' % (time() - now)) self.send_graphite_metric('skyline.analyzer.total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values()))) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list = False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) self.send_graphite_metric('skyline.analyzer.duration', '%.2f' % time_human) self.send_graphite_metric('skyline.analyzer.projected', '%.2f' % projected) # Reset counters self.anomalous_metrics[:] = [] # Sleep if it went too fast if time() - now < 5: logger.info('sleeping due to low run time...') sleep(10)
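# A small sketch of the alert-throttling pattern used in the run() above
# (assumes a local Redis and uses hypothetical values): an alert only fires
# when no last_alert key exists, and setex gives the key a TTL of alert[2]
# seconds, which suppresses re-alerting for that metric until it expires.
from msgpack import packb
from redis import StrictRedis

redis_conn = StrictRedis()                       # assumption: default local connection
alert = ('skyline', 'smtp', 1800)                # assumed (namespace, alerter, expiration seconds)
metric = (1.0, 'skyline.horizon.queue_size')     # assumed (datapoint, metric name)

cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
if not redis_conn.get(cache_key):
    redis_conn.setex(cache_key, alert[2], packb(metric[0]))
    print('alert would fire for %s' % metric[1])
else:
    print('suppressed - alerted within the last %s seconds' % alert[2])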
def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info('no metrics in redis. try adding some - see README') sleep(10) continue # Reset boundary_metrics boundary_metrics = [] # Build boundary metrics for metric_name in unique_metrics: for metric in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metric[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) base_name = metric_name.replace(FULL_NAMESPACE, '', 1) pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - boundary metric - pattern MATCHED - " + metric[0] + " | " + base_name) boundary_metrics.append([metric_name, metric[1]]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - boundary metrics - " + str(boundary_metrics)) if len(boundary_metrics) == 0: logger.info('no metrics in redis. try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.BOUNDARY_PROCESSES + 1): if i > len(boundary_metrics): logger.info('WARNING: skyline boundary is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, boundary_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.BOUNDARY_ENABLE_ALERTS: for anomalous_metric in self.anomalous_metrics: datapoint = str(anomalous_metric[0]) metric_name = anomalous_metric[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) expiration_time = str(anomalous_metric[2]) metric_trigger = str(anomalous_metric[5]) alert_threshold = int(anomalous_metric[6]) metric_alerters = anomalous_metric[7] algorithm = anomalous_metric[8] if ENABLE_BOUNDARY_DEBUG: logger.info("debug - anomalous_metric - " + str(anomalous_metric)) # Determine how many times has the anomaly been seen if the # ALERT_THRESHOLD is set to > 1 and create a cache key in # redis to keep count so that alert_threshold can be honored if alert_threshold == 0: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold == 1: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold > 1: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) anomaly_cache_key_count_set = False anomaly_cache_key_expiration_time = (int(alert_threshold) + 1) * 60 anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name) try: anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key) if not anomaly_cache_key_count: try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis no anomaly_cache_key - " + str(anomaly_cache_key)) 
times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis setex anomaly_cache_key - " + str(anomaly_cache_key)) self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except Exception as e: logger.error('redis setex failed :: %s' % str(anomaly_cache_key)) logger.error("couldn't set key: %s" % e) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis anomaly_cache_key retrieved OK - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = True except: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis failed - anomaly_cache_key retrieval failed - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = False if anomaly_cache_key_count_set: unpacker = Unpacker(use_list=False) unpacker.feed(anomaly_cache_key_count) raw_times_seen = list(unpacker) times_seen = int(raw_times_seen[0]) + 1 try: self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except: times_seen = 1 logger.error('set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen))) # Alert the alerters if times_seen >= alert_threshold if times_seen >= alert_threshold: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - times_seen %s is greater than or equal to alert_threshold %s" % (str(times_seen), str(alert_threshold))) for alerter in metric_alerters.split("|"): # Determine alerter limits send_alert = False alerts_sent = 0 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - checking alerter - %s" % alerter) try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determining alerter_expiration_time for settings") alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter] alerter_expiration_time = int(alerter_expiration_time_setting) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determined alerter_expiration_time from settings - %s" % str(alerter_expiration_time)) except: # Set an arbitrary expiry time if not set alerter_expiration_time = 160 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - could not determine alerter_expiration_time from settings") try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determining alerter_limit from settings") alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter] alerter_limit = int(alerter_limit_setting) alerter_limit_set = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determined alerter_limit from settings - %s" % str(alerter_limit)) except: alerter_limit_set = False send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - could not determine alerter_limit from settings") # If the alerter_limit is set determine how many # alerts the alerter has sent if alerter_limit_set: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) try: alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key) if not alerter_sent_count_key_data: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key)) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent set to %s" % str(alerts_sent)) logger.info("debug - send_alert set to %s" % str(send_alert)) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis alerter key retrieved, unpacking" + str(alerter_sent_count_key)) unpacker = 
unpacker.feed(alerter_sent_count_key_data) raw_alerts_sent = list(unpacker) alerts_sent = int(raw_alerts_sent[0]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerter %s alerts sent %s " % (str(alerter), str(alerts_sent))) except: logger.info("No key set - %s" % alerter_sent_count_key) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent set to %s" % str(alerts_sent)) logger.info("debug - send_alert set to %s" % str(send_alert)) if alerts_sent < alerter_limit: send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit))) logger.info("debug - send_alert set to %s" % str(send_alert)) # Send alert alerter_alert_sent = False if send_alert: cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - checking cache_key - %s" % cache_key) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: try: self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0]))) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - key setex OK - %s' % (cache_key)) trigger_alert(alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) alerter_alert_sent = True except Exception as e: logger.error('alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) logger.error("couldn't send alert: %s" % str(e)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - cache_key exists, not alerting via %s for %s" % (alerter, cache_key)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) except: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) else: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Update the alerts sent for the alerter cache key, # to allow for alert limiting if alerter_alert_sent and alerter_limit_set: try: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) new_alerts_sent = int(alerts_sent) + 1 self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent))) logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) except: logger.error('failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) else: # Always alert to syslog, even if alert_threshold is not # breached or if send_alert is not True trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Write anomalous_metrics to static webapp directory if len(self.anomalous_metrics) > 0: filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP 
with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(boundary_metrics)) logger.info('total analyzed :: %d' % (len(boundary_metrics) - sum(exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now)) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(boundary_metrics) - sum(exceptions.values()))) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics)) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(boundary_metrics)) for key, value in exceptions.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'exceptions.%s' % key self.send_graphite_metric(send_metric, '%d' % value) for key, value in anomaly_breakdown.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key self.send_graphite_metric(send_metric, '%d' % value) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected) # Reset counters self.anomalous_metrics[:] = [] # Only run once per minute seconds_to_run = int((time() - now)) if seconds_to_run < 60: sleep_for_seconds = 60 - seconds_to_run else: sleep_for_seconds = 0 if sleep_for_seconds > 0: logger.info('sleeping for %s seconds' % sleep_for_seconds) sleep(sleep_for_seconds)
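# Illustrative only: a hedged sketch of a BOUNDARY_METRICS entry as the
# boundary run() above uses it. Only the first two fields (a namespace
# pattern and the algorithm name) are inferred directly from the matching
# loop; the remaining fields and values here are assumptions.
import re

EXAMPLE_BOUNDARY_METRICS = (
    ('skyline\\.horizon\\.queue_size', 'greater_than', 3600, 0, 0, 1000, 1, 'smtp|syslog'),
)

for entry in EXAMPLE_BOUNDARY_METRICS:
    if re.compile(entry[0]).match('skyline.horizon.queue_size'):
        print('metric matched - would be checked with the %s algorithm' % entry[1])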
syslog_enabled = False print 'Verifying alerts for: "' + options.metric + '"' # Send alerts context = 'Analyzer' if alerts_enabled: for alert in alerts: if alert[0] in options.metric: print ' Testing Analyzer alerting - against "' + alert[ 0] + '" to send via ' + alert[1] + "...triggered" if options.trigger: print ' Testing Analyzer alerting - with options.trigger - ' + str( options.trigger) metric = (0, options.metric) trigger_alert(alert, metric, context) if syslog_enabled: print ' Testing Analyzer alerting - against "' + alert[ 0] + '" to send via syslog ' + "...triggered" alert = (alert[0], 'syslog') trigger_alert(alert, metric, context) else: print ' Testing Analyzer alerting - against "' + alert[ 0] + '" to send via ' + alert[1] + "..." else: print 'Alerts are disabled' # Mirage alerts context = 'Mirage' try: mirage_enabled = settings.ENABLE_MIRAGE
def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( 'skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list( self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info( 'no metrics in redis. try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.ANALYZER_PROCESSES + 1): if i > len(unique_metrics): logger.info( 'WARNING: skyline is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, unique_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Send alerts if settings.ENABLE_ALERTS: for alert in settings.ALERTS: for metric in self.anomalous_metrics: if alert[0] in metric[1]: cache_key = 'last_alert.%s.%s' % (alert[1], metric[1]) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: self.redis_conn.setex( cache_key, alert[2], packb(metric[0])) trigger_alert(alert, metric) except Exception as e: logger.error("couldn't send alert: %s" % e) # Write anomalous_metrics to static webapp directory filename = path.abspath( path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(unique_metrics)) logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(self.exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) logger.info('exception stats :: %s' % self.exceptions) logger.info('anomaly breakdown :: %s' % self.anomaly_breakdown) # Log to Graphite if settings.GRAPHITE_HOST != '': host = settings.GRAPHITE_HOST.replace('http://', '') system( 'echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host)) system( 'echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(self.exceptions.values())), now, host)) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) if settings.GRAPHITE_HOST != '': host = settings.GRAPHITE_HOST.replace('http://', '') system( 'echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host)) system( 'echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host)) # Reset counters self.anomalous_metrics[:] = [] self.exceptions = Manager().dict() self.anomaly_breakdown = Manager().dict() # Sleep if it went too fast if time() - now < 5: logger.info('sleeping due to low run time...') sleep(10)
"-m", "--metric", dest="metric", default='skyline.horizon.queue_size', help="Pass the metric to test (default is skyline.horizon.queue_size)") (options, args) = parser.parse_args() try: alerts_enabled = settings.ENABLE_ALERTS alerts = settings.ALERTS except: print "Exception: Check your settings file for the existence of ENABLE_ALERTS and ALERTS" sys.exit() print 'Verifying alerts for: "' + options.metric + '"' # Send alerts if alerts_enabled: for alert in alerts: if alert[0] in options.metric: print ' Testing against "' + alert[ 0] + '" to send via ' + alert[1] + "...triggered" if options.trigger: trigger_alert(alert, options.metric) else: print ' Testing against "' + alert[ 0] + '" to send via ' + alert[1] + "..." else: print 'Alerts are disabled'
def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( 'skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list( self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info( 'no metrics in redis. try adding some - see README') sleep(10) continue # Reset boundary_metrics boundary_metrics = [] # Build boundary metrics for metric_name in unique_metrics: for metric in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metric[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) base_name = metric_name.replace(FULL_NAMESPACE, '', 1) pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - boundary metric - pattern MATCHED - " + metric[0] + " | " + base_name) boundary_metrics.append([metric_name, metric[1]]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - boundary metrics - " + str(boundary_metrics)) if len(boundary_metrics) == 0: logger.info( 'no metrics in redis. try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.BOUNDARY_PROCESSES + 1): if i > len(boundary_metrics): logger.info( 'WARNING: skyline boundary is set for more cores than needed.' ) break p = Process(target=self.spin_process, args=(i, boundary_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.BOUNDARY_ENABLE_ALERTS: for anomalous_metric in self.anomalous_metrics: datapoint = str(anomalous_metric[0]) metric_name = anomalous_metric[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) expiration_time = str(anomalous_metric[2]) metric_trigger = str(anomalous_metric[5]) alert_threshold = int(anomalous_metric[6]) metric_alerters = anomalous_metric[7] algorithm = anomalous_metric[8] if ENABLE_BOUNDARY_DEBUG: logger.info("debug - anomalous_metric - " + str(anomalous_metric)) # Determine how many times has the anomaly been seen if the # ALERT_THRESHOLD is set to > 1 and create a cache key in # redis to keep count so that alert_threshold can be honored if alert_threshold == 0: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold == 1: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold > 1: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) anomaly_cache_key_count_set = False anomaly_cache_key_expiration_time = ( int(alert_threshold) + 1) * 60 anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name) try: anomaly_cache_key_count = self.redis_conn.get( anomaly_cache_key) if not anomaly_cache_key_count: try: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis no anomaly_cache_key - " + 
str(anomaly_cache_key)) times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis setex anomaly_cache_key - " + str(anomaly_cache_key)) self.redis_conn.setex( anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info( 'set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except Exception as e: logger.error('redis setex failed :: %s' % str(anomaly_cache_key)) logger.error("couldn't set key: %s" % e) else: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis anomaly_cache_key retrieved OK - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = True except: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis failed - anomaly_cache_key retrieval failed - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = False if anomaly_cache_key_count_set: unpacker = Unpacker(use_list=False) unpacker.feed(anomaly_cache_key_count) raw_times_seen = list(unpacker) times_seen = int(raw_times_seen[0]) + 1 try: self.redis_conn.setex( anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info( 'set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except: times_seen = 1 logger.error( 'set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen))) # Alert the alerters if times_seen > alert_threshold if times_seen >= alert_threshold: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - times_seen %s is greater than or equal to alert_threshold %s" % (str(times_seen), str(alert_threshold))) for alerter in metric_alerters.split("|"): # Determine alerter limits send_alert = False alerts_sent = 0 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - checking alerter - %s" % alerter) try: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - determining alerter_expiration_time for settings" ) alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS[ 'alerter_expiration_time'][alerter] alerter_expiration_time = int( alerter_expiration_time_setting) if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - determined alerter_expiration_time from settings - %s" % str(alerter_expiration_time)) except: # Set an arbitrary expiry time if not set alerter_expiration_time = 160 if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - could not determine alerter_expiration_time from settings" ) try: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - determining alerter_limit from settings" ) alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS[ 'alerter_limit'][alerter] alerter_limit = int(alerter_limit_setting) alerter_limit_set = True if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - determined alerter_limit from settings - %s" % str(alerter_limit)) except: alerter_limit_set = False send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - could not determine alerter_limit from settings" ) # If the alerter_limit is set determine how many # alerts the alerter has sent if alerter_limit_set: alerter_sent_count_key = 'alerts_sent.%s' % ( alerter) try: alerter_sent_count_key_data = self.redis_conn.get( alerter_sent_count_key) if not alerter_sent_count_key_data: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key)) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - alerts_sent set to %s" % str(alerts_sent)) logger.info( "debug - send_alert set to %s" % str(sent_alert)) else: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - redis alerter key retrieved, unpacking" + str(alerter_sent_count_key)) unpacker = 
Unpacker(use_list=False) unpacker.feed( alerter_sent_count_key_data) raw_alerts_sent = list(unpacker) alerts_sent = int(raw_alerts_sent[0]) if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - alerter %s alerts sent %s " % (str(alerter), str(alerts_sent))) except: logger.info("No key set - %s" % alerter_sent_count_key) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - alerts_sent set to %s" % str(alerts_sent)) logger.info( "debug - send_alert set to %s" % str(send_alert)) if alerts_sent < alerter_limit: send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit))) logger.info( "debug - send_alert set to %s" % str(send_alert)) # Send alert alerter_alert_sent = False if send_alert: cache_key = 'last_alert.boundary.%s.%s.%s' % ( alerter, base_name, algorithm) if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - checking cache_key - %s" % cache_key) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: try: self.redis_conn.setex( cache_key, int(anomalous_metric[2]), packb(int( anomalous_metric[0]))) if ENABLE_BOUNDARY_DEBUG: logger.info( 'debug - key setex OK - %s' % (cache_key)) trigger_alert( alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info( 'alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) trigger_alert( "syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info( 'alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) alerter_alert_sent = True except Exception as e: logger.error( 'alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) logger.error( "couldn't send alert: %s" % str(e)) trigger_alert( "syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) else: if ENABLE_BOUNDARY_DEBUG: logger.info( "debug - cache_key exists not alerting via %s for %s is less than alerter_limit %s" % (alerter, cache_key)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info( 'alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) except: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info( 'alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) else: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info( 'alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Update the alerts sent for the alerter cache key, # to allow for alert limiting if alerter_alert_sent and alerter_limit_set: try: alerter_sent_count_key = 'alerts_sent.%s' % ( alerter) new_alerts_sent = int(alerts_sent) + 1 self.redis_conn.setex( alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent))) logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) except: logger.error('failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) else: # Always alert to syslog, even if alert_threshold is not # breached or if send_alert is not True trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Write anomalous_metrics to static webapp directory if len(self.anomalous_metrics) > 0: filename = path.abspath( path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) 
with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(boundary_metrics)) logger.info('total analyzed :: %d' % (len(boundary_metrics) - sum(exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now)) self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(boundary_metrics) - sum(exceptions.values()))) self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics)) self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(boundary_metrics)) for key, value in exceptions.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'exceptions.%s' % key self.send_graphite_metric(send_metric, '%d' % value) for key, value in anomaly_breakdown.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key self.send_graphite_metric(send_metric, '%d' % value) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human) self.send_graphite_metric( 'skyline.boundary.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected) # Reset counters self.anomalous_metrics[:] = [] # Only run once per minute seconds_to_run = int((time() - now)) if seconds_to_run < 60: sleep_for_seconds = 60 - seconds_to_run else: sleep_for_seconds = 0 if sleep_for_seconds > 0: logger.info('sleeping for %s seconds' % sleep_for_seconds) sleep(sleep_for_seconds)
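# Illustrative only: a hedged example of the BOUNDARY_ALERTER_OPTS shape the
# boundary alerting above reads - a dict of dicts keyed by option name and
# then by alerter. The alerter names and numbers here are assumptions.
EXAMPLE_BOUNDARY_ALERTER_OPTS = {
    'alerter_expiration_time': {'smtp': 900, 'pagerduty': 1800},
    'alerter_limit': {'smtp': 100, 'pagerduty': 15},
}

alerter = 'smtp'
alerter_expiration_time = int(EXAMPLE_BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter])
alerter_limit = int(EXAMPLE_BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter])
print('%s :: expire the sent-count key after %s seconds, limit to %s alerts'
      % (alerter, alerter_expiration_time, alerter_limit))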
def run(self):
    """
    Called when the process initializes.
    """
    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except:
            logger.info('skyline can not connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            sleep(10)
            logger.info('connecting to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            continue

        # Determine whether there are any metric check files to analyze
        while True:
            metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]
            if len(metric_var_files) == 0:
                logger.info('sleeping no metrics...')
                sleep(10)
            else:
                sleep(1)
                # logger.info('sleeping no metrics...')
                # sleep(10)

            # Clean up old files
            now_timestamp = time()
            stale_age = now_timestamp - settings.MIRAGE_STALE_SECONDS
            for current_file in listdir(settings.MIRAGE_CHECK_PATH):
                if os.path.isfile(settings.MIRAGE_CHECK_PATH + "/" + current_file):
                    t = os.stat(settings.MIRAGE_CHECK_PATH + "/" + current_file)
                    c = t.st_ctime
                    # delete the check file if it is older than MIRAGE_STALE_SECONDS
                    if c < stale_age:
                        os.remove(settings.MIRAGE_CHECK_PATH + "/" + current_file)
                        logger.info('removed %s' % (current_file))

            # Discover metric to analyze
            metric_var_files = ''
            metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]
            if len(metric_var_files) > 0:
                break

        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = settings.MIRAGE_CHECK_PATH + "/" + metric_var_files_sorted[0]
        logger.info('processing %s' % metric_var_files_sorted[0])

        # Spawn processes
        pids = []
        MIRAGE_PROCESSES = 1
        run_timestamp = int(now)
        for i in range(1, MIRAGE_PROCESSES + 1):
            p = Process(target=self.spin_process, args=(i, run_timestamp))
            pids.append(p)
            p.start()

        # Send wait signal to zombie processes
        for p in pids:
            p.join()

        # Grab data from the queue and populate dictionaries
        exceptions = dict()
        anomaly_breakdown = dict()
        while 1:
            try:
                key, value = self.mirage_anomaly_breakdown_q.get_nowait()
                if key not in anomaly_breakdown.keys():
                    anomaly_breakdown[key] = value
                else:
                    anomaly_breakdown[key] += value
            except Empty:
                break

        while 1:
            try:
                key, value = self.mirage_exceptions_q.get_nowait()
                if key not in exceptions.keys():
                    exceptions[key] = value
                else:
                    exceptions[key] += value
            except Empty:
                break

        for metric_variable in self.metric_variables:
            if metric_variable[0] == 'metric_name':
                metric_name = metric_variable[1]
            if metric_variable[0] == 'metric_value':
                metric_value = metric_variable[1]
            if metric_variable[0] == 'hours_to_resolve':
                hours_to_resolve = metric_variable[1]
            if metric_variable[0] == 'metric_timestamp':
                metric_timestamp = metric_variable[1]

        logger.info('analysis done - %s' % metric_name)

        # Send alerts
        # Convert the hours of the second order resolution to seconds
        logger.info('analyzed at %s hours resolution' % hours_to_resolve)
        second_order_resolution_seconds = int(hours_to_resolve) * 3600
        logger.info('analyzed at %s seconds resolution' % second_order_resolution_seconds)

        if settings.MIRAGE_ENABLE_ALERTS:
            for alert in settings.ALERTS:
                for metric in self.anomalous_metrics:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric[1]
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        cache_key = 'mirage.last_alert.%s.%s' % (alert[1], metric[1])
                        try:
                            last_alert = self.redis_conn.get(cache_key)
                            if not last_alert:
                                self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                trigger_alert(alert, metric, second_order_resolution_seconds)
                                logger.info("Sent %s alert: For %s" % (alert[1], metric[1]))
                        except Exception as e:
                            logger.error("could not send %s alert for %s: %s" % (alert[1], metric[1], e))

        if settings.NEGATE_ANALYZER_ALERTS:
            if len(self.anomalous_metrics) == 0:
                for negate_alert in settings.ALERTS:
                    for not_anomalous_metric in self.not_anomalous_metrics:
                        NEGATE_ALERT_MATCH_PATTERN = negate_alert[0]
                        NOT_ANOMALOUS_METRIC_PATTERN = not_anomalous_metric[1]
                        alert_match_pattern = re.compile(NEGATE_ALERT_MATCH_PATTERN)
                        negate_pattern_match = alert_match_pattern.match(NOT_ANOMALOUS_METRIC_PATTERN)
                        if negate_pattern_match:
                            try:
                                logger.info("Negate alert sent: For %s" % (not_anomalous_metric[1]))
                                trigger_negater(negate_alert, not_anomalous_metric, second_order_resolution_seconds, metric_value)
                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

        # Log progress
        if len(self.anomalous_metrics) > 0:
            logger.info('seconds since last anomaly :: %.2f' % (time() - now))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

        # Reset counters
        self.anomalous_metrics[:] = []
        self.not_anomalous_metrics[:] = []

        # Reset metric_variables
        self.metric_variables[:] = []

        # Sleep if it went too fast
        if time() - now < 1:
            logger.info('sleeping due to low run time...')
            # sleep(10)
            sleep(1)
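# For reference, each check file that the loop above picks up from
# settings.MIRAGE_CHECK_PATH is written by the analyzer run() further below in a
# simple 'key = "value"' form, with a filename of the form
# <metric_timestamp>.<metric_name>.txt. An illustrative file (the metric name,
# value and timestamp are made up) would contain:
#
#   metric = "stats.web01.cpu.user"
#   value = "87.3"
#   hours_to_resolve = "168"
#   metric_timestamp = "1471172405"
#
# spin_process (not shown in this excerpt) is what actually populates
# self.metric_variables, and it evidently maps these keys to the names checked
# above ('metric_name', 'metric_value', ...). Purely as a sketch of parsing that
# format into (name, value) pairs, one could write:
def load_metric_variables(metric_check_file):
    metric_variables = []
    with open(metric_check_file, 'r') as fh:
        for line in fh:
            line = line.strip()
            if not line or '=' not in line:
                continue
            name, _, value = line.partition('=')
            # values are written double-quoted, e.g. value = "87.3"
            metric_variables.append((name.strip(), value.strip().strip('"')))
    return metric_variables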
def run(self):
    """
    Called when the process initializes.
    """
    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.ring.check_connections()
        except:
            sleep(10)
            self.ring = RedisRing(settings.REDIS_BACKENDS)
            continue

        # Discover unique metrics
        unique_metrics = list(self.ring.run('smembers', settings.FULL_NAMESPACE + 'unique_metrics'))

        if len(unique_metrics) == 0:
            logger.info('no metrics in redis. try adding some - see README')
            sleep(10)
            continue

        # Spawn processes
        pids = []
        for i in range(1, settings.ANALYZER_PROCESSES + 1):
            if i > len(unique_metrics):
                logger.info('WARNING: skyline is set for more cores than needed.')
                break

            p = Process(target=self.spin_process, args=(i, unique_metrics))
            pids.append(p)
            p.start()

        # Send wait signal to zombie processes
        for p in pids:
            p.join()

        # Grab data from the queue and populate dictionaries
        exceptions = dict()
        anomaly_breakdown = dict()
        while 1:
            try:
                key, value = self.anomaly_breakdown_q.get_nowait()
                if key not in anomaly_breakdown.keys():
                    anomaly_breakdown[key] = value
                else:
                    anomaly_breakdown[key] += value
            except Empty:
                break

        while 1:
            try:
                key, value = self.exceptions_q.get_nowait()
                if key not in exceptions.keys():
                    exceptions[key] = value
                else:
                    exceptions[key] += value
            except Empty:
                break

        # Send alerts
        if settings.ENABLE_ALERTS:
            for alert in settings.ALERTS:
                for metric in self.anomalous_metrics:
                    if alert[0] in metric[1]:
                        cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                        try:
                            last_alert = self.ring.run('get', cache_key)
                            if not last_alert:
                                self.ring.run('setex', cache_key, alert[2], packb(metric[0]))
                                trigger_alert(alert, metric)
                        except Exception as e:
                            logger.error("couldn't send alert: %s" % e)

        # Log progress
        logger.info('seconds to run :: %.2f' % (time() - now))
        logger.info('total metrics :: %d' % len(unique_metrics))
        logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values())))
        logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
        logger.info('exception stats :: %s' % exceptions)
        logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

        # Log to Graphite
        if settings.GRAPHITE_HOST != '':
            host = settings.GRAPHITE_HOST.replace('http://', '')
            system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
            system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(exceptions.values())), now, host))

        # Check canary metric
        raw_series = self.ring.run('get', settings.FULL_NAMESPACE + settings.CANARY_METRIC)
        if raw_series is not None:
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)

            time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
            projected = 24 * (time() - now) / time_human

            logger.info('canary duration :: %.2f' % time_human)
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

        # Reset counters
        self.anomalous_metrics[:] = []

        # Sleep if it went too fast
        if time() - now < 5:
            logger.info('sleeping due to low run time...')
            sleep(10)
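# For reference, the system()/nc calls above emit Graphite's plaintext protocol,
# one "<metric> <value> <timestamp>" line per datapoint. With illustrative values
# (the hostname and numbers are made up), the first call expands to a shell
# command like:
#
#   echo skyline.analyzer.run_time 3.42 1471172405.12 | nc -w 3 graphite.example.org 2003
#
# The canary check projects the run time onto a 24 hour period: if the canary
# timeseries spans 24 hours (time_human == 24) and the run took 6 seconds, then
# projected == 24 * 6 / 24 == 6. The later run() variants replace this shell-out
# with an in-process send_graphite_metric() helper, as sketched earlier, which
# avoids spawning a shell and nc for every datapoint.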
def run(self):
    """
    Called when the process initializes.
    """
    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except:
            logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            sleep(10)
            self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            continue

        # Discover unique metrics
        unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

        if len(unique_metrics) == 0:
            logger.info('no metrics in redis. try adding some - see README')
            sleep(10)
            continue

        # Spawn processes
        pids = []
        for i in range(1, settings.ANALYZER_PROCESSES + 1):
            if i > len(unique_metrics):
                logger.info('WARNING: skyline is set for more cores than needed.')
                break

            p = Process(target=self.spin_process, args=(i, unique_metrics))
            pids.append(p)
            p.start()

        # Send wait signal to zombie processes
        for p in pids:
            p.join()

        # Grab data from the queue and populate dictionaries
        exceptions = dict()
        anomaly_breakdown = dict()
        while 1:
            try:
                key, value = self.anomaly_breakdown_q.get_nowait()
                if key not in anomaly_breakdown.keys():
                    anomaly_breakdown[key] = value
                else:
                    anomaly_breakdown[key] += value
            except Empty:
                break

        while 1:
            try:
                key, value = self.exceptions_q.get_nowait()
                if key not in exceptions.keys():
                    exceptions[key] = value
                else:
                    exceptions[key] += value
            except Empty:
                break

        # Send alerts
        if settings.ENABLE_ALERTS:
            for alert in settings.ALERTS:
                for metric in self.anomalous_metrics:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric[1]
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                        try:
                            last_alert = self.redis_conn.get(cache_key)
                            if not last_alert:
                                try:
                                    SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                    logger.info('mirage check :: %s' % (metric[1]))
                                    # Write the anomalous metric to a check file
                                    # so that Mirage can test it at second order
                                    # resolution
                                    metric_timestamp = int(time())
                                    anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1])
                                    with open(anomaly_check_file, 'w') as fh:
                                        # metric_name, anomalous datapoint, hours to resolve, timestamp
                                        fh.write(
                                            'metric = "%s"\n'
                                            'value = "%s"\n'
                                            'hours_to_resolve = "%s"\n'
                                            'metric_timestamp = "%s"\n'
                                            % (metric[1], metric[0], alert[3], metric_timestamp))
                                    logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
                                    if settings.ENABLE_FULL_DURATION_ALERTS:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)
                                except:
                                    self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)
                        except Exception as e:
                            logger.error("couldn't send alert: %s" % e)

        # Write anomalous_metrics to static webapp directory
        if len(self.anomalous_metrics) > 0:
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

        # Log progress
        logger.info('seconds to run :: %.2f' % (time() - now))
        logger.info('total metrics :: %d' % len(unique_metrics))
        logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values())))
        logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
        logger.info('exception stats :: %s' % exceptions)
        logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

        # Log to Graphite
        self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now))
        self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values())))
        self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics))
        self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(unique_metrics))

        for key, value in exceptions.items():
            send_metric = 'skyline.analyzer.' + SERVER_METRIC_PATH + 'exceptions.%s' % key
            self.send_graphite_metric(send_metric, '%d' % value)

        for key, value in anomaly_breakdown.items():
            send_metric = 'skyline.analyzer.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key
            self.send_graphite_metric(send_metric, '%d' % value)

        # Check canary metric
        raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
        if raw_series is not None:
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)

            time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
            projected = 24 * (time() - now) / time_human

            logger.info('canary duration :: %.2f' % time_human)
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human)
            self.send_graphite_metric('skyline.analyzer.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected)

        # Reset counters
        self.anomalous_metrics[:] = []

        # Sleep if it went too fast
        if time() - now < 5:
            logger.info('sleeping due to low run time...')
            sleep(10)
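# For context, the alert tuples indexed throughout these run() methods follow
# the layout of settings.ALERTS, roughly:
#
#   alert[0]  metric match pattern (a plain substring in the older variants,
#             a regex in the variants that use re.compile above)
#   alert[1]  the alerter to route through, e.g. 'smtp'
#   alert[2]  the alert expiration in seconds, used as the setex TTL on the
#             last_alert cache key
#   alert[3]  (where present) the second order resolution in hours handed to
#             Mirage as hours_to_resolve
#
# An illustrative settings entry consistent with that indexing -- the metric
# namespaces and numbers here are made up, not taken from a real deployment --
# might look like:
ALERTS = (
    ('skyline', 'smtp', 1800),
    # a 4th element enables a Mirage second order resolution check of 168 hours
    ('stats.web', 'smtp', 3600, 168),
)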