def alert_smtp(alert, metric, context):
    """
    Called by :func:`~trigger_alert` and sends an alert via smtp to the
    recipients that are configured for the metric.

    :param alert: the alert tuple - (alert[0] metric namespace pattern,
        alert[1] alerter name or a literal email address for backwards
        compatibility, alert[2] expiry seconds)
    :param metric: the anomalous metric tuple - (metric[0] anomalous value,
        metric[1] metric name, metric[2] anomaly timestamp)
    :param context: the app context the alert is from, e.g. Analyzer
    :return: None (``False`` if no recipient can be determined)
    """
    LOCAL_DEBUG = False
    logger = logging.getLogger(skyline_app_logger)
    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - sending smtp alert')
        logger.info('debug :: alert_smtp - Memory usage at start: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # FULL_DURATION to hours so that analyzer surfaces the relevant timeseries data
    # in the graph
    full_duration_in_hours = int(settings.FULL_DURATION) / 3600

    # @added 20161229 - Feature #1830: Ionosphere alerts
    # Added Ionosphere variables
    base_name = str(metric[1]).replace(settings.FULL_NAMESPACE, '', 1)
    if settings.IONOSPHERE_ENABLED:
        timeseries_dir = base_name.replace('.', '/')
        training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, str(int(metric[2])),
            timeseries_dir)
        graphite_image_file = '%s/%s.%s.graphite.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        json_file = '%s/%s.%s.redis.%sh.json' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        training_data_redis_image = '%s/%s.%s.redis.plot.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))

    # For backwards compatibility
    if '@' in alert[1]:
        sender = settings.ALERT_SENDER
        recipient = alert[1]
        # @modified - bug fix: recipients was never assigned in this branch
        # which raised a NameError at the isinstance check below
        recipients = [recipient]
    else:
        sender = settings.SMTP_OPTS['sender']
        # @modified 20160806 - Added default_recipient
        try:
            recipients = settings.SMTP_OPTS['recipients'][alert[0]]
            use_default_recipient = False
        except:
            use_default_recipient = True
        if use_default_recipient:
            try:
                recipients = settings.SMTP_OPTS['default_recipient']
                logger.info(
                    'alert_smtp - using default_recipient as no recipients are configured for %s' %
                    str(alert[0]))
            except:
                logger.error(
                    'error :: alert_smtp - no known recipient for %s' %
                    str(alert[0]))
                return False

    # Backwards compatibility
    # @modified - idiom: isinstance rather than type() comparison
    if isinstance(recipients, str):
        recipients = [recipients]

    # @added 20180524 - Task #2384: Change alerters to cc other recipients
    # The alerters did send an individual email to each recipient. This would be
    # more useful if one email was sent with the first smtp recipient being the
    # to recipient and the subsequent recipients were add in cc.
    # @modified - bug fix: initialise before the if block so that an empty
    # recipients list does not raise a NameError at `if primary_recipient:`
    primary_recipient = False
    cc_recipients = False
    if recipients:
        for i_recipient in recipients:
            if not primary_recipient:
                primary_recipient = str(i_recipient)
            if primary_recipient != i_recipient:
                if not cc_recipients:
                    cc_recipients = str(i_recipient)
                else:
                    new_cc_recipients = '%s,%s' % (str(cc_recipients), str(i_recipient))
                    cc_recipients = str(new_cc_recipients)
        logger.info(
            'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s' %
            (str(primary_recipient), str(cc_recipients)))

    # @modified 20161229 - Feature #1830: Ionosphere alerts
    # Ionosphere alerts
    unencoded_graph_title = 'Skyline %s - ALERT at %s hours - %s' % (
        context, str(int(full_duration_in_hours)), str(metric[0]))
    # @modified 20170603 - Feature #2034: analyse_derivatives
    # Added deriative functions to convert the values of metrics strictly
    # increasing monotonically to their deriative products in alert graphs and
    # specify it in the graph_title
    known_derivative_metric = False
    try:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        if settings.REDIS_PASSWORD:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: alert_smtp - redis connection failed')
    try:
        derivative_metrics = list(
            REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False
    if known_derivative_metric:
        unencoded_graph_title = 'Skyline %s - ALERT at %s hours - derivative graph - %s' % (
            context, str(int(full_duration_in_hours)), str(metric[0]))

    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - unencoded_graph_title: %s' % unencoded_graph_title)
    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    graphite_port = '80'
    if settings.GRAPHITE_PORT != '':
        graphite_port = str(settings.GRAPHITE_PORT)

    link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(%s)%s%s&colorList=orange' % (
        settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
        str(int(full_duration_in_hours)), metric[1],
        settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    # @added 20170603 - Feature #2034: analyse_derivatives
    if known_derivative_metric:
        link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
            str(int(full_duration_in_hours)), metric[1],
            settings.GRAPHITE_GRAPH_SETTINGS, graph_title)

    content_id = metric[1]
    image_data = None
    if settings.SMTP_OPTS.get('embed-images'):
        # @added 20161229 - Feature #1830: Ionosphere alerts
        # Use existing data if files exist
        if os.path.isfile(graphite_image_file):
            try:
                with open(graphite_image_file, 'r') as f:
                    image_data = f.read()
                logger.info('alert_smtp - using existing png - %s' % graphite_image_file)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: alert_smtp - failed to read image data from existing png - %s' % graphite_image_file)
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None

        if image_data is None:
            try:
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                image_data = urllib2.urlopen(link).read()  # nosec
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data OK')
            except urllib2.URLError:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - failed to get image graph')
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data None')

    if LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - Memory usage after image_data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # If we failed to get the image or if it was explicitly disabled,
    # use the image URL instead of the content.
    if image_data is None:
        img_tag = '<img src="%s"/>' % link
    else:
        img_tag = '<img src="cid:%s"/>' % content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - img_tag: %s' % img_tag)

        if settings.IONOSPHERE_ENABLED:
            # Create Ionosphere Graphite image
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist
            if not os.path.isfile(graphite_image_file):
                try:
                    write_data_to_file(skyline_app, graphite_image_file, 'w', image_data)
                    logger.info('added %s Ionosphere Graphite image :: %s' % (
                        skyline_app, graphite_image_file))
                except:
                    logger.info(traceback.format_exc())
                    # @modified - bug fix: the format string had one %s for a
                    # two element tuple which raised a TypeError here
                    logger.error(
                        'error :: failed to add %s Ionosphere Graphite image - %s' % (
                            skyline_app, graphite_image_file))
            else:
                logger.info(
                    '%s Ionosphere Graphite image already exists :: %s' % (
                        skyline_app, graphite_image_file))

    redis_image_data = None
    try:
        plot_redis_data = settings.PLOT_REDIS_DATA
    except:
        plot_redis_data = False

    if settings.SMTP_OPTS.get('embed-images') and plot_redis_data:
        # Create graph from Redis data
        redis_metric_key = '%s%s' % (settings.FULL_NAMESPACE, metric[1])
        try:
            raw_series = REDIS_ALERTER_CONN.get(redis_metric_key)
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'OK')
        except:
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'FAIL')

        try:
            if LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - Memory usage before get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_x = [float(item[0]) for item in unpacker]
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_y = [item[1] for item in unpacker]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
            if LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - Memory usage after get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        except:
            logger.error('error :: alert_smtp - unpack timeseries failed')
            timeseries = None

        if settings.IONOSPHERE_ENABLED and timeseries:
            '''
            .. todo: this is possibly to be used to allow the user to submit the
                FULL_DURATION duration data set for the features profile to be
                created against IF it is a Mirage metric.  This would allow for
                additional granularity in Mirage metrics, thereby maintaining
                their seasonality, but allow user and Skyline to analyze the
                anomaly at a FULL_DURATION resolution as well.  Not sure how to
                code that in Ionosphere context yet but could just be additonal
                flag in the Ionosphere record.  In the Ionosphere frontend, the
                user would be given an option to either create the features
                profile on the Mirage timeseries or the redis FULL_DURATION
                timeseries.  It is a little complicated, but doable.
                # @modified 20161229 - Feature #1828: ionosphere - mirage Redis data features
                However that ^^ is UNDESIRABLE in the Mirage/Ionosphere context
                at the moment.  Ionosphere must only profile
                SECOND_ORDER_RESOLUTION_HOURS currently so as to not pollute the
                seasonality aspect of Mirage
            '''
            # Create Ionosphere redis timeseries json if is does not exist
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist and replace
            # the timeseries object if a json file exists
            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            using_original_redis_json = False
            if not os.path.isfile(json_file):
                timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                try:
                    write_data_to_file(skyline_app, json_file, 'w', timeseries_json)
                    logger.info(
                        'added %s Ionosphere Redis data timeseries json file :: %s' % (
                            skyline_app, json_file))
                except:
                    logger.info(traceback.format_exc())
                    # @modified - bug fix: the format string had one %s for a
                    # two element tuple which raised a TypeError here
                    logger.error(
                        'error :: failed to add %s Ionosphere Redis data timeseries json file - %s' % (
                            skyline_app, json_file))
            else:
                # Replace the timeseries object
                logger.info(
                    '%s Ionosphere Redis data timeseries json file already exists, using :: %s' % (
                        skyline_app, json_file))
                anomaly_json = json_file
                try:
                    # Read the timeseries json file
                    with open(anomaly_json, 'r') as f:
                        raw_timeseries = f.read()
                    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
                    timeseries = literal_eval(timeseries_array_str)
                    logger.info(
                        '%s Redis timeseries replaced with timeseries from :: %s' % (
                            skyline_app, anomaly_json))
                    timeseries_x = [float(item[0]) for item in timeseries]
                    timeseries_y = [item[1] for item in timeseries]
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    # This already has nonNegativeDerivative applied to it
                    using_original_redis_json = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s failed to read timeseries data from %s' % (
                            skyline_app, anomaly_json))
                    timeseries = None

        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:
            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            # If this is the Mirage Redis json it already has
            # nonNegativeDerivative applied to it
            if not using_original_redis_json:
                logger.info('alert_smtp - nonNegativeDerivative being applied')
                try:
                    derivative_timeseries = nonNegativeDerivative(timeseries)
                    timeseries = derivative_timeseries
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    logger.info('alert_smtp - nonNegativeDerivative applied')
                except:
                    logger.error(
                        'error :: alert_smtp - nonNegativeDerivative failed')
            else:
                logger.info(
                    'alert_smtp - nonNegativeDerivative not being applied, as it will have been applied in the original json'
                )

        # @added 21070726 - Bug #2068: Analyzer smtp alert error on Redis plot with derivative metrics
        # If the nonNegativeDerivative has been calculated we need to reset the
        # x and y as nonNegativeDerivative has to discard the first value as it
        # has no delta for it so the timeseries is 1 item less.
        timeseries_x = [float(item[0]) for item in timeseries]
        timeseries_y = [item[1] for item in timeseries]

        pd_series_values = None
        if timeseries:
            try:
                if LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - Memory usage before pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                values = pd.Series([x[1] for x in timeseries])
                # Because the truth value of a Series is ambiguous
                pd_series_values = True
                if LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - Memory usage after pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            except:
                logger.error(
                    'error :: alert_smtp - pandas value series on timeseries failed'
                )

        if pd_series_values:
            try:
                array_median = np.median(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - values median: %s' % str(array_median))
                array_amax = np.amax(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amax: %s' % str(array_amax))
                array_amin = np.amin(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amin: %s' % str(array_amin))
                mean = values.mean()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - mean: %s' % str(mean))
                stdDev = values.std()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - stdDev: %s' % str(stdDev))
                sigma3 = 3 * stdDev
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - sigma3: %s' % str(sigma3))
                # sigma3_series = [sigma3] * len(values)
                sigma3_upper_bound = mean + sigma3
                try:
                    sigma3_lower_bound = mean - sigma3
                except:
                    sigma3_lower_bound = 0
                sigma3_upper_series = [sigma3_upper_bound] * len(values)
                sigma3_lower_series = [sigma3_lower_bound] * len(values)
                amax_series = [array_amax] * len(values)
                amin_series = [array_amin] * len(values)
                mean_series = [mean] * len(values)
            except:
                logger.error(
                    'error :: alert_smtp - numpy ops on series failed')
                mean_series = None

        if mean_series:
            graph_title = 'Skyline %s - ALERT - at %s hours - Redis data\n%s - anomalous value: %s' % (
                context, str(int(full_duration_in_hours)), metric[1], str(metric[0]))
            # @added 20170603 - Feature #2034: analyse_derivatives
            if known_derivative_metric:
                graph_title = 'Skyline %s - ALERT - at %s hours - Redis data (derivative graph)\n%s - anomalous value: %s' % (
                    context, str(int(full_duration_in_hours)), metric[1], str(metric[0]))

            # @modified 20160814 - Bug #1558: Memory leak in Analyzer
            # I think the buf is causing a memory leak, trying a file
            # if python_version == 3:
            #     buf = io.StringIO()
            # else:
            #     buf = io.BytesIO()
            buf = '%s/%s.%s.%s.png' % (
                settings.SKYLINE_TMP_DIR, skyline_app, str(int(metric[2])), metric[1])

            if LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - Memory usage before plot Redis data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            # Too big
            # rcParams['figure.figsize'] = 12, 6
            rcParams['figure.figsize'] = 8, 4
            try:
                # fig = plt.figure()
                fig = plt.figure(frameon=False)
                ax = fig.add_subplot(111)
                ax.set_title(graph_title, fontsize='small')
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                try:
                    datetimes = [
                        dt.datetime.utcfromtimestamp(ts) for ts in timeseries_x
                    ]
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - datetimes: %s' % 'OK')
                except:
                    logger.error('error :: alert_smtp - datetimes: %s' % 'FAIL')

                plt.xticks(rotation=0, horizontalalignment='center')
                xfmt = DateFormatter('%a %H:%M')
                plt.gca().xaxis.set_major_formatter(xfmt)
                ax.xaxis.set_major_formatter(xfmt)

                ax.plot(datetimes, timeseries_y, color='orange', lw=0.6, zorder=3)
                ax.tick_params(axis='both', labelsize='xx-small')

                max_value_label = 'max - %s' % str(array_amax)
                ax.plot(datetimes, amax_series, lw=1, label=max_value_label,
                        color='m', ls='--', zorder=4)
                min_value_label = 'min - %s' % str(array_amin)
                ax.plot(datetimes, amin_series, lw=1, label=min_value_label,
                        color='b', ls='--', zorder=4)
                mean_value_label = 'mean - %s' % str(mean)
                ax.plot(datetimes, mean_series, lw=1.5, label=mean_value_label,
                        color='g', ls='--', zorder=4)

                sigma3_text = (r'3$\sigma$')
                # sigma3_label = '%s - %s' % (str(sigma3_text), str(sigma3))

                sigma3_upper_label = '%s upper - %s' % (
                    str(sigma3_text), str(sigma3_upper_bound))
                ax.plot(datetimes, sigma3_upper_series, lw=1,
                        label=sigma3_upper_label, color='r', ls='solid', zorder=4)

                if sigma3_lower_bound > 0:
                    sigma3_lower_label = '%s lower - %s' % (
                        str(sigma3_text), str(sigma3_lower_bound))
                    ax.plot(datetimes, sigma3_lower_series, lw=1,
                            label=sigma3_lower_label, color='r', ls='solid', zorder=4)

                ax.get_yaxis().get_major_formatter().set_useOffset(False)
                ax.get_yaxis().get_major_formatter().set_scientific(False)

                # Shrink current axis's height by 10% on the bottom
                box = ax.get_position()
                ax.set_position([
                    box.x0, box.y0 + box.height * 0.1, box.width,
                    box.height * 0.9
                ])

                # Put a legend below current axis
                ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                          fancybox=True, shadow=True, ncol=4, fontsize='x-small')
                plt.rc('lines', lw=2, color='w')

                plt.grid(True)

                ax.grid(b=True, which='both', axis='both', color='lightgray',
                        linestyle='solid', alpha=0.5, linewidth=0.6)
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                rcParams['xtick.direction'] = 'out'
                rcParams['ytick.direction'] = 'out'
                ax.margins(y=.02, x=.03)
                # tight_layout removes the legend box
                # fig.tight_layout()
                try:
                    if LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - Memory usage before plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                    plt.savefig(buf, format='png')

                    if settings.IONOSPHERE_ENABLED:
                        if not os.path.exists(training_data_dir):
                            mkdir_p(training_data_dir)
                            logger.info('created dir - %s' % training_data_dir)
                        if not os.path.isfile(training_data_redis_image):
                            try:
                                plt.savefig(training_data_redis_image, format='png')
                                logger.info(
                                    'alert_smtp - save Redis training data image - %s' % (
                                        training_data_redis_image))
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: alert_smtp - could not save - %s' % (
                                        training_data_redis_image))
                        else:
                            logger.info(
                                'alert_smtp - Redis training data image already exists - %s' % (
                                    training_data_redis_image))

                    # @added 20160814 - Bug #1558: Memory leak in Analyzer
                    # As per http://www.mail-archive.com/[email protected]/msg13222.html
                    # savefig in the parent process was causing the memory leak
                    # the below fig.clf() and plt.close() did not resolve this
                    # however spawing a multiprocessing process for alert_smtp
                    # does solve this as issue as all memory is freed when the
                    # process terminates.
                    fig.clf()
                    plt.close(fig)
                    redis_graph_content_id = 'redis.%s' % metric[1]
                    redis_image_data = True
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - savefig: %s' % 'OK')
                        logger.info('debug :: alert_smtp - Memory usage after plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                except:
                    logger.info(traceback.format_exc())
                    logger.error('error :: alert_smtp - plt.savefig: %s' % 'FAIL')
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - could not build plot')

    if LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - Memory usage before email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        redis_img_tag = '<img src="cid:%s"/>' % redis_graph_content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - redis_img_tag: %s' % str(redis_img_tag))
    else:
        # @modified 20161229 - Feature #1830: Ionosphere alerts
        # @modified 20170108 - Feature #1852: Ionosphere - features_profile matched graphite graphs
        # Restored the previous redis_img_tag method as some smtp alerts were
        # coming without a Redis graph, not all but some and for some reason,
        # I am pretty certain retrospectively that it was done that way from
        # testing I just wanted to try and be cleaner.
        # The redis_img_tag was changed at
        # https://github.com/earthgecko/skyline/commit/31bcacf3f90f0953ebed0d57260cb937e01f887c#diff-520bf2a218f65074ffead4d8184c138dR489
        redis_img_tag = '<img src="%s"/>' % 'none'
        # redis_img_tag = '<img src="none"/>'

    # @added 20170806 - Feature #1830: Ionosphere alerts
    # Show a human date in alerts
    alerted_at = str(dt.datetime.utcfromtimestamp(int(metric[2])))

    try:
        body = '<h3><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> %s alert</font></h3><br>' % context
        body += '<font color="black">metric: <b>%s</b></font><br>' % metric[1]
        body += '<font color="black">Anomalous value: %s</font><br>' % str(
            metric[0])
        body += '<font color="black">Anomaly timestamp: %s</font><br>' % str(
            int(metric[2]))
        # @added 20170806 - Feature #1830: Ionosphere alerts
        # Show a human date in alerts
        body += '<font color="black">Anomalous at: %s</font><br>' % alerted_at
        body += '<font color="black">At hours: %s</font><br>' % str(
            int(full_duration_in_hours))
        body += '<font color="black">Next alert in: %s seconds</font><br>' % str(
            alert[2])
        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:
            body += '<font color="black">Derivative graph: True</font><br>'

        more_body = ''
        if settings.IONOSPHERE_ENABLED:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            more_body += '<h3><font color="#dd3023">Ionosphere :: </font><font color="#6698FF">training data</font><font color="black"></font></h3>'
            ionosphere_link = '%s/ionosphere?timestamp=%s&metric=%s' % (
                settings.SKYLINE_URL, str(int(metric[2])), str(metric[1]))
            more_body += '<font color="black">To use this timeseries to train Skyline that this is not anomalous manage this training data at:<br>'
            more_body += '<a href="%s">%s</a></font>' % (ionosphere_link, ionosphere_link)
        if redis_image_data:
            more_body += '<font color="black">min: %s | max: %s | mean: %s <br>' % (
                str(array_amin), str(array_amax), str(mean))
            more_body += '3-sigma: %s <br>' % str(sigma3)
            more_body += '3-sigma upper bound: %s | 3-sigma lower bound: %s <br></font>' % (
                str(sigma3_upper_bound), str(sigma3_lower_bound))
            more_body += '<h3><font color="black">Redis data at FULL_DURATION</font></h3><br>'
            more_body += '<div dir="ltr">:%s<br></div>' % redis_img_tag
        if image_data:
            more_body += '<h3><font color="black">Graphite data at FULL_DURATION (may be aggregated)</font></h3>'
            more_body += '<div dir="ltr"><a href="%s">%s</a><br></div><br>' % (
                link, img_tag)
            more_body += '<font color="black">Clicking on the above graph will open to the Graphite graph with current data</font><br>'
        if redis_image_data:
            more_body += '<font color="black">To disable the Redis data graph view, set PLOT_REDIS_DATA to False in your settings.py, if the Graphite graph is sufficient for you,<br>'
            more_body += 'however do note that will remove the 3-sigma and mean value too.</font>'
        more_body += '<br>'
        more_body += '<div dir="ltr" align="right"><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> version :: %s</font></div><br>' % str(
            skyline_version)
    except:
        logger.error('error :: alert_smtp - could not build body')
        logger.info(traceback.format_exc())

    # @modified 20180524 - Task #2384: Change alerters to cc other recipients
    # Do not send to each recipient, send to primary_recipient and cc the other
    # recipients, thereby sending only one email
    # for recipient in recipients:
    if primary_recipient:
        try:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP, using mixed as alternative indicates that
            # the client should select one of the parts for display and ignore
            # the rest (tripleee - https://stackoverflow.com/a/35115938)
            # msg = MIMEMultipart('alternative')
            msg = MIMEMultipart('mixed')

            # @added 20170812 - Bug #2142: 7bit SMTP encoding breaking long urls
            # set email charset and email encodings
            cs_ = charset.Charset('utf-8')
            cs_.header_encoding = charset.QP
            cs_.body_encoding = charset.QP
            msg.set_charset(cs_)

            msg['Subject'] = '[Skyline alert] - %s ALERT - %s' % (context, metric[1])
            msg['From'] = sender
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # msg['To'] = recipient
            msg['To'] = primary_recipient

            # @added 20180524 - Task #2384: Change alerters to cc other recipients
            # Added Cc
            if cc_recipients:
                msg['Cc'] = cc_recipients

            msg.attach(MIMEText(body, 'html'))
            # @added 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            msg.replace_header('content-transfer-encoding', 'quoted-printable')
            msg.attach(MIMEText(more_body, 'html'))

            if redis_image_data:
                try:
                    # @modified 20160814 - Bug #1558: Memory leak in Analyzer
                    # I think the buf is causing a memory leak, trying a file
                    # buf.seek(0)
                    # msg_plot_attachment = MIMEImage(buf.read())
                    # msg_plot_attachment = MIMEImage(buf.read())
                    try:
                        with open(buf, 'r') as f:
                            plot_image_data = f.read()
                        try:
                            os.remove(buf)
                        except OSError:
                            logger.error(
                                'error :: alert_smtp - failed to remove file - %s' % buf)
                            logger.info(traceback.format_exc())
                            pass
                    except:
                        logger.error('error :: failed to read plot file - %s' % buf)
                        plot_image_data = None

                    # @added 20161124 - Branch #922: ionosphere
                    msg_plot_attachment = MIMEImage(plot_image_data)
                    msg_plot_attachment.add_header(
                        'Content-ID', '<%s>' % redis_graph_content_id)
                    msg.attach(msg_plot_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_plot_attachment - redis data done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_plot_attachment')
                    logger.info(traceback.format_exc())

            if image_data is not None:
                try:
                    msg_attachment = MIMEImage(image_data)
                    msg_attachment.add_header('Content-ID', '<%s>' % content_id)
                    msg.attach(msg_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_attachment - Graphite img source done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_attachment')
                    logger.info(traceback.format_exc())
        except:
            logger.error('error :: alert_smtp - could not attach')
            logger.info(traceback.format_exc())

        s = SMTP('127.0.0.1')
        try:
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # Send to primary_recipient and cc_recipients
            # s.sendmail(sender, recipient, msg.as_string())
            if cc_recipients:
                # @modified - bug fix: sendmail requires one list element per
                # recipient address, the comma joined cc string is only valid
                # for the Cc header, it must be split for the SMTP envelope
                s.sendmail(
                    sender, [primary_recipient] + cc_recipients.split(','),
                    msg.as_string())
            else:
                s.sendmail(sender, primary_recipient, msg.as_string())
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                # logger.info('debug :: alert_smtp - message sent to %s OK' % str(recipient))
                logger.info(
                    'debug :: alert_smtp - message sent OK to primary_recipient :: %s, cc_recipients :: %s' %
                    (str(primary_recipient), str(cc_recipients)))
        except:
            logger.info(traceback.format_exc())
            # logger.error('error :: alert_smtp - could not send email to %s' % str(recipient))
            logger.error(
                'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s' %
                (str(primary_recipient), str(cc_recipients)))

        s.quit()

    if LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - Memory usage after email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        # buf.seek(0)
        # buf.write('none')
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage before del redis_image_data objects: %s (kb)' %
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        del raw_series
        del unpacker
        del timeseries[:]
        del timeseries_x[:]
        del timeseries_y[:]
        del values
        del datetimes[:]
        del msg_plot_attachment
        del redis_image_data
        # We del all variables that are floats as they become unique objects and
        # can result in what appears to be a memory leak, but is not, it is
        # just the way Python handles floats
        del mean
        del array_amin
        del array_amax
        del stdDev
        del sigma3
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage after del redis_image_data objects: %s (kb)' %
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before del fig object: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    # @added 20160814 - Bug #1558: Memory leak in Analyzer
    # Issue #21 Memory leak in Analyzer - https://github.com/earthgecko/skyline/issues/21
    # As per http://www.mail-archive.com/[email protected]/msg13222.html
    # @modified - bug fix: fig only exists if a Redis data plot was built, the
    # unconditional clean up raised a NameError when PLOT_REDIS_DATA is False
    # or the plot failed
    try:
        fig.clf()
        plt.close(fig)
        del fig
    except NameError:
        pass
    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after del fig object: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before del other objects: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    del recipients[:]
    del body
    # @modified - bug fix: msg and msg_attachment are only bound if an email
    # was composed and a Graphite image was attached respectively, guard the
    # dels so the clean up cannot raise a NameError
    try:
        del msg
    except NameError:
        pass
    del image_data
    try:
        del msg_attachment
    except NameError:
        pass
    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after del other objects: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    return
def luminosity_remote_data(anomaly_timestamp):
    """
    Gets all the unique_metrics from Redis and then mgets Redis data for all
    metrics.  The data is then preprocessed for the remote Skyline luminosity
    instance and only the relevant fragments of the time series are returned.
    This return is then gzipped by the Flask Webapp response to ensure the
    minimum about of bandwidth is used.

    :param anomaly_timestamp: the anomaly timestamp
    :type anomaly_timestamp: int
    :return: (luminosity_data, success, message) where luminosity_data is a
        list of [metric_name, correlate_ts] items
    :rtype: tuple
    """
    message = 'luminosity_remote_data returned'
    success = False
    luminosity_data = []
    logger.info('luminosity_remote_data :: determining unique_metrics')
    unique_metrics = []
    # If you modify the values of 61 or 600 here, it must be modified in the
    # luminosity_remote_data function in
    # skyline/luminosity/process_correlations.py as well
    from_timestamp = int(anomaly_timestamp) - 600
    until_timestamp = int(anomaly_timestamp) + 61
    try:
        unique_metrics = list(REDIS_CONN.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))
    except Exception as e:
        logger.error('error :: %s' % str(e))
        logger.error('error :: luminosity_remote_data :: could not determine unique_metrics from Redis set')
    if not unique_metrics:
        message = 'error :: luminosity_remote_data :: could not determine unique_metrics from Redis set'
        return luminosity_data, success, message
    logger.info('luminosity_remote_data :: %s unique_metrics' % str(len(unique_metrics)))

    # assigned metrics
    assigned_min = 0
    assigned_max = len(unique_metrics)
    assigned_keys = range(assigned_min, assigned_max)

    # Compile assigned metrics
    assigned_metrics = [unique_metrics[index] for index in assigned_keys]

    # Check if this process is unnecessary
    if len(assigned_metrics) == 0:
        message = 'error :: luminosity_remote_data :: assigned_metrics length is 0'
        logger.error(message)
        return luminosity_data, success, message

    # Multi get series
    raw_assigned_failed = True
    try:
        raw_assigned = REDIS_CONN.mget(assigned_metrics)
        raw_assigned_failed = False
    except:
        logger.info(traceback.format_exc())
        message = 'error :: luminosity_remote_data :: failed to mget raw_assigned'
        logger.error(message)
        return luminosity_data, success, message
    if raw_assigned_failed:
        message = 'error :: luminosity_remote_data :: failed to mget raw_assigned'
        logger.error(message)
        return luminosity_data, success, message

    # Distill timeseries strings into lists
    for i, metric_name in enumerate(assigned_metrics):
        timeseries = []
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []
        if not timeseries:
            continue

        # @added 20200507 - Feature #3532: Sort all time series
        # To ensure that there are no unordered timestamps in the time
        # series which are artefacts of the collector or carbon-relay, sort
        # all time series by timestamp before analysis.
        original_timeseries = timeseries
        if original_timeseries:
            timeseries = sort_timeseries(original_timeseries)
            del original_timeseries

        # Convert the time series if this is a known_derivative_metric
        base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
        known_derivative_metric = is_derivative_metric('webapp', base_name)
        if known_derivative_metric:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except:
                logger.error('error :: nonNegativeDerivative failed')

        correlate_ts = []
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                continue
            if int(ts) <= anomaly_timestamp:
                correlate_ts.append((int(ts), value))
            # @modified - bug fix: until_timestamp is already the absolute
            # timestamp (anomaly_timestamp + 61), the previous condition of
            # int(ts) > (anomaly_timestamp + until_timestamp) could never be
            # True so every time series was scanned to its end
            if int(ts) > until_timestamp:
                break
        if not correlate_ts:
            continue

        metric_data = [str(metric_name), correlate_ts]
        luminosity_data.append(metric_data)

    # @modified - bug fix: success was initialised to False and never set, so
    # the caller was always told the preprocessing failed
    success = True
    logger.info('luminosity_remote_data :: %s valid metric time series data preprocessed for the remote request' % str(len(luminosity_data)))
    return luminosity_data, success, message
def get_anomalous_ts(base_name, anomaly_timestamp):
    """
    Surface the anomalous metric's time series sample from Redis.

    Fetches the Redis time series for ``base_name`` (falling back to the
    Redis instances declared in ``settings.OTHER_SKYLINE_REDIS_INSTANCES``
    when the metric is not in the local Redis), converts it with
    ``nonNegativeDerivative`` if it is a known derivative metric, and
    returns the sample from (anomaly_timestamp - 600) up to
    anomaly_timestamp.

    :param base_name: the metric base_name
    :param anomaly_timestamp: the anomaly timestamp
    :type base_name: str
    :type anomaly_timestamp: int
    :return: the sampled time series as (int(timestamp), value) tuples, or
        an empty list when the metric cannot or should not be correlated
        (no alerter setting, not a local metric, no data retrieved, or an
        insufficient sample)
    :rtype: list
    """
    logger = logging.getLogger(skyline_app_logger)

    # @added 20180423 - Feature #2360: CORRELATE_ALERTS_ONLY
    #                   Branch #2270: luminosity
    # Only correlate metrics with an alert setting
    if correlate_alerts_only:
        try:
            # @modified 20200421 - Feature #3306: Record anomaly_end_timestamp
            # Use the aet (anomaly_end_timestamp) copy of the set rather than
            # the transient analyzer.smtp_alerter_metrics set, so that the
            # data always exists even if it is sourced from a transient set.
            smtp_alerter_metrics = list(redis_conn_decoded.smembers('aet.analyzer.smtp_alerter_metrics'))
        except:
            smtp_alerter_metrics = []
        if base_name not in smtp_alerter_metrics:
            logger.error('%s has no alerter setting, not correlating' % base_name)
            return []

    # Guard clause - nothing to do without both arguments
    if not base_name or not anomaly_timestamp:
        return []

    # The full Redis metric name, e.g. FULL_NAMESPACE + base_name
    anomalous_metric = '%s%s' % (settings.FULL_NAMESPACE, base_name)
    unique_metrics = []
    try:
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        # Branch #3262: py3 - use the decoded connection so members are str
        unique_metrics = list(redis_conn_decoded.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: get_assigned_metrics :: no unique_metrics')
        return []

    # @added 20180720 - Feature #2464: luminosity_remote_data
    # Ensure that Luminosity only processes it's own Redis metrics so that if
    # multiple Skyline instances are running, Luminosity does not process an
    # anomaly_id for a metric that is not local to itself.  This will stop the
    # call to the remote Redis with other_redis_conn below. With the
    # introduction of the preprocessing luminosity_remote_data API endpoint
    # for remote Skyline instances, there is no further requirement for
    # Skyline instances to have direct access to Redis on another Skyline
    # instance.  A much better solution and means all data is preprocessed
    # and encrypted, there is no need for iptables other than 443 (or custom
    # https port).
    if anomalous_metric in unique_metrics:
        logger.info('%s is a metric in Redis, processing on this Skyline instance' % base_name)
    else:
        logger.info('%s is not a metric in Redis, not processing on this Skyline instance' % base_name)
        return []

    # A single-element list keeps the mget/enumerate idiom consistent with
    # the other Redis fetch code paths in this file
    assigned_metrics = [anomalous_metric]
    # @modified 20180419 -
    raw_assigned = []
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except:
        raw_assigned = []
    # mget returns [None] (not []) when the key is missing
    if raw_assigned == [None]:
        logger.info('%s data not retrieved from local Redis' % (str(base_name)))
        raw_assigned = []

    # @modified 20180721 - Feature #2464: luminosity_remote_data
    # TO BE DEPRECATED settings.OTHER_SKYLINE_REDIS_INSTANCES
    # with the addition of the luminosity_remote_data API call and the above
    if not raw_assigned and settings.OTHER_SKYLINE_REDIS_INSTANCES:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # Each instance is now a (ip, port, password) triple
        for redis_ip, redis_port, redis_password in settings.OTHER_SKYLINE_REDIS_INSTANCES:
            # Stop trying further instances as soon as data is retrieved
            if not raw_assigned:
                try:
                    if redis_password:
                        other_redis_conn = StrictRedis(host=str(redis_ip), port=int(redis_port), password=str(redis_password))
                    else:
                        other_redis_conn = StrictRedis(host=str(redis_ip), port=int(redis_port))
                    raw_assigned = other_redis_conn.mget(assigned_metrics)
                    if raw_assigned == [None]:
                        logger.info('%s data not retrieved from Redis at %s on port %s' % (str(base_name), str(redis_ip), str(redis_port)))
                        raw_assigned = []
                    if raw_assigned:
                        logger.info('%s data retrieved from Redis at %s on port %s' % (str(base_name), str(redis_ip), str(redis_port)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to connect to Redis at %s on port %s' % (str(redis_ip), str(redis_port)))
                    raw_assigned = []

    if not raw_assigned or raw_assigned == [None]:
        logger.info('%s data not retrieved' % (str(base_name)))
        return []

    # Distill the msgpack time series string into a list
    for i, metric_name in enumerate(assigned_metrics):
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []

        # @added 20200507 - Feature #3532: Sort all time series
        # To ensure that there are no unordered timestamps in the time
        # series which are artefacts of the collector or carbon-relay, sort
        # all time series by timestamp before analysis.
        original_timeseries = timeseries
        if original_timeseries:
            timeseries = sort_timeseries(original_timeseries)
            del original_timeseries

        # Convert the time series if this is a known_derivative_metric
        known_derivative_metric = is_derivative_metric(skyline_app, base_name)
        if known_derivative_metric:
            derivative_timeseries = nonNegativeDerivative(timeseries)
            timeseries = derivative_timeseries

    # Sample the time series
    # @modified 20180720 - Feature #2464: luminosity_remote_data
    # Added note here - if you modify the value of 600 here, it must be
    # modified in the luminosity_remote_data function in
    # skyline/webapp/backend.py as well
    from_timestamp = anomaly_timestamp - 600
    anomaly_ts = []
    for ts, value in timeseries:
        if int(ts) < from_timestamp:
            continue
        if int(ts) <= anomaly_timestamp:
            anomaly_ts.append((int(ts), value))
        if int(ts) > anomaly_timestamp:
            break

    # @added 20190515 - Bug #3008: luminosity - do not analyse short time series
    # Only return a time series sample if the sample has sufficient data
    # points otherwise get_anomalies() will throw and error
    len_anomaly_ts = len(anomaly_ts)
    if len_anomaly_ts <= 9:
        logger.info('%s insufficient data not retrieved, only %s data points surfaced, not correlating' % (
            str(base_name), str(len_anomaly_ts)))
        return []

    return anomaly_ts
def get_correlations(
    base_name, anomaly_timestamp, anomalous_ts, assigned_metrics,
        raw_assigned, remote_assigned, anomalies):
    """
    Cross correlate an anomalous time series against the local Redis metrics
    and any preprocessed remote time series using the luminol Correlator.

    :param base_name: the anomalous metric base_name
    :param anomaly_timestamp: the anomaly timestamp
    :param anomalous_ts: the anomalous time series sample
    :param assigned_metrics: the full Redis metric names to correlate against
    :param raw_assigned: the Redis mget results for assigned_metrics
    :param remote_assigned: preprocessed [metric_name, timeseries] entries
        from remote Skyline instances
    :param anomalies: the anomaly objects (with exact_timestamp attributes)
        for the period
    :type base_name: str
    :type anomaly_timestamp: int
    :type anomalous_ts: list
    :type assigned_metrics: list
    :type raw_assigned: list
    :type remote_assigned: list
    :type anomalies: list
    :return: (correlated_metrics, correlations,
        metrics_checked_for_correlation, runtime)
    :rtype: tuple
    """
    logger = logging.getLogger(skyline_app_logger)

    start = timer()
    count = 0
    metrics_checked_for_correlation = 0

    # Sample the time series
    # @modified 20180720 - Feature #2464: luminosity_remote_data
    # Added note here - if you modify the value of 600 here, it must be
    # modified in the luminosity_remote_data function in
    # skyline/webapp/backend.py as well
    from_timestamp = anomaly_timestamp - 600

    correlated_metrics = []
    correlations = []
    no_data = False
    if not anomalous_ts:
        no_data = True
    if not assigned_metrics:
        no_data = True
    if not raw_assigned:
        no_data = True
    if not anomalies:
        no_data = True
    if no_data:
        logger.error('error :: get_correlations :: no data')
        # @modified - bug fix: the early returns previously returned a
        # 2-tuple while the normal path returns a 4-tuple, which would raise
        # a ValueError in any caller unpacking four values.  Always return
        # the same 4-tuple shape.
        return (correlated_metrics, correlations,
                metrics_checked_for_correlation, '%.6f' % (timer() - start))

    # @added 20200428 - Feature #3510: Enable Luminosity to handle correlating namespaces only
    # Discard the check if the anomaly_timestamp is not in FULL_DURATION as it
    # will have been added via the Crucible or webapp/crucible route
    start_timestamp_of_full_duration_data = int(time() - settings.FULL_DURATION)
    if anomaly_timestamp < (start_timestamp_of_full_duration_data + 2000):
        logger.info('get_correlations :: the anomaly_timestamp is too old not correlating')
        # @modified - bug fix: 4-tuple here too (see above)
        return (correlated_metrics, correlations,
                metrics_checked_for_correlation, '%.6f' % (timer() - start))

    start_local_correlations = timer()

    local_redis_metrics_checked_count = 0
    local_redis_metrics_correlations_count = 0

    logger.info('get_correlations :: the local Redis metric count is %s' % str(len(assigned_metrics)))

    # Correlate against each local Redis metric
    for i, metric_name in enumerate(assigned_metrics):
        count += 1
        correlated = None
        metric_base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
        # Do not correlate the anomalous metric with itself
        if str(metric_base_name) == str(base_name):
            continue
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []
        if not timeseries:
            continue

        # @added 20200507 - Feature #3532: Sort all time series
        # To ensure that there are no unordered timestamps in the time
        # series which are artefacts of the collector or carbon-relay, sort
        # all time series by timestamp before analysis.
        original_timeseries = timeseries
        if original_timeseries:
            timeseries = sort_timeseries(original_timeseries)
            del original_timeseries

        # Convert the time series if this is a known_derivative_metric
        known_derivative_metric = is_derivative_metric(skyline_app, metric_base_name)
        if known_derivative_metric:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: nonNegativeDerivative')

        correlate_ts = []
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                continue
            if int(ts) <= anomaly_timestamp:
                correlate_ts.append((int(ts), value))
            # If you modify the value of 61 here, it must be modified in the
            # luminosity_remote_data function in skyline/webapp/backend.py
            # as well
            if int(ts) > (anomaly_timestamp + 61):
                break
        if not correlate_ts:
            continue

        local_redis_metrics_checked_count += 1
        anomaly_ts_dict = dict(anomalous_ts)
        correlate_ts_dict = dict(correlate_ts)

        for a in anomalies:
            try:
                # Only consider anomalies within +/- 120 seconds of the
                # anomaly_timestamp.  If you modify the value of 120 here, it
                # must be modified in the luminosity_remote_data function in
                # skyline/webapp/backend.py as well
                if int(a.exact_timestamp) < int(anomaly_timestamp - 120):
                    continue
                if int(a.exact_timestamp) > int(anomaly_timestamp + 120):
                    continue
            except:
                continue
            try:
                time_period = (int(anomaly_timestamp - 120), int(anomaly_timestamp + 120))
                my_correlator = Correlator(anomaly_ts_dict, correlate_ts_dict, time_period)
                # For better correlation use 0.9 instead of 0.8 for the
                # threshold, configurable via settings
                try:
                    cross_correlation_threshold = settings.LUMINOL_CROSS_CORRELATION_THRESHOLD
                    metrics_checked_for_correlation += 1
                except:
                    cross_correlation_threshold = 0.9
                if my_correlator.is_correlated(threshold=cross_correlation_threshold):
                    correlation = my_correlator.get_correlation_result()
                    correlated = True
                    correlations.append([metric_base_name, correlation.coefficient, correlation.shift, correlation.shifted_coefficient])
                    local_redis_metrics_correlations_count += 1
            except:
                pass
        if correlated:
            correlated_metrics.append(metric_base_name)

    # @added 20180720 - Feature #2464: luminosity_remote_data
    # Added the correlation of preprocessed remote data
    end_local_correlations = timer()
    logger.info('get_correlations :: checked - local_redis_metrics_checked_count is %s' % str(local_redis_metrics_checked_count))
    logger.info('get_correlations :: correlated - local_redis_metrics_correlations_count is %s' % str(local_redis_metrics_correlations_count))
    logger.info('get_correlations :: processed %s correlations on local_redis_metrics_checked_count %s local metrics in %.6f seconds' % (
        str(local_redis_metrics_correlations_count),
        str(local_redis_metrics_checked_count),
        (end_local_correlations - start_local_correlations)))

    remote_metrics_count = 0
    remote_correlations_check_count = 0
    remote_correlations_count = 0
    logger.info('get_correlations :: remote_assigned count %s' % str(len(remote_assigned)))
    start_remote_correlations = timer()

    # Correlate against each preprocessed remote time series
    for ts_data in remote_assigned:
        remote_metrics_count += 1
        correlated = None
        metric_name = str(ts_data[0])
        metric_base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
        if str(metric_base_name) == str(base_name):
            continue
        timeseries = []
        try:
            timeseries = ts_data[1]
        except:
            timeseries = []
        if not timeseries:
            continue

        correlate_ts = []
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                continue
            if int(ts) <= anomaly_timestamp:
                correlate_ts.append((int(ts), value))
            # If you modify the value of 61 here, it must be modified in the
            # luminosity_remote_data function in skyline/webapp/backend.py
            # as well
            if int(ts) > (anomaly_timestamp + 61):
                break
        if not correlate_ts:
            continue

        anomaly_ts_dict = dict(anomalous_ts)
        correlate_ts_dict = dict(correlate_ts)

        for a in anomalies:
            try:
                # If you modify the value of 120 here, it must be modified in
                # the luminosity_remote_data function in
                # skyline/webapp/backend.py as well
                if int(a.exact_timestamp) < int(anomaly_timestamp - 120):
                    continue
                if int(a.exact_timestamp) > int(anomaly_timestamp + 120):
                    continue
            except:
                continue
            try:
                time_period = (int(anomaly_timestamp - 120), int(anomaly_timestamp + 120))
                my_correlator = Correlator(anomaly_ts_dict, correlate_ts_dict, time_period)
                metrics_checked_for_correlation += 1
                remote_correlations_check_count += 1
                try:
                    cross_correlation_threshold = settings.LUMINOL_CROSS_CORRELATION_THRESHOLD
                except:
                    cross_correlation_threshold = 0.9
                if my_correlator.is_correlated(threshold=cross_correlation_threshold):
                    correlation = my_correlator.get_correlation_result()
                    correlated = True
                    correlations.append([metric_base_name, correlation.coefficient, correlation.shift, correlation.shifted_coefficient])
                    remote_correlations_count += 1
            except:
                pass
        if correlated:
            correlated_metrics.append(metric_base_name)

    end_remote_correlations = timer()
    logger.info('get_correlations :: checked - remote_correlations_check_count is %s' % str(remote_correlations_check_count))
    logger.info('get_correlations :: correlated - remote_correlations_count is %s' % str(remote_correlations_count))
    # @modified - bug fix: this log line previously said 'local metric', a
    # copy-paste of the local correlations message; it reports remote metrics
    logger.info('get_correlations :: processed remote correlations on remote_metrics_count %s remote metrics in %.6f seconds' % (
        str(remote_metrics_count),
        (end_remote_correlations - start_remote_correlations)))

    end = timer()
    logger.info('get_correlations :: checked a total of %s metrics and correlated %s metrics to %s anomaly, processed in %.6f seconds' % (
        str(metrics_checked_for_correlation), str(len(correlated_metrics)),
        base_name, (end - start)))
    # @added 20170720 - Task #2462: Implement useful metrics for Luminosity
    # Added runtime to calculate avg_runtime Graphite metric
    runtime = '%.6f' % (end - start)
    return (correlated_metrics, correlations, metrics_checked_for_correlation, runtime)
def luminosity_remote_data(anomaly_timestamp, resolution):
    """
    Gets all the unique_metrics from Redis and then mgets Redis data for all
    metrics.  The data is then preprocessed for the remote Skyline luminosity
    instance and only the relevant fragments of the time series are returned.
    This return is then gzipped by the Flask Webapp response to ensure the
    minimum about of bandwidth is used.

    :param anomaly_timestamp: the anomaly timestamp
    :param resolution: the metric resolution in seconds, used to size the
        time series window that is surfaced around the anomaly
    :type anomaly_timestamp: int
    :type resolution: int
    :return: list
    :rtype: list
    """
    message = 'luminosity_remote_data returned'
    # NOTE(review): success is never set to True on the happy path; callers
    # appear to rely on luminosity_data being populated - confirm.
    success = False
    luminosity_data = []
    logger.info('luminosity_remote_data :: determining unique_metrics')

    unique_metrics = []
    # If you modify the values of 61 or 600 here, it must be modified in the
    # luminosity_remote_data function in
    # skyline/luminosity/process_correlations.py as well
    # @modified 20201203 - Feature #3860: luminosity - handle low frequency data
    # Use the metric resolution rather than the previously hardcoded
    # from_timestamp = int(anomaly_timestamp) - 600 and
    # until_timestamp = int(anomaly_timestamp) + 61
    from_timestamp = int(anomaly_timestamp) - (resolution * 10)
    until_timestamp = int(anomaly_timestamp) + (resolution + 1)
    try:
        # @modified 20201123 - Feature #3824: get_cluster_data
        # Bug #3266: py3 Redis binary objects not strings
        # Use a decoded connection so set members are str not bytes
        REDIS_CONN_DECODED = get_redis_conn_decoded(skyline_app)
        unique_metrics = list(REDIS_CONN_DECODED.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))
    except Exception as e:
        logger.error('error :: %s' % str(e))
        logger.error('error :: luminosity_remote_data :: could not determine unique_metrics from Redis set')
    if not unique_metrics:
        message = 'error :: luminosity_remote_data :: could not determine unique_metrics from Redis set'
        return luminosity_data, success, message
    logger.info('luminosity_remote_data :: %s unique_metrics' % str(len(unique_metrics)))

    # @added 20210125 - Feature #3956: luminosity - motifs
    # Improve luminosity_remote_data performance
    # Although the is_derivative_metric function is appropriate in the below
    # loop here that is not the most performant manner in which to determine
    # if the metrics are derivatives, as it needs to fire on every metric, so
    # here we just trust the Redis derivative_metrics list.  This increases
    # performance on 1267 metrics from 6.442009 seconds to 1.473067 seconds
    try:
        derivative_metrics = list(REDIS_CONN_DECODED.smembers('derivative_metrics'))
    except:
        derivative_metrics = []

    # assigned metrics - all unique_metrics, the assigned_* idiom is kept
    # for consistency with the analyzer spin_process style
    assigned_min = 0
    assigned_max = len(unique_metrics)
    assigned_keys = range(assigned_min, assigned_max)

    # Compile assigned metrics
    assigned_metrics = [unique_metrics[index] for index in assigned_keys]

    # Check if this process is unnecessary
    if len(assigned_metrics) == 0:
        message = 'error :: luminosity_remote_data :: assigned_metrics length is 0'
        logger.error(message)
        return luminosity_data, success, message

    # Multi get series
    raw_assigned_failed = True
    try:
        raw_assigned = REDIS_CONN.mget(assigned_metrics)
        raw_assigned_failed = False
    except:
        logger.info(traceback.format_exc())
        message = 'error :: luminosity_remote_data :: failed to mget raw_assigned'
        logger.error(message)
        return luminosity_data, success, message
    if raw_assigned_failed:
        message = 'error :: luminosity_remote_data :: failed to mget raw_assigned'
        logger.error(message)
        return luminosity_data, success, message

    # Distill timeseries strings into lists
    for i, metric_name in enumerate(assigned_metrics):
        timeseries = []
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []
        if not timeseries:
            continue

        # @added 20200507 - Feature #3532: Sort all time series
        # To ensure that there are no unordered timestamps in the time
        # series which are artefacts of the collector or carbon-relay, sort
        # all time series by timestamp before analysis.
        original_timeseries = timeseries
        if original_timeseries:
            timeseries = sort_timeseries(original_timeseries)
            del original_timeseries

        # @added 20201117 - Feature #3824: get_cluster_data
        # Bug #3266: py3 Redis binary objects not strings
        # Convert metric_name bytes to str
        metric_name = str(metric_name)

        # @modified 20210125 - Feature #3956: luminosity - motifs
        # Trust the Redis derivative_metrics list rather than calling
        # is_derivative_metric per metric (see performance note above)
        known_derivative_metric = False
        if metric_name in derivative_metrics:
            known_derivative_metric = True
        if known_derivative_metric:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except:
                logger.error('error :: nonNegativeDerivative failed')

        # @modified 20210125 - Feature #3956: luminosity - motifs
        # The list comprehension method halves the time to create the
        # correlate_ts from 0.0008357290644198656 to 0.0004676780663430691
        # seconds compared to the previous loop/append implementation
        correlate_ts = [x for x in timeseries if x[0] >= from_timestamp if x[0] <= until_timestamp]
        if not correlate_ts:
            continue
        metric_data = [str(metric_name), correlate_ts]
        luminosity_data.append(metric_data)

    logger.info('luminosity_remote_data :: %s valid metric time series data preprocessed for the remote request' % str(len(luminosity_data)))

    return luminosity_data, success, message
def get_metric_timeseries(current_skyline_app, metric_name, log=True):
    """
    Return a metric time series as a list e.g.
    [[ts, value], [ts, value], ..., [ts, value]]

    The raw msgpack time series is fetched from Redis and, if the metric is
    in the aet.metrics_manager.derivative_metrics Redis set, it is converted
    with nonNegativeDerivative before being returned.

    :param current_skyline_app: the app calling the function
    :param metric_name: the full Redis metric name
    :param log: whether to log or not, optional, defaults to True.  Note
        that errors are always logged - when log is False a logger is
        created on demand in the error paths.
    :type current_skyline_app: str
    :type metric_name: str
    :type log: boolean
    :return: timeseries
    :rtype: list
    """
    function_str = 'functions.redis.get_metric_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        # Errors below still get logged - the logger is created lazily in
        # each except block when log is False
        current_logger = None

    timeseries = []
    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to fetch time series for %s - %s' % (
                function_str, metric_name, e))

    # Normalise to the full Redis key, prefixing FULL_NAMESPACE if needed
    if metric_name.startswith(FULL_NAMESPACE):
        metric_name = str(metric_name)
    else:
        metric_name = '%s%s' % (FULL_NAMESPACE, str(metric_name))

    raw_series = None
    try:
        raw_series = redis_conn.get(metric_name)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: failed to get %s from Redis - %s' % (
            metric_name, e))
        raw_series = None
    if not raw_series:
        return timeseries

    # Distill the msgpack time series string into a list
    try:
        unpacker = Unpacker(use_list=False)
        unpacker.feed(raw_series)
        timeseries = list(unpacker)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: failed to unpack %s time series from Redis data - %s' % (
                metric_name, e))
        timeseries = []

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to get derivative_metrics - %s' % (
                function_str, e))
    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # Use the aet copy of the derivative_metrics set
        derivative_metrics = list(
            redis_conn_decoded.smembers(
                'aet.metrics_manager.derivative_metrics'))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis for smembers of derivative_metrics - %s' % (
                function_str, e))
        derivative_metrics = []

    # Convert the time series if this is a known derivative metric, only
    # when there are enough data points to derive from
    if metric_name in derivative_metrics:
        if len(timeseries) > 3:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(
                        current_skyline_app_logger)
                current_logger.error(
                    'error :: %s :: nonNegativeDerivative failed - %s' % (
                        function_str, e))

    return timeseries
def spin_process(self, i, unique_metrics): """ Assign a bunch of metrics for a process to analyze. Multiple get the assigned_metrics to the process from Redis. For each metric: - unpack the `raw_timeseries` for the metric. - Analyse each timeseries against `ALGORITHMS` to determine if it is anomalous. - If anomalous add it to the :obj:`self.anomalous_metrics` list - Add what algorithms triggered to the :obj:`self.anomaly_breakdown_q` queue - If :mod:`settings.ENABLE_CRUCIBLE` is ``True``: - Add a crucible data file with the details about the timeseries and anomaly. - Write the timeseries to a json file for crucible. Add keys and values to the queue so the parent process can collate for:\n * :py:obj:`self.anomaly_breakdown_q` * :py:obj:`self.exceptions_q` """ spin_start = time() logger.info('spin_process started') if LOCAL_DEBUG: logger.info('debug :: Memory usage spin_process start: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # TESTING removal of p.join() from p.terminate() # sleep(4) # @modified 20160801 - Adding additional exception handling to Analyzer # Check the unique_metrics list is valid try: len(unique_metrics) except: logger.error('error :: the unique_metrics list is not valid') logger.info(traceback.format_exc()) logger.info('nothing to do, no unique_metrics') return # Discover assigned metrics keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES))) if i == settings.ANALYZER_PROCESSES: assigned_max = len(unique_metrics) else: assigned_max = min(len(unique_metrics), i * keys_per_processor) # Fix analyzer worker metric assignment #94 # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix assigned_min = (i - 1) * keys_per_processor assigned_keys = range(assigned_min, assigned_max) # assigned_keys = range(300, 310) # Compile assigned metrics assigned_metrics = [unique_metrics[index] for index in assigned_keys] if LOCAL_DEBUG: logger.info('debug :: Memory usage spin_process after 
assigned_metrics: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # @added 20190410 - Feature #2916: ANALYZER_ENABLED setting if not ANALYZER_ENABLED: len_assigned_metrics = len(assigned_metrics) logger.info('ANALYZER_ENABLED is set to %s removing the %s assigned_metrics' % ( str(ANALYZER_ENABLED), str(len_assigned_metrics))) assigned_metrics = [] del unique_metrics # Check if this process is unnecessary if len(assigned_metrics) == 0: return # Multi get series # @modified 20160801 - Adding additional exception handling to Analyzer raw_assigned_failed = True try: raw_assigned = self.redis_conn.mget(assigned_metrics) raw_assigned_failed = False if LOCAL_DEBUG: logger.info('debug :: Memory usage spin_process after raw_assigned: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.info(traceback.format_exc()) logger.error('error :: failed to get assigned_metrics from Redis') # Make process-specific dicts exceptions = defaultdict(int) anomaly_breakdown = defaultdict(int) # @added 20160803 - Adding additional exception handling to Analyzer if raw_assigned_failed: return # @added 20161119 - Branch #922: ionosphere # Task #1718: review.tsfresh # Determine the unique Mirage and Ionosphere metrics once, which are # used later to determine how Analyzer should handle/route anomalies try: mirage_unique_metrics = list(self.redis_conn.smembers('mirage.unique_metrics')) except: mirage_unique_metrics = [] # @added 20190408 - Feature #2882: Mirage - periodic_check # Add Mirage periodic checks so that Mirage is analysing each metric at # least once per hour. 
mirage_periodic_check_metric_list = [] try: mirage_periodic_check_enabled = settings.MIRAGE_PERIODIC_CHECK except: mirage_periodic_check_enabled = False try: mirage_periodic_check_interval = settings.MIRAGE_PERIODIC_CHECK_INTERVAL except: mirage_periodic_check_interval = 3600 mirage_periodic_check_interval_minutes = int(int(mirage_periodic_check_interval) / 60) if mirage_unique_metrics and mirage_periodic_check_enabled: mirage_unique_metrics_count = len(mirage_unique_metrics) # Mirage periodic checks are only done on declared namespaces as to # process all Mirage metrics periodically would probably create a # substantial load on Graphite and is probably not required only key # metrics should be analysed by Mirage periodically. periodic_check_mirage_metrics = [] try: mirage_periodic_check_namespaces = settings.MIRAGE_PERIODIC_CHECK_NAMESPACES except: mirage_periodic_check_namespaces = [] for namespace in mirage_periodic_check_namespaces: for metric_name in mirage_unique_metrics: metric_namespace_elements = metric_name.split('.') mirage_periodic_metric = False for periodic_namespace in mirage_periodic_check_namespaces: if not namespace in mirage_periodic_check_namespaces: continue periodic_namespace_namespace_elements = periodic_namespace.split('.') elements_matched = set(metric_namespace_elements) & set(periodic_namespace_namespace_elements) if len(elements_matched) == len(periodic_namespace_namespace_elements): mirage_periodic_metric = True break if mirage_periodic_metric: if not metric_name in periodic_check_mirage_metrics: periodic_check_mirage_metrics.append(metric_name) periodic_check_mirage_metrics_count = len(periodic_check_mirage_metrics) logger.info( 'there are %s known Mirage periodic metrics' % ( str(periodic_check_mirage_metrics_count))) for metric_name in periodic_check_mirage_metrics: try: self.redis_conn.sadd('new.mirage.periodic_check.metrics.all', metric_name) except Exception as e: logger.error('error :: could not add %s to Redis set 
new.mirage.periodic_check.metrics.all: %s' % ( metric_name, e)) try: self.redis_conn.rename('mirage.periodic_check.metrics.all', 'mirage.periodic_check.metrics.all.old') except: pass try: self.redis_conn.rename('new.mirage.periodic_check.metrics.all', 'mirage.periodic_check.metrics.all') except: pass try: self.redis_conn.delete('mirage.periodic_check.metrics.all.old') except: pass if periodic_check_mirage_metrics_count > mirage_periodic_check_interval_minutes: mirage_periodic_checks_per_minute = periodic_check_mirage_metrics_count / mirage_periodic_check_interval_minutes else: mirage_periodic_checks_per_minute = 1 logger.info( '%s Mirage periodic checks can be added' % ( str(int(mirage_periodic_checks_per_minute)))) for metric_name in periodic_check_mirage_metrics: if len(mirage_periodic_check_metric_list) == int(mirage_periodic_checks_per_minute): break base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) mirage_periodic_check_cache_key = 'mirage.periodic_check.%s' % base_name mirage_periodic_check_key = False try: mirage_periodic_check_key = self.redis_conn.get(mirage_periodic_check_cache_key) except Exception as e: logger.error('error :: could not query Redis for cache_key: %s' % e) if not mirage_periodic_check_key: try: key_created_at = int(time()) self.redis_conn.setex( mirage_periodic_check_cache_key, mirage_periodic_check_interval, key_created_at) logger.info( 'created Mirage periodic_check Redis key - %s' % (mirage_periodic_check_cache_key)) mirage_periodic_check_metric_list.append(metric_name) try: self.redis_conn.sadd('new.mirage.periodic_check.metrics', metric_name) except Exception as e: logger.error('error :: could not add %s to Redis set new.mirage.periodic_check.metrics: %s' % ( metric_name, e)) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to create Mirage periodic_check Redis key - %s' % (mirage_periodic_check_cache_key)) try: self.redis_conn.rename('mirage.periodic_check.metrics', 
'mirage.periodic_check.metrics.old') except: pass try: self.redis_conn.rename('new.mirage.periodic_check.metrics', 'mirage.periodic_check.metrics') except: pass try: self.redis_conn.delete('mirage.periodic_check.metrics.old') except: pass mirage_periodic_check_metric_list_count = len(mirage_periodic_check_metric_list) logger.info( '%s Mirage periodic checks were added' % ( str(mirage_periodic_check_metric_list_count))) try: ionosphere_unique_metrics = list(self.redis_conn.smembers('ionosphere.unique_metrics')) except: ionosphere_unique_metrics = [] # @added 20170602 - Feature #2034: analyse_derivatives # In order to convert monotonic, incrementing metrics to a deriative # metric try: derivative_metrics = list(self.redis_conn.smembers('derivative_metrics')) except: derivative_metrics = [] try: non_derivative_metrics = list(self.redis_conn.smembers('non_derivative_metrics')) except: non_derivative_metrics = [] # This is here to refresh the sets try: manage_derivative_metrics = self.redis_conn.get('analyzer.derivative_metrics_expiry') except Exception as e: if LOCAL_DEBUG: logger.error('error :: could not query Redis for analyzer.derivative_metrics_expiry key: %s' % str(e)) manage_derivative_metrics = False # @added 20170901 - Bug #2154: Infrequent missing new_ Redis keys # If the analyzer.derivative_metrics_expiry is going to expire in the # next 60 seconds, just manage the derivative_metrics in the run as # there is an overlap some times where the key existed at the start of # the run but has expired by the end of the run. 
derivative_metrics_expiry_ttl = False if manage_derivative_metrics: try: derivative_metrics_expiry_ttl = self.redis_conn.ttl('analyzer.derivative_metrics_expiry') logger.info('the analyzer.derivative_metrics_expiry key ttl is %s' % str(derivative_metrics_expiry_ttl)) except: logger.error('error :: could not query Redis for analyzer.derivative_metrics_expiry key: %s' % str(e)) if derivative_metrics_expiry_ttl: if int(derivative_metrics_expiry_ttl) < 60: logger.info('managing derivative_metrics as the analyzer.derivative_metrics_expiry key ttl is less than 60 with %s' % str(derivative_metrics_expiry_ttl)) manage_derivative_metrics = False try: self.redis_conn.delete('analyzer.derivative_metrics_expiry') logger.info('deleted the Redis key analyzer.derivative_metrics_expiry') except: logger.error('error :: failed to delete Redis key :: analyzer.derivative_metrics_expiry') try: non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS except: non_derivative_monotonic_metrics = [] # @added 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # Added Redis sets for Boring, TooShort and Stale redis_set_errors = 0 # Distill timeseries strings into lists for i, metric_name in enumerate(assigned_metrics): self.check_if_parent_is_alive() # logger.info('analysing %s' % metric_name) try: raw_series = raw_assigned[i] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) except: timeseries = [] # @added 20170602 - Feature #2034: analyse_derivatives # In order to convert monotonic, incrementing metrics to a deriative # metric known_derivative_metric = False unknown_deriv_status = True if metric_name in non_derivative_metrics: unknown_deriv_status = False if unknown_deriv_status: if metric_name in derivative_metrics: known_derivative_metric = True unknown_deriv_status = False # This is here to refresh the sets if not manage_derivative_metrics: unknown_deriv_status = True base_name = 
metric_name.replace(settings.FULL_NAMESPACE, '', 1) # @added 20170617 - Bug #2050: analyse_derivatives - change in monotonicity # First check if it has its own Redis z.derivative_metric key # that has not expired derivative_metric_key = 'z.derivative_metric.%s' % str(base_name) if unknown_deriv_status: # @added 20170617 - Bug #2050: analyse_derivatives - change in monotonicity last_derivative_metric_key = False try: last_derivative_metric_key = self.redis_conn.get(derivative_metric_key) except Exception as e: logger.error('error :: could not query Redis for last_derivative_metric_key: %s' % e) # Determine if it is a strictly increasing monotonically metric # or has been in last FULL_DURATION via its z.derivative_metric # key if not last_derivative_metric_key: is_strictly_increasing_monotonically = strictly_increasing_monotonicity(timeseries) if is_strictly_increasing_monotonically: try: last_expire_set = int(time()) self.redis_conn.setex( derivative_metric_key, settings.FULL_DURATION, last_expire_set) except Exception as e: logger.error('error :: could not set Redis derivative_metric key: %s' % e) else: # Until the z.derivative_metric key expires, it is classed # as such is_strictly_increasing_monotonically = True skip_derivative = in_list(base_name, non_derivative_monotonic_metrics) if skip_derivative: is_strictly_increasing_monotonically = False if is_strictly_increasing_monotonically: known_derivative_metric = True try: self.redis_conn.sadd('derivative_metrics', metric_name) except: logger.info(traceback.format_exc()) logger.error('error :: failed to add metric to Redis derivative_metrics set') try: self.redis_conn.sadd('new_derivative_metrics', metric_name) except: logger.info(traceback.format_exc()) logger.error('error :: failed to add metric to Redis new_derivative_metrics set') else: try: self.redis_conn.sadd('non_derivative_metrics', metric_name) except: logger.info(traceback.format_exc()) logger.error('error :: failed to add metric to Redis 
non_derivative_metrics set') try: self.redis_conn.sadd('new_non_derivative_metrics', metric_name) except: logger.info(traceback.format_exc()) logger.error('error :: failed to add metric to Redis new_non_derivative_metrics set') if known_derivative_metric: try: derivative_timeseries = nonNegativeDerivative(timeseries) timeseries = derivative_timeseries except: logger.error('error :: nonNegativeDerivative failed') # @added 20180903 - Feature #2580: illuminance # Feature #1986: flux try: illuminance_datapoint = timeseries[-1][1] if '.illuminance' not in metric_name: self.illuminance_datapoints.append(illuminance_datapoint) except: pass try: anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name) # @added 20190408 - Feature #2882: Mirage - periodic_check # Add for Mirage periodic - is really anomalous add to # real_anomalous_metrics and if in mirage_periodic_check_metric_list # add as anomalous if anomalous: # @modified 20190412 - Bug #2932: self.real_anomalous_metrics not being populated correctly # Feature #2882: Mirage - periodic_check # self.real_anomalous_metrics.append(base_name) base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) metric_timestamp = timeseries[-1][0] metric = [datapoint, base_name, metric_timestamp] self.real_anomalous_metrics.append(metric) if metric_name in mirage_periodic_check_metric_list: self.mirage_periodic_check_metrics.append(base_name) anomalous = True # If it's anomalous, add it to list if anomalous: base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) metric_timestamp = timeseries[-1][0] metric = [datapoint, base_name, metric_timestamp] self.anomalous_metrics.append(metric) # Get the anomaly breakdown - who returned True? 
triggered_algorithms = [] for index, value in enumerate(ensemble): if value: algorithm = settings.ALGORITHMS[index] anomaly_breakdown[algorithm] += 1 triggered_algorithms.append(algorithm) # It could have been deleted by the Roomba except TypeError: # logger.error('TypeError analysing %s' % metric_name) exceptions['DeletedByRoomba'] += 1 except TooShort: # logger.error('TooShort analysing %s' % metric_name) exceptions['TooShort'] += 1 except Stale: # logger.error('Stale analysing %s' % metric_name) exceptions['Stale'] += 1 except Boring: # logger.error('Boring analysing %s' % metric_name) exceptions['Boring'] += 1 except: # logger.error('Other analysing %s' % metric_name) exceptions['Other'] += 1 logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.exceptions_q.put((key, value)) spin_end = time() - spin_start logger.info('spin_process took %.2f seconds' % spin_end)
def get_anomalous_ts(base_name, anomaly_timestamp):
    """
    Fetch the timeseries for base_name from Redis (falling back to any
    configured OTHER_SKYLINE_REDIS_INSTANCES) and return the sampled window
    of ``(int(timestamp), value)`` tuples covering the 600 seconds up to and
    including anomaly_timestamp.

    :param base_name: the metric base_name (without the FULL_NAMESPACE prefix)
    :param anomaly_timestamp: the unix timestamp of the anomaly
    :return: list of (timestamp, value) tuples, or [] when the metric has no
        alerter setting, is not local to this Skyline instance, or no data
        could be retrieved
    :rtype: list
    """
    logger = logging.getLogger(skyline_app_logger)

    # @added 20180423 - Feature #2360: CORRELATE_ALERTS_ONLY
    #                   Branch #2270: luminosity
    # Only correlate metrics with an alert setting
    if correlate_alerts_only:
        try:
            smtp_alerter_metrics = list(
                redis_conn.smembers('analyzer.smtp_alerter_metrics'))
        except:
            smtp_alerter_metrics = []
        if base_name not in smtp_alerter_metrics:
            logger.error('%s has no alerter setting, not correlating' % base_name)
            return []

    if not base_name or not anomaly_timestamp:
        return []

    anomalous_metric = '%s%s' % (settings.FULL_NAMESPACE, base_name)
    unique_metrics = []
    try:
        unique_metrics = list(
            redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))
    except:
        logger.error(traceback.format_exc())
        # fixed: error message previously misattributed this failure to
        # get_assigned_metrics, which made the log misleading to operators
        logger.error('error :: get_anomalous_ts :: no unique_metrics')
        return []

    # @added 20180720 - Feature #2464: luminosity_remote_data
    # Ensure that Luminosity only processes its own Redis metrics so that if
    # multiple Skyline instances are running, Luminosity does not process an
    # anomaly_id for a metric that is not local to itself.  This stops the
    # call to the remote Redis with other_redis_conn below.  With the
    # introduction of the preprocessing luminosity_remote_data API endpoint
    # for remote Skyline instances, there is no further requirement for
    # Skyline instances to have direct access to Redis on another Skyline
    # instance.
    if anomalous_metric in unique_metrics:
        logger.info(
            '%s is a metric in Redis, processing on this Skyline instance' % base_name)
    else:
        logger.info(
            '%s is not a metric in Redis, not processing on this Skyline instance' % base_name)
        return []

    assigned_metrics = [anomalous_metric]
    raw_assigned = []
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except:
        raw_assigned = []
    if raw_assigned == [None]:
        logger.info('%s data not retrieved from local Redis' % (str(base_name)))
        raw_assigned = []

    # @modified 20180721 - Feature #2464: luminosity_remote_data
    # TO BE DEPRECATED settings.OTHER_SKYLINE_REDIS_INSTANCES with the
    # addition of the luminosity_remote_data API call and the above check
    if not raw_assigned and settings.OTHER_SKYLINE_REDIS_INSTANCES:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # for redis_ip, redis_port in settings.OTHER_SKYLINE_REDIS_INSTANCES:
        for redis_ip, redis_port, redis_password in settings.OTHER_SKYLINE_REDIS_INSTANCES:
            # Stop trying further remote instances once data is retrieved
            if not raw_assigned:
                try:
                    if redis_password:
                        other_redis_conn = StrictRedis(
                            host=str(redis_ip), port=int(redis_port),
                            password=str(redis_password))
                    else:
                        other_redis_conn = StrictRedis(
                            host=str(redis_ip), port=int(redis_port))
                    raw_assigned = other_redis_conn.mget(assigned_metrics)
                    if raw_assigned == [None]:
                        logger.info(
                            '%s data not retrieved from Redis at %s on port %s' % (
                                str(base_name), str(redis_ip), str(redis_port)))
                        raw_assigned = []
                    if raw_assigned:
                        logger.info(
                            '%s data retrieved from Redis at %s on port %s' % (
                                str(base_name), str(redis_ip), str(redis_port)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: failed to connect to Redis at %s on port %s' % (
                            str(redis_ip), str(redis_port)))
                    raw_assigned = []

    if not raw_assigned or raw_assigned == [None]:
        logger.info('%s data not retrieved' % (str(base_name)))
        return []

    # fixed: bind timeseries before the loop so it cannot be referenced
    # unbound below if assigned_metrics were ever empty
    timeseries = []
    for i, metric_name in enumerate(assigned_metrics):
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []

    # Convert the time series if this is a known_derivative_metric
    known_derivative_metric = is_derivative_metric(skyline_app, base_name)
    if known_derivative_metric:
        derivative_timeseries = nonNegativeDerivative(timeseries)
        timeseries = derivative_timeseries

    # Sample the time series
    # @modified 20180720 - Feature #2464: luminosity_remote_data
    # NOTE: if the value of 600 here is modified, it must also be modified in
    # the luminosity_remote_data function in skyline/webapp/backend.py
    from_timestamp = anomaly_timestamp - 600
    anomaly_ts = []
    for ts, value in timeseries:
        if int(ts) < from_timestamp:
            continue
        if int(ts) <= anomaly_timestamp:
            anomaly_ts.append((int(ts), value))
        if int(ts) > anomaly_timestamp:
            break
    return anomaly_ts
def get_anomalous_ts(base_name, anomaly_timestamp):
    """
    Fetch the timeseries for base_name from Redis (falling back to any
    configured OTHER_SKYLINE_REDIS_INSTANCES) and return the sampled window
    of ``(int(timestamp), value)`` tuples from ``anomaly_timestamp - 600``
    up to and including ``anomaly_timestamp``.

    NOTE(review): this redefines the get_anomalous_ts declared earlier in
    this file; the later definition wins at import time.  This variant
    returns False (not []) on failure and performs no unique_metrics
    locality check - confirm which version is intended to be live.

    :param base_name: the metric base_name (without the FULL_NAMESPACE prefix)
    :param anomaly_timestamp: the unix timestamp of the anomaly
    :return: anomaly_ts list of (timestamp, value) tuples, or False
    """
    logger = logging.getLogger(skyline_app_logger)

    # @added 20180423 - Feature #2360: CORRELATE_ALERTS_ONLY
    #                   Branch #2270: luminosity
    # Only correlate metrics with an alert setting
    if correlate_alerts_only:
        try:
            smtp_alerter_metrics = list(
                redis_conn.smembers('analyzer.smtp_alerter_metrics'))
        except:
            # best effort - an empty list below causes the metric to be
            # treated as having no alerter setting
            smtp_alerter_metrics = []
        if base_name not in smtp_alerter_metrics:
            logger.error('%s has no alerter setting, not correlating' % base_name)
            return False

    if not base_name or not anomaly_timestamp:
        return False

    # from skyline_functions import nonNegativeDerivative
    anomalous_metric = '%s%s' % (settings.FULL_NAMESPACE, base_name)
    assigned_metrics = [anomalous_metric]
    # @modified 20180419 -
    raw_assigned = []
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except:
        raw_assigned = []
    if raw_assigned == [None]:
        logger.info('%s data not retrieved from local Redis' % (str(base_name)))
        raw_assigned = []

    # Fall back to remote Skyline Redis instances when the local Redis had
    # no data for the metric
    if not raw_assigned and settings.OTHER_SKYLINE_REDIS_INSTANCES:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # for redis_ip, redis_port in settings.OTHER_SKYLINE_REDIS_INSTANCES:
        for redis_ip, redis_port, redis_password in settings.OTHER_SKYLINE_REDIS_INSTANCES:
            # Stop trying further remote instances once data is retrieved
            if not raw_assigned:
                try:
                    if redis_password:
                        other_redis_conn = StrictRedis(
                            host=str(redis_ip), port=int(redis_port),
                            password=str(redis_password))
                    else:
                        other_redis_conn = StrictRedis(host=str(redis_ip), port=int(redis_port))
                    raw_assigned = other_redis_conn.mget(assigned_metrics)
                    if raw_assigned == [None]:
                        logger.info(
                            '%s data not retrieved from Redis at %s on port %s' % (
                                str(base_name), str(redis_ip), str(redis_port)))
                        raw_assigned = []
                    if raw_assigned:
                        logger.info(
                            '%s data retrieved from Redis at %s on port %s' % (
                                str(base_name), str(redis_ip), str(redis_port)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: failed to connect to Redis at %s on port %s' % (
                            str(redis_ip), str(redis_port)))
                    raw_assigned = []

    if not raw_assigned or raw_assigned == [None]:
        logger.info('%s data not retrieved' % (str(base_name)))
        return False

    # Unpack the msgpack-encoded Redis payload into a list of (ts, value)
    for i, metric_name in enumerate(assigned_metrics):
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []

    # Convert the time series if this is a known_derivative_metric
    known_derivative_metric = is_derivative_metric(skyline_app, base_name)
    if known_derivative_metric:
        derivative_timeseries = nonNegativeDerivative(timeseries)
        timeseries = derivative_timeseries

    # Sample the time series - keep only the 600 seconds up to and including
    # the anomaly timestamp
    from_timestamp = anomaly_timestamp - 600
    anomaly_ts = []
    for ts, value in timeseries:
        if int(ts) < from_timestamp:
            continue
        if int(ts) <= anomaly_timestamp:
            anomaly_ts.append((int(ts), value))
        if int(ts) > anomaly_timestamp:
            break
    return anomaly_ts
def get_correlations(
        base_name, anomaly_timestamp, anomalous_ts, assigned_metrics,
        raw_assigned, anomalies):
    """
    Cross correlate the anomalous timeseries against the assigned metrics
    using linkedin's luminol Correlator and return the names of correlated
    metrics with their correlation results.

    :param base_name: the anomalous metric base_name
    :param anomaly_timestamp: the unix timestamp of the anomaly
    :param anomalous_ts: list of (timestamp, value) tuples for the anomalous
        metric (as returned by get_anomalous_ts)
    :param assigned_metrics: list of full Redis metric names to correlate
    :param raw_assigned: the msgpack-encoded Redis payloads for
        assigned_metrics, index-aligned with it
    :param anomalies: anomaly objects with an ``exact_timestamp`` attribute
    :return: (correlated_metrics, correlations) where correlations items are
        [metric_base_name, coefficient, shift, shifted_coefficient]
    :rtype: tuple
    """
    logger = logging.getLogger(skyline_app_logger)

    # Distill timeseries strings into lists
    start = timer()
    count = 0

    # Sample the time series
    from_timestamp = anomaly_timestamp - 600

    correlated_metrics = []
    correlations = []

    # Guard clauses - all four inputs are required
    no_data = False
    if not anomalous_ts:
        no_data = True
    if not assigned_metrics:
        no_data = True
    if not raw_assigned:
        no_data = True
    if not anomalies:
        no_data = True
    if no_data:
        logger.error('error :: get_correlations :: no data')
        return (correlated_metrics, correlations)

    # Hoisted loop invariants: the anomalous timeseries dict, the correlation
    # time period and the threshold do not vary per assigned metric (or per
    # anomaly), so compute them once instead of on every loop iteration.
    anomaly_ts_dict = dict(anomalous_ts)
    time_period = (int(anomaly_timestamp - 120), int(anomaly_timestamp + 120))
    # @modified 20180524 - Feature #2360: CORRELATE_ALERTS_ONLY
    #                      Branch #2270: luminosity
    #                      Feature #2378: Add redis auth to Skyline and rebrow
    # The threshold was moved to settings.py - for better correlation use 0.9
    # instead of 0.8
    try:
        cross_correlation_threshold = settings.LUMINOL_CROSS_CORRELATION_THRESHOLD
    except:
        cross_correlation_threshold = 0.9

    for i, metric_name in enumerate(assigned_metrics):
        count += 1
        # Hard cap - only the first 1000 assigned metrics are correlated
        if count > 1000:
            break
        correlated = None
        metric_base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
        # Do not correlate the anomalous metric with itself
        if str(metric_base_name) == str(base_name):
            continue
        try:
            raw_series = raw_assigned[i]
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
        except:
            timeseries = []
        if not timeseries:
            continue

        # Convert the time series if this is a known_derivative_metric
        known_derivative_metric = is_derivative_metric(skyline_app, metric_base_name)
        if known_derivative_metric:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: nonNegativeDerivative')

        # Sample the candidate timeseries around the anomaly window
        correlate_ts = []
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                continue
            if int(ts) <= anomaly_timestamp:
                correlate_ts.append((int(ts), value))
            if int(ts) > (anomaly_timestamp + 61):
                break
        if not correlate_ts:
            continue
        correlate_ts_dict = dict(correlate_ts)

        for a in anomalies:
            # Only consider anomalies within +/- 120 seconds of the anomaly
            try:
                if int(a.exact_timestamp) < int(anomaly_timestamp - 120):
                    continue
                if int(a.exact_timestamp) > int(anomaly_timestamp + 120):
                    continue
            except:
                continue
            # NOTE(review): the Correlator inputs below do not depend on the
            # anomaly ``a``, so multiple in-window anomalies append duplicate
            # correlation entries - confirm whether that is intended.
            try:
                my_correlator = Correlator(
                    anomaly_ts_dict, correlate_ts_dict, time_period)
                if my_correlator.is_correlated(
                        threshold=cross_correlation_threshold):
                    correlation = my_correlator.get_correlation_result()
                    correlated = True
                    correlations.append([
                        metric_base_name, correlation.coefficient,
                        correlation.shift, correlation.shifted_coefficient])
            except:
                pass
        if correlated:
            correlated_metrics.append(metric_base_name)

    end = timer()
    logger.info(
        'correlated %s metrics to %s anomaly, processed in %.6f seconds' % (
            str(len(correlated_metrics)), base_name, (end - start)))
    return (correlated_metrics, correlations)
def get_redis_metrics_timeseries(current_skyline_app, metrics, log=False):
    """
    Return a dict of metrics timeseries as lists e.g.
    {
        'base_name.1': [[ts, value], [ts, value], ..., [ts, value]],
        'base_name.2': [[ts, value], [ts, value], ..., [ts, value]]
    }

    :param current_skyline_app: the app calling the function
    :param metrics: a list of base_names or full Redis metric names
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type metrics: list
    :type log: boolean
    :return: metrics_timeseries
    :rtype: dict
    """
    function_str = 'functions.redis.get_metrics_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    def _logger():
        # Errors are always logged - create the logger lazily when log=False
        if current_logger:
            return current_logger
        return logging.getLogger(current_skyline_app + 'Log')

    metrics_timeseries = {}

    redis_conn = None
    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as err:
        _logger().error(
            'error :: %s :: %s :: get_redis_conn failed - %s' % (
                current_skyline_app, function_str, str(err)))
    redis_conn_decoded = None
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as err:
        _logger().error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
                current_skyline_app, function_str, str(err)))

    # Normalise the requested metrics into full Redis names and base_names,
    # keeping the two lists index-aligned
    assigned_metrics = []
    base_names = []
    for metric in metrics:
        if metric.startswith(FULL_NAMESPACE):
            metric_name = str(metric)
            base_name = metric.replace(FULL_NAMESPACE, '')
        else:
            metric_name = '%s%s' % (FULL_NAMESPACE, str(metric))
            base_name = str(metric)
        assigned_metrics.append(metric_name)
        base_names.append(base_name)
        # Pre-seed every requested metric with an empty list so each has a
        # key even when no data is retrieved.
        # fixed: was {}, which contradicted the documented dict-of-lists
        # return type (both are falsy, so callers are unaffected)
        metrics_timeseries[base_name] = []

    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(
            redis_conn_decoded.smembers(
                'aet.metrics_manager.derivative_metrics'))
    except Exception as err:
        _logger().error(traceback.format_exc())
        _logger().error(
            'error :: %s :: %s :: failed to get derivative_metrics from Redis - %s' % (
                current_skyline_app, function_str, str(err)))

    raw_assigned = {}
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except Exception as err:
        _logger().error(traceback.format_exc())
        _logger().error(
            'error :: %s :: %s :: failed to get raw_assigned from Redis - %s' % (
                current_skyline_app, function_str, str(err)))

    if raw_assigned:
        for index, metric_name in enumerate(assigned_metrics):
            timeseries = []
            try:
                raw_series = raw_assigned[index]
                if raw_series:
                    unpacker = Unpacker(use_list=False)
                    unpacker.feed(raw_series)
                    timeseries = list(unpacker)
            except Exception as err:
                _logger().error(
                    'error :: %s :: %s :: failed to unpack %s timeseries - %s' % (
                        current_skyline_app, function_str, metric_name,
                        str(err)))
                timeseries = []
            if timeseries:
                # Convert Redis ts floats to ints
                timeseries = [[int(ts), value] for ts, value in timeseries]
            if timeseries:
                # To ensure that there are no unordered timestamps in the
                # time series which are artefacts of the collector or
                # carbon-relay, sort all time series by timestamp before
                # analysis.
                original_timeseries = timeseries
                if original_timeseries:
                    timeseries = sort_timeseries(original_timeseries)
                    del original_timeseries
                if metric_name in derivative_metrics:
                    if len(timeseries) > 3:
                        try:
                            derivative_timeseries = nonNegativeDerivative(
                                timeseries)
                            timeseries = derivative_timeseries
                        except Exception as err:
                            _logger().error(traceback.format_exc())
                            _logger().error(
                                'error :: %s :: %s :: nonNegativeDerivative failed on timeseries for %s - %s' % (
                                    current_skyline_app, function_str,
                                    metric_name, str(err)))
            if timeseries:
                base_name = base_names[index]
                metrics_timeseries[base_name] = timeseries
    return metrics_timeseries