def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean
    """
    function_str = 'functions.thunder.thunder_send_event'

    def _get_logger():
        # Lazily create the app logger; used so that errors and the fallback
        # file path are always logged, even when log=False was passed.
        current_skyline_app_logger = current_skyline_app + 'Log'
        return logging.getLogger(current_skyline_app_logger)

    if log:
        current_logger = _get_logger()
    else:
        current_logger = None

    submitted = False
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        # sadd returns the number of members added, 0 if already present
        submitted = bool(redis_conn.sadd('thunder.events', str(event)))
        if submitted:
            return True
    except Exception as e:
        # Always report Redis failures, even when log=False, so the fallback
        # is never silent
        if not current_logger:
            current_logger = _get_logger()
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s' % (
                function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the
    # event_file in the thunder check dir so the event is not lost.
    # Ensure a logger exists here too - previously this path raised
    # AttributeError on current_logger.info when log=False and sadd
    # returned 0 without raising.
    if not current_logger:
        current_logger = _get_logger()
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' % (
                event_file, e))
        submitted = False
    return submitted
def adtk_level_shift(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A timeseries is anomalous if a level shift occurs in a 5 window period
    bound by a factor of 9 of the normal range based on historical
    interquartile range.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling
        the argument in the scope of the custom algorithm itself, but the
        algorithm must accept it as the first agrument.
    :param parent_pid: the parent pid which is executing the algorithm, this
        is **required** for error handling and logging.  You do not have to
        worry about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for
        the custom_algorithm and algorithm itself, example:
        ``algorithm_parameters={'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5}``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None (and optionally the anomalyScore and the
        anomalies list when ``return_anomalies`` is passed in the
        algorithm_parameters)
    :rtype: boolean

    The adtk LevelShiftAD algortihm is not O(n) and is slow compared to the
    normal three-sigma triggered algorithms, however it is useful for
    detecting level shifts which three-sigma algorithms do not always detect.
    It is not recommended to run on all metrics.  The analysis methodology
    implemented for this custom_algorithm is therefore:

    - When new metrics are added that match
      ``algorithm_parameters['namespace']``, Skyline implements sharding on
      new metrics into time slots to prevent a thundering herd situation from
      developing.  The last analysed timestamp is recorded in the
      ``analyzer.last.adtk_level_shift`` Redis hash key to determine the next
      scheduled run.
    - A ``run_every`` parameter is implemented so that the algorithm can be
      configured to run on a metric once every ``run_every`` minutes, to
      spread the load and runtime over ``run_every`` minutes.
    - The algorithm is not run against metrics that are sparsely populated,
      as that results in lots of false positives and noise.

    Example CUSTOM_ALGORITHMS configuration (suitable for 60 second
    resolution metrics with a Mirage SECOND_ORDER_RESOLUTION_HOURS of 168):

    'adtk_level_shift': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/adtk_level_shift.py',
        'algorithm_parameters': {'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5},
        'max_execution_time': 0.5,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['adtk_level_shift'],
        'run_3sigma_algorithms': True,
        'run_before_3sigma': True,
        'run_only_if_consensus': False,
        'use_with': ["analyzer", "mirage"],
        'debug_logging': False,
    },
    """

    # You MUST define the algorithm_name
    algorithm_name = 'adtk_level_shift'

    # Define the default state of None and None, anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    # @aded 20210308 - Feature #3978: luminosity - classify_metrics
    #                  Feature #3642: Anomaly type classification
    return_anomalies = False
    anomalies = []
    realtime_analysis = True

    current_logger = None

    # If you wanted to log, you can but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine the sample_period
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug('debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % (
                algorithm_name, str(algorithm_parameters)))
        except:
            # This except pattern MUST be used in ALL custom algortihms to
            # facilitate the traceback from any errors.  The algorithm we want
            # to run super fast and without spamming the log with lots of
            # errors.  But we do not want the function returning and not
            # reporting anything to the log, so the pythonic except is used to
            # "sample" any algorithm errors to a tmp file and report once per
            # run rather than spewing tons of errors into the log e.g.
            # analyzer.log
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            return (False, None)

    # Allow the LevelShiftAD window parameter to be passed in the
    # algorithm_parameters
    window = 5
    try:
        window = algorithm_parameters['window']
    except:
        pass

    # Allow the LevelShiftAD c parameter to be passed in the
    # algorithm_parameters
    c = 9.0
    try:
        c = algorithm_parameters['c']
    except:
        pass

    # How often (in windows/minutes) this algorithm should run on a metric
    run_every = window
    try:
        run_every = algorithm_parameters['run_every']
    except:
        pass

    # Which side(s) of a level shift to detect - 'positive', 'negative' or 'both'
    side = 'both'
    try:
        side = algorithm_parameters['side']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except:
        return_anomalies = False
    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except:
        realtime_analysis = True

    # @added 20210316 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except:
        pass

    # @added 20210323 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters['save_plots_to_absolute_dir']
    except:
        pass
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except:
        pass

    # @added 20210318 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    run_PersistAD = False
    try:
        run_PersistAD = algorithm_parameters['run_PersistAD']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    try:
        base_name = algorithm_parameters['base_name']
    except:
        # This except pattern MUST be used in ALL custom algortihms to
        # facilitate the traceback from any errors.  The algorithm we want to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' % (
            algorithm_name, str(base_name)))

    # Due to the load and runtime of LevelShiftAD it is only run in analyzer
    # periodically
    if current_skyline_app == 'analyzer':
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
        last_hash_key = 'analyzer.last.%s' % algorithm_name
        last_check = None
        try:
            raw_last_check = redis_conn_decoded.hget(last_hash_key, base_name)
            last_check = int(raw_last_check)
        except:
            last_check = None
        # Skip if the metric was last checked within the current run_every
        # window of data points
        last_window_timestamps = [int(item[0]) for item in timeseries[-run_every:]]
        if last_check in last_window_timestamps:
            if debug_logging:
                current_logger.debug('debug :: %s :: run_every period is not over yet, skipping base_name - %s' % (
                    algorithm_name, str(base_name)))
            if return_anomalies:
                return (False, None, anomalies)
            else:
                return (False, None)

        # If there is no last timestamp, shard the metric, it will eventually
        # be added.
        if not last_check:
            now = datetime.datetime.now()
            now_seconds = int(now.second)
            if now_seconds == 0:
                now_seconds = 1
            period_seconds = int(60 / run_every)
            shard = int(period_seconds)
            last_shard = 60
            shard = int(period_seconds)
            shards = [shard]
            while shard < last_shard:
                shard = shard + period_seconds
                shards.append((shard))
            # Round the current seconds to the nearest shard boundary
            shard_value = round(now_seconds / shards[0]) * shards[0]
            if shard_value <= shards[0]:
                shard_value = shards[0]
            # Deterministically hash the metric name so each metric always
            # falls into the same time shard
            metric_as_bytes = str(base_name).encode()
            value = zlib.adler32(metric_as_bytes)
            shard_index = [(index + 1) for index, s_value in enumerate(shards) if s_value == shard_value][0]
            modulo_result = value % shard_index
            if modulo_result == 0:
                if debug_logging:
                    current_logger.debug('debug :: %s :: skipping as not sharded into this run - %s' % (
                        algorithm_name, str(base_name)))
                if return_anomalies:
                    return (False, None, anomalies)
                else:
                    return (False, None)
        if debug_logging:
            current_logger.debug('debug :: %s :: analysing %s' % (
                algorithm_name, str(base_name)))

        # Record the latest data timestamp as the last analysed timestamp for
        # this metric
        try:
            int_metric_timestamp = int(timeseries[-1][0])
        except:
            int_metric_timestamp = 0
        if int_metric_timestamp:
            try:
                redis_conn_decoded.hset(
                    last_hash_key, base_name, int_metric_timestamp)
            except:
                pass

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' % (
                algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the
        # case of Skyline's use of the LevelShiftAD algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        # - the time series must have at least 99% of the data points for the
        #   in the sample being analysed.
        do_not_use_sparse_data = False
        if current_skyline_app == 'analyzer':
            do_not_use_sparse_data = True

        # @added 20210305 - Feature #3970: custom_algorithm - adtk_level_shift
        #                   Task #3664:: POC with adtk
        # With mirage also do not run LevelShiftAD on sparsely populated data
        if current_skyline_app == 'mirage':
            do_not_use_sparse_data = True

        # @aded 20210309 - Feature #3978: luminosity - classify_metrics
        #                  Feature #3642: Anomaly type classification
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if do_not_use_sparse_data:

            total_period = 0
            total_datapoints = 0
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % (
                        algorithm_name))
                timeseries = []
            if not timeseries:
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # Default for analyzer at required period to 18 hours
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            # If the time series does not have 75% of its full_duration it
            # does not have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                            algorithm_name))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: falied to determine if time series has sufficient data' % (
                        algorithm_name))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration data
            # points it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            resolution_timestamps = []
            metric_resolution = False
            for metric_datapoint in timeseries[-30:]:
                timestamp = int(metric_datapoint[0])
                resolution_timestamps.append(timestamp)
            timestamp_resolutions = []
            if resolution_timestamps:
                last_timestamp = None
                for timestamp in resolution_timestamps:
                    if last_timestamp:
                        resolution = timestamp - last_timestamp
                        timestamp_resolutions.append(resolution)
                        last_timestamp = timestamp
                    else:
                        last_timestamp = timestamp
                try:
                    del resolution_timestamps
                except:
                    pass
            if timestamp_resolutions:
                try:
                    # The most common difference between consecutive
                    # timestamps is taken as the metric resolution
                    timestamp_resolutions_count = Counter(timestamp_resolutions)
                    ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                    metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                            algorithm_name, e))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
                except:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % (
                            algorithm_name))
                try:
                    del timestamp_resolutions
                except:
                    pass
            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % (
                            algorithm_name, str(minimum_datapoints),
                            str(total_datapoints)))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % (
                        algorithm_name, str(minimum_percentage_sparsity),
                        str(sparsity)))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            # A constant time series cannot have a level shift
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug('debug :: %s :: preprocessing took %.6f seconds' % (
                algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: LevelShiftAD not run as no data' % (
                    algorithm_name))
            anomalies = []
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        else:
            if debug_logging:
                current_logger.debug('debug :: %s :: timeseries length: %s' % (
                    algorithm_name, str(len(timeseries))))

        if len(timeseries) < 100:
            if debug_logging:
                current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        start_analysis = timer()
        try:
            # Build a datetime indexed DataFrame as required by adtk
            df = pd.DataFrame(timeseries, columns=['date', 'value'])
            df['date'] = pd.to_datetime(df['date'], unit='s')
            datetime_index = pd.DatetimeIndex(df['date'].values)
            df = df.set_index(datetime_index)
            df.drop('date', axis=1, inplace=True)
            s = validate_series(df)
            level_shift_ad = LevelShiftAD(c=c, side=side, window=window)
            anomaly_df = level_shift_ad.fit_detect(s)
            anomalies = anomaly_df.loc[anomaly_df['value'] > 0]
            anomalous = False
            if len(anomalies) > 0:
                anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9)
                if realtime_analysis:
                    # Only anomalous if an anomaly falls in the last window of
                    # data points
                    last_window_timestamps = [int(item[0]) for item in timeseries[-window:]]
                    # if timeseries[-1][0] in anomaly_timestamps:
                    for timestamp in last_window_timestamps:
                        if timestamp in anomaly_timestamps:
                            anomalous = True
                            break
                else:
                    anomalous = True

                # Convert anomalies dataframe to anomalies_list
                anomalies_list = []

                # @added 20210316 - Feature #3978: luminosity - classify_metrics
                #                   Feature #3642: Anomaly type classification
                # Convert anomalies dataframe to anomalies_dict
                anomalies_dict = {}
                anomalies_dict['metric'] = base_name
                anomalies_dict['timestamp'] = int(timeseries[-1][0])
                anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                anomalies_dict['algorithm'] = algorithm_name
                anomalies_dict['anomalies'] = {}

                for ts, value in timeseries:
                    if int(ts) in anomaly_timestamps:
                        anomalies_list.append([int(ts), value])
                        anomalies_dict['anomalies'][int(ts)] = value
                anomalies = list(anomalies_list)

                # @added 20210316 - Feature #3978: luminosity - classify_metrics
                #                   Feature #3642: Anomaly type classification
                if save_plots_to:
                    try:
                        from adtk.visualization import plot
                        metric_dir = base_name.replace('.', '/')
                        timestamp_dir = str(int(timeseries[-1][0]))
                        save_path = '%s/%s/%s/%s' % (
                            save_plots_to, algorithm_name, metric_dir,
                            timestamp_dir)
                        if save_plots_to_absolute_dir:
                            save_path = '%s' % save_plots_to
                        anomalies_dict['file_path'] = save_path
                        save_to_file = '%s/%s.%s.png' % (
                            save_path, algorithm_name, base_name)
                        if filename_prefix:
                            save_to_file = '%s/%s.%s.%s.png' % (
                                save_path, filename_prefix, algorithm_name,
                                base_name)
                        save_to_path = os_path_dirname(save_to_file)
                        title = '%s\n%s' % (algorithm_name, base_name)
                        if not os_path_exists(save_to_path):
                            try:
                                mkdir_p(save_to_path)
                            except Exception as e:
                                current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                    algorithm_name, save_to_path, e))
                        if os_path_exists(save_to_path):
                            try:
                                plot(s, anomaly=anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                if debug_logging:
                                    current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                        algorithm_name, save_to_file))
                            except Exception as e:
                                current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                    algorithm_name, base_name, e))
                        anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                            save_path, algorithm_name, base_name)
                        with open(anomalies_file, 'w') as fh:
                            fh.write(str(anomalies_list))
                            # os.chmod(anomalies_file, mode=0o644)
                        data_file = '%s/data.txt' % (save_path)
                        with open(data_file, 'w') as fh:
                            fh.write(str(anomalies_dict))
                    except SystemExit as e:
                        if debug_logging:
                            current_logger.debug('debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % (
                                algorithm_name, e))
                        if return_anomalies:
                            return (anomalous, anomalyScore, anomalies)
                        else:
                            return (anomalous, anomalyScore)
                    except Exception as e:
                        traceback_msg = traceback.format_exc()
                        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                        if debug_logging:
                            current_logger.error(traceback_msg)
                            current_logger.error('error :: %s :: failed to plot or save anomalies file - %s - %s' % (
                                algorithm_name, base_name, e))
            else:
                anomalies = []

            # @added 20210318 - Feature #3978: luminosity - classify_metrics
            #                   Feature #3642: Anomaly type classification
            # Run PersistAD as a verification pass in non-realtime analysis
            if anomalies and run_PersistAD and not realtime_analysis:
                persist_ad_algorithm_parameters = {}
                try:
                    persist_ad_algorithm_parameters = algorithm_parameters['persist_ad_algorithm_parameters']
                except:
                    pass
                persist_ad_window = 20
                try:
                    persist_ad_window = persist_ad_algorithm_parameters['window']
                except:
                    pass
                persist_ad_c = 9.9
                try:
                    persist_ad_c = persist_ad_algorithm_parameters['c']
                except:
                    pass
                try:
                    from adtk.detector import PersistAD
                    persist_ad = PersistAD(c=persist_ad_c, side='both', window=persist_ad_window)
                    persist_ad_anomaly_df = persist_ad.fit_detect(s)
                    persist_ad_anomalies = persist_ad_anomaly_df.loc[persist_ad_anomaly_df['value'] > 0]
                    if len(persist_ad_anomalies) > 0:
                        current_logger.info('%s :: %s anomalies found with PersistAD on %s' % (
                            algorithm_name, str(len(persist_ad_anomalies)), base_name))
                        persist_ad_anomaly_timestamps = list(persist_ad_anomalies.index.astype(np.int64) // 10**9)
                        # Convert persist_ad_anomalies dataframe to
                        # persist_ad_anomalies_list
                        persist_ad_anomalies_list = []
                        persist_ad_anomalies_dict = {}
                        persist_ad_anomalies_dict['metric'] = base_name
                        persist_ad_anomalies_dict['timestamp'] = int(timeseries[-1][0])
                        persist_ad_anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                        persist_ad_anomalies_dict['algorithm'] = 'adtk_PersistAD'
                        persist_ad_anomalies_dict['anomalies'] = {}

                        for ts, value in timeseries:
                            if int(ts) in persist_ad_anomaly_timestamps:
                                persist_ad_anomalies_list.append([int(ts), value])
                                persist_ad_anomalies_dict['anomalies'][int(ts)] = value
                        persist_ad_anomalies = list(persist_ad_anomalies_list)
                        if save_plots_to:
                            try:
                                from adtk.visualization import plot
                                metric_dir = base_name.replace('.', '/')
                                timestamp_dir = str(int(timeseries[-1][0]))
                                save_path = '%s/%s/%s/%s' % (
                                    save_plots_to, algorithm_name, metric_dir,
                                    timestamp_dir)
                                if save_plots_to_absolute_dir:
                                    save_path = '%s' % save_plots_to
                                persist_ad_anomalies_dict['file_path'] = save_path
                                save_to_file = '%s/%s.PersistAD.%s.png' % (
                                    save_path, algorithm_name, base_name)
                                if filename_prefix:
                                    save_to_file = '%s/%s.%s.%s.png' % (
                                        save_path, filename_prefix,
                                        algorithm_name, base_name)
                                save_to_path = os_path_dirname(save_to_file)
                                title = '%s - PersistAD verification\n%s' % (algorithm_name, base_name)
                                if not os_path_exists(save_to_path):
                                    try:
                                        mkdir_p(save_to_path)
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                            algorithm_name, save_to_path, e))
                                if os_path_exists(save_to_path):
                                    try:
                                        plot(s, anomaly=persist_ad_anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                        if debug_logging:
                                            current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                                algorithm_name, save_to_file))
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                            algorithm_name, base_name, e))
                                anomalies_file = '%s/%s.%s.PersistAD.anomalies_list.txt' % (
                                    save_path, algorithm_name, base_name)
                                with open(anomalies_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies))
                                    # os.chmod(anomalies_file, mode=0o644)
                                data_file = '%s/PersistAD.data.txt' % (save_path)
                                with open(data_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies_dict))
                            except Exception as e:
                                traceback_msg = traceback.format_exc()
                                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                                if debug_logging:
                                    current_logger.error(traceback_msg)
                                    current_logger.error('error :: %s :: failed to plot or save PersistAD anomalies file - %s - %s' % (
                                        algorithm_name, base_name, e))
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: %s :: failed to analysis with PersistAD anomalies file - %s - %s' % (
                            algorithm_name, base_name, e))
            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug('debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % (
                    algorithm_name, e))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error('error :: debug_logging :: %s :: failed to run on ts' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis
        if debug_logging:
            current_logger.debug('debug :: %s :: LevelShiftAD took %.6f seconds' % (
                algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info('%s :: anomalous - %s, anomalyScore - %s' % (
                algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed analysis in %.6f seconds' % (
                algorithm_name, processing_runtime))

        try:
            del timeseries
        except:
            pass

        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)

    except SystemExit as e:
        if debug_logging:
            current_logger.debug('debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % (
                algorithm_name, e))
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algortihms to
        # facilitate the traceback from any errors.  The algorithm we want to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)

    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    else:
        return (anomalous, anomalyScore)
def spin_process(self, i, run_timestamp, metric_check_file):
    """
    Assign a metric check file to this process and analyze it.

    Loads and validates the metric variables from the check file, applies
    EXPIRATION_TIME based de-duplication via a Redis cache key, surfaces the
    graph image and timeseries json from Graphite if required, and (when
    run_crucible_tests is the string 'True') runs the crucible algorithms
    against the timeseries, writes the anomaly file and finally removes the
    check file.

    :param i: python process id
    :param run_timestamp: the epoch timestamp at which this process was called
    :param metric_check_file: full path to the metric check file

    :return: returns True
    """
    child_process_pid = os.getpid()
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('child_process_pid - %s' % str(child_process_pid))
    self.process_list.append(child_process_pid)

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('processing metric check - %s' % metric_check_file)

    # Nothing to do if the check file has already been removed (e.g. by a
    # sibling spin_process working the same file)
    if not os.path.isfile(str(metric_check_file)):
        logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
        return

    # Derive the metric name and timestamp from the check file name, which is
    # expected to be of the form <timestamp>.<dotted.metric.name>.txt
    check_file_name = os.path.basename(str(metric_check_file))
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_name - %s' % check_file_name)
    check_file_timestamp = check_file_name.split('.', 1)[0]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_timestamp - %s' % str(check_file_timestamp))
    check_file_metricname_txt = check_file_name.split('.', 1)[1]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt)
    check_file_metricname = check_file_metricname_txt.replace('.txt', '')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname - %s' % check_file_metricname)
    check_file_metricname_dir = check_file_metricname.replace('.', '/')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir)

    metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir)

    # failed_check_file = failed_checks_dir + '/' + check_file_name
    failed_check_file = metric_failed_check_dir + '/' + check_file_name

    # Load and validate metric variables
    try:
        metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
    except:
        logger.error('error :: failed to import metric variables from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        # TBD - a failed check Panorama update will go here, perhaps txt
        # files are not the only "queue" that will be used, both, but
        # Panorama, may be just a part of Skyline Flux, the flux DB
        # would allow for a very nice, distributed "queue" and a
        # distributed Skyline workforce...
        # Any Skyline node could just have one role, e.g. lots of
        # Skyline nodes running crucible only and instead of reading
        # the local filesystem for input, they could read the Flux DB
        # queue or both...
        return

    # Test metric variables
    # We use a pythonic methodology to test if the variables are defined,
    # this ensures that if any of the variables are not set for some reason
    # we can handle unexpected data or situations gracefully and try and
    # ensure that the process does not hang.
    # if len(str(metric_vars.metric)) == 0:
    # if not metric_vars.metric:
    try:
        metric_vars.metric
    except:
        logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric = str(metric_vars.metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric - %s' % metric)

    # if len(metric_vars.value) == 0:
    # if not metric_vars.value:
    try:
        metric_vars.value
    except:
        logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        value = str(metric_vars.value)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - value - %s' % (value))

    # if len(metric_vars.from_timestamp) == 0:
    # if not metric_vars.from_timestamp:
    try:
        metric_vars.from_timestamp
    except:
        logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        from_timestamp = str(metric_vars.from_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - from_timestamp - %s' % from_timestamp)

    # if len(metric_vars.metric_timestamp) == 0:
    # if not metric_vars.metric_timestamp:
    try:
        metric_vars.metric_timestamp
    except:
        logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric_timestamp = str(metric_vars.metric_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric_timestamp - %s' % metric_timestamp)

    # if len(metric_vars.algorithms) == 0:
    # if not metric_vars.algorithms:
    try:
        metric_vars.algorithms
    except:
        # NOTE(review): this format string has no %s placeholder, so applying
        # % (metric_check_file) will raise TypeError if this except path fires
        logger.error('error :: failed to read algorithms variable from check file setting to all' % (metric_check_file))
        algorithms = ['all']
    else:
        algorithms = []
        for i_algorithm in metric_vars.algorithms:
            algorithms.append(i_algorithm)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - algorithms - %s' % algorithms)

    # if len(metric_vars.anomaly_dir) == 0:
    # if not metric_vars.anomaly_dir:
    try:
        metric_vars.anomaly_dir
    except:
        logger.error('error :: failed to read anomaly_dir variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        anomaly_dir = str(metric_vars.anomaly_dir)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

    # if len(str(metric_vars.graphite_metric)) == 0:
    try:
        metric_vars.graphite_metric
    except:
        logger.info('failed to read graphite_metric variable from check file setting to False')
        # yes this is a string
        graphite_metric = 'False'
    else:
        graphite_metric = str(metric_vars.graphite_metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - graphite_metric - %s' % graphite_metric)

    # if len(str(metric_vars.run_crucible_tests)) == 0:
    try:
        metric_vars.run_crucible_tests
    except:
        logger.info('failed to read run_crucible_tests variable from check file setting to False')
        # yes this is a string
        run_crucible_tests = 'False'
    else:
        run_crucible_tests = str(metric_vars.run_crucible_tests)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests)

    try:
        metric_vars.added_by
    except:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed to read added_by variable from check file setting to crucible - set to crucible')
        added_by = 'crucible'
    else:
        added_by = str(metric_vars.added_by)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - added_by - %s' % added_by)

    try:
        metric_vars.run_script
    except:
        run_script = False
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - not present set to False')
    else:
        run_script = str(metric_vars.run_script)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - %s' % run_script)

    # Only check if the metric does not a EXPIRATION_TIME key set, crucible
    # uses the alert EXPIRATION_TIME for the relevant alert setting contexts
    # whether that be analyzer, mirage, boundary, etc and sets its own
    # cache_keys in redis.  This prevents large amounts of data being added
    # in terms of tieseries json and image files, crucible samples at the
    # same EXPIRATION_TIME as alerts.
    source_app = 'crucible'
    expiration_timeout = 1800
    remove_all_anomaly_files = False
    check_expired = False
    check_time = time()

    if added_by == 'analyzer' or added_by == 'mirage':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('Will check %s ALERTS' % added_by)
        if settings.ENABLE_ALERTS:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking %s ALERTS' % added_by)
            # Find the first ALERTS tuple whose pattern matches this metric
            # and adopt its EXPIRATION_TIME (alert[2])
            for alert in settings.ALERTS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = added_by
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    if added_by == 'boundary':
        # Same EXPIRATION_TIME discovery as above, but against the
        # BOUNDARY_METRICS tuples
        if settings.BOUNDARY_ENABLE_ALERTS:
            for alert in settings.BOUNDARY_METRICS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = 'boundary'
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric))

    # Only use the cache_key EXPIRATION_TIME if this is not a request to
    # run_crucible_tests on a timeseries
    if run_crucible_tests == 'False':
        if check_expired:
            logger.info('check_expired - not checking Redis key')
            last_check = True
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking if cache_key exists')
            try:
                last_check = self.redis_conn.get(cache_key)
            except Exception as e:
                # NOTE(review): 'alerter' is not defined anywhere in this
                # method - this logger call itself raises NameError if the
                # Redis get fails; should presumably be source_app.  Also
                # last_check is left unbound on this path, so the
                # 'if not last_check' below would raise NameError too.
                logger.error('error :: could not query cache_key for %s - %s - %s' % (alerter, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True

        if not last_check:
            try:
                self.redis_conn.setex(cache_key, expiration_timeout, packb(value))
                logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout)))
            except Exception as e:
                # NOTE(review): same undefined 'alerter' name as above
                logger.error('error :: could not query cache_key for %s - %s - %s' % (alerter, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True
        else:
            if check_expired:
                logger.info('check_expired - all anomaly files will be removed')
                remove_all_anomaly_files = True
            else:
                logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric))
                remove_all_anomaly_files = True

    # anomaly dir
    if not os.path.exists(str(anomaly_dir)):
        try:
            mkdir_p(skyline_app, str(anomaly_dir))
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('created anomaly dir - %s' % str(anomaly_dir))
        except:
            logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir))

    if not os.path.exists(str(anomaly_dir)):
        logger.error('error :: anomaly_dir does not exist')
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly dir exists - %s' % str(anomaly_dir))

    # From here on failed checks are parked in the anomaly dir itself
    failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('failed_check_file - %s' % str(failed_check_file))

    # Retrieve data from graphite is necessary
    anomaly_graph = anomaly_dir + '/' + metric + '.png'
    anomaly_json = anomaly_dir + '/' + metric + '.json'
    anomaly_json_gz = anomaly_json + '.gz'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomaly_graph - %s' % str(anomaly_graph))
        logger.info('anomaly_json - %s' % str(anomaly_json))
        logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

    # Some things added to crucible may not be added by a skyline app per se
    # and if run_crucible_tests is string True then no anomaly files should
    # be removed.
    if run_crucible_tests == 'True':
        remove_all_anomaly_files = False

    # Remove check and anomaly files if the metric has a EXPIRATION_TIME
    # cache_key set
    if remove_all_anomaly_files:
        if os.path.isfile(anomaly_graph):
            try:
                os.remove(anomaly_graph)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph removed - %s' % str(anomaly_graph))
            except OSError:
                pass
        if os.path.isfile(anomaly_json):
            try:
                os.remove(anomaly_json)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json removed - %s' % str(anomaly_json))
            except OSError:
                pass
        if os.path.isfile(anomaly_json_gz):
            try:
                os.remove(anomaly_json_gz)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz))
            except OSError:
                pass

        anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
        if os.path.isfile(anomaly_txt_file):
            try:
                os.remove(anomaly_txt_file)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file))
            except OSError:
                pass

        # TBD - this data would have to be added to the panaorama DB before
        # it is removed
        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass

        if os.path.exists(str(anomaly_dir)):
            # os.rmdir only succeeds on an empty dir; failures are ignored
            try:
                os.rmdir(str(anomaly_dir))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_dir removed - %s' % str(anomaly_dir))
            except OSError:
                pass

        logger.info('check and anomaly files removed')
        return

    # Check if the image exists
    if graphite_metric == 'True':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite_metric - %s' % (graphite_metric))

        # Graphite timeouts
        connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('connect_timeout - %s' % str(connect_timeout))
        read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('read_timeout - %s' % str(read_timeout))

        graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
        graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d')

        # graphite URL
        if settings.GRAPHITE_PORT != '':
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        else:
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite url - %s' % (url))

        if not os.path.isfile(anomaly_graph):
            logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until))
            image_url = url.replace('&format=json', '')
            graphite_image_file = anomaly_dir + '/' + metric + '.png'
            if 'width' not in image_url:
                image_url += '&width=586'
            if 'height' not in image_url:
                image_url += '&height=308'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite image url - %s' % (image_url))
            image_url_timeout = int(connect_timeout)

            image_data = None
            try:
                # NOTE(review): urllib2 is Python 2 only, yet python_version == 3
                # is handled below - under Python 3 this name would not resolve
                # unless aliased at import time; verify the file's imports
                image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read()
                logger.info('url OK - %s' % (image_url))
            except urllib2.URLError:
                image_data = None
                logger.error('error :: url bad - %s' % (image_url))

            if image_data is not None:
                # NOTE(review): image_data is bytes but the file is opened in
                # text mode 'w' - on Python 3 this write raises TypeError ('wb'
                # would be needed)
                with open(graphite_image_file, 'w') as f:
                    f.write(image_data)
                logger.info('retrieved - %s' % (anomaly_graph))
                if python_version == 2:
                    mode_arg = int('0644')
                if python_version == 3:
                    # NOTE(review): os.chmod requires an int mode; the string
                    # '0o644' raises TypeError - should be the int 0o644
                    mode_arg = '0o644'
                os.chmod(graphite_image_file, mode_arg)
            else:
                logger.error('error :: failed to retrieved - %s' % (anomaly_graph))
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_graph file exists - %s' % str(anomaly_graph))

        if not os.path.isfile(anomaly_graph):
            logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric))
        else:
            logger.info('graph image exists - %s' % (anomaly_graph))

        # Check if the json exists
        if not os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until))
                # requests >= 2.4.0 supports a (connect, read) timeout tuple
                if requests.__version__ >= '2.4.0':
                    use_timeout = (int(connect_timeout), int(read_timeout))
                else:
                    use_timeout = int(connect_timeout)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('use_timeout - %s' % (str(use_timeout)))
                try:
                    r = requests.get(url, timeout=use_timeout)
                    js = r.json()
                    datapoints = js[0]['datapoints']
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('data retrieved OK')
                except:
                    # NOTE(review): graphite_until is a '%H:%M_%Y%m%d' string
                    # here, so int(graphite_until) raises ValueError on this
                    # fallback path - presumably int(metric_timestamp) was meant
                    datapoints = [[None, int(graphite_until)]]
                    logger.error('error :: data retrieval failed')

                # Graphite returns [value, timestamp]; crucible wants
                # [timestamp, value] - swap and drop null datapoints
                converted = []
                for datapoint in datapoints:
                    try:
                        new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                        converted.append(new_datapoint)
                    except:
                        continue

                with open(anomaly_json, 'w') as f:
                    f.write(json.dumps(converted))
                if python_version == 2:
                    mode_arg = int('0644')
                if python_version == 3:
                    # NOTE(review): str mode - see chmod note above
                    mode_arg = '0o644'
                os.chmod(anomaly_json, mode_arg)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('json file - %s' % anomaly_json)

        if not os.path.isfile(anomaly_json):
            logger.error('error :: failed to surface %s json from graphite' % (metric))
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return

    # Check timeseries json exists - raw or gz
    if not os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            # NOTE(review): this format string has no %s placeholder, so
            # % (metric) raises TypeError if this branch is hit
            logger.error('error :: no json data found' % (metric))
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return
        else:
            logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz))
    else:
        logger.info('timeseries json exists - %s' % (anomaly_json))

    # If timeseries json and run_crucible_tests is str(False) gzip and
    # return here as there is nothing further to do
    if run_crucible_tests == 'False':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_crucible_tests - %s' % run_crucible_tests)
        # gzip the json timeseries data
        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzipping - %s' % anomaly_json)
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    mode_arg = int('0644')
                if python_version == 3:
                    # NOTE(review): str mode - see chmod note above
                    mode_arg = '0o644'
                os.chmod(anomaly_json_gz, mode_arg)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipped - %s' % anomaly_json_gz)
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return

        if os.path.isfile(anomaly_json_gz):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzip exists - %s' % anomaly_json)
            try:
                os.remove(metric_check_file)
                logger.info('removed check file - %s' % metric_check_file)
            except OSError:
                pass
            return
        nothing_to_do = 'true - for debug only'

    # self.check_if_parent_is_alive()
    # Run crucible algorithms
    logger.info('running crucible tests - %s' % (metric))

    timeseries_dir = metric.replace('.', '/')

    if os.path.isfile(anomaly_json_gz):
        if not os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipping - %s' % anomaly_json_gz)
            try:
                # with gzip.open(anomaly_json_gz, 'rb') as fr:
                fr = gzip.open(anomaly_json_gz, 'rb')
                raw_timeseries = fr.read()
                fr.close()
            except Exception as e:
                # NOTE(review): execution continues after this error, and
                # raw_timeseries is then unbound - the write below would raise
                # NameError on that path
                logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e))
                traceback.print_exc()
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipped')
                logger.info('writing to - %s' % anomaly_json)
            # NOTE(review): raw_timeseries is bytes from gzip 'rb' but the
            # target file is opened in text mode 'w' - TypeError on Python 3
            with open(anomaly_json, 'w') as fw:
                fw.write(raw_timeseries)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json done')
            if python_version == 2:
                mode_arg = int('0644')
            if python_version == 3:
                # NOTE(review): str mode - see chmod note above
                mode_arg = '0o644'
            os.chmod(anomaly_json, mode_arg)
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('No gzip - %s' % anomaly_json_gz)
        nothing_to_do = 'true - for debug only'

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_json exists - %s' % anomaly_json)

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('loading timeseries from - %s' % anomaly_json)
        with open(anomaly_json, 'r') as f:
            timeseries = json.loads(f.read())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('loaded timeseries from - %s' % anomaly_json)
    else:
        try:
            logger.error('error :: file not found - %s' % anomaly_json)
            shutil.move(metric_check_file, failed_check_file)
            if python_version == 2:
                mode_arg = int('0644')
            if python_version == 3:
                # NOTE(review): str mode - see chmod note above
                mode_arg = '0o644'
            os.chmod(failed_check_file, mode_arg)
            logger.info('moved check file to - %s' % failed_check_file)
        except OSError:
            logger.error('error :: failed to move check file to - %s' % failed_check_file)
            pass
        return

    start_timestamp = int(timeseries[0][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('start_timestamp - %s' % str(start_timestamp))
    end_timestamp = int(timeseries[-1][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('end_timestamp - %s' % str(end_timestamp))

    full_duration = end_timestamp - start_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('full_duration - %s' % str(full_duration))

    self.check_if_parent_is_alive()

    run_algorithms_start_timestamp = int(time())
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp))

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms)))
    try:
        anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms)
    except:
        # NOTE(review): anomalous and ensemble are left unbound on this path;
        # the uses below would raise NameError
        logger.error('error :: run_algorithms failed - %s' % str(traceback.print_exc()))

    run_algorithms_end_timestamp = int(time())
    run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble)))

    if anomalous:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous - %s' % (anomalous))
        nothing_to_do = 'true - for debug only'

    logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds))

    # Update anomaly file
    crucible_data = 'crucible_tests_run = "%s"\n' \
                    'crucible_triggered_algorithms = %s\n' \
                    'tested_by = "%s"\n' \
        % (str(run_timestamp), str(ensemble), str(this_host))
    crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
    with open(crucible_anomaly_file, 'a') as fh:
        fh.write(crucible_data)
    if python_version == 2:
        mode_arg = int('0644')
    if python_version == 3:
        # NOTE(review): str mode - see chmod note above
        mode_arg = '0o644'
    os.chmod(crucible_anomaly_file, mode_arg)
    logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric))

    # gzip the json timeseries data after analysis
    if os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    mode_arg = int('0644')
                if python_version == 3:
                    # NOTE(review): str mode - see chmod note above
                    mode_arg = '0o644'
                os.chmod(anomaly_json_gz, mode_arg)
                logger.info('gzipped - %s' % (anomaly_json_gz))
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
        else:
            os.remove(anomaly_json)

    if run_script:
        if os.path.isfile(run_script):
            logger.info('running - %s' % (run_script))
            os.system('%s %s' % (str(run_script), str(crucible_anomaly_file)))

    # Remove metric check file
    try:
        os.remove(metric_check_file)
        logger.info('complete removed check file - %s' % (metric_check_file))
    except OSError:
        pass
def run(self):
    """
    Called when the process intializes.

    Main daemon loop: performs the bin/<skyline_app>.d log management
    handshake, then loops forever - ensuring the check dir and Redis
    connection are available, waiting for a metric check file to appear,
    and spawning CRUCIBLE_PROCESSES spin_process workers against the oldest
    check file, terminating them if they exceed CRUCIBLE_TESTS_TIMEOUT.
    """
    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    if os.path.isfile(skyline_app_logwait):
        try:
            os.remove(skyline_app_logwait)
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
            pass

    # Wait up to 5 seconds for the daemon script to release the log lock
    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('starting %s run' % skyline_app)
    if os.path.isfile(skyline_app_loglock):
        logger.error(
            'error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os.remove(skyline_app_loglock)
            logger.info('log lock file removed')
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('bin/%s.d log management done' % skyline_app)

    logger.info('process intialized')

    while 1:
        now = time()
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('process started - %s' % int(now))

        # Make sure check_dir exists and has not been removed
        # NOTE(review): os.path.exists returns False rather than raising when
        # the path is missing, so this except branch can only fire on unusual
        # errors - the mkdir_p recovery below is effectively unreachable for
        # the plain "dir was removed" case.  Also mkdir_p is called here with
        # one argument but with two (skyline_app, dir) in spin_process -
        # verify the helper's signature.
        try:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('checking check dir exists - %s' % settings.CRUCIBLE_CHECK_PATH)
            os.path.exists(settings.CRUCIBLE_CHECK_PATH)
        except:
            logger.error('error :: check dir did not exist - %s' % settings.CRUCIBLE_CHECK_PATH)
            mkdir_p(settings.CRUCIBLE_CHECK_PATH)
            logger.info('check dir created - %s' % settings.CRUCIBLE_CHECK_PATH)
            os.path.exists(settings.CRUCIBLE_CHECK_PATH)
            # continue

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
            logger.info('connected to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
        except:
            logger.info(
                'skyline can not connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            sleep(10)
            logger.info('connecting to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
            if settings.REDIS_PASSWORD:
                self.redis_conn = StrictRedis(
                    password=settings.REDIS_PASSWORD,
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
            else:
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
            # Re-enter the loop so the new connection is ping-tested before use
            continue

        """
        Determine if any metric has been added to test
        """
        while True:
            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            metric_var_files = [f for f in listdir(settings.CRUCIBLE_CHECK_PATH) if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))]
            # if len(metric_var_files) == 0:
            if not metric_var_files:
                logger.info('sleeping 10 no metric check files')
                sleep(10)

            # Discover metric to analyze
            metric_var_files = ''
            metric_var_files = [f for f in listdir(settings.CRUCIBLE_CHECK_PATH) if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))]
            # if len(metric_var_files) > 0:
            if metric_var_files:
                break

        # Work the oldest check file first (sorted by timestamp prefix)
        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = settings.CRUCIBLE_CHECK_PATH + "/" + str(metric_var_files_sorted[0])

        logger.info('assigning check for processing - %s' % str(metric_var_files_sorted[0]))

        # Reset process_list
        try:
            self.process_list[:] = []
        except:
            logger.error('error :: failed to reset self.process_list')

        # Spawn processes
        pids = []
        spawned_pids = []
        pid_count = 0
        run_timestamp = int(now)
        for i in range(1, settings.CRUCIBLE_PROCESSES + 1):
            p = Process(target=self.spin_process, args=(i, run_timestamp, str(metric_check_file)))
            pids.append(p)
            pid_count += 1
            logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.CRUCIBLE_PROCESSES)))
            p.start()
            spawned_pids.append(p.pid)

        # Send wait signal to zombie processes
        # for p in pids:
        #     p.join()
        # Self monitor processes and terminate if any spin_process has run
        # for longer than CRUCIBLE_TESTS_TIMEOUT
        p_starts = time()
        while time() - p_starts <= settings.CRUCIBLE_TESTS_TIMEOUT:
            if any(p.is_alive() for p in pids):
                # Just to avoid hogging the CPU
                sleep(.1)
            else:
                # All the processes are done, break now.
                time_to_run = time() - p_starts
                logger.info(
                    '%s :: %s spin_process/es completed in %.2f seconds' % (
                        skyline_app, str(settings.CRUCIBLE_PROCESSES), time_to_run))
                break
        else:
            # We only enter this if we didn't 'break' above.
            logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
            for p in pids:
                p.terminate()
                # p.join()

        # Reap any workers still alive after the monitor loop
        for p in pids:
            if p.is_alive():
                logger.info('%s :: stopping spin_process - %s' % (skyline_app, str(p.is_alive())))
                p.join()

        # Block until the workers have consumed (removed) the check file
        # before looking for the next one
        while os.path.isfile(metric_check_file):
            sleep(1)
def spin_process(self, i, run_timestamp, metric_check_file):
    """
    Assign a metric for a process to analyze.

    Reads and validates the metric variables from the check file, applies
    the EXPIRATION_TIME cache-key logic (Redis), surfaces the Graphite
    graph png and timeseries json if required, optionally runs the
    crucible algorithms via ``run_algorithms`` and writes the results to
    the anomaly file, then gzips the timeseries data and removes the
    check file.

    :param i: python process id
    :param run_timestamp: the epoch timestamp at which this process was called
    :param metric_check_file: full path to the metric check file

    :return: returns True

    NOTE(review): this is Python 2 only code - it uses ``urllib2`` and the
    ``0644`` octal literal (a syntax error under Python 3) inside the
    ``python_version == 2`` branches.
    """
    child_process_pid = os.getpid()
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('child_process_pid - %s' % str(child_process_pid))
    # Record this worker pid so the parent can monitor/terminate it
    self.process_list.append(child_process_pid)
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('processing metric check - %s' % metric_check_file)
    if not os.path.isfile(str(metric_check_file)):
        # Another process may have already consumed/removed the check file
        logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
        return

    # Derive the metric name and timestamp from the check file name, which
    # is expected to be of the form <timestamp>.<metric.name>.txt
    check_file_name = os.path.basename(str(metric_check_file))
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_name - %s' % check_file_name)
    check_file_timestamp = check_file_name.split('.', 1)[0]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_timestamp - %s' % str(check_file_timestamp))
    check_file_metricname_txt = check_file_name.split('.', 1)[1]
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt)
    check_file_metricname = check_file_metricname_txt.replace('.txt', '')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname - %s' % check_file_metricname)
    # Convert the dotted metric namespace to a directory path
    check_file_metricname_dir = check_file_metricname.replace('.', '/')
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir)
    metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir)
    # failed_check_file = failed_checks_dir + '/' + check_file_name
    failed_check_file = metric_failed_check_dir + '/' + check_file_name

    # Load and validate metric variables
    try:
        metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
    except:
        logger.error('error :: failed to import metric variables from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        # TBD - a failed check Panorama update will go here, perhaps txt
        # files are not the only "queue" that will be used, both, but
        # Panorama, may be just a part of Skyline Flux, the flux DB
        # would allow for a very nice, distributed "queue" and a
        # distributed Skyline workforce...
        # Any Skyline node could just have one role, e.g. lots of
        # Skyline nodes running crucible only and instead of reading
        # the local filesystem for input, they could read the Flux DB
        # queue or both...
        return

    # Test metric variables
    # We use a pythonic methodology to test if the variables are defined,
    # this ensures that if any of the variables are not set for some reason
    # we can handle unexpected data or situations gracefully and try and
    # ensure that the process does not hang.

    # if len(str(metric_vars.metric)) == 0:
    # if not metric_vars.metric:
    try:
        metric_vars.metric
    except:
        logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric = str(metric_vars.metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric - %s' % metric)

    # if len(metric_vars.value) == 0:
    # if not metric_vars.value:
    try:
        metric_vars.value
    except:
        logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        value = str(metric_vars.value)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - value - %s' % (value))

    # if len(metric_vars.from_timestamp) == 0:
    # if not metric_vars.from_timestamp:
    try:
        metric_vars.from_timestamp
    except:
        logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        from_timestamp = str(metric_vars.from_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - from_timestamp - %s' % from_timestamp)

    # if len(metric_vars.metric_timestamp) == 0:
    # if not metric_vars.metric_timestamp:
    try:
        metric_vars.metric_timestamp
    except:
        logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        metric_timestamp = str(metric_vars.metric_timestamp)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - metric_timestamp - %s' % metric_timestamp)

    # if len(metric_vars.algorithms) == 0:
    # if not metric_vars.algorithms:
    # NOTE(review): when metric_vars.algorithms IS present, algorithms is
    # left as [] - the commented-out copy loop below was never re-enabled,
    # so any algorithms specified in the check file appear to be ignored
    # and [] is passed to run_algorithms. Confirm whether this is
    # intentional (run_algorithms may treat [] as "all").
    algorithms = []
    try:
        metric_vars.algorithms
    except:
        logger.error('error :: failed to read algorithms variable from check file setting to all')
        algorithms = ['all']
    # if not algorithms:
    #     algorithms = []
    #     for i_algorithm in metric_vars.algorithms:
    #         algorithms.append(i_algorithm)
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('metric variable - algorithms - %s' % str(algorithms))

    # if len(metric_vars.anomaly_dir) == 0:
    # if not metric_vars.anomaly_dir:
    try:
        metric_vars.anomaly_dir
    except:
        logger.error('error :: failed to read anomaly_dir variable from check file - %s' % (metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        anomaly_dir = str(metric_vars.anomaly_dir)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

    # if len(str(metric_vars.graphite_metric)) == 0:
    try:
        metric_vars.graphite_metric
    except:
        logger.info('failed to read graphite_metric variable from check file setting to False')
        # yes this is a string
        graphite_metric = 'False'
    else:
        graphite_metric = str(metric_vars.graphite_metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - graphite_metric - %s' % graphite_metric)

    # if len(str(metric_vars.run_crucible_tests)) == 0:
    try:
        metric_vars.run_crucible_tests
    except:
        logger.info('failed to read run_crucible_tests variable from check file setting to False')
        # yes this is a string
        run_crucible_tests = 'False'
    else:
        run_crucible_tests = str(metric_vars.run_crucible_tests)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests)

    try:
        metric_vars.added_by
    except:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed to read added_by variable from check file setting to crucible - set to crucible')
        added_by = 'crucible'
    else:
        added_by = str(metric_vars.added_by)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - added_by - %s' % added_by)

    try:
        metric_vars.run_script
    except:
        run_script = False
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - not present set to False')
    else:
        run_script = str(metric_vars.run_script)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - run_script - %s' % run_script)

    # Only check if the metric does not have an EXPIRATION_TIME key set,
    # crucible uses the alert EXPIRATION_TIME for the relevant alert setting
    # contexts whether that be analyzer, mirage, boundary, etc and sets its
    # own cache_keys in redis.  This prevents large amounts of data being
    # added in terms of timeseries json and image files, crucible samples at
    # the same EXPIRATION_TIME as alerts.
    source_app = 'crucible'
    expiration_timeout = 1800
    remove_all_anomaly_files = False
    check_expired = False
    check_time = time()

    # Match the metric against the alert tuples of the app that submitted
    # the check to determine the applicable EXPIRATION_TIME and whether this
    # check is already older than that expiry window.
    if added_by == 'analyzer' or added_by == 'mirage':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('Will check %s ALERTS' % added_by)
        if settings.ENABLE_ALERTS:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking %s ALERTS' % added_by)
            for alert in settings.ALERTS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = added_by
                    # alert[2] is the EXPIRATION_TIME element of the tuple
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    # Same matching as above but against the boundary alert tuples
    if added_by == 'boundary':
        if settings.BOUNDARY_ENABLE_ALERTS:
            for alert in settings.BOUNDARY_METRICS:
                ALERT_MATCH_PATTERN = alert[0]
                METRIC_PATTERN = metric
                alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                if pattern_match:
                    source_app = 'boundary'
                    expiration_timeout = alert[2]
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                    check_age = int(check_time) - int(metric_timestamp)
                    if int(check_age) > int(expiration_timeout):
                        check_expired = True
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

    cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric))

    # Only use the cache_key EXPIRATION_TIME if this is not a request to
    # run_crucible_tests on a timeseries
    if run_crucible_tests == 'False':
        if check_expired:
            logger.info('check_expired - not checking Redis key')
            last_check = True
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Checking if cache_key exists')
            try:
                last_check = self.redis_conn.get(cache_key)
            except Exception as e:
                # NOTE(review): if this get raises, last_check is never
                # assigned and the `if not last_check` below will raise
                # NameError - confirm whether this path can occur.
                logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True

        if not last_check:
            # No recent check recorded - set the key so subsequent checks
            # within expiration_timeout seconds are deduplicated
            try:
                self.redis_conn.setex(cache_key, expiration_timeout, packb(value))
                logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout)))
            except Exception as e:
                logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                logger.info('all anomaly files will be removed')
                remove_all_anomaly_files = True
        else:
            if check_expired:
                logger.info('check_expired - all anomaly files will be removed')
                remove_all_anomaly_files = True
            else:
                logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric))
                remove_all_anomaly_files = True

    # anomaly dir
    if not os.path.exists(str(anomaly_dir)):
        try:
            # mkdir_p(skyline_app, str(anomaly_dir))
            mkdir_p(anomaly_dir)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('created anomaly dir - %s' % str(anomaly_dir))
        except:
            logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir))

    if not os.path.exists(str(anomaly_dir)):
        logger.error('error :: anomaly_dir does not exist')
        fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
        return
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly dir exists - %s' % str(anomaly_dir))

    # NOTE(review): this rebinds failed_check_file (set earlier to the
    # failed_checks_dir path) to a path inside anomaly_dir - later
    # shutil.move calls use this anomaly_dir path, not failed_checks_dir.
    failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('failed_check_file - %s' % str(failed_check_file))

    # Retrieve data from graphite is necessary
    anomaly_graph = anomaly_dir + '/' + metric + '.png'
    anomaly_json = anomaly_dir + '/' + metric + '.json'
    anomaly_json_gz = anomaly_json + '.gz'
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomaly_graph - %s' % str(anomaly_graph))
        logger.info('anomaly_json - %s' % str(anomaly_json))
        logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

    # Some things added to crucible may not be added by a skyline app per se
    # and if run_crucible_tests is string True then no anomaly files should
    # be removed.
    if run_crucible_tests == 'True':
        remove_all_anomaly_files = False

    # Remove check and anomaly files if the metric has a EXPIRATION_TIME
    # cache_key set
    if remove_all_anomaly_files:
        if os.path.isfile(anomaly_graph):
            try:
                os.remove(anomaly_graph)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph removed - %s' % str(anomaly_graph))
            except OSError:
                pass
        if os.path.isfile(anomaly_json):
            try:
                os.remove(anomaly_json)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json removed - %s' % str(anomaly_json))
            except OSError:
                pass
        if os.path.isfile(anomaly_json_gz):
            try:
                os.remove(anomaly_json_gz)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz))
            except OSError:
                pass
        anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
        if os.path.isfile(anomaly_txt_file):
            try:
                os.remove(anomaly_txt_file)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file))
            except OSError:
                pass
        # TBD - this data would have to be added to the panorama DB before
        # it is removed
        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass
        # rmdir only succeeds if the dir is now empty; OSError otherwise
        if os.path.exists(str(anomaly_dir)):
            try:
                os.rmdir(str(anomaly_dir))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_dir removed - %s' % str(anomaly_dir))
            except OSError:
                pass
        logger.info('check and anomaly files removed')
        return

    # Check if the image exists
    if graphite_metric == 'True':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite_metric - %s' % (graphite_metric))

        # Graphite timeouts
        connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('connect_timeout - %s' % str(connect_timeout))
        read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('read_timeout - %s' % str(read_timeout))

        # Graphite's legacy HH:MM_YYYYMMDD from/until time format
        graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
        graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d')

        # graphite URL
        if settings.GRAPHITE_PORT != '':
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        else:
            url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('graphite url - %s' % (url))

        if not os.path.isfile(anomaly_graph):
            logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until))
            # Reuse the json render URL for the png by dropping the format arg
            image_url = url.replace('&format=json', '')
            graphite_image_file = anomaly_dir + '/' + metric + '.png'
            if 'width' not in image_url:
                image_url += '&width=586'
            if 'height' not in image_url:
                image_url += '&height=308'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite image url - %s' % (image_url))
            image_url_timeout = int(connect_timeout)

            image_data = None
            try:
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read()  # nosec
                logger.info('url OK - %s' % (image_url))
            except urllib2.URLError:
                image_data = None
                logger.error('error :: url bad - %s' % (image_url))

            if image_data is not None:
                with open(graphite_image_file, 'w') as f:
                    f.write(image_data)
                logger.info('retrieved - %s' % (anomaly_graph))
                if python_version == 2:
                    os.chmod(graphite_image_file, 0644)
                if python_version == 3:
                    os.chmod(graphite_image_file, mode=0o644)
            else:
                logger.error('error :: failed to retrieved - %s' % (anomaly_graph))
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_graph file exists - %s' % str(anomaly_graph))

        if not os.path.isfile(anomaly_graph):
            logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric))
        else:
            logger.info('graph image exists - %s' % (anomaly_graph))

        # Check if the json exists
        if not os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until))
                # requests >= 2.4.0 supports a (connect, read) timeout tuple
                if requests.__version__ >= '2.4.0':
                    use_timeout = (int(connect_timeout), int(read_timeout))
                else:
                    use_timeout = int(connect_timeout)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('use_timeout - %s' % (str(use_timeout)))
                try:
                    r = requests.get(url, timeout=use_timeout)
                    js = r.json()
                    datapoints = js[0]['datapoints']
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('data retrieved OK')
                except:
                    # NOTE(review): graphite_until is a 'HH:MM_YYYYMMDD'
                    # string here, so int(graphite_until) will raise
                    # ValueError inside this handler - confirm intent.
                    datapoints = [[None, int(graphite_until)]]
                    logger.error('error :: data retrieval failed')

                # Convert graphite [value, ts] datapoints to [ts, value],
                # skipping null values which fail the float() conversion
                converted = []
                for datapoint in datapoints:
                    try:
                        new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                        converted.append(new_datapoint)
                    # @modified 20170913 - Task #2160: Test skyline with bandit
                    # Added nosec to exclude from bandit tests
                    except:  # nosec
                        continue

                with open(anomaly_json, 'w') as f:
                    f.write(json.dumps(converted))
                if python_version == 2:
                    os.chmod(anomaly_json, 0644)
                if python_version == 3:
                    os.chmod(anomaly_json, mode=0o644)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('json file - %s' % anomaly_json)

        if not os.path.isfile(anomaly_json):
            logger.error('error :: failed to surface %s json from graphite' % (metric))
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return

    # Check timeseries json exists - raw or gz
    if not os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            logger.error('error :: no json data found')
            # Move metric check file
            try:
                shutil.move(metric_check_file, failed_check_file)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return
        else:
            logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz))
    else:
        logger.info('timeseries json exists - %s' % (anomaly_json))

    # If timeseries json and run_crucible_tests is str(False) gzip and
    # return here as there is nothing further to do
    if run_crucible_tests == 'False':
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_crucible_tests - %s' % run_crucible_tests)
        # gzip the json timeseries data
        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzipping - %s' % anomaly_json)
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    os.chmod(anomaly_json_gz, 0644)
                if python_version == 3:
                    os.chmod(anomaly_json_gz, mode=0o644)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipped - %s' % anomaly_json_gz)
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return

        if os.path.isfile(anomaly_json_gz):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('gzip exists - %s' % anomaly_json)
            try:
                os.remove(metric_check_file)
                logger.info('removed check file - %s' % metric_check_file)
            except OSError:
                pass
            return
        nothing_to_do = 'true - for debug only'

    # self.check_if_parent_is_alive()
    # Run crucible algorithms
    logger.info('running crucible tests - %s' % (metric))

    # If only the gz exists, decompress it back to the raw json for analysis
    if os.path.isfile(anomaly_json_gz):
        if not os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipping - %s' % anomaly_json_gz)
            try:
                # with gzip.open(anomaly_json_gz, 'rb') as fr:
                fr = gzip.open(anomaly_json_gz, 'rb')
                raw_timeseries = fr.read()
                fr.close()
            except Exception as e:
                # NOTE(review): on failure raw_timeseries is unbound and the
                # fw.write below will raise NameError - confirm intent.
                logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e))
                traceback.print_exc()
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('ungzipped')
                logger.info('writing to - %s' % anomaly_json)
            with open(anomaly_json, 'w') as fw:
                fw.write(raw_timeseries)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json done')
            if python_version == 2:
                os.chmod(anomaly_json, 0644)
            if python_version == 3:
                os.chmod(anomaly_json, mode=0o644)
    else:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('No gzip - %s' % anomaly_json_gz)
        nothing_to_do = 'true - for debug only'

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_json exists - %s' % anomaly_json)

    if os.path.isfile(anomaly_json):
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('loading timeseries from - %s' % anomaly_json)
        timeseries = None
        try:
            with open(anomaly_json, 'r') as f:
                # NOTE(review): the second f.read() returns '' because the
                # first read consumed the file, so literal_eval('') raises
                # and control falls into the except below; the actual load
                # is then done by the `if not timeseries` retry block.
                timeseries = json.loads(f.read())
                raw_timeseries = f.read()
                timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
                timeseries = literal_eval(timeseries_array_str)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loaded time series from - %s' % anomaly_json)
        except:
            # logger.error(traceback.format_exc())
            logger.info('failed to load with JSON, literal_eval will be tried - %s' % anomaly_json)

        # @added 20180715 - Task #2444: Evaluate CAD
        # Task #2446: Optimize Ionosphere
        # Branch #2270: luminosity
        # If the json.loads fails use literal_eval
        if not timeseries:
            try:
                with open(anomaly_json, 'r') as f:
                    raw_timeseries = f.read()
                    # Normalise tuple notation to list notation for literal_eval
                    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
                    timeseries = literal_eval(timeseries_array_str)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('loaded time series with literal_eval from - %s' % anomaly_json)
            except:
                logger.info(traceback.format_exc())
                logger.error('error :: failed to load JSON - %s' % anomaly_json)
    else:
        try:
            logger.error('error :: file not found - %s' % anomaly_json)
            shutil.move(metric_check_file, failed_check_file)
            if python_version == 2:
                os.chmod(failed_check_file, 0644)
            if python_version == 3:
                os.chmod(failed_check_file, mode=0o644)
            logger.info('moved check file to - %s' % failed_check_file)
        except OSError:
            logger.error('error :: failed to move check file to - %s' % failed_check_file)
            pass
        return

    # Determine the analysis window from the loaded timeseries
    start_timestamp = int(timeseries[0][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('start_timestamp - %s' % str(start_timestamp))
    end_timestamp = int(timeseries[-1][0])
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('end_timestamp - %s' % str(end_timestamp))

    full_duration = end_timestamp - start_timestamp
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('full_duration - %s' % str(full_duration))

    self.check_if_parent_is_alive()

    run_algorithms_start_timestamp = int(time())
    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp))

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms)))
    try:
        anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms)
    except:
        # NOTE(review): if run_algorithms raises, anomalous and ensemble
        # remain unbound and the uses below will raise NameError.
        logger.error('error :: run_algorithms failed - %s' % str(traceback.print_exc()))

    run_algorithms_end_timestamp = int(time())
    run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

    if settings.ENABLE_CRUCIBLE_DEBUG:
        logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble)))

    if anomalous:
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous - %s' % (anomalous))
        nothing_to_do = 'true - for debug only'

    logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds))

    # Update anomaly file
    # NOTE(review): the crucible_tests_run field is populated with
    # run_timestamp, not a tests-run flag - confirm downstream consumers
    # expect a timestamp here.
    crucible_data = 'crucible_tests_run = "%s"\n' \
                    'crucible_triggered_algorithms = %s\n' \
                    'tested_by = "%s"\n' \
        % (str(run_timestamp), str(ensemble), str(this_host))
    crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
    with open(crucible_anomaly_file, 'a') as fh:
        fh.write(crucible_data)
    if python_version == 2:
        os.chmod(crucible_anomaly_file, 0644)
    if python_version == 3:
        os.chmod(crucible_anomaly_file, mode=0o644)
    logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric))

    # gzip the json timeseries data after analysis
    if os.path.isfile(anomaly_json):
        if not os.path.isfile(anomaly_json_gz):
            try:
                f_in = open(anomaly_json)
                f_out = gzip.open(anomaly_json_gz, 'wb')
                f_out.writelines(f_in)
                f_out.close()
                f_in.close()
                os.remove(anomaly_json)
                if python_version == 2:
                    os.chmod(anomaly_json_gz, 0644)
                if python_version == 3:
                    os.chmod(anomaly_json_gz, mode=0o644)
                logger.info('gzipped - %s' % (anomaly_json_gz))
            except:
                logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc()))
        else:
            # gz already exists so the raw json is redundant
            os.remove(anomaly_json)

    # Optionally run a user supplied post-analysis script
    if run_script:
        if os.path.isfile(run_script):
            logger.info('running - %s' % (run_script))
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            os.system('%s %s' % (str(run_script), str(crucible_anomaly_file)))  # nosec

    # Remove metric check file
    nothing_to_do = ''
    try:
        os.remove(metric_check_file)
        logger.info('complete removed check file - %s %s' % (metric_check_file, nothing_to_do))
    except OSError:
        pass
def alert_smtp(alert, metric, context): """ Called by :func:`~trigger_alert` and sends an alert via smtp to the recipients that are configured for the metric. """ LOCAL_DEBUG = False logger = logging.getLogger(skyline_app_logger) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - sending smtp alert') logger.info('debug :: alert_smtp - Memory usage at start: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # FULL_DURATION to hours so that analyzer surfaces the relevant timeseries data # in the graph full_duration_in_hours = int(settings.FULL_DURATION) / 3600 # @added 20161229 - Feature #1830: Ionosphere alerts # Added Ionosphere variables base_name = str(metric[1]).replace(settings.FULL_NAMESPACE, '', 1) if settings.IONOSPHERE_ENABLED: timeseries_dir = base_name.replace('.', '/') training_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER, str(int(metric[2])), timeseries_dir) graphite_image_file = '%s/%s.%s.graphite.%sh.png' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) json_file = '%s/%s.%s.redis.%sh.json' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) training_data_redis_image = '%s/%s.%s.redis.plot.%sh.png' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) # For backwards compatibility if '@' in alert[1]: sender = settings.ALERT_SENDER recipient = alert[1] else: sender = settings.SMTP_OPTS['sender'] # @modified 20160806 - Added default_recipient try: recipients = settings.SMTP_OPTS['recipients'][alert[0]] use_default_recipient = False except: use_default_recipient = True if use_default_recipient: try: recipients = settings.SMTP_OPTS['default_recipient'] logger.info( 'alert_smtp - using default_recipient as no recipients are configured for %s' % str(alert[0])) except: logger.error( 'error :: alert_smtp - no known recipient for %s' % str(alert[0])) return False # Backwards compatibility if type(recipients) is str: 
recipients = [recipients] # @added 20180524 - Task #2384: Change alerters to cc other recipients # The alerters did send an individual email to each recipient. This would be # more useful if one email was sent with the first smtp recipient being the # to recipient and the subsequent recipients were add in cc. if recipients: primary_recipient = False cc_recipients = False for i_recipient in recipients: if not primary_recipient: primary_recipient = str(i_recipient) if primary_recipient != i_recipient: if not cc_recipients: cc_recipients = str(i_recipient) else: new_cc_recipients = '%s,%s' % (str(cc_recipients), str(i_recipient)) cc_recipients = str(new_cc_recipients) logger.info( 'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) # @modified 20161229 - Feature #1830: Ionosphere alerts # Ionosphere alerts unencoded_graph_title = 'Skyline %s - ALERT at %s hours - %s' % ( context, str(int(full_duration_in_hours)), str(metric[0])) # @modified 20170603 - Feature #2034: analyse_derivatives # Added deriative functions to convert the values of metrics strictly # increasing monotonically to their deriative products in alert graphs and # specify it in the graph_title known_derivative_metric = False try: # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow if settings.REDIS_PASSWORD: REDIS_ALERTER_CONN = redis.StrictRedis( password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) else: REDIS_ALERTER_CONN = redis.StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) except: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - redis connection failed') try: derivative_metrics = list( REDIS_ALERTER_CONN.smembers('derivative_metrics')) except: derivative_metrics = [] redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name)) if redis_metric_name in derivative_metrics: known_derivative_metric = True if known_derivative_metric: try: 
non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS except: non_derivative_monotonic_metrics = [] skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics) if skip_derivative: known_derivative_metric = False if known_derivative_metric: unencoded_graph_title = 'Skyline %s - ALERT at %s hours - derivative graph - %s' % ( context, str(int(full_duration_in_hours)), str(metric[0])) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - unencoded_graph_title: %s' % unencoded_graph_title) graph_title_string = quote(unencoded_graph_title, safe='') graph_title = '&title=%s' % graph_title_string graphite_port = '80' if settings.GRAPHITE_PORT != '': graphite_port = str(settings.GRAPHITE_PORT) link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(%s)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, str(int(full_duration_in_hours)), metric[1], settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, str(int(full_duration_in_hours)), metric[1], settings.GRAPHITE_GRAPH_SETTINGS, graph_title) content_id = metric[1] image_data = None if settings.SMTP_OPTS.get('embed-images'): # @added 20161229 - Feature #1830: Ionosphere alerts # Use existing data if files exist if os.path.isfile(graphite_image_file): try: with open(graphite_image_file, 'r') as f: image_data = f.read() logger.info('alert_smtp - using existing png - %s' % graphite_image_file) except: logger.error(traceback.format_exc()) logger.error( 'error :: alert_smtp - failed to read image data from existing png - %s' % graphite_image_file) logger.error('error :: alert_smtp - %s' % str(link)) image_data = None if image_data is None: try: # @modified 20170913 - Task 
#2160: Test skyline with bandit # Added nosec to exclude from bandit tests image_data = urllib2.urlopen(link).read() # nosec if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - image data OK') except urllib2.URLError: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - failed to get image graph') logger.error('error :: alert_smtp - %s' % str(link)) image_data = None if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - image data None') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after image_data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # If we failed to get the image or if it was explicitly disabled, # use the image URL instead of the content. if image_data is None: img_tag = '<img src="%s"/>' % link else: img_tag = '<img src="cid:%s"/>' % content_id if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - img_tag: %s' % img_tag) if settings.IONOSPHERE_ENABLED: # Create Ionosphere Graphite image # @modified 20161229 - Feature #1830: Ionosphere alerts # Only write the data to the file if it does not exist if not os.path.isfile(graphite_image_file): try: write_data_to_file(skyline_app, graphite_image_file, 'w', image_data) logger.info('added %s Ionosphere Graphite image :: %s' % (skyline_app, graphite_image_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to add %s Ionosphere Graphite image' % (skyline_app, graphite_image_file)) else: logger.info( '%s Ionosphere Graphite image already exists :: %s' % (skyline_app, graphite_image_file)) redis_image_data = None try: plot_redis_data = settings.PLOT_REDIS_DATA except: plot_redis_data = False if settings.SMTP_OPTS.get('embed-images') and plot_redis_data: # Create graph from Redis data redis_metric_key = '%s%s' % (settings.FULL_NAMESPACE, metric[1]) try: raw_series = REDIS_ALERTER_CONN.get(redis_metric_key) if settings.ENABLE_DEBUG or LOCAL_DEBUG: 
logger.info('debug :: alert_smtp - raw_series: %s' % 'OK') except: if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - raw_series: %s' % 'FAIL') try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) unpacker = Unpacker(use_list=True) unpacker.feed(raw_series) timeseries_x = [float(item[0]) for item in unpacker] unpacker = Unpacker(use_list=True) unpacker.feed(raw_series) timeseries_y = [item[1] for item in unpacker] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.error('error :: alert_smtp - unpack timeseries failed') timeseries = None if settings.IONOSPHERE_ENABLED and timeseries: ''' .. todo: this is possibly to be used to allow the user to submit the FULL_DURATION duration data set for the features profile to be created against IF it is a Mirage metric. This would allow for additional granularity in Mirage metrics, thereby maintaining their seasonality, but allow user and Skyline to analyze the anomaly at a FULL_DURATION resolution as well. Not sure how to code that in Ionosphere context yet but could just be additonal flag in the Ionosphere record. In the Ionosphere frontend, the user would be given an option to either create the features profile on the Mirage timeseries or the redis FULL_DURATION timeseries. It is a little complicated, but doable. # @modified 20161229 - Feature #1828: ionosphere - mirage Redis data features However that ^^ is UNDESIRABLE in the Mirage/Ionosphere context at the moment. 
Ionosphere must only profile SECOND_ORDER_RESOLUTION_HOURS currently so as to not pollute the seasonality aspect of Mirage ''' # Create Ionosphere redis timeseries json if is does not exist # @modified 20161229 - Feature #1830: Ionosphere alerts # Only write the data to the file if it does not exist and replace # the timeseries object if a json file exists # @added 20170920 - Bug #2168: Strange Redis derivative graph using_original_redis_json = False if not os.path.isfile(json_file): timeseries_json = str(timeseries).replace('[', '(').replace( ']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info( 'added %s Ionosphere Redis data timeseries json file :: %s' % (skyline_app, json_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to add %s Ionosphere Redis data timeseries json file' % (skyline_app, json_file)) else: # Replace the timeseries object logger.info( '%s Ionosphere Redis data timeseries json file already exists, using :: %s' % (skyline_app, json_file)) anomaly_json = json_file try: # Read the timeseries json file with open(anomaly_json, 'r') as f: raw_timeseries = f.read() timeseries_array_str = str(raw_timeseries).replace( '(', '[').replace(')', ']') timeseries = literal_eval(timeseries_array_str) logger.info( '%s Redis timeseries replaced with timeseries from :: %s' % (skyline_app, anomaly_json)) timeseries_x = [float(item[0]) for item in timeseries] timeseries_y = [item[1] for item in timeseries] # @added 20170920 - Bug #2168: Strange Redis derivative graph # This already has nonNegativeDerivative applied to it using_original_redis_json = True except: logger.error(traceback.format_exc()) logger.error( 'error :: %s failed to read timeseries data from %s' % (skyline_app, anomaly_json)) timeseries = None # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: # @added 20170920 - Bug #2168: Strange Redis derivative graph # If this is the Mirage Redis json it 
already has # nonNegativeDerivative applied to it if not using_original_redis_json: logger.info('alert_smtp - nonNegativeDerivative being applied') try: derivative_timeseries = nonNegativeDerivative(timeseries) timeseries = derivative_timeseries # @added 20170920 - Bug #2168: Strange Redis derivative graph logger.info('alert_smtp - nonNegativeDerivative applied') except: logger.error( 'error :: alert_smtp - nonNegativeDerivative failed') else: logger.info( 'alert_smtp - nonNegativeDerivative not being applied, as it will have been applied in the original json' ) # @added 21070726 - Bug #2068: Analyzer smtp alert error on Redis plot with derivative metrics # If the nonNegativeDerivative has been calculated we need to reset the # x and y as nonNegativeDerivative has to discard the first value as it # has no delta for it so the timeseries is 1 item less. timeseries_x = [float(item[0]) for item in timeseries] timeseries_y = [item[1] for item in timeseries] pd_series_values = None if timeseries: try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) values = pd.Series([x[1] for x in timeseries]) # Because the truth value of a Series is ambiguous pd_series_values = True if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.error( 'error :: alert_smtp - pandas value series on timeseries failed' ) if pd_series_values: try: array_median = np.median(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - values median: %s' % str(array_median)) array_amax = np.amax(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - array_amax: %s' % str(array_amax)) array_amin = np.amin(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - array_amin: %s' % str(array_amin)) mean = values.mean() if 
settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - mean: %s' % str(mean)) stdDev = values.std() if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - stdDev: %s' % str(stdDev)) sigma3 = 3 * stdDev if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - sigma3: %s' % str(sigma3)) # sigma3_series = [sigma3] * len(values) sigma3_upper_bound = mean + sigma3 try: sigma3_lower_bound = mean - sigma3 except: sigma3_lower_bound = 0 sigma3_upper_series = [sigma3_upper_bound] * len(values) sigma3_lower_series = [sigma3_lower_bound] * len(values) amax_series = [array_amax] * len(values) amin_series = [array_amin] * len(values) mean_series = [mean] * len(values) except: logger.error( 'error :: alert_smtp - numpy ops on series failed') mean_series = None if mean_series: graph_title = 'Skyline %s - ALERT - at %s hours - Redis data\n%s - anomalous value: %s' % ( context, str( int(full_duration_in_hours)), metric[1], str(metric[0])) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: graph_title = 'Skyline %s - ALERT - at %s hours - Redis data (derivative graph)\n%s - anomalous value: %s' % ( context, str(int(full_duration_in_hours)), metric[1], str(metric[0])) # @modified 20160814 - Bug #1558: Memory leak in Analyzer # I think the buf is causing a memory leak, trying a file # if python_version == 3: # buf = io.StringIO() # else: # buf = io.BytesIO() buf = '%s/%s.%s.%s.png' % (settings.SKYLINE_TMP_DIR, skyline_app, str(int(metric[2])), metric[1]) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before plot Redis data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # Too big # rcParams['figure.figsize'] = 12, 6 rcParams['figure.figsize'] = 8, 4 try: # fig = plt.figure() fig = plt.figure(frameon=False) ax = fig.add_subplot(111) ax.set_title(graph_title, fontsize='small') # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - 
Luminosity # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor' # ax.set_axis_bgcolor('black') if hasattr(ax, 'set_facecolor'): ax.set_facecolor('black') else: ax.set_axis_bgcolor('black') try: datetimes = [ dt.datetime.utcfromtimestamp(ts) for ts in timeseries_x ] if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - datetimes: %s' % 'OK') except: logger.error('error :: alert_smtp - datetimes: %s' % 'FAIL') plt.xticks(rotation=0, horizontalalignment='center') xfmt = DateFormatter('%a %H:%M') plt.gca().xaxis.set_major_formatter(xfmt) ax.xaxis.set_major_formatter(xfmt) ax.plot(datetimes, timeseries_y, color='orange', lw=0.6, zorder=3) ax.tick_params(axis='both', labelsize='xx-small') max_value_label = 'max - %s' % str(array_amax) ax.plot(datetimes, amax_series, lw=1, label=max_value_label, color='m', ls='--', zorder=4) min_value_label = 'min - %s' % str(array_amin) ax.plot(datetimes, amin_series, lw=1, label=min_value_label, color='b', ls='--', zorder=4) mean_value_label = 'mean - %s' % str(mean) ax.plot(datetimes, mean_series, lw=1.5, label=mean_value_label, color='g', ls='--', zorder=4) sigma3_text = (r'3$\sigma$') # sigma3_label = '%s - %s' % (str(sigma3_text), str(sigma3)) sigma3_upper_label = '%s upper - %s' % ( str(sigma3_text), str(sigma3_upper_bound)) ax.plot(datetimes, sigma3_upper_series, lw=1, label=sigma3_upper_label, color='r', ls='solid', zorder=4) if sigma3_lower_bound > 0: sigma3_lower_label = '%s lower - %s' % ( str(sigma3_text), str(sigma3_lower_bound)) ax.plot(datetimes, sigma3_lower_series, lw=1, label=sigma3_lower_label, color='r', ls='solid', zorder=4) ax.get_yaxis().get_major_formatter().set_useOffset(False) ax.get_yaxis().get_major_formatter().set_scientific(False) # Shrink current axis's height by 10% on the bottom box = ax.get_position() ax.set_position([ box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9 ]) # Put a legend below current axis ax.legend(loc='upper center', 
bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=4, fontsize='x-small') plt.rc('lines', lw=2, color='w') plt.grid(True) ax.grid(b=True, which='both', axis='both', color='lightgray', linestyle='solid', alpha=0.5, linewidth=0.6) # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor' # ax.set_axis_bgcolor('black') if hasattr(ax, 'set_facecolor'): ax.set_facecolor('black') else: ax.set_axis_bgcolor('black') rcParams['xtick.direction'] = 'out' rcParams['ytick.direction'] = 'out' ax.margins(y=.02, x=.03) # tight_layout removes the legend box # fig.tight_layout() try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) plt.savefig(buf, format='png') if settings.IONOSPHERE_ENABLED: if not os.path.exists(training_data_dir): mkdir_p(training_data_dir) logger.info('created dir - %s' % training_data_dir) if not os.path.isfile(training_data_redis_image): try: plt.savefig(training_data_redis_image, format='png') logger.info( 'alert_smtp - save Redis training data image - %s' % (training_data_redis_image)) except: logger.info(traceback.format_exc()) logger.error( 'error :: alert_smtp - could not save - %s' % (training_data_redis_image)) else: logger.info( 'alert_smtp - Redis training data image already exists - %s' % (training_data_redis_image)) # @added 20160814 - Bug #1558: Memory leak in Analyzer # As per http://www.mail-archive.com/[email protected]/msg13222.html # savefig in the parent process was causing the memory leak # the below fig.clf() and plt.close() did not resolve this # however spawing a multiprocessing process for alert_smtp # does solve this as issue as all memory is freed when the # process terminates. 
fig.clf() plt.close(fig) redis_graph_content_id = 'redis.%s' % metric[1] redis_image_data = True if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - savefig: %s' % 'OK') logger.info( 'debug :: alert_smtp - Memory usage after plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.info(traceback.format_exc()) logger.error('error :: alert_smtp - plt.savefig: %s' % 'FAIL') except: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - could not build plot') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if redis_image_data: redis_img_tag = '<img src="cid:%s"/>' % redis_graph_content_id if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - redis_img_tag: %s' % str(redis_img_tag)) else: # @modified 20161229 - Feature #1830: Ionosphere alerts # @modified 20170108 - Feature #1852: Ionosphere - features_profile matched graphite graphs # Restored the previous redis_img_tag method as some smtp alerts were # coming without a Redis graph, not all but some and for some reason, # I am pretty certain retrospectively that it was done that way from # testing I just wanted to try and be cleaner. 
# The redis_img_tag was changed at # https://github.com/earthgecko/skyline/commit/31bcacf3f90f0953ebed0d57260cb937e01f887c#diff-520bf2a218f65074ffead4d8184c138dR489 redis_img_tag = '<img src="%s"/>' % 'none' # redis_img_tag = '<img src="none"/>' # @added 20170806 - Feature #1830: Ionosphere alerts # Show a human date in alerts alerted_at = str(dt.datetime.utcfromtimestamp(int(metric[2]))) try: body = '<h3><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> %s alert</font></h3><br>' % context body += '<font color="black">metric: <b>%s</b></font><br>' % metric[1] body += '<font color="black">Anomalous value: %s</font><br>' % str( metric[0]) body += '<font color="black">Anomaly timestamp: %s</font><br>' % str( int(metric[2])) # @added 20170806 - Feature #1830: Ionosphere alerts # Show a human date in alerts body += '<font color="black">Anomalous at: %s</font><br>' % alerted_at body += '<font color="black">At hours: %s</font><br>' % str( int(full_duration_in_hours)) body += '<font color="black">Next alert in: %s seconds</font><br>' % str( alert[2]) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: body += '<font color="black">Derivative graph: True</font><br>' more_body = '' if settings.IONOSPHERE_ENABLED: # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP more_body += '<h3><font color="#dd3023">Ionosphere :: </font><font color="#6698FF">training data</font><font color="black"></font></h3>' ionosphere_link = '%s/ionosphere?timestamp=%s&metric=%s' % ( settings.SKYLINE_URL, str(int(metric[2])), str(metric[1])) more_body += '<font color="black">To use this timeseries to train Skyline that this is not anomalous manage this training data at:<br>' more_body += '<a href="%s">%s</a></font>' % (ionosphere_link, ionosphere_link) if redis_image_data: more_body += '<font color="black">min: %s | max: 
%s | mean: %s <br>' % ( str(array_amin), str(array_amax), str(mean)) more_body += '3-sigma: %s <br>' % str(sigma3) more_body += '3-sigma upper bound: %s | 3-sigma lower bound: %s <br></font>' % ( str(sigma3_upper_bound), str(sigma3_lower_bound)) more_body += '<h3><font color="black">Redis data at FULL_DURATION</font></h3><br>' more_body += '<div dir="ltr">:%s<br></div>' % redis_img_tag if image_data: more_body += '<h3><font color="black">Graphite data at FULL_DURATION (may be aggregated)</font></h3>' more_body += '<div dir="ltr"><a href="%s">%s</a><br></div><br>' % ( link, img_tag) more_body += '<font color="black">Clicking on the above graph will open to the Graphite graph with current data</font><br>' if redis_image_data: more_body += '<font color="black">To disable the Redis data graph view, set PLOT_REDIS_DATA to False in your settings.py, if the Graphite graph is sufficient for you,<br>' more_body += 'however do note that will remove the 3-sigma and mean value too.</font>' more_body += '<br>' more_body += '<div dir="ltr" align="right"><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> version :: %s</font></div><br>' % str( skyline_version) except: logger.error('error :: alert_smtp - could not build body') logger.info(traceback.format_exc()) # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Do not send to each recipient, send to primary_recipient and cc the other # recipients, thereby sending only one email # for recipient in recipients: if primary_recipient: try: # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP, using mixed as alternative indicates that # the client should select one of the parts for display and ignore # the rest (tripleee - https://stackoverflow.com/a/35115938) # msg = MIMEMultipart('alternative') msg = MIMEMultipart('mixed') # @added 20170812 - Bug #2142: 7bit SMTP 
encoding breaking long urls # set email charset and email encodings cs_ = charset.Charset('utf-8') cs_.header_encoding = charset.QP cs_.body_encoding = charset.QP msg.set_charset(cs_) msg['Subject'] = '[Skyline alert] - %s ALERT - %s' % (context, metric[1]) msg['From'] = sender # @modified 20180524 - Task #2384: Change alerters to cc other recipients # msg['To'] = recipient msg['To'] = primary_recipient # @added 20180524 - Task #2384: Change alerters to cc other recipients # Added Cc if cc_recipients: msg['Cc'] = cc_recipients msg.attach(MIMEText(body, 'html')) # @added 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP msg.replace_header('content-transfer-encoding', 'quoted-printable') msg.attach(MIMEText(more_body, 'html')) if redis_image_data: try: # @modified 20160814 - Bug #1558: Memory leak in Analyzer # I think the buf is causing a memory leak, trying a file # buf.seek(0) # msg_plot_attachment = MIMEImage(buf.read()) # msg_plot_attachment = MIMEImage(buf.read()) try: with open(buf, 'r') as f: plot_image_data = f.read() try: os.remove(buf) except OSError: logger.error( 'error :: alert_smtp - failed to remove file - %s' % buf) logger.info(traceback.format_exc()) pass except: logger.error('error :: failed to read plot file - %s' % buf) plot_image_data = None # @added 20161124 - Branch #922: ionosphere msg_plot_attachment = MIMEImage(plot_image_data) msg_plot_attachment.add_header( 'Content-ID', '<%s>' % redis_graph_content_id) msg.attach(msg_plot_attachment) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - msg_plot_attachment - redis data done' ) except: logger.error('error :: alert_smtp - msg_plot_attachment') logger.info(traceback.format_exc()) if image_data is not None: try: msg_attachment = MIMEImage(image_data) msg_attachment.add_header('Content-ID', '<%s>' % content_id) msg.attach(msg_attachment) if settings.ENABLE_DEBUG or 
LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - msg_attachment - Graphite img source done' ) except: logger.error('error :: alert_smtp - msg_attachment') logger.info(traceback.format_exc()) except: logger.error('error :: alert_smtp - could not attach') logger.info(traceback.format_exc()) s = SMTP('127.0.0.1') try: # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Send to primary_recipient and cc_recipients # s.sendmail(sender, recipient, msg.as_string()) if cc_recipients: s.sendmail(sender, [primary_recipient, cc_recipients], msg.as_string()) else: s.sendmail(sender, primary_recipient, msg.as_string()) if settings.ENABLE_DEBUG or LOCAL_DEBUG: # logger.info('debug :: alert_smtp - message sent to %s OK' % str(recipient)) logger.info( 'debug :: alert_smtp - message sent OK to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) except: logger.info(traceback.format_exc()) # logger.error('error :: alert_smtp - could not send email to %s' % str(recipient)) logger.error( 'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) s.quit() if LOCAL_DEBUG: logger.info('debug :: alert_smtp - Memory usage after email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if redis_image_data: # buf.seek(0) # buf.write('none') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del redis_image_data objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) del raw_series del unpacker del timeseries[:] del timeseries_x[:] del timeseries_y[:] del values del datetimes[:] del msg_plot_attachment del redis_image_data # We del all variables that are floats as they become unique objects and # can result in what appears to be a memory leak, but is not, it is # just the way Python handles floats del mean del array_amin del array_amax del stdDev del sigma3 if LOCAL_DEBUG: logger.info( 'debug :: 
alert_smtp - Memory usage after del redis_image_data objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del fig object: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # @added 20160814 - Bug #1558: Memory leak in Analyzer # Issue #21 Memory leak in Analyzer - https://github.com/earthgecko/skyline/issues/21 # As per http://www.mail-archive.com/[email protected]/msg13222.html fig.clf() plt.close(fig) del fig if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after del fig object: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del other objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) del recipients[:] del body del msg del image_data del msg_attachment if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after del other objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) return
def m66(current_skyline_app, parent_pid, timeseries, algorithm_parameters): """ A time series data points are anomalous if the 6th median is 6 standard deviations (six-sigma) from the time series 6th median standard deviation and persists for x_windows, where `x_windows = int(window / 2)`. This algorithm finds SIGNIFICANT cahngepoints in a time series, similar to PELT and Bayesian Online Changepoint Detection, however it is more robust to instaneous outliers and more conditionally selective of changepoints. :param current_skyline_app: the Skyline app executing the algorithm. This will be passed to the algorithm by Skyline. This is **required** for error handling and logging. You do not have to worry about handling the argument in the scope of the custom algorithm itself, but the algorithm must accept it as the first agrument. :param parent_pid: the parent pid which is executing the algorithm, this is **required** for error handling and logging. You do not have to worry about handling this argument in the scope of algorithm, but the algorithm must accept it as the second argument. :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0], [1578920400.0, 55.0], ... 
[1580353200.0, 55.0]]`` :param algorithm_parameters: a dictionary of any required parameters for the custom_algorithm and algorithm itself for example: ``algorithm_parameters={ 'nth_median': 6, 'sigma': 6, 'window': 5, 'return_anomalies' = True, }`` :type current_skyline_app: str :type parent_pid: int :type timeseries: list :type algorithm_parameters: dict :return: True, False or Non :rtype: boolean Example CUSTOM_ALGORITHMS configuration: 'm66': { 'namespaces': [ 'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics', 'skyline.analyzer.exceptions' ], 'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/m66.py', 'algorithm_parameters': { 'nth_median': 6, 'sigma': 6, 'window': 5, 'resolution': 60, 'minimum_sparsity': 0, 'determine_duration': False, 'return_anomalies': True, 'save_plots_to': False, 'save_plots_to_absolute_dir': False, 'filename_prefix': False }, 'max_execution_time': 1.0 'consensus': 1, 'algorithms_allowed_in_consensus': ['m66'], 'run_3sigma_algorithms': False, 'run_before_3sigma': False, 'run_only_if_consensus': False, 'use_with': ['crucible', 'luminosity'], 'debug_logging': False, }, """ # You MUST define the algorithm_name algorithm_name = 'm66' # Define the default state of None and None, anomalous does not default to # False as that is not correct, False is only correct if the algorithm # determines the data point is not anomalous. The same is true for the # anomalyScore. 
anomalous = None anomalyScore = None return_anomalies = False anomalies = [] anomalies_dict = {} anomalies_dict['algorithm'] = algorithm_name realtime_analysis = False current_logger = None dev_null = None # If you wanted to log, you can but this should only be done during # testing and development def get_log(current_skyline_app): current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) return current_logger start = timer() # Use the algorithm_parameters to determine the sample_period debug_logging = None try: debug_logging = algorithm_parameters['debug_logging'] except: debug_logging = False if debug_logging: try: current_logger = get_log(current_skyline_app) current_logger.debug( 'debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % (algorithm_name, str(algorithm_parameters))) except Exception as e: # This except pattern MUST be used in ALL custom algortihms to # facilitate the traceback from any errors. The algorithm we want to # run super fast and without spamming the log with lots of errors. # But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. 
analyzer.log dev_null = e record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False del dev_null if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) # Allow the m66 parameters to be passed in the algorithm_parameters window = 6 try: window = algorithm_parameters['window'] except KeyError: window = 6 except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e nth_median = 6 try: nth_median = algorithm_parameters['nth_median'] except KeyError: nth_median = 6 except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e n_sigma = 6 try: n_sigma = algorithm_parameters['sigma'] except KeyError: n_sigma = 6 except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e resolution = 0 try: resolution = algorithm_parameters['resolution'] except KeyError: resolution = 0 except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e determine_duration = False try: determine_duration = algorithm_parameters['determine_duration'] except KeyError: determine_duration = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e minimum_sparsity = 0 try: minimum_sparsity = algorithm_parameters['minimum_sparsity'] except KeyError: minimum_sparsity = 0 except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e shift_to_start_of_window = True try: shift_to_start_of_window = algorithm_parameters[ 
'shift_to_start_of_window'] except KeyError: shift_to_start_of_window = True except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e save_plots_to = False try: save_plots_to = algorithm_parameters['save_plots_to'] except KeyError: save_plots_to = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e save_plots_to_absolute_dir = False try: save_plots_to_absolute_dir = algorithm_parameters[ 'save_plots_to_absolute_dir'] except KeyError: save_plots_to_absolute_dir = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e filename_prefix = False try: filename_prefix = algorithm_parameters['filename_prefix'] except KeyError: filename_prefix = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e if debug_logging: current_logger.debug('debug :: algorithm_parameters :: %s' % (str(algorithm_parameters))) return_anomalies = False try: return_anomalies = algorithm_parameters['return_anomalies'] except KeyError: return_anomalies = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e try: realtime_analysis = algorithm_parameters['realtime_analysis'] except KeyError: realtime_analysis = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e save_plots_to = False try: save_plots_to = algorithm_parameters['save_plots_to'] except KeyError: save_plots_to = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e save_plots_to_absolute_dir = False try: save_plots_to_absolute_dir = algorithm_parameters[ 
'save_plots_to_absolute_dir'] except KeyError: save_plots_to = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e filename_prefix = False try: filename_prefix = algorithm_parameters['filename_prefix'] except KeyError: filename_prefix = False except Exception as e: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) dev_null = e try: base_name = algorithm_parameters['base_name'] except Exception as e: # This except pattern MUST be used in ALL custom algortihms to # facilitate the traceback from any errors. The algorithm we want to # run super fast and without spamming the log with lots of errors. # But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. analyzer.log record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False dev_null = e del dev_null if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (False, None, anomalies) return (False, None) if debug_logging: current_logger.debug('debug :: %s :: base_name - %s' % (algorithm_name, str(base_name))) anomalies_dict['metric'] = base_name anomalies_dict['anomalies'] = {} use_bottleneck = True if save_plots_to: use_bottleneck = False if use_bottleneck: import bottleneck as bn # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except try: start_preprocessing = timer() # INFO: Sorting time series of 10079 data points took 0.002215 seconds timeseries = sorted(timeseries, key=lambda x: x[0]) if debug_logging: current_logger.debug('debug :: %s :: time series of length - %s' % (algorithm_name, str(len(timeseries)))) # Testing the data 
to ensure it meets minimum requirements, in the case # of Skyline's use of the m66 algorithm this means that: # - the time series must have at least 75% of its full_duration do_not_use_sparse_data = False if current_skyline_app == 'luminosity': do_not_use_sparse_data = True if minimum_sparsity == 0: do_not_use_sparse_data = False total_period = 0 total_datapoints = 0 calculate_variables = False if do_not_use_sparse_data: calculate_variables = True if determine_duration: calculate_variables = True if calculate_variables: try: start_timestamp = int(timeseries[0][0]) end_timestamp = int(timeseries[-1][0]) total_period = end_timestamp - start_timestamp total_datapoints = len(timeseries) except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called, exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error( 'error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % (algorithm_name)) timeseries = [] if not timeseries: if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) if current_skyline_app == 'analyzer': # Default for analyzer at required period to 18 hours period_required = int(FULL_DURATION * 0.75) else: # Determine from timeseries if total_period < FULL_DURATION: period_required = int(FULL_DURATION * 0.75) else: period_required = int(total_period * 0.75) if determine_duration: period_required = int(total_period * 0.75) if do_not_use_sparse_data: # If the time series does not 
have 75% of its full_duration it does # not have sufficient data to sample try: if total_period < period_required: if debug_logging: current_logger.debug( 'debug :: %s :: time series does not have sufficient data' % (algorithm_name)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called, exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error( 'error :: debug_logging :: %s :: falied to determine if time series has sufficient data' % (algorithm_name)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) # If the time series does not have 75% of its full_duration # datapoints it does not have sufficient data to sample # Determine resolution from the last 30 data points # INFO took 0.002060 seconds if not resolution: resolution_timestamps = [] metric_resolution = False for metric_datapoint in timeseries[-30:]: timestamp = int(metric_datapoint[0]) resolution_timestamps.append(timestamp) timestamp_resolutions = [] if resolution_timestamps: last_timestamp = None for timestamp in resolution_timestamps: if last_timestamp: resolution = timestamp - last_timestamp timestamp_resolutions.append(resolution) last_timestamp = timestamp else: last_timestamp = timestamp try: del resolution_timestamps except: pass if 
timestamp_resolutions: try: timestamp_resolutions_count = Counter( timestamp_resolutions) ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common( ) metric_resolution = int( ordered_timestamp_resolutions_count[0][0]) except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called, exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error( 'error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % (algorithm_name)) try: del timestamp_resolutions except: pass else: metric_resolution = resolution minimum_datapoints = None if metric_resolution: minimum_datapoints = int(period_required / metric_resolution) if minimum_datapoints: if total_datapoints < minimum_datapoints: if debug_logging: current_logger.debug( 'debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % (algorithm_name, str(minimum_datapoints), str(total_datapoints))) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) # Is the time series fully populated? 
# full_duration_datapoints = int(full_duration / metric_resolution) total_period_datapoints = int(total_period / metric_resolution) # minimum_percentage_sparsity = 95 minimum_percentage_sparsity = 90 sparsity = int(total_datapoints / (total_period_datapoints / 100)) if sparsity < minimum_percentage_sparsity: if debug_logging: current_logger.debug( 'debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % (algorithm_name, str(minimum_percentage_sparsity), str(sparsity))) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) if len(set(item[1] for item in timeseries)) == 1: if debug_logging: current_logger.debug( 'debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name) anomalous = False anomalyScore = 0.0 if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) end_preprocessing = timer() preprocessing_runtime = end_preprocessing - start_preprocessing if debug_logging: current_logger.debug( 'debug :: %s :: preprocessing took %.6f seconds' % (algorithm_name, preprocessing_runtime)) if not timeseries: if debug_logging: current_logger.debug('debug :: %s :: m66 not run as no data' % (algorithm_name)) anomalies = [] if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) if debug_logging: current_logger.debug('debug :: %s :: timeseries length: %s' % (algorithm_name, str(len(timeseries)))) anomalies_dict['timestamp'] = int(timeseries[-1][0]) anomalies_dict['from_timestamp'] = int(timeseries[0][0]) start_analysis = timer() try: # bottleneck is 
used because it is much faster # pd dataframe method (1445 data point - 24hrs): took 0.077915 seconds # bottleneck method (1445 data point - 24hrs): took 0.005692 seconds # numpy and pandas rolling # 2021-07-30 12:37:31 :: 2827897 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 136.93 seconds # 2021-07-30 12:44:53 :: 2855884 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 148.82 seconds # 2021-07-30 12:48:41 :: 2870822 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 145.62 seconds # 2021-07-30 12:55:00 :: 2893634 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 139.00 seconds # 2021-07-30 12:59:31 :: 2910443 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 144.80 seconds # 2021-07-30 13:02:31 :: 2922928 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 143.35 seconds # 2021-07-30 14:12:56 :: 3132457 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 129.25 seconds # 2021-07-30 14:22:35 :: 3164370 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 125.72 seconds # 2021-07-30 14:28:24 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 222.43 seconds # 2021-07-30 14:33:45 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 244.00 seconds # 2021-07-30 14:36:27 :: 3214047 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 141.10 seconds # numpy and bottleneck # 2021-07-30 16:41:52 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 73.92 seconds # 2021-07-30 16:46:46 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 68.84 seconds # 2021-07-30 16:51:48 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 70.55 seconds # numpy and bottleneck (passing resolution and not calculating in m66) # 2021-07-30 16:57:46 :: 3643253 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 65.59 seconds if use_bottleneck: if 
len(timeseries) < 10: if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) x_np = np.asarray([x[1] for x in timeseries]) # Fast Min-Max scaling data = (x_np - x_np.min()) / (x_np.max() - x_np.min()) # m66 - calculate to nth_median median_count = 0 while median_count < nth_median: median_count += 1 rolling_median_s = bn.move_median(data, window=window) median = rolling_median_s.tolist() data = median if median_count == nth_median: break # m66 - calculate the moving standard deviation for the # nth_median array rolling_std_s = bn.move_std(data, window=window) std_nth_median_array = np.nan_to_num(rolling_std_s, copy=False, nan=0.0, posinf=None, neginf=None) std_nth_median = std_nth_median_array.tolist() if debug_logging: current_logger.debug( 'debug :: %s :: std_nth_median calculated with bn' % (algorithm_name)) else: df = pd.DataFrame(timeseries, columns=['date', 'value']) df['date'] = pd.to_datetime(df['date'], unit='s') datetime_index = pd.DatetimeIndex(df['date'].values) df = df.set_index(datetime_index) df.drop('date', axis=1, inplace=True) original_df = df.copy() # MinMax scale df = (df - df.min()) / (df.max() - df.min()) # window = 6 data = df['value'].tolist() if len(data) < 10: if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) # m66 - calculate to nth_median median_count = 0 while median_count < nth_median: median_count += 1 s = pd.Series(data) rolling_median_s = s.rolling(window).median() median = rolling_median_s.tolist() data = median if median_count == nth_median: break # m66 - calculate the moving standard deviation for the # nth_median array s = pd.Series(data) rolling_std_s = s.rolling(window).std() nth_median_column = 'std_nth_median_%s' % str(nth_median) 
df[nth_median_column] = rolling_std_s.tolist() std_nth_median = df[nth_median_column].fillna(0).tolist() # m66 - calculate the standard deviation for the entire nth_median # array metric_stddev = np.std(std_nth_median) std_nth_median_n_sigma = [] anomalies_found = False for value in std_nth_median: # m66 - if the value in the 6th median array is > six-sigma of # the metric_stddev the datapoint is anomalous if value > (metric_stddev * n_sigma): std_nth_median_n_sigma.append(1) anomalies_found = True else: std_nth_median_n_sigma.append(0) std_nth_median_n_sigma_column = 'std_median_%s_%s_sigma' % ( str(nth_median), str(n_sigma)) if not use_bottleneck: df[std_nth_median_n_sigma_column] = std_nth_median_n_sigma anomalies = [] # m66 - only label anomalous if the n_sigma triggers are persisted # for (window / 2) if anomalies_found: current_triggers = [] for index, item in enumerate(timeseries): if std_nth_median_n_sigma[index] == 1: current_triggers.append(index) else: if len(current_triggers) > int(window / 2): for trigger_index in current_triggers: # Shift the anomaly back to the beginning of the # window if shift_to_start_of_window: anomalies.append( timeseries[(trigger_index - (window * int( (nth_median / 2))))]) else: anomalies.append(timeseries[trigger_index]) current_triggers = [] # Process any remaining current_triggers if len(current_triggers) > int(window / 2): for trigger_index in current_triggers: # Shift the anomaly back to the beginning of the # window if shift_to_start_of_window: anomalies.append( timeseries[(trigger_index - (window * int( (nth_median / 2))))]) else: anomalies.append(timeseries[trigger_index]) if not anomalies: anomalous = False if anomalies: anomalous = True anomalies_data = [] anomaly_timestamps = [int(item[0]) for item in anomalies] for item in timeseries: if int(item[0]) in anomaly_timestamps: anomalies_data.append(1) else: anomalies_data.append(0) if not use_bottleneck: df['anomalies'] = anomalies_data anomalies_list = [] for ts, 
value in timeseries: if int(ts) in anomaly_timestamps: anomalies_list.append([int(ts), value]) anomalies_dict['anomalies'][int(ts)] = value if anomalies and save_plots_to: try: from adtk.visualization import plot metric_dir = base_name.replace('.', '/') timestamp_dir = str(int(timeseries[-1][0])) save_path = '%s/%s/%s/%s' % (save_plots_to, algorithm_name, metric_dir, timestamp_dir) if save_plots_to_absolute_dir: save_path = '%s' % save_plots_to anomalies_dict['file_path'] = save_path save_to_file = '%s/%s.%s.png' % (save_path, algorithm_name, base_name) if filename_prefix: save_to_file = '%s/%s.%s.%s.png' % ( save_path, filename_prefix, algorithm_name, base_name) save_to_path = os_path_dirname(save_to_file) title = '%s\n%s - median %s %s-sigma persisted (window=%s)' % ( base_name, algorithm_name, str(nth_median), str(n_sigma), str(window)) if not os_path_exists(save_to_path): try: mkdir_p(save_to_path) except Exception as e: current_logger.error( 'error :: %s :: failed to create dir - %s - %s' % (algorithm_name, save_to_path, e)) if os_path_exists(save_to_path): try: plot(original_df['value'], anomaly=df['anomalies'], anomaly_color='red', title=title, save_to_file=save_to_file) if debug_logging: current_logger.debug( 'debug :: %s :: plot saved to - %s' % (algorithm_name, save_to_file)) anomalies_dict['image'] = save_to_file except Exception as e: current_logger.error( 'error :: %s :: failed to plot - %s - %s' % (algorithm_name, base_name, e)) anomalies_file = '%s/%s.%s.anomalies_list.txt' % ( save_path, algorithm_name, base_name) with open(anomalies_file, 'w') as fh: fh.write(str(anomalies_list)) # os.chmod(anomalies_file, mode=0o644) data_file = '%s/data.txt' % (save_path) with open(data_file, 'w') as fh: fh.write(str(anomalies_dict)) except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, 
anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except Exception as e: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error( 'error :: %s :: failed to plot or save anomalies file - %s - %s' % (algorithm_name, base_name, e)) try: del df except: pass except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error( 'error :: debug_logging :: %s :: failed to run on ts' % (algorithm_name)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) end_analysis = timer() analysis_runtime = end_analysis - start_analysis if debug_logging: current_logger.debug( 'debug :: analysis with %s took %.6f seconds' % (algorithm_name, analysis_runtime)) if anomalous: anomalyScore = 1.0 else: anomalyScore = 0.0 if debug_logging: current_logger.info( '%s :: anomalous - %s, anomalyScore - %s' % (algorithm_name, str(anomalous), str(anomalyScore))) if debug_logging: end = timer() processing_runtime = end - start current_logger.info('%s :: completed in %.6f seconds' % (algorithm_name, processing_runtime)) try: del timeseries except: pass if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if 
return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except SystemExit as e: if debug_logging: current_logger.debug( 'debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % (algorithm_name, e)) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore) except StopIteration: # This except pattern MUST be used in ALL custom algortihms to # facilitate the traceback from any errors. The algorithm we want to # run super fast and without spamming the log with lots of errors. # But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. analyzer.log if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (False, None, anomalies) return (False, None) except: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (False, None, anomalies) return (False, None) if current_skyline_app == 'webapp': return (anomalous, anomalyScore, anomalies, anomalies_dict) if return_anomalies: return (anomalous, anomalyScore, anomalies) return (anomalous, anomalyScore)
def ionosphere_echo(base_name, mirage_full_duration):
    """
    Create any missing settings.FULL_DURATION (echo) features profiles for a
    Mirage metric, based on the metric's existing, enabled and validated
    Mirage features profiles.

    :param base_name: the metric base_name
    :param mirage_full_duration: the Mirage full_duration that the metric's
        parent features profiles were created at (compared against the DB
        ``full_duration`` column)
    :type base_name: str
    :type mirage_full_duration: int
    :return: None - the function logs and returns early on any failure
    :rtype: None

    NOTE(review): this function relies on module-level names that are not
    visible in this chunk (``skyline_app``, ``skyline_app_logger``,
    ``settings``, ``get_engine``, ``get_memcache_metric_object``,
    ``echo_get_metric_from_metrics``, ``ionosphere_table_meta``, ``select``,
    ``desc``, ``mkdir_p``, ``calculate_features_profile``,
    ``create_features_profile``, ``time``, ``sleep``, ``glob``, ``shutil``,
    ``os``, ``logging``, ``traceback``) - confirm against the full module.
    """
    logger = logging.getLogger(skyline_app_logger)
    child_process_pid = os.getpid()
    context = 'ionosphere_echo'
    logger.info('ionosphere_echo :: started with child_process_pid - %s for %s' % (str(child_process_pid), base_name))
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    try:
        # Allow for 3 seconds
        ionosphere_echo_max_fp_create_time = (settings.IONOSPHERE_ECHO_MAX_FP_CREATE_TIME - 3)
    except:
        # Fall back to a default runtime budget if the setting is not defined
        ionosphere_echo_max_fp_create_time = 52
    echo_started_at = int(time())

    def echo_get_an_engine():
        # Thin wrapper around get_engine that logs and returns a
        # (engine, fail_msg, trace) tuple, with engine as None on failure
        try:
            engine, fail_msg, trace = get_engine(skyline_app)
            return engine, fail_msg, trace
        except:
            trace = traceback.format_exc()
            logger.error('%s' % trace)
            fail_msg = 'error :: ionosphere_echo :: get_an_engine :: failed to get MySQL engine'
            logger.error('%s' % fail_msg)
            return None, fail_msg, trace

    def echo_engine_disposal(engine):
        # Best-effort disposal of the MySQL engine; returns True when the
        # engine was disposed of (or there was none), False on any error
        try:
            if engine:
                try:
                    engine.dispose()
                    logger.info('ionosphere_echo :: MySQL engine disposed of')
                    return True
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: ionosphere_echo :: calling engine.dispose()')
            else:
                logger.info('ionosphere_echo :: no MySQL engine to dispose of')
                return True
        except:
            return False
        return False

    # Determine the metric details from the database
    metrics_id = None
    metric_db_object = None
    engine = None

    # Get the metric db object data to memcache it is exists
    metric_db_object = get_memcache_metric_object(skyline_app, base_name)
    if metric_db_object:
        metrics_id = metric_db_object['id']
        logger.info('ionosphere_echo :: determined metric id %s from memcache for %s' % (str(metrics_id), base_name))
    else:
        # Only if no memcache data
        # Get a MySQL engine
        try:
            engine, log_msg, trace = echo_get_an_engine()
            logger.info('ionosphere_echo :: %s' % log_msg)
            logger.info('ionosphere_echo :: determining metric id from DB as not found in memcache for %s' % (base_name))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: could not get a MySQL engine to get metric_db_object')
        if not engine:
            logger.error('error :: ionosphere_echo :: engine not obtained to get metric_db_object')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
            return
        try:
            metrics_id, metric_db_object = echo_get_metric_from_metrics(base_name, engine)
            echo_engine_disposal(engine)
        except:
            # No return here - the metrics_id/metric_db_object guards below
            # handle the failure path and return
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: failed get the metric details from the database')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
    if not metrics_id:
        logger.error('error :: ionosphere_echo :: failed get the metrics_id from the database')
        logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
        echo_engine_disposal(engine)
        return
    if not metric_db_object:
        logger.error('error :: ionosphere_echo :: failed get the metric_db_object from the database')
        logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
        echo_engine_disposal(engine)
        return

    # Determine the metric fp ids from the database
    if not engine:
        # The memcache path above does not create an engine, so one may be
        # needed here for the fp_ids query
        logger.info('ionosphere_echo :: getting MySQL engine to determine fp ids for metric id %s - %s' % (str(metrics_id), base_name))
        try:
            engine, log_msg, trace = echo_get_an_engine()
            logger.info(log_msg)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: could not get a MySQL engine to get fp_ids')
    if not engine:
        logger.error('error :: ionosphere_echo :: engine not obtained to get fp_ids')
        return
    try:
        ionosphere_table, log_msg, trace = ionosphere_table_meta(skyline_app, engine)
        logger.info(log_msg)
        logger.info('ionosphere_echo :: ionosphere_table OK')
    except:
        # NOTE(review): execution continues after this failure and
        # ionosphere_table would be unbound in the select below - that is
        # caught by the next try/except which then returns
        logger.error(traceback.format_exc())
        logger.error('error :: ionosphere_echo :: failed to get ionosphere_table meta for %s' % base_name)

    # Determine the fp_ids that exist for the metric
    echo_fp_ids_result = []
    try:
        connection = engine.connect()
        # Newest features profiles first
        stmt = select([ionosphere_table]).where(ionosphere_table.c.metric_id == metrics_id).order_by(desc(ionosphere_table.c.id))
        echo_fp_ids = connection.execute(stmt)
        echo_fp_ids_result = [{column: value for column, value in rowproxy.items()} for rowproxy in echo_fp_ids]
        connection.close()
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: ionosphere_echo :: could not determine fp ids from DB for %s' % base_name)
        return
    if engine:
        echo_engine_disposal(engine)
    if not echo_fp_ids_result:
        logger.error('error :: ionosphere_echo :: no echo_fp_ids_result - could not determine fp ids from DB for %s' % base_name)
    else:
        logger.info('ionosphere_echo :: echo_fp_ids_result - determined fp ids from DB for %s' % base_name)
        try:
            db_fp_count = len(echo_fp_ids_result)
            logger.info('ionosphere_echo :: %s features profile ids found' % str(db_fp_count))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: could not calculate len of echo_fp_ids_result')

    # Filter down to the enabled, not deleted, validated Mirage features
    # profiles at the metric's mirage_full_duration
    echo_enabled_mirage_fp_ids = []
    for row in echo_fp_ids_result:
        if row['enabled'] != 1:
            continue
        if row['deleted'] == 1:
            continue
        # Only create features profiles at settings.FULL_DURATION for Mirage
        # features profiles if the fp has been validated
        if row['validated'] == 0:
            continue
        if row['full_duration'] == int(mirage_full_duration):
            fp_id = row['id']
            # @added 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Feature #1960: ionosphere_layers
            # Any features profiles that were created before the introduction of
            # ionosphere_layers will not have a mirage.redis.24h.json file as the
            # creation of these resources in the training_data dir was only added at
            # that point, so they are excluded. Less than 20170307 1488844800 excl.
            # @modified 20190508 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            # Feature #1960: ionosphere_layers
            # Increased the date as some on the cusp were still erroring, move
            # forward 2 days to 20170309
            # if int(row['anomaly_timestamp']) < 1488844800:
            if int(row['anomaly_timestamp']) < 1489017600:
                logger.info('ionosphere_echo :: skipping fp id %s as predates having a mirage.redis json file' % str(fp_id))
                continue
            echo_enabled_mirage_fp_ids.append(fp_id)
    echo_enabled_mirage_fp_ids_count = len(echo_enabled_mirage_fp_ids)
    logger.info('ionosphere_echo :: %s Mirage features profile ids found' % str(echo_enabled_mirage_fp_ids_count))

    # Check which Mirage features profiles do not have a
    # settings.FULL_DURATION features profile
    mirage_fd_fp_count = 0
    echo_create_fd_fp_for = []
    for validated_mirage_fp_id in echo_enabled_mirage_fp_ids:
        mirage_fd_fp_exists = False
        for row in echo_fp_ids_result:
            # A FULL_DURATION echo fp records the Mirage fp as its parent
            if int(row['parent_id']) != int(validated_mirage_fp_id):
                continue
            if int(row['full_duration']) == int(settings.FULL_DURATION):
                mirage_fd_fp_exists = True
                mirage_fd_fp_count += 1
        if not mirage_fd_fp_exists:
            echo_create_fd_fp_for.append(int(validated_mirage_fp_id))
    del echo_enabled_mirage_fp_ids
    logger.info('ionosphere_echo :: there are %s FULL_DURATION features profiles for %s' % (str(mirage_fd_fp_count), base_name))
    echo_create_fd_fp_for_count = len(echo_create_fd_fp_for)
    logger.info('ionosphere_echo :: %s FULL_DURATION features profiles to be created for %s' % (str(echo_create_fd_fp_for_count), base_name))
    echo_created_fp_count = 0

    # @added 20190404 - Bug #2904: Initial Ionosphere echo load and Ionosphere feedback
    # Feature #2484: FULL_DURATION feature profiles
    # Rate limit the creation of ionosphere_echo FULL_DURATION features profiles
    # this only effects managing the creation of lots of features profiles if
    # Ionosphere echo is enabled on a Skyline instance with lots of existing
    # features profiles for Mirage metrics. Only create 5 FULL_DURATION
    # features profiles from the latest Mirage based features profiles, which
    # takes around 10 seconds
    if echo_create_fd_fp_for_count > 5:
        logger.info('ionosphere_echo :: rate limiting and only creating the 5 least FULL_DURATION features profiles for %s' % (base_name))
        # Reverse, newest first, using slicing to produce a reversed copy
        reverse_echo_create_fd_fp_for = echo_create_fd_fp_for[::-1]
        rate_limit_echo_create_fd_fp_for = reverse_echo_create_fd_fp_for[0:5]
        echo_create_fd_fp_for = rate_limit_echo_create_fd_fp_for
    last_created_fp = int(time())
    for mirage_fp_id in echo_create_fd_fp_for:
        # Look up the anomaly timestamp and generation of the parent Mirage fp
        fp_timestamp = None
        for row in echo_fp_ids_result:
            if int(row['id']) != int(mirage_fp_id):
                continue
            else:
                fp_timestamp = int(row['anomaly_timestamp'])
                fp_generation = int(row['generation'])
        if not fp_timestamp:
            continue
        if not fp_generation:
            fp_generation = 0
        # Honour the overall runtime budget for this work item
        time_check_now = int(time())
        echo_runtime = time_check_now - echo_started_at
        if echo_runtime >= ionosphere_echo_max_fp_create_time:
            logger.info('ionosphere_echo :: ionosphere_echo running for %s seconds, exiting before IONOSPHERE_ECHO_MAX_FP_CREATE_TIME of %s seconds is breached' % (str(echo_runtime), str(ionosphere_echo_max_fp_create_time)))
            break
        logger.info('ionosphere_echo :: creating FULL_DURATION features profile based on data from fp id %s - %s' % (str(mirage_fp_id), base_name))
        context = 'ionosphere_echo'
        ionosphere_job = 'learn_fp_human'
        generation = fp_generation + 1
        fp_learn = False
        # What is the path of the features profile files
        metric_timeseries_dir = base_name.replace('.', '/')
        metric_fp_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
            str(fp_timestamp))
        # What is the path of the new training data dir to copy the files to
        # and create it
        created_ts = int(time())
        # Ensure features profile creation timestamps do not overlap
        if last_created_fp == created_ts:
            sleep(1)
            created_ts = int(time())
        last_created_fp = created_ts
        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, str(created_ts),
            metric_timeseries_dir)
        if not os.path.exists(metric_training_data_dir):
            try:
                mkdir_p(metric_training_data_dir)
                logger.info('ionosphere_echo :: training data dir created - %s' % metric_training_data_dir)
            except:
                logger.error('error :: ionosphere_echo :: failed to create training data dir - %s' % metric_training_data_dir)
                continue
        if not os.path.isdir(metric_fp_data_dir):
            logger.error('error :: ionosphere_echo :: features profile data dir does not exist - %s' % metric_fp_data_dir)
            continue
        data_files = []
        try:
            glob_path = '%s/*.*' % metric_fp_data_dir
            data_files = glob.glob(glob_path)
        except:
            trace = traceback.format_exc()
            logger.error('%s' % trace)
            logger.error('error :: ionosphere_echo :: glob could not read - %s' % metric_fp_data_dir)
        # Make a list of the files to copy
        copy_files = []
        for i_file in data_files:
            # Exclude all the file resources related to the Mirage
            # full_duration features profile
            if 'matched.fp_id' in i_file:
                continue
            if 'fp.created.txt' in i_file:
                continue
            if 'fp.details.txt' in i_file:
                continue
            if 'csv.features.transposed.csv' in i_file:
                continue
            # Specifically include the required files
            if 'graphite_now' in i_file:
                copy_files.append(i_file)
            echo_metric_txt = '%s.txt' % base_name
            if echo_metric_txt in i_file:
                copy_files.append(i_file)
            echo_metric_json = '%s.json' % base_name
            if echo_metric_json in i_file:
                copy_files.append(i_file)
            if 'mirage.graphite' in i_file:
                copy_files.append(i_file)
            if 'mirage.redis' in i_file:
                copy_files.append(i_file)

        # Copy the required files to the new training_data dir
        for i_file in copy_files:
            copying_filename = os.path.basename(i_file)
            dest_file = '%s/%s' % (metric_training_data_dir, copying_filename)
            if not os.path.isfile(dest_file):
                try:
                    shutil.copy(i_file, metric_training_data_dir)
                    # logger.info('ionosphere_echo :: training data copied - %s' % (i_file))
                except shutil.Error as e:
                    trace = traceback.format_exc()
                    logger.error('%s' % trace)
                    logger.error('error :: ionosphere_echo :: shutil error - training data not copied to %s' % metric_training_data_dir)
                    logger.error('error :: ionosphere_echo :: %s' % (e))
                # Any error saying that the directory doesn't exist
                except OSError as e:
                    trace = traceback.format_exc()
                    logger.error('%s' % trace)
                    logger.error('error :: ionosphere_echo :: OSError error - training data not copied to %s' % metric_training_data_dir)
                    logger.error('error :: %s' % (e))
        # Reuse an already calculated features file if one was copied over,
        # otherwise calculate features from the mirage.redis ts json
        calculated_feature_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % (metric_training_data_dir, base_name)
        calculated_feature_file_found = False
        fp_csv = None
        if os.path.isfile(calculated_feature_file):
            calculated_feature_file_found = True
            fp_csv = calculated_feature_file
            logger.info('ionosphere_echo :: calculated features file is available - %s' % (calculated_feature_file))
        echo_json_file = '%s.mirage.redis.%sh.json' % (base_name, str(full_duration_in_hours))
        if not calculated_feature_file_found:
            logger.info('ionosphere_echo :: calculating features from mirage.redis data ts json - %s' % (echo_json_file))
            str_created_ts = str(created_ts)
            try:
                fp_csv, successful, fp_exists, fp_id, log_msg, traceback_format_exc, f_calc = calculate_features_profile(skyline_app, str_created_ts, base_name, context)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: ionosphere_echo :: failed to calculate features')
                continue
        else:
            logger.info('ionosphere_echo :: using available calculated features file')
        if os.path.isfile(calculated_feature_file):
            logger.info('ionosphere_echo :: calculated features - %s' % (calculated_feature_file))
        else:
            logger.error('error :: ionosphere_echo :: failed to calculate features no file found - %s' % calculated_feature_file)
            continue

        # Create the new settings.FULL_DURATION features profile
        ionosphere_job = 'learn_fp_human'
        # @added 20190503 - Branch #2646: slack
        # Added slack_ionosphere_job
        slack_ionosphere_job = ionosphere_job
        fp_learn = False
        try:
            # @modified 20190503 - Branch #2646: slack
            # Added slack_ionosphere_job
            # fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn)
            # @modified 20190922 - Feature #3230: users DB table
            # Ideas #2476: Label and relate anomalies
            # Feature #2516: Add label to features profile
            # Added user_id
            # fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn, slack_ionosphere_job, user_id)
            user_id = 1
            # @modified 20191030 - Feature #2516: Add label to features profile
            # Added missing label
            label = 'echo'
            fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn, slack_ionosphere_job, user_id, label)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: failed to create a settings.FULL_DURATION features profile from fp_id %s for %s' % (str(mirage_fp_id), base_name))
            continue
        if not fp_in_successful:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: create_features_profile failed')
            continue
        else:
            echo_created_fp_count += 1
            logger.info(
                'ionosphere_echo :: new generation %s features profile with id %s settings.FULL_DURATION created from parent feature profile with id %s' % (
                    str(generation), str(fp_id), str(mirage_fp_id)))
    del echo_create_fd_fp_for
    if engine:
        echo_engine_disposal(engine)
    logger.info('ionosphere_echo :: created %s of %s FULL_DURATION features profile that were to be created for %s' % (str(echo_created_fp_count), str(echo_create_fd_fp_for_count), base_name))
    return
def run(self):
    """
    Called when the process initializes.

    Main daemon loop: manage the log lock handed over by bin/<skyline_app>.d,
    then loop forever - verify Redis is up, report this app as up via a Redis
    key, (in debug mode) verify MySQL, ensure this host is registered in the
    hosts table, poll the Panorama check dir for anomaly check files and spawn
    spin_process workers to insert them, terminating any worker that runs
    longer than the timeout.

    Determine if what is known in the Skyline DB
    blah
    """

    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    # If the logwait sentinel file exists, remove it to signal this process
    # has taken over logging.
    if os.path.isfile(skyline_app_logwait):
        try:
            logger.info('removing %s' % skyline_app_logwait)
            os.remove(skyline_app_logwait)
        except OSError:
            logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)
            pass

    # Wait up to 5 seconds for bin/<skyline_app>.d to remove the log lock
    # file, polling every 0.1s; setting now past log_wait_for exits the loop.
    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('starting %s run' % skyline_app)
    if os.path.isfile(skyline_app_loglock):
        # The init script did not clean up its lock in time - remove it
        # ourselves so logging can proceed.
        logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os.remove(skyline_app_loglock)
            logger.info('log lock file removed')
        except OSError:
            logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('bin/%s.d log management done' % skyline_app)

    # See if I am known in the DB, if so, what are my variables
    # self.populate mysql
    # What is my host id in the Skyline panorama DB?
    #   - if not known - INSERT hostname INTO hosts
    # What are the known apps?
    #   - if returned make a dictionary
    # What are the known algorithms?
    #   - if returned make a dictionary

    while 1:
        now = time()

        # Make sure Redis is up; on failure sleep, reconnect and restart the
        # loop iteration.
        try:
            self.redis_conn.ping()
            if ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: connected to Redis')
        except:
            logger.error('error :: cannot connect to redis at socket path %s' % (
                settings.REDIS_SOCKET_PATH))
            sleep(30)
            self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            continue

        # Report app up - a 120 second TTL key acts as a heartbeat for other
        # Skyline apps monitoring this one.
        try:
            self.redis_conn.setex(skyline_app, 120, now)
            logger.info('updated Redis key for %s up' % skyline_app)
        except:
            logger.error('error :: failed to update Redis key for %s up' % skyline_app)

        if ENABLE_PANORAMA_DEBUG:
            # Make sure mysql is available - block here retrying every 30s
            # until a SHOW TABLES query succeeds.
            mysql_down = True
            while mysql_down:
                query = 'SHOW TABLES'
                results = self.mysql_select(query)
                if results:
                    mysql_down = False
                    logger.info('debug :: tested database query - OK')
                else:
                    logger.error('error :: failed to query database')
                    sleep(30)

        if ENABLE_PANORAMA_DEBUG:
            # Exercise a SELECT against the test table purely for debugging.
            try:
                query = 'SELECT id, test FROM test'
                result = self.mysql_select(query)
                logger.info('debug :: tested mysql SELECT query - OK')
                logger.info('debug :: result: %s' % str(result))
                logger.info('debug :: result[0]: %s' % str(result[0]))
                logger.info('debug :: result[1]: %s' % str(result[1]))
                # Works
                # 2016-06-10 19:07:23 :: 4707 :: result: [(1, u'test1')]
            except:
                logger.error(
                    'error :: mysql error - %s' % traceback.print_exc())
                logger.error('error :: failed to SELECT')

        # self.populate the database metatdata tables
        # What is my host id in the Skyline panorama DB?
        host_id = False
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        query = 'select id FROM hosts WHERE host=\'%s\'' % this_host  # nosec
        results = self.mysql_select(query)
        if results:
            host_id = results[0][0]
            logger.info('host_id: %s' % str(host_id))
        else:
            logger.info('failed to determine host id of %s' % this_host)

        #   - if not known - INSERT hostname INTO host
        if not host_id:
            logger.info('inserting %s into hosts table' % this_host)
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query = 'insert into hosts (host) VALUES (\'%s\')' % this_host  # nosec
            host_id = self.mysql_insert(query)
            if host_id:
                logger.info('new host_id: %s' % str(host_id))

        # Without a host id anomalies cannot be attributed - retry the whole
        # loop after 30s.
        if not host_id:
            logger.error(
                'error :: failed to determine populate %s into the hosts table' % this_host)
            sleep(30)
            continue

        # Like loop through the panorama dir and see if anyone has left you
        # any work, etc

        # Make sure check_dir exists and has not been removed
        # NOTE(review): os.path.exists returns False rather than raising for a
        # missing path, so this except branch looks unreachable and the dir
        # may never be created here - confirm mkdir_p happens elsewhere.
        try:
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: checking check dir exists - %s' % settings.PANORAMA_CHECK_PATH)
            os.path.exists(settings.PANORAMA_CHECK_PATH)
        except:
            logger.error('error :: check dir did not exist - %s' % settings.PANORAMA_CHECK_PATH)
            mkdir_p(settings.PANORAMA_CHECK_PATH)
            logger.info('check dir created - %s' % settings.PANORAMA_CHECK_PATH)
            os.path.exists(settings.PANORAMA_CHECK_PATH)
            # continue

        """
        Determine if any metric has been added to add
        """
        # Poll the check dir until at least one check file appears, sleeping
        # 20s between empty polls; the second listing re-checks before break.
        while True:
            metric_var_files = False
            try:
                metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))]
            except:
                logger.error('error :: failed to list files in check dir')
                logger.info(traceback.format_exc())

            if not metric_var_files:
                logger.info('sleeping 20 no metric check files')
                sleep(20)

            # Discover metric anomalies to insert
            metric_var_files = False
            try:
                metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))]
            except:
                logger.error('error :: failed to list files in check dir')
                logger.info(traceback.format_exc())

            if metric_var_files:
                break

        # Process the oldest (lexicographically first) check file.
        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = '%s/%s' % (settings.PANORAMA_CHECK_PATH, str(metric_var_files_sorted[0]))
        logger.info('assigning anomaly for insertion - %s' % str(metric_var_files_sorted[0]))

        # Spawn processes
        pids = []
        spawned_pids = []
        pid_count = 0
        now = time()
        for i in range(1, settings.PANORAMA_PROCESSES + 1):
            try:
                p = Process(target=self.spin_process, args=(i, metric_check_file))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.PANORAMA_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)
            except:
                logger.error('error :: to start spin_process')
                logger.info(traceback.format_exc())
                continue

        # Send wait signal to zombie processes
        # for p in pids:
        #     p.join()

        # Self monitor processes and terminate if any spin_process has run
        # for longer than CRUCIBLE_TESTS_TIMEOUT
        # (here a hard-coded 20 second budget; poll liveness every 0.1s)
        p_starts = time()
        while time() - p_starts <= 20:
            if any(p.is_alive() for p in pids):
                # Just to avoid hogging the CPU
                sleep(.1)
            else:
                # All the processes are done, break now.
                time_to_run = time() - p_starts
                logger.info(
                    '%s :: %s spin_process/es completed in %.2f seconds' % (
                        skyline_app, str(settings.PANORAMA_PROCESSES), time_to_run))
                break
        else:
            # We only enter this if we didn't 'break' above.
            # Timeout path: kill workers and move the check file to the
            # failed checks dir so it is not retried forever.
            logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
            for p in pids:
                p.terminate()
                # p.join()

            # Derive <timestamp> and <metric.name> from the check file name
            # (format: <timestamp>.<metric.name>.txt) to build the failed
            # check dir path failed_checks_dir/<metric/name>/<timestamp>.
            check_file_name = os.path.basename(str(metric_check_file))
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: check_file_name - %s' % check_file_name)
            check_file_timestamp = check_file_name.split('.', 1)[0]
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp))
            check_file_metricname_txt = check_file_name.split('.', 1)[1]
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt)
            check_file_metricname = check_file_metricname_txt.replace('.txt', '')
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: check_file_metricname - %s' % check_file_metricname)
            check_file_metricname_dir = check_file_metricname.replace('.', '/')
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir)

            metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp)

            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))

        # Reap any workers still alive before the next loop iteration.
        for p in pids:
            if p.is_alive():
                logger.info('%s :: stopping spin_process - %s' % (skyline_app, str(p.is_alive())))
                p.join()
def create_features_profile(current_skyline_app, requested_timestamp, data_for_metric, context, ionosphere_job, fp_parent_id, fp_generation, fp_learn): """ Add a features_profile to the Skyline ionosphere database table. :param current_skyline_app: Skyline app name :param requested_timestamp: The timestamp of the dir that the features profile data is in :param data_for_metric: The base_name of the metric :param context: The context of the caller :param ionosphere_job: The ionosphere_job name related to creation request valid jobs are ``learn_fp_human``, ``learn_fp_generation``, ``learn_fp_learnt`` and ``learn_fp_automatic``. :param fp_parent_id: The id of the parent features profile that this was learnt from, 0 being an original human generated features profile :param fp_generation: The number of generations away for the original human generated features profile, 0 being an original human generated features profile. :param fp_learn: Whether Ionosphere should learn at use_full_duration_days :type current_skyline_app: str :type requested_timestamp: int :type data_for_metric: str :type context: str :type ionosphere_job: str :type fp_parent_id: int :type fp_generation: int :type fp_learn: boolean :return: fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc :rtype: str, boolean, boolean, str, str """ current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) base_name = data_for_metric.replace(settings.FULL_NAMESPACE, '', 1) if context == 'training_data': log_context = 'training data' ionosphere_learn_job = 'learn_fp_human' if context == 'features_profiles': log_context = 'features profile data' # @added 20170113 - Feature #1854: Ionosphere learn if context == 'ionosphere_learn': log_context = 'learn' current_logger.info('create_features_profile :: %s :: requested for %s at %s' % ( context, str(base_name), str(requested_timestamp))) metric_timeseries_dir = base_name.replace('.', '/') if context 
== 'training_data': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp), metric_timeseries_dir) if context == 'features_profiles': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir, str(requested_timestamp)) # @added 20170113 - Feature #1854: Ionosphere learn if context == 'ionosphere_learn': # @modified 20170116 - Feature #1854: Ionosphere learn # Allowing ionosphere_learn to create a features profile for a training # data set that it has learnt is not anomalous if ionosphere_job != 'learn_fp_automatic': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_LEARN_FOLDER, str(requested_timestamp), metric_timeseries_dir) else: metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp), metric_timeseries_dir) features_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % ( metric_training_data_dir, base_name) features_profile_dir = '%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir) ts_features_profile_dir = '%s/%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir, str(requested_timestamp)) features_profile_created_file = '%s/%s.%s.fp.created.txt' % ( metric_training_data_dir, str(requested_timestamp), base_name) features_profile_details_file = '%s/%s.%s.fp.details.txt' % ( metric_training_data_dir, str(requested_timestamp), base_name) anomaly_check_file = '%s/%s.txt' % (metric_training_data_dir, base_name) trace = 'none' fail_msg = 'none' new_fp_id = False calculated_with_tsfresh = False calculated_time = False fcount = None fsum = None # @added 20170104 - Feature #1842: Ionosphere - Graphite now graphs # Added the ts_full_duration parameter so that the appropriate graphs can be # embedded for the user in the training data page ts_full_duration = '0' if context == 'ionosphere_learn': if not path.isfile(features_profile_details_file): current_logger.error('error :: 
create_features_profile :: no features_profile_details_file - %s' % features_profile_details_file) return 'none', False, False, fail_msg, trace if path.isfile(features_profile_details_file): current_logger.info('create_features_profile :: getting features profile details from from - %s' % features_profile_details_file) # Read the details file with open(features_profile_details_file, 'r') as f: fp_details_str = f.read() fp_details = literal_eval(fp_details_str) calculated_with_tsfresh = fp_details[1] calculated_time = str(fp_details[2]) fcount = str(fp_details[3]) fsum = str(fp_details[4]) try: ts_full_duration = str(fp_details[5]) except: current_logger.error('error :: create_features_profile :: could not determine the full duration from - %s' % features_profile_details_file) ts_full_duration = '0' if context != 'ionosphere_learn': if ts_full_duration == '0': if path.isfile(anomaly_check_file): current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file) # Read the details file with open(anomaly_check_file, 'r') as f: anomaly_details = f.readlines() for i, line in enumerate(anomaly_details): if 'full_duration' in line: _ts_full_duration = '%s' % str(line).split("'", 2) full_duration_array = literal_eval(_ts_full_duration) ts_full_duration = str(int(full_duration_array[1])) current_logger.info('create_features_profile :: determined the full duration as - %s' % str(ts_full_duration)) if path.isfile(features_profile_created_file): # Read the created file with open(features_profile_created_file, 'r') as f: fp_created_str = f.read() fp_created = literal_eval(fp_created_str) new_fp_id = fp_created[0] return str(new_fp_id), True, True, fail_msg, trace # Have data if path.isfile(features_file): current_logger.info('create_features_profile :: features_file exists: %s' % features_file) else: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: 
features_file does not exist: %s' % features_file current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webbapp I believe to provide traceback to user in UI raise else: return False, False, False, fail_msg, trace features_data = [] with open(features_file, 'rb') as fr: reader = csv.reader(fr, delimiter=',') for i, line in enumerate(reader): feature_name_item = False fname_id = False f_value = False feature_name = str(line[0]) feature_name_item = filter( lambda x: x[1] == feature_name, TSFRESH_FEATURES) if feature_name_item: feature_name_id = feature_name_item[0] if feature_name_item: feature_name_list = feature_name_item[0] fname_id = int(feature_name_list[0]) f_value = str(line[1]) if fname_id and f_value: features_data.append([fname_id, f_value]) # @added 20170113 - Feature #1854: Ionosphere learn - generations # Set the learn generations variables with the IONOSPHERE_LEARN_DEFAULT_ and any # settings.IONOSPHERE_LEARN_NAMESPACE_CONFIG values. These will later be # overridden by any database values determined for the specific metric if # they exist. 
# Set defaults use_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS) valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS) max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS) max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN) try: use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(current_skyline_app, base_name) learn_full_duration_days = use_full_duration_days except: current_logger.error(traceback.format_exc()) current_logger.error('error :: create_features_profile :: failed to get_ionosphere_learn_details') current_logger.info('create_features_profile :: learn_full_duration_days :: %s days' % (str(learn_full_duration_days))) current_logger.info('create_features_profile :: valid_learning_duration :: %s seconds' % (str(valid_learning_duration))) current_logger.info('create_features_profile :: max_generations :: %s' % (str(max_generations))) current_logger.info('create_features_profile :: max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin))) current_logger.info('create_features_profile :: getting MySQL engine') try: engine, fail_msg, trace = fp_create_get_an_engine(current_skyline_app) current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: could not get a MySQL engine' current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webbapp I believe to provide traceback to user in UI raise else: return False, False, False, fail_msg, trace if not engine: trace = 'none' fail_msg = 'error :: create_features_profile :: engine not obtained' current_logger.error(fail_msg) if context == 'training' or context == 'features_profile': # Raise to webbapp I believe to provide traceback to 
user in UI raise else: return False, False, False, fail_msg, trace # Get metric details from the database metrics_id = False # Use the learn details as per config metric_learn_full_duration_days = int(use_full_duration_days) metric_learn_valid_ts_older_than = int(valid_learning_duration) metric_max_generations = int(max_generations) metric_max_percent_diff_from_origin = int(max_percent_diff_from_origin) metrics_table = None try: metrics_table, fail_msg, trace = metrics_table_meta(current_skyline_app, engine) current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to get metrics_table meta for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace current_logger.info('create_features_profile :: metrics_table OK') metric_db_object = None try: connection = engine.connect() # @modified 20161209 - - Branch #922: ionosphere # Task #1658: Patterning Skyline Ionosphere # result = connection.execute('select id from metrics where metric=\'%s\'' % base_name) # for row in result: # while not metrics_id: # metrics_id = row['id'] stmt = select([metrics_table]).where(metrics_table.c.metric == base_name) result = connection.execute(stmt) for row in result: metrics_id = row['id'] # @added 20170113 - Feature #1854: Ionosphere learn - generations # Added Ionosphere LEARN generation related variables try: metric_learn_full_duration_days = int(row['learn_full_duration_days']) metric_learn_valid_ts_older_than = 
int(row['learn_valid_ts_older_than']) metric_max_generations = int(row['max_generations']) metric_max_percent_diff_from_origin = float(row['max_percent_diff_from_origin']) except: current_logger.error('error :: create_features_profile :: failed to determine learn related values from DB for %s' % base_name) row = result.fetchone() # metric_db_object = row connection.close() current_logger.info('create_features_profile :: determined db metric id: %s' % str(metrics_id)) current_logger.info('create_features_profile :: determined db metric learn_full_duration_days: %s' % str(metric_learn_full_duration_days)) current_logger.info('create_features_profile :: determined db metric learn_valid_ts_older_than: %s' % str(metric_learn_valid_ts_older_than)) current_logger.info('create_features_profile :: determined db metric max_generations: %s' % str(metric_max_generations)) current_logger.info('create_features_profile :: determined db metric max_percent_diff_from_origin: %s' % str(metric_max_percent_diff_from_origin)) except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: could not determine id of metric from DB: %s' % base_name current_logger.error('%s' % fail_msg) if metric_learn_full_duration_days: learn_full_duration_days = metric_learn_full_duration_days # learn_full_duration = int(learn_full_duration_days) * 86400 if metric_learn_valid_ts_older_than: learn_valid_ts_older_than = metric_learn_valid_ts_older_than if metric_max_generations: max_generations = metric_max_generations if metric_max_percent_diff_from_origin: max_percent_diff_from_origin = metric_max_percent_diff_from_origin current_logger.info('create_features_profile :: generation info - learn_full_duration_days :: %s' % (str(learn_full_duration_days))) current_logger.info('create_features_profile :: generation info - learn_valid_ts_older_than :: %s' % (str(learn_valid_ts_older_than))) current_logger.info('create_features_profile :: generation info - 
max_generations :: %s' % (str(max_generations))) current_logger.info('create_features_profile :: generation info - max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin))) # @added 20170120 - Feature #1854: Ionosphere learn # Always use the timestamp from the anomaly file use_anomaly_timestamp = int(requested_timestamp) if context == 'ionosphere_learn': if path.isfile(anomaly_check_file): current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file) # Read the details file with open(anomaly_check_file, 'r') as f: anomaly_details = f.readlines() for i, line in enumerate(anomaly_details): if 'metric_timestamp' in line: _metric_timestamp = '%s' % str(line).split("'", 2) metric_timestamp_array = literal_eval(_metric_timestamp) use_anomaly_timestamp = (int(metric_timestamp_array[1])) current_logger.info('create_features_profile :: determined the anomaly metric_timestamp as - %s' % str(use_anomaly_timestamp)) ionosphere_table = None try: ionosphere_table, fail_msg, trace = ionosphere_table_meta(current_skyline_app, engine) current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to get ionosphere_table meta for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webbapp I believe to provide traceback to user in UI # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace current_logger.info('create_features_profile :: ionosphere_table OK') # @added 20170403 - Feature #2000: Ionosphere - validated # Set all learn_fp_human features profiles to 
validated. fp_validated = 0 if ionosphere_job == 'learn_fp_human': fp_validated = 1 # @added 20170424 - Feature #2000: Ionosphere - validated # Set all generation 0 and 1 as validated if int(fp_generation) <= 1: fp_validated = 1 new_fp_id = False try: connection = engine.connect() # @added 20170113 - Feature #1854: Ionosphere learn # Added learn values parent_id, generation # @modified 20170120 - Feature #1854: Ionosphere learn # Added anomaly_timestamp # @modified 20170403 - Feature #2000: Ionosphere - validated ins = ionosphere_table.insert().values( metric_id=int(metrics_id), full_duration=int(ts_full_duration), anomaly_timestamp=int(use_anomaly_timestamp), enabled=1, tsfresh_version=str(tsfresh_version), calc_time=calculated_time, features_count=fcount, features_sum=fsum, parent_id=fp_parent_id, generation=fp_generation, validated=fp_validated) result = connection.execute(ins) connection.close() new_fp_id = result.inserted_primary_key[0] current_logger.info('create_features_profile :: new ionosphere fp_id: %s' % str(new_fp_id)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to insert a new record into the ionosphere table for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace if not RepresentsInt(new_fp_id): trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: unknown new ionosphere new_fp_id for %s' % base_name current_logger.error('%s' % fail_msg) if context == 
'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace # Create z_fp_<metric_id> table fp_table_created = False fp_table_name = 'z_fp_%s' % str(metrics_id) try: fp_meta = MetaData() # @modified 20161222 - Task #1812: z_fp table type # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean # up, there can be LOTS of file_per_table z_fp_ tables/files without # the MyISAM issues. z_fp_ tables are mostly read and will be shuffled # in the table cache as required. fp_metric_table = Table( fp_table_name, fp_meta, Column('id', Integer, primary_key=True), Column('fp_id', Integer, nullable=False, key='fp_id'), Column('feature_id', Integer, nullable=False), Column('value', DOUBLE(), nullable=True), mysql_charset='utf8', mysql_key_block_size='255', mysql_engine='InnoDB') fp_metric_table.create(engine, checkfirst=True) fp_table_created = True except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to create table - %s' % fp_table_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: %s - automated so the table should exists continuing' % context) if not fp_table_created: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to 
determine True for create table - %s' % fp_table_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: %s - automated so the table should exists continuing' % context) # Insert features and values insert_statement = [] for fname_id, f_value in features_data: insert_statement.append({'fp_id': new_fp_id, 'feature_id': fname_id, 'value': f_value},) if insert_statement == []: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: empty insert_statement for %s inserts' % fp_table_name current_logger.error('%s' % fail_msg) # raise # else: # feature_count = sum(1 for x in a if isinstance(x, insert_statement)) # current_logger.info( # 'fp_id - %s - %s feature values in insert_statement for %s ' % # (str(feature_count), str(new_fp_id), fp_table_name)) # feature_count = sum(1 for x in a if isinstance(x, insert_statement)) # current_logger.info( # 'fp_id - %s - feature values in insert_statement for %s ' % # (str(new_fp_id), fp_table_name)) try: connection = engine.connect() connection.execute(fp_metric_table.insert(), insert_statement) connection.close() current_logger.info('create_features_profile :: fp_id - %s - feature values inserted into %s' % (str(new_fp_id), fp_table_name)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to insert a feature values into %s' % fp_table_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise 
to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: %s - automated so the table should exists continuing' % context) # Create metric ts table if not exists ts_<metric_id> # Create z_ts_<metric_id> table # @modified 20170121 - Feature #1854: Ionosphere learn - generations # TODO Adding the option to not save timeseries to DB, as default? # ts_table_created = False ts_table_name = 'z_ts_%s' % str(metrics_id) try: ts_meta = MetaData() # @modified 20161222 - Task #1812: z_fp table type # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean # up, there can be LOTS of file_per_table z_fp_ tables/files without # the MyISAM issues. z_fp_ tables are mostly read and will be shuffled # in the table cache as required. ts_metric_table = Table( ts_table_name, ts_meta, Column('id', Integer, primary_key=True), Column('fp_id', Integer, nullable=False, key='fp_id'), Column('timestamp', Integer, nullable=False), Column('value', DOUBLE(), nullable=True), mysql_charset='utf8', mysql_key_block_size='255', mysql_engine='InnoDB') ts_metric_table.create(engine, checkfirst=True) # ts_table_created = True current_logger.info('create_features_profile :: metric ts table created OK - %s' % (ts_table_name)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to create table - %s' % ts_table_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: %s - automated so the table should exists continuing' % context) # Insert timeseries that the features profile was created from raw_timeseries = [] anomaly_json = '%s/%s.json' % 
(metric_training_data_dir, base_name) if path.isfile(anomaly_json): current_logger.info('create_features_profile :: metric anomaly json found OK - %s' % (anomaly_json)) try: # Read the timeseries json file with open(anomaly_json, 'r') as f: raw_timeseries = f.read() except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: failed to read timeseries data from %s' % anomaly_json current_logger.error('%s' % (fail_msg)) fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json # end = timer() if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: trace = 'none' fail_msg = 'error: file not found - %s' % (anomaly_json) current_logger.error(fail_msg) # raise # Convert the timeseries to csv timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']') timeseries = literal_eval(timeseries_array_str) datapoints = timeseries validated_timeseries = [] for datapoint in datapoints: try: new_datapoint = [str(int(datapoint[0])), float(datapoint[1])] validated_timeseries.append(new_datapoint) # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests except: # nosec continue insert_statement = [] for ts, value in validated_timeseries: insert_statement.append({'fp_id': new_fp_id, 'timestamp': ts, 'value': value},) try: connection = engine.connect() connection.execute(ts_metric_table.insert(), insert_statement) connection.close() current_logger.info('create_features_profile :: fp_id - %s - timeseries inserted into %s' % (str(new_fp_id), ts_table_name)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to insert the timeseries into %s' % ts_table_name 
current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) raise else: current_logger.info('create_features_profile :: %s - automated so the table should exists continuing' % context) # Create a created features profile file try: # data = '[%s, %s, ]' % (new_fp_id, str(int(time.time()))) # write_data_to_file(skyline_app, features_profile_created_file, 'w', data) # @modified 20170115 - Feature #1854: Ionosphere learn - generations # Added parent_id and generation data = '[%s, %s, \'%s\', %s, %s, %s, %s, %s, %s]' % ( new_fp_id, str(int(time.time())), str(tsfresh_version), str(calculated_time), str(fcount), str(fsum), str(ts_full_duration), str(fp_parent_id), str(fp_generation)) write_data_to_file(current_skyline_app, features_profile_created_file, 'w', data) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to write fp.created file' current_logger.error('%s' % fail_msg) # Set ionosphere_enabled for the metric try: # update_statement = 'UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id) connection = engine.connect() # result = connection.execute('UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id)) # connection.execute(ts_metric_table.insert(), insert_statement) connection.execute( metrics_table.update( metrics_table.c.id == metrics_id).values(ionosphere_enabled=1)) connection.close() current_logger.info('create_features_profile :: ionosphere_enabled set on metric id: %s' % str(metrics_id)) except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: could not update metrics table and set ionosphere_enabled on id %s' % str(metrics_id) current_logger.error('%s' % fail_msg) # raise # Copy data from training data dir to 
features_profiles dir if not path.isdir(ts_features_profile_dir): mkdir_p(ts_features_profile_dir) if path.isdir(ts_features_profile_dir): current_logger.info('create_features_profile :: fp_id - %s - features profile dir created - %s' % (str(new_fp_id), ts_features_profile_dir)) # src_files = os.listdir(src) # for file_name in src_files: # full_file_name = path.join(src, file_name) # if (path.isfile(full_file_name)): # shutil.copy(full_file_name, dest) data_files = [] try: glob_path = '%s/*.*' % metric_training_data_dir data_files = glob.glob(glob_path) except: trace = traceback.format_exc() current_logger.error('%s' % trace) current_logger.error('error :: create_features_profile :: glob - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir)) for i_file in data_files: try: shutil.copy(i_file, ts_features_profile_dir) current_logger.info('create_features_profile :: fp_id - %s - training data copied - %s' % (str(new_fp_id), i_file)) except shutil.Error as e: trace = traceback.format_exc() current_logger.error('%s' % trace) current_logger.error('error :: create_features_profile :: shutil error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir)) current_logger.error('error :: create_features_profile :: %s' % (e)) # Any error saying that the directory doesn't exist except OSError as e: trace = traceback.format_exc() current_logger.error('%s' % trace) current_logger.error('error :: create_features_profile :: OSError error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir)) current_logger.error('error :: create_features_profile :: %s' % (e)) current_logger.info('create_features_profile :: fp_id - %s - training data copied to %s' % (str(new_fp_id), ts_features_profile_dir)) else: current_logger.error('error :: create_features_profile :: fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir)) 
current_logger.info('create_features_profile :: disposing of any engine') try: if engine: fp_create_engine_disposal(current_skyline_app, engine) else: current_logger.info('create_features_profile :: no engine to dispose of' % (str(new_fp_id), ts_features_profile_dir)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) current_logger.error('error :: create_features_profile :: OSError error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir)) # @added 20170113 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace # Ionosphere learn needs Redis works sets # When a features profile is created there needs to be work added to a Redis # set. When a human makes a features profile, we want Ionosphere to make a # use_full_duration_days features profile valid_learning_duration (e.g. # 3361) later. if settings.IONOSPHERE_LEARN and new_fp_id: create_redis_work_item = False if context == 'training_data' and ionosphere_job == 'learn_fp_human': create_redis_work_item = True # @modified 20170120 - Feature #1854: Ionosphere learn - generations # Added fp_learn parameter to allow the user to not learn the # use_full_duration_days if not fp_learn: create_redis_work_item = False current_logger.info('fp_learn is False not adding an item to Redis ionosphere.learn.work set') if ionosphere_job == 'learn_fp_automatic': create_redis_work_item = True # @added 20170131 - Feature #1886 Ionosphere learn - child like parent with evolutionary maturity # TODO: here a check may be required to evaluate whether the origin_fp_id # had a use_full_duration features profile created, however # due to the fact that it is in learn, suggests that it did # have, not 100% sure. 
origin_fp_id_was_allowed_to_learn = False child_use_full_duration_count_of_origin_fp_id = 1 # TODO: Determine the state # child_use_full_duration_count_of_origin_fp_id = SELECT COUNT(id) FROM ionosphere WHERE parent_id=origin_fp_id AND full_duration=use_full_duration if child_use_full_duration_count_of_origin_fp_id == 0: current_logger.info('the origin parent was not allowed to learn not adding to Redis ionosphere.learn.work set') create_redis_work_item = False if create_redis_work_item: try: current_logger.info( 'adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % ( str(ionosphere_job), str(requested_timestamp), base_name, str(new_fp_id), str(fp_generation))) redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) redis_conn.sadd('ionosphere.learn.work', ['Soft', str(ionosphere_job), int(requested_timestamp), base_name, int(new_fp_id), int(fp_generation)]) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: failed adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % ( str(ionosphere_job), str(requested_timestamp), base_name, str(new_fp_id), str(fp_generation))) # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) return str(new_fp_id), True, False, fail_msg, trace
def submit_crucible_job(from_timestamp, until_timestamp, metrics_list, namespaces_list, source, alert_interval, user_id, user, add_to_panorama, pad_timeseries, training_data_json, run_algorithms):
    """
    Get a list of all the metrics passed and generate Crucible check files for
    each.

    For every resolved metric this writes two identical metric-vars text files:
    an anomaly file inside the per-metric job directory and a check file in
    ``settings.CRUCIBLE_CHECK_PATH`` (the check file is what the Crucible
    daemon picks up and processes).

    :param from_timestamp: the timestamp at which to start the time series
    :param until_timestamp: the timestamp at which to end the time series
    :param metrics_list: a list of metric names to analyse
    :param namespaces_list: a list of metric namespaces to analyse (resolved to
        metric names via a SQL ``LIKE`` query against the metrics table)
    :param source: the source webapp making the request
    :param alert_interval: the alert_interval at which Crucible should trigger
        anomalies
    :param user_id: the user id of the user making the request
    :param user: the username making the request
        (NOTE(review): not referenced in this function body; presumably kept
        for interface compatibility — confirm with callers)
    :param add_to_panorama: whether Crucible should add Skyline CONSENSUS
        anomalies to Panorama
    :param pad_timeseries: the amount of data to pad the time series with,
        one of 'auto', '86400', '604800' or '0' (passed as str)
    :param training_data_json: the full path to the training_data json file if
        source is training_data
    :param run_algorithms: list of algorithms to run
    :type from_timestamp: int
    :type until_timestamp: int
    :type metrics_list: list
    :type namespaces_list: list
    :type source: str
    :type alert_interval: int
    :type user_id: int
    :type user: str
    :type add_to_panorama: boolean
    :type pad_timeseries: str
    :type training_data_json: str
    :type run_algorithms: list
    :return: (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)
        where crucible_job_id is a str ('YmdHMS.user_id'),
        metrics_submitted_to_process is a list of metric names whose check
        files were written, and fail_msg/trace carry the last error context
        (or None if no step failed)
    :rtype: tuple

    :raises: re-raises any exception from job id generation, job directory
        creation or metric name resolution so the webapp can surface the
        traceback in the UI; per-metric file-write failures are logged and
        skipped instead.
    """
    fail_msg = None
    trace = None
    crucible_job_id = None
    # Re-assigned to a list once the job setup has succeeded; remains 0 only
    # if an early step raises.
    metrics_submitted_to_process = 0

    # Generate a job id based on the YMDHMS.user_id and a job directory
    try:
        jobid_timestamp = int(time())
        jobid_datetimestamp = dt.datetime.fromtimestamp(
            jobid_timestamp).strftime('%Y%m%d%H%M%S')
        crucible_job_id = '%s.%s' % (str(jobid_datetimestamp), str(user_id))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: failed to determine a crucible_job_id')
        raise  # to webapp to return in the UI

    # Create the job directory for this crucible_job_id under
    # <parent of CRUCIBLE_DATA_FOLDER>/jobs/
    try:
        crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER)
        crucible_job_dir = '%s/jobs/%s' % (crucible_path, crucible_job_id)
        if not path.exists(crucible_job_dir):
            logger.info('creating crucible job directory - %s' % (
                str(crucible_job_dir)))
            mkdir_p(crucible_job_dir)
    except:
        trace = traceback.format_exc()
        fail_msg = 'error :: failed to create the crucible job directory'
        logger.error(trace)
        logger.error(fail_msg)
        raise  # to webapp to return in the UI

    # TODO added checks of metric names
    metric_names = []
    if metrics_list:
        logger.info('submit_crucible_job :: %s metrics passed' % str(len(metrics_list)))
        for metric in metrics_list:
            metric_names.append(metric)

    # TODO added checks of metric namespaces, harder to do, but so that the UI
    # errors to the usr rather than sending a bad or non-existent metric to
    # Crucible
    if namespaces_list:
        logger.info('submit_crucible_job :: %s namespaces passed' % str(len(namespaces_list)))
        logger.info(
            'submit_crucible_job :: determine metrics for submit_crucible_job between %s and %s' % (
                str(from_timestamp), str(until_timestamp)))
        logger.info('getting MySQL engine')
        try:
            # get_an_engine returns (engine, fail_msg, trace) rather than
            # raising on its own failure path, hence the extra `if not engine`
            # check below.
            engine, fail_msg, trace = get_an_engine()
            logger.info(fail_msg)
        except:
            trace = traceback.format_exc()
            logger.error(trace)
            logger.error('%s' % fail_msg)
            logger.error(
                'error :: could not get a MySQL engine to get metric names')
            raise  # to webapp to return in the UI
        if not engine:
            trace = 'none'
            fail_msg = 'error :: engine not obtained'
            logger.error(fail_msg)
            raise
        try:
            metrics_table, log_msg, trace = metrics_table_meta(
                skyline_app, engine)
            logger.info(log_msg)
            logger.info('metrics_table OK')
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to get metrics_table meta')
            if engine:
                engine_disposal(engine)
            raise  # to webapp to return in the UI
        # Bound-parameter LIKE query - each passed namespace is expected to
        # already carry any SQL wildcards required
        metrics_like_query = text(
            """SELECT metric FROM metrics WHERE metric LIKE :like_string""")
        for namespace in namespaces_list:
            try:
                connection = engine.connect()
                results = connection.execute(metrics_like_query, like_string=str(namespace))
                connection.close()
                for row in results:
                    metric_name = str(row[0])
                    metric_names.append(metric_name)
            except:
                trace = traceback.format_exc()
                logger.error(trace)
                logger.error(
                    'error :: could not determine metrics from metrics table')
                if engine:
                    engine_disposal(engine)
                raise
        logger.info(
            'submit_crucible_job :: %s metrics determined from passed namespaces' % str(len(metric_names)))
    logger.info('submit_crucible_job :: %s metrics to process' % str(len(metric_names)))

    metrics_submitted_to_process = []
    # No anomalous datapoint applies to an on-demand job, so value is written
    # as 0 in the check file - presumably a placeholder; TODO confirm Crucible
    # ignores it for this job type
    datapoint = 0
    triggered_algorithms = [
        'histogram_bins', 'first_hour_average', 'stddev_from_average',
        'grubbs', 'ks_test', 'mean_subtraction_cumulation',
        'median_absolute_deviation', 'stddev_from_moving_average',
        'least_squares'
    ]
    added_at = int(time())
    for base_name in metric_names:
        sane_metricname = filesafe_metricname(str(base_name))
        derivative_metric = is_derivative_metric(skyline_app, base_name)
        # Derivative (counter-type) metrics are fetched via the Graphite
        # nonNegativeDerivative function
        if derivative_metric:
            target = 'nonNegativeDerivative(%s)' % base_name
        else:
            target = base_name

        # Generate a metric job directory
        crucible_anomaly_dir = '%s/%s' % (crucible_job_dir, sane_metricname)
        try:
            if not path.exists(crucible_anomaly_dir):
                logger.info('creating crucible metric job directory - %s' % (
                    str(crucible_anomaly_dir)))
                mkdir_p(crucible_anomaly_dir)
        except:
            trace = traceback.format_exc()
            fail_msg = 'error :: failed to create the crucible metric job directory'
            logger.error(trace)
            logger.error(fail_msg)
            raise  # to webapp to return in the UI

        if source == 'graphite':
            graphite_metric = True
        else:
            graphite_metric = False

        # @added 20200422 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # In order for metrics to be analysed in Crucible like the Analyzer or
        # Mirage analysis, the time series data needs to be padded
        # Added pad_timeseries
        graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
            str(from_timestamp), str(until_timestamp), target)
        timeseries_full_duration = int(until_timestamp) - int(from_timestamp)
        pad_timeseries_with = 0
        # 'auto' pads with one resolution step: an hour for >1h ranges,
        # a day for >1d ranges
        if pad_timeseries == 'auto':
            if timeseries_full_duration > 3600:
                pad_timeseries_with = 3600
            if timeseries_full_duration > 86400:
                pad_timeseries_with = 86400
        if pad_timeseries == '86400':
            pad_timeseries_with = 86400
        if pad_timeseries == '604800':
            pad_timeseries_with = 604800
        if pad_timeseries == '0':
            pad_timeseries_with = 0
        if pad_timeseries_with:
            try:
                # Pad by extending the fetch window backwards; until stays put
                padded_from_timestamp = int(
                    from_timestamp) - pad_timeseries_with
                graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
                    str(padded_from_timestamp), str(until_timestamp), target)
                logger.info('padding time series with %s seconds - %s' % (
                    str(pad_timeseries_with),
                    str(graphite_override_uri_parameters)))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: failed to construct graphite_override_uri_parameters with pad_timeseries_with %s' % str(pad_timeseries_with))

        # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
        # Allow the user to pass algorithms to run
        algorithms = settings.ALGORITHMS
        if run_algorithms:
            algorithms = run_algorithms

        # @modified 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                      Feature #1448: Crucible web UI
        # Added add_to_panorama
        # @added 20200607 - Feature #3630: webapp - crucible_process_training_data
        # Added training_data_json
        # Build the metric-vars style check file content that the Crucible
        # daemon parses
        crucible_anomaly_data = 'metric = \'%s\'\n' \
            'value = \'%s\'\n' \
            'from_timestamp = \'%s\'\n' \
            'metric_timestamp = \'%s\'\n' \
            'algorithms = %s\n' \
            'triggered_algorithms = %s\n' \
            'anomaly_dir = \'%s\'\n' \
            'graphite_metric = %s\n' \
            'run_crucible_tests = True\n' \
            'added_by = \'%s\'\n' \
            'added_at = \'%s\'\n' \
            'graphite_override_uri_parameters = \'%s\'\n' \
            'alert_interval = \'%s\'\n' \
            'add_to_panorama = %s\n' \
            'training_data_json = %s\n' \
            % (base_name, str(datapoint), str(from_timestamp),
               # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
               # str(until_timestamp), str(settings.ALGORITHMS),
               str(until_timestamp), str(algorithms),
               triggered_algorithms, crucible_anomaly_dir,
               str(graphite_metric), skyline_app, str(added_at),
               str(graphite_override_uri_parameters), str(alert_interval),
               str(add_to_panorama), str(training_data_json))

        # Create an anomaly file with details about the anomaly
        crucible_anomaly_file = '%s/%s.txt' % (
            crucible_anomaly_dir, sane_metricname)
        try:
            write_data_to_file(
                skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data)
            logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file))
        except:
            # Best effort per metric - log and continue with the next metric
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file))

        # Create a crucible check file
        crucible_check_file = '%s/%s.%s.txt' % (
            settings.CRUCIBLE_CHECK_PATH, str(added_at), sane_metricname)
        try:
            write_data_to_file(
                skyline_app, crucible_check_file, 'w', crucible_anomaly_data)
            logger.info('added crucible check :: %s,%s' % (base_name, str(added_at)))
            # Only metrics whose check file was written count as submitted
            metrics_submitted_to_process.append(base_name)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file))

    return (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size, top_matches, max_distance, range_padding, max_area_percent_diff): """ Process a motif similarity search on demand """ import numpy as np import mass_ts as mts logger = logging.getLogger(skyline_app_logger) dev_null = None function_str = 'on_demand_motif_analysis' logger.info( '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size:%s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s' % (function_str, str(metric), str(timestamp), str(similarity), str(batch_size), str(top_matches), str(max_distance), str(range_padding), str(max_area_percent_diff))) trace = 'none' fail_msg = 'none' start = time.time() start_timer = timer() metric_vars_dict = {} metric_id = 0 fp_ids = [] timeseries = [] not_similar_enough_sample = 0 not_similar_motifs = 0 similar_motifs = 0 exact_motifs = 0 distance_motifs = 0 motifs_found = [] find_exact_matches_run = False exact_matches_found = [] fps_timeseries = {} # A motif_analysis dict to add to and return motif_analysis = {} motif_analysis[metric] = {} motif_analysis[metric]['timestamp'] = int(timestamp) motif_analysis[metric]['started'] = start motif_analysis[metric]['motifs'] = {} motif_analysis[metric]['exact_motifs'] = exact_motifs motif_analysis[metric]['similar_motifs'] = similar_motifs motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs motif_analysis[metric][ 'not_similar_enough_sample'] = not_similar_enough_sample # @added 20210417 - Feature #4014: Ionosphere - inference # Allow the user to define the batch_size per similarity search motif_analysis[metric]['batch_size'] = int(batch_size) motif_analysis[metric]['top_matches'] = int(top_matches) motif_analysis[metric]['max_distance'] = float(max_distance) # @added 20210425 - Feature #4014: Ionosphere - inference # Added max_area_percent_diff for computing the area under the curve motif_analysis[metric]['max_area_percent_diff'] = float( 
max_area_percent_diff) fps_checked_for_motifs = [] metric_dir = metric.replace('.', '/') metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir) # @added 20210418 - Feature #4014: Ionosphere - inference # Allow for the similarity search on saved_training_data if 'saved_training_data' in request.args: saved_training_data_str = request.args.get('saved_training_data', 'false') if saved_training_data_str == 'true': saved_metric_timeseries_dir = '%s_saved/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir) if path.exists(saved_metric_timeseries_dir): metric_timeseries_dir = saved_metric_timeseries_dir logger.info('%s :: using saved training_data dir - %s' % (function_str, saved_metric_timeseries_dir)) metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric) timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric) full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60) full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % ( metric_timeseries_dir, metric, str(full_duration_in_hours)) try: metric_vars_dict = mirage_load_metric_vars(skyline_app, metric_vars_file, True) except Exception as e: logger.error( 'error :: inference :: failed to load metric variables from check file - %s - %s' % (metric_vars_file, e)) if not metric_vars_dict: motif_analysis[metric]['status'] = 'error' motif_analysis[metric][ 'reason'] = 'could not load training data variables' return motif_analysis full_duration = metric_vars_dict['metric_vars']['full_duration'] # Determine the metric details from the database metric_id = 0 metric_db_object = {} try: metric_db_object = get_metrics_db_object(metric) except Exception as e: logger.error('error :: %s :: failed to get_metrics_db_object - %s' % (function_str, e)) try: metric_id = int(metric_db_object['id']) except Exception as e: logger.error( 'error :: %s :: failed to determine metric_id from metric_db_object %s - %s' % (function_str, str(metric_db_object), 
e)) metric_id = 0 if not metric_id: logger.error( 'error :: %s :: failed to get metric id for %s from the database' % (function_str, str(metric))) fail_msg = 'failed to get metric id' motif_analysis[metric]['status'] = 'error' motif_analysis[metric]['reason'] = 'could not determine metric id' return motif_analysis, fail_msg, trace # @modified 20210419 - Feature #4014: Ionosphere - inference # Create a unique dir for each batch_size max_distance # motif_images_dir = '%s/motifs' % metric_timeseries_dir motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % ( metric_timeseries_dir, str(batch_size), str(top_matches), str(max_distance)) if not path.exists(motif_images_dir): # provision motifs image resources mkdir_p(motif_images_dir) full_durations = [full_duration] if path.isfile(full_duration_timeseries_json): full_durations = [full_duration, settings.FULL_DURATION] logger.info('%s :: full_durations - %s' % (function_str, str(full_durations))) # Loop through analysis per full_duration for full_duration in full_durations: start_full_duration = timer() fp_ids = [] try: query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % ( str(metric_id), str(full_duration)) results = mysql_select(skyline_app, query) for row in results: fp_ids.append(int(row[0])) except Exception as e: logger.error( 'error :: %s :: failed to get fp ids via mysql_select from %s - %s' % (function_str, metric, e)) logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' % (function_str, (metric_id), str(full_duration), str(fp_ids))) if not fp_ids: continue # Now there are known fps, load the timeseries if full_duration == settings.FULL_DURATION: timeseries_json_file = full_duration_timeseries_json else: timeseries_json_file = timeseries_json try: with open((timeseries_json_file), 'r') as f: raw_timeseries = f.read() timeseries_array_str = str(raw_timeseries).replace('(', '[').replace( ')', ']') del 
raw_timeseries timeseries = literal_eval(timeseries_array_str) del timeseries_array_str except Exception as e: logger.error( 'error :: %s :: failed to load timeseries for %s from %s - %s' % (function_str, metric, timeseries_json_file, e)) continue anomalous_timeseries_subsequence = [] for timestamp_float, value in timeseries[-int(batch_size):]: anomalous_timeseries_subsequence.append( [int(timestamp_float), value]) logger.info( '%s :: looking for motif in trained fps of full_duration: %s' % (function_str, (full_duration))) dataset = [float(item[1]) for item in anomalous_timeseries_subsequence] max_y = max(dataset) min_y = min(dataset) # full_y_range = max_y - min_y # range_padding_percent = range_padding # This was just a test that did not have the desired results # if full_y_range < 10: # range_padding_percent = 35 # if full_y_range < 5: # range_padding_percent = 75 # if full_y_range < 2: # range_padding_percent = 100 use_range_padding = ((max_y - min_y) / 100) * range_padding if min_y > 0 and (min_y - use_range_padding) > 0: min_y_padded = min_y - use_range_padding else: min_y_padded = min_y max_y_padded = max_y + use_range_padding if min_y_padded == max_y_padded: min_y_padded = min_y_padded - ( (min_y_padded / 100) * range_padding) max_y_padded = max_y_padded + ( (max_y_padded / 100) * range_padding) # anomalous_ts = np.array(dataset) anomalous_ts = dataset mass2_batch_times = [] exact_match_times = [] nan = np.array([np.nan]) nanj = complex(0.0, float('nan')) empty_dists = np.array(nan + nanj) # plotted = False count = 0 # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0] # motifs_found = [] # exact_matches_found = [] # fps_timeseries = {} for fp_id in fp_ids: if (time.time() - start) >= 20: break # Attempt to surface the fp timeseries from memcache and/or db # @modified 20210424 - Feature #4014: Ionosphere - inference # Task #4030: refactoring fp_timeseries = None try: fp_timeseries = get_fp_timeseries(skyline_app, metric_id, fp_id) except 
Exception as e: logger.error( 'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s' % (skyline_app, str(metric_id), str(fp_id), e)) if not fp_timeseries: continue relate_dataset = [float(item[1]) for item in fp_timeseries] fps_timeseries[fp_id] = fp_timeseries current_best_indices = [] current_best_dists = [] best_indices = None best_dists = None try: logger.info( '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s' % (function_str, str(fp_id), str(full_duration), str(batch_size), str(top_matches), str(max_distance), str(len(anomalous_ts)))) # @added 20210418 - Feature #4014: Ionosphere - inference # Handle top_matches being greater than possible kth that can be found # mts.mass2_batch error: kth(=50) out of bounds (16) use_top_matches = int(top_matches) if (len(fp_timeseries) / int(batch_size)) <= int(top_matches): use_top_matches = round( len(fp_timeseries) / int(batch_size)) - 1 if use_top_matches == 2: use_top_matches = 1 logger.info( '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch' % (function_str, str(use_top_matches), str(top_matches))) start_mass2_batch = timer() # @modified 20210418 - Feature #4014: Ionosphere - inference # Handle top_matches being greater than possible kth that can be found # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches)) best_indices, best_dists = mts.mass2_batch( relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(use_top_matches)) end_mass2_batch = timer() mass2_batch_times.append((end_mass2_batch - start_mass2_batch)) current_best_indices = best_indices.tolist() current_best_dists = best_dists.tolist() # @added 20210412 - Feature #4014: Ionosphere - inference # Branch #3590: inference # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the # motif 
related columns in the ionosphere database table fps_checked_for_motifs.append(fp_id) except Exception as e: logger.error('error :: %s :: %s mts.mass2_batch error: %s' % (function_str, (fp_id), str(e))) continue try: if str(list(best_dists)) == str(list(empty_dists)): logger.info( '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s' % (function_str, (fp_id), str(list(best_dists)))) continue except Exception as e: dev_null = e if not current_best_indices[0]: continue # if list(best_indices)[0] != anomalous_index: # continue # If the best_dists is > 1 they are not very similar # if list(best_dists)[0].real > 1.0: # continue # if list(best_indices)[0] and best_dists: for index, best_dist in enumerate(current_best_dists): try: motif_added = False """ Note: mass_ts finds similar motifs NOT the same motif, the same motif will result in the best_dists being a nan+nanj So it is DIYed """ try: # @modified 20210414 - Feature #4014: Ionosphere - inference # Branch #3590: inference # Store the not anomalous motifs # motif = [fp_id, current_best_indices[index], best_dist.real] motif = [ fp_id, current_best_indices[index], best_dist.real, anomalous_timeseries_subsequence, full_duration ] except Exception as e: dev_null = e motif = [] # if list(best_indices)[0] and best_dists: # If it is greater than 1.0 it is not similar # if best_dist.real > 1.0: # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE: if best_dist.real > float(max_distance): continue else: if motif: count += 1 motifs_found.append(motif) motif_added = True if not motif_added: if best_dist == nanj: count += 1 motifs_found.append(motif) motif_added = True if not motif_added: if str(best_dist) == 'nan+nanj': count += 1 motifs_found.append([ fp_id, current_best_indices[index], 0.0, anomalous_timeseries_subsequence, full_duration ]) motif_added = True if not motif_added: if best_dist == empty_dists: count += 1 motifs_found.append(motif) motif_added = True except Exception as e: 
logger.error(traceback.format_exc()) logger.error( 'error :: %s :: could not determine is if fp id %s timeseries at index %s was a match - %s' % (function_str, str(fp_id), str(current_best_indices[index]), e)) continue # FIND EXACT MATCHES # Seeing as I cannot reproduce finding nan+nanj which represents an # exact match with mts.mass2_batch, do it DIY style - iterate the # timeseries and create a batch_size subsequence for every index and # compare the values to the anomalous_ts for an exact match. # This takes ~0.024850 seconds on a timeseries with 10079 datapoints # @modified 20210418 - Feature #4014: Ionosphere - inference # However fiding exact matches can add ~2.5 seconds on 90 minute # batch_size and with a proproptionally scaled max_distance of say 15 # finding an exact match in a longer sequence is less important, # the greater the batch_size the most likely greater the variability # and the chance of an exact match decreases. So save 2.5 seconds. # UNLESS # At a 5 (to 10) batch_size and max_distance of 1.0 an exact match # can be found. Exact matches are quite frequent and sometimes with # such little variability, similar matchs may not be found. # Therefore find find_exact_matches has its place. MASS # A CAVEAT here is that boring metrics and that change and have a # low variability even at a larger batch_size could also benefit and # possibly achieve better accruracy from the use of find_exact_matches # as they can be shapelets resemble a batch_size 5 shapelet. # It would perhaps be possible to use one or more of the features # profile tsfresh values to identify these types of shapelets, if # you knew which feature/s were most descriptive of this type of # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe) # However I predict that this method will perform worst on these # types of shapelets. # find_exact_matches = False # exact matches can be found in batch sizes of 500 and similar not # So actually always run it. 
# NOTE(review): this span is a whitespace-mangled fragment from the interior of
# a larger motif-matching function (the enclosing `def` sits above this chunk,
# outside this view).  The indentation below is reconstructed from statement
# syntax; nesting levels should be confirmed against the upstream source.
# All runtime strings are reproduced byte-for-byte (including typos such as
# "proceesing" and "where checked") because callers/log parsers may match them.

# --- Exact-match pass -------------------------------------------------------
# Brute-force scan of the related (fp) timeseries for subsequences identical
# to the anomalous subsequence.  As written, both flags are already True, so
# the batch_size < 10 branch re-asserts the same values and is a no-op.
# NOTE(review): `relate_dataset`, `fp_id` and `anomalous_ts` appear to be
# per-fp variables set by an enclosing per-fp loop outside this fragment —
# TODO confirm the loop boundary when the full function is in view.
find_exact_matches = True
find_exact_matches_run = True
if int(batch_size) < 10:
    find_exact_matches = True
    find_exact_matches_run = True
if find_exact_matches:
    try:
        start_exact_match = timer()
        # Pair every datapoint with its positional index so fixed-size
        # windows can be sliced by index below.
        indexed_relate_dataset = []
        for index, item in enumerate(relate_dataset):
            indexed_relate_dataset.append([index, item])
        last_index = indexed_relate_dataset[-1][0]
        current_index = 0
        # Slide a window of batch_size datapoints over the fp timeseries and
        # record an exact (distance 0.0) match wherever the window equals the
        # anomalous subsequence.
        while current_index < last_index:
            subsequence = [
                value for index, value in indexed_relate_dataset[current_index:(
                    current_index + int(batch_size))]
            ]
            if subsequence == anomalous_ts:
                exact_matches_found.append([
                    fp_id, current_index, 0.0,
                    anomalous_timeseries_subsequence, full_duration
                ])
                # Exact matches are also recorded in the general motifs list
                # so they flow through the same sorting/processing below.
                motifs_found.append([
                    fp_id, current_index, 0.0,
                    anomalous_timeseries_subsequence, full_duration
                ])
            current_index += 1
        end_exact_match = timer()
        exact_match_times.append(
            (end_exact_match - start_exact_match))
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: %s :: could not determine it any exact matches could be found in fp id %s timeseries - %s' %
            (function_str, str(fp_id), e))

# --- Timing summaries -------------------------------------------------------
# mass2_batch_times / exact_match_times are accumulated out of view (per-fp);
# these log lines aggregate them for the whole run.
logger.info(
    '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds' %
    (function_str, str(len(mass2_batch_times)), str(full_duration),
     sum(mass2_batch_times)))
if find_exact_matches_run:
    logger.info(
        '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds' %
        (function_str, str(len(exact_match_times)), str(full_duration),
         sum(exact_match_times)))
end_full_duration = timer()
logger.info(
    '%s :: analysed %s fps of full_duration %s in %.6f seconds' %
    (function_str, str(len(fp_ids)), str(full_duration),
     (end_full_duration - start_full_duration)))

# Patterns are sorted
# Sort ascending by distance (element [2]) so the best (closest) motifs are
# processed first.
sorted_motifs = []
motifs_found_in_fps = []
if motifs_found:
    sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
    for item in sorted_motifs:
        motifs_found_in_fps.append(item[0])
logger.info('%s :: %s motifs found' % (
    function_str, str(len(sorted_motifs))))

# --- Per-motif evaluation loop ----------------------------------------------
# Each motif is [fp_id, best_index, best_dist, motif_sequence, full_duration].
for motif in sorted_motifs:
    # Time-box the whole loop: stop evaluating motifs after ~25 seconds of
    # overall runtime (`start` is set outside this fragment).
    if (time.time() - start) >= 25:
        break
    try:
        add_match = False
        all_in_range = False
        fp_id = motif[0]
        best_index = motif[1]
        best_dist = motif[2]
        # @added 20210414 - Feature #4014: Ionosphere - inference
        # Branch #3590: inference
        # Store the not anomalous motifs
        motif_sequence = motif[3]
        motif_full_duration = motif[4]
        match_type = 'not_similar_enough'
        # Exact matches found in the pass above are accepted immediately.
        if motif in exact_matches_found:
            add_match = True
            match_type = 'exact'
            all_in_range = True
            exact_motifs += 1
        full_relate_timeseries = fps_timeseries[fp_id]
        # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
        # Slice the fp timeseries window that corresponds to this motif match.
        relate_timeseries = [
            item for index, item in enumerate(full_relate_timeseries)
            if index >= best_index and index < (best_index + int(batch_size))
        ]
        relate_dataset = [item[1] for item in relate_timeseries]
        # Range check: accept a non-exact motif only when every value of the
        # fp window falls inside the padded y-range of the anomalous window,
        # and its own max/min are within range_padding of the anomalous
        # max/min (min_y/max_y and the padded bounds are set out of view).
        if not add_match:
            all_in_range = True
            for value in relate_dataset:
                if value < min_y_padded:
                    all_in_range = False
                    break
                if value > max_y_padded:
                    all_in_range = False
                    break
            if all_in_range:
                related_max_y = max(relate_dataset)
                if related_max_y < (max_y - range_padding):
                    all_in_range = False
                if related_max_y > (max_y + range_padding):
                    all_in_range = False
                related_min_y = min(relate_dataset)
                if related_min_y < (min_y - range_padding):
                    all_in_range = False
                if related_min_y > (min_y + range_padding):
                    all_in_range = False
            if all_in_range:
                logger.info(
                    '%s :: ALL IN RANGE - all_in_range: %s, motif: %s' %
                    (function_str, str(all_in_range), str(relate_dataset[0:2])))
                add_match = True
                match_type = 'all_in_range'
                similar_motifs += 1
            # @added 20210425 - Feature #4014: Ionosphere - inference
            # Compute the area using the composite trapezoidal rule.
            motif_area = None
            fp_motif_area = None
            percent_different = None
            try:
                batch_size_dataset = [
                    float(item[1]) for item in motif_sequence
                ]
                y_motif = np.array(batch_size_dataset)
                motif_area = np.trapz(y_motif, dx=1)
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get motif_area with np.trapz - %s' %
                    (function_str, e))
            try:
                y_fp_motif = np.array(relate_dataset)
                fp_motif_area = np.trapz(y_fp_motif, dx=1)
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get fp_motif_area with np.trapz - %s' %
                    (function_str, e))
            # Determine the percentage difference (as a
            # positive value) of the areas under the
            # curves.
            if motif_area and fp_motif_area:
                percent_different = get_percent_different(
                    fp_motif_area, motif_area, True)
                if percent_different > max_area_percent_diff:
                    # Area gate: veto an all_in_range match whose area under
                    # the curve differs too much from the anomalous motif.
                    if add_match:
                        logger.info(
                            '%s :: AREA TOO DIFFERENT - not adding all_in_range match' %
                            (function_str))
                        add_match = False
                    # BUT ...
                    # A very small distance (< 3) overrides the area veto.
                    if best_dist < 3 and not add_match:
                        logger.info(
                            '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s' %
                            (function_str, str(best_dist)))
                        add_match = True
                        match_type = 'distance'
                        distance_motifs += 1
        # When similarity == 'all', keep a sample (up to 10) of rejected
        # motifs as 'not_similar_enough' so they can be inspected too.
        if similarity == 'all':
            if not add_match:
                not_similar_motifs += 1
                if not_similar_enough_sample >= 10:
                    continue
                not_similar_enough_sample += 1
                add_match = True
                match_type = 'not_similar_enough'
        if add_match:
            # Look up the fp's generation (0 => human-trained, else learnt).
            generation = 0
            fp_id_row = None
            try:
                fp_id_row = get_ionosphere_fp_db_row(
                    skyline_app, int(fp_id))
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s' %
                    (function_str, str(fp_id), e))
            if fp_id_row:
                try:
                    generation = fp_id_row['generation']
                except Exception as e:
                    logger.error(
                        'error :: %s :: failed to generation from fp_id_row for fp_id %s - %s' %
                        (function_str, str(fp_id), e))
            if generation == 0:
                generation_str = 'trained'
            else:
                generation_str = 'LEARNT'
            motif_match_types = motif_match_types_dict()
            type_id = motif_match_types[match_type]
            # Record the accepted match keyed by '<fp_id>-<index>'.
            motif_id = '%s-%s' % (str(fp_id), str(best_index))
            motif_analysis[metric]['motifs'][motif_id] = {}
            motif_analysis[metric]['motifs'][motif_id][
                'metric_id'] = metric_id
            motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
            motif_analysis[metric]['motifs'][motif_id][
                'generation'] = generation
            motif_analysis[metric]['motifs'][motif_id][
                'index'] = best_index
            motif_analysis[metric]['motifs'][motif_id][
                'distance'] = best_dist
            motif_analysis[metric]['motifs'][motif_id]['size'] = int(
                batch_size)
            motif_analysis[metric]['motifs'][motif_id][
                'max_distance'] = float(max_distance)
            motif_analysis[metric]['motifs'][motif_id][
                'timestamp'] = timestamp
            motif_analysis[metric]['motifs'][motif_id][
                'type_id'] = type_id
            motif_analysis[metric]['motifs'][motif_id][
                'type'] = match_type
            motif_analysis[metric]['motifs'][motif_id][
                'full_duration'] = motif_full_duration
            # @added 20210414 - Feature #4014: Ionosphere - inference
            # Branch #3590: inference
            # Store the not anomalous motifs
            motif_analysis[metric]['motifs'][motif_id][
                'motif_timeseries'] = anomalous_timeseries_subsequence
            motif_analysis[metric]['motifs'][motif_id][
                'motif_sequence'] = motif_sequence
            not_anomalous_timestamp = int(
                anomalous_timeseries_subsequence[-1][0])
            graph_period_seconds = not_anomalous_timestamp - int(
                anomalous_timeseries_subsequence[0][0])
            motif_analysis[metric]['motifs'][motif_id][
                'motif_period_seconds'] = graph_period_seconds
            motif_analysis[metric]['motifs'][motif_id][
                'motif_period_minutes'] = round(graph_period_seconds / 60)
            motif_analysis[metric]['motifs'][motif_id]['image'] = None
            motif_analysis[metric]['motifs'][motif_id][
                'motif_area'] = motif_area
            motif_analysis[metric]['motifs'][motif_id][
                'fp_motif_area'] = fp_motif_area
            motif_analysis[metric]['motifs'][motif_id][
                'area_percent_diff'] = percent_different
            motif_analysis[metric]['motifs'][motif_id][
                'max_area_percent_diff'] = max_area_percent_diff
            # Time-box plotting: record the match but skip the plot when the
            # overall runtime has exceeded 25 seconds.
            if (time.time() - start) >= 25:
                continue
            graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                motif_images_dir, motif_id, match_type, str(batch_size),
                str(max_distance))
            plotted_image = False
            on_demand_motif_analysis = True
            # Only plot when no plot for this motif already exists on disk.
            if not path.isfile(graph_image_file):
                plotted_image, plotted_image_file = plot_motif_match(
                    skyline_app, metric, timestamp, fp_id, full_duration,
                    generation_str, motif_id, best_index, int(batch_size),
                    best_dist, type_id, relate_dataset,
                    anomalous_timeseries_subsequence, graph_image_file,
                    on_demand_motif_analysis)
            else:
                plotted_image = True
                logger.info('%s :: plot already exists - %s' % (
                    function_str, str(graph_image_file)))
            if plotted_image:
                motif_analysis[metric]['motifs'][motif_id][
                    'image'] = graph_image_file
            else:
                logger.error('failed to plot motif match plot')
                graph_image_file = None
    except Exception as e:
        # Best-effort per motif: log and move on to the next one.
        logger.error(traceback.format_exc())
        logger.error(
            'error :: inference :: with fp id %s proceesing motif at index: %s - %s' %
            (str(fp_id), str(motif[0]), str(e)))
        continue

# --- Summary, persistence and return ----------------------------------------
end_timer = timer()
motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
motif_analysis[metric]['exact_motifs'] = exact_motifs
motif_analysis[metric]['similar_motifs'] = similar_motifs
motif_analysis[metric]['distance_motifs'] = distance_motifs
motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
motif_analysis[metric][
    'not_similar_enough_sample'] = not_similar_enough_sample
motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
    motif_images_dir, similarity, str(batch_size), str(top_matches),
    str(max_distance))
# Persist the whole analysis dict (as its str repr) next to the plot images;
# a failed write is logged and surfaced via fail_msg/trace but not raised.
try:
    write_data_to_file(skyline_app, motif_analysis_file, 'w',
                       str(motif_analysis))
except Exception as e:
    trace = traceback.format_exc()
    logger.error('%s' % trace)
    fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
        function_str, motif_analysis_file)
    logger.error('%s' % fail_msg)
    dev_null = e
motif_ids = list(motif_analysis[metric]['motifs'].keys())
logger.info(
    '%s :: %s motif matches found, %s fps where checked and motifs plotted in %.6f seconds for %s' %
    (function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
     (end_timer - start_timer), metric))
if dev_null:
    del dev_null
# NOTE(review): fail_msg and trace are initialised outside this fragment;
# on the success path they carry whatever values were set there.
return motif_analysis, fail_msg, trace