def extract(self, use_features=[]):
    x = self.__x_data_frame()
    y = self.__y_series()
    settings = ReasonableFeatureExtractionSettings()
    extracted_features = extract_features(
        x, column_id='id', feature_extraction_settings=settings)
    if len(use_features) == 0:
        impute(extracted_features)
        features_filtered = select_features(extracted_features, y)
        use_features = features_filtered.keys()
    else:
        features_filtered = extracted_features[use_features]
    keys = features_filtered.keys()
    timeseries = []
    for index, row in features_filtered.iterrows():
        values = []
        for key in keys:
            if key == 'id':
                continue
            value = row[key]
            values.append(value)
        timeseries.append(Timeseries([values]))
    return timeseries, use_features
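
# A hypothetical usage sketch (assumption, not part of the original class):
# extract() is assumed to be a method of a wrapper class that provides the
# private __x_data_frame() and __y_series() helpers, so only the calling
# pattern is shown here, in the same commented-example style used elsewhere
# in this code.
#
#   extractor = FeatureExtractor(x_data, y_data)   # hypothetical wrapper class
#   # First pass: let tsfresh select the relevant features against y
#   timeseries, use_features = extractor.extract()
#   # Later passes: reuse the previously selected feature names
#   timeseries_again, _ = extractor.extract(use_features=use_features)
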
def calculate_features_profile(current_skyline_app, timestamp, metric, context):
    """
    Calculates a tsfresh features profile from a training data set

    :param current_skyline_app: the Skyline app calling the function
    :type current_skyline_app: str
    :param timestamp: the timestamp of the metric anomaly with training data
    :type timestamp: str
    :param metric: the base_name of the metric
    :type metric: str
    :param context: the context in which the features profile is being
        calculated, e.g. training_data, features_profiles, ionosphere or
        ionosphere_learn
    :type context: str
    :return: (features_profile_csv_file_path, successful, fp_created, fp_id,
        fail_msg, traceback_format_exc, calc_time)
    :rtype: (str, boolean, boolean, int, str, str, str)
    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)

    if context == 'training_data':
        log_context = 'training data'
    if context == 'features_profiles':
        log_context = 'features profile data'
    if context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'

    current_logger.info('%s feature profile creation requested for %s at %s' % (
        log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, timestamp, timeseries_dir)
    if context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, timeseries_dir, timestamp)
    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_LEARN_FOLDER, timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # The metric check file is read and ts_full_duration is determined and
    # added to the features_profile_details_file, as it was not added here on
    # 20170104 when it was added to the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
        for i, line in enumerate(anomaly_details):
            if 'full_duration' in line:
                _ts_full_duration = '%s' % str(line).split("'", 2)
                full_duration_array = literal_eval(_ts_full_duration)
                ts_full_duration = str(int(full_duration_array[1]))

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)
    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    # anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
    # ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'

    # This is simply to stay in line with tsfresh naming conventions in their
    # docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        current_logger.info('features profile details file exists - %s' % (
            features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read from %s' % (
                features_profile_details_file))
    else:
        current_logger.info('OK no features profile details file exists - %s' % (
            features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        current_logger.info('features profile created file exists - %s' % (
            features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read fp_id from %s' % (
                features_profile_created_file))
    else:
        current_logger.info('OK no features profile created file exists - %s' % (
            features_profile_created_file))

    if os.path.isfile(t_fname_out):
        current_logger.info('transposed features already exist - %s' % (t_fname_out))
        return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()

    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error(
                'error: failed to read timeseries data from %s' % (anomaly_json))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    for ts, value in converted:
        # print('%s,%s' % (str(int(ts)), str(value)))
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)

    try:
        df = pd.read_csv(ts_csv, delimiter=',', header=None,
                         names=['metric', 'timestamp', 'value'])
        current_logger.info('DataFrame created with %s' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error: failed to create a pandas DataFrame with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # @added 20161207 - Task #1658: Patterning Skyline Ionosphere
    # Converting the DataFrame types to suit MySQL data types.
    # For anyone reviewing this code: a number of questions arise from the
    # decision to deviate from json or from storing msgpack as BLOB etc.
    # tsfresh uses csv and csv can be obtained from Graphite etc, so Skyline
    # should be able to handle csv.  How data is stored in MySQL was given
    # considerable review and thought, given that Ionosphere and Skyline in
    # general should not be limited to analysing Graphite machine metrics but
    # should handle other timeseries data sources too.
    # df['feature_name'] = df['feature_name'].astype(string)
    # df['value'] = df['value'].astype(float)

    # Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: failed to read the pandas DataFrame created with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    current_logger.info('starting extract_features with %s' % str(TSFRESH_VERSION))
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that have little
        # to no variation in the values, which results in features taking up
        # to almost 600 seconds to calculate on a timeseries of length 10075
        # (168h - 1 datapoint per 60s)
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(df, column_id='metric', column_sort='timestamp', column_kind=None, column_value=None)
        tsf_settings = ReasonableFeatureExtractionSettings()
        # Disable tqdm progress bar
        tsf_settings.disable_progressbar = True
        df_features = extract_features(
            df, column_id='metric', column_sort='timestamp', column_kind=None,
            column_value=None, feature_extraction_settings=tsf_settings)
        current_logger.info('features extracted from %s data' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: extracting features with tsfresh from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        current_logger.info(
            'feature extraction failed in %.6f seconds' % (
                end_feature_extraction - start_feature_extraction))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    current_logger.info('feature extraction took %.6f seconds' % (feature_extraction_time))

    # write to disk
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    try:
        df_t = df_features.transpose()
        current_logger.info('features transposed')
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error :: transposing tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: saving transposed tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Calculate the count and sum of the features values
    try:
        df_sum = pd.read_csv(t_fname_out, delimiter=',', header=0,
                             names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        current_logger.error('error :: failed to create DataFrame to sum')

    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to count number of features, set to 0')
        features_count = 0

    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to sum feature values, set to 0')
        features_sum = 0

    end = timer()

    current_logger.info('features saved to %s' % (fname_out))
    current_logger.info('transposed features saved to %s' % (t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('total feature profile completed in %s seconds' % str(total_calc_time))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on 20170104
        # when it was added to the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(
            current_skyline_app, features_profile_details_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: failed to write %s' % features_profile_details_file
        current_logger.error('%s' % fail_msg)

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        current_logger.info('removed the created csv - %s' % ts_csv)

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis work sets, but this was moved to
    # ionosphere_backend.py and learn.py, it is not done here
    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(calc_time)
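
# A hypothetical usage sketch (assumption, not part of the original module):
# the app name, timestamp and metric below are illustrative only and presume
# the Skyline training data layout under settings.IONOSPHERE_DATA_FOLDER.
def _example_calculate_features_profile():
    # Call in the training_data context for an illustrative metric
    fp_csv, successful, fp_created, fp_id, fail_msg, trace, f_calc = \
        calculate_features_profile(
            'webapp', '1480104000',
            'stats.statsd.graphiteStats.calculationtime', 'training_data')
    if successful:
        print('transposed features csv - %s (calc time %s)' % (fp_csv, str(f_calc)))
    return successful
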
def calculate_features_other_minmax(use_file, i_json_file, metric):

    fp_id = 'testing.feature2484'
    base_name = metric
    metric_timestamp = 'none'
    not_anomalous = False
    minmax_not_anomalous = False
    minmax = 0
    minmax_check = True

    with open(use_file, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    del raw_timeseries
    anomalous_timeseries = literal_eval(timeseries_array_str)
    anomalous_ts_values_count = len(anomalous_timeseries)

    with open(i_json_file, 'r') as f:
        fp_raw_timeseries = f.read()
    # Convert the timeseries to csv
    fp_timeseries_array_str = str(fp_raw_timeseries).replace('(', '[').replace(')', ']')
    del fp_raw_timeseries
    fp_id_metric_ts = literal_eval(fp_timeseries_array_str)
    fp_id_metric_ts_values_count = len(fp_id_metric_ts)

    try:
        range_tolerance = settings.IONOSPHERE_MINMAX_SCALING_RANGE_TOLERANCE
    except:
        range_tolerance = 0.15
    range_tolerance_percentage = range_tolerance * 100

    check_range = False
    range_similar = False
    if fp_id_metric_ts:
        if anomalous_ts_values_count > 0:
            check_range = True

    lower_range_similar = False
    upper_range_similar = False
    min_fp_value = None
    min_anomalous_value = None
    max_fp_value = None
    max_anomalous_value = None
    if check_range:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            min_fp_value = min(minmax_fp_values)
            max_fp_value = max(minmax_fp_values)
        except:
            min_fp_value = False
            max_fp_value = False
        try:
            minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
            min_anomalous_value = min(minmax_anomalous_values)
            max_anomalous_value = max(minmax_anomalous_values)
        except:
            min_anomalous_value = False
            max_anomalous_value = False
        lower_range_not_same = True
        try:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_not_same = False
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                        str(min_fp_value), str(min_anomalous_value)))
        except:
            lower_range_not_same = True
        if min_fp_value and min_anomalous_value and lower_range_not_same:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                        str(min_fp_value), str(min_anomalous_value)))
            else:
                lower_min_fp_value = int(min_fp_value - (min_fp_value * range_tolerance))
                upper_min_fp_value = int(min_fp_value + (min_fp_value * range_tolerance))
                if int(min_anomalous_value) in range(lower_min_fp_value, upper_min_fp_value):
                    lower_range_similar = True
                    print(
                        'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                            str(min_fp_value), str(min_anomalous_value),
                            str(range_tolerance_percentage)))
        if not lower_range_similar:
            print(
                'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                    str(min_fp_value), str(min_anomalous_value)))
        upper_range_not_same = True
        try:
            if int(max_fp_value) == int(max_anomalous_value):
                upper_range_not_same = False
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                        str(max_fp_value), str(max_anomalous_value)))
        except:
            upper_range_not_same = True
        if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
            # @added 20180717 - Task #2446: Optimize Ionosphere
            #                   Feature #2404: Ionosphere - fluid approximation
            # On low values such as 1 and 2, the range_tolerance should be
            # adjusted to account for the very small range. TODO
            lower_max_fp_value = int(max_fp_value - (max_fp_value * range_tolerance))
            upper_max_fp_value = int(max_fp_value + (max_fp_value * range_tolerance))
            if int(max_anomalous_value) in range(lower_max_fp_value, upper_max_fp_value):
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                        str(max_fp_value), str(max_anomalous_value),
                        str(range_tolerance_percentage)))
            else:
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                        str(max_fp_value), str(max_anomalous_value)))

    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        print(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ '
            'significantly, Min-Max scaling will be skipped')

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            print(
                'minmax_fp_ts list populated with the minmax scaled time series '
                'with %s data points' % str(len(minmax_fp_ts)))
            del minmax_fp_values
        except:
            print(
                'error :: could not minmax scale fp id %s time series for %s' % (
                    str(fp_id), str(base_name)))
    if not minmax_fp_ts:
        print('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(-14, 14):
            try:
                minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
                del anomalous_timeseries
                del minmax_anomalous_values
            except:
                print(
                    'error :: could not minmax scale current time series '
                    'anomalous_timeseries for %s - %s' % (
                        str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                print('minmax_anomalous_ts is populated with %s data points' % str(
                    len(minmax_anomalous_ts)))
            else:
                print('error :: minmax_anomalous_ts is not populated')
        else:
            print(
                'minmax scaled check will be skipped - anomalous_ts_values_count '
                'is %s and minmax_fp_ts is %s' % (
                    str(anomalous_ts_values_count), str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    if os.path.isfile(minmax_fp_ts_csv):
        os.remove(minmax_fp_ts_csv)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    if os.path.isfile(minmax_fp_fname_out):
        os.remove(minmax_fp_fname_out)

    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    if os.path.isfile(anomalous_ts_csv):
        os.remove(anomalous_ts_csv)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'
    if os.path.isfile(anomalous_fp_fname_out):
        os.remove(anomalous_fp_fname_out)

    tsf_settings = ReasonableFeatureExtractionSettings()
    tsf_settings.disable_progressbar = True

    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    if minmax_anomalous_ts and minmax_fp_ts:
        if not os.path.isfile(minmax_fp_ts_csv):
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    print('error :: could not write to file %s' % (str(minmax_fp_ts_csv)))
            del converted
        else:
            print('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            print('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            print(
                'file exists to create the minmax_fp_ts data frame from - %s' % minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv, delimiter=',', header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            print('error :: failed to create data frame from %s' % (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                df, column_id='metric', column_sort='timestamp',
                column_kind=None, column_value=None,
                feature_extraction_settings=tsf_settings)
        except:
            print('error :: failed to create df_features from %s' % (str(minmax_fp_ts_csv)))

        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)

        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out, delimiter=',', header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            print('minmax_fp_ts - features_count: %s, features_sum: %s' % (
                str(minmax_fp_features_count), str(minmax_fp_features_sum)))
            del df_sum
        except:
            print('error :: failed to create df_sum from %s' % (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            print(
                'debug :: minmax_fp_features_count of the minmax_fp_ts is %s' % str(
                    minmax_fp_features_count))
        else:
            print('error :: minmax_fp_features_count is %s' % str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)
            del converted

        df = pd.read_csv(anomalous_ts_csv, delimiter=',', header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            df, column_id='metric', column_sort='timestamp',
            column_kind=None, column_value=None,
            feature_extraction_settings=tsf_settings)
        del df

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
            del df_t
            del df_features_current

        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out, delimiter=',', header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        print(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, '
            'minmax_anomalous_features_sum: %s' % (
                str(minmax_anomalous_features_count),
                str(minmax_anomalous_features_sum)))
        del df_sum_2
        del minmax_anomalous_ts

    percent_different = 100
    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
            sums_array = np.array(
                [minmax_fp_features_sum, minmax_anomalous_features_sum],
                dtype=float)
            calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            print(
                'percent_different between minmax scaled features sums - %s' % str(
                    percent_different))
        except:
            print(
                'error :: failed to calculate percent_different from minmax '
                'scaled features sums')

        if percent_different:
            almost_equal = None
            try:
                np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                print(
                    'minmax scaled common features sums are almost equal, not anomalous')

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            # @modified 20190321
            # if percent_different < (settings.IONOSPHERE_FEATURES_PERCENT_SIMILAR + 1):
            if percent_different < IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR:
                minmax_not_anomalous = True
                # log
                print(
                    'not anomalous - minmax scaled features profile match - %s - %s' % (
                        base_name, str(minmax_not_anomalous)))
                print(
                    'minmax scaled calculated features sum are within %s percent of '
                    'fp_id %s with %s, not anomalous' % (
                        str(IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR),
                        str(fp_id), str(percent_different)))

    if minmax_not_anomalous:
        not_anomalous = True
        minmax = 1
        # Created time series resources for graphing in
        # the matched page

    try:
        clean_file = anomalous_ts_csv
        if os.path.isfile(anomalous_ts_csv):
            os.remove(anomalous_ts_csv)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_ts_csv file to clean up')
    try:
        clean_file = anomalous_fp_fname_out
        if os.path.isfile(anomalous_fp_fname_out):
            os.remove(anomalous_fp_fname_out)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_fp_fname_out file to clean up')

    return not_anomalous
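
# A condensed, standalone sketch (not part of the original module) of the core
# comparison performed above: Min-Max scale two value series and express the
# difference between their (already calculated) feature sums as a percentage,
# the same arithmetic that is compared against
# IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR.  The function name
# and arguments are hypothetical.
def _example_minmax_and_percent_different(fp_values, anomalous_values,
                                          fp_features_sum, anomalous_features_sum):
    import numpy as np
    fp_np = np.asarray(fp_values, dtype=float)
    anomalous_np = np.asarray(anomalous_values, dtype=float)
    # Min-Max scale each series into the 0.0 to 1.0 range
    fp_scaled = (fp_np - fp_np.min()) / (fp_np.max() - fp_np.min())
    anomalous_scaled = (anomalous_np - anomalous_np.min()) / (
        anomalous_np.max() - anomalous_np.min())
    # Percentage difference between the two feature sums, expressed as a
    # positive number as is done above when percent_different < 0
    sums_array = np.array([fp_features_sum, anomalous_features_sum], dtype=float)
    percent_different = abs((np.diff(sums_array) / sums_array[:-1] * 100.)[0])
    return fp_scaled, anomalous_scaled, percent_different
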
def minmax_scale_check(
        fp_id_metric_ts, anomalous_timeseries, range_tolerance,
        range_tolerance_percentage, fp_id, base_name, metric_timestamp,
        features_percentage_diff):

    # @modified 20191115 - Branch #3262: py3
    # not_anomalous = False

    # Initialise the result and similarity flags and counts so they are always
    # defined before they are evaluated or returned
    minmax_not_anomalous = False
    lower_range_similar = False
    upper_range_similar = False
    range_similar = False
    minmax_fp_features_count = 0
    minmax_anomalous_features_count = 0

    try:
        minmax_fp_values = [x[1] for x in fp_id_metric_ts]
        min_fp_value = min(minmax_fp_values)
        max_fp_value = max(minmax_fp_values)
    except:
        min_fp_value = False
        max_fp_value = False
    try:
        minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
        min_anomalous_value = min(minmax_anomalous_values)
        max_anomalous_value = max(minmax_anomalous_values)
    except:
        min_anomalous_value = False
        max_anomalous_value = False

    lower_range_not_same = True
    try:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_not_same = False
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
    except:
        lower_range_not_same = True

    if min_fp_value and min_anomalous_value and lower_range_not_same:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(min_fp_value), str(min_anomalous_value)))
        else:
            lower_min_fp_value = int(min_fp_value - (min_fp_value * range_tolerance))
            upper_min_fp_value = int(min_fp_value + (min_fp_value * range_tolerance))
            if int(min_anomalous_value) in range(lower_min_fp_value, upper_min_fp_value):
                lower_range_similar = True
                logger.info(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                        str(min_fp_value), str(min_anomalous_value),
                        str(range_tolerance_percentage)))
    if not lower_range_similar:
        logger.info(
            'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                str(min_fp_value), str(min_anomalous_value)))

    upper_range_not_same = True
    try:
        if int(max_fp_value) == int(max_anomalous_value):
            upper_range_not_same = False
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same' % (
                    str(max_fp_value), str(max_anomalous_value)))
    except:
        upper_range_not_same = True

    if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
        # @added 20180717 - Task #2446: Optimize Ionosphere
        #                   Feature #2404: Ionosphere - fluid approximation
        # On low values such as 1 and 2, the range_tolerance should be
        # adjusted to account for the very small range. TODO
        lower_max_fp_value = int(max_fp_value - (max_fp_value * range_tolerance))
        upper_max_fp_value = int(max_fp_value + (max_fp_value * range_tolerance))
        if int(max_anomalous_value) in range(lower_max_fp_value, upper_max_fp_value):
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other' % (
                    str(max_fp_value), str(max_anomalous_value),
                    str(range_tolerance_percentage)))
        else:
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar' % (
                    str(max_fp_value), str(max_anomalous_value)))

    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        logger.info(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ '
            'significantly, Min-Max scaling will be skipped')

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: creating minmax_fp_ts from minmax scaled fp_id_metric_ts')
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            logger.info(
                'minmax_fp_ts list populated with the minmax scaled time series '
                'with %s data points' % str(len(minmax_fp_ts)))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: could not minmax scale fp id %s time series for %s' % (
                    str(fp_id), str(base_name)))
    if not minmax_fp_ts:
        logger.error('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    anomalous_ts_values_count = len(anomalous_timeseries)
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(-14, 14):
            try:
                minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: could not minmax scale current time series '
                    'anomalous_timeseries for %s - %s' % (
                        str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                logger.info(
                    'minmax_anomalous_ts is populated with %s data points' % str(
                        len(minmax_anomalous_ts)))
            else:
                logger.error('error :: minmax_anomalous_ts is not populated')
        else:
            logger.info(
                'minmax scaled check will be skipped - anomalous_ts_values_count '
                'is %s and minmax_fp_ts is %s' % (
                    str(anomalous_ts_values_count), str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'

    tsf_settings = ReasonableFeatureExtractionSettings()
    tsf_settings.disable_progressbar = True

    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    if minmax_anomalous_ts and minmax_fp_ts:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: analyzing minmax_fp_ts and minmax_anomalous_ts')
        if not os.path.isfile(minmax_fp_ts_csv):
            if LOCAL_DEBUG:
                logger.debug('debug :: creating %s from minmax_fp_ts' % minmax_fp_ts_csv)
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            if LOCAL_DEBUG:
                if len(converted) > 0:
                    logger.debug('debug :: converted is populated')
                else:
                    logger.debug('debug :: error :: converted is not populated')
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s' % (
                        str(minmax_fp_ts_csv)))
        else:
            logger.info('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            logger.error('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            logger.info(
                'file exists to create the minmax_fp_ts data frame from - %s' % minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv, delimiter=',', header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create data frame from %s' % (
                str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                df, column_id='metric', column_sort='timestamp',
                column_kind=None, column_value=None,
                feature_extraction_settings=tsf_settings)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_features from %s' % (
                str(minmax_fp_ts_csv)))

        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)
        else:
            if LOCAL_DEBUG:
                logger.debug('debug :: file exists - %s' % minmax_fp_fname_out)

        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out, delimiter=',', header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            logger.info(
                'minmax_fp_ts - features_count: %s, features_sum: %s' % (
                    str(minmax_fp_features_count), str(minmax_fp_features_sum)))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_sum from %s' % (
                str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            if LOCAL_DEBUG:
                logger.debug(
                    'debug :: minmax_fp_features_count of the minmax_fp_ts is %s' % str(
                        minmax_fp_features_count))
        else:
            logger.error('error :: minmax_fp_features_count is %s' % str(
                minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)

        df = pd.read_csv(anomalous_ts_csv, delimiter=',', header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            df, column_id='metric', column_sort='timestamp',
            column_kind=None, column_value=None,
            feature_extraction_settings=tsf_settings)

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)

        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out, delimiter=',', header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        logger.info(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, '
            'minmax_anomalous_features_sum: %s' % (
                str(minmax_anomalous_features_count),
                str(minmax_anomalous_features_sum)))

    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
            sums_array = np.array(
                [minmax_fp_features_sum, minmax_anomalous_features_sum],
                dtype=float)
            calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            logger.info(
                'percent_different between minmax scaled features sums - %s' % str(
                    percent_different))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: failed to calculate percent_different from minmax '
                'scaled features sums')

        if percent_different:
            almost_equal = None
            try:
                np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                logger.info(
                    'minmax scaled common features sums are almost equal, not anomalous')

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            if percent_different < float(features_percentage_diff):
                minmax_not_anomalous = True
                # log
                logger.info(
                    'not anomalous - minmax scaled features profile match - %s - %s' % (
                        base_name, str(minmax_not_anomalous)))
                logger.info(
                    'minmax scaled calculated features sum are within %s percent of '
                    'fp_id %s with %s, not anomalous' % (
                        str(features_percentage_diff), str(fp_id),
                        str(percent_different)))

    # @modified 20191115 - Branch #3262: py3
    # if minmax_not_anomalous:
    #     not_anomalous = True
    #     minmax = 1

    # Created time series resources for graphing in
    # the matched page
    return (minmax_not_anomalous, minmax_fp_features_sum,
            minmax_fp_features_count, minmax_anomalous_features_sum,
            minmax_anomalous_features_count)
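
# A hypothetical usage sketch (assumption, not part of the original module):
# both time series are lists of [timestamp, value] pairs as used above, the
# fp_id, metric name, timestamp and features_percentage_diff values are
# illustrative only.
def _example_minmax_scale_check(fp_id_metric_ts, anomalous_timeseries):
    range_tolerance = 0.15
    (minmax_not_anomalous, minmax_fp_features_sum, minmax_fp_features_count,
     minmax_anomalous_features_sum, minmax_anomalous_features_count) = \
        minmax_scale_check(
            fp_id_metric_ts, anomalous_timeseries, range_tolerance,
            range_tolerance * 100, 1, 'stats.example.metric', '1480104000', 1.0)
    return minmax_not_anomalous
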
def create_test_features_profile(json_file):

    filename = os.path.basename(json_file)
    metric = filename.replace('.mirage.redis.24h.json', '')
    metric_data_dir = os.path.dirname(json_file)

    anomaly_json = json_file
    ts_csv = '%s.test.echo.tsfresh.input.csv' % (json_file)
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'
    if os.path.isfile(t_fname_out):
        return t_fname_out

    start = timer()
    with open(anomaly_json, 'r') as f:
        raw_timeseries = f.read()

    # Convert the timeseries to csv
    try:
        timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
        del raw_timeseries
        timeseries = literal_eval(timeseries_array_str)
        del timeseries_array_str
    except:
        print('error :: could not literal_eval %s' % anomaly_json)
        print(traceback.format_exc())
        return False

    datapoints = timeseries
    del timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    for ts, value in converted:
        # print('%s,%s' % (str(int(ts)), str(value)))
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)
    del converted

    df = pd.read_csv(ts_csv, delimiter=',', header=None,
                     names=['metric', 'timestamp', 'value'])
    # print('DataFrame created with %s' % ts_csv)
    df.columns = ['metric', 'timestamp', 'value']

    tsf_settings = ReasonableFeatureExtractionSettings()
    # Disable tqdm progress bar
    tsf_settings.disable_progressbar = True
    df_features = extract_features(
        df, column_id='metric', column_sort='timestamp', column_kind=None,
        column_value=None, feature_extraction_settings=tsf_settings)
    del df
    # print('features extracted from %s data' % ts_csv)

    # write to disk
    fname_out = fname_in + '.features.csv'

    # Transpose
    df_t = df_features.transpose()
    # print('features transposed')

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    df_t.to_csv(t_fname_out)
    del df_t

    # Calculate the count and sum of the features values
    df_sum = pd.read_csv(t_fname_out, delimiter=',', header=0,
                         names=['feature_name', 'value'])
    df_sum.columns = ['feature_name', 'value']
    df_sum['feature_name'] = df_sum['feature_name'].astype(str)
    df_sum['value'] = df_sum['value'].astype(float)
    features_count = len(df_sum['value'])
    features_sum = df_sum['value'].sum()
    del df_sum
    # print('features saved to %s' % (fname_out))
    # print('transposed features saved to %s' % (t_fname_out))
    return t_fname_out
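
# A hypothetical usage sketch (assumption, not part of the original module):
# the path below is illustrative only, the '<metric>.mirage.redis.24h.json'
# filename convention is taken from the function above.
def _example_create_test_features_profile():
    json_file = '/opt/skyline/ionosphere/data/1480104000/stats.example.metric.mirage.redis.24h.json'
    t_fname_out = create_test_features_profile(json_file)
    if t_fname_out:
        print('transposed features csv created - %s' % t_fname_out)
    return t_fname_out
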