Code example #1
    def extract(self, use_features=None):
        # Avoid the mutable default argument pitfall; default to a new empty
        # list on each call
        if use_features is None:
            use_features = []
        x = self.__x_data_frame()
        y = self.__y_series()

        settings = ReasonableFeatureExtractionSettings()
        extracted_features = extract_features(
            x, column_id='id', feature_extraction_settings=settings)
        if len(use_features) == 0:
            impute(extracted_features)
            features_filtered = select_features(extracted_features, y)
            use_features = features_filtered.keys()
        else:
            features_filtered = extracted_features[use_features]

        keys = features_filtered.keys()
        timeseries = []
        for index, row in features_filtered.iterrows():
            values = []
            for key in keys:
                if key == 'id':
                    continue

                value = row[key]
                values.append(value)

            timeseries.append(Timeseries([values]))

        return timeseries, use_features
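
The snippets in this section rely on a number of imports that are not shown. The following is a minimal sketch of the likely common imports, assuming an older tsfresh release (circa 0.4.x) that still provides ReasonableFeatureExtractionSettings; the exact module paths vary between tsfresh versions. Skyline-specific names used below (settings, write_data_to_file, Timeseries, tsfresh_version, TSFRESH_VERSION, logger, LOCAL_DEBUG and the IONOSPHERE_ECHO_* constant) come from the surrounding Skyline modules and are not reproduced here.

# Assumed common imports for these snippets (tsfresh ~0.4.x; paths may vary by version)
import os
import time
import logging
import traceback
from ast import literal_eval
from timeit import default_timer as timer

import numpy as np
import pandas as pd

from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ReasonableFeatureExtractionSettings
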
Code example #2
def calculate_features_profile(current_skyline_app, timestamp, metric,
                               context):
    """
    Calculates a tsfresh features profile from a training data set

    :param timestamp: the timestamp of metric anomaly with training data
    :type timestamp: str
    :param metric: the base_name of the metric
    :type metric: str
    :return: (features_profile_csv_file_path, successful, fail_msg, traceback_format_exc, calc_time)
    :rtype: int
    :rtype: (str, boolean, str, str, str)
    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)
    if context == 'training_data':
        log_context = 'training data'
    elif context == 'features_profiles':
        log_context = 'features profile data'
    elif context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    elif context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'
    else:
        # Fall back to the raw context so log_context is always defined
        log_context = str(context)

    current_logger.info('%s feature profile creation requested for %s at %s' %
                        (log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)
    elif context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_PROFILES_FOLDER,
                                        timeseries_dir, timestamp)
    # @added 20170113 - Feature #1854: Ionosphere learn
    elif context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_LEARN_FOLDER,
                                        timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # Added the metric check file; ts_full_duration needs to be determined and
    # added to the features_profile_details_file, as it was not added here on
    # 20170104 when it was added to the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
            for i, line in enumerate(anomaly_details):
                if 'full_duration' in line:
                    # The check file line appears to be of the form
                    # full_duration = '<seconds>'; the repr of the split list
                    # is literal_eval-ed so the quoted value sits at index 1
                    _ts_full_duration = '%s' % str(line).split("'", 2)
                    full_duration_array = literal_eval(_ts_full_duration)
                    ts_full_duration = int(full_duration_array[1])

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)
    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    #    anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
    #    ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'
    # This is simply to stay in line with tsfresh naming conventions in their
    # docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        current_logger.info('features profile details file exists - %s' %
                            (features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read from %s' %
                                 (features_profile_details_file))
    else:
        current_logger.info('OK no features profile details file exists - %s' %
                            (features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        current_logger.info('features profile created file exists - %s' %
                            (features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read fp_id from %s' %
                                 (features_profile_created_file))
    else:
        current_logger.info('OK no features profile created file exists - %s' %
                            (features_profile_created_file))

    if os.path.isfile(t_fname_out):
        current_logger.info('transposed features already exist - %s' %
                            (t_fname_out))
        return str(
            t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()
    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error(
                'error: failed to read timeseries data from %s' %
                (anomaly_json))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(',
                                                       '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    # Append all datapoints with a single file handle rather than reopening
    # the csv for every line
    with open(ts_csv, 'a') as fh:
        for ts, value in converted:
            # print('%s,%s' % (str(int(ts)), str(value)))
            utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
            fh.write(utc_ts_line)

    try:
        df = pd.read_csv(ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        current_logger.info('DataFrame created with %s' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error: failed to create a pandas DataFrame with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # @added 20161207 - Task #1658: Patterning Skyline Ionosphere
    # Converting the DataFrame types to suit MySQL data types.
    # For anyone reviewing Skyline's code: a number of questions arise from the
    # decision to deviate from json, or from storing msgpack as a BLOB, etc.
    # tsfresh uses csv, csv can be sourced from Graphite and the like, and
    # Skyline should be able to handle csv.  As for how the data is stored in
    # MySQL, this was given considerable review and thought, given that
    # Ionosphere and Skyline in general should not be limited to the domain of
    # analyzing Graphite machine metrics, but should handle other timeseries
    # data sources too.
    # df['feature_name'] = df['feature_name'].astype(string)
    # df['value'] = df['value'].astype(float)

    # Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: failed to read the pandas DataFrame created with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    current_logger.info('starting extract_features with %s' %
                        str(TSFRESH_VERSION))
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that have little
        # to no variation in the values, which results in features taking up to
        # almost 600 seconds to calculate on a timeseries of length 10075
        # (168h - 1 datapoint per 60s)
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(df, column_id='metric', column_sort='timestamp', column_kind=None, column_value=None)
        tsf_settings = ReasonableFeatureExtractionSettings()
        # Disable tqdm progress bar
        tsf_settings.disable_progressbar = True
        df_features = extract_features(
            df,
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            feature_extraction_settings=tsf_settings)
        current_logger.info('features extracted from %s data' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: extracting features with tsfresh from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        current_logger.info(
            'feature extraction failed in %.6f seconds' %
            (end_feature_extraction - start_feature_extraction))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    current_logger.info('feature extraction took %.6f seconds' %
                        (feature_extraction_time))

    # write to disk
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    try:
        df_t = df_features.transpose()
        current_logger.info('features transposed')
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error :: transposing tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: saving transposed tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Calculate the count and sum of the features values
    try:
        df_sum = pd.read_csv(t_fname_out,
                             delimiter=',',
                             header=0,
                             names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        current_logger.error('error :: failed to create Dataframe to sum')
    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error(
            'error :: failed to count number of features, set to 0')
        features_count = 0
    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to sum feature values, set to 0')
        features_sum = 0

    end = timer()

    # Note: only the transposed features csv is written to disk; the
    # df_features.to_csv(fname_out) call above is commented out
    current_logger.info('transposed features saved to %s' % (t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('total feature profile completed in %s seconds' %
                        str(total_calc_time))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on the 20170104
        # when it was added the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(current_skyline_app, features_profile_details_file,
                           'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: failed to write %s' % features_profile_details_file
        current_logger.error('%s' % fail_msg)

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        current_logger.info('removed the created csv - %s' % ts_csv)

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis works sets, but this was moved to
    # ionosphere_backend.py and learn.py not done here

    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(
        calc_time)
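
Code example #2 builds a tsfresh features profile csv for a metric's training data and returns a 7-tuple of results. A hedged usage sketch follows; the app name, timestamp and metric are hypothetical example values taken from the commented example paths above.

# Hedged usage sketch; the app name, timestamp and metric are example values only
(profile_csv, successful, fp_created, fp_id, fail_msg, trace,
 calc_time) = calculate_features_profile(
    'webapp', '1480104000',
    'stats.statsd.graphiteStats.calculationtime', 'training_data')
if successful:
    print('features profile csv: %s (calculated in %s seconds)' % (profile_csv, calc_time))
else:
    print('failed: %s' % fail_msg)
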
Code example #3
def calculate_features_other_minmax(use_file, i_json_file, metric):

    fp_id = 'testing.feature2484'
    base_name = metric
    metric_timestamp = 'none'

    not_anomalous = False
    minmax_not_anomalous = False
    minmax = 0
    minmax_check = True

    with open(use_file, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(',
                                                       '[').replace(')', ']')
    del raw_timeseries
    anomalous_timeseries = literal_eval(timeseries_array_str)
    anomalous_ts_values_count = len(anomalous_timeseries)

    with open(i_json_file, 'r') as f:
        fp_raw_timeseries = f.read()
    # Convert the timeseries to csv
    fp_timeseries_array_str = str(fp_raw_timeseries).replace('(', '[').replace(
        ')', ']')
    del fp_raw_timeseries
    fp_id_metric_ts = literal_eval(fp_timeseries_array_str)
    fp_id_metric_ts_values_count = len(fp_id_metric_ts)

    try:
        range_tolerance = settings.IONOSPHERE_MINMAX_SCALING_RANGE_TOLERANCE
    except:
        range_tolerance = 0.15
    range_tolerance_percentage = range_tolerance * 100
    check_range = False
    range_similar = False
    if fp_id_metric_ts:
        if anomalous_ts_values_count > 0:
            check_range = True
    lower_range_similar = False
    upper_range_similar = False

    min_fp_value = None
    min_anomalous_value = None
    max_fp_value = None
    max_anomalous_value = None

    if check_range:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            min_fp_value = min(minmax_fp_values)
            max_fp_value = max(minmax_fp_values)
        except:
            min_fp_value = False
            max_fp_value = False
        try:
            minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
            min_anomalous_value = min(minmax_anomalous_values)
            max_anomalous_value = max(minmax_anomalous_values)
        except:
            min_anomalous_value = False
            max_anomalous_value = False
        lower_range_not_same = True
        try:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_not_same = False
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(min_fp_value), str(min_anomalous_value)))
        except:
            lower_range_not_same = True
        if min_fp_value and min_anomalous_value and lower_range_not_same:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(min_fp_value), str(min_anomalous_value)))
            else:
                lower_min_fp_value = int(min_fp_value -
                                         (min_fp_value * range_tolerance))
                upper_min_fp_value = int(min_fp_value +
                                         (min_fp_value * range_tolerance))
                if int(min_anomalous_value) in range(lower_min_fp_value,
                                                     upper_min_fp_value):
                    lower_range_similar = True
                    print(
                        'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                        % (str(min_fp_value), str(min_anomalous_value),
                           str(range_tolerance_percentage)))
        if not lower_range_similar:
            print(
                'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                % (str(min_fp_value), str(min_anomalous_value)))
        upper_range_not_same = True
        try:
            if int(max_fp_value) == int(max_anomalous_value):
                upper_range_not_same = False
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(max_fp_value), str(max_anomalous_value)))
        except:
            upper_range_not_same = True
        if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
            # @added 20180717 - Task #2446: Optimize Ionosphere
            #                   Feature #2404: Ionosphere - fluid approximation
            # On low values such as 1 and 2, the range_tolerance
            # should be adjusted to account for the very small
            # range. TODO
            lower_max_fp_value = int(max_fp_value -
                                     (max_fp_value * range_tolerance))
            upper_max_fp_value = int(max_fp_value +
                                     (max_fp_value * range_tolerance))
            if int(max_anomalous_value) in range(lower_max_fp_value,
                                                 upper_max_fp_value):
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                    % (str(max_fp_value), str(max_anomalous_value),
                       str(range_tolerance_percentage)))
            else:
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                    % (str(max_fp_value), str(max_anomalous_value)))
    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        print(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly - Min-Max scaling will be skipped'
        )

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            print(
                'minmax_fp_ts list populated with the minmax scaled time series with %s data points'
                % str(len(minmax_fp_ts)))
            del minmax_fp_values
        except:
            print(
                'error :: could not minmax scale fp id %s time series for %s' %
                (str(fp_id), str(base_name)))
        if not minmax_fp_ts:
            print('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
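        # Note: x in range(-14, 14) is True for differences of -14 up to 13,
        # so the two series may differ in length by roughly 14 datapoints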
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(
                -14, 14):
            try:
                minmax_anomalous_values = [
                    x2[1] for x2 in anomalous_timeseries
                ]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
                del anomalous_timeseries
                del minmax_anomalous_values
            except:
                print(
                    'error :: could not minmax scale current time series anomalous_timeseries for fp id %s - %s'
                    % (str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                print('minmax_anomalous_ts is populated with %s data points' %
                      str(len(minmax_anomalous_ts)))
            else:
                print('error :: minmax_anomalous_ts is not populated')
        else:
            print(
                'minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s'
                % (str(anomalous_ts_values_count),
                   str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    if os.path.isfile(minmax_fp_ts_csv):
        os.remove(minmax_fp_ts_csv)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    if os.path.isfile(minmax_fp_fname_out):
        os.remove(minmax_fp_fname_out)
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    if os.path.isfile(anomalous_ts_csv):
        os.remove(anomalous_ts_csv)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'
    if os.path.isfile(anomalous_fp_fname_out):
        os.remove(anomalous_fp_fname_out)

    tsf_settings = ReasonableFeatureExtractionSettings()
    tsf_settings.disable_progressbar = True
    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    # Initialise the count so it is defined even if creating df_sum fails below
    minmax_fp_features_count = 0
    if minmax_anomalous_ts and minmax_fp_ts:
        if not os.path.isfile(minmax_fp_ts_csv):
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                        int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    print('error :: could not write to file %s' %
                          (str(minmax_fp_ts_csv)))
            del converted
        else:
            print('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            print('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            print(
                'file exists to create the minmax_fp_ts data frame from - %s' %
                minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv,
                             delimiter=',',
                             header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            print('error :: failed to create a data frame from %s' %
                  (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                df,
                column_id='metric',
                column_sort='timestamp',
                column_kind=None,
                column_value=None,
                feature_extraction_settings=tsf_settings)
        except:
            print('error :: failed to create df_features from %s' %
                  (str(minmax_fp_ts_csv)))
        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)

        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out,
                                 delimiter=',',
                                 header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            print('minmax_fp_ts - features_count: %s, features_sum: %s' %
                  (str(minmax_fp_features_count), str(minmax_fp_features_sum)))
            del df_sum
        except:
            print('error :: failed to create df_sum from %s' %
                  (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            print(
                'debug :: minmax_fp_features_count of the minmax_fp_ts is %s' %
                str(minmax_fp_features_count))
        else:
            print('error :: minmax_fp_features_count is %s' %
                  str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                    int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)
            del converted

        df = pd.read_csv(anomalous_ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            df,
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            feature_extraction_settings=tsf_settings)
        del df

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
            del df_t
            del df_features_current
        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out,
                               delimiter=',',
                               header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        print(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s'
            % (str(minmax_anomalous_features_count),
               str(minmax_anomalous_features_sum)))
        del df_sum_2
        del minmax_anomalous_ts

    percent_different = 100
    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
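            # np.diff(sums_array) yields (anomalous_sum - fp_sum); dividing by
            # sums_array[:-1] (the fp sum) expresses that difference as a
            # percentage of the features profile sum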
            sums_array = np.array(
                [minmax_fp_features_sum, minmax_anomalous_features_sum],
                dtype=float)
            calc_percent_different = np.diff(
                sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            print(
                'percent_different between minmax scaled features sums - %s' %
                str(percent_different))
        except:
            print(
                'error :: failed to calculate percent_different from minmax scaled features sums'
            )

        if percent_different:
            almost_equal = None
            try:
                np.testing.assert_array_almost_equal(fp_sum_array,
                                                     calc_sum_array)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                print(
                    'minmax scaled common features sums are almost equal, not anomalous'
                )

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            # @modified 20190321
            # if percent_different < (settings.IONOSPHERE_FEATURES_PERCENT_SIMILAR + 1):
            if percent_different < IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR:
                minmax_not_anomalous = True
                # log
                print(
                    'not anomalous - minmax scaled features profile match - %s - %s'
                    % (base_name, str(minmax_not_anomalous)))
                print(
                    'minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous'
                    % (str(
                        IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR
                    ), str(fp_id), str(percent_different)))
            if minmax_not_anomalous:
                not_anomalous = True
                minmax = 1
                # Created time series resources for graphing in
                # the matched page

    try:
        clean_file = anomalous_ts_csv
        if os.path.isfile(anomalous_ts_csv):
            os.remove(anomalous_ts_csv)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_ts_csv file to clean up')
    try:
        clean_file = anomalous_fp_fname_out
        if os.path.isfile(anomalous_fp_fname_out):
            os.remove(anomalous_fp_fname_out)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_fp_fname_out file to clean up')
    return not_anomalous
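
Code example #3 compares an anomalous timeseries against a features profile timeseries using Min-Max scaling and returns whether the anomalous series is considered not anomalous. A hedged usage sketch follows; the file paths and metric name are hypothetical example values.

# Hedged usage sketch; the paths and metric name are example values only
not_anomalous = calculate_features_other_minmax(
    '/tmp/anomalous.timeseries.json',
    '/tmp/fp.timeseries.json',
    'stats.statsd.graphiteStats.calculationtime')
print('not_anomalous: %s' % str(not_anomalous))
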
Code example #4
def minmax_scale_check(fp_id_metric_ts, anomalous_timeseries, range_tolerance,
                       range_tolerance_percentage, fp_id, base_name,
                       metric_timestamp, features_percentage_diff):

    # @modified 20191115 - Branch #3262: py3
    # not_anomalous = False

    # Initialise the flags and counts used below so they are always defined,
    # even when a similarity check or a feature extraction step fails
    lower_range_similar = False
    upper_range_similar = False
    range_similar = False
    minmax_not_anomalous = False
    minmax_fp_features_count = 0
    minmax_anomalous_features_count = 0

    try:
        minmax_fp_values = [x[1] for x in fp_id_metric_ts]
        min_fp_value = min(minmax_fp_values)
        max_fp_value = max(minmax_fp_values)
    except:
        min_fp_value = False
        max_fp_value = False
    try:
        minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
        min_anomalous_value = min(minmax_anomalous_values)
        max_anomalous_value = max(minmax_anomalous_values)
    except:
        min_anomalous_value = False
        max_anomalous_value = False
    lower_range_not_same = True
    try:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_not_same = False
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(min_fp_value), str(min_anomalous_value)))
    except:
        lower_range_not_same = True
    if min_fp_value and min_anomalous_value and lower_range_not_same:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(min_fp_value), str(min_anomalous_value)))
        else:
            lower_min_fp_value = int(min_fp_value -
                                     (min_fp_value * range_tolerance))
            upper_min_fp_value = int(min_fp_value +
                                     (min_fp_value * range_tolerance))
            if int(min_anomalous_value) in range(lower_min_fp_value,
                                                 upper_min_fp_value):
                lower_range_similar = True
                logger.info(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                    % (str(min_fp_value), str(min_anomalous_value),
                       str(range_tolerance_percentage)))
    if not lower_range_similar:
        logger.info(
            'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
            % (str(min_fp_value), str(min_anomalous_value)))
    upper_range_not_same = True
    try:
        if int(max_fp_value) == int(max_anomalous_value):
            upper_range_not_same = False
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(max_fp_value), str(max_anomalous_value)))
    except:
        upper_range_not_same = True
    if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
        # @added 20180717 - Task #2446: Optimize Ionosphere
        #                   Feature #2404: Ionosphere - fluid approximation
        # On low values such as 1 and 2, the range_tolerance
        # should be adjusted to account for the very small
        # range. TODO
        lower_max_fp_value = int(max_fp_value -
                                 (max_fp_value * range_tolerance))
        upper_max_fp_value = int(max_fp_value +
                                 (max_fp_value * range_tolerance))
        if int(max_anomalous_value) in range(lower_max_fp_value,
                                             upper_max_fp_value):
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                % (str(max_fp_value), str(max_anomalous_value),
                   str(range_tolerance_percentage)))
        else:
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                % (str(max_fp_value), str(max_anomalous_value)))

    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        logger.info(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly - Min-Max scaling will be skipped'
        )

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: creating minmax_fp_ts from minmax scaled fp_id_metric_ts'
            )
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            logger.info(
                'minmax_fp_ts list populated with the minmax scaled time series with %s data points'
                % str(len(minmax_fp_ts)))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: could not minmax scale fp id %s time series for %s' %
                (str(fp_id), str(base_name)))
        if not minmax_fp_ts:
            logger.error('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    anomalous_ts_values_count = len(anomalous_timeseries)
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(
                -14, 14):
            try:
                minmax_anomalous_values = [
                    x2[1] for x2 in anomalous_timeseries
                ]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: could not minmax scale current time series anomalous_timeseries for fp id %s - %s'
                    % (str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                logger.info(
                    'minmax_anomalous_ts is populated with %s data points' %
                    str(len(minmax_anomalous_ts)))
            else:
                logger.error('error :: minmax_anomalous_ts is not populated')
        else:
            logger.info(
                'minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s'
                % (str(anomalous_ts_values_count),
                   str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'

    tsf_settings = ReasonableFeatureExtractionSettings()
    tsf_settings.disable_progressbar = True
    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    if minmax_anomalous_ts and minmax_fp_ts:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: analyzing minmax_fp_ts and minmax_anomalous_ts')
        if not os.path.isfile(minmax_fp_ts_csv):
            if LOCAL_DEBUG:
                logger.debug('debug :: creating %s from minmax_fp_ts' %
                             minmax_fp_ts_csv)
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            if LOCAL_DEBUG:
                if len(converted) > 0:
                    logger.debug('debug :: converted is populated')
                else:
                    logger.debug(
                        'debug :: error :: converted is not populated')
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                        int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s' %
                                 (str(minmax_fp_ts_csv)))
        else:
            logger.info('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            logger.error('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            logger.info(
                'file exists to create the minmax_fp_ts data frame from - %s' %
                minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv,
                             delimiter=',',
                             header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create a data frame from %s' %
                         (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                df,
                column_id='metric',
                column_sort='timestamp',
                column_kind=None,
                column_value=None,
                feature_extraction_settings=tsf_settings)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_features from %s' %
                         (str(minmax_fp_ts_csv)))
        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)
        else:
            if LOCAL_DEBUG:
                logger.debug('debug :: file exists - %s' % minmax_fp_fname_out)
        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out,
                                 delimiter=',',
                                 header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            logger.info(
                'minmax_fp_ts - features_count: %s, features_sum: %s' %
                (str(minmax_fp_features_count), str(minmax_fp_features_sum)))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_sum from %s' %
                         (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            if LOCAL_DEBUG:
                logger.debug(
                    'debug :: minmax_fp_features_count of the minmax_fp_ts is %s'
                    % str(minmax_fp_features_count))
        else:
            logger.error('error :: minmax_fp_features_count is %s' %
                         str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                    int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)

        df = pd.read_csv(anomalous_ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            df,
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            feature_extraction_settings=tsf_settings)

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out,
                               delimiter=',',
                               header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        logger.info(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s'
            % (str(minmax_anomalous_features_count),
               str(minmax_anomalous_features_sum)))

    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        try:
            fp_sum_array = [minmax_fp_features_sum]
            calc_sum_array = [minmax_anomalous_features_sum]
            percent_different = 100
            sums_array = np.array(
                [minmax_fp_features_sum, minmax_anomalous_features_sum],
                dtype=float)
            calc_percent_different = np.diff(
                sums_array) / sums_array[:-1] * 100.
            percent_different = calc_percent_different[0]
            logger.info(
                'percent_different between minmax scaled features sums - %s' %
                str(percent_different))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: failed to calculate percent_different from minmax scaled features sums'
            )

        if percent_different:
            almost_equal = None
            try:
                np.testing.assert_array_almost_equal(fp_sum_array,
                                                     calc_sum_array)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                logger.info(
                    'minmax scaled common features sums are almost equal, not anomalous'
                )

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            if percent_different < float(features_percentage_diff):
                minmax_not_anomalous = True
                # log
                logger.info(
                    'not anomalous - minmax scaled features profile match - %s - %s'
                    % (base_name, str(minmax_not_anomalous)))
                logger.info(
                    'minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous'
                    % (str(features_percentage_diff), str(fp_id),
                       str(percent_different)))

            # @modified 20191115 - Branch #3262: py3
            # if minmax_not_anomalous:
            #     not_anomalous = True
            #     minmax = 1

            # Created time series resources for graphing in
            # the matched page

    return (minmax_not_anomalous, minmax_fp_features_sum,
            minmax_fp_features_count, minmax_anomalous_features_sum,
            minmax_anomalous_features_count)
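
Code example #4 is the Min-Max scaled comparison refactored into a function that returns the match flag and the feature sums and counts. A hedged usage sketch follows; the two timeseries are lists of [timestamp, value] pairs and the tolerance and percent-similar values are hypothetical examples.

# Hedged usage sketch; the timeseries variables and numeric values are examples
(minmax_not_anomalous, fp_features_sum, fp_features_count,
 anomalous_features_sum, anomalous_features_count) = minmax_scale_check(
    fp_timeseries, anomalous_timeseries, 0.15, 15,
    'testing.feature2484', 'stats.statsd.graphiteStats.calculationtime',
    'none', 1)
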
Code example #5
def create_test_features_profile(json_file):
    filename = os.path.basename(json_file)
    metric = filename.replace('.mirage.redis.24h.json', '')
    metric_data_dir = os.path.dirname(json_file)
    anomaly_json = json_file
    ts_csv = '%s.test.echo.tsfresh.input.csv' % (json_file)
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'
    if os.path.isfile(t_fname_out):
        return t_fname_out
    start = timer()
    with open(anomaly_json, 'r') as f:
        raw_timeseries = f.read()
    # Convert the timeseries to csv
    try:
        timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(
            ')', ']')
        del raw_timeseries
        timeseries = literal_eval(timeseries_array_str)
        del timeseries_array_str
    except:
        print('error :: could not literal_eval %s' % anomaly_json)
        print(traceback.format_exc())
        return False
    datapoints = timeseries
    del timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    # Append all datapoints with a single file handle rather than reopening
    # the csv for every line
    with open(ts_csv, 'a') as fh:
        for ts, value in converted:
            # print('%s,%s' % (str(int(ts)), str(value)))
            utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
            fh.write(utc_ts_line)
    del converted

    df = pd.read_csv(ts_csv,
                     delimiter=',',
                     header=None,
                     names=['metric', 'timestamp', 'value'])
    #    print('DataFrame created with %s' % ts_csv)
    df.columns = ['metric', 'timestamp', 'value']
    tsf_settings = ReasonableFeatureExtractionSettings()
    # Disable tqdm progress bar
    tsf_settings.disable_progressbar = True
    df_features = extract_features(df,
                                   column_id='metric',
                                   column_sort='timestamp',
                                   column_kind=None,
                                   column_value=None,
                                   feature_extraction_settings=tsf_settings)
    del df
    #    print('features extracted from %s data' % ts_csv)
    # write to disk
    fname_out = fname_in + '.features.csv'
    # Transpose
    df_t = df_features.transpose()
    #    print('features transposed')
    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    df_t.to_csv(t_fname_out)
    del df_t
    # Calculate the count and sum of the features values
    df_sum = pd.read_csv(t_fname_out,
                         delimiter=',',
                         header=0,
                         names=['feature_name', 'value'])
    df_sum.columns = ['feature_name', 'value']
    df_sum['feature_name'] = df_sum['feature_name'].astype(str)
    df_sum['value'] = df_sum['value'].astype(float)

    features_count = len(df_sum['value'])
    features_sum = df_sum['value'].sum()
    del df_sum
    #    print('features saved to %s' % (fname_out))
    #    print('transposed features saved to %s' % (t_fname_out))
    return t_fname_out
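
Code example #5 creates a transposed tsfresh features csv for a test timeseries json file and returns its path, or the existing path if it has already been calculated. A hedged usage sketch follows; the json file path is a hypothetical example.

# Hedged usage sketch; the json path is an example value only
t_fname_out = create_test_features_profile(
    '/opt/skyline/ionosphere/data/1480104000/stats.statsd.graphiteStats.calculationtime.mirage.redis.24h.json')
if t_fname_out:
    print('transposed features csv: %s' % t_fname_out)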