Example #1
import numpy as np


def stability(filtered_data, features_list, iteration):
    """
    Repeatedly resamples the data and calculates the average feature-set size
    and stability score for each anomaly.
    :param filtered_data: list of per-anomaly data frames
    :param features_list: list of feature names
    :param iteration: number of resampling iterations
    :return: stability matrix and the list of selected feature sets
    """
    feature_list_result = []
    for i in range(iteration):
        # resample the data and recompute the entropies on the new sample
        new_data = random_data(filtered_data)
        index_data = calculate_class_entropy(new_data, "stability")
        new_data = select_segment(new_data, index_data)
        data_segment_entropy = calculate_segment_entropy(new_data)
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        # for each anomaly, prune monotonic and correlated features and
        # record the surviving feature set
        for j in range(len(distance)):
            correlated_feature_index = remove_monotonic_feature(filtered_data[j], features_list)
            Exstream_feature, Exstream_data = drop_features(distance[j, :], filtered_data[j], features_list,
                                                            correlated_feature_index)
            if len(Exstream_feature) == 1:
                feature_list_result.append(Exstream_data.columns[:-1].values)
            else:
                Exstream_cluster = remove_correlated_features(Exstream_data, Exstream_feature, features_list,
                                                              distance[j, :])
                feature_list_result.append(Exstream_cluster.columns[:-1].values)

    # aggregate the per-iteration feature sets into an average feature-set
    # size and a stability score for each anomaly
    stability_matrix = np.zeros((2, len(distance)))
    feature_sets = np.array(feature_list_result, dtype=object)
    for i in range(len(distance)):
        # results for anomaly i sit at every len(distance)-th position
        index = np.array(range(i, len(feature_sets), len(distance)))
        temp = feature_sets[index]
        avg_size, stability_score = stats(temp)
        stability_matrix[:, i] = avg_size, stability_score

    return stability_matrix, feature_list_result
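
The aggregation above relies on `feature_list_result` being filled anomaly-major within each iteration, so the sets for anomaly `i` sit at every `len(distance)`-th position. A minimal self-contained sketch of that strided grouping, using made-up feature sets:

import numpy as np

n_anomalies = 2
# feature sets recorded over 3 iterations, anomaly-major within each iteration
results = np.array([['a'], ['b', 'c'], ['a'], ['b'], ['a', 'd'], ['b']],
                   dtype=object)
for i in range(n_anomalies):
    index = np.arange(i, len(results), n_anomalies)
    print(i, results[index])  # every feature set recorded for anomaly i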
Example #2
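    # `os` and `pandas` (as `pd`) are assumed to be imported, and
    # `file_clean_list`, the `path_*` directories, and the `Exstream_list` /
    # `Exstream_cluster_list` accumulators are assumed to be defined earlier
    # in the script this snippet was taken from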
    for file_clean in file_clean_list:
        file_truth = file_clean.replace('clean', 'truth')
        file_segment = file_clean.replace('clean', 'aggregated')
        data = pd.read_csv(os.path.join(path_clean, file_clean))

        # read index data
        index_data = pd.read_csv(os.path.join(path_truth, file_truth))
        index_data_mapped = mapping(index_data)
        index_data_class_entropy = calculate_class_entropy(index_data_mapped)
        filtered_data = select_segment(data, index_data_class_entropy)
        aggregated_data = combine_data(filtered_data)
        index_data = calculate_class_entropy(aggregated_data, "aggregate")
        # load precomputed segment entropies; alternatively, recompute them:
        # data_segment_entropy = calculate_segment_entropy(aggregated_data, "aggregate")
        data_segment_entropy = pd.read_csv(
            os.path.join(path_segment, file_segment))
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        features_list = data_segment_entropy.columns
        correlated_feature_index = remove_monotonic_feature(
            aggregated_data, features_list)
        Exstream_feature, Exstream_data = drop_features(
            distance[0], aggregated_data, features_list,
            correlated_feature_index)
        Exstream_cluster = remove_correlated_features(Exstream_data,
                                                      Exstream_feature,
                                                      features_list,
                                                      distance[0])
        Exstream_list.append(len(Exstream_feature))
        Exstream_cluster_list.append(len(Exstream_cluster.columns) - 1)
        # print(Exstream_cluster.columns)

    # Set up plot data
    ## map index data and calculate class entropy
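    ## note: `data` and `index_data` here retain their values from the last
    ## file processed in the loop above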
    index_data_mapped = mapping(index_data)
    index_data_class_entropy = calculate_class_entropy(index_data_mapped)

    ## calculate segment entropy
    filtered_data = select_segment(data, index_data_class_entropy)
    data_segment_entropy = pd.read_csv(
        './data/segment/batch146_20_segment.csv')

    ## 6x1 class entropy:
    h_class = index_data_class_entropy['h_class']
    ## 6x19 segment entropy (anomalies x features):
    h_segment = data_segment_entropy

    # numpy array len(anomalies) x len(features)
    distance = calculate_D(h_segment, h_class)
    # add a reward for different features
    # numpy array len(features) x 1
    aggregated_distance = aggreate_distance(distance)
    # convert the list of data frames to a single data frame
    aggregated_data = combine_data(filtered_data)
    # list of all the features
    features_list = data_segment_entropy.columns
    correlated_feature_index = remove_monotonic_feature(
        aggregated_data, features_list)
    Exstream_feature, Exstream_data = drop_features(aggregated_distance,
                                                    aggregated_data,
                                                    features_list,
                                                    correlated_feature_index)

    # after removing correlated features (via clustering) we will have Exstream_cluster
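    # (completion sketch: the original snippet ends here; this mirrors the
    # identical call made inside the loop above)
    Exstream_cluster = remove_correlated_features(Exstream_data,
                                                  Exstream_feature,
                                                  features_list,
                                                  aggregated_distance)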