Example #1
import numpy as np

def stability(filtered_data, features_list, iteration):
    """
    This function repeatedly sample the data and calculate the average feature size and stability score for each anomalies
    :param filtered_data: list of data
    :param features_list: list of features
    :param iteration: number of iteration
    :return:
    stability matrix and feature list
    """
    feature_list_result = []
    for i in range(iteration):
        new_data = random_data(filtered_data)
        index_data = calculate_class_entropy(new_data, "stability")
        # keep only the segments selected by the class-entropy index
        new_data = select_segment(new_data, index_data)
        data_segment_entropy = calculate_segment_entropy(new_data)
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        for j in range(len(distance)):
            correlated_feature_index = remove_monotonic_feature(filtered_data[j], features_list)
            Exstream_feature, Exstream_data = drop_features(distance[j, :], filtered_data[j], features_list,
                                                            correlated_feature_index)
            if len(Exstream_feature) == 1:
                feature_list_result.append(Exstream_data.columns[:-1].values)
            else:
                Exstream_cluster = remove_correlated_features(Exstream_data, Exstream_feature, features_list,
                                                              distance[j, :])
                feature_list_result.append(Exstream_cluster.columns[:-1].values)

    stability_matrix = np.zeros((2, len(distance)))
    # object dtype: the per-iteration feature sets can differ in length
    feature_array = np.array(feature_list_result, dtype=object)
    for i in range(len(distance)):
        index = np.array(range(i, len(feature_array), len(distance)))
        temp = feature_array[index]
        avg_size, stability_score = stats(temp)
        stability_matrix[:, i] = avg_size, stability_score

    return stability_matrix, feature_list_result
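
# Hypothetical usage sketch, not part of the original example: it assumes the
# helper functions called above (random_data, calculate_class_entropy,
# select_segment, calculate_D, drop_features, remove_correlated_features,
# stats, ...) are importable, and that each anomaly segment is a labeled
# DataFrame whose last column is the class label.
import pandas as pd

# Placeholder per-anomaly frames; real inputs would come from select_segment.
anomaly_frames = [
    pd.DataFrame({'cpu': [0.1, 0.9, 0.8], 'mem': [0.2, 0.7, 0.6],
                  'label': [0, 1, 1]}),
    pd.DataFrame({'cpu': [0.3, 0.4, 0.9], 'mem': [0.5, 0.5, 0.8],
                  'label': [0, 0, 1]}),
]
feature_names = anomaly_frames[0].columns[:-1]

# Row 0 of the result holds the average feature-set size per anomaly,
# row 1 the stability score, over 30 resampling iterations.
stability_matrix, selected_sets = stability(anomaly_frames, feature_names, 30)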
Example #2
    path_clean = 'data/clean'
    path_truth = 'data/truth'
    path_segment = 'data/aggregated'
    file_clean_list = [
        'batch146_17_clean.csv', 'batch146_19_clean.csv',
        'batch146_20_clean.csv'
    ]

    for file_clean in file_clean_list:
        file_truth = file_clean.replace('clean', 'truth')
        file_segment = file_clean.replace('clean', 'aggregated')
        data = pd.read_csv(os.path.join(path_clean, file_clean))

        # read index data
        index_data = pd.read_csv(os.path.join(path_truth, file_truth))
        index_data_mapped = mapping(index_data)
        index_data_class_entropy = calculate_class_entropy(index_data_mapped)
        filtered_data = select_segment(data, index_data_class_entropy)
        aggregated_data = combine_data(filtered_data)
        index_data = calculate_class_entropy(aggregated_data, "aggregate")
        data_segment_entropy = pd.read_csv(
            os.path.join(path_segment, file_segment))
        #data_segment_entropy = calculate_segment_entropy(aggregated_data, "aggregate")
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        features_list = data_segment_entropy.columns
        correlated_feature_index = remove_monotonic_feature(
            aggregated_data, features_list)
        Exstream_feature, Exstream_data = drop_features(
            distance[0], aggregated_data, features_list,
            correlated_feature_index)
        Exstream_cluster = remove_correlated_features(Exstream_data,
                                                      Exstream_feature,
                                                      features_list,
                                                      distance[0])
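        # Hypothetical follow-up, not part of the original snippet: the columns
        # of Exstream_cluster (all but the trailing label column) are the
        # features selected to explain this file's anomalies, mirroring how
        # Example #1 collects them.
        print(file_clean, Exstream_cluster.columns[:-1].values)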

Example #3

import pandas as pd
from sklearn.metrics import precision_score, recall_score, confusion_matrix, \
    classification_report, accuracy_score, f1_score

pd.options.mode.chained_assignment = None

# shiqiGao [email protected]
# Thu Dao [email protected]
if __name__ == '__main__':
    ## read cleaned data
    data = pd.read_csv('./data/clean/batch146_20_clean.csv')
    # read index data
    index_data = pd.read_csv('./data/truth/batch146_20_truth.csv')

    ## map index data and calculate class entropy
    index_data_mapped = mapping(index_data)
    index_data_class_entropy = calculate_class_entropy(index_data_mapped)

    ## calculate segment entropy
    filtered_data = select_segment(data, index_data_class_entropy)
    data_segment_entropy = pd.read_csv(
        './data/segment/batch146_20_segment.csv')

    ## 6x1 class entropy:
    h_class = index_data_class_entropy['h_class']
    ## 6x19 segment entropy (anomalies x features):
    h_segment = data_segment_entropy

    # numpy array len(anomalies) x len(features)
    distance = calculate_D(h_segment, h_class)
    # adding reward for different features
    # numpy array len(feature) x 1