import numpy as np


def stability(filtered_data, features_list, iteration):
    """
    Repeatedly resample the data and compute the average feature-set size
    and stability score for each anomaly.

    :param filtered_data: list of per-anomaly data segments
    :param features_list: list of features
    :param iteration: number of sampling iterations
    :return: stability matrix and list of selected feature sets
    """
    feature_list_result = []
    for i in range(iteration):
        new_data = random_data(filtered_data)
        index_data = calculate_class_entropy(new_data, "stability")
        new_data = select_segment(new_data, index_data)
        data_segment_entropy = calculate_segment_entropy(new_data)
        distance = calculate_D(data_segment_entropy, index_data['h_class'])
        for j in range(len(distance)):
            correlated_feature_index = remove_monotonic_feature(filtered_data[j], features_list)
            Exstream_feature, Exstream_data = drop_features(
                distance[j, :], filtered_data[j], features_list, correlated_feature_index)
            if len(Exstream_feature) == 1:
                feature_list_result.append(Exstream_data.columns[:-1].values)
            else:
                Exstream_cluster = remove_correlated_features(
                    Exstream_data, Exstream_feature, features_list, distance[j, :])
                feature_list_result.append(Exstream_cluster.columns[:-1].values)

    # aggregate the feature sets collected across iterations, one column per anomaly
    stability_matrix = np.zeros((2, len(distance)))
    feature_array = np.array(feature_list_result)
    for i in range(len(distance)):
        index = np.array(range(i, len(feature_array), len(distance)))
        temp = feature_array[index]
        avg_size, stability_score = stats(temp)
        stability_matrix[:, i] = avg_size, stability_score
    return stability_matrix, feature_list_result
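# Usage sketch (illustrative, not part of the original code): given a list of
# per-anomaly segments `filtered_data` and the feature columns `features_list`
# produced by the driver scripts below, the routine could be called as
#
#   stability_matrix, feature_sets = stability(filtered_data, features_list, 10)
#
# where row 0 of stability_matrix holds the average number of selected features
# per anomaly and row 1 the stability score; 10 iterations is an arbitrary choice.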
path_clean = 'data/clean'
path_truth = 'data/truth'
path_segment = 'data/aggregated'
file_clean_list = [
    'batch146_17_clean.csv',
    'batch146_19_clean.csv',
    'batch146_20_clean.csv'
]

for file_clean in file_clean_list:
    file_truth = file_clean.replace('clean', 'truth')
    file_segment = file_clean.replace('clean', 'aggregated')
    data = pd.read_csv(os.path.join(path_clean, file_clean))
    # read index (ground-truth) data
    index_data = pd.read_csv(os.path.join(path_truth, file_truth))
    index_data_mapped = mapping(index_data)
    index_data_class_entropy = calculate_class_entropy(index_data_mapped)
    filtered_data = select_segment(data, index_data_class_entropy)
    aggregated_data = combine_data(filtered_data)
    index_data = calculate_class_entropy(aggregated_data, "aggregate")
    data_segment_entropy = pd.read_csv(os.path.join(path_segment, file_segment))
    # data_segment_entropy = calculate_segment_entropy(aggregated_data, "aggregate")
    distance = calculate_D(data_segment_entropy, index_data['h_class'])
    features_list = data_segment_entropy.columns
    correlated_feature_index = remove_monotonic_feature(aggregated_data, features_list)
    Exstream_feature, Exstream_data = drop_features(
        distance[0], aggregated_data, features_list, correlated_feature_index)
    Exstream_cluster = remove_correlated_features(
        Exstream_data, Exstream_feature, features_list, distance[0])
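    # Illustrative continuation (a sketch, not part of the original script):
    # report which features survive the clustering step for this file, and
    # optionally re-run the selection on resampled data via stability() above.
    # The iteration count of 10 is an arbitrary choice.
    print(file_clean, Exstream_cluster.columns[:-1].values)
    stability_matrix, feature_sets = stability(filtered_data, features_list, 10)
    print(stability_matrix)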
import pandas as pd
from sklearn.metrics import precision_score, recall_score, confusion_matrix, \
    classification_report, accuracy_score, f1_score

# helper functions (mapping, calculate_class_entropy, select_segment,
# calculate_D, ...) are defined elsewhere in this project
pd.options.mode.chained_assignment = None

# shiqiGao [email protected]
# Thu Dao [email protected]

if __name__ == '__main__':
    # read cleaned data
    data = pd.read_csv('./data/clean/batch146_20_clean.csv')
    # read index (ground-truth) data
    index_data = pd.read_csv('./data/truth/batch146_20_truth.csv')
    # map index data and calculate class entropy
    index_data_mapped = mapping(index_data)
    index_data_class_entropy = calculate_class_entropy(index_data_mapped)
    # calculate segment entropy
    filtered_data = select_segment(data, index_data_class_entropy)
    data_segment_entropy = pd.read_csv('./data/segment/batch146_20_segment.csv')
    # 6x1 class entropy
    h_class = index_data_class_entropy['h_class']
    # 6x19 segment entropy
    h_segment = data_segment_entropy
    # numpy array: len(anomalies) x len(features)
    distance = calculate_D(h_segment, h_class)
    # adding reward for different features
    # numpy array: len(features) x 1
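    # Illustrative continuation (a sketch, not part of the original script):
    # rank the features for the first anomaly by their distance/reward value,
    # assuming larger values mean the feature separates the anomaly better.
    import numpy as np
    features_list = data_segment_entropy.columns
    ranked_features = features_list[np.argsort(distance[0])[::-1]]
    print(ranked_features[:5])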