# LinearContext.__init__ (the base class is not shown in this excerpt; `np` is numpy).
def __init__(self, graphs: Graphs, feature_pairs, split=1):
    self._interval = int(graphs.number_of_graphs() / split)
    self._all_features = []
    for graph in graphs.graph_names():
        m = graphs.features_matrix(graph)
        # self._nodes_for_graph.append(m.shape[0])
        # append graph features
        self._all_features.append(m)
        # append 0.001 for all missing nodes
        self._all_features.append(
            np.ones((graphs.nodes_for_graph(graphs.name_to_index(graph)) - m.shape[0],
                     m.shape[1])) * 0.001)
    # create one big matrix of everything - rows: nodes, columns: features
    self._all_features = np.concatenate(self._all_features)
    # all_ftr_graph_index - [ .... last_row_index_for_graph_i ... ]
    self._all_ftr_graph_index = np.cumsum([0] + graphs.nodes_count_list()).tolist()
    super(LinearContext, self).__init__(graphs, feature_pairs)
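# A minimal, self-contained sketch of the padding-and-indexing scheme above, using toy
# numpy data instead of the project's Graphs API (the toy matrices and node counts here
# are assumptions for illustration): each graph's feature matrix is padded with 0.001
# rows up to that graph's node count, the blocks are stacked into one big matrix, and
# np.cumsum records the row offsets that delimit each graph's block.
import numpy as np

per_graph = [np.ones((2, 3)), np.ones((1, 3))]  # toy feature matrices (nodes x features)
node_counts = [3, 2]                            # total nodes per graph (>= rows present)

blocks = []
for m, n in zip(per_graph, node_counts):
    blocks.append(m)                                              # observed nodes
    blocks.append(np.ones((n - m.shape[0], m.shape[1])) * 0.001)  # pad missing nodes
all_features = np.concatenate(blocks)                # shape (sum(node_counts), 3) == (5, 3)
graph_index = np.cumsum([0] + node_counts).tolist()  # [0, 3, 5]: graph i is rows [graph_index[i], graph_index[i+1])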
class AnomalyDetection:
    def __init__(self):
        # pearson + linear_regression(simple) + KNN + context
        self._params = {
            'database': 'EnronInc',
            # 'database': 'mc2_vast12',
            # 'database': 'twitter_security',
            'files_path': "../../../databases/EnronInc/EnronInc_by_day",
            # 'files_path': "../../databases/mc2_vast12/basic_by_minute",
            # 'files_path': "../../databases/twitter_security/data_by_days",
            'date_format': '%d-%b-%Y.txt',  # Enron
            # 'date_format': '%d:%m:%Y_%H:%M.txt',  # vast
            # 'date_format': '%d:%m.txt',  # Twitter
            'directed': False,
            'max_connected': False,
            'logger_name': "default Anomaly logger",
            'ftr_pairs': 28,
            'identical_bar': 0.9,
            # 'single_c': False,
            'dist_mat_file_name': "dist_mat",
            'anomalies_file_name': "anomalies",
            'context_beta': 4.1,
            'KNN_k': 18,
            'context_split': 1,
            'context_bar': 0.45
        }
        self._ground_truth = ['13-Dec-2000', '18-Oct-2001', '22-Oct-2001', '19-Nov-2001',
                              '23-Jan-2002', '30-Jan-2002', '04-Feb-2002']  # Enron
        # self._ground_truth = ['1:5', '13:5', '20:5', '24:5', '30:5', '3:6', '5:6', '6:6', '9:6',
        #                       '10:6', '11:6', '15:6', '18:6', '19:6', '20:6', '25:6', '26:6',
        #                       '3:7', '18:7', '30:7', '8:8', '9:8']  # Twitter
        # self._ground_truth = ['4:5:2012_17:51', '4:5:2012_20:25', '4:5:2012_20:26',
        #                       '4:5:2012_22:16', '4:5:2012_22:21', '4:5:2012_22:40',
        #                       '4:5:2012_22:41', '4:6:2012_17:41', '4:5:2012_18:11']  # vast
        self._logger = PrintLogger("default Anomaly logger")
        self._graphs = Graphs(self._params['database'], files_path=self._params['files_path'],
                              logger=self._logger, features_meta=ANOMALY_DETECTION_FEATURES,
                              directed=self._params['directed'],
                              date_format=self._params['date_format'],
                              largest_cc=self._params['max_connected'])
        self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES, pick_ftr=RE_PICK_FTR,
                           should_zscore=False)
        # normalize features ---------------------------------
        self._graphs.norm_features(log_norm)
        # convert to index
        self._ground_truth = [self._graphs.name_to_index(event) for event in self._ground_truth]
        self.print_feature_meta()

    def build(self):
        pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'],
                                              logger=self._logger,
                                              identical_bar=self._params['identical_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'])
        beta_matrix = beta.beta_matrix()
        score = KnnScore(beta_matrix, self._params['KNN_k'], self._params['database'],
                         context_split=self._params['context_beta'])
        score.dist_heat_map(self._params['dist_mat_file_name'])
        anomaly_picker = ContextAnomalyPicker(self._graphs, score.score_list(),
                                              self._params['database'], logger=None,
                                              split=self._params['context_split'],
                                              bar=self._params['context_bar'])
        anomaly_picker.build()
        anomaly_picker.plot_anomalies(self._params['anomalies_file_name'],
                                      truth=self._ground_truth,
                                      info_text=self.param_to_str())

    def print_feature_meta(self):
        for ftr in self._graphs.get_feature_meta():
            print(ftr)

    def build_manipulations(self):
        for ftr_num in range(24, 32, 2):
            self._params['ftr_pairs'] = ftr_num
            pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'],
                                                  logger=self._logger,
                                                  identical_bar=self._params['identical_bar'])
            best_pairs = pearson_picker.best_pairs()
            # beta = LinearRegBetaCalculator(self._graphs, best_pairs, single_c=self._params['single_c'])
            for context in range(4, 7):
                self._params['context_beta'] = context
                beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'])
                beta_matrix = beta.beta_matrix()
                for k in range(5, min(100, int(self._graphs.number_of_graphs() / context))):
                    self._params['KNN_k'] = k
                    score = KnnScore(beta_matrix, self._params['KNN_k'], self._params['database'],
                                     context_split=self._params['context_beta'])
                    # score = TestScore(beta_matrix, self._params['database'])
                    score.dist_heat_map(self._params['dist_mat_file_name'])
                    anomaly_picker = ContextAnomalyPicker(self._graphs, score.score_list(),
                                                          self._params['database'], logger=None,
                                                          split=self._params['context_split'],
                                                          bar=self._params['context_bar'])
                    anomaly_picker.build()
                    anomaly_picker.plot_anomalies(self._params['anomalies_file_name'],
                                                  truth=self._ground_truth,
                                                  info_text=self.param_to_str())

    def param_to_str(self):
        skip = ['database', 'files_path', 'date_format', 'logger_name', 'context_split',
                'context_bar', 'dist_mat_file_name', 'anomalies_file_name']
        param_str = ""
        for key, val in self._params.items():
            if key in skip:
                continue
            param_str += str(key) + ":" + str(val) + "\n"
        return param_str
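# A minimal usage sketch, assuming the class above is run as a script (the __main__
# guard is an assumption; build() and build_manipulations() are the methods defined above).
if __name__ == "__main__":
    ad = AnomalyDetection()
    ad.build()                  # single run with the configured parameters
    # ad.build_manipulations()  # or sweep ftr_pairs x context_beta x KNN_k instead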
# A second variant of AnomalyDetection: same pipeline, but it reloads the per-graph
# feature matrices pickled by an earlier version instead of using the freshly built ones.
import pickle
from os import path


class AnomalyDetection:
    def __init__(self):
        # pearson + linear_regression(simple) + KNN + context
        self._params = {
            'database': 'EnronInc',
            'files_path': "../databases/EnronInc/EnronInc_by_day",
            'date_format': '%d-%b-%Y.txt',  # Enron
            'directed': False,
            'max_connected': True,
            'logger_name': "Yeela's logger",
            'ftr_pairs': 25,
            'identical_bar': 0.95,
            'dist_mat_file_name': "dist_mat",
            'anomalies_file_name': "anomalies",
            'context_beta': 4,
            'KNN_k': 30,
            'context_split': 1,
            'context_bar': 0.45
        }
        # the anomalies
        self._ground_truth = ['13-Dec-2000', '18-Oct-2001', '22-Oct-2001', '19-Nov-2001',
                              '23-Jan-2002', '30-Jan-2002', '04-Feb-2002']  # Enron
        # init logger
        self._logger = PrintLogger(self._params['logger_name'])
        # init multi-graph
        self._graphs = Graphs(self._params['database'], files_path=self._params['files_path'],
                              logger=self._logger, features_meta=ANOMALY_DETECTION_FEATURES,
                              directed=self._params['directed'],
                              date_format=self._params['date_format'],
                              largest_cc=self._params['max_connected'])
        self._graphs.build(force_rebuild_ftr=REBUILD_FEATURES, pick_ftr=RE_PICK_FTR)
        # replace features with features from old version
        old_features_path = path.join("data", "EnronInc_directed:False_lcc:True", "old_features.pkl")
        with open(old_features_path, "rb") as pkl:
            self._graphs._multi_graph._features_matrix_dict = pickle.load(pkl)
        # convert anomalies name to index
        self._ground_truth = [self._graphs.name_to_index(event) for event in self._ground_truth]
        # print features that are being used
        self.print_feature_meta()

    def build(self):
        # step 1 - pick features
        pearson_picker = PearsonFeaturePicker(self._graphs, size=self._params['ftr_pairs'],
                                              logger=self._logger,
                                              identical_bar=self._params['identical_bar'])
        best_pairs = pearson_picker.best_pairs()
        # step 2 - calculate beta matrix
        beta = LinearContext(self._graphs, best_pairs, split=self._params['context_beta'],
                             logger=self._logger)
        beta_matrix = beta.beta_matrix()
        # step 3 - score each graph
        score = KnnScore(beta_matrix, self._params['KNN_k'], self._params['database'],
                         context_split=self._params['context_beta'])
        score.dist_heat_map(self._params['dist_mat_file_name'])
        # pick the anomalies based on the scores
        anomaly_picker = ContextAnomalyPicker(self._graphs, score.score_list(),
                                              self._params['database'], logger=None,
                                              split=self._params['context_split'],
                                              bar=self._params['context_bar'])
        anomaly_picker.build()
        anomaly_picker.plot_anomalies(self._params['anomalies_file_name'],
                                      truth=self._ground_truth,
                                      info_text=self.param_to_str())

    def print_feature_meta(self):
        for ftr in self._graphs.get_feature_meta():
            print(ftr)

    def param_to_str(self):
        skip = ['database', 'files_path', 'date_format', 'logger_name', 'context_split',
                'context_bar', 'dist_mat_file_name', 'anomalies_file_name']
        param_str = ""
        for key, val in self._params.items():
            if key in skip:
                continue
            param_str += str(key) + ":" + str(val) + "\n"
        return param_str
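# Hedged sketch of how "old_features.pkl" could have been produced by the earlier run:
# save_features is a hypothetical helper (not in the original); the attribute path
# _multi_graph._features_matrix_dict and the file location are taken from the loading
# code above.
import pickle
from os import path


def save_features(graphs, out_dir=path.join("data", "EnronInc_directed:False_lcc:True")):
    # persist the per-graph feature-matrix dict so a later run can reload it verbatim
    with open(path.join(out_dir, "old_features.pkl"), "wb") as f:
        pickle.dump(graphs._multi_graph._features_matrix_dict, f)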