class BetaCalculator:
    """Base class that computes a per-graph "beta" vector for every graph in a
    Graphs collection and stacks them into a matrix (one row per graph).
    Subclasses must implement :meth:`_calc_beta`.
    """

    def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger = None):
        # fall back to a default console logger when none is supplied
        self._logger = logger if logger else PrintLogger("default graphs logger")
        self._graphs = graphs
        self._ftr_pairs = feature_pairs
        num_features = graphs.features_matrix(0).shape[1]
        # one beta entry per explicit feature pair, or per possible pair of features
        num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
        self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
        self._build()

    def _build(self):
        # fill the beta matrix row by row, one row per graph
        for graph_index, g_id in enumerate(self._graphs.graph_names()):
            self._logger.debug("calculating beta vec for:\t" + g_id)
            self._beta_matrix[graph_index, :] = self._calc_beta(g_id)

    def _calc_beta(self, gid):
        """Return the beta vector for graph *gid* — subclass responsibility."""
        raise NotImplementedError()

    def beta_matrix(self):
        return self._beta_matrix

    def to_file(self, file_name):
        """Write the beta matrix to *file_name*, one line per graph.

        BUG FIX: the original opened the file with the invalid mode "rw"
        (open() raises ValueError on it) and never closed the handle; it also
        sized the inner loop with len(self._ftr_pairs), which crashes when the
        calculator was built with feature_pairs=None.
        """
        with open(file_name, "w") as out_file:
            for i in range(self._graphs.number_of_graphs()):
                out_file.write(self._graphs.index_to_name(i))  # graph_name
                # use the matrix width so this also works when feature_pairs is None
                for j in range(self._beta_matrix.shape[1]):
                    # NOTE(review): values are written back-to-back with no
                    # separator, as in the original — confirm intended format
                    out_file.write(str(self._beta_matrix[i][j]))  # beta_vector
                out_file.write("\n")
class FeaturesPicker:
    """Base class that ranks vertex features and selects the most informative
    feature pairs. Subclasses must implement :meth:`_pick`.
    """

    def __init__(self, graphs: Graphs, logger: BaseLogger = None, size=10, identical_bar=0.6):
        self._logger = logger if logger else PrintLogger("default logger")
        self._size = size  # number of pairs to pick
        self._graphs = graphs
        self._features_matrix = self._get_features_np_matrix()
        # if a feature has identical values on more than bar*|V| vertices - feature is dropped
        self._identical_bar = identical_bar
        # per feature: fraction of vertices sharing the single most common value
        self._features_identicality = []
        self._fill_features_identicality()
        self._best_pairs = self._pick()

    def _get_features_np_matrix(self):
        return self._graphs.features_matrix_by_index(for_all=True)

    def _pick(self):
        """Fill best pairs with the most informative pairs of features —
        subclass responsibility."""
        raise NotImplementedError()

    def best_pairs(self):
        return self._best_pairs

    def _fill_features_identicality(self):
        """Compute, for every feature column, the fraction of rows holding the
        most common value (its "identicality")."""
        self._logger.debug("start features identicality")
        rows, cols = self._features_matrix.shape
        for i in range(cols):
            # NOTE(review): .T.tolist()[0] implies _features_matrix is an
            # np.matrix (tolist keeps 2-D nesting) — confirm against producer
            column_values = self._features_matrix[:, i].T.tolist()[0]
            most_common_count = collections.Counter(column_values).most_common(1)[0][1]
            self._features_identicality.append(most_common_count / rows)
        self._logger.debug("end_features identicality")

    def _identicality_for(self, feature_index):
        return self._features_identicality[feature_index]

    def _is_feature_relevant(self, feature_index):
        # idiom fix: drop redundant "True if ... else False" around a boolean
        return self._features_identicality[feature_index] < self._identical_bar
class TPGAD:
    """Temporal-graph anomaly detection pipeline: builds a temporal multi-graph,
    extracts per-graph feature matrices, picks correlated feature pairs,
    regresses "beta" vectors, scores graphs and plots the detected anomalies.
    """

    def __init__(self, params):
        """*params* is either a config dict or a path to a JSON config file."""
        if isinstance(params, dict):  # isinstance, not type(...) is dict
            self._params = params
        else:
            # close the config file handle (the original leaked it)
            with open(params, "rt") as cfg:
                self._params = json.load(cfg)
        self._logger = PrintLogger("graph-ad")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
        # NOTE(review): number of anomalies to report is twice the ground-truth
        # size — presumably to widen the candidate set; confirm intent
        self._num_anomalies = len(self._ground_truth) * 2
        self._idx_to_graph = list(self._temporal_graph.graph_names())
        self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
        self._run_ad()

    def _load_ground_truth(self, gt_file):
        """Map ground-truth graph indexes to their anomaly score (default 1)."""
        df = pd.read_csv(gt_file)
        return {self._temporal_graph.name_to_index(row.anomaly): row.get("score", 1)
                for i, row in df.iterrows()}

    def data_name(self):
        """Canonical dataset name encoding the weighted/max-connected/directed flags."""
        max_connected = "max_connected_" if self._params['features']['max_connected'] else ""
        directed = "directed" if self._params['dataset']['directed'] else "undirected"
        weighted = "weighted_" if self._params['dataset']['weight_col'] is not None else ""
        return f"{self._params['dataset']['name']}_{weighted}{max_connected}{directed}"

    def _build_temporal_graph(self):
        """Load the temporal multi-graph from pickle, or build it from the raw
        dataset (and optionally dump it for next time)."""
        tg_pkl_dir = os.path.join(self._params['general']['pkl_path'], "temporal_graphs")
        tg_pkl_path = os.path.join(tg_pkl_dir, f"{self.data_name()}_tg.pkl")
        if os.path.exists(tg_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            with open(tg_pkl_path, "rb") as pkl:
                tg = pickle.load(pkl)
        else:
            tg = TemporalGraph(self.data_name(), self._params['dataset']['filename'],
                               self._params['dataset']['time_format'],
                               self._params['dataset']['time_col'],
                               self._params['dataset']['src_col'],
                               self._params['dataset']['dst_col'],
                               weight_col=self._params['dataset'].get('weight_col', None),
                               weeks=self._params['dataset'].get('week_split', None),
                               days=self._params['dataset'].get('day_split', None),
                               hours=self._params['dataset'].get('hour_split', None),
                               minutes=self._params['dataset'].get('min_split', None),
                               seconds=self._params['dataset'].get('sec_split', None),
                               directed=self._params['dataset']['directed'],
                               logger=self._logger).to_multi_graph()
            # logger is detached while the graph may be pickled
            tg.suspend_logger()
        if self._params['general']["dump_pkl"]:
            os.makedirs(tg_pkl_dir, exist_ok=True)
            with open(tg_pkl_path, "wb") as pkl:
                pickle.dump(tg, pkl)
        tg.wake_logger()
        return tg

    def _calc_tg_feature_matrix(self):
        """Return {graph_name: feature matrix}, building (and caching) the
        per-graph topological features."""
        log_ext = "log_" if self._params['features']['log'] else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")
        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            with open(mat_pkl, "rb") as pkl:
                return pickle.load(pkl)
        gnx_to_vec = {}
        # create dir for database
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if self._params['general']["dump_pkl"]:
                os.makedirs(gnx_path, exist_ok=True)
            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path,
                                    logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            # build features
            gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(
                norm_func=log_norm if self._params['features']['log'] else None)
        if self._params['general']['dump_pkl']:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            with open(mat_pkl, "wb") as pkl:
                pickle.dump(gnx_to_vec, pkl)
        return gnx_to_vec

    def _get_beta_vec(self, mx_dict, best_pairs):
        """Build the beta-vector context (regression over sliding windows)."""
        self._logger.debug("calculating beta vectors")
        beta_type = self._params['beta_vectors']['type']
        if beta_type == "regression":
            beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                 window_size=self._params['beta_vectors']['window_size'])
        elif beta_type == "mean_regression":
            beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                     window_size=self._params['beta_vectors']['window_size'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got {self._params['beta_vectors']['type']}"
                               f" while valid options are: regression/mean_regression ")
        if self._params['general']['dump_pkl']:
            beta_pkl_dir = os.path.join(self._params['general']['pkl_path'], "beta_matrix")
            tg_pkl_path = os.path.join(beta_pkl_dir, f"{self.data_name()}_beta.pkl")
            os.makedirs(beta_pkl_dir, exist_ok=True)
            with open(tg_pkl_path, "wb") as pkl:
                pickle.dump(beta.beta_matrix(), pkl)
        self._logger.debug("finish calculating beta vectors")
        return beta

    def _get_graphs_score(self, beta_matrix):
        """Build the per-graph anomaly scorer selected by params[score][type]."""
        score_type = self._params['score']['type']
        if score_type == "knn":
            return KnnScore(beta_matrix, self._params['score']['params']['knn']['k'],
                            self.data_name(),
                            window_size=self._params['score']['window_size'])
        elif score_type == "gmm":
            return GmmScore(beta_matrix, self.data_name(),
                            window_size=self._params['score']['window_size'],
                            n_components=self._params['score']['params']['gmm']['n_components'])
        elif score_type == "local_outlier":
            return LocalOutlierFactorScore(beta_matrix, self.data_name(),
                                           window_size=self._params['score']['window_size'],
                                           n_neighbors=self._params['score']['params']['local_outlier']['n_neighbors'])
        else:
            # BUG FIX: the original message referenced params[beta_vectors][type]
            # (copy-pasted from _get_beta_vec); this branch validates params[score][type]
            raise RuntimeError(f"invalid value for params[score][type], got {score_type}"
                               f" while valid options are: knn/gmm/local_outlier")

    def _run_ad(self):
        """End-to-end pipeline: features -> pair picking -> beta -> score -> plot."""
        mx_dict = self._calc_tg_feature_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx,
                                              size=self._params['feature_pair_picker']['num_pairs'],
                                              logger=self._logger,
                                              identical_bar=self._params['feature_pair_picker']['overlap_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta_matrix = self._get_beta_vec(mx_dict, best_pairs).beta_matrix()
        scores = self._get_graphs_score(beta_matrix).score_list()
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, scores, self.data_name(),
                                             num_anomalies=self._num_anomalies)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh("", truth=self._ground_truth,
                                            info_text=str(self._params))
class TimedGraphs:
    """Adapter around TimedMultiGraphFeatures: builds the time-sliced
    multi-graph, advances it in time, and forwards its query API.
    """

    def __init__(self, database_name, start_time=10, logger: BaseLogger = None,
                 features_meta=None, directed=False, files_path=None,
                 date_format=None, largest_cc=False):
        self._start_time = start_time
        self._features_meta = NODE_FEATURES if features_meta is None else features_meta
        self._largest_cc = largest_cc
        self._date_format = date_format
        self._directed = directed
        self._database_name = database_name + "_directed:" + str(directed) + "_lcc:" + str(largest_cc)
        self._path = os.path.join('data', self._database_name)
        self._logger = logger if logger else PrintLogger("default graphs logger")
        self._files_path = files_path  # location of graphs as files
        # make directories to save features data (as pickles);
        # idiom fix: replaces the racy listdir()/mkdir() two-step
        os.makedirs(self._path, exist_ok=True)
        self._logger.debug("graphs initialized")
        self._initiation()

    def get_feature_meta(self):
        return self._multi_graph.get_feature_meta()

    def is_directed(self):
        return self._directed

    def _is_loaded(self):
        # idiom fix: drop redundant "True if ... else False"
        return os.path.exists(self._get_pickle_path())

    def _load_graphs(self):
        self._logger.debug("load multi-graph - start")
        with open(self._get_pickle_path(), "rb") as pkl:
            self._multi_graph = pickle.load(pkl)
        self._logger.debug("pickle loaded")
        self._logger.debug("load multi-graph - end")

    def _dump(self, redump=False):
        """Pickle the multi-graph to disk (skipped when a pickle already
        exists, unless *redump*)."""
        if self._is_loaded() and not redump:
            self._logger.debug("multi-graph is already loaded")
            return
        # logger and key_func are detached before pickling (not picklable);
        # try/finally guarantees they are reattached even if dumping fails
        log = self._multi_graph._logger
        key_func = self._multi_graph._key_func
        self._multi_graph._logger = None
        self._multi_graph._key_func = None
        try:
            with open(self._get_pickle_path(), "wb") as pkl:
                pickle.dump(self._multi_graph, pkl)
        finally:
            self._multi_graph._logger = log
            self._multi_graph._key_func = key_func
        self._logger.debug("multi-graph dumped")

    def _get_pickle_path(self):
        return os.path.join(self._path, self._database_name + ".pkl")

    def _initiation(self):
        """Build the multi-graph, fast-forward it *start_time* steps and build features."""
        self._logger.debug("build multi-graph - start")
        self._multi_graph = TimedMultiGraphFeatures(
            self._database_name, self._logger, features_meta=self._features_meta,
            directed=self._directed, files_path=self._files_path,
            pkl_dir=self._path, date_format=self._date_format)
        for _ in range(self._start_time):
            self._multi_graph.forward_time()
        self._multi_graph.build_features(largest_cc=self._largest_cc, should_zscore=False)

    def forward_time(self):
        """Advance one time step, rebuild features; returns the multi-graph's flag."""
        flag = self._multi_graph.forward_time()
        self._multi_graph.build_features(largest_cc=self._largest_cc, should_zscore=False)
        self._logger.debug("build multi-graph - end")
        return flag

    def get_labels(self):
        return self._multi_graph.get_labels()

    # Adapter for multi graph
    def get_subgraph(self, graph_name):
        return self._multi_graph.subgraph_by_name(graph_name)

    def subgraph_by_name(self, graph_name: str):
        return self._multi_graph.subgraph_by_name(graph_name)

    def subgraph_by_index(self, index: int):
        return self._multi_graph.subgraph_by_index(index)

    def combined_graph_by_names(self, names_list=None, combine_all=False):
        return self._multi_graph.combined_graph_by_names(names_list, combine_all)

    def combined_graph_by_indexes(self, index_list=None, combine_all=False):
        # BUG FIX: the original called itself (infinite recursion);
        # delegate to the underlying multi-graph like the sibling adapters
        return self._multi_graph.combined_graph_by_indexes(index_list, combine_all)

    def is_graph(self, graph_name):
        return self._multi_graph.is_graph(graph_name)

    def index_to_name(self, index):
        return self._multi_graph.index_to_name(index)

    def name_to_index(self, graph_name):
        return self._multi_graph.name_to_index(graph_name)

    def features_matrix_by_index(self, graph_start=0, graph_end=0, for_all=False):
        return self._multi_graph.features_matrix_by_indexes(graph_start, graph_end, for_all)

    def features_matrix_by_name(self, graph_start=0, graph_end=0, for_all=False):
        return self._multi_graph.features_matrix_by_names(graph_start, graph_end, for_all)

    def features_matrix(self, graph):
        return self._multi_graph.feature_matrix(graph)

    def nodes_for_graph(self, graph):
        return self._multi_graph.nodes_for_graph(graph)

    def nodes_count_list(self):
        return self._multi_graph.nodes_count_list()

    def edges_for_graph(self, graph):
        # BUG FIX(review): the original delegated to nodes_for_graph (copy-paste);
        # assumes the multi-graph exposes edges_for_graph — TODO confirm
        return self._multi_graph.edges_for_graph(graph)

    def edges_count_list(self):
        # BUG FIX(review): the original delegated to nodes_count_list (copy-paste);
        # assumes the multi-graph exposes edges_count_list — TODO confirm
        return self._multi_graph.edges_count_list()

    def subgraphs(self, start_id=None, end_id=None):
        for gid in self._multi_graph._list_id[start_id:end_id]:
            yield self.subgraph_by_name(gid)

    def number_of_graphs(self):
        return self._multi_graph.number_of_graphs()

    def graph_names(self, start_id=None, end_id=None):
        for gid in self._multi_graph._list_id[start_id:end_id]:
            yield gid

    def norm_features(self, norm_function):
        self._multi_graph.norm_features(norm_function)