class BetaCalculator:
    def __init__(self, graphs: Graphs, feature_pairs=None, logger: BaseLogger = None):
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default graphs logger")
        self._graphs = graphs
        self._ftr_pairs = feature_pairs
        num_features = graphs.features_matrix(0).shape[1]
        num_rows = len(feature_pairs) if feature_pairs else int(comb(num_features, 2))
        self._beta_matrix = np.zeros((self._graphs.number_of_graphs(), num_rows))
        self._build()

    def _build(self):
        for graph_index, g_id in enumerate(self._graphs.graph_names()):
            self._logger.debug("calculating beta vec for:\t" + g_id)
            self._beta_matrix[graph_index, :] = self._calc_beta(g_id)

    def _calc_beta(self, gid):
        raise NotImplementedError()

    def beta_matrix(self):
        return self._beta_matrix

    def to_file(self, file_name):
        with open(file_name, "w") as out_file:  # "rw" is not a valid mode
            for i in range(self._graphs.number_of_graphs()):
                out_file.write(self._graphs.index_to_name(i))  # graph_name
                # use the matrix width so this also works when feature_pairs is None
                for j in range(self._beta_matrix.shape[1]):
                    out_file.write("\t" + str(self._beta_matrix[i][j]))  # beta_vector
                out_file.write("\n")
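
# A minimal sketch of a concrete subclass, assuming feature_pairs was passed to
# the constructor and that graphs.features_matrix(g_id) returns a
# (num_vertices x num_features) numpy array. For every (i, j) pair it stores
# the least-squares slope of feature j regressed on feature i.
class LinearBetaCalculator(BetaCalculator):
    def _calc_beta(self, gid):
        mx = np.asarray(self._graphs.features_matrix(gid), dtype=float)
        beta = np.zeros(len(self._ftr_pairs))
        for row, (i, j) in enumerate(self._ftr_pairs):
            x, y = mx[:, i], mx[:, j]
            var = x.var()
            # a constant feature gets a slope of 0 instead of dividing by zero
            beta[row] = np.cov(x, y, bias=True)[0, 1] / var if var > 0 else 0.0
        return beta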
class FeaturesPicker:
    def __init__(self,
                 graphs: Graphs,
                 logger: BaseLogger = None,
                 size=10,
                 identical_bar=0.6):
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default logger")
        self._size = size  # number of pairs to pick
        self._graphs = graphs
        self._features_matrix = self._get_features_np_matrix()
        self._identical_bar = identical_bar  # a feature is dropped if more than bar*|V| vertices share one value
        self._features_identicality = []  # per feature: fraction of vertices in the largest same-value group
        self._fill_features_identicality()
        self._best_pairs = self._pick()

    def _get_features_np_matrix(self):
        return self._graphs.features_matrix_by_index(for_all=True)

    # fill best pairs with the most informative pairs of features
    def _pick(self):
        raise NotImplementedError()

    def best_pairs(self):
        return self._best_pairs

    def _fill_features_identicality(self):
        self._logger.debug("start features identicality")
        rows, cols = self._features_matrix.shape
        for i in range(cols):
            # fraction of vertices in the largest group sharing a value of feature i
            values = self._features_matrix[:, i].T.tolist()[0]
            most_common_count = collections.Counter(values).most_common(1)[0][1]
            self._features_identicality.append(most_common_count / rows)
        self._logger.debug("end features identicality")

    def _identicality_for(self, feature_index):
        return self._features_identicality[feature_index]

    def _is_feature_relevant(self, feature_index):
        return self._features_identicality[feature_index] < self._identical_bar
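
# A hedged sketch of a concrete picker (not the project's PearsonFeaturePicker):
# it keeps every feature whose identicality is below the bar and returns the
# self._size pairs with the highest absolute Pearson correlation.
class SimplePearsonPicker(FeaturesPicker):
    def _pick(self):
        mx = np.asarray(self._features_matrix, dtype=float)
        relevant = [i for i in range(mx.shape[1]) if self._is_feature_relevant(i)]
        scored = []
        for a, i in enumerate(relevant):
            for j in relevant[a + 1:]:
                corr = np.corrcoef(mx[:, i], mx[:, j])[0, 1]
                if not np.isnan(corr):
                    scored.append((abs(corr), i, j))
        scored.sort(reverse=True)  # strongest correlations first
        return [(i, j) for _, i, j in scored[:self._size]]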
class TPGAD:
    def __init__(self, params):
        self._params = params if isinstance(params, dict) else json.load(open(params, "rt"))
        self._logger = PrintLogger("graph-ad")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
        self._num_anomalies = len(self._ground_truth)*2
        self._idx_to_graph = list(self._temporal_graph.graph_names())
        self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
        self._run_ad()

    def _load_ground_truth(self, gt_file):
        df = pd.read_csv(gt_file)
        return {self._temporal_graph.name_to_index(row.anomaly): row.get("score", 1) for _, row in df.iterrows()}

    def data_name(self):
        max_connected = "max_connected_" if self._params['features']['max_connected'] else ""
        directed = "directed" if self._params['dataset']['directed'] else "undirected"
        weighted = "weighted_" if self._params['dataset']['weight_col'] is not None else ""
        return f"{self._params['dataset']['name']}_{weighted}{max_connected}{directed}"

    def _build_temporal_graph(self):
        tg_pkl_dir = os.path.join(self._params['general']['pkl_path'], "temporal_graphs")
        tg_pkl_path = os.path.join(tg_pkl_dir, f"{self.data_name()}_tg.pkl")
        if os.path.exists(tg_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(tg_pkl_path, "rb"))
        else:
            tg = TemporalGraph(self.data_name(), self._params['dataset']['filename'], self._params['dataset']['time_format'],
                               self._params['dataset']['time_col'], self._params['dataset']['src_col'],
                               self._params['dataset']['dst_col'],
                               weight_col=self._params['dataset'].get('weight_col', None),
                               weeks=self._params['dataset'].get('week_split', None),
                               days=self._params['dataset'].get('day_split', None),
                               hours=self._params['dataset'].get('hour_split', None),
                               minutes=self._params['dataset'].get('min_split', None),
                               seconds=self._params['dataset'].get('sec_split', None),
                               directed=self._params['dataset']['directed'],
                               logger=self._logger).to_multi_graph()

            tg.suspend_logger()
            if self._params['general']["dump_pkl"]:
                os.makedirs(tg_pkl_dir, exist_ok=True)
                pickle.dump(tg, open(tg_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_tg_feature_matrix(self):
        log_ext = "log_" if self._params['features']['log'] else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")

        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl, "rb"))

        gnx_to_vec = {}
        # create dir for database
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if self._params['general']["dump_pkl"]:
                os.makedirs(gnx_path, exist_ok=True)

            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm if self._params['features']['log'] else None)
        if self._params['general']['dump_pkl']:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            pickle.dump(gnx_to_vec, open(mat_pkl, "wb"))
        return gnx_to_vec

    def _get_beta_vec(self, mx_dict, best_pairs):
        self._logger.debug("calculating beta vectors")

        if self._params['beta_vectors']['type'] == "regression":
            beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                 window_size=self._params['beta_vectors']['window_size'])
        elif self._params['beta_vectors']['type'] == "mean_regression":
            beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                     window_size=self._params['beta_vectors']['window_size'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got {self._params['beta_vectors']['type']}"
                               f" while valid options are: regression/mean_regression ")
        if self._params['general']['dump_pkl']:
            beta_pkl_dir = os.path.join(self._params['general']['pkl_path'], "beta_matrix")
            tg_pkl_path = os.path.join(beta_pkl_dir, f"{self.data_name()}_beta.pkl")
            os.makedirs(beta_pkl_dir, exist_ok=True)
            pickle.dump(beta.beta_matrix(), open(tg_pkl_path, "wb"))
        self._logger.debug("finish calculating beta vectors")

        return beta

    def _get_graphs_score(self, beta_matrix):
        score_type = self._params['score']['type']
        if score_type == "knn":
            return KnnScore(beta_matrix, self._params['score']['params']['knn']['k'], self.data_name(),
                            window_size=self._params['score']['window_size'])
        elif score_type == "gmm":
            return GmmScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                            n_components=self._params['score']['params']['gmm']['n_components'])
        elif score_type == "local_outlier":
            return LocalOutlierFactorScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                                           n_neighbors=self._params['score']['params']['local_outlier']['n_neighbors'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got {score_type}"
                               f" while valid options are: knn/gmm/local_outlier")

    def _run_ad(self):
        mx_dict = self._calc_tg_feature_matrix()
        concat_mx = np.vstack(list(mx_dict.values()))
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params['feature_pair_picker']['num_pairs'],
                                              logger=self._logger, identical_bar=self._params['feature_pair_picker']['overlap_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta_matrix = self._get_beta_vec(mx_dict, best_pairs).beta_matrix()
        scores = self._get_graphs_score(beta_matrix).score_list()

        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, scores, self.data_name(),
                                             num_anomalies=self._num_anomalies)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh("", truth=self._ground_truth,
                                            info_text=str(self._params))
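
# A hedged usage sketch: TPGAD drives the whole pipeline from __init__, so it
# is enough to construct it with a params dict, or with a path to a JSON file
# holding the keys accessed above. The path here is hypothetical.
if __name__ == "__main__":
    TPGAD("params/enron_params.json")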
class TimedGraphs:
    def __init__(self,
                 database_name,
                 start_time=10,
                 logger: BaseLogger = None,
                 features_meta=None,
                 directed=False,
                 files_path=None,
                 date_format=None,
                 largest_cc=False):
        self._start_time = start_time
        self._features_meta = NODE_FEATURES if features_meta is None else features_meta
        self._largest_cc = largest_cc
        self._date_format = date_format
        self._directed = directed
        self._database_name = f"{database_name}_directed:{directed}_lcc:{largest_cc}"
        self._path = os.path.join('data', self._database_name)
        if logger:
            self._logger = logger
        else:
            self._logger = PrintLogger("default graphs logger")
        self._files_path = files_path  # location of graphs as files

        # make directories to save features data (as pickles)
        os.makedirs(self._path, exist_ok=True)
        self._logger.debug("graphs initialized")
        self._initiation()

    def get_feature_meta(self):
        return self._multi_graph.get_feature_meta()

    def is_directed(self):
        return self._directed

    def _is_loaded(self):
        return os.path.exists(self._get_pickle_path())

    def _load_graphs(self):
        self._logger.debug("load multi-graph - start")
        self._multi_graph = pickle.load(open(self._get_pickle_path(), "rb"))
        self._logger.debug("pickle loaded")
        self._logger.debug("load multi-graph - end")

    def _dump(self, redump=False):
        if self._is_loaded() and not redump:
            self._logger.debug("multi-graph is already loaded")
            return
        # detach the (possibly unpicklable) logger and key function before
        # dumping, then restore them afterwards
        log = self._multi_graph._logger
        key_func = self._multi_graph._key_func
        self._multi_graph._logger = None
        self._multi_graph._key_func = None
        pickle.dump(self._multi_graph, open(self._get_pickle_path(), "wb"))
        self._multi_graph._logger = log
        self._multi_graph._key_func = key_func
        self._logger.debug("multi-graph dumped")

    def _get_pickle_path(self):
        return os.path.join(self._path, self._database_name + ".pkl")

    def _initiation(self):
        self._logger.debug("build multi-graph - start")
        self._multi_graph = TimedMultiGraphFeatures(
            self._database_name,
            self._logger,
            features_meta=self._features_meta,
            directed=self._directed,
            files_path=self._files_path,
            pkl_dir=self._path,
            date_format=self._date_format)
        for i in range(self._start_time):
            self._multi_graph.forward_time()
        self._multi_graph.build_features(largest_cc=self._largest_cc,
                                         should_zscore=False)

    def forward_time(self):
        flag = self._multi_graph.forward_time()
        self._multi_graph.build_features(largest_cc=self._largest_cc,
                                         should_zscore=False)
        self._logger.debug("build multi-graph - end")
        return flag

    def get_labels(self):
        return self._multi_graph.get_labels()

    # Adapter for multi graph
    def get_subgraph(self, graph_name):
        return self._multi_graph.subgraph_by_name(graph_name)

    def subgraph_by_name(self, graph_name: str):
        return self._multi_graph.subgraph_by_name(graph_name)

    def subgraph_by_index(self, index: int):
        return self._multi_graph.subgraph_by_index(index)

    def combined_graph_by_names(self, names_list=None, combine_all=False):
        return self._multi_graph.combined_graph_by_names(
            names_list, combine_all)

    def combined_graph_by_indexes(self, index_list=None, combine_all=False):
        # delegate to the multi-graph; calling self here would recurse forever
        return self._multi_graph.combined_graph_by_indexes(
            index_list, combine_all)

    def is_graph(self, graph_name):
        return self._multi_graph.is_graph(graph_name)

    def index_to_name(self, index):
        return self._multi_graph.index_to_name(index)

    def name_to_index(self, graph_name):
        return self._multi_graph.name_to_index(graph_name)

    def features_matrix_by_index(self,
                                 graph_start=0,
                                 graph_end=0,
                                 for_all=False):
        return self._multi_graph.features_matrix_by_indexes(
            graph_start, graph_end, for_all)

    def features_matrix_by_name(self,
                                graph_start=0,
                                graph_end=0,
                                for_all=False):
        return self._multi_graph.features_matrix_by_names(
            graph_start, graph_end, for_all)

    def features_matrix(self, graph):
        return self._multi_graph.feature_matrix(graph)

    def nodes_for_graph(self, graph):
        return self._multi_graph.nodes_for_graph(graph)

    def nodes_count_list(self):
        return self._multi_graph.nodes_count_list()

    def edges_for_graph(self, graph):
        # was delegating to nodes_for_graph - a copy-paste bug
        return self._multi_graph.edges_for_graph(graph)

    def edges_count_list(self):
        # was delegating to nodes_count_list - a copy-paste bug
        return self._multi_graph.edges_count_list()

    def subgraphs(self, start_id=None, end_id=None):
        for gid in self._multi_graph._list_id[start_id:end_id]:
            yield self.subgraph_by_name(gid)

    def number_of_graphs(self):
        return self._multi_graph.number_of_graphs()

    def graph_names(self, start_id=None, end_id=None):
        for gid in self._multi_graph._list_id[start_id:end_id]:
            yield gid

    def norm_features(self, norm_function):
        self._multi_graph.norm_features(norm_function)
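
# A minimal usage sketch, assuming graph files live under files_path and that
# forward_time() returns a falsy flag once the input is exhausted (the database
# name and path here are hypothetical).
if __name__ == "__main__":
    tg = TimedGraphs("example_db", start_time=10, files_path="data/example_db")
    while tg.forward_time():  # advance one time step and rebuild features
        for name in tg.graph_names():
            mx = tg.features_matrix(name)  # per-graph feature matrix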