Example 1
def test_main():
    import numpy as np
    from features_infra.graph_features import GraphFeatures
    from loggers import PrintLogger
    import os
    import pickle
    import networkx as nx

    dataset = "citeseer"
    logger = PrintLogger("MetaTest")
    base_dir = r"/home/benami/git/pygcn/data"
    gnx = pickle.load(open(os.path.join(base_dir, dataset, "gnx.pkl"), 'rb'))

    # keep only the largest connected component
    # (nx.connected_component_subgraphs was removed in newer networkx releases,
    #  so collect the component's node set instead)
    largest_cc = max(nx.connected_components(gnx.to_undirected()), key=len)
    gnx = gnx.subgraph(largest_cc)

    # TEST_FEATURES is assumed to be a module-level feature specification
    # expected by GraphFeatures; it is not defined in this snippet
    features = GraphFeatures(gnx,
                             TEST_FEATURES,
                             dir_path="./%s_features_sub" % dataset,
                             logger=logger)
    features.build(should_dump=True)
    measures_mx = features.to_matrix(add_ones=False,
                                     dtype=np.float32,
                                     mtype=np.matrix)
    logger.info("Finished")
class TPGAD:
    def __init__(self, params):
        self._params = params if type(params) is dict else json.load(open(params, "rt"))
        self._logger = PrintLogger("graph-ad")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params['gt']['filename'])
        self._num_anomalies = len(self._ground_truth)*2
        self._idx_to_graph = list(self._temporal_graph.graph_names())
        self._graph_to_idx = {name: idx for idx, name in enumerate(self._idx_to_graph)}
        self._run_ad()

    def _load_ground_truth(self, gt_file):
        df = pd.read_csv(gt_file)
        return {self._temporal_graph.name_to_index(row.anomaly): row.get("score", 1) for i, row in df.iterrows()}

    def data_name(self):
        max_connected = "max_connected_" if self._params['features']['max_connected'] else ""
        directed = "directed" if self._params['dataset']['directed'] else "undirected"
        weighted = "weighted_" if self._params['dataset']['weight_col'] is not None else ""
        return f"{self._params['dataset']['name']}_{weighted}{max_connected}{directed}"

    def _build_temporal_graph(self):
        tg_pkl_dir = os.path.join(self._params['general']['pkl_path'], "temporal_graphs")
        tg_pkl_path = os.path.join(tg_pkl_dir, f"{self.data_name()}_tg.pkl")
        if os.path.exists(tg_pkl_path):
            self._logger.info("loading pkl file - temporal_graphs")
            tg = pickle.load(open(tg_pkl_path, "rb"))
        else:
            tg = TemporalGraph(self.data_name(), self._params['dataset']['filename'], self._params['dataset']['time_format'],
                               self._params['dataset']['time_col'], self._params['dataset']['src_col'],
                               self._params['dataset']['dst_col'],
                               weight_col=self._params['dataset'].get('weight_col', None),
                               weeks=self._params['dataset'].get('week_split', None),
                               days=self._params['dataset'].get('day_split', None),
                               hours=self._params['dataset'].get('hour_split', None),
                               minutes=self._params['dataset'].get('min_split', None),
                               seconds=self._params['dataset'].get('sec_split', None),
                               directed=self._params['dataset']['directed'],
                               logger=self._logger).to_multi_graph()

            tg.suspend_logger()
            if self._params['general']["dump_pkl"]:
                os.makedirs(tg_pkl_dir, exist_ok=True)
                pickle.dump(tg, open(tg_pkl_path, "wb"))
            tg.wake_logger()
        return tg

    def _calc_tg_feature_matrix(self):
        log_ext = "log_" if self._params['features']['log'] else ""
        feature_matrix_dir = os.path.join(self._params['general']['pkl_path'], "gt_feature_matrix")
        mat_pkl = os.path.join(feature_matrix_dir, f"{self.data_name()}_{log_ext}tg_feature_matrices.pkl")

        if os.path.exists(mat_pkl):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl, "rb"))

        gnx_to_vec = {}
        # create dir for database
        database_pkl_dir = os.path.join(self._params['general']['pkl_path'], "features", self.data_name())
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, re.sub('[^a-zA-Z0-9]', '_', gnx_name))
            if self._params['general']["dump_pkl"]:
                os.makedirs(gnx_path, exist_ok=True)

            gnx_ftr = GraphFeatures(gnx, ANOMALY_DETECTION_FEATURES, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params['features']['max_connected'])
            gnx_ftr.build(should_dump=self._params['general']["dump_pkl"],
                          force_build=self._params['general']['FORCE_REBUILD_FEATURES'])  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm if self._params['features']['log'] else None)
        if self._params['general']['dump_pkl']:
            os.makedirs(feature_matrix_dir, exist_ok=True)
            pickle.dump(gnx_to_vec, open(mat_pkl, "wb"))
        return gnx_to_vec

    def _get_beta_vec(self, mx_dict, best_pairs):
        self._logger.debug("calculating beta vectors")

        if self._params['beta_vectors']['type'] == "regression":
            beta = LinearContext(self._temporal_graph, mx_dict, best_pairs,
                                 window_size=self._params['beta_vectors']['window_size'])
        elif self._params['beta_vectors']['type'] == "mean_regression":
            beta = LinearMeanContext(self._temporal_graph, mx_dict, best_pairs,
                                     window_size=self._params['beta_vectors']['window_size'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got {self._params['beta_vectors']['type']}"
                               f" while valid options are: regression/mean_regression ")
        if self._params['general']['dump_pkl']:
            beta_pkl_dir = os.path.join(self._params['general']['pkl_path'], "beta_matrix")
            tg_pkl_path = os.path.join(beta_pkl_dir, f"{self.data_name()}_beta.pkl")
            os.makedirs(beta_pkl_dir, exist_ok=True)
            pickle.dump(beta.beta_matrix(), open(tg_pkl_path, "wb"))
        self._logger.debug("finish calculating beta vectors")

        return beta

    def _get_graphs_score(self, beta_matrix):
        score_type = self._params['score']['type']
        if score_type == "knn":
            return KnnScore(beta_matrix, self._params['score']['params']['knn']['k'], self.data_name(),
                            window_size=self._params['score']['window_size'])
        elif score_type == "gmm":
            return GmmScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                            n_components=self._params['score']['params']['gmm']['n_components'])
        elif score_type == "local_outlier":
            return LocalOutlierFactorScore(beta_matrix, self.data_name(), window_size=self._params['score']['window_size'],
                                           n_neighbors=self._params['score']['params']['local_outlier']['n_neighbors'])
        else:
            raise RuntimeError(f"invalid value for params[beta_vectors][type], got {score_type}"
                               f" while valid options are: knn/gmm/local_outlier")

    def _run_ad(self):
        mx_dict = self._calc_tg_feature_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params['feature_pair_picker']['num_pairs'],
                                              logger=self._logger, identical_bar=self._params['feature_pair_picker']['overlap_bar'])
        best_pairs = pearson_picker.best_pairs()
        beta_matrix = self._get_beta_vec(mx_dict, best_pairs).beta_matrix()
        scores = self._get_graphs_score(beta_matrix).score_list()

        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, scores, self.data_name(),
                                             num_anomalies=self._num_anomalies)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh("", truth=self._ground_truth,
                                            info_text=str(self._params))
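The TPGAD class above is driven entirely by a nested params dict (or a JSON file with the same shape). The sketch below lists the keys the class actually reads; every value is an illustrative placeholder, not a value taken from the source.

# Hypothetical configuration sketch -- keys mirror the lookups made by TPGAD,
# values are placeholders only.
params = {
    "general": {"pkl_path": "./pkl", "dump_pkl": True, "FORCE_REBUILD_FEATURES": False},
    "dataset": {"name": "my_dataset", "filename": "edges.csv", "directed": True,
                "weight_col": None, "time_format": "%Y-%m-%d %H:%M:%S",
                "time_col": "time", "src_col": "src", "dst_col": "dst",
                "day_split": 1},                       # or week/hour/min/sec_split
    "features": {"max_connected": True, "log": True},
    "gt": {"filename": "ground_truth.csv"},            # CSV with an 'anomaly' column (optional 'score')
    "feature_pair_picker": {"num_pairs": 5, "overlap_bar": 0.99},
    "beta_vectors": {"type": "regression", "window_size": 25},   # or "mean_regression"
    "score": {"type": "knn", "window_size": 25,
              "params": {"knn": {"k": 10},
                         "gmm": {"n_components": 3},
                         "local_outlier": {"n_neighbors": 10}}},
}
# TPGAD(params)  # or TPGAD("params.json") -- the constructor runs the full pipeline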
Example 3
class MleEstimator:
    def __init__(self,
                 source_file,
                 num_prefix=120,
                 num_suffix=200,
                 delta=(0.2, 0.5, 0.3)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emission_count, self._transition_count, self._suffix_count = self._get_data(
        )
        self._pos_list = list(
            set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        #self._emission, self._transition, self._prefix, self._suffix = self._calc_probabilities()

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emission = {}
        suffix = {}
        word_counter = 0
        src_file = open(self._source, "rt")  # open file
        for line in src_file:
            t1 = START
            t2 = START
            w_pos = []
            for w_p in line.split():  # break line to [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for i, (word, pos) in enumerate(w_pos):
                word_counter += 1
                # -------- EMISSION ----------
                emission[(word, pos)] = emission.get(
                    (word, pos), 0) + 1  # count (word, POS)++
                # --------- SUFFIX -----------
                if word_counter % 10 == 0:
                    suffix[(word[-SUFF:], pos)] = suffix.get(
                        (word[-SUFF:], pos), 0) + 1
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos,
                                                       0) + 1  # count(POS)
                transition[1][(t1, pos)] = transition[1].get(
                    (t1, pos), 0) + 1  # count(POS_1, POS_2)
                transition[2][(t2, t1, pos)] = transition[2].get(
                    (t2, t1, pos), 0) + 1  # count(POS_0, POS_1, POS_2)
                t2 = t1
                t1 = pos
        self._logger.info("get-data - end")
        return emission, transition, suffix

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([
            word + " " + pos + "\t" + str(count) + "\n"
            for (word, pos), count in self._emission_count.items()
        ])
        out_e.writelines([
            "^" + sufi + " " + pos + "\t" + str(count) + "\n"
            for (sufi, pos), count in self._suffix_count.items()
        ])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([
            pos + "\t" + str(count) + "\n"
            for pos, count in self._transition_count[0].items()
        ])
        out_q.writelines([
            pos1 + " " + pos0 + "\t" + str(count) + "\n"
            for (pos1, pos0), count in self._transition_count[1].items()
        ])
        out_q.writelines([
            pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
            for (pos2, pos1, pos0), count in self._transition_count[2].items()
        ])
        out_q.close()
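A minimal way to drive the class above, assuming the module also defines the START and SUFF constants it references, and that "train.pos" is a hypothetical file of whitespace-separated word/POS tokens (one sentence per line):

if __name__ == "__main__":
    # hypothetical paths -- the source does not show how the class is invoked
    estimator = MleEstimator("train.pos")
    estimator.mle_count_to_txt("e.mle", "q.mle")  # dump emission and transition counts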
Example 4
class AnomalyDetection:
    def __init__(self, params: AdParams):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(self._base_dir.rsplit(os.sep, 1)[0])
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA", params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(self._params.database.GROUND_TRUTH)
        # self._temporal_graph.filter(
        #         lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #         func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {name: idx for idx, name in enumerate(self._idx_to_name)}

        if self._params.vec_type == "motif_ratio":
            self._build_second_method()
        elif self._params.vec_type == "regression":
            self._build_first_method()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {self._temporal_graph.name_to_index(g_id): float(val) for g_id, val in gd.items()}
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected)\
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs", database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temoral_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name, self._data_path, self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL, self._params.database.SRC_COL,
                               self._params.database.DST_COL, weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT, days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT, minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT, directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
        tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected) + "_" + str(
            self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors", database_name + "_matrix_log" +
                                    str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))

        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(norm_func=log_norm) if self._params.log else \
                FeaturesProcessor(gnx_ftr).as_matrix()

        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "vectors", database_name + "_vectors_log_" +
                                    str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))

        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(), self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(gnx, self._params.features, dir_path=gnx_path, logger=self._logger,
                                    is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True, force_build=self._params.FORCE_REBUILD_FEATURES)  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm)\
                if self._params.log else FeaturesProcessor(gnx_ftr).activate_motif_ratio_vec()

        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build_first_method(self):
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(concat_mx, size=self._params.ftr_pairs,
                                              logger=self._logger, identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        beta = LinearContext(self._temporal_graph, mx_dict, best_pairs, window_size=self._params.window_correlation)
        beta_matrix = beta.beta_matrix()
        if self._params.score_type == "knn":
            score = KnnScore(beta_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(beta_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:   # self._params["score_type"] == "local_outlier":
            score = LocalOutlierFactorScore(beta_matrix, self._data_name, window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)
        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())

    def _build_second_method(self):
        self._graph_to_vec = self._calc_vec()
        self._graph_matrix = np.vstack([self._graph_to_vec[name] for name in self._temporal_graph.graph_names()])
        if self._params.log:
            self._graph_matrix = log_norm(self._graph_matrix)

        if self._params.score_type == "knn":
            score = KnnScore(self._graph_matrix, self._params.KNN_k, self._data_name,
                             window_size=self._params.window_score)
        elif self._params.score_type == "gmm":
            score = GmmScore(self._graph_matrix, self._data_name, window_size=self._params.window_score,
                             n_components=self._params.n_components)
        else:   # self._params["score_type"] == "local_outlier":
            score = LocalOutlierFactorScore(self._graph_matrix, self._data_name,
                                            window_size=self._params.window_score,
                                            n_neighbors=self._params.n_neighbors)

        anomaly_picker = SimpleAnomalyPicker(self._temporal_graph, score.score_list(), self._data_name,
                                             num_anomalies=self._params.n_outliers)
        anomaly_picker.build()
        anomaly_picker.plot_anomalies_bokeh(self._params.anomalies_file_name, truth=self._ground_truth,
                                            info_text=self._params.tostring())
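Both this class and TPGAD pass a log_norm callable to FeaturesProcessor.as_matrix as norm_func (and _build_second_method applies it to the stacked matrix directly). Its implementation is not shown here; a plausible stand-in, assuming it is simply an element-wise log with a small offset to avoid log(0), might look like the sketch below (the name and the epsilon are assumptions, not the library's actual code).

import numpy as np

def log_norm_sketch(matrix, eps=1e-10):
    # element-wise log with a small offset so zero entries do not blow up;
    # a sketch only -- the real log_norm used above may differ
    return np.log(np.asarray(matrix, dtype=np.float64) + eps)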
Example 5
class AnomalyDetectionOperationResearch:
    def __init__(self, params: AdParams, name):
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(
            self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA",
                                       params.database.DATABASE_FILE)
        self._params = params
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        self._ground_truth = self._load_ground_truth(
            self._params.database.GROUND_TRUTH)
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {
            name: idx
            for idx, name in enumerate(self._idx_to_name)
        }
        self._out = open(os.path.join("..", name), "wt")
        self._out.write(",".join([
            "FN", "TN", "TP", "FP", "recall", "precision", "specificity", "F1",
            self._params.attr_string()
        ]) + "\n")
        self._build()

    def _load_ground_truth(self, gd):
        if type(gd) is list:
            return {self._temporal_graph.name_to_index(g_id): 1 for g_id in gd}
        elif type(gd) is dict:
            return {
                self._temporal_graph.name_to_index(g_id): float(val)
                for g_id, val in gd.items()
            }
        return None

    def _build_temporal_graph(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(self._params.max_connected)\
                        + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs",
                                    database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temoral_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name,
                               self._data_path,
                               self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL,
                               self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
        tg.wake_logger()
        return tg

    def _calc_matrix(self):
        database_name = self._params.database.DATABASE_NAME + "_" + str(
            self._params.max_connected) + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(
            self._base_dir, "pkl", "vectors",
            database_name + "_matrix_log" + str(self._params.log) + ".pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))

        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_name_path = gnx_name.replace(':', '_')
            gnx_name_path = gnx_name_path.replace('/', '_')
            gnx_path = os.path.join(database_pkl_dir, gnx_name_path)
            if gnx_name_path not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(
                gnx,
                self._params.features,
                dir_path=gnx_path,
                logger=self._logger,
                is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=self._params.FORCE_REBUILD_FEATURES
                          )  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix(
                norm_func=log_norm)

        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(
            self._base_dir, "pkl", "vectors",
            database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))

        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        gnx_to_vec = {}
        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(
                gnx,
                self._params.features,
                dir_path=gnx_path,
                logger=self._logger,
                is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=self._params.FORCE_REBUILD_FEATURES
                          )  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(
                gnx_ftr).activate_motif_ratio_vec(norm_func=log_norm)

        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _build(self):
        for lg in [True, False]:
            self._params.log = lg
            for vec_type in ["mean_regression", "regression"]:  # motif_ratio
                self._params.vec_type = vec_type
                self.features = ANOMALY_DETECTION_FEATURES if self._params.vec_type == "regression" else MOTIF_FEATURES
                if self._params.vec_type == "regression" or self._params.vec_type == "mean_regression":
                    mx_dict = self._calc_matrix()
                    concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
                    for ftr_pairs in [
                            3, 4, 5
                    ]:  # [1, 2, 3, 4, 5, 10] [5, 10, 15, 20, 25, 30, 40, 45, 50]: # [1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 70, 90, 110, 130, 150, 170, 200]: #  [5, 10, 15, 20, 25, 30, 40, 45, 50]
                        self._params.ftr_pairs = ftr_pairs
                        for identical in [
                                0.99
                        ]:  #  [0.7, 0.8, 0.9, 0.95, 0.99] [0.7, 0.8, 0.9, 0.95, 0.99]
                            self._params.identical_bar = identical

                            pearson_picker = PearsonFeaturePicker(
                                concat_mx,
                                size=self._params.ftr_pairs,
                                logger=self._logger,
                                identical_bar=self._params.identical_bar)
                            for win in list(
                                    range(
                                        25,
                                        min(
                                            100,
                                            self._temporal_graph.
                                            number_of_graphs()), 25)):
                                self._params.window_correlation = win
                                best_pairs = pearson_picker.best_pairs()
                                if best_pairs is None:
                                    continue
                                if self._params.vec_type == "regression":
                                    beta = LinearContext(
                                        self._temporal_graph,
                                        mx_dict,
                                        best_pairs,
                                        window_size=self._params.
                                        window_correlation)
                                else:
                                    beta = LinearMeanContext(
                                        self._temporal_graph,
                                        mx_dict,
                                        best_pairs,
                                        window_size=self._params.
                                        window_correlation)
                                beta_matrix = beta.beta_matrix()
                                self._pick_anomalies(beta_matrix)

                elif self._params.vec_type == "motif_ratio":
                    self._graph_to_vec = self._calc_vec()
                    beta_matrix = np.vstack([
                        self._graph_to_vec[name]
                        for name in self._temporal_graph.graph_names()
                    ])
                    self._pick_anomalies(beta_matrix)

    def _pick_anomalies(self, beta_matrix):
        for score_type in ["knn", "gmm", "local_outlier"]:
            self._params.score_type = score_type
            if self._params.score_type == "knn":
                for win in list(
                        range(
                            25,
                            min(100, self._temporal_graph.number_of_graphs()),
                            25)):
                    self._params.window_score = win
                    for k in list(range(5, min(win, 50) - 1, 5)):
                        self._params.KNN_k = k
                        score = KnnScore(beta_matrix,
                                         self._params.KNN_k,
                                         self._data_name,
                                         window_size=self._params.window_score)
                        anomaly_picker = SimpleAnomalyPicker(
                            self._temporal_graph,
                            score.score_list(),
                            self._data_name,
                            num_anomalies=self._params.n_outliers)
                        truth = [
                            self._temporal_graph.name_to_index(g_id)
                            for g_id in self._params.database.GROUND_TRUTH
                        ] if self._params.database.GROUND_TRUTH else None
                        FN, TN, TP, FP, recall, precision, specificity, F1 = anomaly_picker.build(
                            truth=truth)
                        self._out.write(",".join([
                            str(FN),
                            str(TN),
                            str(TP),
                            str(FP),
                            str(recall),
                            str(precision),
                            str(specificity),
                            str(F1),
                            self._params.attr_val_string()
                        ]) + "\n")

            elif self._params.score_type == "gmm":
                for win in list(
                        range(
                            25,
                            min(100, self._temporal_graph.number_of_graphs()),
                            25)):
                    self._params.window_score = win
                    for comp in [1, 2, 3, 4, 5]:
                        self._params.n_components = comp
                        score = GmmScore(
                            beta_matrix,
                            self._data_name,
                            window_size=self._params.window_score,
                            n_components=self._params.n_components)
                        anomaly_picker = SimpleAnomalyPicker(
                            self._temporal_graph,
                            score.score_list(),
                            self._data_name,
                            num_anomalies=self._params.n_outliers)
                        truth = [
                            self._temporal_graph.name_to_index(g_id)
                            for g_id in self._params.database.GROUND_TRUTH
                        ] if self._params.database.GROUND_TRUTH else None
                        FN, TN, TP, FP, recall, precision, specificity, F1 = anomaly_picker.build(
                            truth=truth)
                    self._out.write(",".join([
                        str(FN),
                        str(TN),
                        str(TP),
                        str(FP),
                        str(recall),
                        str(precision),
                        str(specificity),
                        str(F1),
                        self._params.attr_val_string()
                    ]) + "\n")

            elif self._params.score_type == "local_outlier":
                for win in list(
                        range(
                            25,
                            min(100, self._temporal_graph.number_of_graphs()),
                            25)):
                    self._params.window_score = win
                    for neighbors in list(range(5, min(win, 50), 5)):
                        self._params.n_neighbors = neighbors
                        score = LocalOutlierFactorScore(
                            beta_matrix,
                            self._data_name,
                            window_size=self._params.window_score,
                            n_neighbors=self._params.n_neighbors)
                        anomaly_picker = SimpleAnomalyPicker(
                            self._temporal_graph,
                            score.score_list(),
                            self._data_name,
                            num_anomalies=self._params.n_outliers)
                        truth = [
                            self._temporal_graph.name_to_index(g_id)
                            for g_id in self._params.database.GROUND_TRUTH
                        ] if self._params.database.GROUND_TRUTH else None
                        FN, TN, TP, FP, recall, precision, specificity, F1 = anomaly_picker.build(
                            truth=truth)
                        self._out.write(",".join([
                            str(FN),
                            str(TN),
                            str(TP),
                            str(FP),
                            str(recall),
                            str(precision),
                            str(specificity),
                            str(F1),
                            self._params.attr_val_string()
                        ]) + "\n")
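The grid search above logs FN, TN, TP, FP and the derived rates returned by anomaly_picker.build(truth=...). Assuming these follow the standard confusion-matrix definitions (the picker's own implementation is not shown here), the derived columns can be reproduced from the raw counts as follows:

def confusion_rates(tp, tn, fp, fn):
    # standard definitions; shown only to document what the logged columns mean
    recall = tp / (tp + fn) if (tp + fn) else 0.0          # true positive rate
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else 0.0     # true negative rate
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return recall, precision, specificity, f1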
Example 6
class DatasetStat:
    def __init__(self, params: AdParams):
        self._index_ftr = None
        self._base_dir = __file__.replace("/", os.sep)
        self._base_dir = os.path.join(
            self._base_dir.rsplit(os.sep, 1)[0], "..")
        self._data_path = os.path.join(self._base_dir, "INPUT_DATA",
                                       params.database.DATABASE_FILE)
        self._params = params
        self._ground_truth = params.database.GROUND_TRUTH
        self._data_name = params.database.DATABASE_NAME
        self._logger = PrintLogger("Anomaly logger")
        self._temporal_graph = self._build_temporal_graph()
        # self._temporal_graph.filter(
        #         lambda x: False if self._temporal_graph.node_count(x) < 20 else True,
        #         func_input="graph_name")
        self._idx_to_name = list(self._temporal_graph.graph_names())
        self._name_to_idx = {
            name: idx
            for idx, name in enumerate(self._idx_to_name)
        }
        self._graph_to_vec = self._calc_vec()

    def _build_temporal_graph(self):
        database_name = self._data_name + "_" + str(
            self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(self._base_dir, "pkl", "temporal_graphs",
                                    database_name + "_tg.pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - temoral_graphs")
            tg = pickle.load(open(vec_pkl_path, "rb"))
        else:
            tg = TemporalGraph(database_name,
                               self._data_path,
                               self._params.database.DATE_FORMAT,
                               self._params.database.TIME_COL,
                               self._params.database.SRC_COL,
                               self._params.database.DST_COL,
                               weight_col=self._params.database.WEIGHT_COL,
                               weeks=self._params.database.WEEK_SPLIT,
                               days=self._params.database.DAY_SPLIT,
                               hours=self._params.database.HOUR_SPLIT,
                               minutes=self._params.database.MIN_SPLIT,
                               seconds=self._params.database.SEC_SPLIT,
                               directed=self._params.directed,
                               logger=self._logger).to_multi_graph()
            tg.suspend_logger()
            pickle.dump(tg, open(vec_pkl_path, "wb"))
        tg.wake_logger()
        return tg

    def _calc_vec(self):
        database_name = self._params.database.DATABASE_NAME + "_" + \
                        str(self._params.max_connected) + "_" + str(self._params.directed)
        vec_pkl_path = os.path.join(
            self._base_dir, "pkl", "vectors",
            database_name + "_vectors_log_" + str(self._params.log) + ".pkl")
        if os.path.exists(vec_pkl_path):
            self._logger.info("loading pkl file - graph_vectors")
            return pickle.load(open(vec_pkl_path, "rb"))

        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(
                gnx,
                self._params.features,
                dir_path=gnx_path,
                logger=self._logger,
                is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=self._params.FORCE_REBUILD_FEATURES
                          )  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(
                gnx_ftr).activate_motif_ratio_vec()

        pickle.dump(gnx_to_vec, open(vec_pkl_path, "wb"))
        return gnx_to_vec

    def _calc_matrix(self):
        database_name = self._data_name + "_" + str(
            self._params.max_connected) + "_" + str(self._params.directed)
        mat_pkl_path = os.path.join(self._base_dir, "pkl", "vectors",
                                    database_name + "_matrix.pkl")
        if os.path.exists(mat_pkl_path):
            self._logger.info("loading pkl file - graph_matrix")
            return pickle.load(open(mat_pkl_path, "rb"))

        gnx_to_vec = {}
        # create dir for database
        pkl_dir = os.path.join(self._base_dir, "pkl", "features")
        database_pkl_dir = os.path.join(pkl_dir, database_name)
        if database_name not in os.listdir(pkl_dir):
            os.mkdir(database_pkl_dir)

        for gnx_name, gnx in zip(self._temporal_graph.graph_names(),
                                 self._temporal_graph.graphs()):
            # create dir for specific graph features
            gnx_path = os.path.join(database_pkl_dir, gnx_name)
            if gnx_name not in os.listdir(database_pkl_dir):
                os.mkdir(gnx_path)

            gnx_ftr = GraphFeatures(
                gnx,
                self._params.features,
                dir_path=gnx_path,
                logger=self._logger,
                is_max_connected=self._params.max_connected)
            gnx_ftr.build(should_dump=True,
                          force_build=self._params.FORCE_REBUILD_FEATURES
                          )  # build features
            # calc motif ratio vector
            gnx_to_vec[gnx_name] = FeaturesProcessor(gnx_ftr).as_matrix()

        pickle.dump(gnx_to_vec, open(mat_pkl_path, "wb"))
        return gnx_to_vec

    # map matrix rows to features + count when there is more than one entry per feature
    def _set_index_to_ftr(self):
        gnx_name = self._temporal_graph.graph_names().__next__()
        gnx = self._temporal_graph.graphs().__next__()
        database_name = self._data_name + "_" + str(
            self._params.max_connected) + "_" + str(self._params.directed)
        gnx_path = os.path.join(self._base_dir, "pkl", "features",
                                database_name, gnx_name)
        gnx_ftr = GraphFeatures(gnx,
                                self._params.features,
                                dir_path=gnx_path,
                                logger=self._logger,
                                is_max_connected=self._params.max_connected)
        gnx_ftr.build(
            should_dump=False,
            force_build=self._params.FORCE_REBUILD_FEATURES)  # build features

        if not self._index_ftr:
            sorted_ftr = [
                f for f in sorted(gnx_ftr) if gnx_ftr[f].is_relevant()
            ]  # fix feature order (names)
            self._index_ftr = []

            for ftr in sorted_ftr:
                len_ftr = len(gnx_ftr[ftr])
                # fill list with (ftr, counter)
                self._index_ftr += self._get_motif_type(ftr, len_ftr) if ftr == 'motif3' or ftr == 'motif4' else \
                    [(ftr, i) for i in range(len_ftr)]
        return self._index_ftr

    # return [ ... (motif_type, counter) ... ]
    def _get_motif_type(self, motif_type, num_motifs):
        header = []
        for i in range(num_motifs):
            header.append((motif_type, i))
        return header

    def plot_nodes_by_time(self):
        # collect data for plot
        nodes_count_by_time = self._temporal_graph.node_count(
        )  # num of nodes per time
        edges_count_by_time = self._temporal_graph.edge_count(
        )  # num of edges per time

        len_mg = self._temporal_graph.number_of_graphs(
        )  # num of graphs (times)
        x_axis = list(range(len_mg))  # [0... num of times]

        p = figure(plot_width=600,
                   plot_height=250,
                   title=self._data_name + ", node & edge count",
                   x_axis_label="time",
                   y_axis_label="nodes_count")  # create figure

        p.line(x_axis, nodes_count_by_time, legend="nodes",
               line_color="blue")  # plot nodes
        p.line(x_axis, edges_count_by_time, legend="edges",
               line_color="green")  # plot edges

        # plot vertical lines for ground truth
        anomalies = [
            self._name_to_idx[anomaly] for anomaly in self._ground_truth
        ]
        y = [edges_count_by_time[time] for time in anomalies]
        p.scatter(anomalies,
                  y,
                  legend="anomalies",
                  line_color="red",
                  fill_color="red")  # plot nodes
        p.xaxis.major_label_overrides = {
            i: graph_name
            for i, graph_name in enumerate(self._temporal_graph.graph_names())
        }  # time to graph_name dict
        p.legend.location = "top_left"
        show(p)

    def plot_timed_mean_std(self):
        NUM_PLOT_FTR = 20
        mat_dict = self._calc_matrix()
        ftrs = self._set_index_to_ftr()
        ftrs = [str(x) for x in ftrs]

        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        # sort by highest mean
        global_mean = {
            i: m
            for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])
        }
        sorted_mean = [
            i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])
        ][0:NUM_PLOT_FTR]

        # ----------------------- mean -------------------------
        heat_mx = []
        mean_curves = [[] for i in range(NUM_PLOT_FTR)]
        std_curves = [[] for i in range(NUM_PLOT_FTR)]
        for name, mx in mat_dict.items():
            mx_mean = np.mean(mx, 0).tolist()[0]
            mx_std = np.std(mx, 0).tolist()[0]
            for i, idx in enumerate(sorted_mean):
                mean_curves[i].append(mx_mean[idx])
                std_curves[i].append(mx_std[idx])

        x_axis = list(range(
            self._temporal_graph.number_of_graphs()))  # [0... num of times]
        for i in range(1):  # debug leftover: iterates once; use range(len(std_curves)) to plot every feature
            i = 16  # hard-coded feature index (debug leftover)
            p = figure(plot_width=600,
                       plot_height=250,
                       title=self._data_name + " std/mean for " +
                       ftrs[sorted_mean[i]],
                       x_axis_label="time",
                       y_axis_label="nodes_count")  # create figure

            p.line(x_axis, mean_curves[i], legend="mean",
                   line_color="blue")  # plot nodes
            p.line(x_axis, std_curves[i], legend="std",
                   line_color="green")  # plot edges

            # plot vertical lines for ground truth
            anomalies = [
                self._name_to_idx[anomaly] for anomaly in self._ground_truth
            ]
            y = [std_curves[i][time] for time in anomalies]
            p.scatter(anomalies,
                      y,
                      legend="anomalies",
                      line_color="red",
                      fill_color="red")  # plot nodes
            p.xaxis.major_label_overrides = {
                i: graph_name
                for i, graph_name in enumerate(
                    self._temporal_graph.graph_names())
            }  # time to graph_name dict
            p.legend.location = "top_left"
            show(p)
            e = 0

    def plot_mean_std_sheatmap(self):
        ftrs = self._set_index_to_ftr()
        ftrs = [str(x) for x in ftrs]
        mat_dict = self._calc_matrix()
        # sort by highest std
        all_mx = np.vstack([mx for name, mx in mat_dict.items()])
        global_std = {
            i: m
            for i, m in enumerate(np.std(all_mx, 0).tolist()[0])
        }
        sorted_std = [
            i for i, m in sorted(global_std.items(), key=lambda x: -x[1])
        ][0:30]

        # sort by highest mean
        global_mean = {
            i: m
            for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])
        }
        sorted_mean = [
            i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])
        ][0:30]

        # global_max
        global_sum = {
            i: m
            for i, m in enumerate(np.max(all_mx, 0).tolist()[0])
        }

        anomalies = [
            self._name_to_idx[anomaly] for anomaly in self._ground_truth
        ]

        # ----------------------- mean -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_mean = {
                i: m
                for i, m in enumerate(np.mean(mx, 0).tolist()[0])
            }
            heat_day_mean = [
                heat_day_mean[i] / global_sum[i] for i in sorted_mean
            ]
            heat_mx.append(heat_day_mean)
        plt.subplots(figsize=(20, 15))
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.0005, vmax=0.005)
        plt.xticks(list(range(30)), [ftrs[i] for i in sorted_mean], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("mean_heatmap")
        e = 0

        plt.clf()

        # ----------------------- std -------------------------
        heat_mx = []
        for name, mx in mat_dict.items():
            heat_day_std = {
                i: m
                for i, m in enumerate(np.std(mx, 0).tolist()[0])
            }
            heat_day_std = [
                heat_day_std[i] / global_sum[i] for i in sorted_std
            ]
            heat_mx.append(heat_day_std)
        heat_mx = np.vstack(heat_mx)
        ax = sns.heatmap(heat_mx, vmin=0.005, vmax=0.05)
        plt.xticks(list(range(30)), [ftrs[i] for i in sorted_std], rotation='vertical')
        for i in anomalies:
            ax.axhline(y=i, color='red', linewidth=0.4)
        plt.savefig("std_heatmap")
        e = 0

    def plot_features_mean_std(self):  # matrix: np.matrix):
        ftrs = self._set_index_to_ftr()
        ftrs = [str(x) for x in ftrs]

        #  -------------------- prepare matrix anomalies and rest of data
        all_list = []
        anomal_list = []
        for name, mx in self._calc_matrix().items():
            if name in self._ground_truth:
                anomal_list.append(mx)
            else:
                all_list.append(mx)

        all_mx = np.vstack(all_list)
        anomal_mx = np.vstack(anomal_list)

        global_mean = {
            i: m
            for i, m in enumerate(np.mean(all_mx, 0).tolist()[0])
        }
        global_max = {
            i: m
            for i, m in enumerate(np.std(all_mx, 0).tolist()[0])
        }
        sorted_keys = [
            i for i, m in sorted(global_mean.items(), key=lambda x: -x[1])
        ]

        groups = []
        prev_val = global_max[sorted_keys[0]]
        sub_group = []
        size_ = 0
        for i in sorted_keys:
            if 100 * prev_val >= global_max[i] >= 0.1 * prev_val and size_ < 6:
                sub_group.append(i)
                size_ += 1
            else:
                prev_val = global_mean[i]
                groups.append(sub_group)
                sub_group = [i]
                size_ = 1

        for group_num in range(1):  # debug leftover: iterates once; use range(len(groups)) to plot every group
            group_num = 2  # hard-coded group index (debug leftover)
            curr_ftr = []
            for i in groups[group_num]:
                curr_ftr.append(ftrs[i])
                curr_ftr.append("A_" + ftrs[i])
            mid = []
            bottom = []
            top = []
            for i in groups[group_num]:
                bottom.append(
                    np.percentile(all_mx[:, i], 25, axis=0).tolist()[0])
                bottom.append(
                    np.percentile(anomal_mx[:, i], 25, axis=0).tolist()[0])
                mid.append(np.percentile(all_mx[:, i], 50, axis=0).tolist()[0])
                mid.append(
                    np.percentile(anomal_mx[:, i], 50, axis=0).tolist()[0])
                top.append(np.percentile(all_mx[:, i], 75, axis=0).tolist()[0])
                top.append(
                    np.percentile(anomal_mx[:, i], 75, axis=0).tolist()[0])

            bottom = np.array(bottom)
            mid = np.array(mid)
            top = np.array(top)
            # find the quartiles and IQR for each category
            iqr = top - bottom
            upper = top + 1.5 * iqr
            lower = bottom - 1.5 * iqr

            p = figure(tools="",
                       background_fill_color="#efefef",
                       x_range=curr_ftr,
                       toolbar_location=None,
                       plot_width=600,
                       plot_height=600,
                       title=self._data_name + "_percentile=(25-50-75)")

            colors = ["black", "red"] * int(mid.shape[0] / 2)
            # stems
            p.segment(curr_ftr, upper, curr_ftr, top, line_color=colors)
            p.segment(curr_ftr, lower, curr_ftr, bottom, line_color=colors)

            # boxes
            p.vbar(curr_ftr,
                   0.7,
                   mid,
                   top,
                   fill_color="#E08E79",
                   line_color=colors)
            p.vbar(curr_ftr,
                   0.7,
                   bottom,
                   mid,
                   fill_color="#3B8686",
                   line_color=colors)

            # whiskers (almost-0 height rects simpler than segments)
            p.rect(curr_ftr, lower, 0.2, 0.0000001, line_color=colors)
            p.rect(curr_ftr, upper, 0.2, 0.0000001, line_color=colors)

            p.xaxis.major_label_orientation = np.pi / 2
            p.xgrid.grid_line_color = None
            p.ygrid.grid_line_color = "white"
            p.grid.grid_line_width = 2
            p.xaxis.major_label_text_font_size = "12pt"
            show(p)
            # plot = Plot(output_backend="svg")
            # plot.output_backend(p, filename=str(group_num) + "_svg")

    def plot_correlations(self):
        from sklearn import linear_model
        mx_dict = self._calc_matrix()
        concat_mx = np.vstack([mx for name, mx in mx_dict.items()])
        pearson_picker = PearsonFeaturePicker(
            concat_mx,
            size=self._params.ftr_pairs,
            logger=self._logger,
            identical_bar=self._params.identical_bar)
        best_pairs = pearson_picker.best_pairs()
        for i, j, u in best_pairs:
            # fit feature j as a linear function of feature i
            reg = linear_model.LinearRegression().fit(concat_mx[:, i], concat_mx[:, j])
            m = reg.coef_
            b = reg.intercept_

            ftr_i = concat_mx[:, i].T.tolist()[0]
            ftr_j = concat_mx[:, j].T.tolist()[0]

            p = figure(plot_width=600,
                       plot_height=250,
                       title=self._data_name + " regression " + str((i, j)),
                       x_axis_label="time",
                       y_axis_label="nodes_count")  # create figure

            x_line = list(range(int(max(ftr_i)) + 1))
            p.line(x_line, [m * x + b for x in x_line],
                   line_color="blue")  # plot fitted regression line

            p.scatter(list(ftr_i), list(ftr_j))  # plot nodes
            p.xaxis.major_label_overrides = {
                i: graph_name
                for i, graph_name in enumerate(
                    self._temporal_graph.graph_names())
            }  # time to graph_name dict
            p.legend.location = "top_left"
            show(p)

        e = 0
class MleEstimator:
    def __init__(self,
                 source_file,
                 num_prefix=120,
                 num_suffix=200,
                 delta=(0.2, 0.5, 0.3),
                 gamma=(0.6, 0.4)):
        self._logger = PrintLogger("NLP-ass1")
        self._delta = delta
        self._gamma = gamma
        self._source = source_file
        self._num_prefix = num_prefix
        self._num_suffix = num_suffix
        # counters
        self._emmision_count, self._transition_count, self._prefix_count, self._suffix_count = self._get_data(
        )
        self._pos_list = list(
            set(list(self._transition_count[0].keys()) + [START]))
        self._num_pos = len(self._pos_list)
        self._pos_idx = {pos: i for i, pos in enumerate(self._pos_list)}
        # probabilities
        self._emmision, self._transition, self._prefix, self._suffix = self._calc_probabilities(
        )

    def _get_data(self):
        self._logger.info("get-data - start")
        transition = {0: {}, 1: {}, 2: {}}
        emmision = {}
        prefix = {}
        suffix = {}
        src_file = open(self._source, "rt")  # open training file
        for line in src_file:
            # ---------- BREAK -----------
            t1 = START  # the tag history restarts at every sentence
            t2 = START
            w_pos = []
            for w_p in line.split():  # break line into [.. (word, POS) ..]
                word, pos = w_p.rsplit("/", 1)
                w_pos.append((word, pos))
            for word, pos in w_pos:
                # -------- EMISSION ----------
                emmision[(word, pos)] = emmision.get(
                    (word, pos), 0) + 1  # count(word, POS)
                # --------- PREFIX -----------
                prefix[(word[:PREF], pos)] = prefix.get(
                    (word[:PREF], pos), 0) + 1  # count (prefix, POS)
                # --------- SUFFIX -----------
                suffix[(word[-SUFF:], pos)] = suffix.get(
                    (word[-SUFF:], pos), 0) + 1  # count (suffix, POS)
                # ------- TRANSITION ---------
                transition[0][pos] = transition[0].get(pos, 0) + 1  # count(POS)
                transition[1][(t1, pos)] = transition[1].get(
                    (t1, pos), 0) + 1  # count(POS_1, POS_0)
                transition[2][(t2, t1, pos)] = transition[2].get(
                    (t2, t1, pos), 0) + 1  # count(POS_2, POS_1, POS_0)
                t2 = t1
                t1 = pos
        src_file.close()
        # keep only the num_prefix / num_suffix most common (affix, POS) pairs
        prefix = {
            key: count
            for i, (key, count) in enumerate(
                sorted(prefix.items(), key=lambda x: -x[1]))
            if i < self._num_prefix
        }
        suffix = {
            key: count
            for i, (key, count) in enumerate(
                sorted(suffix.items(), key=lambda x: -x[1]))
            if i < self._num_suffix
        }
        self._logger.info("get-data - end")
        return emmision, transition, prefix, suffix

    @staticmethod
    def _my_log(x):
        # log with a floor: -100 stands in for log(0), and a probability of
        # exactly 1 returns a small negative value instead of 0
        if x == 0:
            return -100
        if x == 1:
            return -0.001
        return np.log(x)

    def _calc_probabilities(self):
        self._logger.info("calc-probabilities - start")
        transition_prob = {}

        # -------- EMISSION ----------
        # e(word| pos)
        emmision_prob = {
            (word, pos):
            ((1 - CUT) * w_p_count / self._transition_count[0][pos]) + CUT
            for (word, pos), w_p_count in self._emmision_count.items()
        }
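        # reading of the smoothing above: each relative frequency
        # count(word, pos) / count(pos) is interpolated with the constant CUT,
        # i.e. p = (1 - CUT) * MLE + CUT, so stored probabilities stay bounded
        # away from zero (for CUT > 0)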

        # ----- PREFIX / SUFFIX ------
        # for a word [w_1, w_2, ..., w_n-1, w_n]:
        #   prefix: e(first PREF characters | pos)
        #   suffix: e(last SUFF characters | pos)
        prefix_bi_prob = {
            (pre, pos):
            ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
            for (pre, pos), s_p_count in self._prefix_count.items()
        }
        suffix_bi_prob = {
            (sufi, pos):
            ((1 - CUT) * s_p_count / self._transition_count[0][pos]) + CUT
            for (sufi, pos), s_p_count in self._suffix_count.items()
        }

        # ------- TRANSITION ---------
        sum_words = np.sum(list(self._transition_count[0].values()))
        # sequence = [pos2, pos1, pos0]
        # q(pos0)
        transition_prob[0] = {
            pos: ((1 - CUT) * pos_count / sum_words) + CUT
            for pos, pos_count in self._transition_count[0].items()
        }
        # q(pos0| pos1)
        transition_prob[1] = {
            (pos1, pos0):
            ((1 - CUT) * count / self._transition_count[0][pos1]) + CUT
            for (pos1, pos0), count in self._transition_count[1].items()
            if pos1 in self._transition_count[0]
        }
        # q(pos0| pos2, pos1)
        transition_prob[2] = {
            (pos2, pos1, pos0):
            ((1 - CUT) * count / self._transition_count[1][(pos2, pos1)]) + CUT
            for (pos2, pos1, pos0), count in self._transition_count[2].items()
            if (pos2, pos1) in self._transition_count[1]
        }
        self._logger.info("calc-probabilities - end")
        return emmision_prob, transition_prob, prefix_bi_prob, suffix_bi_prob

    def emmision(self, word_pos: tuple, log=False):
        # break
        word, pos = word_pos
        # if there is a value for e(word| pos)
        if (word, pos) in self._emmision:
            return self._my_log(
                self._emmision[word_pos]) if log else self._emmision[word_pos]
        # if not then check if there is a value e(w_1, w_2| pos)
        pref = word[:PREF]
        if (pref, pos) in self._prefix:
            return self._my_log(
                self._prefix[(pref, pos)]) if log else self._prefix[(pref,
                                                                     pos)]
        # if not then check if there is a value e(w_n-1, w_n| pos)
        suf = word[-SUFF:]
        if (suf, pos) in self._suffix:
            return self._my_log(
                self._suffix[(suf, pos)]) if log else self._suffix[(suf, pos)]
        return self._my_log(0) if log else 0

    def transition(self, pos_sequence: tuple, log=False):
        # break sequence
        pos0 = pos_sequence[-1]
        pos1 = pos_sequence[-2]
        pos2 = pos_sequence[-3] if len(pos_sequence) > 2 else None
        # linear interpolation, matching the code below:
        #   delta[2]*q(pos0| pos2, pos1) + delta[1]*q(pos0| pos1) + delta[0]*q(pos0)
        tran_0 = self._delta[0] * self._transition[0].get(pos0, 0)
        tran_1 = self._delta[1] * self._transition[1].get((pos1, pos0), 0)
        tran_2 = self._delta[2] * self._transition[2].get(
            (pos2, pos1, pos0), 0) if pos2 else 0
        return self._my_log(tran_2 + tran_1 +
                            tran_0) if log else tran_2 + tran_1 + tran_0
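    # Worked example of the interpolation above (a sketch, using the default
    # delta=(0.2, 0.5, 0.3) and illustrative tags): for a trigram never seen in
    # training, transition(("DT", "JJ", "NN")) still backs off to
    # 0.5 * q("NN"| "JJ") + 0.2 * q("NN"), so the score stays non-zero.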

    def mle_count_to_txt(self, e_mle_path, q_mle_path):
        self._logger.info("writing e_mle...")
        out_e = open(e_mle_path, "wt")
        out_e.writelines([
            word + " " + pos + "\t" + str(count) + "\n"
            for (word, pos), count in self._emmision_count.items()
        ])
        out_e.writelines([
            "^" + pref + " " + pos + "\t" + str(count) + "\n"
            for (pref, pos), count in self._prefix_count.items()
        ])
        out_e.writelines([
            "^" + sufi + " " + pos + "\t" + str(count) + "\n"
            for (sufi, pos), count in self._suffix_count.items()
        ])
        out_e.close()
        self._logger.info("writing q_mle...")
        out_q = open(q_mle_path, "wt")
        out_q.writelines([
            pos + "\t" + str(count) + "\n"
            for pos, count in self._transition_count[0].items()
        ])
        out_q.writelines([
            pos1 + " " + pos0 + "\t" + str(count) + "\n"
            for (pos1, pos0), count in self._transition_count[1].items()
        ])
        out_q.writelines([
            pos2 + " " + pos1 + " " + pos0 + "\t" + str(count) + "\n"
            for (pos2, pos1, pos0), count in self._transition_count[2].items()
        ])
        out_q.close()

    def pred_viterbi(self, sequence, log=False):
        self._logger.info("Viterbi - START...")
        self._logger.info("Viterbi - INITIALIZATION...")
        # ------------ INITIALIZATION --------------
        len_seq = len(sequence) + 1
        base_score = self._my_log(0) if log else 0
        v_mx = [[[(base_score, (-1, self._pos_idx[START],
                                self._pos_idx[START]))
                  for _ in range(self._num_pos)] for _ in range(self._num_pos)]
                for _ in range(len_seq)]
        bp = (-1, self._pos_idx[START], self._pos_idx[START])
        base_score = self._my_log(1) if log else 1
        v_mx[0][self._pos_idx[START]][self._pos_idx[START]] = (base_score, bp)
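        # v_mx[t][j][k] holds the best score over tag sequences for words 1..t
        # whose tag at position t-1 is self._pos_list[j] and tag at position t
        # is self._pos_list[k], together with a back-pointer
        # (t-1, index of the tag at t-2, index of the tag at t-1) for the
        # backwards pass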

        self._logger.info("Viterbi - FORWARD...")
        # ------- RECURSIVE STEP / FORWARD ---------
        print("Viterbi - forward: " + str(sequence) + "\nProgress:          ",
              end="")
        for i in range(1, len_seq):
            print("." * (len(sequence[i - 1]) + 3) + "|", end="")
            for j, pos2 in enumerate(self._pos_list):
                for k, pos1 in enumerate(self._pos_list):
                    score, bp = self._max_and_bp(v_mx,
                                                 i,
                                                 sequence[i - 1],
                                                 j,
                                                 pos2,
                                                 pos1,
                                                 log=log)
                    bp = (i - 1, bp, j)
                    v_mx[i][j][k] = (score, bp)
        print(" -- forward completed --")
        self._logger.info("Viterbi - BACKWARDS...")
        # ------- REPRODUCTION / BACKWARDS ---------
        # find max and arg max at v_max[last_layer]
        max_val = self._my_log(0) if log else 0
        max_i = 0
        max_j = 0
        for i in range(self._num_pos):
            for j in range(self._num_pos):
                if v_mx[len_seq - 1][i][j][0] > max_val:
                    max_val = v_mx[len_seq - 1][i][j][0]
                    max_i = i
                    max_j = j
        # reconstruct Part Of Speech
        prediction = [self._pos_list[max_i], self._pos_list[max_j]]
        ps = v_mx[len_seq - 1][max_i][max_j][1]
        for word_idx in range(len_seq - 1, 0, -1):
            curr_pos = self._pos_list[ps[1]]
            if curr_pos == START:
                break
            prediction = [curr_pos] + prediction
            ps = v_mx[ps[0]][ps[1]][ps[2]][1]
        return prediction

    def _max_and_bp(self,
                    v_mx,
                    word_idx,
                    word,
                    pos2_idx,
                    pos2,
                    pos1,
                    log=False):
        # given a word w_n and pos2, pos1
        # we want to maximize w_n is pos1 coming after a pos2 word
        # scores = V(w_n-1, pos_i, pos2) * q(pos1| pos_i, pos2) * e(w_n| pos1)  i = 0..num_pos
        if log:
            scores = [
                v_mx[word_idx - 1][i][pos2_idx][0] +
                (self._gamma[1] * self.transition(
                    (self._pos_list[i], pos2, pos1), log=log) +
                 self._gamma[0] * self.emmision((word, pos1), log=log))
                for i in range(self._num_pos)
            ]
        else:
            scores = [
                v_mx[word_idx - 1][i][pos2_idx][0] * self.transition(
                    (self._pos_list[i], pos2, pos1), log=log) * self.emmision(
                        (word, pos1), log=log) for i in range(self._num_pos)
            ]
        max_score = np.max(scores)
        argmax_score = np.argmax(scores)
        return max_score, argmax_score
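
# A minimal usage sketch (not part of the original example): it assumes the
# module-level constants START, PREF, SUFF and CUT are defined as elsewhere in
# this file, and the training-corpus path below is hypothetical. The corpus is
# expected to contain one sentence per line, with tokens written as word/POS.
if __name__ == "__main__":
    estimator = MleEstimator("data/pos_train.txt",  # hypothetical path
                             num_prefix=120,
                             num_suffix=200)
    # dump the raw counts in the e.mle / q.mle text format
    estimator.mle_count_to_txt("e.mle", "q.mle")
    # tag one sentence with the trigram Viterbi decoder (log-space scoring)
    print(estimator.pred_viterbi("the dog barks".split(), log=True))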