コード例 #1
0
ファイル: abs_model.py プロジェクト: Minzc/subspace_outlier
    def __init__(self,
                 name,
                 aggregate_method,
                 base_model,
                 neighbor,
                 norm_method="DEFAULT"):
        """Store the model name and set up normalization and the base detector.

        Args:
            name: Label stored on the instance as ``self.name``.
            aggregate_method: Aggregation strategy. It is only consulted when
                ``norm_method`` is ``"DEFAULT"``, to pick the normalization.
            base_model: ``kNN.NAME``, ``LOF.NAME`` or ``None``. ``None`` leaves
                ``self.mdl`` unset (``None``).
            neighbor: Forwarded as the first argument of the kNN/LOF
                constructor.
            norm_method: ``Normalize.ZSCORE``, ``Normalize.UNIFY``, ``None`` or
                ``"DEFAULT"``. ``"DEFAULT"`` selects ``Normalize.ZSCORE`` for
                the AVERAGE and AVERAGE_THRESHOLD aggregators and ``None`` for
                every other aggregator.

        Raises:
            AssertionError: If ``norm_method`` is not one of the accepted
                values.
            Exception: If ``base_model`` is not a supported model name.
        """
        self.name = name
        # =======================================
        # Setup normalization method
        # =======================================
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert norm_method in {
            Normalize.ZSCORE, Normalize.UNIFY, "DEFAULT", None
        }
        if norm_method != "DEFAULT":
            self.norm_method = norm_method
        elif aggregate_method in (Aggregator.AVERAGE,
                                  Aggregator.AVERAGE_THRESHOLD):
            # Both averaging aggregators share the same default.
            self.norm_method = Normalize.ZSCORE
        else:
            self.norm_method = None

        # =======================================
        # Setup base model
        # =======================================
        if base_model == kNN.NAME:
            self.mdl = kNN(neighbor, self.norm_method)
        elif base_model == LOF.NAME:
            self.mdl = LOF(neighbor, self.norm_method)
        elif base_model is None:
            self.mdl = None
        else:
            raise Exception(f"Base Model: {base_model} is not supported.")
コード例 #2
0
def outliers_per_subspace():
    """Count, for every feature subspace, how many labelled outliers it flags.

    For each dataset the detector is fitted on every non-empty combination of
    feature columns. Each aggregator/threshold setting is then applied to one
    subspace at a time, and the values it returns for the points labelled
    ``Y == 1`` are summed. The results are written as JSON to
    ``{model}_outliers_per_subspace.json``.

    Labelled outliers whose aggregated score over all subspaces is zero are
    printed as "FN Outliers", followed by all inlier rows.
    """
    import json
    outputs = defaultdict(dict)
    model = 'gke'

    for dataset in [Dataset.GLASS, Dataset.WINE,
                    Dataset.BREASTW, Dataset.ANNTHYROID,
                    Dataset.VOWELS, Dataset.PIMA, Dataset.THYROID]:
        logger.info("=" * 50)
        logger.info(f"             Dataset {dataset}             ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        feature_index = np.array([i for i in range(_X.shape[1])])
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))), Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # Each entry pairs the feature combination with the detector output
        # obtained on exactly those columns.
        model_outputs = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append((i, mdl.fit(X_gpu_tensor[:, np.asarray(i)])))

        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [("RANK", Aggregator.count_rank_threshold, 0.05),
                                            ("RANK", Aggregator.count_rank_threshold, 0.10),
                                            ("STD", Aggregator.count_std_threshold, 1),
                                            ("STD", Aggregator.count_std_threshold, 2)]:
            logger.info(f"---------------{name}------------------------")
            outlier_num_per_subspace = []
            for _, scores in model_outputs:
                # Aggregate a single subspace at a time.
                y_scores = np.array(aggregator([scores, ], threshold))
                outlier_num_per_subspace.append(int(np.sum(y_scores[Y == 1])))
            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier_dist": outlier_num_per_subspace,
                "outlier_total": int(outlier_num),
                "subspace_total": len(model_outputs)
            }

        # The aggregator expects bare detector outputs (as in the per-subspace
        # calls above), not the (features, scores) pairs stored in
        # model_outputs.
        # NOTE(review): relies on count_rank_threshold having a default
        # threshold, as the original call did - confirm.
        total_score = Aggregator.count_rank_threshold(
            [scores for _, scores in model_outputs])
        for idx, label in enumerate(Y):
            # Index by the point position, not by its label value.
            if label == 1 and total_score[idx] == 0:
                print("FN Outliers", X_gpu_tensor[idx])
        print("Inliers", X_gpu_tensor[Y == 0])

    output_file = f"{model}_outliers_per_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
コード例 #3
0
def compare_auc():
    """Compare aggregation strategies by ROC AUC and precision.

    For each dataset the detector is fitted on every non-empty combination of
    feature columns. Every aggregator/threshold setting is applied to the full
    list of subspace outputs and scored against the labels with
    ``roc_auc_score`` and ``precision_n_scores``. The metrics are logged and
    written as JSON to ``{model_name}_performance.json``.
    """
    # model = "knn"
    model_name = "gke"
    outputs = defaultdict(dict)
    separator = "=" * 50
    settings = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
        ("AVG", Aggregator.average, None),
        ("AVG", Aggregator.average_threshold, 1),
        ("AVG", Aggregator.average_threshold, 2),
    ]

    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info(separator)
        logger.info(f"             Dataset {dataset}             ")
        logger.info(separator)
        _X, Y = DataLoader.load(dataset)
        feature_index = np.array(list(range(_X.shape[1])))

        if model_name == "knn":
            X_gpu_tensor = _X
            k = max(10, int(np.floor(0.03 * _X.shape[0])))
            mdl = kNN(k, Normalize.ZSCORE)
        elif model_name == "gke":
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
            mdl = GKE_GPU(Normalize.ZSCORE)

        # One detector output per non-empty feature subset, smallest first.
        model_outputs = [
            mdl.fit(X_gpu_tensor[:, np.asarray(subspace)])
            for size in range(1, len(feature_index) + 1)
            for subspace in combinations(feature_index, size)
        ]

        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in settings:
            # Aggregators listed without a threshold are called with the
            # model outputs only.
            if threshold is None:
                call_args = (model_outputs,)
            else:
                call_args = (model_outputs, threshold)
            y_scores = np.array(aggregator(*call_args))

            roc = roc_auc_score(Y, y_scores)
            precision = precision_n_scores(Y, y_scores)
            logger.info(
                f"ROC of {name}-{threshold} {roc} Precision {precision}")
            outputs[dataset][f"{name}_{threshold}"] = {
                "roc": roc,
                "precision": precision
            }

    output_file = f"{model_name}_performance.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
コード例 #4
0
def load_model_and_data(dataset, model):
    """Load a dataset and build the matching detector.

    Args:
        dataset: Dataset identifier passed to ``DataLoader.load``.
        model: ``"knn"`` or ``"gke"``.

    Returns:
        Tuple ``(mdl, X, Y, outlier_num, feature_index)``. ``X`` is the loaded
        data as returned by the loader for ``"knn"`` and the result of
        ``GKE_GPU.convert_to_tensor`` for ``"gke"``. ``outlier_num`` is the
        number of labels equal to 1, and ``feature_index`` is an array of the
        column indices of the data.

    Raises:
        ValueError: If ``model`` is not a supported name. Previously this
            surfaced as an ``UnboundLocalError`` at the return statement.
    """
    # Fail fast, before any data is loaded.
    if model not in ("knn", "gke"):
        raise ValueError(f"Model: {model} is not supported.")

    logger.info("=" * 50)
    logger.info(f"             Dataset {dataset}             ")
    logger.info("=" * 50)
    _X, Y = DataLoader.load(dataset)
    outlier_num = int(np.sum(Y == 1))
    feature_index = np.array(range(_X.shape[1]))
    if model == "knn":
        # Neighbourhood size: 3% of the rows, but never fewer than 10.
        mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))), Normalize.ZSCORE)
        X_gpu_tensor = _X
    else:
        mdl = GKE_GPU(Normalize.ZSCORE)
        X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
    return mdl, X_gpu_tensor, Y, outlier_num, feature_index
コード例 #5
0
def outlier_correlation_subspace():
    """Greedily pick subspaces that together cover all detected outliers.

    For each dataset the detector is fitted on every non-empty combination of
    feature columns. For each aggregator/threshold setting, the labelled
    outliers flagged by each individual subspace are collected. Subspaces are
    then selected greedily, always taking the one that covers the most
    still-uncovered outliers, until every detected outlier is covered. The
    selection is printed and written as JSON to
    ``{model}_outliers_correlation_subspace.json``.
    """
    import json
    outputs = defaultdict(dict)
    model = 'gke'

    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f"             Dataset {dataset}             ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        feature_index = np.array(range(_X.shape[1]))
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # model_outputs[k] is the detector output for the feature list
        # subspace_idx_to_features[k].
        model_outputs = []
        subspace_idx_to_features = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append(mdl.fit(X_gpu_tensor[:, np.asarray(i)]))
                subspace_idx_to_features.append([int(j) for j in i])

        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            outliers_to_subspaces = defaultdict(set)
            subspace_to_outlier = {}
            for subspace_id, model_output in enumerate(model_outputs):
                flags = aggregator([model_output, ], threshold)
                # Keep only the flagged points that are labelled outliers.
                detected_outliers = {
                    point_idx
                    for point_idx, if_outlier in enumerate(flags)
                    if if_outlier == 1 and Y[point_idx] == 1
                }
                subspace_to_outlier[subspace_id] = detected_outliers
                for detected_outlier in detected_outliers:
                    outliers_to_subspaces[detected_outlier].add(subspace_id)

            # Every key was created by an .add() above, so each outlier here
            # is covered by at least one subspace.
            not_covered_outliers = set(outliers_to_subspaces)
            not_covered_outliers_num = len(not_covered_outliers)
            logger.info(
                f"Detected outliers {len(not_covered_outliers)}/{outlier_num}")

            # Greedy set cover. `remaining` holds, per subspace, the outliers
            # it would newly cover; it is rebuilt with fresh sets each round,
            # so subspace_to_outlier keeps the full per-subspace detections.
            remaining = subspace_to_outlier
            selected_subspaces = []
            while not_covered_outliers:
                # max() returns the first largest entry, matching the first
                # element of a stable descending sort.
                selected_subspace_id, covered_outliers = max(
                    remaining.items(), key=lambda item: len(item[1]))
                not_covered_outliers = not_covered_outliers - covered_outliers
                remaining = {
                    i: (j - covered_outliers)
                    for i, j in remaining.items()
                }
                selected_subspaces.append(selected_subspace_id)

            for i in selected_subspaces:
                print(
                    f"Features {subspace_idx_to_features[i]} Outliers {len(subspace_to_outlier[i])}"
                )
            print(f"{len(selected_subspaces)}/{len(model_outputs)}")
            outputs[f"{name}_{threshold}"][dataset] = {
                "select_subspace":
                [(subspace_idx_to_features[i], list(subspace_to_outlier[i]))
                 for i in selected_subspaces],
                "outliers":
                not_covered_outliers_num,
                "total_subspace":
                len(model_outputs),
                "total_outliers":
                int(outlier_num),
                "dimension":
                len(feature_index)
            }

    output_file = f"{model}_outliers_correlation_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
コード例 #6
0
def subspace_count_per_point():
    """Histogram the aggregated per-point score for outliers and inliers.

    For each dataset the detector is fitted on every non-empty combination of
    feature columns. For each aggregator/threshold setting the aggregated
    per-point values are split by label and binned into ``BIN_NUM`` bins over
    ``(0.1, number of subspaces)``, with an extra leading bucket for values
    equal to zero. Normalized histograms plus mean/median statistics are
    logged and written as JSON to ``{model}_subspace_count_per_point.json``.
    """
    import json
    BIN_NUM = 10
    outputs = defaultdict(dict)
    model = 'knn'

    def _histogram_with_zero_bucket(values, expected_total, upper):
        # Bin `values`, prepend the count of exact zeros (which fall below
        # the histogram range), and check that no point was lost.
        hist, edges = np.histogram(values, BIN_NUM, range=(0.1, upper))
        zero_total = sum(1 for value in values if value == 0)
        print(zero_total)
        print(hist)
        hist = np.insert(hist, 0, zero_total)
        print(hist)
        assert np.sum(hist) == expected_total
        return hist, edges

    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        separator = "=" * 50
        logger.info(separator)
        logger.info(f"             Dataset {dataset}             ")
        logger.info(separator)
        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        inlier_num = np.sum(Y == 0)
        feature_index = np.array(list(range(_X.shape[1])))
        if model == "knn":
            k = max(10, int(np.floor(0.03 * _X.shape[0])))
            mdl = kNN(k, Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # One detector output per non-empty feature subset, smallest first.
        model_outputs = [
            mdl.fit(X_gpu_tensor[:, np.asarray(subspace)])
            for size in range(1, len(feature_index) + 1)
            for subspace in combinations(feature_index, size)
        ]
        subspace_total = len(model_outputs)

        logger.info(f"Total model {subspace_total}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            y_scores = np.array(aggregator(model_outputs, threshold))
            outlier_subspaces = y_scores[Y == 1]
            inlier_subspaces = y_scores[Y == 0]

            outlier_hist, edges = _histogram_with_zero_bucket(
                outlier_subspaces, outlier_num, subspace_total)
            # Bin edges expressed as a fraction of all subspaces.
            bin_labels = [f"{edge / subspace_total:.1f}" for edge in edges]

            inlier_hist, _ = _histogram_with_zero_bucket(
                inlier_subspaces, inlier_num, subspace_total)

            outlier_hist_percent = outlier_hist / outlier_num
            inlier_hist_percent = inlier_hist / inlier_num

            logger.info(f"Outlier {outlier_num} Inlier {inlier_num}")
            logger.info(
                f"Outlier Median {np.median(outlier_subspaces)} Inlier Median {np.median(inlier_subspaces)}"
            )
            logger.info(
                f"Outlier Mean {np.mean(outlier_subspaces)} Inlier Mean {np.mean(inlier_subspaces)}"
            )

            logger.info(f"Bin {bin_labels}")
            logger.info(f"Outlier dist {outlier_hist}")
            logger.info(f"Inlier dist {inlier_hist}")
            logger.info(f"Outlier dist density {outlier_hist_percent}")
            logger.info(f"Inlier dist density {inlier_hist_percent}")

            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": outlier_hist_percent.tolist(),
                "inlier": inlier_hist_percent.tolist(),
                "bin": bin_labels,
                "outlier_mean": np.mean(outlier_subspaces),
                "inlier_mean": np.mean(inlier_subspaces),
                "outlier_median": np.median(outlier_subspaces),
                "inlier_median": np.median(inlier_subspaces),
            }

    output_file = f"{model}_subspace_count_per_point.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
コード例 #7
0
def point_count_per_dim():
    """Measure, per subspace dimensionality, the fraction of flagged points.

    For each dataset the detector is fitted on every non-empty combination of
    feature columns, grouped by the number of features in the combination.
    For each aggregator/threshold setting and each dimensionality, the
    fraction of labelled outliers and of inliers with an aggregated score
    above zero is recorded. The results are written as JSON to
    ``{model}_point_count_per_dim.json``.

    Raises:
        ValueError: If the hard-coded ``model`` is not ``"knn"`` or ``"gke"``.
    """
    import json
    outputs = defaultdict(dict)
    model = "knn"
    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f"             Dataset {dataset}             ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        feature_index = np.array([i for i in range(_X.shape[1])])
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)

        # The branches were previously swapped: "knn" built the GKE detector,
        # so the "knn_..." output file actually held GKE results.
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
        else:
            raise ValueError(f"Model: {model} is not supported.")

        # Detector outputs keyed by the number of features in the subspace.
        model_outputs_all = defaultdict(list)
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs_all[l].append(
                    mdl.fit(X_gpu_tensor[:, np.asarray(i)]))

        assert len(model_outputs_all) == len(feature_index)

        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            dim_outlier_ratio = [0] * len(feature_index)
            dim_inlier_ratio = [0] * len(feature_index)

            for l, model_outputs in model_outputs_all.items():
                y_scores = np.array(aggregator(model_outputs, threshold))

                # Fraction of points with a positive aggregated score in at
                # least one subspace of this dimensionality.
                dim_outlier_ratio[l - 1] = (
                    np.count_nonzero(y_scores[Y == 1] > 0) / outlier_num)
                dim_inlier_ratio[l - 1] = (
                    np.count_nonzero(y_scores[Y == 0] > 0) / inlier_num)

            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": dim_outlier_ratio,
                "inlier": dim_inlier_ratio,
                "feature_index": feature_index.tolist()
            }

    with open(f"{model}_point_count_per_dim.json", "w") as w:
        w.write(f"{json.dumps(outputs)}")