def single_test():
    dataset = Dataset.MNIST_ODDS
    aggregator = Aggregator.AVERAGE_THRESHOLD
    threshold = 0
    X, Y = DataLoader.load(dataset)
    # Drop constant features; they carry no signal and break z-scoring.
    X = X[:, np.std(X, axis=0) != 0]
    dim = X.shape[1]
    neigh = max(10, int(np.floor(0.03 * X.shape[0])))
    ENSEMBLE_SIZE = 100
    logger.info(f"{dataset} {aggregator} {threshold}")
    roc_aucs = []
    precision_at_ns = []
    mdl = OracleAdaptive(2, dim / 4, ENSEMBLE_SIZE, aggregator, neigh,
                         kNN.NAME, Y, threshold)
    for _ in tqdm.trange(1):
        try:
            rst = mdl.compute_ensemble_components(X)
            roc_auc = mdl.compute_roc_auc(rst, Y)
            logger.info(f"Final ROC {roc_auc}")
            precision_at_n = mdl.compute_precision_at_n(rst, Y)
            logger.info(f"Precision@n {precision_at_n}")
            roc_aucs.append(roc_auc)
            precision_at_ns.append(precision_at_n)
        except Exception as e:
            logger.exception(e)
    logger.info(f"Exp Information {dataset} {aggregator} {threshold}")
    logger.info(f"Final Average ROC {np.mean(roc_aucs)}")
    logger.info(f"Final Precision@n {np.mean(precision_at_ns)}")
    logger.info("====================================================")

def test():
    X, Y = DataLoader.load(Dataset.MUSK)
    neigh = max(10, int(np.floor(0.03 * X.shape[0])))
    knn = kNN(neigh, if_normalize=False)
    rst = knn.fit(X)
    y_scores = np.array(rst)
    print(y_scores)
    roc_auc = roc_auc_score(Y, y_scores)
    print(roc_auc)

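# The experiments in this file log Precision@n alongside ROC AUC. A minimal,
# self-contained sketch of that metric, assuming it matches pyod's
# precision_n_scores (precision within the top-n ranked points, where n is
# the number of true outliers). The helper name is hypothetical.
def _precision_at_n_sketch(y_true, y_scores):
    n = int(np.sum(y_true == 1))            # number of true outliers
    top_n = np.argsort(y_scores)[::-1][:n]  # indices of the n highest scores
    return float(np.sum(y_true[top_n] == 1)) / n

# e.g. _precision_at_n_sketch(np.array([0, 1, 1, 0]),
#                             np.array([0.1, 0.9, 0.8, 0.2])) -> 1.0
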
def outliers_per_subspace():
    import json
    outputs = defaultdict(dict)
    model = 'gke'
    for dataset in [
            Dataset.GLASS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.VOWELS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)
        feature_index = np.arange(_X.shape[1])
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
        # Fit one detector per feature subspace, over all non-empty subsets.
        model_outputs = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append((i, mdl.fit(X_gpu_tensor[:, np.asarray(i)])))
        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            logger.info(f"---------------{name}------------------------")
            outlier_num_per_subspace = []
            for selected_features, scores in model_outputs:
                y_scores = np.array(aggregator([scores, ], threshold))
                outlier_num_per_subspace.append(int(np.sum(y_scores[Y == 1])))
            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier_dist": outlier_num_per_subspace,
                "outlier_total": int(outlier_num),
                "subspace_total": len(model_outputs)
            }
        # Report true outliers that no subspace flags (false negatives).
        total_score = Aggregator.count_rank_threshold(
            [scores for _, scores in model_outputs])
        for idx, label in enumerate(Y):
            if label == 1 and total_score[idx] == 0:
                print("FN Outliers", X_gpu_tensor[idx])
        print("Inliers", X_gpu_tensor[Y == 0])
    output_file = f"{model}_outliers_per_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")

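# Hedged sketch of the rank aggregator used above. The real implementation is
# sood's Aggregator.count_rank_threshold; this only mirrors how the experiments
# call it (threshold is a fraction of the point count here, an absolute top-k
# elsewhere in this repo) and is an assumption, not the authoritative code.
def _count_rank_threshold_sketch(model_outputs, threshold=0.05):
    outputs = np.asarray(model_outputs)  # (n_detectors, n_points)
    n_points = outputs.shape[1]
    # Interpret a fractional threshold as "top threshold * n_points".
    top_k = int(threshold * n_points) if threshold < 1 else int(threshold)
    top_k = max(top_k, 1)
    counts = np.zeros(n_points, dtype=int)
    for scores in outputs:
        counts[np.argsort(scores)[::-1][:top_k]] += 1  # flag each detector's top-k
    return counts
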
def compare_auc():
    import json
    outputs = defaultdict(dict)
    # model_name = "knn"
    model_name = "gke"
    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        feature_index = np.arange(_X.shape[1])
        if model_name == "knn":
            X_gpu_tensor = _X
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
        elif model_name == "gke":
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
            mdl = GKE_GPU(Normalize.ZSCORE)
        model_outputs = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append(mdl.fit(X_gpu_tensor[:, np.asarray(i)]))
        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2),
            ("AVG", Aggregator.average, None),
            ("AVG", Aggregator.average_threshold, 1),
            ("AVG", Aggregator.average_threshold, 2),
        ]:
            if threshold is not None:
                y_scores = np.array(aggregator(model_outputs, threshold))
            else:
                y_scores = np.array(aggregator(model_outputs))
            roc = roc_auc_score(Y, y_scores)
            precision = precision_n_scores(Y, y_scores)
            logger.info(f"ROC of {name}-{threshold} {roc} Precision {precision}")
            outputs[dataset][f"{name}_{threshold}"] = {
                "roc": roc,
                "precision": precision
            }
    output_file = f"{model_name}_performance.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")

def load_model_and_data(dataset, model):
    logger.info("=" * 50)
    logger.info(f" Dataset {dataset} ")
    logger.info("=" * 50)
    _X, Y = DataLoader.load(dataset)
    outlier_num = int(np.sum(Y == 1))
    feature_index = np.array(range(_X.shape[1]))
    if model == "knn":
        mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                  Normalize.ZSCORE)
        X_gpu_tensor = _X
    elif model == "gke":
        mdl = GKE_GPU(Normalize.ZSCORE)
        X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
    else:
        raise ValueError(f"Model not supported {model}")
    return mdl, X_gpu_tensor, Y, outlier_num, feature_index

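# Example usage (a sketch; the experiment functions in this file inline the
# same setup instead of calling this helper):
# mdl, X_gpu_tensor, Y, outlier_num, feature_index = load_model_and_data(
#     Dataset.WINE, "gke")
# scores = mdl.fit(X_gpu_tensor[:, np.asarray((0, 1))])  # one 2-feature subspace
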
def test_greedy_threshold():
    for dataset in [
            Dataset.ARRHYTHMIA, Dataset.OPTDIGITS, Dataset.MUSK,
            Dataset.MNIST_ODDS
    ]:
        for aggregator in [
                Aggregator.COUNT_STD_THRESHOLD, Aggregator.AVERAGE_THRESHOLD,
                Aggregator.AVERAGE, Aggregator.COUNT_RANK_THRESHOLD
        ]:
            for threshold in [0, 0.5, 0.7, 0.9]:
                X, Y = DataLoader.load(dataset)
                dim = X.shape[1]
                neigh = max(10, int(np.floor(0.03 * X.shape[0])))
                ENSEMBLE_SIZE = 100
                logger.info(f"{dataset} {aggregator} {threshold}")
                roc_aucs = []
                precision_at_ns = []
                mdl = GreedyVariance(1, dim / 2, ENSEMBLE_SIZE, aggregator,
                                     neigh, kNN.NAME, Y, threshold)
                for _ in tqdm.trange(5):
                    try:
                        rst = mdl.run(X)
                        roc_auc = mdl.compute_roc_auc(rst, Y)
                        logger.info(f"Final ROC {roc_auc}")
                        precision_at_n = mdl.compute_precision_at_n(rst, Y)
                        logger.info(f"Precision@n {precision_at_n}")
                        roc_aucs.append(roc_auc)
                        precision_at_ns.append(precision_at_n)
                    except Exception as e:
                        logger.exception(e)
                logger.info(f"Exp Information {dataset} {aggregator} {threshold}")
                logger.info(f"Final Average ROC {np.mean(roc_aucs)}")
                logger.info(f"Final Precision@n {np.mean(precision_at_ns)}")
                logger.info("====================================================")

    def aggregate_components(self, model_outputs):
        if self.aggregate_method == Aggregator.COUNT_RANK_THRESHOLD:
            return Aggregator.count_rank_threshold(model_outputs, 100)
        elif self.aggregate_method == Aggregator.AVERAGE:
            return Aggregator.average(model_outputs)
        elif self.aggregate_method == Aggregator.COUNT_STD_THRESHOLD:
            return Aggregator.count_std_threshold(model_outputs, 2)
        elif self.aggregate_method == Aggregator.AVERAGE_THRESHOLD:
            return Aggregator.average_threshold(model_outputs, 2)


if __name__ == '__main__':
    from sood.data_process.data_loader import Dataset, DataLoader

    ENSEMBLE_SIZE = 100
    EXP_NUM = 1
    X, Y = DataLoader.load(Dataset.MNIST_ODDS)
    neigh = max(10, int(np.floor(0.03 * X.shape[0])))
    # X = X[:, np.std(X, axis=0) != 0]
    dim = X.shape[1]
    for start, end in [(2, int(dim / 4))]:
        fb = Uniform(start, end, ENSEMBLE_SIZE, Aggregator.AVERAGE, neigh,
                     kNN.NAME)
        start_ts = time.time()
        roc_aucs = []
        precision_at_ns = []
        for i in range(EXP_NUM):
            rst = fb.run(X)

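# Hedged sketches of the averaging/std aggregators dispatched above. The real
# definitions live in sood's Aggregator class; these only mirror the call
# shapes (a list of per-detector score arrays in, one value per point out)
# and are assumptions, not the library's code.
def _average_sketch(model_outputs):
    # Mean score per point across all detectors.
    return np.mean(np.asarray(model_outputs), axis=0)

def _count_std_threshold_sketch(model_outputs, t):
    # Count, per point, the detectors scoring it more than t standard
    # deviations above that detector's own mean.
    outputs = np.asarray(model_outputs)
    cut = outputs.mean(axis=1, keepdims=True) + t * outputs.std(axis=1, keepdims=True)
    return (outputs > cut).sum(axis=0)
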
def outlier_correlation_subspace():
    import json
    outputs = defaultdict(dict)
    model = 'gke'
    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        feature_index = np.array(range(_X.shape[1]))
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
        model_outputs = []
        subspace_idx_to_features = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append(mdl.fit(X_gpu_tensor[:, np.asarray(i)]))
                subspace_idx_to_features.append([int(j) for j in i])
        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            # Map each detected true outlier to the subspaces that flag it.
            outliers_to_subspaces = defaultdict(set)
            subspace_to_outlier = {}
            for subspace_id, model_output in enumerate(model_outputs):
                detected_outliers = {
                    point_idx
                    for point_idx, if_outlier in enumerate(
                        aggregator([model_output, ], threshold))
                    if if_outlier == 1 and Y[point_idx] == 1
                }
                subspace_to_outlier[subspace_id] = detected_outliers
                for detected_outlier in detected_outliers:
                    outliers_to_subspaces[detected_outlier].add(subspace_id)
            _subspace_to_outlier = {
                i: copy.deepcopy(j) for i, j in subspace_to_outlier.items()
            }
            not_covered_outliers = {
                i for i, subspaces in outliers_to_subspaces.items()
                if len(subspaces) > 0
            }
            not_covered_outliers_num = len(not_covered_outliers)
            logger.info(
                f"Detected outliers {len(not_covered_outliers)}/{outlier_num}")
            # Greedy set cover: repeatedly take the subspace that covers the
            # most not-yet-covered outliers.
            selected_subspaces = []
            while len(not_covered_outliers) > 0:
                selected_subspace_id, covered_outliers = sorted(
                    subspace_to_outlier.items(),
                    key=lambda x: len(x[1]),
                    reverse=True)[0]
                not_covered_outliers = not_covered_outliers - covered_outliers
                subspace_to_outlier = {
                    i: (j - covered_outliers)
                    for i, j in subspace_to_outlier.items()
                }
                selected_subspaces.append(selected_subspace_id)
            for i in selected_subspaces:
                print(f"Features {subspace_idx_to_features[i]} "
                      f"Outliers {len(_subspace_to_outlier[i])}")
            print(f"{len(selected_subspaces)}/{len(model_outputs)}")
            outputs[f"{name}_{threshold}"][dataset] = {
                "select_subspace":
                    [(subspace_idx_to_features[i], list(_subspace_to_outlier[i]))
                     for i in selected_subspaces],
                "outliers": not_covered_outliers_num,
                "total_subspace": len(model_outputs),
                "total_outliers": int(outlier_num),
                "dimension": len(feature_index)
            }
    output_file = f"{model}_outliers_correlation_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")

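# The while-loop above is a textbook greedy set cover. A standalone sketch of
# the same idea (hypothetical helper, not part of sood):
def _greedy_cover_sketch(subspace_to_outlier):
    """subspace_to_outlier: dict of subspace id -> set of detected outliers."""
    remaining = set().union(*subspace_to_outlier.values())
    coverage = {k: set(v) for k, v in subspace_to_outlier.items()}
    selected = []
    while remaining:
        # Take the subspace covering the most still-uncovered outliers.
        best = max(coverage, key=lambda k: len(coverage[k]))
        covered = coverage.pop(best)
        remaining -= covered
        coverage = {k: v - covered for k, v in coverage.items()}
        selected.append(best)
    return selected

# e.g. _greedy_cover_sketch({0: {1, 2}, 1: {2, 3}, 2: {3}}) -> [0, 1]
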
    def compute_ensemble_components(self, data_array):
        detector_list = []
        feature_index = np.arange(data_array.shape[1])
        for i in range(self.ensemble_size):
            # Randomly sample a feature size for this detector.
            feature_size = np.random.randint(self.dim_start, self.dim_end)
            # Randomly select features. Note: np.random.choice samples with
            # replacement by default, so a subspace may repeat features.
            selected_features = np.random.choice(feature_index, feature_size)
            detector_list.append(
                kNN_LSCP(neighbor_size=self.neighbor,
                         selected_features=selected_features))
        clf = LSCP(detector_list)
        clf.fit(data_array)
        score = clf.decision_scores_
        return [score, ]

    def aggregate_components(self, model_outputs):
        return model_outputs[0]


if __name__ == '__main__':
    X, Y = DataLoader.load(Dataset.OPTDIGITS)
    dim = X.shape[1]
    neigh = max(10, int(np.floor(0.03 * X.shape[0])))
    ENSEMBLE_SIZE = 100
    mdl = Lscp(1, dim / 2, ENSEMBLE_SIZE, neigh)
    rst = mdl.run(X)
    roc_auc = mdl.compute_roc_auc(rst, Y)
    print(f"Final ROC {roc_auc}")

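# Minimal standalone pyod LSCP usage for reference (assumes pyod is installed;
# the base-detector choice and neighbor counts here are illustrative, not the
# configuration used above):
# from pyod.models.lscp import LSCP
# from pyod.models.knn import KNN
# _clf = LSCP([KNN(n_neighbors=k) for k in (5, 10, 15)])
# _clf.fit(X)                      # X: (n_samples, n_features) array
# _scores = _clf.decision_scores_  # per-point outlier scores on training data
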
                    except Exception as e:
                        logger.exception(e)
                logger.info(f"Exp Information {dataset} {aggregator} {threshold}")
                logger.info(f"Final Average ROC {np.mean(roc_aucs)}")
                logger.info(f"Final Precision@n {np.mean(precision_at_ns)}")
                logger.info("====================================================")


if __name__ == '__main__':
    dataset = Dataset.MNIST_ODDS
    aggregator = Aggregator.COUNT_STD_THRESHOLD
    threshold = 0
    X, Y = DataLoader.load(dataset)
    dim = X.shape[1]
    neigh = max(10, int(np.floor(0.03 * X.shape[0])))
    ENSEMBLE_SIZE = 100
    logger.info(f"{dataset} {aggregator} {threshold}")
    roc_aucs = []
    precision_at_ns = []
    mdl = GreedyVariance(1, dim / 2, ENSEMBLE_SIZE, aggregator, neigh,
                         kNN.NAME, Y, threshold)
    for _ in tqdm.trange(1):
        try:
            rst = mdl.run(X)
            roc_auc = mdl.compute_roc_auc(rst, Y)
            logger.info(f"Final ROC {roc_auc}")
            precision_at_n = mdl.compute_precision_at_n(rst, Y)
            logger.info(f"Precision@n {precision_at_n}")
            roc_aucs.append(roc_auc)
            precision_at_ns.append(precision_at_n)
        except Exception as e:
            logger.exception(e)

def batch_test():
    import json
    path_manager = PathManager()
    ENSEMBLE_SIZE = 100
    for dataset in [
            Dataset.ARRHYTHMIA, Dataset.MUSK, Dataset.MNIST_ODDS,
            Dataset.OPTDIGITS
    ]:
        for aggregator in [
                Aggregator.AVERAGE, Aggregator.AVERAGE_THRESHOLD,
                Aggregator.COUNT_STD_THRESHOLD, Aggregator.COUNT_RANK_THRESHOLD
        ]:
            for base_model in [kNN.NAME, ]:
                # =======================================================
                # Model
                output_path = path_manager.get_batch_test_model_output(
                    FB.NAME, aggregator, base_model, "DEFAULT", dataset)
                # =======================================================
                with open(output_path, "w") as w:
                    for threshold in [0, ]:
                        X, Y = DataLoader.load(dataset)
                        dim = X.shape[1]
                        neigh = max(10, int(np.floor(0.03 * X.shape[0])))
                        logger.info(f"{dataset} {aggregator} {threshold}")
                        roc_aucs = []
                        precision_at_ns = []
                        # ===============================================
                        # Model
                        mdl = FB(2, dim / 4, ENSEMBLE_SIZE, aggregator, neigh,
                                 base_model, Y, threshold)
                        # ===============================================
                        for _ in tqdm.trange(5):
                            try:
                                # Throws if no satisfying subspaces are found.
                                rst = mdl.run(X)
                                roc_auc = mdl.compute_roc_auc(rst, Y)
                                logger.info(f"Final ROC {roc_auc}")
                                precision_at_n = mdl.compute_precision_at_n(rst, Y)
                                logger.info(f"Precision@n {precision_at_n}")
                                roc_aucs.append(roc_auc)
                                precision_at_ns.append(precision_at_n)
                            except Exception as e:
                                logger.exception(e)
                        logger.info(
                            f"Exp Information {dataset} {aggregator} {threshold}")
                        logger.info(f"Final Average ROC {np.mean(roc_aucs)}")
                        logger.info(f"Final Precision@n {np.mean(precision_at_ns)}")
                        logger.info("====================================================")
                        output = {
                            Consts.DATA: dataset,
                            Consts.ROC_AUC: np.mean(roc_aucs),
                            Consts.PRECISION_A_N: np.mean(precision_at_ns),
                            Consts.AGGREGATE: aggregator,
                            Consts.BASE_MODEL: base_model,
                            Consts.START_DIM: 2,
                            # Note: recorded as the fraction 1/4 of dim, while
                            # START_DIM is recorded as an absolute size.
                            Consts.END_DIM: 1 / 4,
                            Consts.ENSEMBLE_SIZE: ENSEMBLE_SIZE
                        }
                        w.write(f"{json.dumps(output)}\n")
                logger.info(f"Output file is {output_path}")

def experiment(model, dim_boundary, threshold):
    import json
    path_manager = PathManager()
    ENSEMBLE_SIZE = 100
    if model == "fb":
        Model = FB
    elif model == "oracle":
        Model = OracleAdaptive
    else:
        raise Exception(f"Model not supported {model}")
    threshold = float(threshold)
    for dataset in [
            Dataset.OPTDIGITS, Dataset.MNIST_ODDS, Dataset.MUSK,
            Dataset.ARRHYTHMIA, Dataset.AD, Dataset.AID362, Dataset.BANK,
            Dataset.PROB, Dataset.U2R
    ]:
        for aggregator in [Aggregator.COUNT_RANK_THRESHOLD, ]:
            for base_model in [kNN.NAME, ]:
                X, Y = DataLoader.load(dataset)
                dim = X.shape[1]
                neigh = max(10, int(np.floor(0.03 * X.shape[0])))
                logger.info(f"{dataset} {aggregator} {threshold}")
                roc_aucs = []
                precision_at_ns = []
                # =======================================================
                # Model
                if dim_boundary == "high":
                    start_dim = dim / 2
                    end_dim = dim
                else:
                    start_dim = 2
                    end_dim = dim / 4
                mdl = Model(start_dim, end_dim, ENSEMBLE_SIZE, aggregator,
                            neigh, base_model, Y, threshold)
                output_path = path_manager.get_batch_test_model_output(
                    Model.NAME, aggregator, base_model, "DEFAULT", dataset,
                    start_dim, end_dim)
                logger.info(f"Output File {output_path}")
                # =======================================================
                for _ in tqdm.trange(5):
                    try:
                        # Throws if no satisfying subspaces are found.
                        rst = mdl.run(X)
                        roc_auc = mdl.compute_roc_auc(rst, Y)
                        logger.info(f"Final ROC {roc_auc}")
                        precision_at_n = mdl.compute_precision_at_n(rst, Y)
                        logger.info(f"Precision@n {precision_at_n}")
                        roc_aucs.append(roc_auc)
                        precision_at_ns.append(precision_at_n)
                    except Exception as e:
                        logger.exception(e)
                logger.info(f"Exp Information {dataset} {aggregator} {threshold}")
                logger.info(f"Final Average ROC {np.mean(roc_aucs)}")
                logger.info(f"Final Precision@n {np.mean(precision_at_ns)}")
                logger.info("====================================================")
                output = {
                    Consts.DATA: dataset,
                    Consts.ROC_AUC: np.mean(roc_aucs),
                    Consts.PRECISION_A_N: np.mean(precision_at_ns),
                    Consts.AGGREGATE: aggregator,
                    Consts.BASE_MODEL: base_model,
                    Consts.START_DIM: start_dim,
                    Consts.END_DIM: end_dim,
                    Consts.ENSEMBLE_SIZE: ENSEMBLE_SIZE
                }
                with open(output_path, "w") as w:
                    w.write(f"{json.dumps(output)}\n")
                logger.info(f"Output file is {output_path}")

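# Example invocations (argument values are illustrative):
# experiment("fb", "high", "0.1")     # FB with subspace sizes in [dim/2, dim]
# experiment("oracle", "low", "0.1")  # OracleAdaptive with sizes in [2, dim/4]
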
def subspace_count_per_point():
    import json
    BIN_NUM = 10
    outputs = defaultdict(dict)
    model = 'knn'
    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)
        feature_index = np.arange(_X.shape[1])
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
        model_outputs = []
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs.append(mdl.fit(X_gpu_tensor[:, np.asarray(i)]))
        logger.info(f"Total model {len(model_outputs)}")
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            y_scores = np.array(aggregator(model_outputs, threshold))
            outlier_subspaces, inlier_subspaces = y_scores[Y == 1], y_scores[Y == 0]
            # Histogram over (0.1, N] so points flagged in zero subspaces fall
            # outside; they are counted separately and prepended as a first bin.
            outlier_hist, bins = np.histogram(outlier_subspaces, BIN_NUM,
                                              range=(0.1, len(model_outputs)))
            bins = [f"{i / len(model_outputs):.1f}" for i in bins]
            zero_subspaces_outlier = int(np.sum(outlier_subspaces == 0))
            print(zero_subspaces_outlier)
            print(outlier_hist)
            outlier_hist = np.insert(outlier_hist, 0, zero_subspaces_outlier)
            print(outlier_hist)
            assert np.sum(outlier_hist) == outlier_num
            inlier_hist = np.histogram(inlier_subspaces, BIN_NUM,
                                       range=(0.1, len(model_outputs)))[0]
            zero_subspaces_inlier = int(np.sum(inlier_subspaces == 0))
            print(zero_subspaces_inlier)
            print(inlier_hist)
            inlier_hist = np.insert(inlier_hist, 0, zero_subspaces_inlier)
            print(inlier_hist)
            assert np.sum(inlier_hist) == inlier_num
            outlier_hist_percent = outlier_hist / outlier_num
            inlier_hist_percent = inlier_hist / inlier_num
            logger.info(f"Outlier {outlier_num} Inlier {inlier_num}")
            logger.info(f"Outlier Median {np.median(outlier_subspaces)} "
                        f"Inlier Median {np.median(inlier_subspaces)}")
            logger.info(f"Outlier Mean {np.mean(outlier_subspaces)} "
                        f"Inlier Mean {np.mean(inlier_subspaces)}")
            logger.info(f"Bin {bins}")
            logger.info(f"Outlier dist {outlier_hist}")
            logger.info(f"Inlier dist {inlier_hist}")
            logger.info(f"Outlier dist density {outlier_hist_percent}")
            logger.info(f"Inlier dist density {inlier_hist_percent}")
            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": outlier_hist_percent.tolist(),
                "inlier": inlier_hist_percent.tolist(),
                "bin": bins,
                "outlier_mean": np.mean(outlier_subspaces),
                "inlier_mean": np.mean(inlier_subspaces),
                "outlier_median": np.median(outlier_subspaces),
                "inlier_median": np.median(inlier_subspaces),
            }
    output_file = f"{model}_subspace_count_per_point.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")

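# Worked example of the zero-bin trick above (illustrative values only):
# np.histogram with range=(0.1, max) excludes zero counts, which are tallied
# separately and prepended, so every point lands in exactly one bin.
_counts = np.array([0, 0, 3, 7, 12])
_hist, _edges = np.histogram(_counts, 2, range=(0.1, 12))  # -> [1, 2]
_zeros = int(np.sum(_counts == 0))                         # -> 2
_hist = np.insert(_hist, 0, _zeros)                        # -> [2, 1, 2]
assert _hist.sum() == len(_counts)
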
def point_count_per_dim():
    import json
    outputs = defaultdict(dict)
    model = "knn"
    for dataset in [
            Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
            Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)
        _X, Y = DataLoader.load(dataset)
        feature_index = np.arange(_X.shape[1])
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        else:
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
        # Group the subspace detectors by dimensionality l.
        model_outputs_all = defaultdict(list)
        for l in range(1, len(feature_index) + 1):
            for i in combinations(feature_index, l):
                model_outputs_all[l].append(mdl.fit(X_gpu_tensor[:, np.asarray(i)]))
        assert len(model_outputs_all) == len(feature_index)
        for name, aggregator, threshold in [
            ("RANK", Aggregator.count_rank_threshold, 0.05),
            ("RANK", Aggregator.count_rank_threshold, 0.10),
            ("STD", Aggregator.count_std_threshold, 1),
            ("STD", Aggregator.count_std_threshold, 2)
        ]:
            dim_outlier_ratio = [0] * len(feature_index)
            dim_inlier_ratio = [0] * len(feature_index)
            for l, model_outputs in model_outputs_all.items():
                y_scores = np.array(aggregator(model_outputs, threshold))
                # Fraction of outliers/inliers flagged in at least one
                # l-dimensional subspace.
                dim_outlier_ratio[l - 1] = int(np.sum(y_scores[Y == 1] > 0)) / outlier_num
                dim_inlier_ratio[l - 1] = int(np.sum(y_scores[Y == 0] > 0)) / inlier_num
            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": dim_outlier_ratio,
                "inlier": dim_inlier_ratio,
                "feature_index": feature_index.tolist()
            }
    with open(f"{model}_point_count_per_dim.json", "w") as w:
        w.write(f"{json.dumps(outputs)}")