def __init__(self, name, aggregate_method, base_model, neighbor, norm_method="DEFAULT"):
    """Store the name, resolve the normalization method and build the base model.

    Args:
        name: Stored unchanged as ``self.name``.
        aggregate_method: Only consulted when ``norm_method`` is
            ``"DEFAULT"``; ``Aggregator.AVERAGE`` and
            ``Aggregator.AVERAGE_THRESHOLD`` select z-score normalization,
            anything else selects no normalization.
        base_model: ``kNN.NAME``, ``LOF.NAME`` or ``None``. ``None`` leaves
            ``self.mdl`` unset (``None``).
        neighbor: Passed as the first argument to the base model constructor.
        norm_method: ``Normalize.ZSCORE``, ``Normalize.UNIFY``, ``None`` or
            ``"DEFAULT"`` (derive the method from ``aggregate_method``).

    Raises:
        AssertionError: If ``norm_method`` is not one of the accepted values.
        Exception: If ``base_model`` is not a supported model name.
    """
    self.name = name

    # =======================================
    # Setup normalization method
    # =======================================
    assert norm_method in {
        Normalize.ZSCORE, Normalize.UNIFY, "DEFAULT", None
    }
    if norm_method == "DEFAULT":
        # Both averaging aggregators need scores on a comparable scale.
        if aggregate_method in (Aggregator.AVERAGE,
                                Aggregator.AVERAGE_THRESHOLD):
            self.norm_method = Normalize.ZSCORE
        else:
            self.norm_method = None
    else:
        self.norm_method = norm_method

    if base_model == kNN.NAME:
        self.mdl = kNN(neighbor, self.norm_method)
    elif base_model == LOF.NAME:
        self.mdl = LOF(neighbor, self.norm_method)
    elif base_model is None:
        # Identity check: None is a singleton and must not go through __eq__.
        self.mdl = None
    else:
        raise Exception(f"Base Model: {base_model} is not supported.")
def outliers_per_subspace():
    """Count, per feature subspace, how many true outliers each detector flags.

    For every dataset the model is fitted on every non-empty feature subset
    (2**d - 1 subspaces for d features). Each subspace output is thresholded
    on its own with several aggregator/threshold pairs, and the number of
    flagged true outliers (``Y == 1``) is recorded. Results are written as
    JSON to ``{model}_outliers_per_subspace.json``.

    Outliers that no subspace flags are printed for inspection, followed by
    the inliers of the dataset.
    """
    import json

    outputs = defaultdict(dict)
    model = 'gke'
    datasets = [
        Dataset.GLASS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
        Dataset.VOWELS, Dataset.PIMA, Dataset.THYROID
    ]
    aggregator_configs = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
    ]

    for dataset in datasets:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)

        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        feature_index = np.arange(_X.shape[1])

        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # One (feature subset, model output) pair per non-empty subspace.
        model_outputs = []
        for subspace_size in range(1, len(feature_index) + 1):
            for features in combinations(feature_index, subspace_size):
                model_outputs.append(
                    (features, mdl.fit(X_gpu_tensor[:, np.asarray(features)])))
        logger.info(f"Total model {len(model_outputs)}")

        for name, aggregator, threshold in aggregator_configs:
            logger.info(f"---------------{name}------------------------")
            outlier_num_per_subspace = []
            for _, subspace_scores in model_outputs:
                # Threshold a single subspace by aggregating a one-item list.
                y_scores = np.array(aggregator([subspace_scores], threshold))
                outlier_num_per_subspace.append(int(np.sum(y_scores[Y == 1])))

            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier_dist": outlier_num_per_subspace,
                "outlier_total": int(outlier_num),
                "subspace_total": len(model_outputs)
            }

        # The aggregator expects bare model outputs, as in the calls above,
        # so strip the feature tuples before aggregating over all subspaces.
        # NOTE(review): no threshold is passed here, exactly as before; this
        # relies on count_rank_threshold having a default - confirm.
        total_score = Aggregator.count_rank_threshold(
            [subspace_scores for _, subspace_scores in model_outputs])
        for idx, label in enumerate(Y):
            # Index by point position (idx), not by the label value.
            if label == 1 and total_score[idx] == 0:
                print("FN Outliers", X_gpu_tensor[idx])
        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source; assumed to run once per dataset after the scan above.
        print("Inliers", X_gpu_tensor[Y == 0])

    output_file = f"{model}_outliers_per_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
def compare_auc():
    """Compare aggregation strategies by ROC AUC and precision on each dataset.

    For every dataset the model is fitted on every non-empty feature subset.
    The per-subspace outputs are combined with each aggregator/threshold
    pair and scored against the labels with ``roc_auc_score`` and
    ``precision_n_scores``. Results are written as JSON to
    ``{model_name}_performance.json``.
    """
    # Imported locally, like the sibling experiment functions, so this
    # function does not depend on a module-level ``json`` import.
    import json

    outputs = defaultdict(dict)
    # Set to "knn" to evaluate the kNN base model instead.
    model_name = "gke"
    datasets = [
        Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
        Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]
    # A threshold of None means the aggregator takes no threshold argument.
    aggregator_configs = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
        ("AVG", Aggregator.average, None),
        ("AVG", Aggregator.average_threshold, 1),
        ("AVG", Aggregator.average_threshold, 2),
    ]

    for dataset in datasets:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)

        _X, Y = DataLoader.load(dataset)
        feature_index = np.arange(_X.shape[1])

        if model_name == "knn":
            X_gpu_tensor = _X
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
        elif model_name == "gke":
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)
            mdl = GKE_GPU(Normalize.ZSCORE)

        # One model output per non-empty feature subset.
        model_outputs = []
        for subspace_size in range(1, len(feature_index) + 1):
            for features in combinations(feature_index, subspace_size):
                model_outputs.append(
                    mdl.fit(X_gpu_tensor[:, np.asarray(features)]))
        logger.info(f"Total model {len(model_outputs)}")

        for name, aggregator, threshold in aggregator_configs:
            if threshold is None:
                y_scores = np.array(aggregator(model_outputs))
            else:
                y_scores = np.array(aggregator(model_outputs, threshold))

            roc = roc_auc_score(Y, y_scores)
            precision = precision_n_scores(Y, y_scores)
            logger.info(
                f"ROC of {name}-{threshold} {roc} Precision {precision}")
            outputs[dataset][f"{name}_{threshold}"] = {
                "roc": roc,
                "precision": precision
            }

    output_file = f"{model_name}_performance.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
def load_model_and_data(dataset, model):
    """Load a dataset and build the matching model and input representation.

    Args:
        dataset: Dataset identifier passed to ``DataLoader.load``.
        model: ``"knn"`` or ``"gke"``.

    Returns:
        A tuple ``(mdl, X_gpu_tensor, Y, outlier_num, feature_index)``:
        the model instance, the data in the form the model consumes (the
        raw array for kNN, the result of ``GKE_GPU.convert_to_tensor`` for
        GKE), the labels, the number of points with label 1 as an ``int``,
        and an array of the column indices ``0 .. n_features - 1``.

    Raises:
        ValueError: If ``model`` is not a supported model name.
    """
    # Fail fast with a clear message instead of hitting an unbound local at
    # the return statement after the dataset has already been loaded.
    if model not in ("knn", "gke"):
        raise ValueError(f"Model: {model} is not supported.")

    logger.info("=" * 50)
    logger.info(f" Dataset {dataset} ")
    logger.info("=" * 50)

    _X, Y = DataLoader.load(dataset)
    outlier_num = int(np.sum(Y == 1))
    feature_index = np.arange(_X.shape[1])

    if model == "knn":
        # Neighbourhood size: 3% of the points, but never fewer than 10.
        mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                  Normalize.ZSCORE)
        X_gpu_tensor = _X
    else:
        mdl = GKE_GPU(Normalize.ZSCORE)
        X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

    return mdl, X_gpu_tensor, Y, outlier_num, feature_index
def outlier_correlation_subspace():
    """Greedily pick a small set of subspaces covering all detected outliers.

    For every dataset the model is fitted on every non-empty feature subset.
    For each aggregator/threshold pair, each subspace is thresholded on its
    own and the true outliers (``Y == 1``) it flags are collected. Subspaces
    are then selected greedily: the one covering the most still-uncovered
    outliers is taken repeatedly until every detected outlier is covered.
    Results are written as JSON to
    ``{model}_outliers_correlation_subspace.json``.
    """
    import json

    outputs = defaultdict(dict)
    model = 'gke'
    datasets = [
        Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
        Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]
    aggregator_configs = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
    ]

    for dataset in datasets:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)

        _X, Y = DataLoader.load(dataset)
        outlier_num = np.sum(Y == 1)
        feature_index = np.arange(_X.shape[1])

        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # model_outputs[k] is the output of the subspace whose feature
        # indices are subspace_idx_to_features[k].
        model_outputs = []
        subspace_idx_to_features = []
        for subspace_size in range(1, len(feature_index) + 1):
            for features in combinations(feature_index, subspace_size):
                model_outputs.append(
                    mdl.fit(X_gpu_tensor[:, np.asarray(features)]))
                # Plain ints so the feature lists are JSON serializable.
                subspace_idx_to_features.append([int(j) for j in features])
        logger.info(f"Total model {len(model_outputs)}")

        for name, aggregator, threshold in aggregator_configs:
            # subspace id -> true outliers flagged by that subspace alone.
            detected_by_subspace = {}
            # Every true outlier flagged by at least one subspace.
            uncovered = set()
            for subspace_id, model_output in enumerate(model_outputs):
                flags = aggregator([model_output], threshold)
                detected = {
                    point_idx
                    for point_idx, if_outlier in enumerate(flags)
                    if if_outlier == 1 and Y[point_idx] == 1
                }
                detected_by_subspace[subspace_id] = detected
                uncovered |= detected

            detected_outliers_num = len(uncovered)
            logger.info(
                f"Detected outliers {detected_outliers_num}/{outlier_num}")

            # Greedy set cover. ``remaining`` maps each subspace to the
            # outliers it would newly cover. It is rebuilt with fresh sets
            # each round, so ``detected_by_subspace`` is never mutated and
            # no defensive copy is needed.
            remaining = detected_by_subspace
            selected_subspaces = []
            while uncovered:
                # max() returns the first maximal item, which is the same
                # pick as taking element 0 of a stable descending sort.
                selected_subspace_id, covered_outliers = max(
                    remaining.items(), key=lambda item: len(item[1]))
                uncovered = uncovered - covered_outliers
                remaining = {
                    subspace_id: found - covered_outliers
                    for subspace_id, found in remaining.items()
                }
                selected_subspaces.append(selected_subspace_id)

            for i in selected_subspaces:
                print(
                    f"Features {subspace_idx_to_features[i]} Outliers {len(detected_by_subspace[i])}"
                )
            print(f"{len(selected_subspaces)}/{len(model_outputs)}")

            outputs[f"{name}_{threshold}"][dataset] = {
                "select_subspace": [(subspace_idx_to_features[i],
                                     list(detected_by_subspace[i]))
                                    for i in selected_subspaces],
                "outliers": detected_outliers_num,
                "total_subspace": len(model_outputs),
                "total_outliers": int(outlier_num),
                "dimension": len(feature_index)
            }

    output_file = f"{model}_outliers_correlation_subspace.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
def _histogram_with_zero_bin(subspace_counts, bin_num, upper):
    """Histogram counts in [0.1, upper] and prepend the number of exact zeros.

    Returns ``(hist, edges)`` where ``hist`` has ``bin_num + 1`` entries
    (the zero bucket first) and ``edges`` are the ``bin_num + 1`` bin edges
    reported by ``np.histogram``.
    """
    hist, edges = np.histogram(subspace_counts, bin_num, range=(0.1, upper))
    zero_count = int(np.count_nonzero(np.asarray(subspace_counts) == 0))
    print(zero_count)
    print(hist)
    hist = np.insert(hist, 0, zero_count)
    print(hist)
    return hist, edges


def subspace_count_per_point():
    """Record how many subspaces flag each point, split by outlier/inlier.

    For every dataset the model is fitted on every non-empty feature subset.
    For each aggregator/threshold pair the aggregated value per point is
    histogrammed separately for outliers (``Y == 1``) and inliers
    (``Y == 0``), with a dedicated leading bucket for points whose value is
    exactly 0. Normalized histograms plus mean/median are written as JSON to
    ``{model}_subspace_count_per_point.json``.

    Raises:
        AssertionError: If a histogram does not account for every point of
            its group (e.g. a value falls outside the histogram range).
    """
    import json

    BIN_NUM = 10
    outputs = defaultdict(dict)
    model = 'knn'
    datasets = [
        Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
        Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]
    aggregator_configs = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
    ]

    for dataset in datasets:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)

        _X, Y = DataLoader.load(dataset)
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)
        feature_index = np.arange(_X.shape[1])

        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        elif model == "gke":
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # One model output per non-empty feature subset.
        model_outputs = []
        for subspace_size in range(1, len(feature_index) + 1):
            for features in combinations(feature_index, subspace_size):
                model_outputs.append(
                    mdl.fit(X_gpu_tensor[:, np.asarray(features)]))
        logger.info(f"Total model {len(model_outputs)}")
        subspace_total = len(model_outputs)

        for name, aggregator, threshold in aggregator_configs:
            y_scores = np.array(aggregator(model_outputs, threshold))
            outlier_subspaces = y_scores[Y == 1]
            inlier_subspaces = y_scores[Y == 0]

            outlier_hist, bin_edges = _histogram_with_zero_bin(
                outlier_subspaces, BIN_NUM, subspace_total)
            # Bin edges expressed as a fraction of all subspaces.
            bin_labels = [f"{edge / subspace_total:.1f}" for edge in bin_edges]
            assert np.sum(outlier_hist) == outlier_num

            inlier_hist, _ = _histogram_with_zero_bin(
                inlier_subspaces, BIN_NUM, subspace_total)
            assert np.sum(inlier_hist) == inlier_num

            outlier_hist_percent = outlier_hist / outlier_num
            inlier_hist_percent = inlier_hist / inlier_num

            logger.info(f"Outlier {outlier_num} Inlier {inlier_num}")
            logger.info(
                f"Outlier Median {np.median(outlier_subspaces)} Inlier Median {np.median(inlier_subspaces)}"
            )
            logger.info(
                f"Outlier Mean {np.mean(outlier_subspaces)} Inlier Mean {np.mean(inlier_subspaces)}"
            )
            logger.info(f"Bin {bin_labels}")
            logger.info(f"Outlier dist {outlier_hist}")
            logger.info(f"Inlier dist {inlier_hist}")
            logger.info(f"Outlier dist density {outlier_hist_percent}")
            logger.info(f"Inlier dist density {inlier_hist_percent}")

            # Convert NumPy scalars to plain floats so json.dumps accepts
            # them regardless of the array dtype.
            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": outlier_hist_percent.tolist(),
                "inlier": inlier_hist_percent.tolist(),
                "bin": bin_labels,
                "outlier_mean": float(np.mean(outlier_subspaces)),
                "inlier_mean": float(np.mean(inlier_subspaces)),
                "outlier_median": float(np.median(outlier_subspaces)),
                "inlier_median": float(np.median(inlier_subspaces)),
            }

    output_file = f"{model}_subspace_count_per_point.json"
    with open(output_file, "w") as w:
        w.write(f"{json.dumps(outputs)}\n")
    logger.info(f"Output file {output_file}")
def point_count_per_dim():
    """Measure, per subspace dimensionality, the share of points flagged.

    For every dataset the model is fitted on every non-empty feature subset
    and the outputs are grouped by subset size. For each aggregator/threshold
    pair and each size, the outputs of that size are aggregated and the
    fraction of outliers (``Y == 1``) and of inliers (``Y == 0``) with an
    aggregated value above 0 is recorded. Results are written as JSON to
    ``{model}_point_count_per_dim.json``.

    Raises:
        AssertionError: If the number of subset-size groups differs from the
            number of features.
    """
    import json

    outputs = defaultdict(dict)
    model = "knn"
    datasets = [
        Dataset.VOWELS, Dataset.WINE, Dataset.BREASTW, Dataset.ANNTHYROID,
        Dataset.GLASS, Dataset.PIMA, Dataset.THYROID
    ]
    aggregator_configs = [
        ("RANK", Aggregator.count_rank_threshold, 0.05),
        ("RANK", Aggregator.count_rank_threshold, 0.10),
        ("STD", Aggregator.count_std_threshold, 1),
        ("STD", Aggregator.count_std_threshold, 2),
    ]

    for dataset in datasets:
        logger.info("=" * 50)
        logger.info(f" Dataset {dataset} ")
        logger.info("=" * 50)

        _X, Y = DataLoader.load(dataset)
        feature_index = np.arange(_X.shape[1])
        outlier_num, inlier_num = np.sum(Y == 1), np.sum(Y == 0)

        # The model built must match the name used for the output file; the
        # branches were previously swapped, so "knn" ran GKE_GPU.
        if model == "knn":
            mdl = kNN(max(10, int(np.floor(0.03 * _X.shape[0]))),
                      Normalize.ZSCORE)
            X_gpu_tensor = _X
        else:
            mdl = GKE_GPU(Normalize.ZSCORE)
            X_gpu_tensor = GKE_GPU.convert_to_tensor(_X)

        # subset size -> model outputs of all subspaces with that size.
        model_outputs_all = defaultdict(list)
        for subspace_size in range(1, len(feature_index) + 1):
            for features in combinations(feature_index, subspace_size):
                model_outputs_all[subspace_size].append(
                    mdl.fit(X_gpu_tensor[:, np.asarray(features)]))
        assert len(model_outputs_all) == len(feature_index)

        for name, aggregator, threshold in aggregator_configs:
            dim_outlier_ratio = [0] * len(feature_index)
            dim_inlier_ratio = [0] * len(feature_index)
            for subspace_size, model_outputs in model_outputs_all.items():
                y_scores = np.array(aggregator(model_outputs, threshold))
                flagged_outliers = np.count_nonzero(y_scores[Y == 1] > 0)
                flagged_inliers = np.count_nonzero(y_scores[Y == 0] > 0)
                # Plain floats keep the result JSON serializable.
                dim_outlier_ratio[subspace_size - 1] = float(
                    flagged_outliers / outlier_num)
                dim_inlier_ratio[subspace_size - 1] = float(
                    flagged_inliers / inlier_num)

            outputs[f"{name}_{threshold}"][dataset] = {
                "outlier": dim_outlier_ratio,
                "inlier": dim_inlier_ratio,
                "feature_index": feature_index.tolist()
            }

    with open(f"{model}_point_count_per_dim.json", "w") as w:
        w.write(f"{json.dumps(outputs)}")