def setUp(self):
    """Create the synthetic data set and fit the COPOD detector under test."""
    # Fixture sizes and thresholds read by the individual test methods.
    self.n_train, self.n_test = 200, 100
    self.contamination, self.roc_floor = 0.1, 0.8

    data_options = dict(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=10,
        contamination=self.contamination,
        random_state=42,
    )
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(**data_options)

    self.clf = COPOD(contamination=self.contamination)
    self.clf.fit(self.X_train)
def model_init(self, model):
    """Initialise the detector and the scaler of a single model.

    The detector type is selected by ``self.model``; any value without a
    dedicated entry below falls back to HBOS.
    """
    # One zero-argument factory per supported detector type, so only the
    # selected detector is ever instantiated.
    factories = {
        'pca': lambda: PCA(contamination=self.contamination),
        'loda': lambda: LODA(contamination=self.contamination),
        'iforest': lambda: IForest(n_estimators=50,
                                   bootstrap=True,
                                   behaviour='new',
                                   contamination=self.contamination),
        'cblof': lambda: CBLOF(n_clusters=3,
                               contamination=self.contamination),
        'feature_bagging': lambda: FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination),
        'copod': lambda: COPOD(contamination=self.contamination),
        'hbos': lambda: HBOS(contamination=self.contamination),
    }
    make_detector = factories.get(self.model, factories['hbos'])

    self.models[model] = make_detector()
    self.custom_model_scalers[model] = MinMaxScaler()
def setUp(self):
    """Load cardio.mat (or generated data) and fit the SUOD ensemble."""
    # The data file is expected in a 'data' directory next to this module.
    this_directory = path.abspath(path.dirname(__file__))
    mat_file = 'cardio.mat'

    try:
        mat = loadmat(path.join(this_directory, 'data', mat_file))
    except (TypeError, IOError):
        # Source data is missing or unreadable: fall back to generated data.
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)
    else:
        X = mat['X']
        y = mat['y'].ravel()
        X, y = check_X_y(X, y)

    split = train_test_split(X, y, test_size=0.4, random_state=42)
    self.X_train, self.X_test, self.y_train, self.y_test = split

    self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
    self.clf = SUOD(base_estimators=self.base_estimators)
    self.clf.fit(self.X_train)
    self.roc_floor = 0.7
def setUp(self):
    """Fit a COPOD detector with ``n_jobs=2`` plus a default-constructed copy."""
    self.n_train, self.n_test = 200, 100
    self.contamination, self.roc_floor = 0.1, 0.8

    data_options = dict(
        n_train=self.n_train,
        n_test=self.n_test,
        n_features=10,
        contamination=self.contamination,
        random_state=42,
    )
    (self.X_train, self.y_train,
     self.X_test, self.y_test) = generate_data(**data_options)

    # Detector under test: fitted with two jobs.
    self.clf = COPOD(contamination=self.contamination, n_jobs=2)
    self.clf.fit(self.X_train)

    # Reference detector built without n_jobs (the single thread copy),
    # fitted on the same training data.
    self.clf_ = COPOD(contamination=self.contamination)
    self.clf_.fit(self.X_train)
def main():
    """Run ``runByScaler`` once per scaler, in parallel, over the model suite."""
    root = 'Unsupervised_Anamaly_Detection_csv'
    name = "30_Models"
    start, counts = 0, 90
    outer_jobs = 3   # workers handed to Parallel (one task per scaler)
    model_jobs = 4   # forwarded to runByScaler as its CPUS argument

    # Forwarded to runByScaler as ``other_models``.
    other_model_names = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF',
        'HBOS', 'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF',
        'COPOD', 'SOD', 'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD',
        'SoGaal', 'MoGaal', 'VAE', 'AutoEncoder',
    ]

    # Detector instances keyed by the name they are run under.
    models = dict([
        ('BRM', BRM(bootstrap_sample_percent=70)),
        ('GM', GaussianMixture()),
        ('IF', IsolationForest()),
        ('OCSVM', OneClassSVM()),
        ('EE', EllipticEnvelope()),
        ('AvgKNN', KNN(method='mean')),
        ('LargestKNN', KNN(method='largest')),
        ('MedKNN', KNN(method='median')),
        ('PCA', PCA()),
        ('COF', COF()),
        ('LODA', LODA()),
        ('LOF', LOF()),
        ('HBOS', HBOS()),
        ('MCD', MCD()),
        ('AvgBagging', FeatureBagging(combination='average')),
        ('MaxBagging', FeatureBagging(combination='max')),
        ('CBLOF', CBLOF(n_clusters=10, n_jobs=4)),
        ('FactorAnalysis', FactorAnalysis()),
        ('KernelDensity', KernelDensity()),
        ('COPOD', COPOD()),
        ('SOD', SOD()),
        ('LSCPwithLODA', LSCP([LODA(), LODA()])),
        ('AveLMDD', LMDD(dis_measure='aad')),
        ('VarLMDD', LMDD(dis_measure='var')),
        ('IqrLMDD', LMDD(dis_measure='iqr')),
        ('SoGaal', SO_GAAL()),
        ('MoGaal', MO_GAAL()),
        ('VAE', VAE(encoder_neurons=[8, 4, 2])),
        ('AutoEncoder', AutoEncoder(hidden_neurons=[6, 3, 3, 6])),
        ('OCKRA', m_OCKRA()),
    ])

    run_one = delayed(runByScaler)
    tasks = (
        run_one(root, scaler, models, start, counts,
                other_models=other_model_names,
                CPUS=model_jobs,
                save_name=name)
        for scaler in ['no', 'std', 'minmax']
    )
    Parallel(n_jobs=outer_jobs)(tasks)
def remove_outlier(data, x, row, contamination):
    """Drop the samples of one class that COPOD flags as outliers.

    Parameters
    ----------
    data : 2-D numpy array whose column 512 holds the class label.
    x : class label to keep (0 for live, 1 for spoof).
    row : maximum number of matching samples to use (the first ``row``
        matching rows are taken).
    contamination : contamination rate passed to COPOD.

    Returns
    -------
    numpy array with the selected rows that COPOD labelled as inliers.
    Each row carries COPOD's label (0) as an extra last column.
    """
    # Rows of the requested class, capped at `row` samples.
    selected = data[data[:, 512] == x][0:row, :]

    # NOTE(review): 0:511 leaves column 511 out of the feature matrix even
    # though the label sits in column 512 -- confirm whether 0:512 was meant.
    train_features_x = selected[:, 0:511]

    clf = COPOD(contamination=contamination)
    clf.fit(train_features_x)

    # reshape(-1, 1) instead of reshape(row, 1): fewer than `row` samples may
    # match the class, in which case the fixed shape raised a ValueError.
    labels = np.asarray(clf.labels_).reshape(-1, 1)
    labelled = np.hstack((selected, labels))

    # The appended label is always the last column; indexing it as -1 (rather
    # than the hard-coded 513) stays correct for inputs wider than 513 columns.
    return labelled[labelled[:, -1] == 0]
def models_init(self):
    """Initialise one detector and one scaler for every model in scope.

    The detector type comes from the ``model`` configuration key (default
    ``'pca'``); unrecognised values fall back to HBOS.
    """
    self.model = self.configuration.get('model', 'pca')

    # Zero-argument factories: each call yields a fresh detector so that
    # no two models share an instance.
    factories = {
        'pca': lambda: PCA(contamination=self.contamination),
        'loda': lambda: LODA(contamination=self.contamination),
        'iforest': lambda: IForest(n_estimators=50,
                                   bootstrap=True,
                                   behaviour='new',
                                   contamination=self.contamination),
        'cblof': lambda: CBLOF(n_clusters=3,
                               contamination=self.contamination),
        'feature_bagging': lambda: FeatureBagging(
            base_estimator=PCA(contamination=self.contamination),
            contamination=self.contamination),
        'copod': lambda: COPOD(contamination=self.contamination),
        'hbos': lambda: HBOS(contamination=self.contamination),
    }
    make_detector = factories.get(self.model, factories['hbos'])

    self.models = {
        model: make_detector() for model in self.models_in_scope
    }
    self.custom_model_scalers = {
        model: MinMaxScaler() for model in self.models_in_scope
    }
def pred_COPOD(self, comp_with="openaq"):
    """Label samples with COPOD and map the labels to locations.

    Parameters
    ----------
    comp_with : ``"openaq"`` fits one COPOD on ``self.X_o`` and uses all of
        its training labels; ``"cams"`` fits one COPOD per element of
        ``self.X_c`` and keeps the label of each element's last sample.

    Returns
    -------
    The three values produced by ``self.pred_location`` for the labels.

    Raises
    ------
    ValueError
        If ``comp_with`` is neither ``"openaq"`` nor ``"cams"``.
    """
    self.comp_with = comp_with

    if comp_with == "openaq":
        # len() works for lists and arrays alike, whereas `== []` on an
        # array is an element-wise comparison rather than an emptiness test.
        if len(self.X_o) == 0:
            pred = []
        else:
            self.clf = COPOD()
            self.clf.fit(self.X_o)
            pred = self.clf.labels_
    elif comp_with == "cams":
        pred = []
        for each_X in self.X_c:
            self.clf = COPOD()
            self.clf.fit(each_X)
            pred.append(self.clf.labels_[-1])
    else:
        # Previously fell through with `pred` unbound (UnboundLocalError).
        raise ValueError(
            "comp_with must be 'openaq' or 'cams', got {!r}".format(comp_with))

    A_location, B_location, C_location = self.pred_location(pred)
    return A_location, B_location, C_location
def detect_anomaly(df_floats: pd.DataFrame,
                   train_size: float,
                   outliers_rate: float,
                   classifier: str,
                   plot: bool = False):
    """
    Return binary classified outlier and raw outlier score.

    Performs training of anomaly detection model on subset of dataset and
    returns binary label and decision score for whole dataset.

    Parameters
    ----------
    df_floats: pd.DataFrame with elements as floats.
    train_size: proportion of dataset to be used for training anomaly
        detection model. Values above 1 are read as percentages.
    outliers_rate: proportion of training set to be considered outlier.
        Values of 1 or more are read as percentages.
    classifier: name of the anomaly detection algorithm. The full name or
        the abbreviation in parentheses (e.g. ``'KNN'``) is matched first;
        otherwise the first detector whose name contains the string is used.
    plot: plots 2d contourf of anomaly detection scores.

    Returns
    -------
    y_labels: numpy array of the same length as df_floats that assigns
        0/1 (inlier/outlier) to each observation according to fitted model.
    y_scores: numpy array of the same length as df_floats that assigns
        outlier scores to each observation according to fitted model.

    Raises
    ------
    Warning
        If df_floats has fewer than 8 rows, or classifier is ``'all'``.
    NameError
        If classifier is empty or matches no known detector.
    """
    if df_floats.shape[0] < 8:
        raise Warning(
            'Not enough measurements. Please use DataFrame with at least 8 measurements.'
        )

    if train_size > 1:
        train_size = train_size / 100

    # TODO: Find out empirical way to set contamination level - Tukey's method
    if outliers_rate >= 1:
        outliers_rate = outliers_rate / 100

    random_state = np.random.RandomState(42)

    # Factories rather than instances: only the selected detector is built.
    classifiers = {
        'Average KNN (AKNN)':
        lambda: KNN(method='mean', contamination=outliers_rate),
        'Cluster-based Local Outlier Factor (CBLOF)':
        lambda: CBLOF(contamination=outliers_rate,
                      check_estimator=False,
                      random_state=random_state),
        'Copula based Outlier Detection (COPOD)':
        lambda: COPOD(contamination=outliers_rate),
        'Histogram-base Outlier Detection (HBOS)':
        lambda: HBOS(contamination=outliers_rate),
        'Isolation Forest (IForest)':
        lambda: IForest(contamination=outliers_rate,
                        random_state=random_state),
        'K Nearest Neighbors (KNN)':
        lambda: KNN(contamination=outliers_rate),
        'One-Class SVM (OCSVM)':
        lambda: OCSVM(contamination=outliers_rate),
        'Principal component analysis (PCA)':
        lambda: PCA(contamination=outliers_rate),
    }

    if classifier == 'all':
        raise Warning('This option is currently unsupported.'
                      '\nPlease use one of those classifiers:'
                      '\n{}.'.format(list(classifiers.keys())))

    # Resolve the detector name. Exact name / abbreviation matches win so
    # that 'KNN' selects 'K Nearest Neighbors (KNN)' and not the earlier
    # 'Average KNN (AKNN)', which merely contains the substring.
    clf_name = ''
    if classifier:
        abbreviation = '({})'.format(classifier)
        exact = [
            name for name in classifiers
            if name == classifier or name.endswith(abbreviation)
        ]
        partial = [name for name in classifiers if classifier in name]
        candidates = exact or partial
        if candidates:
            clf_name = candidates[0]
    if not clf_name:
        raise NameError('Unknown classifier. '
                        'Please use one of those: {}.'.format(
                            list(classifiers.keys())))

    # TODO: Perform scaling of data ONLY for AKNN, CBLOF, HBOS, KNN, OCSVM. Other classifiers are not influenced.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(df_floats)
    df_scaled = pd.DataFrame(scaled,
                             index=df_floats.index,
                             columns=df_floats.columns)

    # Only the training part of the split is used; labels and scores are
    # computed for the whole scaled data set below.
    x_train, x_test = train_test_split(df_scaled, train_size=train_size)

    clf = classifiers[clf_name]()
    clf.fit(x_train)
    y_labels = clf.predict(df_scaled)  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_function(df_scaled)  # raw outlier scores

    if plot:
        plot_outlier_detection(df_scaled, y_labels, clf, clf_name, scaler)

    return y_labels, y_scores
def execute(self):
    """Train the autoencoder detector, run the baseline detectors, report.

    Reads ``self.input_file`` (CSV without header, label in the last column:
    1 for positive rows, -1 for negative rows), trains the autoencoder on a
    70% sample of the positive rows, and evaluates it on the remaining
    positive rows plus the negative rows. Depending on ``self.eval_cat``
    ("linear", "prob", "ensemble", "proximity", "nn") one family of baseline
    detectors is evaluated on the same validation data. Result tables are
    printed and, when ``self.printout`` is set, written to that CSV path.
    """
    result_headers = [
        "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV", "TS",
        "PT", "ACC", "F1", "MCC"
    ]
    evaluation_results = []

    print("Loading training data...")
    chunks = []
    for i, chunk in enumerate(
            pd.read_csv(self.input_file,
                        header=None,
                        chunksize=self.chunk_size)):
        print("Reading chunk: %d" % (i + 1))
        chunks.append(chunk)
    # Concatenate once: DataFrame.append copied the whole frame on every
    # chunk and no longer exists in pandas 2.x.
    data = pd.concat(chunks) if chunks else pd.DataFrame()

    input_dimensionality = len(data.columns) - 1
    print("Input Dimensionality: %d" % (input_dimensionality))

    # With header=None the columns are labelled by position, so the label
    # column's name equals the number of feature columns.
    label_column = input_dimensionality
    positive_data = data[data[label_column] ==
                         1].iloc[:, :input_dimensionality]
    negative_data = data[data[label_column] ==
                         -1].iloc[:, :input_dimensionality]

    training_data = positive_data.sample(frac=0.70)
    positive_validation_data = positive_data.drop(training_data.index)

    if self.neg_cont and self.neg_cont > 0:
        print("Negative Contamination: %0.4f" % (self.neg_cont))
        num_negative = math.floor(
            self.neg_cont *
            (len(negative_data) + len(positive_validation_data)))
        # Shuffle deterministically, then keep the first num_negative
        # negative rows.
        negative_data = data.sample(frac=1, random_state=200)[
            data[label_column] == -1].iloc[:num_negative,
                                           :input_dimensionality]

    negative_validation_data = negative_data.copy()

    # Re-attach the labels so features and labels stay aligned after the
    # positive and negative validation rows are stacked.
    temp_positive = positive_validation_data.copy()
    temp_positive[input_dimensionality] = 1
    temp_negative = negative_data.copy()
    temp_negative[input_dimensionality] = -1

    validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                            ignore_index=True)
    validation_data = validation_data_with_labels.iloc[
        :, :input_dimensionality]
    validation_labels = validation_data_with_labels.iloc[:, -1:].values

    # Convert to tensor
    training_data = torch.tensor(training_data.values).float()
    validation_data = torch.tensor(validation_data.values).float()

    print("Validation Data:")
    print(validation_data)

    ## AE-D TRAINING ##
    print("Initializing autoencoder...")
    net = Autoencoder(layers=self.layers,
                      device=self.device,
                      add_syn=self.add_syn)
    net.to(self.device)
    print(net)

    print("Training Stochastic Autoencoder...")
    net.fit(training_data,
            epochs=self.epochs,
            lr=self.lr,
            batch_size=self.batch_size)

    predictions = net.predict(validation_data)

    r = ["AE-D", *performance_metrics(validation_labels, predictions)]
    evaluation_results.append(r)

    print("AE-D Results:")
    print(tabulate([r], result_headers, tablefmt="grid"))

    # Convert back to CPU before other methods
    validation_data = validation_data.cpu()

    def _evaluate(algo, detector):
        # Evaluate one baseline detector and record its result row.
        evaluation_results.append(
            train_and_evaluate_classifier(algo, detector, validation_data,
                                          validation_labels))

    # Only the detector family named by self.eval_cat is evaluated.
    if self.eval_cat == "linear":
        print("Initiating training for linear detectors...")

        ## MCD ##
        print("Training MCD...")
        _evaluate("MCD", MCD())

        ## ROBUST COVARIANCE ##
        print("Training Robust Covariance...")
        _evaluate("ROB-COV", EllipticEnvelope())

        ## ONE CLASS SVM TRAINING ##
        print("Training OneClassSVM...")
        _evaluate("OC-SVM", svm.OneClassSVM(gamma="auto"))

    elif self.eval_cat == "prob":
        # The ABOD and SOS evaluations are currently disabled.

        ## COPOD ##
        print("Training COPOD...")
        _evaluate("COPOD", COPOD())

    elif self.eval_cat == "ensemble":
        ## ISOLATION FOREST TRAINING ##
        print("Training Isolation Forest...")
        _evaluate("ISO-F", IsolationForest(random_state=0))

        ## LODA ##
        print("Training LODA...")
        _evaluate("LODA", LODA())

        # The LSCP evaluation is currently disabled.

    elif self.eval_cat == "proximity":
        ## LOCAL OUTLIER FACTOR ##
        print("Training Local Outlier Factor...")
        _evaluate("LOC-OF", LocalOutlierFactor(novelty=True))

        ## CBLOF ##
        print("Training CBLOF...")
        _evaluate("CBLOF", CBLOF())

        ## HBOS ##
        print("Training HBOS...")
        _evaluate("HBOS", HBOS())

    elif self.eval_cat == "nn":
        ## VAE ##
        print("Training VAE...")
        # list.reverse() reverses in place and returns None, which used to
        # pass decoder_neurons=None and flip self.layers as a side effect.
        # Build independent copies instead.
        encoder_layers = list(self.layers)
        decoder_layers = encoder_layers[::-1]
        _evaluate(
            "VAE",
            VAE(encoder_neurons=encoder_layers,
                decoder_neurons=decoder_layers))

        ## SO_GAAL ##
        print("Training SO_GAAL...")
        _evaluate("SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs))

        ## MO_GAAL ##
        print("Training MO_GAAL...")
        _evaluate("MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs))

    ## EVALUATE RESULTS ##
    if self.eval_cat != "none":
        print("Aggregated Results:")
        print(tabulate(evaluation_results, result_headers, tablefmt="grid"))

    ## DATASET METRICS ##
    len_training_data_points = len(training_data)
    len_positive_validations = len(positive_validation_data)
    len_negative_validations = len(negative_validation_data)
    len_validations = len_positive_validations + len_negative_validations

    metrics_results = [
        ["Training Data Points", len_training_data_points],
        ["# Normal Points", len_positive_validations],
        ["# Anomalies", len_negative_validations],
        [
            "Contamination Percentage",
            math.floor((len_negative_validations / len_validations) * 100)
        ]
    ]

    ## EVALUATE RESULTS ##
    print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

    if self.printout:
        print("Saving results to %s" % (self.printout))
        df = pd.DataFrame(evaluation_results)
        df.to_csv(self.printout, header=None, index=False)
class TestCOPOD(unittest.TestCase):
    """Unit tests for the COPOD detector's fit / predict interface."""

    def setUp(self):
        """Generate a synthetic split and fit the detector on its train part."""
        self.n_train, self.n_test = 200, 100
        self.contamination, self.roc_floor = 0.1, 0.8

        data_options = dict(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=10,
            contamination=self.contamination,
            random_state=42,
        )
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(**data_options)

        self.clf = COPOD(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def _check_probability_range(self, proba):
        """Assert that every probability lies inside [0, 1]."""
        assert (proba.min() >= 0)
        assert (proba.max() <= 1)

    def test_parameters(self):
        # Fitted attributes must exist and be populated.
        for fitted_attr in ('decision_scores_', 'labels_', 'threshold_'):
            assert (hasattr(self.clf, fitted_attr)
                    and getattr(self.clf, fitted_attr) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # ranking quality must reach the configured floor
        assert (roc_auc_score(self.y_test, scores) >= self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._check_probability_range(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._check_probability_range(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._check_probability_range(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # Default scoring first, then each explicitly named scorer.
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scoring in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scoring)

        # An unknown scorer name must be rejected.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of scores and ranks must agree (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of scores and ranks must agree (within tolerance)
        assert_allclose(rankdata(ranks), rankdata(scores), atol=3)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    # def test_plot(self):
    #     os, cutoff1, cutoff2 = self.clf.explain_outlier(ind=1)
    #     assert_array_less(0, os)

    def test_model_clone(self):
        # Cloning the fitted detector must not raise.
        clone(self.clf)

    def tearDown(self):
        pass
def evaluation_od_train(x, y, data_name, model_name="iforest", chosen_subspace=None):
    """
    Score every anomaly in every feature subspace with an anomaly detector and
    write two CSV files under ``eva_root + "data_od_evaluation/"``:
    ``<data_name>_score_<model_name>.csv`` (normalised score of each anomaly in
    each subspace) and ``<data_name>_gt_<model_name>.csv`` (for each anomaly,
    the subspace in which it obtains the highest score).

    :param x: data matrix
    :param y: class information; entries equal to 1 mark the anomalies
    :param data_name: the data set name, used for naming the output files
    :param model_name: anomaly detector name: "iforest" (default), "copod" or "hbos"
    :param chosen_subspace: use this to only evaluate a subset of the power set of full feature space
    :return: (anomaly_score_df, g_truth_df): the score table and the
        ground-truth table mapping each anomaly idx to its ground truth
        feature subspace
    :raises ValueError: if model_name is not a supported detector
    """
    detector_classes = {"iforest": IForest, "copod": COPOD, "hbos": HBOS}
    # Validate up front so an unsupported name fails before any work is done
    # (it used to be detected only inside the scoring loop).
    if model_name not in detector_classes:
        raise ValueError("unsupported od model")
    detector_class = detector_classes[model_name]

    dim = x.shape[1]
    ano_idx = np.where(y == 1)[0]
    n_ano = len(ano_idx)

    # get all the possible feature subset or just use given subset list
    f_subsets = utils.get_subset_candidate(dim, chosen_subspace)

    # score anomalies in each subspace, generate the score matrix
    n_subsets = len(f_subsets)
    score_matrix = np.zeros([n_ano, n_subsets])
    for i in tqdm(range(n_subsets)):
        x_subset = x[:, f_subsets[i]]
        clf = detector_class()
        clf.fit(x_subset)
        # scores are normalised over all samples, then the anomalies' rows kept
        od_score = utils.min_max_norm(clf.decision_scores_)
        score_matrix[:, i] = od_score[ano_idx]

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    out_dir = eva_root + "data_od_evaluation/"
    os.makedirs(out_dir, exist_ok=True)

    # score matrix to df, with the anomaly index as the first column
    anomaly_score_df = pd.DataFrame(data=score_matrix, columns=[str(s) for s in f_subsets])
    anomaly_score_df.insert(0, "ano_idx", ano_idx)
    path1 = out_dir + data_name + "_score_" + model_name + ".csv"
    anomaly_score_df.to_csv(path1, index=False)

    # get the ground truth (one subspace for each anomaly that the anomaly can obtain the highest anomaly score)
    exp_subspaces = [str(f_subsets[int(np.argmax(ano_score))]) for ano_score in score_matrix]
    g_truth_df = pd.DataFrame({"ano_idx": ano_idx, "exp_subspace": exp_subspaces})
    # astype returns a new frame; the result was previously discarded
    g_truth_df = g_truth_df.astype({"exp_subspace": "object"})

    path2 = out_dir + data_name + "_gt_" + model_name + ".csv"
    g_truth_df.to_csv(path2, index=False)
    return anomaly_score_df, g_truth_df
generate_data(n_train=n_train, n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train SUOD clf_name = 'SUOD' # initialized a group of outlier detectors for acceleration detector_list = [ LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=35), COPOD(), IForest(n_estimators=100), IForest(n_estimators=200) ] # decide the number of parallel process, and the combination method clf = SUOD(base_estimators=detector_list, n_jobs=2, combination='average', verbose=False) # or to use the default detectors # clf = SUOD(n_jobs=2, combination='average', # verbose=False) clf.fit(X_train)
elif model == 'MCD': clf = MCD() elif model == 'OCSVM': clf = OCSVM() elif model == 'LOF': clf = LOF() elif model == 'CBLOF': clf = CBLOF() elif model == 'HBOS': clf = HBOS() elif model == 'KNN': clf = KNN() elif model == 'ABOD': clf = ABOD() else: clf = COPOD() # fit the model clf.fit(X) # get outlier scores scores = clf.decision_scores_ # raw outlier scores with col2: st.write('Top 10 anomaly scores for the', model, 'model:') df_id.loc[:, 'scores'] = scores top10 = df_id.nlargest(10, 'scores') top10_list = top10.index.tolist() st.write(top10) st.write('---')
def compare(inputdata, labels, n_clusters, dset_name): """ Compute the AUC, Fgap, Frank score on all conventional outlier detectors for the given dataset Args: inputdata: input data labels: ground truth outlier labels n_clusters: number of clusters, for some cluster-based detectors dset_name: dataset Returns: AUC, Fgap, Frank """ print( "Competing with conventional unsupervised outlier detection algorithms..." ) random_state = np.random.RandomState(1) if inputdata.shape[1] < 64: AEneurons = [16, 8, 8, 16] VAEneurons = [16, 8, 4], [4, 8, 16] else: AEneurons = [64, 32, 32, 64] VAEneurons = [128, 64, 32], [32, 64, 128] classifiers = { 'PCA': PCA(random_state=random_state), 'AutoEncoder': AutoEncoder(batch_size=100, hidden_neurons=AEneurons, random_state=random_state), 'VAE': VAE(batch_size=100, encoder_neurons=VAEneurons[0], decoder_neurons=VAEneurons[1], random_state=random_state), 'COPOD': COPOD(), 'Iforest': IForest(random_state=random_state), 'AutoEncoder': AutoEncoder(batch_size=100, random_state=random_state), 'VAE': VAE(batch_size=100, random_state=random_state), 'LODA': LODA(), 'OCSVM': OCSVM(), 'ABOD': ABOD(n_neighbors=20), 'Fb': FeatureBagging(random_state=random_state), 'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False, random_state=random_state), 'LOF': LOF(), 'COF': COF() } for clf_name, clf in classifiers.items(): print(f"Using {clf_name} method") starttime = time.time() clf.fit(inputdata) time_taken = time.time() - starttime test_scores = clf.decision_scores_ # -----fix some broken scores----- # for i in range(len(test_scores)): cur = test_scores[i] if np.isnan(cur) or not np.isfinite(cur): test_scores[i] = 0 np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores) auc = roc_auc_score(labels, test_scores) print('AUC:', auc) fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy', f'{dset_name}/attribute.npy') print('time_taken:', time_taken)
'AvgKNN': KNN(method='mean'), 'LargestKNN': KNN(method='largest'), 'MedKNN': KNN(method='median'), 'PCA': PCA(), 'COF': COF(), 'LODA': LODA(), 'LOF': LOF(), 'HBOS': HBOS(), 'MCD': MCD(), 'AvgBagging': FeatureBagging(combination='average'), 'MaxBagging': FeatureBagging(combination='max'), 'IForest': IForest(), 'CBLOF': CBLOF(n_clusters=10, n_jobs=4), 'FactorAnalysis': FactorAnalysis(), 'KernelDensity': KernelDensity(), 'COPOD': COPOD(), 'SOD': SOD(), 'LSCPwithLODA': LSCP([LODA(), LODA()]), 'AveLMDD': LMDD(dis_measure='aad'), 'VarLMDD': LMDD(dis_measure='var'), 'IqrLMDD': LMDD(dis_measure='iqr'), 'SoGaal': SO_GAAL(), #'MoGaal':MO_GAAL(), 'VAE': VAE(encoder_neurons=[8, 4, 2]), 'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]) } models = { 'XGBOD': XGBOD(), 'BRM': BRM(), 'GM': GaussianMixture(),
if __name__ == "__main__":
    # Example: fit COPOD on generated data and report train / test metrics.
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # train COPOD detector
    clf_name = 'COPOD'
    clf = COPOD()
    clf.fit(X_train)

    # predictions for the training data, read from the fitted detector
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # predictions for the unseen test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
'''
Example script: fit a COPOD detector on generated two-cluster data and print
the scores returned by ``emmv_scores`` for a mixed test set.

Author: Christian O'Leary
Email: [email protected]
'''
import numpy as np
from pyod.models.copod import COPOD
from emmv import emmv_scores

# Fixed seed; every sample below is drawn from this generator in order, so
# the statement order determines the generated data.
rng = np.random.RandomState(42)

NUM_COLS = 2

# Generate train data: the same 100 Gaussian points (scaled by 0.3) shifted
# by +2 and by -2, stacked into 200 rows
X = 0.3 * rng.randn(100, NUM_COLS)
X_train = np.r_[X + 2, X - 2]

# Generate some regular novel observations (same construction, 20 points
# per shift)
X = 0.3 * rng.randn(20, NUM_COLS)
X_regular = np.r_[X + 2, X - 2]

# Generate some abnormal novel observations, uniform over [-4, 4)
X_outliers = rng.uniform(low=-4, high=4, size=(20, NUM_COLS))

# fit the model
model = COPOD()
model.fit(X_train)

# Get EM & MV scores for the regular and abnormal observations together;
# the result is indexed with the keys 'em' and 'mv' below
X_test = np.concatenate((X_regular, X_outliers), axis=0)
test_scores = emmv_scores(model, X_test)
print('Excess Mass score;', test_scores['em'])
print('Mass Volume score:', test_scores['mv'])
def main():
    """Run the anomaly-detector ranking pipeline end to end.

    Builds the model suite, has ``ADRanker`` produce predictions and scores
    (AUC and average precision), then draws the basic plots and the CD
    diagrams from the result CSV files under ``results/scores``.
    """
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }

    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'

    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    # paths, names and titles are parallel lists: entry i of each describes
    # the same diagram. The second path used to be 'ave/no' although its name
    # and title announce min-max scaling, so the 'ave/no' results were drawn
    # twice and the 'ave/minmax' results never.
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with standard scaling'
        ])
mat = loadmat(os.path.join('data', mat_file)) X = mat['X'] y = mat['y'].ravel() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1) # standardizing data for processing X_train_norm, X_test_norm = standardizer(X_train, X_test) # train COPOD detector clf_name = 'COPOD' clf = COPOD() # you could try parallel version as well. # clf = COPOD(n_jobs=2) clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores print('The first sample is an outlier', y_train[0]) clf.explain_outlier(0) # we could see feature 7, 16, and 20 is above the 0.99 cutoff # and play a more important role in deciding it is an outlier.
def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(),
        quick_methods = True, slow_methods = False, nn_methods = False,
        contamination = 0.05, use_score_rank = False, random_state = None,
        verbose = 0):
    """Fit an ensemble of outlier detectors on X and combine their scores.

    Each enabled detector family is fitted on the (optionally shrunk and
    scaled) data. The per-detector outlier scores are stacked column-wise,
    all-zero columns (e.g. from detectors that failed to fit) are dropped,
    and the remaining columns are min-max normalised and combined.

    Parameters
    ----------
    X : array with a ``shape`` attribute
        2-D input, or 3-D input that is flattened to 2-D by merging the
        last two axes.
    shrink_cols : bool
        When true, drop every column of X that is entirely zero.
    data_scaler : object with ``fit_transform`` or a falsy value
        Scaler applied to X; pass a falsy value to skip scaling.
        NOTE: the default instance is created once, when the function is
        defined, and is re-fitted on every call that relies on it.
    quick_methods, slow_methods, nn_methods : bool
        Enable the PCA/COPOD/HBOS/LODA/IForest/CBLOF family, the LOF
        family and the autoencoder/VAE family respectively.
    contamination : float
        Passed to every detector.
    use_score_rank : bool
        When true, apply ``rank_fun`` to each score column before the
        min-max normalisation.
    random_state : int or None
        Passed to the detectors that are given a ``random_state``.
    verbose : int
        Passed to ``self.full_autoencoder`` and to ``VAE`` (as
        ``verbosity``).

    Returns
    -------
    tuple
        ``(score_by_aom, score_by_moa, score_by_max, score_by_avg,
        all_scores, all_scores_norm)``. With fewer than 12 score columns
        ``score_by_aom`` is the average and ``score_by_moa`` is the
        maximum.

    Raises
    ------
    ValueError
        If X is not 2-D or 3-D, or if no detector produced a usable
        (non-zero) score column.
    """
    # Validate the rank before reshaping. The original checked "> 2" first,
    # which made the "> 3" error branch unreachable.
    n_dims = len(X.shape)
    if n_dims not in (2, 3):
        raise ValueError("Expected number of dimensions: 2 or 3")
    if n_dims == 3:
        X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

    if shrink_cols:
        X = X[:, ~np.all(X == 0, axis=0)]
        log.info('zero columns shrinked')

    if data_scaler:
        X = data_scaler.fit_transform(X)
        log.info(f'used {data_scaler} data scaler')

    n_rows = X.shape[0]
    n_features = X.shape[1]
    log.info(f'n_rows = {n_rows}, n_features = {n_features}')

    def _fit_and_score(classifiers):
        """Fit every detector on X and return its scores, one column each."""
        scores = np.zeros([n_rows, len(classifiers)])
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            log.info(f'{i+1} - fitting {clf_name}')
            try:
                clf.fit(X)
                scores[:, i] = clf.decision_scores_
            except Exception:
                # Best effort: a failing detector keeps its all-zero column,
                # which is dropped later. format_exc() returns the traceback
                # text (print_exc() returns None).
                log.info(traceback.format_exc())
            else:
                log.info(f'Base detector {i+1}/{len(classifiers)} is fitted for prediction')
        return np.nan_to_num(scores)

    # Families that are disabled contribute zero columns.
    quick_scores = np.zeros([n_rows, 0])
    slow_scores = np.zeros([n_rows, 0])
    nn_scores = np.zeros([n_rows, 0])

    if quick_methods:
        # Define anomaly detection tools to be compared.
        # (The MCD variants were disabled by the original author.)
        quick_classifiers = {
            'PCA_randomized':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='randomized'),
            'PCA_full':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='full'),
            'COPOD':
                COPOD(contamination=contamination),
            'HBOS':
                HBOS(contamination=contamination),
            'HBOS_200':
                HBOS(contamination=contamination, n_bins=200),
            'HBOS_300':
                HBOS(contamination=contamination, n_bins=300),
            'LODA':
                LODA(contamination=contamination),
            'LODA_200':
                LODA(contamination=contamination, n_random_cuts=200),
            'LODA_300':
                LODA(contamination=contamination, n_random_cuts=300),
            'IForest_100':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=100, bootstrap=False, n_jobs=-1),
            'IForest_200':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=200, bootstrap=False, n_jobs=-1),
            'IForest_bootstrap':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=150, bootstrap=True, n_jobs=-1),
            'CBLOF_16':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=16),
            'CBLOF_24':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=24),
            'CBLOF_32':
                CBLOF(contamination=contamination, random_state=random_state,
                      n_clusters=32),
        }
        quick_scores = _fit_and_score(quick_classifiers)

    if slow_methods:
        # Only LOF over a grid of neighbourhood sizes is enabled. The
        # original author disabled ABOD (too slow, NaN results), OCSVM,
        # LSCP (slow, not parallel), Feature Bagging, SOS and COF (memory
        # inefficient), SOD and the KNN variants.
        slow_classifiers = {
            f'LOF_{k}': LOF(n_neighbors=k, contamination=contamination,
                            n_jobs=-1)
            for k in (4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22)
        }
        slow_scores = _fit_and_score(slow_classifiers)

    if nn_methods:
        nn_classifiers = {}
        n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
        # Index of the widest layer that is strictly narrower than the input;
        # None when n_features <= 2 (previously a bare StopIteration).
        n_idx = next((idx for idx, width in enumerate(n_list)
                      if width < n_features), None)
        if n_idx is None:
            log.info(f'skipping nn methods: n_features = {n_features} is too small')
        else:
            for i in range(3, 6):
                n_enc = n_list[n_idx:n_idx + i - 1]
                n_dec = n_enc[::-1]
                n_enc_dec = n_enc + n_dec
                nn_classifiers[f'FULL_AE_{len(n_enc_dec)}'] = {
                    'clf': self.full_autoencoder,
                    'hidden_layers': n_enc_dec,
                }
                nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {
                    'clf': VAE(contamination=contamination,
                               random_state=random_state,
                               encoder_neurons=n_enc,
                               decoder_neurons=n_dec,
                               preprocessing=False, epochs=32,
                               verbosity=verbose),
                    'hidden_layers': n_enc + n_dec,
                }

        nn_scores = np.zeros([n_rows, len(nn_classifiers)])
        for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
            log.info(f'''{i+1} - fitting {clf_name} with layers {clf['hidden_layers']}''')
            try:
                # Bound methods must be compared with ==, not `is`: every
                # attribute access creates a fresh bound-method object.
                if clf['clf'] == self.full_autoencoder:
                    nn_scores[:, i] = clf['clf'](X, neurons_list=clf['hidden_layers'],
                                                 verbose=verbose)
                else:
                    clf['clf'].fit(X)
                    nn_scores[:, i] = clf['clf'].decision_scores_
            except Exception:
                log.info(traceback.format_exc())
            else:
                log.info(f'Base detector {i+1}/{len(nn_classifiers)} is fitted for prediction')
        nn_scores = np.nan_to_num(nn_scores)

    all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
    # Drop all-zero columns, e.g. those left behind by failed detectors.
    all_scores = all_scores[:, ~np.all(all_scores == 0, axis=0)]
    log.info(f'total scores = {all_scores.shape[1]}')
    if all_scores.shape[1] == 0:
        raise ValueError("No detector produced usable scores; enable at least "
                         "one method family or check the logged errors")

    all_scores_norm = np.copy(all_scores)
    if use_score_rank:
        all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
        log.info(f'score rank applied')
    all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)

    score_by_avg = np.mean(all_scores_norm, axis=1)
    score_by_max = np.max(all_scores_norm, axis=1)
    n_scores = all_scores_norm.shape[1]
    if n_scores >= 12:
        score_by_aom = aom(all_scores_norm, method='dynamic',
                           n_buckets=round(n_scores / 4))
        score_by_moa = moa(all_scores_norm, method='dynamic',
                           n_buckets=round(n_scores / 4))
    else:
        # Too few detectors for bucketed combination: fall back to the
        # plain average and maximum.
        score_by_aom = score_by_avg
        score_by_moa = score_by_max

    return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
print('\nFinished ' + self.model_name) return None if __name__ == '__main__': # Specify the root directory rootDir = 'G:/My Drive/Github/ml-group-col/One-Class-models/Anomaly_Datasets_csv/' # specify the random state rs = 10 # Save how to run the models models = [ (IsolationForest(random_state=rs), 'ISOF'), (EllipticEnvelope(random_state=rs), 'EE'), (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'), (COPOD(), 'COPOD'), (FeatureBagging(combination='average', random_state=rs), 'AVE_Bagging'), # n_jobs (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'), (KNN(method='largest'), 'Largest_KNN'), # n_jobs (LODA(), 'LODA'), (FeatureBagging(combination='max', n_jobs=-1, random_state=rs), 'MAX_Bagging'), (MCD(random_state=rs), 'MCD'), (XGBOD(random_state=rs), 'XGBOD'), # n_jobs (GaussianMixture(random_state=rs), 'GMM'), (LocalOutlierFactor(novelty=True), 'LOF'), (KNN(method='median'), 'Median_KNN'), # n_jobs (KNN(method='mean'), 'Avg_KNN'), # n_jobs (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'), (HBOS(), 'HBOS'),