def vaeAD(self, encoder_neurons, decoder_neurons, epochs, contamination):
    clf_name = 'VAE'
    clf = VAE(encoder_neurons=encoder_neurons, decoder_neurons=decoder_neurons,
              epochs=epochs, contamination=contamination)
    clf.fit(self.X)

    # get the prediction labels and outlier scores of the training data
    y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_scores = clf.decision_scores_  # raw outlier scores

    generateAnomalis(self.data, self.label, y_pred)
    self.evaluate()
def setUp(self):
    self.n_train = 6000
    self.n_test = 1000
    self.n_features = 300
    self.contamination = 0.1
    self.roc_floor = 0.8
    self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
        n_train=self.n_train, n_test=self.n_test,
        n_features=self.n_features, contamination=self.contamination,
        random_state=42)

    self.clf = VAE(epochs=5, contamination=self.contamination)
    self.clf.fit(self.X_train)
def __init__(self, *,
             hyperparams: Hyperparams,
             random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)

    if hyperparams['loss'] == 'mean_squared_error':
        loss = keras.losses.mean_squared_error
    else:
        raise ValueError('VAE only supports mean squared error for now')

    self._clf = VAE(
        contamination=hyperparams['contamination'],
        encoder_neurons=hyperparams['encoder_neurons'],
        decoder_neurons=hyperparams['decoder_neurons'],
        hidden_activation=hyperparams['hidden_activation'],
        output_activation=hyperparams['output_activation'],
        loss=loss,
        gamma=hyperparams['gamma'],
        capacity=hyperparams['capacity'],
        optimizer=hyperparams['optimizer'],
        epochs=hyperparams['epochs'],
        batch_size=hyperparams['batch_size'],
        dropout_rate=hyperparams['dropout_rate'],
        l2_regularizer=hyperparams['l2_regularizer'],
        validation_size=hyperparams['validation_size'],
        preprocessing=hyperparams['preprocessing'],
        verbosity=hyperparams['verbosity'],
        random_state=hyperparams['random_state'],
    )
    return
def fit_VAE_with_scores(username):
    """This is the function that performs unsupervised anomaly detection\
    on the scaled tweet data from a user."""
    # read the data in
    df = pd.read_csv('../data/processed/' + username +
                     '_scaled_tweet_features.csv').drop('Unnamed: 0', axis='columns')

    # dataframe to array
    X = df.values
    ndim = X.shape[1]  # the number of features

    random_state = np.random.RandomState(81)  # random seed
    outlier_fraction = 0.01  # 1% of all tweets are outliers

    classifiers = {
        'Variational Auto Encoder (VAE)':
            VAE(epochs=20,
                contamination=outlier_fraction,
                random_state=random_state,
                encoder_neurons=[ndim, max(int(ndim / 2), 1), max(int(ndim / 4), 1)],
                decoder_neurons=[max(int(ndim / 4), 1), max(int(ndim / 2), 1), ndim],
                verbosity=0)
    }

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)

    return y_pred, scores_pred
def fit_VAE_direct(df):
    """This is the function that performs unsupervised anomaly detection\
    on the scaled tweet data from a user."""
    # dataframe to array
    X = df.values
    ndim = X.shape[1]  # the number of features

    random_state = np.random.RandomState(81)  # random seed
    outlier_fraction = 0.007  # 0.7% of all tweets are outliers (best fit)

    classifiers = {
        'Variational Auto Encoder (VAE)':
            VAE(epochs=20,
                contamination=outlier_fraction,
                random_state=random_state,
                encoder_neurons=[ndim, max(int(ndim / 2), 1), max(int(ndim / 4), 1)],
                decoder_neurons=[max(int(ndim / 4), 1), max(int(ndim / 2), 1), ndim],
                verbosity=0)
    }

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        y_pred = clf.predict(X)

    return y_pred
def main(args):
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(data['X'], data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)

    data_size = len(trainx[0])
    # layer sizes must be integers, so use floor division
    encoder_neurons = [data_size, data_size // 2, data_size // 4]

    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1],
              epochs=args.epochs, contamination=args.contamination,
              gamma=args.gamma, capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
def model_test(model_type, y_train, y_test, X_train, X_test, model_file, save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    if model_type == 'XGBOD':
        clf_name = 'XGBOD'
        # set scale_pos_weight to sum(negative instances) / sum(positive instances)
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    if model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purposes;
        # thus, higher precision is expected in higher dimensions.
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    if model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30, contamination=contamination, gamma=0.8, capacity=0.2)
        clf.fit(X_train)

    # save model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    # todo: Input data has to be 2-d for visualization.
    # visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #           y_test_pred, show_figure=True, save_figure=False)

    return model_file
def main():
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4

    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS)(
        delayed(runByScaler)(root, scaler, models, start, counts,
                             other_models=sklearn_models,
                             CPUS=CPUS_Models, save_name=name)
        for scaler in scalers)
def detect_outliers_VAE(df):
    '''
    Returns the outlier scores using Variational AutoEncoders

    Parameters:
    -----------
    df: pd.DataFrame,
    '''
    if df.shape[1] < 128:
        # layer sizes must be integers, so use floor division
        encoder = [df.shape[1], df.shape[1] // 2, df.shape[1] // 4]
        decoder = encoder[::-1]
    else:
        encoder = [128, 64, 32]
        decoder = encoder[::-1]

    clf = VAE(contamination=0.1, encoder_neurons=encoder, decoder_neurons=decoder)
    df = df.astype(np.float32)
    clf.fit(df)
    outlier_score = clf.decision_scores_
    # df_result = pd.DataFrame(outlier_pred, columns=['outlier_pred'])
    return outlier_score * -1
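# Hedged usage sketch for detect_outliers_VAE, assuming pyod (with its deep-learning
# backend), numpy and pandas are installed and the function above is in scope.
# The DataFrame and its column names are made up for illustration; because the
# function negates the raw scores, lower values mean more anomalous here.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_df = pd.DataFrame(rng.normal(size=(500, 16)),
                       columns=[f"f{i}" for i in range(16)])
scores = detect_outliers_VAE(demo_df)
print(scores[:10])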
def __init__(self, hidden_neurons, nu, epochs, batch_size=32):
    if len(hidden_neurons) % 2 == 0:
        print("The number of layers must be an odd number (2n+1).")
        sys.exit()

    # split the symmetric layer list into encoder / latent / decoder parts
    encoder = hidden_neurons[0:len(hidden_neurons) // 2]
    latent = hidden_neurons[len(hidden_neurons) // 2]
    decoder = hidden_neurons[len(hidden_neurons) // 2 + 1:len(hidden_neurons)]

    self.model = VAE(encoder_neurons=encoder, decoder_neurons=decoder,
                     latent_dim=latent, contamination=nu,
                     epochs=epochs, batch_size=batch_size)
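# Minimal sketch of how the symmetric hidden_neurons list above is split
# (illustrative values only; the wrapper class itself is not named in this snippet).
hidden_neurons = [32, 16, 8, 16, 32]   # odd length: encoder + latent + decoder
mid = len(hidden_neurons) // 2
encoder = hidden_neurons[:mid]         # [32, 16]
latent = hidden_neurons[mid]           # 8
decoder = hidden_neurons[mid + 1:]     # [16, 32]
print(encoder, latent, decoder)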
def choose_model(model, nnet):
    """Choose a model among those implemented in PyOD."""
    clfs = {
        'AE': AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE': VAE(encoder_neurons=nnet[:5], decoder_neurons=nnet[4:],
                   contamination=0.1, epochs=13),
        'ABOD': ABOD(),
        'FeatureBagging': FeatureBagging(),
        'HBOS': HBOS(),
        'IForest': IForest(),
        'KNN': KNN(),
        'LOF': LOF(),
        'OCSVM': OCSVM(),
        'PCA': PCA(),
        'SOS': SOS(),
        'COF': COF(),
        'CBLOF': CBLOF(),
        'SOD': SOD(),
        'LOCI': LOCI(),
        'MCD': MCD()
    }
    return clfs[model]
def identify_anomalies(X, path, username, outlier_fraction=0.007, epochs=20):
    """A function that performs variational auto encoding analysis on the tweet data"""
    ndim = X.shape[1]  # the number of features
    random_state = np.random.RandomState(42)
    # outlier_fraction = 0.01  # 1% of all tweets are outliers

    # specifies the model parameters
    classifiers = {
        'Variational Auto Encoder (VAE)':
            VAE(epochs=epochs,
                contamination=outlier_fraction,
                random_state=random_state,
                encoder_neurons=[ndim, max(int(ndim / 2), 1), max(int(ndim / 4), 1)],
                decoder_neurons=[max(int(ndim / 4), 1), max(int(ndim / 2), 1), ndim],
                verbosity=0)
    }

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)  # fits the model
        scores_pred = clf.decision_function(X) * -1  # model scores
        y_pred = clf.predict(X)  # model predictions for anomalies

    return y_pred

    # Don't forget to do this at some point:
    # unscaled_tweet_features_df['anomalous'] = y_pred
    # unscaled_tweet_features_df.to_csv(path + username +
    #                                   '_anomaly_tagged_tweet_features.csv')
    PyodDetector(PCA, "PCA")
])

pyod_umap_big_dim = EvalRun("pyod_umap_big_dim", [doc2vecwikiall],
                            [imdb_20news_3splits], [], [
    PyodDetector(HBOS, "HBOS"),
    PyodDetector(IForest, "iForest"),
    PyodDetector(LOF, "LOF"),
    PyodDetector(OCSVM, "OCSVM"),
    PyodDetector(PCA, "PCA")
])

pyod_autoencoder_test = EvalRun(
    "pyod_autoencoder_test", [doc2vecwikiall, longformer_large],
    [imdb_20news_3splits], [NoReduction()], [
        PyodDetector(VAE(epochs=30, verbosity=1), "VAE_30"),
        PyodDetector(VAE(epochs=100, verbosity=1), "VAE_100"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])

pyod_autoencer_refined = EvalRun(
    "pyod_autoencer_refined", [doc2vecwikiall, doc2vecapnews],
    [imdb_20news_3split_fracs], [], [
        PyodDetector(
            AutoEncoder(hidden_neurons=[32, 16, 16, 32], epochs=30, verbose=1),
            "AE_30_small"),
        PyodDetector(AutoEncoder(epochs=10, verbose=1), "AE_10"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])
    ], axis=1)
]

unimodality = [
    "image", "word2vec", "bert", "concat_joint", "vae_joint", "simple_concat"
]

clfs = [
    IForest(random_state=42),
    LOF(),
    OCSVM(),
    PCA(),
    KNN(),
    HBOS(),
    COPOD(),
    AutoEncoder(verbose=0),
    VAE(latent_dim=32, verbose=0)
]

for embedding, modality in zip(unimodal_embeddings, unimodality):
    print()
    print(modality)
    print()
    embedding_scaled = standardizer(embedding)
    for clf in clfs:
        # print(clf)
        clf.fit(embedding_scaled)
        evaluate_print(clf.__class__.__name__, anomaly_label,
                       clf.decision_scores_)

# -*- coding: utf-8 -*-
contamination = 0.1  # percentage of outliers
n_train = 20000  # number of training points
n_test = 2000  # number of testing points
n_features = 300  # number of features

# Generate sample data
X_train, y_train, X_test, y_test = \
    generate_data(n_train=n_train,
                  n_test=n_test,
                  n_features=n_features,
                  contamination=contamination,
                  random_state=42)

# train VAE detector
clf_name = 'VAE'
clf = VAE(epochs=30, contamination=contamination)
clf.fit(X_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
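# Follow-up sketch (not part of the original example): assuming the fitted `clf`,
# `y_test_pred` and `y_test_scores` from the script above, the binary labels can
# also be recovered manually by thresholding the raw scores with the
# contamination-based threshold learned during fit.
import numpy as np

manual_test_pred = (y_test_scores > clf.threshold_).astype(int)
print("agreement with predict():",
      np.mean(manual_test_pred == y_test_pred))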
def train(self):
    self.model = VAE()
    self.model.fit(self.training_data.dataset.x)
class SolverVAECIFAR():
    def __init__(self, data_name, hidden_dim=256, seed=0, learning_rate=3e-4,
                 normal_class=0, anomaly_ratio=0.1, batch_size=128,
                 concentrated=0, training_ratio=0.8, SN=1, Trim=1, L=1.5,
                 max_epochs=100):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L

        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        self.result_path = "./results/{}_{}/0.0/VAE/{}/".format(
            full_data_name, normal_class, seed
        )

        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path,
                                       normal_class=normal_class,
                                       anomaly_ratio=anomaly_ratio,
                                       concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs
        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(
            self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(
            dataset=self.dataset, lengths=[self.n_train, self.n_test])
        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        self.model = VAE()
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        y_test_scores = self.model.decision_function(self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)
        from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score
        print("AUC:{:0.4f}".format(auc))

        os.makedirs(self.result_path, exist_ok=True)
        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        )  # for consistency
        print("result save to {}".format(self.result_path))
class TestVAE(unittest.TestCase):
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = VAE(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and
                self.clf.model_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        # import os
        # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(hidden_neurons=[n_features, n_features * 5,
                                          n_features * 5, n_features],
                          epochs=5, batch_size=64, preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        # raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()
    return clf
def fit(self, X, shrink_cols=True, data_scaler=preprocessing.MaxAbsScaler(),
        quick_methods=True, slow_methods=False, nn_methods=False,
        contamination=0.05, use_score_rank=False, random_state=None, verbose=0):

    # flatten 3-d input; anything beyond 3 dimensions is not supported
    if len(X.shape) == 3:
        X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    elif len(X.shape) > 3:
        raise ValueError("Expected number of dimensions: 2 or 3")

    if shrink_cols:
        X = X[:, ~np.all(X == 0, axis=0)]
        log.info('zero columns shrinked')

    if data_scaler:
        X = data_scaler.fit_transform(X)
        log.info(f'used {data_scaler} data scaler')
        # log.info(X[0:1, :])

    n_rows = X.shape[0]
    n_features = X.shape[1]
    log.info(f'n_rows = {n_rows}, n_features = {n_features}')

    quick_scores = np.zeros([n_rows, 0])
    slow_scores = np.zeros([n_rows, 0])
    nn_scores = np.zeros([n_rows, 0])

    if quick_methods:
        # Define anomaly detection tools to be compared
        quick_classifiers = {
            'PCA_randomized':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='randomized'),
            'PCA_full':
                PCA(contamination=contamination, random_state=random_state,
                    standardization=False, svd_solver='full'),
            'COPOD': COPOD(contamination=contamination),
            'HBOS': HBOS(contamination=contamination),
            'HBOS_200': HBOS(contamination=contamination, n_bins=200),
            'HBOS_300': HBOS(contamination=contamination, n_bins=300),
            'LODA': LODA(contamination=contamination),
            'LODA_200': LODA(contamination=contamination, n_random_cuts=200),
            'LODA_300': LODA(contamination=contamination, n_random_cuts=300),
            'IForest_100':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=100, bootstrap=False, n_jobs=-1),
            'IForest_200':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=200, bootstrap=False, n_jobs=-1),
            'IForest_bootstrap':
                IForest(contamination=contamination, random_state=random_state,
                        n_estimators=150, bootstrap=True, n_jobs=-1),
            # 'MCD':
            #     MCD(contamination=contamination, random_state=random_state, assume_centered=False),
            # 'MCD_centered':
            #     MCD(contamination=contamination, random_state=random_state, assume_centered=True),
            'CBLOF_16':
                CBLOF(contamination=contamination, random_state=random_state, n_clusters=16),
            'CBLOF_24':
                CBLOF(contamination=contamination, random_state=random_state, n_clusters=24),
            'CBLOF_32':
                CBLOF(contamination=contamination, random_state=random_state, n_clusters=32)
        }

        quick_scores = np.zeros([n_rows, len(quick_classifiers)])

        for i, (clf_name, clf) in enumerate(quick_classifiers.items()):
            log.info(f'{i + 1} - fitting {clf_name}')
            try:
                clf.fit(X)
                quick_scores[:, i] = clf.decision_scores_
            except:
                log.info(traceback.print_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(quick_classifiers)} is fitted for prediction')

        quick_scores = np.nan_to_num(quick_scores)

    if slow_methods:
        # initialize a set of detectors for LSCP
        detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20)]
        slow_classifiers = {
            # 'Angle-based Outlier Detector (ABOD)':  # too slow and nan results
            #     ABOD(contamination=contamination),
            # 'One-class SVM (OCSVM)':
            #     OCSVM(contamination=contamination, cache_size=2000, shrinking=False, tol=1e-2),
            # 'LSCP':  # slow and no parallelism
            #     LSCP(detector_list, contamination=contamination,
            #          random_state=random_state, local_region_size=30),
            # 'Feature Bagging':  # ensemble, no real parallelism
            #     FeatureBagging(LOF(n_neighbors=20), contamination=contamination,
            #                    random_state=random_state, n_jobs=-1),
            # 'SOS':  # too memory inefficient
            #     SOS(contamination=contamination),
            # 'COF':  # memory inefficient
            #     COF(contamination=contamination),
            # 'SOD':
            #     SOD(contamination=contamination),
            # 'KNN':
            #     KNN(contamination=contamination, n_jobs=-1),
            # 'KNN_50':
            #     KNN(contamination=contamination, leaf_size=50, n_jobs=-1),
            # 'KNN_70':
            #     KNN(contamination=contamination, leaf_size=70, n_jobs=-1),
            'LOF_4': LOF(n_neighbors=4, contamination=contamination, n_jobs=-1),
            'LOF_5': LOF(n_neighbors=5, contamination=contamination, n_jobs=-1),
            'LOF_6': LOF(n_neighbors=6, contamination=contamination, n_jobs=-1),
            'LOF_7': LOF(n_neighbors=7, contamination=contamination, n_jobs=-1),
            'LOF_8': LOF(n_neighbors=8, contamination=contamination, n_jobs=-1),
            'LOF_9': LOF(n_neighbors=9, contamination=contamination, n_jobs=-1),
            'LOF_10': LOF(n_neighbors=10, contamination=contamination, n_jobs=-1),
            'LOF_12': LOF(n_neighbors=12, contamination=contamination, n_jobs=-1),
            'LOF_14': LOF(n_neighbors=14, contamination=contamination, n_jobs=-1),
            'LOF_16': LOF(n_neighbors=16, contamination=contamination, n_jobs=-1),
            'LOF_18': LOF(n_neighbors=18, contamination=contamination, n_jobs=-1),
            'LOF_20': LOF(n_neighbors=20, contamination=contamination, n_jobs=-1),
            'LOF_22': LOF(n_neighbors=22, contamination=contamination, n_jobs=-1)
        }

        slow_scores = np.zeros([n_rows, len(slow_classifiers)])

        for i, (clf_name, clf) in enumerate(slow_classifiers.items()):
            log.info(f'{i + 1} - fitting {clf_name}')
            try:
                clf.fit(X)
                slow_scores[:, i] = clf.decision_scores_
            except:
                log.info(traceback.print_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(slow_classifiers)} is fitted for prediction')

        slow_scores = np.nan_to_num(slow_scores)

    if nn_methods:
        nn_classifiers = {}
        n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
        n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features)

        for i in range(3, 6):
            n_enc = n_list[n_idx:n_idx + i - 1]
            n_dec = n_enc[::-1]
            n_enc_dec = n_enc + n_dec
            nn_classifiers[f'FULL_AE_{len(n_enc + n_dec)}'] = {
                'clf': self.full_autoencoder,
                'hidden_layers': n_enc_dec
            }
            nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {
                'clf': VAE(contamination=contamination, random_state=random_state,
                           encoder_neurons=n_enc, decoder_neurons=n_dec,
                           preprocessing=False, epochs=32, verbosity=verbose),
                'hidden_layers': n_enc + n_dec
            }

        nn_scores = np.zeros([n_rows, len(nn_classifiers)])

        for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
            log.info(f'''{i + 1} - fitting {clf_name} with layers {clf['hidden_layers']}''')
            try:
                if clf['clf'] == self.full_autoencoder:
                    nn_scores[:, i] = clf['clf'](X, neurons_list=clf['hidden_layers'],
                                                 verbose=verbose)
                else:
                    clf['clf'].fit(X)
                    nn_scores[:, i] = clf['clf'].decision_scores_
            except:
                log.info(traceback.print_exc())
            else:
                log.info(f'Base detector {i + 1}/{len(nn_classifiers)} is fitted for prediction')

        nn_scores = np.nan_to_num(nn_scores)

    all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
    all_scores = all_scores[:, ~np.all(all_scores == 0, axis=0)]
    log.info(f'total scores = {all_scores.shape[1]}')

    all_scores_norm = np.copy(all_scores)
    if use_score_rank:
        all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
        log.info('score rank applied')
    all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)

    if all_scores_norm.shape[1] >= 12:
        score_by_aom = aom(all_scores_norm, method='dynamic',
                           n_buckets=round(all_scores_norm.shape[1] / 4))
        score_by_moa = moa(all_scores_norm, method='dynamic',
                           n_buckets=round(all_scores_norm.shape[1] / 4))
        score_by_avg = np.mean(all_scores_norm, axis=1)
        score_by_max = np.max(all_scores_norm, axis=1)
    else:
        score_by_avg = np.mean(all_scores_norm, axis=1)
        score_by_max = np.max(all_scores_norm, axis=1)
        score_by_aom = score_by_avg
        score_by_moa = score_by_max

    return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
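# Hedged sketch of the score-combination step used above, run on synthetic scores
# (assumes pyod and scikit-learn are installed; shapes and values are made up).
import numpy as np
from sklearn import preprocessing
from pyod.models.combination import aom, moa, average, maximization

rng = np.random.default_rng(0)
raw_scores = rng.random((1000, 12))            # 1000 samples x 12 base detectors
norm_scores = preprocessing.MinMaxScaler().fit_transform(raw_scores)

combined_aom = aom(norm_scores, n_buckets=3)   # average of per-bucket maximums
combined_moa = moa(norm_scores, n_buckets=3)   # maximum of per-bucket averages
combined_avg = average(norm_scores)
combined_max = maximization(norm_scores)
print(combined_aom.shape, combined_moa.shape)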
    axis=1)
]

unimodality = [
    "image", "word2vec", "bert", "concat_joint", "vae_joint", "simple_concat"
]

clfs = [
    IForest(random_state=42),
    LOF(),
    OCSVM(),
    PCA(),
    KNN(),
    HBOS(),
    COPOD(),
    AutoEncoder(verbose=0),
    VAE(latent_dim=32, verbosity=0)
]

for embedding, modality in zip(unimodal_embeddings, unimodality):
    print()
    print(modality)
    print()
    embedding_scaled = standardizer(embedding)
    for clf in clfs:
        # print(clf)
        clf.fit(embedding_scaled)
        evaluate_print(clf.__class__.__name__, anomaly_label,
                       clf.decision_scores_)

#%%
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

plt.rcParams['font.sans-serif'] = ['SimHei']  # set the default font
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box in saved figures

begin = "2020-01-19"
end = "2020-01-19"
test_date = "2020-01-19"

KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05, n_components=0.9)
VAE_clf = VAE(contamination=0.05, epochs=50, gamma=0.8, capacity=0.2,
              encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
SO_GAAL_clf = SO_GAAL(contamination=0.05, stop_epochs=20)

K_models = ['SO_GAAL', 'VAE']
S_models = ['KNN', 'PCA', 'LOF', 'IForest']


def get_train_data():
    """
    Fetch the training samples.
    :return: x_train  training samples with 9 features
             df       the original training data
from pyod.models.mcd import MCD
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.so_gaal import SO_GAAL
import datetime
import pickle

begin = "2020-02-13"
end = "2020-02-15"
test_date = "2020-02-16"

KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05)
VAE_clf = VAE(contamination=0.05, epochs=30,
              encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30,
                              hidden_neurons=[9, 4, 4, 9])
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
ABOD_clf = ABOD(contamination=0.05)
HBOS_clf = HBOS(contamination=0.05)
CBLOF_clf = CBLOF(contamination=0.05)
LODA_clf = LODA(contamination=0.05)
MCD_clf = MCD(contamination=0.05)
MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05)
SO_GAAL_clf = SO_GAAL(contamination=0.05)
KNN_MAH_clf = None

S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF",
            "FeatureBagging", "ABOD", "KNN_MAH"]
K_models = ["AutoEncoder", "SO_GAAL", "VAE"]
    (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
    (LODA(), 'LODA'),
    (FeatureBagging(combination='max', n_jobs=-1, random_state=rs), 'MAX_Bagging'),
    (MCD(random_state=rs), 'MCD'),
    (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
    (GaussianMixture(random_state=rs), 'GMM'),
    (LocalOutlierFactor(novelty=True), 'LOF'),
    (KNN(method='median'), 'Median_KNN'),  # n_jobs
    (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
    (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
    (HBOS(), 'HBOS'),
    (SOD(), 'SOD'),
    (PCA(random_state=rs), 'PCA'),
    (VAE(encoder_neurons=[3, 4, 3], decoder_neurons=[3, 4, 3], random_state=rs), 'VAE'),
    (AutoEncoder(hidden_neurons=[3, 4, 4, 3], verbose=0, random_state=rs), 'AE')
]

# Start the counter of time
st = time.time()

# Initialize the pool class with the number of required CPUs
pool = mp.Pool(mp.cpu_count())

# StarMap method
pool.starmap_async(AnomalyTester,
                   [(models[i][0], models[i][1], rootDir)
                    for i in range(len(models))]).get()
pool.close()

# Finish the counter of time
end = time.time()

# Print the needed time to compute
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    # 'MoGaal': MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'OCKRA': m_OCKRA(),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
}
def main():
    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod
    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }

    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add a new metric manually by modifying 'metrics.py'
    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying 'plots.py'
    plot = Plots()
    plot.make_plot_basic(
        paths=['results/scores/auc/no/results.csv',
               'results/scores/auc/minmax/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/ave/minmax/results.csv',
               'results/scores/ave/std/results.csv'],
        scalers=['Without scaler', 'Min max scaler', 'Standard scaler',
                 'Without scaler', 'Min max scaler', 'Standard scaler'])

    plot.make_cd_plot(
        paths=['results/scores/auc/minmax/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/no/results.csv',
               'results/scores/ave/no/results.csv',
               'results/scores/auc/std/results.csv',
               'results/scores/ave/std/results.csv'],
        names=['CD auc minmax scale', 'CD ave minmax scale',
               'CD auc no scale', 'CD ave no scale',
               'CD auc std scale', 'CD ave std scale'],
        titles=['CD diagram - AUC with min max scaling',
                'CD diagram - Average precision with min max scaling',
                'CD diagram - AUC without scaling',
                'CD diagram - Average precision without scaling',
                'CD diagram - AUC with standard scaling',
                'CD diagram - Average precision with standard scaling'])
def execute(self):
    evaluation_results = []

    print("Loading training data...")
    data = pd.DataFrame()
    for i, chunk in enumerate(pd.read_csv(self.input_file, header=None,
                                          chunksize=self.chunk_size)):
        print("Reading chunk: %d" % (i + 1))
        # print(chunk)
        data = data.append(chunk)

    input_dimensionality = len(data.columns) - 1
    print("Input Dimensionality: %d" % (input_dimensionality))

    positive_data = data[data[len(data.columns) - 1] == 1].iloc[:, :len(data.columns) - 1]
    negative_data = data[data[len(data.columns) - 1] == -1].iloc[:, :len(data.columns) - 1]

    training_data = positive_data.sample(frac=0.70)
    positive_validation_data = positive_data.drop(training_data.index)

    if self.neg_cont and self.neg_cont > 0:
        print("Negative Contamination: %0.4f" % (self.neg_cont))
        num_negative = math.floor(
            self.neg_cont * (len(negative_data) + len(positive_validation_data)))
        negative_data = data.sample(frac=1, random_state=200)[
            data[len(data.columns) - 1] == -1].iloc[:num_negative, :len(data.columns) - 1]

    negative_validation_data = negative_data.copy()

    temp_positive = positive_validation_data.copy()
    temp_positive[input_dimensionality] = 1
    temp_negative = negative_data.copy()
    temp_negative[input_dimensionality] = -1

    validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                            ignore_index=True)
    validation_data = validation_data_with_labels.iloc[:, :len(data.columns) - 1]
    validation_labels = validation_data_with_labels.iloc[:, -1:].values

    # Convert to tensor
    positive_data = torch.tensor(positive_data.values).float().to(self.device)
    negative_data = torch.tensor(negative_data.values).float().to(self.device)
    training_data = torch.tensor(training_data.values).float()
    validation_data = torch.tensor(validation_data.values).float()

    print("Validation Data:")
    print(validation_data)

    ## AE-D TRAINING ##
    print("Initializing autoencoder...")
    net = Autoencoder(layers=self.layers, device=self.device, add_syn=self.add_syn)
    net.to(self.device)
    print(net)

    print("Training Stochastic Autoencoder...")
    net.fit(training_data, epochs=self.epochs, lr=self.lr,
            batch_size=self.batch_size)

    predictions = net.predict(validation_data)
    tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
        validation_labels, predictions)
    r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]
    evaluation_results.append(r)

    print("AE-D Results:")
    print(tabulate([r], ["ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV",
                         "NPV", "TS", "PT", "ACC", "F1", "MCC"],
                   tablefmt="grid"))

    # Convert back to CPU before other methods
    validation_data = validation_data.cpu()

    # Train only linear classifiers
    if self.eval_cat == "linear":
        print("Initiating training for linear detectors...")

        ## MCD ##
        print("Training MCD...")
        result = train_and_evaluate_classifier("MCD", MCD(), validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## ROBUST COVARIANCE ##
        print("Training Robust Covariance...")
        result = train_and_evaluate_classifier("ROB-COV", EllipticEnvelope(),
                                               validation_data, validation_labels)
        evaluation_results.append(result)

        ## ONE CLASS SVM TRAINING ##
        print("Training OneClassSVM...")
        result = train_and_evaluate_classifier("OC-SVM", svm.OneClassSVM(gamma="auto"),
                                               validation_data, validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "prob":
        ## ABOD ##
        # print("Training ABOD...")
        # result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
        # evaluation_results.append(result)

        ## SOS ##
        # print("Training SOS...")
        # result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
        # evaluation_results.append(result)

        ## COPOD ##
        print("Training COPOD...")
        result = train_and_evaluate_classifier("COPOD", COPOD(), validation_data,
                                               validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "ensemble":
        ## ISOLATION FOREST TRAINING ##
        print("Training Isolation Forest...")
        result = train_and_evaluate_classifier("ISO-F", IsolationForest(random_state=0),
                                               validation_data, validation_labels)
        evaluation_results.append(result)

        ## LODA ##
        print("Training LODA...")
        result = train_and_evaluate_classifier("LODA", LODA(), validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## LSCP ##
        # print("Training LSCP...")
        # result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
        # evaluation_results.append(result)

    elif self.eval_cat == "proximity":
        ## LOCAL OUTLIER FACTOR ##
        print("Training Local Outlier Factor...")
        result = train_and_evaluate_classifier("LOC-OF", LocalOutlierFactor(novelty=True),
                                               validation_data, validation_labels)
        evaluation_results.append(result)

        ## CBLOF ##
        print("Training CBLOF...")
        result = train_and_evaluate_classifier("CBLOF", CBLOF(), validation_data,
                                               validation_labels)
        evaluation_results.append(result)

        ## HBOS ##
        print("Training HBOS...")
        result = train_and_evaluate_classifier("HBOS", HBOS(), validation_data,
                                               validation_labels)
        evaluation_results.append(result)

    elif self.eval_cat == "nn":
        ## VAE ##
        print("Training VAE...")
        # build a reversed copy for the decoder layers
        # (list.reverse() mutates in place and returns None)
        result = train_and_evaluate_classifier(
            "VAE", VAE(encoder_neurons=self.layers,
                       decoder_neurons=self.layers[::-1]),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## SO_GAAL ##
        print("Training SO_GAAL...")
        result = train_and_evaluate_classifier(
            "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
            validation_data, validation_labels)
        evaluation_results.append(result)

        ## MO_GAAL ##
        print("Training MO_GAAL...")
        result = train_and_evaluate_classifier(
            "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
            validation_data, validation_labels)
        evaluation_results.append(result)

    ## EVALUATE RESULTS ##
    if self.eval_cat != "none":
        print("Aggregated Results:")
        print(tabulate(evaluation_results,
                       ["ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV",
                        "NPV", "TS", "PT", "ACC", "F1", "MCC"],
                       tablefmt="grid"))

    ## DATASET METRICS ##
    len_training_data_points = len(training_data)
    len_positive_validations = len(positive_validation_data)
    len_negative_validations = len(negative_validation_data)
    len_validations = len_positive_validations + len_negative_validations
    metrics_results = [
        ["Training Data Points", len_training_data_points],
        ["# Normal Points", len_positive_validations],
        ["# Anomalies", len_negative_validations],
        ["Contamination Percentage",
         math.floor((len_negative_validations / len_validations) * 100)]
    ]

    ## EVALUATE RESULTS ##
    print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

    if self.printout:
        print("Saving results to %s" % (self.printout))
        df = pd.DataFrame(evaluation_results)
        df.to_csv(self.printout, header=None, index=False)
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors
    for the given dataset

    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset

    Returns:
        AUC, Fgap, Frank
    """
    print("Competing with conventional unsupervised outlier detection algorithms...")
    random_state = np.random.RandomState(1)

    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    # note: default-parameter variants get distinct keys so they do not
    # overwrite the tuned AutoEncoder / VAE entries above them
    classifiers = {
        'PCA': PCA(random_state=random_state),
        'AutoEncoder': AutoEncoder(batch_size=100, hidden_neurons=AEneurons,
                                   random_state=random_state),
        'VAE': VAE(batch_size=100, encoder_neurons=VAEneurons[0],
                   decoder_neurons=VAEneurons[1], random_state=random_state),
        'COPOD': COPOD(),
        'Iforest': IForest(random_state=random_state),
        'AutoEncoder_default': AutoEncoder(batch_size=100, random_state=random_state),
        'VAE_default': VAE(batch_size=100, random_state=random_state),
        'LODA': LODA(),
        'OCSVM': OCSVM(),
        'ABOD': ABOD(n_neighbors=20),
        'Fb': FeatureBagging(random_state=random_state),
        'CBLOF': CBLOF(n_clusters=n_clusters, check_estimator=False,
                       random_state=random_state),
        'LOF': LOF(),
        'COF': COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix some broken scores----- #
        for i in range(len(test_scores)):
            cur = test_scores[i]
            if np.isnan(cur) or not np.isfinite(cur):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)