Ejemplo n.º 1
0
    def vaeAD(self, encoder_neurons, decoder_neurons, epochs, contamination):
        clf_name = 'VAE'
        clf = VAE(encoder_neurons=encoder_neurons, decoder_neurons=decoder_neurons,
                  epochs=epochs, contamination=contamination)
        clf.fit(self.X)

        # get the prediction labels and outlier scores of the training data
        y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_scores = clf.decision_scores_  # raw outlier scores
        generateAnomalis(self.data, self.label, y_pred)
        self.evaluate()
Ejemplo n.º 2
0
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = VAE(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)
Ejemplo n.º 3
0
    def __init__(
            self,
            *,
            hyperparams: Hyperparams,  #
            random_seed: int = 0,
            docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)
        if hyperparams['loss'] == 'mean_squared_error':
            loss = keras.losses.mean_squared_error
        else:
            raise ValueError('VAE only suports mean squered error for now')

        self._clf = VAE(
            contamination=hyperparams['contamination'],
            encoder_neurons=hyperparams['encoder_neurons'],
            decoder_neurons=hyperparams['decoder_neurons'],
            hidden_activation=hyperparams['hidden_activation'],
            output_activation=hyperparams['output_activation'],
            loss=loss,
            gamma=hyperparams['gamma'],
            capacity=hyperparams['capacity'],
            optimizer=hyperparams['optimizer'],
            epochs=hyperparams['epochs'],
            batch_size=hyperparams['batch_size'],
            dropout_rate=hyperparams['dropout_rate'],
            l2_regularizer=hyperparams['l2_regularizer'],
            validation_size=hyperparams['validation_size'],
            preprocessing=hyperparams['preprocessing'],
            verbosity=hyperparams['verbosity'],
            random_state=hyperparams['random_state'],
        )

        return
def fit_VAE_with_scores(username):
    """This is the function that performs unsupervised anomaly detection\
        on the scaled tweet data from a user."""
    #read the data in
    df = pd.read_csv('../data/processed/'+username+\
                '_scaled_tweet_features.csv').drop('Unnamed: 0',axis='columns')
    #dataframe to array
    X = df.values
    ndim = X.shape[1]  #the number of features
    random_state = np.random.RandomState(81)  #Random seed
    outlier_fraction = 0.01  #1% of all tweets are outliers
    classifiers = {
        'Variational Auto Encoder (VAE)':
        VAE(epochs=20,
            contamination=outlier_fraction,
            random_state=random_state,
            encoder_neurons=[
                ndim, max(int(ndim / 2), 1),
                max(int(ndim / 4), 1)
            ],
            decoder_neurons=[
                max(int(ndim / 4), 1),
                max(int(ndim / 2), 1), ndim
            ],
            verbosity=0)
    }
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        scores_pred = clf.decision_function(X) * -1
        y_pred = clf.predict(X)
    return y_pred, scores_pred
def fit_VAE_direct(df):
    """This is the function that performs unsupervised anomaly detection\
        on the scaled tweet data from a user."""
    #dataframe to array
    X = df.values
    ndim = X.shape[1]  #the number of features
    random_state = np.random.RandomState(81)  #Random seed
    outlier_fraction = 0.007  #.7% of all tweets are outliers (best fit)
    classifiers = {
        'Variational Auto Encoder (VAE)':
        VAE(epochs=20,
            contamination=outlier_fraction,
            random_state=random_state,
            encoder_neurons=[
                ndim, max(int(ndim / 2), 1),
                max(int(ndim / 4), 1)
            ],
            decoder_neurons=[
                max(int(ndim / 4), 1),
                max(int(ndim / 2), 1), ndim
            ],
            verbosity=0)
    }
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        y_pred = clf.predict(X)
    return y_pred
Ejemplo n.º 6
0
def main(args):
    data = loadmat(args.filename)
    trainx, testx, trainy, testy = train_test_split(data['X'],
                                                    data['y'],
                                                    test_size=args.train_split,
                                                    random_state=2)
    valx, evalx, valy, evaly = train_test_split(testx, testy, test_size=0.5)
    data_size = len(trainx[0])
    encoder_neurons = [data_size, data_size / 2, data_size / 4]
    clf = KNN()
    clf.fit(trainx)
    print("Results Validation KNN")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation KNN")
    print_metrics(evaly, clf.predict(evalx))

    clf = PCA(n_components=args.components)
    clf.fit(trainx)
    print("Results Validation PCA")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation PCA")
    print_metrics(evaly, clf.predict(evalx))

    clf = VAE(encoder_neurons=encoder_neurons,
              decoder_neurons=encoder_neurons[::-1],
              epochs=args.epochs,
              contamination=args.contamination,
              gamma=args.gamma,
              capacity=args.capacity)
    clf.fit(trainx)
    print("Results Validation VAE")
    print_metrics(valy, clf.predict(valx))
    print("Results Evaluation VAE")
    print_metrics(evaly, clf.predict(evalx))
Ejemplo n.º 7
0
def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    if model_type == 'XGBOD':
        clf_name = 'XGBOD'
        #set this scale_pos_weight  sum(negative instances) / sum(positive instances).
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    if model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    if model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)

    #save model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    #todo: Input data has to be 2-d for visualization.
    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #         y_test_pred, show_figure=True, save_figure=False)

    return model_file
Ejemplo n.º 8
0
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Ejemplo n.º 9
0
def detect_outliers_VAE(df):
    ''' Returns the outlier scores using Variational AutoEncoders

    Parameters:
    -----------
    df: pd.DataFrame,
    '''
    if df.shape[1] < 128:
        encoder = [df.shape[1], df.shape[1]/2, df.shape[1]/4]
        decoder = encoder[::-1]
    else:
        encoder = [128, 64, 32]
        decoder = encoder[::-1]

    clf = VAE(contamination=0.1, encoder_neurons=encoder, decoder_neurons=decoder)

    df = df.astype(np.float32)

    clf.fit(df)
    outlier_score = clf.decision_scores_
    # df_result = pd.DataFrame(outlier_pred, columns=['outlier_pred'])
    return outlier_score * -1
Ejemplo n.º 10
0
 def __init__(self, hidden_neurons, nu, epochs, batch_size=32):
     if len(hidden_neurons) % 2 == 0:
         print("The number of layers must be an odd number(2n+1).")
         sys.exit()
     encoder = hidden_neurons[0:len(hidden_neurons) // 2]
     latent = hidden_neurons[len(hidden_neurons) // 2]
     decoder = hidden_neurons[len(hidden_neurons) // 2 +
                              1:len(hidden_neurons)]
     self.model = VAE(encoder_neurons=encoder,
                      decoder_neurons=decoder,
                      latent_dim=latent,
                      contamination=nu,
                      epochs=epochs,
                      batch_size=batch_size)
Ejemplo n.º 11
0
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE':
        AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        VAE(encoder_neurons=nnet[:5],
            decoder_neurons=nnet[4:],
            contamination=0.1,
            epochs=13),
        'ABOD':
        ABOD(),
        'FeatureBagging':
        FeatureBagging(),
        'HBOS':
        HBOS(),
        'IForest':
        IForest(),
        'KNN':
        KNN(),
        'LOF':
        LOF(),
        'OCSVM':
        OCSVM(),
        'PCA':
        PCA(),
        'SOS':
        SOS(),
        'COF':
        COF(),
        'CBLOF':
        CBLOF(),
        'SOD':
        SOD(),
        'LOCI':
        LOCI(),
        'MCD':
        MCD()
    }
    return clfs[model]
def identify_anomalies(X,outlier_fraction = 0.007,epochs=20,path,username):
    """A function that performs variational auto encoding analysis on the tweet data"""
    ndim = X.shape[1] #the number of features
    random_state = np.random.RandomState(42)
    #outlier_fraction = 0.01 #1% of all tweets are outliers
    #specifies the model parameters
    classifiers = {
        'Variational Auto Encoder (VAE)': VAE(epochs,
                contamination = outlier_fraction, random_state = random_state,
                encoder_neurons = [ndim,max(int(ndim/2),1),max(int(ndim/4),1)],
                decoder_neurons = [max(int(ndim/4),1),max(int(ndim/2),1),20],
                verbosity=0)
    }

    for i, (clf_name,clf) in enumerate(classifiers.items()):
        clf.fit(X) #fits the model
        scores_pred = clf.decision_function(X) * -1 #model scores
        y_pred = clf.predict(X) #model predictions for anomalies
    return y_pred

# Don't forget to do this at some point:
#unscaled_tweet_features_df['anomalous'] = y_pred
#unscaled_tweet_features_df.to_csv(path+username+
#                                  '_anomaly_tagged_tweet_features.csv')
Ejemplo n.º 13
0
                                             PyodDetector(PCA, "PCA")
                                         ])

pyod_umap_big_dim = EvalRun("pyod_umap_big_dim", [doc2vecwikiall],
                            [imdb_20news_3splits], [], [
                                PyodDetector(HBOS, "HBOS"),
                                PyodDetector(IForest, "iForest"),
                                PyodDetector(LOF, "LOF"),
                                PyodDetector(OCSVM, "OCSVM"),
                                PyodDetector(PCA, "PCA")
                            ])

pyod_autoencoder_test = EvalRun(
    "pyod_autoencoder_test", [doc2vecwikiall, longformer_large],
    [imdb_20news_3splits], [NoReduction()], [
        PyodDetector(VAE(epochs=30, verbosity=1), "VAE_30"),
        PyodDetector(VAE(epochs=100, verbosity=1), "VAE_100"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])

pyod_autoencer_refined = EvalRun(
    "pyod_autoencer_refined", [doc2vecwikiall, doc2vecapnews],
    [imdb_20news_3split_fracs], [], [
        PyodDetector(
            AutoEncoder(hidden_neurons=[32, 16, 16, 32], epochs=30, verbose=1),
            "AE_30_small"),
        PyodDetector(AutoEncoder(epochs=10, verbose=1), "AE_10"),
        PyodDetector(AutoEncoder(epochs=30, verbose=1), "AE_30"),
        PyodDetector(AutoEncoder(epochs=100, verbose=2), "AE_100")
    ])
Ejemplo n.º 14
0
    ],
                   axis=1)
]
unimodality = [
    "image", "word2vec", "bert", "concat_joint", "vae_joint", "simple_concat"
]

clfs = [
    IForest(random_state=42),
    LOF(),
    OCSVM(),
    PCA(),
    KNN(),
    HBOS(),
    COPOD(),
    AutoEncoder(verbose=0),
    VAE(latent_dim=32, verbose=0)
]

for embedding, modality in zip(unimodal_embeddings, unimodality):
    print()
    print(modality)
    print()
    embedding_scaled = standardizer(embedding)

    for clf in clfs:
        # print(clf)
        clf.fit(embedding_scaled)
        evaluate_print(clf.__class__.__name__, anomaly_label,
                       clf.decision_scores_)  # -*- coding: utf-8 -*-
Ejemplo n.º 15
0
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train VAE detector
    clf_name = 'VAE'
    clf = VAE(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Ejemplo n.º 16
0
 def train(self):
     self.model = VAE()
     self.model.fit(self.training_data.dataset.x)
Ejemplo n.º 17
0
class SolverVAECIFAR():
    def __init__(self, data_name, hidden_dim=256, seed=0, learning_rate=3e-4, normal_class=0, anomaly_ratio=0.1,
                 batch_size=128, concentrated=0, training_ratio=0.8, SN=1, Trim=1, L=1.5, max_epochs=100):
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L
        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        self.result_path = "./results/{}_{}/0.0/VAE/{}/".format(
            full_data_name, normal_class, seed
        )
        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path, normal_class=normal_class, anomaly_ratio=anomaly_ratio, concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs

        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(dataset=self.dataset,
                                                                         lengths=[
                                                                             self.n_train,
                                                                             self.n_test
                                                                         ])

        self.ae = None
        self.discriminator = None
        self.model=None



    def train(self):
        self.model = VAE()
        self.model.fit(self.training_data.dataset.x)


    def test(self):
        y_test_scores = self.model.decision_function(self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)

        from sklearn.metrics import precision_recall_fscore_support as prf, accuracy_score

        print("AUC:{:0.4f}".format(
           auc))

        os.makedirs(self.result_path, exist_ok=True)

        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        ) # for consistency
        print("result save to {}".format(self.result_path))
Ejemplo n.º 18
0
class TestVAE(unittest.TestCase):
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = VAE(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'model_') and self.clf.model_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def tearDown(self):
        pass
Ejemplo n.º 19
0
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        #import os
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(hidden_neurons=[
            n_features, n_features * 5, n_features * 5, n_features
        ],
                          epochs=5,
                          batch_size=64,
                          preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        #raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()
    return clf
Ejemplo n.º 20
0
    def fit(self, X, shrink_cols = True, data_scaler = preprocessing.MaxAbsScaler(), 
            quick_methods = True, slow_methods = False, nn_methods = False, 
            contamination = 0.05, use_score_rank = False, random_state = None, verbose = 0):

        if len(X.shape) > 2:
            X = X.reshape(X.shape[0], X.shape[1]*X.shape[2])
        elif len(X.shape) > 3:
            raise ValueError("Expected number of dimensions: 2 or 3") 
        
        if shrink_cols:
            X = X[:,~np.all(X == 0, axis=0)]
            log.info('zero columns shrinked')
        if data_scaler:
            X = data_scaler.fit_transform(X)
            log.info(f'used {data_scaler} data scaler')
            #log.info(X[0:1,:])
        
        n_rows = X.shape[0]
        n_features = X.shape[1]
        log.info (f'n_rows = {n_rows}, n_features = {n_features}')
        
        quick_scores = np.zeros([n_rows, 0])
        slow_scores = np.zeros([n_rows, 0])
        nn_scores = np.zeros([n_rows, 0])
        
        if quick_methods:
            # Define anomaly detection tools to be compared
            quick_classifiers = {
                'PCA_randomized':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'randomized'),
                'PCA_full':
                    PCA(contamination=contamination, random_state=random_state, 
                        standardization = False, svd_solver = 'full'),                               
                'COPOD':
                   COPOD(contamination=contamination),  
                f'HBOS': 
                    HBOS(contamination=contamination),
                f'HBOS_{200}': 
                    HBOS(contamination=contamination, n_bins = 200),                
                f'HBOS_{300}':  
                    HBOS(contamination=contamination, n_bins = 300), 
                'LODA':
                    LODA(contamination=contamination),
                'LODA_200':
                    LODA(contamination=contamination, n_random_cuts  = 200),
                'LODA_300':
                    LODA(contamination=contamination, n_random_cuts  = 300),                
                'IForest_100':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 100, bootstrap = False, n_jobs = -1),
                'IForest_200':
                    IForest(contamination=contamination, random_state=random_state, 
                            n_estimators = 200, bootstrap = False, n_jobs = -1),                
                'IForest_bootstrap':
                    IForest(contamination = contamination, random_state=random_state, 
                            n_estimators = 150, bootstrap = True, n_jobs = -1), 
                #'MCD': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = False),
                #'MCD_centered': 
                #    MCD(contamination=contamination, random_state=random_state, assume_centered = True),    
                f'CBLOF_16':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 16),
                f'CBLOF_24':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 24),
                f'CBLOF_32':
                    CBLOF(contamination=contamination, random_state=random_state, n_clusters = 32)
            }
            
            quick_scores = np.zeros([n_rows, len(quick_classifiers)])

            for i, (clf_name, clf) in enumerate(quick_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    quick_scores[:, i] = clf.decision_scores_
                except:
                    log.info(traceback.print_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(quick_classifiers)} is fitted for prediction') 

            quick_scores = np.nan_to_num(quick_scores)
            
        if slow_methods:
            # initialize a set of detectors for LSCP
            detector_list = [LOF(n_neighbors=10), LOF(n_neighbors=15), LOF(n_neighbors=20)]
            slow_classifiers = {               
                #'Angle-based Outlier Detector (ABOD)': #too slow and nan results
                #   ABOD(contamination=contamination),
                #'One-class SVM (OCSVM)':
                #   OCSVM(contamination=contamination, cache_size = 2000, shrinking = False, tol = 1e-2),   
                #'LSCP': #slow and no parallel
                #   LSCP(detector_list, contamination=contamination, random_state=random_state, local_region_size = 30),
                #'Feature Bagging': #ensemble #no real par
                #   FeatureBagging(LOF(n_neighbors=20), contamination=contamination, 
                #                  random_state=random_state, n_jobs = -1),                
                #'SOS' : # too memory inefficient  
                #    SOS(contamination=contamination),
                #'COF': # memory inefficient
                #   COF(contamination=contamination),                  
                #'SOD':
                #    SOD(contamination = contamination),
                #'KNN': 
                #   KNN(contamination=contamination, n_jobs = -1),
                #'KNN_50': 
                #   KNN(contamination=contamination, leaf_size = 50, n_jobs = -1),
                #'KNN_70': 
                #   KNN(contamination=contamination, leaf_size = 70, n_jobs = -1),

                'LOF_4':
                   LOF(n_neighbors=4, contamination=contamination, n_jobs = -1),
                'LOF_5':
                   LOF(n_neighbors=5, contamination=contamination, n_jobs = -1),                
                'LOF_6':
                   LOF(n_neighbors=6, contamination=contamination, n_jobs = -1),
                'LOF_7':
                   LOF(n_neighbors=7, contamination=contamination, n_jobs = -1),                
                'LOF_8':
                   LOF(n_neighbors=8, contamination=contamination, n_jobs = -1),
                'LOF_9':
                   LOF(n_neighbors=9, contamination=contamination, n_jobs = -1),                
                'LOF_10':
                   LOF(n_neighbors=10, contamination=contamination, n_jobs = -1),
                'LOF_12':
                   LOF(n_neighbors=12, contamination=contamination, n_jobs = -1),  
                'LOF_14':
                   LOF(n_neighbors=14, contamination=contamination, n_jobs = -1),
                'LOF_16':
                   LOF(n_neighbors=16, contamination=contamination, n_jobs = -1),
                'LOF_18':
                   LOF(n_neighbors=18, contamination=contamination, n_jobs = -1),
                'LOF_20':
                   LOF(n_neighbors=20, contamination=contamination, n_jobs = -1), 
                'LOF_22':
                   LOF(n_neighbors=22, contamination=contamination, n_jobs = -1)            
            }
            
            slow_scores = np.zeros([n_rows, len(slow_classifiers)])

            for i, (clf_name, clf) in enumerate(slow_classifiers.items()):
                log.info(f'{i+1} - fitting {clf_name}')
                try:
                    clf.fit(X)
                    slow_scores[:, i] = clf.decision_scores_
                except:
                    log.info(traceback.print_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(slow_classifiers)} is fitted for prediction') 
            
            slow_scores = np.nan_to_num(slow_scores)
        
        if nn_methods:
            
            nn_classifiers = {}
            n_list = [1024, 512, 256, 128, 64, 32, 16, 8, 4, 2]
            n_idx = next(x[0] for x in enumerate(n_list) if x[1] < n_features)
            for i in range(3,6):
                n_enc = n_list[n_idx:n_idx+i-1] 
                n_dec = n_enc[::-1]
                n_enc_dec = n_enc + n_dec
                nn_classifiers[f'FULL_AE_{len(n_enc + n_dec)}'] = {'clf': self.full_autoencoder, 
                                                                   'hidden_layers' : n_enc_dec
                                                                  }
                nn_classifiers[f'VAE_{len(n_enc_dec)}'] = {'clf': VAE(contamination = contamination, random_state = random_state,
                                                                      encoder_neurons  = n_enc, decoder_neurons = n_dec,
                                                                      preprocessing = False, epochs = 32, verbosity = verbose), 
                                                            'hidden_layers' : n_enc + n_dec
                                                            }                
                
            
            nn_scores = np.zeros([n_rows, len(nn_classifiers)])
            
            for i, (clf_name, clf) in enumerate(nn_classifiers.items()):
                log.info(f'''{i+1} - fitting {clf_name} with layers {clf['hidden_layers']}''')
                try:
                    if clf['clf'] == self.full_autoencoder:
                        nn_scores[:, i] = clf['clf'](X, neurons_list = clf['hidden_layers'], verbose = verbose)
                    else:
                        clf['clf'].fit(X)
                        nn_scores[:, i] = clf['clf'].decision_scores_                        
                except:
                    log.info(traceback.print_exc())
                else:    
                    log.info(f'Base detector {i+1}/{len(nn_classifiers)} is fitted for prediction')             

            nn_scores = np.nan_to_num(nn_scores)

            
        all_scores = np.concatenate((quick_scores, slow_scores, nn_scores), axis=1)
        all_scores = all_scores[:,~np.all(all_scores == 0, axis=0)]
        log.info(f'total scores = {all_scores.shape[1]}')
        
        all_scores_norm = np.copy(all_scores)
        if use_score_rank:
            all_scores_norm = np.apply_along_axis(rank_fun, 0, all_scores_norm)
            log.info(f'score rank applied')
        all_scores_norm = preprocessing.MinMaxScaler().fit_transform(all_scores_norm)
        
        if all_scores_norm.shape[1] >= 12:
            score_by_aom = aom(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_moa = moa(all_scores_norm, method = 'dynamic', n_buckets = round(all_scores_norm.shape[1]/4))
            score_by_avg = np.mean(all_scores_norm, axis = 1) 
            score_by_max = np.max(all_scores_norm, axis = 1)
        else:
            score_by_avg = np.mean(all_scores_norm, axis = 1)
            score_by_max = np.max(all_scores_norm, axis = 1)
            score_by_aom = score_by_avg
            score_by_moa = score_by_max
        return score_by_aom, score_by_moa, score_by_max, score_by_avg, all_scores, all_scores_norm
Ejemplo n.º 21
0
                   axis=1)
]
unimodality = [
    "image", "word2vec", "bert", "concat_joint", "vae_joint", "simple_concat"
]

clfs = [
    IForest(random_state=42),
    LOF(),
    OCSVM(),
    PCA(),
    KNN(),
    HBOS(),
    COPOD(),
    AutoEncoder(verbose=0),
    VAE(latent_dim=32, verbosity=0)
]

for embedding, modality in zip(unimodal_embeddings, unimodality):
    print()
    print(modality)
    print()
    embedding_scaled = standardizer(embedding)

    for clf in clfs:
        # print(clf)
        clf.fit(embedding_scaled)
        evaluate_print(clf.__class__.__name__, anomaly_label,
                       clf.decision_scores_)

#%%
Ejemplo n.º 22
0
Archivo: pyod.py Proyecto: shaoleshi/mh
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

plt.rcParams['font.sans-serif'] = ['SimHei']  # 指定默认字体
plt.rcParams['axes.unicode_minus'] = False  # 解决保存图像是负号'-'显示为方块的问题

begin = "2020-01-19"
end = "2020-01-19"
test_date = "2020-01-19"

KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05, n_components=0.9)
VAE_clf = VAE(contamination=0.05,
              epochs=50,
              gamma=0.8,
              capacity=0.2,
              encoder_neurons=[9, 4],
              decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
SO_GAAL_clf = SO_GAAL(contamination=0.05, stop_epochs=20)
K_models = ['SO_GAAL', 'VAE']
S_models = ['KNN', 'PCA', 'LOF', 'IForest']


def get_train_data():
    """
    获取训练样本
    :return:    x_train 9特征训练样本
                df 原训练数据
Ejemplo n.º 23
0
from pyod.models.mcd import MCD
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.so_gaal import SO_GAAL

import datetime
import pickle


begin = "2020-02-13"
end = "2020-02-15"

test_date = "2020-02-16"

KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05)
VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9])
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
ABOD_clf = ABOD(contamination=0.05)
HBOS_clf = HBOS(contamination=0.05)
CBLOF_clf = CBLOF(contamination=0.05)
LODA_clf = LODA(contamination=0.05)
MCD_clf = MCD(contamination=0.05)
MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05)
SO_GAAL_clf = SO_GAAL(contamination=0.05)
KNN_MAH_clf = None

S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"]
K_models = ["AutoEncoder", "SO_GAAL", "VAE"]
Ejemplo n.º 24
0
     (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
     (LODA(), 'LODA'),
     (FeatureBagging(combination='max', n_jobs=-1,
                     random_state=rs), 'MAX_Bagging'),
     (MCD(random_state=rs), 'MCD'),
     (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
     (GaussianMixture(random_state=rs), 'GMM'),
     (LocalOutlierFactor(novelty=True), 'LOF'),
     (KNN(method='median'), 'Median_KNN'),  # n_jobs
     (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
     (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
     (HBOS(), 'HBOS'),
     (SOD(), 'SOD'),
     (PCA(random_state=rs), 'PCA'),
     (VAE(encoder_neurons=[3, 4, 3],
          decoder_neurons=[3, 4, 3],
          random_state=rs), 'VAE'),
     (AutoEncoder(hidden_neurons=[3, 4, 4, 3], verbose=0,
                  random_state=rs), 'AE')
 ]
 # Start the counter of time
 st = time.time()
 # Initialize the pool class with the number of required CPU's
 pool = mp.Pool(mp.cpu_count())
 # StarMap method
 pool.starmap_async(AnomalyTester, [(models[i][0], models[i][1], rootDir)
                                    for i in range(len(models))]).get()
 pool.close()
 # Finish the counter of time
 end = time.time()
 # Print the needed time to compute
Ejemplo n.º 25
0
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    #'MoGaal':MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'OCKRA': m_OCKRA(),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
}
Ejemplo n.º 26
0
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Ejemplo n.º 27
0
    def execute(self):
        evaluation_results = []

        print("Loading training data...")
        data = pd.DataFrame()

        for i, chunk in enumerate(
                pd.read_csv(self.input_file,
                            header=None,
                            chunksize=self.chunk_size)):
            print("Reading chunk: %d" % (i + 1))
            #print(chunk)
            data = data.append(chunk)

        input_dimensionality = len(data.columns) - 1
        print("Input Dimensionality: %d" % (input_dimensionality))

        positive_data = data[data[len(data.columns) -
                                  1] == 1].iloc[:, :len(data.columns) - 1]
        negative_data = data[data[len(data.columns) -
                                  1] == -1].iloc[:, :len(data.columns) - 1]

        training_data = positive_data.sample(frac=0.70)
        positive_validation_data = positive_data.drop(training_data.index)

        if self.neg_cont and self.neg_cont > 0:
            print("Negative Contamination: %0.4f" % (self.neg_cont))
            num_negative = math.floor(
                self.neg_cont *
                (len(negative_data) + len(positive_validation_data)))
            negative_data = data.sample(frac=1, random_state=200)[
                data[len(data.columns) -
                     1] == -1].iloc[:num_negative, :len(data.columns) - 1]

        negative_validation_data = negative_data.copy()

        temp_positive = positive_validation_data.copy()
        temp_positive[input_dimensionality] = 1

        temp_negative = negative_data.copy()
        temp_negative[input_dimensionality] = -1

        validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                                ignore_index=True)
        validation_data = validation_data_with_labels.iloc[:, :len(data.columns
                                                                   ) - 1]
        validation_labels = validation_data_with_labels.iloc[:, -1:].values

        # Convert to tensor
        positive_data = torch.tensor(positive_data.values).float().to(
            self.device)
        negative_data = torch.tensor(negative_data.values).float().to(
            self.device)
        training_data = torch.tensor(training_data.values).float()
        validation_data = torch.tensor(validation_data.values).float()

        print("Validation Data:")
        print(validation_data)

        ## AE-D TRAINING ##
        print("Initializing autoencoder...")
        net = Autoencoder(layers=self.layers,
                          device=self.device,
                          add_syn=self.add_syn)
        net.to(self.device)

        print(net)

        print("Training Stochastic Autoencoder...")
        net.fit(training_data,
                epochs=self.epochs,
                lr=self.lr,
                batch_size=self.batch_size)

        predictions = net.predict(validation_data)

        tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
            validation_labels, predictions)

        r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]

        evaluation_results.append(r)

        print("AE-D Results:")
        print(
            tabulate([r], [
                "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                "TS", "PT", "ACC", "F1", "MCC"
            ],
                     tablefmt="grid"))

        # Convert back to CPU before other methods
        validation_data = validation_data.cpu()

        # Train only linear classifiers
        if self.eval_cat == "linear":
            print("Initiating training for linear detectors...")

            ## MCD ##
            print("Training MCD...")
            result = train_and_evaluate_classifier("MCD", MCD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ROBUST COVARIANCE ##
            print("Training Robust Covariance...")
            result = train_and_evaluate_classifier("ROB-COV",
                                                   EllipticEnvelope(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ONE CLASS SVM TRAINING ##
            print("Training OneClassSVM...")
            result = train_and_evaluate_classifier(
                "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data,
                validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "prob":
            ## ABOD ##
            #print("Training ABOD...")
            #result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## SOS ##
            #print("Training SOS...")
            #result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## COPOD ##
            print("Training COPOD...")
            result = train_and_evaluate_classifier("COPOD", COPOD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "ensemble":
            ## ISOLATION FOREST TRAINING ##
            print("Training Isolation Forest...")
            result = train_and_evaluate_classifier(
                "ISO-F", IsolationForest(random_state=0), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## LODA ##
            print("Training LODA...")
            result = train_and_evaluate_classifier("LODA", LODA(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## LSCP ##


#      print("Training LSCP...")
#      result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
#      evaluation_results.append(result)

        elif self.eval_cat == "proximity":
            ## LOCAL OUTLIER FACTOR ##
            print("Training Local Outlier Factor...")
            result = train_and_evaluate_classifier(
                "LOC-OF", LocalOutlierFactor(novelty=True), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## CBLOF ##
            print("Training CBLOF...")
            result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## HBOS ##
            print("Training HBOS...")
            result = train_and_evaluate_classifier("HBOS", HBOS(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "nn":
            ## VAE ##
            print("Training VAE...")
            result = train_and_evaluate_classifier(
                "VAE",
                VAE(encoder_neurons=self.layers,
                    decoder_neurons=self.layers.reverse()), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## SO_GAAL ##
            print("Training SO_GAAL...")
            result = train_and_evaluate_classifier(
                "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## MO_GAAL ##
            print("Training MO_GAAL...")
            result = train_and_evaluate_classifier(
                "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

        ## EVALUATE RESULTS ##
        if self.eval_cat != "none":
            print("Aggregated Results:")
            print(
                tabulate(evaluation_results, [
                    "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                    "TS", "PT", "ACC", "F1", "MCC"
                ],
                         tablefmt="grid"))

        ## DATASET METRICS ##
        len_training_data_points = len(training_data)
        len_positive_validations = len(positive_validation_data)
        len_negative_validations = len(negative_validation_data)
        len_validations = len_positive_validations + len_negative_validations

        metrics_results = [
            ["Training Data Points", len_training_data_points],
            ["# Normal Points", len_positive_validations],
            ["# Anomalies", len_negative_validations],
            [
                "Contamination Percentage",
                math.floor((len_negative_validations / len_validations) * 100)
            ]
        ]

        ## EVALUATE RESULTS ##
        print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

        if self.printout:
            print("Saving results to %s" % (self.printout))
            df = pd.DataFrame(evaluation_results)
            df.to_csv(self.printout, header=None, index=False)
def compare(inputdata, labels, n_clusters, dset_name):
    """
    Compute the AUC, Fgap, Frank score on all conventional outlier detectors for the given dataset
    Args:
        inputdata: input data
        labels: ground truth outlier labels
        n_clusters: number of clusters, for some cluster-based detectors
        dset_name: dataset

    Returns: AUC, Fgap, Frank

    """
    print(
        "Competing with conventional unsupervised outlier detection algorithms..."
    )
    random_state = np.random.RandomState(1)
    if inputdata.shape[1] < 64:
        AEneurons = [16, 8, 8, 16]
        VAEneurons = [16, 8, 4], [4, 8, 16]
    else:
        AEneurons = [64, 32, 32, 64]
        VAEneurons = [128, 64, 32], [32, 64, 128]

    classifiers = {
        'PCA':
        PCA(random_state=random_state),
        'AutoEncoder':
        AutoEncoder(batch_size=100,
                    hidden_neurons=AEneurons,
                    random_state=random_state),
        'VAE':
        VAE(batch_size=100,
            encoder_neurons=VAEneurons[0],
            decoder_neurons=VAEneurons[1],
            random_state=random_state),
        'COPOD':
        COPOD(),
        'Iforest':
        IForest(random_state=random_state),
        'AutoEncoder':
        AutoEncoder(batch_size=100, random_state=random_state),
        'VAE':
        VAE(batch_size=100, random_state=random_state),
        'LODA':
        LODA(),
        'OCSVM':
        OCSVM(),
        'ABOD':
        ABOD(n_neighbors=20),
        'Fb':
        FeatureBagging(random_state=random_state),
        'CBLOF':
        CBLOF(n_clusters=n_clusters,
              check_estimator=False,
              random_state=random_state),
        'LOF':
        LOF(),
        'COF':
        COF()
    }

    for clf_name, clf in classifiers.items():
        print(f"Using {clf_name} method")
        starttime = time.time()
        clf.fit(inputdata)
        time_taken = time.time() - starttime
        test_scores = clf.decision_scores_

        # -----fix some broken scores----- #
        for i in range(len(test_scores)):
            cur = test_scores[i]
            if np.isnan(cur) or not np.isfinite(cur):
                test_scores[i] = 0

        np.save(f'{dset_name}/{clf_name}_raw.npy', test_scores)
        auc = roc_auc_score(labels, test_scores)
        print('AUC:', auc)
        fetch(normalize(test_scores), f'../datasets/{dset_name.upper()}_Y.npy',
              f'{dset_name}/attribute.npy')
        print('time_taken:', time_taken)