Exemple #1
0
    def train(self):
        """Fit a SO_GAAL detector on the training split and keep it.

        Uses the solver's epoch budget and the empirical anomaly ratio of
        the dataset as the detector's contamination prior; the fitted
        detector is stored on ``self.best_model``.
        """
        detector = SO_GAAL(
            stop_epochs=self.max_epochs,
            contamination=self.data_anomaly_ratio,
        )
        detector.fit(self.X_train)
        self.best_model = detector
def getOutlierSOGAAL(dataset):
    '''
    @brief Runs the SO_GAAL algorithm on the dataset and returns per-instance
    labels telling whether each instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return List of labels: 0 means inlier, 1 means outlier
    '''
    # Build the detector with default hyper-parameters, fit it on the data,
    # and hand back the binary labels computed during fitting.
    detector = SO_GAAL()
    detector.fit(dataset)
    return detector.labels_
Exemple #3
0
    def setUp(self):
        """Generate a small synthetic dataset and fit SO_GAAL once for all tests."""
        self.n_train = 1000
        self.n_test = 200
        self.n_features = 2
        self.contamination = 0.1
        # GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        # Fit once here so every test method can reuse the trained detector.
        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)
Exemple #4
0
def main():
    """Run the full detector battery over every dataset/scaler combination."""
    # Feature-scaling variants to evaluate: raw, standardized, min-max.
    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    # Outer parallelism (one job per scaler) vs. inner parallelism
    # passed down to runByScaler for the per-model work.
    CPUS = 3
    CPUS_Models = 4
    # Subset of model keys forwarded separately to runByScaler
    # (presumably handled with a sklearn/pyod-style API there — TODO confirm).
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    # Display-name -> unfitted estimator instance for every detector compared.
    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    # Tag used when saving result files.
    name = "30_Models"

    # One parallel job per scaler; each job runs the whole model battery.
    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Exemple #5
0
    def setUp(self):
        """Generate a synthetic dataset and fit SO_GAAL once for all tests."""
        self.n_train = 3000
        self.n_test = 1000
        self.n_features = 10
        self.contamination = 0.1
        # TODO: GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        # Fit once here so every test method can reuse the trained detector.
        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)
Exemple #6
0
    def __init__(
            self,
            *,
            hyperparams: Hyperparams,
            random_seed: int = 0,
            docker_containers: Dict[str, DockerContainer] = None) -> None:
        """Build the primitive and its underlying SO_GAAL detector.

        All SO_GAAL hyper-parameters (stop_epochs, learning rates, decay,
        momentum, contamination) are taken from the ``hyperparams`` mapping.
        The base-class constructor receives the standard primitive arguments
        unchanged.
        """
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = SO_GAAL(
            stop_epochs=hyperparams['stop_epochs'],
            lr_d=hyperparams['lr_d'],
            lr_g=hyperparams['lr_g'],
            decay=hyperparams['decay'],
            momentum=hyperparams['momentum'],
            contamination=hyperparams['contamination'],
        )
        # NOTE: the previous trailing bare ``return`` was removed —
        # __init__ implicitly returns None.
Exemple #7
0
def _evaluate_detector(label, make_detector, x_train, x_test, y_train, y_test,
                       picture_train, picture_test, data_shape, data_set_name):
    """Fit one detector, score both splits and record a row in output_table.

    The recorded tuple is (label, data_shape, test_score_summary,
    data_set_name, elapsed_seconds, train_score_summary); the elapsed time
    includes detector construction, matching the original inline code.
    """
    print(label)
    now = time()
    clf = make_detector()
    clf.fit(x_train)
    test_scores = clf.decision_function(x_test)
    test_summary = print_score(picture_test, test_scores, y_test)
    train_scores = clf.decision_function(x_train)
    train_summary = print_score(picture_train, train_scores, y_train)
    output_table.append((label, data_shape, test_summary, data_set_name,
                         time() - now, train_summary))


def run_all_models(all_array, labels, pca, data_set_name):
    """Run a battery of outlier detectors on one dataset and record results.

    Each detector is fitted on a 60% train split and its decision scores on
    both splits are summarized via ``print_score`` into the module-level
    ``output_table``.
    """
    picture_name = all_array.get("# img", 1)
    # keyword form: positional axis argument was removed in pandas 2.x
    all_array = all_array.drop(columns="# img")

    # standardizing data for processing
    all_array = standardizer(all_array)

    y = labels.get("in").to_numpy()
    x_train, x_test, y_train, y_test, picture_train, picture_test = train_test_split(
        all_array, y, picture_name, test_size=0.4)

    if pca:
        transformer = IncrementalPCA()
        all_array = transformer.fit_transform(all_array)

    # (label, zero-arg factory) pairs; factories delay construction so the
    # per-model timing still covers instantiation.
    detectors = [
        ("OCSVM", OCSVM),
        ("Auto-encoder", lambda: AutoEncoder(epochs=30)),
        ("HBOS", HBOS),
        ("SO_GAAL", SO_GAAL),
        ("MO_GAAL", MO_GAAL),
        ("MCD", MCD),
        ("SOS", SOS),
        # label fixed: was misspelled "IFrorest" in the recorded results
        ("IForest", IForest),
        ("KNN", KNN),
        ("PCA", PCA),
    ]
    for label, make_detector in detectors:
        _evaluate_detector(label, make_detector, x_train, x_test, y_train,
                           y_test, picture_train, picture_test,
                           all_array.shape, data_set_name)
Exemple #8
0
def main():
    """Rank anomaly detectors: predict, score (AUC / average precision), plot."""

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            # fixed: previously 'ave/no' here, duplicating entry 4 and
            # mismatching the 'CD ave minmax scale' name/title below
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Exemple #9
0
class TestSO_GAAL(unittest.TestCase):
    """
    Notes: GAN may yield unstable results, so the test is design for running
    models only, without any performance check.
    """
    def setUp(self):
        """Generate a small synthetic dataset and fit SO_GAAL once for all tests."""
        self.n_train = 1000
        self.n_test = 200
        self.n_features = 2
        self.contamination = 0.1
        # GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        # Fit once here so every test method can reuse the trained detector.
        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        """All fitted attributes set by pyod's BaseDetector must be populated."""
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'discriminator')
                and self.clf.discriminator is not None)

    def test_train_scores(self):
        """One training decision score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """decision_function yields one score per test sample."""
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        # assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        """predict returns binary labels shaped like the ground truth."""
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probability output stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        """'linear' probability conversion stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        """'unify' probability conversion stays within [0, 1]."""
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        """Unknown probability-conversion method must raise ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns labels shaped like the training ground truth."""
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorers run; unsupported ones raise NotImplementedError."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_model_clone(self):
        """Detector must be clonable via sklearn's clone (smoke test only)."""
        clone_clf = clone(self.clf)

    def tearDown(self):
        # Nothing to clean up; dataset and model are per-test fixtures.
        pass
Exemple #10
0
 def train(self):
     """Fit a default SO_GAAL detector on the raw training features.

     NOTE(review): reaches through self.training_data.dataset.x, i.e. it
     trains on the *full* underlying dataset tensor, not only the subset
     indices of the split — confirm this is intended.
     """
     self.model = SO_GAAL()
     self.model.fit(self.training_data.dataset.x)
Exemple #11
0
    contamination = 0.1  # percentage of outliers
    n_train = 30000  # number of training points
    n_test = 3000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    # NOTE(review): unpack order (X_train, X_test, y_train, y_test) matches
    # pyod's generate_data with behaviour='new' — verify against the pyod
    # version pinned by this project.
    X_train, X_test, y_train, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train SO_GAAL detector (short run: only 2 stop epochs)
    clf_name = 'SO_GAAL'
    clf = SO_GAAL(stop_epochs=2, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
        # COF pyod
        (COF(contamination=0.1, n_neighbors=20), 'COF'),
        # ABOD pyod
        (ABOD(contamination=0.1, n_neighbors=5, method='fast'), 'ABOD'),
        # MO_GAAL pyod
        (MO_GAAL(k=10,
                 stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'MO_GAAL'),
        # SO_GAAL pyod
        (SO_GAAL(stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'SO_GAAL'),
        # OCKRA github
        (m_ockra.m_OCKRA(), 'OCKRA'),
        # VAR LMDD pyOD
        (LMDD(dis_measure='var', random_state=rs), 'VAR_LMDD'),
        # LSCP pyod (comment previously mislabeled this entry as LOCI)
        (LSCP(detector_list,
              local_region_size=30,
              local_max_features=1.0,
              n_bins=10,
              random_state=None,
              contamination=0.1), 'LSCP')
    ]
Exemple #13
0
class SolverVAECIFAR():
    """Train/evaluate a SO_GAAL outlier detector on CIFAR10 VGG features.

    Loads a precomputed feature dataset, splits it into train/test, fits
    SO_GAAL on the training features and reports/saves test-set ROC-AUC.
    """

    def __init__(self,
                 data_name,
                 hidden_dim=256,
                 seed=0,
                 learning_rate=3e-4,
                 normal_class=0,
                 anomaly_ratio=0.1,
                 batch_size=128,
                 concentrated=0,
                 training_ratio=0.8,
                 SN=1,
                 Trim=1,
                 L=1.5,
                 max_epochs=100):
        # Seed every RNG we touch for reproducibility.
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.L = L
        if concentrated == 1.0:
            full_data_name = 'CIFAR10_Concentrated'
        elif concentrated == 0.0:
            full_data_name = 'CIFAR10'
        else:
            # Previously any other value fell through silently and the
            # missing full_data_name raised a NameError below; fail fast.
            raise ValueError(
                "concentrated must be 0 or 1, got {!r}".format(concentrated))
        self.result_path = "./results/{}_{}/0.0/SO_GAAL/{}/".format(
            full_data_name, normal_class, seed)
        data_path = "./data/" + data_name + ".npy"
        self.learning_rate = learning_rate
        self.SN = SN
        self.Trim = Trim
        # self.dataset = RealGraphDataset(data_path, missing_ratio=0, radius=2)
        self.dataset = CIFARVGGDataset(data_path,
                                       normal_class=normal_class,
                                       anomaly_ratio=anomaly_ratio,
                                       concentrated=concentrated)
        self.seed = seed
        self.hidden_dim = hidden_dim
        self.max_epochs = max_epochs

        self.data_path = data_path
        self.data_anomaly_ratio = self.dataset.__anomalyratio__()
        self.batch_size = batch_size
        self.input_dim = self.dataset.__dim__()
        self.data_normaly_ratio = 1 - self.data_anomaly_ratio
        n_sample = self.dataset.__len__()
        self.n_train = int(n_sample * training_ratio)
        self.n_test = n_sample - self.n_train
        print('|data dimension: {}|data noise ratio:{}'.format(
            self.dataset.__dim__(), self.data_anomaly_ratio))

        self.training_data, self.testing_data = data.random_split(
            dataset=self.dataset, lengths=[self.n_train, self.n_test])

        # Populated later; train() fills self.model.
        self.ae = None
        self.discriminator = None
        self.model = None

    def train(self):
        """Fit a default SO_GAAL detector on the underlying feature tensor."""
        self.model = SO_GAAL()
        self.model.fit(self.training_data.dataset.x)

    def test(self):
        """Score the test features, report ROC-AUC and persist the result."""
        y_test_scores = self.model.decision_function(
            self.testing_data.dataset.x)
        auc = roc_auc_score(self.testing_data.dataset.y, y_test_scores)

        print("AUC:{:0.4f}".format(auc))

        os.makedirs(self.result_path, exist_ok=True)

        # All fields deliberately store the AUC so downstream aggregation
        # code that expects this schema keeps working.
        np.save(
            self.result_path + "result.npy",
            {
                "accuracy": auc,
                "precision": auc,
                "recall": auc,
                "f1": auc,
                "auc": auc,
            },
        )  # for consistency
        print("result save to {}".format(self.result_path))
Exemple #14
0
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    # NOTE(review): unpack order (X_train, y_train, X_test, y_test) matches
    # pyod's generate_data with behaviour='old' — verify against the pyod
    # version pinned by this project.
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train SO_GAAL detector
    clf_name = 'SO_GAAL'
    clf = SO_GAAL(contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Exemple #15
0
    def execute(self):
        """Load labeled CSV data, train AE-D plus a chosen detector category,
        and print/save the evaluation results.

        The input CSV has feature columns followed by a final +1/-1 label
        column. Rows labeled +1 are 'positive' (normal); -1 are anomalies.
        """
        evaluation_results = []

        print("Loading training data...")
        data = pd.DataFrame()

        for i, chunk in enumerate(
                pd.read_csv(self.input_file,
                            header=None,
                            chunksize=self.chunk_size)):
            print("Reading chunk: %d" % (i + 1))
            #print(chunk)
            # DataFrame.append was removed in pandas 2.x; concat is the
            # supported equivalent.
            data = pd.concat([data, chunk])

        input_dimensionality = len(data.columns) - 1
        print("Input Dimensionality: %d" % (input_dimensionality))

        # Split rows by label (last column), dropping the label column.
        positive_data = data[data[len(data.columns) -
                                  1] == 1].iloc[:, :len(data.columns) - 1]
        negative_data = data[data[len(data.columns) -
                                  1] == -1].iloc[:, :len(data.columns) - 1]

        # 70% of positives train the autoencoder; the rest validate.
        training_data = positive_data.sample(frac=0.70)
        positive_validation_data = positive_data.drop(training_data.index)

        if self.neg_cont and self.neg_cont > 0:
            # Subsample negatives to hit the requested contamination rate.
            print("Negative Contamination: %0.4f" % (self.neg_cont))
            num_negative = math.floor(
                self.neg_cont *
                (len(negative_data) + len(positive_validation_data)))
            negative_data = data.sample(frac=1, random_state=200)[
                data[len(data.columns) -
                     1] == -1].iloc[:num_negative, :len(data.columns) - 1]

        negative_validation_data = negative_data.copy()

        # Rebuild a labeled validation set (+1 positives, -1 negatives).
        temp_positive = positive_validation_data.copy()
        temp_positive[input_dimensionality] = 1

        temp_negative = negative_data.copy()
        temp_negative[input_dimensionality] = -1

        validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                                ignore_index=True)
        validation_data = validation_data_with_labels.iloc[:, :len(data.columns
                                                                   ) - 1]
        validation_labels = validation_data_with_labels.iloc[:, -1:].values

        # Convert to tensor
        positive_data = torch.tensor(positive_data.values).float().to(
            self.device)
        negative_data = torch.tensor(negative_data.values).float().to(
            self.device)
        training_data = torch.tensor(training_data.values).float()
        validation_data = torch.tensor(validation_data.values).float()

        print("Validation Data:")
        print(validation_data)

        ## AE-D TRAINING ##
        print("Initializing autoencoder...")
        net = Autoencoder(layers=self.layers,
                          device=self.device,
                          add_syn=self.add_syn)
        net.to(self.device)

        print(net)

        print("Training Stochastic Autoencoder...")
        net.fit(training_data,
                epochs=self.epochs,
                lr=self.lr,
                batch_size=self.batch_size)

        predictions = net.predict(validation_data)

        tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
            validation_labels, predictions)

        r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]

        evaluation_results.append(r)

        print("AE-D Results:")
        print(
            tabulate([r], [
                "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                "TS", "PT", "ACC", "F1", "MCC"
            ],
                     tablefmt="grid"))

        # Convert back to CPU before other methods
        validation_data = validation_data.cpu()

        # Train only linear classifiers
        if self.eval_cat == "linear":
            print("Initiating training for linear detectors...")

            ## MCD ##
            print("Training MCD...")
            result = train_and_evaluate_classifier("MCD", MCD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ROBUST COVARIANCE ##
            print("Training Robust Covariance...")
            result = train_and_evaluate_classifier("ROB-COV",
                                                   EllipticEnvelope(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ONE CLASS SVM TRAINING ##
            print("Training OneClassSVM...")
            result = train_and_evaluate_classifier(
                "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data,
                validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "prob":
            ## ABOD ##
            #print("Training ABOD...")
            #result = train_and_evaluate_classifier("ABOD", ABOD(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## SOS ##
            #print("Training SOS...")
            #result = train_and_evaluate_classifier("SOS", SOS(), validation_data, validation_labels)
            #evaluation_results.append(result)

            ## COPOD ##
            print("Training COPOD...")
            result = train_and_evaluate_classifier("COPOD", COPOD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "ensemble":
            ## ISOLATION FOREST TRAINING ##
            print("Training Isolation Forest...")
            result = train_and_evaluate_classifier(
                "ISO-F", IsolationForest(random_state=0), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## LODA ##
            print("Training LODA...")
            result = train_and_evaluate_classifier("LODA", LODA(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## LSCP (disabled) ##
            #print("Training LSCP...")
            #result = train_and_evaluate_classifier("LSCP", LSCP([LOF(), LOF()]), validation_data, validation_labels)
            #evaluation_results.append(result)

        elif self.eval_cat == "proximity":
            ## LOCAL OUTLIER FACTOR ##
            print("Training Local Outlier Factor...")
            result = train_and_evaluate_classifier(
                "LOC-OF", LocalOutlierFactor(novelty=True), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## CBLOF ##
            print("Training CBLOF...")
            result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## HBOS ##
            print("Training HBOS...")
            result = train_and_evaluate_classifier("HBOS", HBOS(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "nn":
            ## VAE ##
            print("Training VAE...")
            # Bug fix: list.reverse() reverses in place and returns None,
            # so the original passed decoder_neurons=None AND corrupted
            # self.layers for the later GAAL runs. Use a reversed copy.
            result = train_and_evaluate_classifier(
                "VAE",
                VAE(encoder_neurons=self.layers,
                    decoder_neurons=self.layers[::-1]), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## SO_GAAL ##
            print("Training SO_GAAL...")
            result = train_and_evaluate_classifier(
                "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## MO_GAAL ##
            print("Training MO_GAAL...")
            result = train_and_evaluate_classifier(
                "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

        ## EVALUATE RESULTS ##
        if self.eval_cat != "none":
            print("Aggregated Results:")
            print(
                tabulate(evaluation_results, [
                    "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
                    "TS", "PT", "ACC", "F1", "MCC"
                ],
                         tablefmt="grid"))

        ## DATASET METRICS ##
        len_training_data_points = len(training_data)
        len_positive_validations = len(positive_validation_data)
        len_negative_validations = len(negative_validation_data)
        len_validations = len_positive_validations + len_negative_validations

        metrics_results = [
            ["Training Data Points", len_training_data_points],
            ["# Normal Points", len_positive_validations],
            ["# Anomalies", len_negative_validations],
            [
                "Contamination Percentage",
                math.floor((len_negative_validations / len_validations) * 100)
            ]
        ]

        ## EVALUATE RESULTS ##
        print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

        if self.printout:
            print("Saving results to %s" % (self.printout))
            df = pd.DataFrame(evaluation_results)
            df.to_csv(self.printout, header=None, index=False)
Exemple #16
0
class TestSO_GAAL(unittest.TestCase):
    """Unit tests for the SO_GAAL outlier detector."""

    def setUp(self):
        """Generate synthetic data and fit one SO_GAAL model per test."""
        self.n_train = 3000
        self.n_test = 1000
        self.n_features = 10
        self.contamination = 0.1
        # TODO: GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        (self.X_train, self.y_train,
         self.X_test, self.y_test) = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        """Fitting must populate every standard pyod attribute."""
        for attr in ('decision_scores_', 'labels_', 'threshold_',
                     '_mu', '_sigma', 'discriminator'):
            assert_true(hasattr(self.clf, attr) and
                        getattr(self.clf, attr) is not None)

    def test_train_scores(self):
        """One training score per training sample."""
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        """decision_function yields one score per test sample."""
        scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # performance check disabled (GAN training is unstable)
        # assert_greater(roc_auc_score(self.y_test, scores), self.roc_floor)

    def test_prediction_labels(self):
        """predict returns one label per test sample."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probability output stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_linear(self):
        """'linear' probability conversion stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_unify(self):
        """'unify' probability conversion stays within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(proba.min(), 0)
        assert_less_equal(proba.max(), 1)

    def test_prediction_proba_parameter(self):
        """An unknown probability-conversion method must raise ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Known scorers are accepted; unknown ones raise."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def tearDown(self):
        # nothing to clean up
        pass
Exemple #17
0
    'LOF': LOF(),
    'HBOS': HBOS(),
    'MCD': MCD(),
    'AvgBagging': FeatureBagging(combination='average'),
    'MaxBagging': FeatureBagging(combination='max'),
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    #'MoGaal':MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'OCKRA': m_OCKRA(),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
Exemple #18
0
    def fit(self, X):
        """
        Fit individual detectors and merge their results.

        Runs five base outlier detectors on the Z-score-normalized input,
        then combines their results with every strategy listed in
        ``self.scores_comb`` ("voting", "maximum", "lscp", "averaging"),
        filling ``scores_base_`` / ``labels_base_`` (per detector) and
        ``scores_`` / ``labels_`` (per merging strategy).

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The RD profile of all segments generated after preprocessing.

        Returns
        -------
        self : object
             Fitted estimator.
        """
        X = check_array(X)

        # normalization of all segments with Z-score
        scale_X = scale(X)

        # all base detectors with default parameters
        detectors = [LOF(), SO_GAAL(), IForest(), HBOS(), CBLOF()]

        # record results for individual detectors
        self.scores_base_ = np.zeros((len(scale_X), len(detectors)))
        self.labels_base_ = np.zeros((len(scale_X), len(detectors)))

        # record results for all merging strategies
        self.scores_ = np.zeros((len(scale_X), len(self.scores_comb)))
        self.labels_ = np.zeros((len(scale_X), len(self.scores_comb)))

        for i in range(len(detectors)):
            # fit the i-th base detector and keep its raw outlier scores
            clf = detectors[i].fit(scale_X)
            self.scores_base_[:, i] = clf.decision_function(scale_X)

            # obtain a series of binary labels using the BCM
            _npat = BCM(X=scale_X,
                        is_require_X=self.is_require_X,
                        bandwidth=self.bandwidth)
            _npat.fit(self.scores_base_[:, i].reshape(-1, 1))
            self.labels_base_[:, i] = _npat.labels_

        # normalization of all outlier score vectors with Z-score
        _scale_score = scale(self.scores_base_)

        for i in range(len(self.scores_comb)):
            if self.scores_comb[i] == "voting":  # majority_vote
                # voting works on labels only, so no combined score exists;
                # the score column is filled with NaN on purpose
                self.scores_[:, i] = np.array([np.nan] * len(scale_X))
                self.labels_[:, i] = np.array(
                    [statistics.mode(j) for j in self.labels_base_])

            elif self.scores_comb[i] == "maximum":
                # the maximum of the five outlier scores for each segment
                self.scores_[:, i] = np.max(_scale_score, axis=1)

                # obtain binary labels with BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_

            elif self.scores_comb[i] == "lscp":
                # LSCP re-combines the already fitted base detectors
                clf = LSCP(detectors, pre_fitted=True)
                clf.fit(scale_X)
                self.scores_[:, i] = clf.decision_function(scale_X)

                # obtain binary labels with the BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_

            elif self.scores_comb[i] == "averaging":
                self.scores_[:, i] = np.mean(_scale_score, axis=1)

                # obtain binary labels with the BCM
                _npat = BCM(X=scale_X,
                            is_require_X=self.is_require_X,
                            bandwidth=self.bandwidth)
                _npat.fit(self.scores_[:, i].reshape(-1, 1))
                self.labels_[:, i] = _npat.labels_

        # BUG FIX: the method previously fell through and returned None even
        # though the docstring documents returning the fitted estimator;
        # sklearn/pyod convention is that fit() returns self for chaining.
        return self
Exemple #19
0
# Date range used to load training data, and the date under test.
begin = "2020-01-19"
end = "2020-01-19"
test_date = "2020-01-19"

# Candidate outlier detectors, each configured to flag 5% of points
# as anomalies (contamination=0.05).
KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05, n_components=0.9)  # keep 90% of variance (sklearn float semantics)
VAE_clf = VAE(contamination=0.05,
              epochs=50,
              gamma=0.8,
              capacity=0.2,
              encoder_neurons=[9, 4],  # presumably matches the 9-feature input — confirm against get_train_data
              decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
SO_GAAL_clf = SO_GAAL(contamination=0.05, stop_epochs=20)
# NOTE(review): model-name groups — 'K_models' appears to hold the
# Keras/GAN-based detectors and 'S_models' the sklearn-style ones;
# confirm how the caller dispatches on these lists.
K_models = ['SO_GAAL', 'VAE']
S_models = ['KNN', 'PCA', 'LOF', 'IForest']


def get_train_data():
    """
    获取训练样本
    :return:    x_train 9特征训练样本
                df 原训练数据
    """
    acc_date = pd.date_range(begin, end, freq='1D')
    for day in acc_date:
        date = str(day.date())
        file_add = u"M:\mh_data\info\info_{}.csv".format(date)
        if date == begin:
Exemple #20
0
# Date of the data under test.
test_date = "2020-02-16"

# Candidate outlier detectors, each configured to flag 5% of points
# as anomalies (contamination=0.05).
KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05)
VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9])
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
ABOD_clf = ABOD(contamination=0.05)
HBOS_clf = HBOS(contamination=0.05)
CBLOF_clf = CBLOF(contamination=0.05)
LODA_clf = LODA(contamination=0.05)
MCD_clf = MCD(contamination=0.05)
MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05)
SO_GAAL_clf = SO_GAAL(contamination=0.05)
# Placeholder — presumably constructed later once data is available; confirm.
KNN_MAH_clf = None

# NOTE(review): model-name groups — 'K_models' appears to hold the
# Keras-based detectors and 'S_models' the sklearn-style ones;
# confirm how the caller dispatches on these lists.
S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"]
K_models = ["AutoEncoder", "SO_GAAL", "VAE"]

def get_train_data():
    """
    获取训练样本
    :return:    x_train 9特征训练样本
                df 原训练数据
    """
    acc_date = pd.date_range(begin, end, freq='1D')
    for day in acc_date:
        date = str(day.date())
        file_add = r"M:\mh_data\info\features\features_{}.csv".format(date)