Example #1
 def setUp(self):
     self.n_train = 100
     self.n_test = 50
     self.contamination = 0.1
     self.roc_floor = 0.8
     # generate data and fit model without missing or infinite values:
     self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
         n_train=self.n_train, n_test=self.n_test, n_features=1,
         contamination=self.contamination, random_state=42)
     self.clf = MAD()
     self.clf.fit(self.X_train)
     # generate data and fit model with missing value:
     self.X_train_nan, self.X_test_nan, self.y_train_nan, self.y_test_nan = generate_data(
         n_train=self.n_train, n_test=self.n_test, n_features=1,
         contamination=self.contamination, random_state=42,
         n_nan=1)
     self.clf_nan = MAD()
     self.clf_nan.fit(self.X_train_nan)
     # generate data and fit model with infinite value:
     self.X_train_inf, self.X_test_inf, self.y_train_inf, self.y_test_inf = generate_data(
         n_train=self.n_train, n_test=self.n_test, n_features=1,
         contamination=self.contamination, random_state=42,
         n_inf=1)
     self.clf_inf = MAD()
     self.clf_inf.fit(self.X_train_inf)
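Fixtures like this are typically paired with a test that checks roc_floor; a minimal sketch of such a test method (hypothetical name; assumes roc_auc_score is imported from sklearn.metrics):

 def test_prediction_roc(self):
     test_scores = self.clf.decision_function(self.X_test)
     # the fitted detector should clear the ROC floor on the held-out split
     assert roc_auc_score(self.y_test, test_scores) >= self.roc_floor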
Example #2
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.8
Example #3
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.75
Example #4
    def test_default_njobs(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # fall back to synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.base_estimators = [LOF(), LOF(), IForest(), COPOD()]
        self.clf = SUOD(n_jobs=2)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.7
Example #5
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.n_features = 2
        self.contamination = 0.1
        self.roc_floor = 0.8

        # Generate sample data
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = AnoGAN(G_layers=[10, 20],
                          D_layers=[20, 2],
                          epochs_query=10,
                          preprocessing=True,
                          index_D_layer_for_recon_error=1,
                          epochs=500,
                          contamination=self.contamination,
                          verbose=0)

        self.clf.fit(self.X_train)
Example #6
    def test_check_consistent_shape(self):
        X_train, X_test, y_train, y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination)

        X_train_n, y_train_n, X_test_n, y_test_n, y_train_pred_n, y_test_pred_n \
            = check_consistent_shape(X_train, y_train, X_test, y_test,
                                     y_train, y_test)

        assert_allclose(X_train_n, X_train)
        assert_allclose(y_train_n, y_train)
        assert_allclose(X_test_n, X_test)
        assert_allclose(y_test_n, y_test)
        assert_allclose(y_train_pred_n, y_train)
        assert_allclose(y_test_pred_n, y_test)

        # test shape difference
        with assert_raises(ValueError):
            check_consistent_shape(X_train, y_train, y_train, y_test,
                                   y_train, y_test)

        # test shape difference between X_train and X_test
        X_test = np.hstack((X_test, np.zeros(
            (X_test.shape[0], 1))))  # add extra column/feature
        with assert_raises(ValueError):
            check_consistent_shape(X_train, y_train, X_test, y_test,
                                   y_train_pred_n, y_test_pred_n)
Example #7
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.5
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)

        self.clf = DeepSVDD(epochs=10,
                            hidden_neurons=[64, 32],
                            contamination=self.contamination,
                            random_state=2021)
        self.clf_ae = DeepSVDD(epochs=5,
                               use_ae=True,
                               output_activation='relu',
                               hidden_neurons=[16, 8, 4],
                               contamination=self.contamination,
                               preprocessing=False)
        self.clf.fit(self.X_train)
        self.clf_ae.fit(self.X_train)
Example #8
 def test_data_generate2(self):
     X_train, y_train, X_test, y_test = \
         generate_data(n_train=self.n_train,
                       n_test=self.n_test,
                       n_features=3,
                       contamination=self.contamination)
     assert_allclose(X_train.shape, (self.n_train, 3))
     assert_allclose(X_test.shape, (self.n_test, 3))
Example #9
 def setUp(self):
     self.n_train = 100
     self.n_test = 50
     self.contamination = 0.1
     self.roc_floor = 0.6
     self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
         n_train=self.n_train, n_test=self.n_test,
         contamination=self.contamination)
Example #10
    def test_data_generate3(self):
        X_train, y_train, X_test, y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        X_train2, y_train2, X_test2, y_test2 = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=2,
                          contamination=self.contamination,
                          random_state=42)

        assert_allclose(X_train, X_train2)
        assert_allclose(X_test, X_test2)
        assert_allclose(y_train, y_train2)
        assert_allclose(y_test, y_test2)
Example #11
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')
Example #12
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = KNN(contamination=self.contamination, method='median')
Example #13
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=10,
            contamination=self.contamination, random_state=42)

        self.clf = COPOD(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #14
    def setUp(self):
        self.n_train = 50
        self.n_test = 50
        self.contamination = 0.2
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = ABOD(contamination=self.contamination, method='default')
        self.clf.fit(self.X_train)
Example #15
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = COF(contamination=self.contamination, method="memory")
        self.clf.fit(self.X_train)
Example #16
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #17
    def test_get_outliers_inliers(self):
        X_train, y_train = generate_data(
            n_train=self.n_train, train_only=True,
            contamination=self.contamination)

        X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)

        inlier_index = int(self.n_train * (1 - self.contamination))

        assert_allclose(X_train[0:inlier_index, :], X_inliers)
        assert_allclose(X_train[inlier_index:, :], X_outliers)
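The slicing above works because generate_data places the inliers first and the outliers last; a small sanity check in the same vein (a sketch, not part of the original test):

        n_outliers = int(self.n_train * self.contamination)
        assert X_outliers.shape[0] == n_outliers
        assert X_inliers.shape[0] == self.n_train - n_outliers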
Example #18
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = LOF(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #19
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = HBOS(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #20
def data(type, contamination):
    if type == 'MAD':
        #contamination = 0.1  # percentage of outliers
        n_train = 200  # number of training points
        n_test = 100  # number of testing points

        # Generate sample data
        X_train, y_train, X_test, y_test = generate_data(
            n_train=n_train,
            n_test=n_test,
            n_features=1,
            contamination=contamination,
            random_state=42)
    elif type == 'ABOD':
        contamination = 0.1  # percentage of outliers
        n_train = 200  # number of training points
        n_test = 100  # number of testing points

        # Generate sample data
        X_train, y_train, X_test, y_test = \
            generate_data(n_train=n_train,
                          n_test=n_test,
                          n_features=2,
                          contamination=contamination,
                          random_state=42)

    elif type == 'AutoEncoder':
        #contamination = 0.1  # percentage of outliers
        n_train = 20000  # number of training points
        n_test = 2000  # number of testing points
        n_features = 300  # number of features

        # Generate sample data
        X_train, y_train, X_test, y_test = \
            generate_data(n_train=n_train,
                          n_test=n_test,
                          n_features=n_features,
                          contamination=contamination,
                          random_state=42)

    else:
        raise ValueError('unknown data type: {}'.format(type))

    return X_train, y_train, X_test, y_test
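A usage sketch for this helper (argument values are illustrative):

X_train, y_train, X_test, y_test = data('MAD', contamination=0.05)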
Example #21
    def setUp(self):
        self.contamination = 0.05  # percentage of outliers
        self.n_train = 1000  # number of training points
        self.n_test = 100  # number of testing points

        # Generate sample data
        self.X_train, self.y_train, self.X_test, self.y_test = \
            generate_data(n_train=self.n_train,
                          n_test=self.n_test,
                          n_features=3,
                          contamination=self.contamination,
                          random_state=42)
Example #22
 def setUp(self):
     self.n_train = 200
     self.n_test = 100
     self.contamination = 0.1
     self.roc_floor = 0.8
     self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
         n_train=self.n_train, n_test=self.n_test,
         contamination=self.contamination, random_state=42)
     self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
     self.detector_list = [LOF(), LOF()]
     self.clf = LSCP(self.detector_list, contamination=self.contamination)
     self.clf.fit(self.X_train)
Example #23
    def setUp(self):
        self.n_train = 3000
        self.n_test = 1000
        self.n_features = 200
        self.contamination = 0.1
        self.batch_size = 1000

        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            n_features=self.n_features,
            contamination=self.contamination,
            random_state=42)
Example #24
    def setUp(self):
        self.n_train = 6000
        self.n_test = 1000
        self.n_features = 300
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = VAE(epochs=5, contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #25
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.75
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = LODA(contamination=self.contamination, n_bins='auto')
        self.clf.fit(self.X_train)
Example #26
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )

        self.clf = Sampling(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)
Example #27
    def setUp(self):
        self.n_train = 1000
        self.n_test = 200
        self.n_features = 2
        self.contamination = 0.1
        # GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #28
    def setUp(self):
        self.n_train = 3000
        self.n_test = 1000
        self.n_features = 10
        self.contamination = 0.1
        # TODO: GAN may yield unstable results; turning performance check off
        # self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            n_features=self.n_features, contamination=self.contamination,
            random_state=42)

        self.clf = SO_GAAL(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #29
def _create_data(contamination, n_features, n_test, n_train):
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=n_features,
        contamination=contamination,
        random_state=1234,
        behaviour="old")
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    X_train = StandardScaler().fit_transform(X_train)
    X_train = pd.DataFrame(X_train)
    X_test = StandardScaler().fit_transform(X_test)
    X_test = pd.DataFrame(X_test)
    return X_train, y_train, X_test, y_test
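A usage sketch (values are illustrative); note that the helper fits a separate StandardScaler on each split rather than reusing the training scaler:

X_train, y_train, X_test, y_test = _create_data(
    contamination=0.1, n_features=5, n_test=100, n_train=400)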
Example #30
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        # calculate covariance for mahalanobis distance
        X_train_cov = np.cov(self.X_train, rowvar=False)

        self.clf = KNN(algorithm='auto', metric='mahalanobis',
                       metric_params={'V': X_train_cov})
        self.clf.fit(self.X_train)
Example #31
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.X_test, self.y_train, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=10,
            contamination=self.contamination, random_state=42)

        self.clf = ECOD(contamination=self.contamination, n_jobs=2)
        self.clf.fit(self.X_train)

        # also fit a single-threaded copy for comparison
        self.clf_ = ECOD(contamination=self.contamination)
        self.clf_.fit(self.X_train)
Example #32
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.gm = None
        self.median = None
        self.data_scaler = None
        self.angles_scalers1 = None
        self.angles_scalers2 = None
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test, n_features=4,
            contamination=self.contamination, random_state=42)

        self.clf = ROD()
        self.clf.fit(self.X_train)
Example #33
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        detectors = [KNN(), LOF(), OCSVM()]

        self.clf = SimpleDetectorAggregator(base_estimators=detectors,
                                            method='maximization',
                                            contamination=self.contamination)
        self.clf.fit(self.X_train)
Example #34
    def setUp(self):
        self.n_train = 1000
        self.n_test = 500
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.random_state = 42
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=self.random_state)

        self.base_estimators = [
            LOF(n_neighbors=5, contamination=self.contamination),
            LOF(n_neighbors=15, contamination=self.contamination),
            LOF(n_neighbors=25, contamination=self.contamination),
            LOF(n_neighbors=35, contamination=self.contamination),
            LOF(n_neighbors=45, contamination=self.contamination),
            HBOS(contamination=self.contamination),
            PCA(contamination=self.contamination),
            LSCP(detector_list=[
                LOF(n_neighbors=5, contamination=self.contamination),
                LOF(n_neighbors=15, contamination=self.contamination)
            ],
                 random_state=self.random_state)
        ]

        this_directory = os.path.abspath(os.path.dirname(__file__))

        self.cost_forecast_loc_fit_ = os.path.join(this_directory,
                                                   'bps_train.joblib')

        self.cost_forecast_loc_pred_ = os.path.join(this_directory,
                                                    'bps_prediction.joblib')

        self.model = SUOD(base_estimators=self.base_estimators,
                          n_jobs=2,
                          rp_flag_global=True,
                          bps_flag=True,
                          contamination=self.contamination,
                          approx_flag_global=True,
                          cost_forecast_loc_fit=self.cost_forecast_loc_fit_,
                          cost_forecast_loc_pred=self.cost_forecast_loc_pred_,
                          verbose=True)
Example #35
from pyod.models.auto_encoder import AutoEncoder
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 20000  # number of training points
    n_test = 2000  # number of testing points
    n_features = 300  # number of features

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=n_features,
                      contamination=contamination,
                      random_state=42)

    # train AutoEncoder detector
    clf_name = 'AutoEncoder'
    clf = AutoEncoder(epochs=30, contamination=contamination)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores
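The evaluate_print utility imported above is typically called on these scores right afterwards; a minimal sketch in the same style:

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)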
Example #36
from pyod.utils.utility import standardizer
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print

if __name__ == "__main__":

    # Define data file and read X and y
    # Generate some data if the source data is missing
    mat_file = 'cardio.mat'
    try:
        mat = loadmat(os.path.join('data', mat_file))

    except TypeError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # fall back to synthetic data
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # fall back to synthetic data
    else:
        X = mat['X']
        y = mat['y'].ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    n_clf = 20  # number of base detectors