Beispiel #1
0
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.75
Beispiel #2
0
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.8
Beispiel #3
0
def train_model(X, Y, contamination, name, from_scratch=True):
    model_dir = './model'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    file_name = name + '.pkl'

    if from_scratch:
        if name == 'ocsvm':
            model = OCSVM(contamination=contamination)
            model.fit(X)
        elif name == 'iforest':
            model = IForest(contamination=contamination)
            model.fit(X)
        elif name == 'lof':
            model = LOF(contamination=contamination)
            model.fit(X)
        elif name == 'knn':
            model = KNN(contamination=contamination)
            model.fit(X)
        elif name == 'xgbod':
            model = XGBOD(contamination=contamination)
            model.fit(X, Y)

        save(model, model_dir, file_name)

    else:
        model = load(model_dir, file_name)

    return model
Beispiel #4
0
def model_test(model_type, y_train, y_test, X_train, X_test, model_file,
               save_flag):
    if model_type == 'KNN':
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X_train)
    if model_type == 'XGBOD':
        clf_name = 'XGBOD'
        #set this scale_pos_weight  sum(negative instances) / sum(positive instances).
        clf = XGBOD(random_state=42, scale_pos_weight=50)
        clf.fit(X_train, y_train)
    if model_type == 'SOD':
        # train SOD detector
        # Note that SOD is meant to work in high dimensions d > 2.
        # But here we are using 2D for visualization purpose
        # thus, higher precision is expected in higher dimensions
        clf_name = 'SOD'
        clf = SOD()
        clf.fit(X_train)
    if model_type == 'VAE':
        # train VAE detector (Beta-VAE)
        clf_name = 'VAE'
        contamination = 0.01
        clf = VAE(epochs=30,
                  contamination=contamination,
                  gamma=0.8,
                  capacity=0.2)
        clf.fit(X_train)

    #save model if specified
    if save_flag == '1':
        pickle.dump(clf, open(model_file, "wb"))

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    conf_train = confusion_matrix(y_train, y_train_pred)
    print("<<<< confusion matrix for train: ", conf_train)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
    conf_test = confusion_matrix(y_test, y_test_pred)
    print("<<<< confusion matrix for test: ", conf_test)

    # visualize the results
    #todo: Input data has to be 2-d for visualization.
    #visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
    #         y_test_pred, show_figure=True, save_figure=False)

    return model_file
Beispiel #5
0
 def initialise_supervised_classifiers(self):
     classifiers = {}
     classifiers['Extreme Boosting Based Outlier Detector (XGBOD)'] = []
     classifiers['Nearest Non Outlier (NNO)'] = []
     for i in range(self.k_way):
         classifiers[
             'Extreme Boosting Based Outlier Detector (XGBOD)'].append(
                 XGBOD(silent=False))
         classifiers['Nearest Non Outlier (NNO)'].append(NNO())
     return classifiers
Beispiel #6
0
def outlier_detection(x_raw, y_raw):
    """
    Filter all ourlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(0, 1212) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
Beispiel #7
0
class TestXGBOD(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.75

    def test_parameters(self):
        assert (hasattr(self.clf, 'clf_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, '_scalar') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'n_detector_')
                and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'X_train_add_')
                and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba), self.roc_floor)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train, self.y_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), rtol=4)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), rtol=4)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Beispiel #8
0
class TestXGBOD(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'pima.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))

        except TypeError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        except IOError:
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # load data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.clf = XGBOD(random_state=42)
        self.clf.fit(self.X_train, self.y_train)

        self.roc_floor = 0.8

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'clf_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, '_scalar') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'n_detector_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'X_train_add_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_proba), self.roc_floor)

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train, self.y_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        print(pred_ranks)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=4)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_socres = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is reserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=4)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Beispiel #9
0
def construct_xgbod():
    from pyod.models.xgbod import XGBOD
    model = XGBOD(estimator_list=construct_raw_base_estimators(),
                  silent=False,
                  n_jobs=24)
    return model
Beispiel #10
0
        X, y = generate_data(train_only=True)  # load data
    except IOError:
        print('{data_file} does not exist. Use generated data'.format(
            data_file=mat_file))
        X, y = generate_data(train_only=True)  # load data
    else:
        X = mat['X']
        y = mat['y'].ravel()
        X, y = check_X_y(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                        random_state=42)

    # train XGBOD detector
    clf_name = 'XGBOD'
    clf = XGBOD(random_state=42)
    clf.fit(X_train, y_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
def run(data_train, data_test, clf_name):
    X_train, y_train = train_data_process(data_train)
    X_test, y_true = test_data_process(data_test)
    classifiers = {
        "XGBOD": XGBOD(random_state=0),
        "KNeighborsClassifier": KNeighborsClassifier(3),
        "SVC": SVC(random_state=0),
        "GaussianProcessClassifier": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
        "RandomForestClassifier": RandomForestClassifier(random_state=0),
        "MLPClassifier": MLPClassifier(random_state=0),
        "AdaBoostClassifier": AdaBoostClassifier(),
        "GaussianNB": GaussianNB(),
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
        "BaggingClassifierPU": BaggingClassifierPU(
            DecisionTreeClassifier(),
            n_estimators=1000,  # 1000 trees as usual
            max_samples=sum(y_train),  # Balance the positives and unlabeled in each bag
            n_jobs=-1  # Use all cores
        )
    }

    clf = classifiers[clf_name]
    try:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        TP = 0
        FN = 0
        FP = 0
        TN = 0
        for i, label in enumerate(y_true):
            if label:
                if y_pred[i]:
                    TP += 1
                else:
                    FN += 1
            else:
                if y_pred[i]:
                    FP += 1
                else:
                    TN += 1
        if (FP + TN) == 0:
            pf = "no negative samples."
        else:
            pf = FP / (FP + TN)

        try:
            auc = roc_auc_score(y_true, y_pred)
        except ValueError as e:
            auc = str(e)
        return {
            'train samples': str(X_train.shape[0]),
            'defective train samples': str(np.sum(y_train)),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'pf': pf,
            'F-measure': f1_score(y_true, y_pred),
            'accuracy': accuracy_score(y_true, y_pred),
            'AUC': auc
        }
    except ValueError as e:
        return str(e)
Beispiel #12
0
def pyod_init(model, n_features=None):
    # initial model set up
    if model == 'abod':
        from pyod.models.abod import ABOD
        clf = ABOD()
    elif model == 'auto_encoder' and n_features:
        #import os
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        from pyod.models.auto_encoder import AutoEncoder
        clf = AutoEncoder(hidden_neurons=[
            n_features, n_features * 5, n_features * 5, n_features
        ],
                          epochs=5,
                          batch_size=64,
                          preprocessing=False)
    elif model == 'cblof':
        from pyod.models.cblof import CBLOF
        clf = CBLOF(n_clusters=4)
    elif model == 'hbos':
        from pyod.models.hbos import HBOS
        clf = HBOS()
    elif model == 'iforest':
        from pyod.models.iforest import IForest
        clf = IForest()
    elif model == 'knn':
        from pyod.models.knn import KNN
        clf = KNN()
    elif model == 'lmdd':
        from pyod.models.lmdd import LMDD
        clf = LMDD()
    elif model == 'loci':
        from pyod.models.loci import LOCI
        clf = LOCI()
    elif model == 'loda':
        from pyod.models.loda import LODA
        clf = LODA()
    elif model == 'lof':
        from pyod.models.lof import LOF
        clf = LOF()
    elif model == 'mcd':
        from pyod.models.mcd import MCD
        clf = MCD()
    elif model == 'ocsvm':
        from pyod.models.ocsvm import OCSVM
        clf = OCSVM()
    elif model == 'pca':
        from pyod.models.pca import PCA
        clf = PCA()
    elif model == 'sod':
        from pyod.models.sod import SOD
        clf = SOD()
    elif model == 'vae':
        from pyod.models.vae import VAE
        clf = VAE()
    elif model == 'xgbod':
        from pyod.models.xgbod import XGBOD
        clf = XGBOD()
    else:
        #raise ValueError(f"unknown model {model}")
        clf = PyODDefaultModel()
    return clf
Beispiel #13
0
    'IForest': IForest(),
    'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
    'COPOD': COPOD(),
    'SOD': SOD(),
    'LSCPwithLODA': LSCP([LODA(), LODA()]),
    'AveLMDD': LMDD(dis_measure='aad'),
    'VarLMDD': LMDD(dis_measure='var'),
    'IqrLMDD': LMDD(dis_measure='iqr'),
    'SoGaal': SO_GAAL(),
    #'MoGaal':MO_GAAL(),
    'VAE': VAE(encoder_neurons=[8, 4, 2]),
    'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6])
}

models = {
    'XGBOD': XGBOD(),
    'BRM': BRM(),
    'GM': GaussianMixture(),
    'IF': IsolationForest(),
    'OCSVM': OneClassSVM(),
    'EE': EllipticEnvelope(),
    'OCKRA': m_OCKRA(),
    'FactorAnalysis': FactorAnalysis(),
    'KernelDensity': KernelDensity(),
}

Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root,scaler,models,start,counts,other_models,CPUS_Models) for scaler in scalers)
Beispiel #14
0
 rs = 10
 # Save how to run the models
 models = [
     (IsolationForest(random_state=rs), 'ISOF'),
     (EllipticEnvelope(random_state=rs), 'EE'),
     (LMDD(dis_measure='aad', random_state=rs), 'AAD_LMDD'),
     (COPOD(), 'COPOD'),
     (FeatureBagging(combination='average',
                     random_state=rs), 'AVE_Bagging'),  # n_jobs
     (LMDD(dis_measure='iqr', random_state=rs), 'IQR_LMDD'),
     (KNN(method='largest'), 'Largest_KNN'),  # n_jobs
     (LODA(), 'LODA'),
     (FeatureBagging(combination='max', n_jobs=-1,
                     random_state=rs), 'MAX_Bagging'),
     (MCD(random_state=rs), 'MCD'),
     (XGBOD(random_state=rs), 'XGBOD'),  # n_jobs
     (GaussianMixture(random_state=rs), 'GMM'),
     (LocalOutlierFactor(novelty=True), 'LOF'),
     (KNN(method='median'), 'Median_KNN'),  # n_jobs
     (KNN(method='mean'), 'Avg_KNN'),  # n_jobs
     (CBLOF(n_clusters=10, random_state=rs), 'CBLOF'),
     (HBOS(), 'HBOS'),
     (SOD(), 'SOD'),
     (PCA(random_state=rs), 'PCA'),
     (VAE(encoder_neurons=[3, 4, 3],
          decoder_neurons=[3, 4, 3],
          random_state=rs), 'VAE'),
     (AutoEncoder(hidden_neurons=[3, 4, 4, 3], verbose=0,
                  random_state=rs), 'AE')
 ]
 # Start the counter of time