コード例 #1
0
ファイル: AllFunctions.py プロジェクト: kinjaldand/MLProjects
def getOutliers(df):
    from pyod.models.abod import ABOD
    from pyod.models.knn import KNN

    outlier_fraction = 0.1
    outlierlist = []

    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outlier_fraction),
        'K Nearest Neighbors (KNN)': KNN(contamination=outlier_fraction)
    }
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outlier_fraction),
    }
    print("outlier detection")
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        print(clf_name)
        clf.fit(df)
        # predict raw anomaly score
        #scores_pred = clf.decision_function(df) * -1

        y_pred = clf.predict(df)
        #n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)
        print(clf_name, n_outliers)
        outlierlist.append((n_outliers, y_pred.tolist()))

    return outlierlist
コード例 #2
0
def fast_abod_pyod_auc(X_nor, X_test, y_test, n_neighbors, contamination=0.05):

    fastABOD = ABOD(n_neighbors=n_neighbors,
                    contamination=contamination,
                    method='fast')

    X_train = X_nor.astype(float).values.copy()

    fastABOD.fit(X_train)
    ## now threshold is determined

    #y_pred = fastABOD.predict(X_test)
    scoreTable = fastABOD.decision_function(X_test)
    #print(scoreTable)
    scoreTable = np.nan_to_num(scoreTable, copy=True)

    ## confusion matrix
    #tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    #tpr = tp/(tp+fn)
    #fpr = fp/(tn+fp)
    #tprW[trail] = tpr
    #fprW[trail] = fpr
    #tprW = tpr
    #fprW = fpr

    # Auc score
    auc = roc_auc_score(y_test, scoreTable)

    #print(tpr, fpr)
    #print(auc)

    return auc
コード例 #3
0
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
コード例 #4
0
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
コード例 #5
0
    def __init__(self, inliers, outliers):
        data_total = np.concatenate((inliers, outliers), axis=0)
        self.data_total = data_total
        self.outliers = outliers
        self.inliers = inliers

        OutlierStream.__init__(self, inliers, outliers)
        #self.model = KNN(contamination=0.045)
        self.model = ABOD(n_neighbors=20, contamination=0.2)
コード例 #6
0
def outlier_detection(x_raw, y_raw):
    """
    Filter all ourlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(0, 1212) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
コード例 #7
0
    def test_abod(self):
        clf = ABOD(contamination=0.05)
        clf.fit(self.X_train)
        assert_equal(len(clf.decision_scores), self.X_train.shape[0])

        # invert the scores
        pred_scores = clf.decision_function(self.X_test) * -1
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        assert_equal(clf.predict(self.X_test).shape[0],
                     self.X_test.shape[0])
        assert_greater(roc_auc_score(self.y_test, pred_scores), 0.5)
コード例 #8
0
ファイル: test_abod.py プロジェクト: zzhaozeng/pyod
    def setUp(self):
        self.n_train = 50
        self.n_test = 50
        self.contamination = 0.2
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = ABOD(contamination=self.contamination, method='default')
        self.clf.fit(self.X_train)
コード例 #9
0
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = ABOD(contamination=self.contamination)
        self.clf.fit(self.X_train)
コード例 #10
0
def getOutlierABOD(dataset):
    '''
    @brief Function that executes ABOD algorithm on the dataset and obtains the
    labels of the dataset indicating which instance is an inlier (0) or outlier (1)
    @param dataset Dataset on which to try the algorithm
    @return It returns a list of labels 0 means inlier, 1 means outlier
    '''
    # Initializating the model
    abod = ABOD()
    # Fits the data and obtains labels
    abod.fit(dataset)
    # Return labels
    return abod.labels_
コード例 #11
0
    def __init__(self, nth_layer=18, nn_name='vgg', detector_name='svm', pool=None, pca_n_components=None):
        """
        Extract feature by neural network and detector train normal samples then predict new data
        nn_name: 'Xception', 'ResNet'(Default), 'InceptionV3',
        'InceptionResNetV2', 'MobileNet', 'MobileNetV2', 'DenseNet', 'NASNet'
        detector_name: 'RobustCovariance', 'IsolationForest, 'LocalOutlierFactor, ABOD, kNN(Default)'
        """
        self.nth_layer = nth_layer
        self.nn_name = nn_name
        self.pool = pool
        self.pca_n_components = pca_n_components
        self.input_shape = None
        self.pretrained_nn = None
        self.extracting_model = None

        K.clear_session()

        detector_name_lower = detector_name.lower()
        if detector_name_lower == 'robustcovariance':
            self.detector_name = 'rc'
            from sklearn.covariance import EllipticEnvelope
            self.clf = EllipticEnvelope()
            print('Novelty Detector: Robust covariance')
        elif detector_name_lower in ['localoutlierfactor', 'lof']:
            self.detector_name = 'lof'
            from sklearn.neighbors import LocalOutlierFactor
            self.clf = LocalOutlierFactor(novelty=True)
            print('Novelty Detector: Local Outlier Factor')
        elif detector_name_lower in ['abod', 'fastabod', 'anglebasedoutlierdetection']:
            self.detector_name = 'abod'
            from pyod.models.abod import ABOD
            self.clf = ABOD()
            print('Novelty Detector: Angle Based Outlier Detection')
        elif detector_name_lower in ['iforest', 'isolationforest']:
            self.detector_name = 'iforest'
            from sklearn.ensemble import IsolationForest
            self.clf = IsolationForest()
            print('Novelty Detector: Isolation Forest')
        elif detector_name_lower in ['knn', 'median_knn']:
            self.detector_name = 'median_kNN'
            from pyod.models.knn import KNN
            self.clf = KNN(method='median', contamination=0.1)
            print('Novelty Detector: Median K Nearest Neighbors')
        elif detector_name_lower =='svm':
            from sklearn.svm import OneClassSVM
            self.clf = OneClassSVM(gamma='scale')
            print('SVM')
        else:
            print(self.detector_name_lower)
            raise ValueError
コード例 #12
0
    def __init__(
            self,
            *,
            hyperparams: Hyperparams,  #
            random_seed: int = 0,
            docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams,
                         random_seed=random_seed,
                         docker_containers=docker_containers)

        self._clf = ABOD(
            contamination=hyperparams['contamination'],
            n_neighbors=hyperparams['n_neighbors'],
            method=hyperparams['method'],
        )
コード例 #13
0
    def fit(self, X, y=None):
        """ Fits the outlier detection on the given data
        
        Parameters
        ----------
        X (DataFrame) : training data

        y (DataFrame, default=None) : target values (if needed)

        Returns
        -------
        (DataFrame, DataFrame) : A tuple of the transformed DataFrames, the first being the X data and the second being the y data
        """
        abod = ABOD(**self.kwargs)
        self.fitted = abod.fit(X)
        return self.transform(X, y=y)
コード例 #14
0
def out_lier_score(df, target, num_var):

    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df.loc[:, num_var], df[target])  #.to_numpy()
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05

    X = df
    df_out_score = []
    # Define seven outlier  tools detectionto be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
    }
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1
        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
コード例 #15
0
def create_tunable_ensemble(knn_neighbors, lof_neighbors, abod_neighbors):
    model_list = []
    for knn_neighbor in knn_neighbors:
        for lof_neighbor in lof_neighbors:
            for abod_neighbor in abod_neighbors:
                element = {
                    "model": SimpleDetectorAggregator,
                    "supervised": False,
                    "parameters": {
                        "method": "average",
                        "base_estimators": [
                            KNN(n_neighbors=knn_neighbor),
                            LOF(n_neighbors=lof_neighbor),
                            ABOD(n_neighbors=abod_neighbor),
                            OCSVM()
                        ],
                    }
                }
                model_list.append(element)

    return model_list
コード例 #16
0
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE':
        AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        VAE(encoder_neurons=nnet[:5],
            decoder_neurons=nnet[4:],
            contamination=0.1,
            epochs=13),
        'ABOD':
        ABOD(),
        'FeatureBagging':
        FeatureBagging(),
        'HBOS':
        HBOS(),
        'IForest':
        IForest(),
        'KNN':
        KNN(),
        'LOF':
        LOF(),
        'OCSVM':
        OCSVM(),
        'PCA':
        PCA(),
        'SOS':
        SOS(),
        'COF':
        COF(),
        'CBLOF':
        CBLOF(),
        'SOD':
        SOD(),
        'LOCI':
        LOCI(),
        'MCD':
        MCD()
    }
    return clfs[model]
コード例 #17
0
def create_ensemble_models(knn_methods, ensemble_combinations, pca):
    model_list = []

    for ensemble_combination in ensemble_combinations:
        for i in range(1, pca + 1):
            for j in range(1, pca + 1):
                for method in knn_methods:
                    element = {
                        "model": SimpleDetectorAggregator,
                        "supervised": False,
                        "parameters": {
                            "method": ensemble_combination,
                            "base_estimators": [
                                KNN(n_neighbors=i, method=method),
                                LOF(n_neighbors=j),
                                ABOD(),
                                OCSVM()
                            ],
                        }
                    }
                    model_list.append(element)
    return model_list
    rootDir = os.path.abspath(datasets_path)
    # specify the random state
    rs = 10
    # Save how to run the models

    detector_list = [LOF(), LOF()]

    models = [
        # BRM github
        (brminer.BRM(), 'BRM'),
        # ocSVM sklearn
        (OneClassSVM(gamma='auto'), 'ocSVM'),
        # COF pyod
        (COF(contamination=0.1, n_neighbors=20), 'COF'),
        # ABOD pyod
        (ABOD(contamination=0.1, n_neighbors=5, method='fast'), 'ABOD'),
        # MO_GAAL pyod
        (MO_GAAL(k=10,
                 stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'MO_GAAL'),
        # SO_GAAL pyod
        (SO_GAAL(stop_epochs=20,
                 lr_d=0.01,
                 lr_g=0.0001,
                 decay=1e-06,
                 momentum=0.9,
                 contamination=0.1), 'SO_GAAL'),
コード例 #19
0
    def initialise_pyod_classifiers(self, outlier_fraction):
        #Testing every query to every class and then predicting only if it belongs to the same class
        classifiers = {}
        #Proximity based
        classifiers['K Nearest Neighbors (KNN)'] = []
        classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
        classifiers['Median K Nearest Neighbors (MedKNN)'] = []
        classifiers['Local Outlier Factor (LOF)'] = []
        classifiers['Connectivity-Based Outlier Factor (COF)'] = []
        #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
        classifiers['LOCI'] = []
        #classifiers['Histogram-based Outlier Score (HBOS)'] = []
        classifiers['Subspace Outlier Detection (SOD)'] = []
        #Linear models
        classifiers['Principal Component Analysis (PCA)'] = []
        #classifiers['Minimum Covariance Determinant (MCD)'] = []           #To slow
        classifiers['One-Class Support Vector Machines (OCSVM)'] = []
        classifiers['Deviation-based Outlier Detection (LMDD)'] = []
        #Probabilistic
        classifiers['Angle-Based Outlier Detection (ABOD)'] = []
        classifiers['Stochastic Outlier Selection (SOS)'] = []
        #Outlier Ensembles
        classifiers['Isolation Forest (IForest)'] = []
        classifiers['Feature Bagging'] = []
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

        for i in range(self.k_way):
            for i in range(self.k_way):
                classifiers['K Nearest Neighbors (KNN)'].append(
                    KNN(method='largest',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
                    KNN(method='mean',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Median K Nearest Neighbors (MedKNN)'].append(
                    KNN(method='median',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Local Outlier Factor (LOF)'].append(
                    LOF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Connectivity-Based Outlier Factor (COF)'].append(
                    COF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['LOCI'].append(
                    LOCI(contamination=outlier_fraction))
                classifiers['Subspace Outlier Detection (SOD)'].append(
                    SOD(n_neighbors=int(self.n_shot / 3) + 2,
                        contamination=outlier_fraction,
                        ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
                classifiers['Principal Component Analysis (PCA)'].append(
                    PCA(contamination=outlier_fraction))
                classifiers[
                    'One-Class Support Vector Machines (OCSVM)'].append(
                        OCSVM(contamination=outlier_fraction))
                classifiers['Deviation-based Outlier Detection (LMDD)'].append(
                    LMDD(contamination=outlier_fraction))
                classifiers['Angle-Based Outlier Detection (ABOD)'].append(
                    ABOD(contamination=outlier_fraction))
                classifiers['Stochastic Outlier Selection (SOS)'].append(
                    SOS(contamination=outlier_fraction))
                classifiers['Isolation Forest (IForest)'].append(
                    IForest(contamination=outlier_fraction))
                classifiers['Feature Bagging'].append(
                    FeatureBagging(contamination=outlier_fraction))
                classifiers[
                    'Lightweight On-line Detector of Anomalies (LODA)'].append(
                        LODA(contamination=outlier_fraction))
        self.num_different_models = len(classifiers)
        return classifiers
コード例 #20
0
ground_truth = np.zeros(n_samples, dtype=int)
ground_truth[-n_outliers:] = 1

# Show the statics of the data
print('Number of inliers: %i' % n_inliers)
print('Number of outliers: %i' % n_outliers)
print(
    'Ground truth shape is {shape}. Outlier are 1 and inliers are 0.\n'.format(
        shape=ground_truth.shape))
print(ground_truth)

random_state = np.random.RandomState(42)
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(n_neighbors=10, contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(contamination=outliers_fraction,
          check_estimator=False,
          random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35),
                   contamination=outliers_fraction,
                   check_estimator=False,
                   random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(contamination=outliers_fraction),
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
コード例 #21
0
ファイル: skin.py プロジェクト: BigBossYan/dataming5
# df.plot.scatter('original.label', 'R')

scaler = MinMaxScaler(feature_range=(0, 1))
df[['R', 'G']] = scaler.fit_transform(df[['R', 'G']])
df[['R', 'G']].head()

X1 = df['R'].values.reshape(-1, 1)
X2 = df['G'].values.reshape(-1, 1)
X = np.concatenate((X1, X2), axis=1)

random_state = np.random.RandomState(42)
outliers_fraction = 0.05
# Define seven outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(contamination=outliers_fraction,
          check_estimator=False,
          random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35),
                   contamination=outliers_fraction,
                   check_estimator=False,
                   random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(contamination=outliers_fraction),
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
コード例 #22
0
ファイル: abod_example.py プロジェクト: Lightge/Pyod
from pyod.utils.data import generate_data
from pyod.utils.data import evaluate_print
from pyod.utils.data import visualize

if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train, n_test=n_test, contamination=contamination)

    # train ABOD detector
    clf_name = 'ABOD'
    clf = ABOD()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier s`cores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
コード例 #23
0
mat_file = 'cardio.mat'
mat_file_name = mat_file.replace('.mat', '')
print("\n... Processing", mat_file_name, '...')
mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

X = mat['X']
y = mat['y']

X = StandardScaler().fit_transform(X)

# load the pre-trained model cost predictor
clf = load('rf_predictor.joblib')

classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

clfs = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        size=n_estimators_total)
clfs_real = []
コード例 #24
0
    roc_mat = np.zeros([n_ite, n_classifiers])
    prn_mat = np.zeros([n_ite, n_classifiers])
    time_mat = np.zeros([n_ite, n_classifiers])

    # Apagar o número 0 sendo passado como parâmetro do RandomState
    random_state = np.random.RandomState(0)

    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    if sys.argv[1] == 'abod':
        classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction)}
        classifiers_indices = {
            'Angle-based Outlier Detector (ABOD)': 0}
    elif sys.argv[1] == 'cblof':
        classifiers = {'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state)}
        classifiers_indices = {'Cluster-based Local Outlier Factor': 0}
    elif sys.argv[1] == 'fb':
        classifiers = {'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state)}
        classifiers_indices = {'Feature Bagging': 0}
    elif sys.argv[1] == 'hbos':
        classifiers = {'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction)}
        classifiers_indices = {'Histogram-base Outlier Detection (HBOS)': 0}
    elif sys.argv[1] == 'iforest':
        classifiers = {'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state)}
        classifiers_indices = {'Isolation Forest': 0}
    elif sys.argv[1] == 'knn':
コード例 #25
0
def fun(dir_path):
    file_list = []
    total_roc = []
    total_prn = []
    count = 0
    for home, dirs, files in os.walk("./"+dir_path+"/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            file_list.append(fullname)cb
    for file_csv in file_list:

        # if count == 2:
        #     break
        
        df = pd.read_csv(file_csv)
        columns = df.columns
        # df = df[columns].fillna('nan')

        data = df.drop(columns = ['point.id', 'motherset', 'origin'])

        class_mapping = {"anomaly":1, "nominal":0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)
        class_mapping = {"anomaly":1, "nominal":0}

        y = data['ground.truth']

        x = data.drop('ground.truth',axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
                x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05
        # Define seven outlier detection tools to be compared
        classifiers = {
                'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
                'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                'Average KNN': KNN(method='mean',contamination=outliers_fraction)
        }
        p_prn = []
        p_roc = []
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            try:
                clf.fit(X_train)

                # get the prediction labels and outlier scores of the training data
                y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
                y_train_scores = clf.decision_scores_  # raw outlier scores

                # get the prediction on the test data
                y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
                y_test_scores = clf.decision_function(X_test)  # outlier scores

                # evaluate and print the results
                print(str(count)+"is analysing")
                print("\nOn Training Data:")
        
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)
                roc=np.round(roc_auc_score(y_train, y_train_scores), decimals=4),
                prn=np.round(precision_n_scores(y_test, y_test_scores), decimals=4)

                p_prn.append(prn)
                p_roc.append(roc[0])
            except:
                p_prn.append(-1)
                p_roc.append(-1)

        total_prn.append(p_prn)
        total_roc.append(p_roc)    
        count += 1
            
    total_prn = json.dumps(total_prn)
    total_roc = json.dumps(total_roc)
    a = open(dir_path+"_prn_list.txt", "w",encoding='UTF-8')
    a.write(total_prn)
    a.close()
    a = open(dir_path+"_roc_list.txt", "w",encoding='UTF-8')
    a.write(total_roc)
    a.close()
コード例 #26
0
ファイル: utility.py プロジェクト: hsw2012s/temp
def get_estimators(contamination):
    """Internal method to create a list of 600 random base outlier detectors.

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    Returns
    -------
    base_detectors : list
        A list of initialized random base outlier detectors.

    """
    BASE_ESTIMATORS = [
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        ABOD(n_neighbors=45, contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
    ]

    return BASE_ESTIMATORS
コード例 #27
0
ファイル: wine.py プロジェクト: Carmelo1997/DigAssignment
#
# ## 使用PyOD来做离群点检测,使用的是离群得分,即每个模型都会给每个数据点打分而非直接根据一个阈值判定某个点是否为离群点。

# ### 定义一个数组来保存下述所有的算法分类器

# In[1]:

clfs = []

# ## 1.1 Angle_Based Outlier Detection (ABOD)
# ### 它考虑了每个数据点和其邻居的关系,但是不考虑邻居之间的关系。ABOD在多维度数据上表现较好,PyOD提供了两种不同版本的ABOD,Fast ABOD:使用KNN来近似,Original ABOD:以高时间复杂度来考虑所有训练数据点,这里采用Original ABOD

# In[2]:

from pyod.models.abod import ABOD
clf_ABOD = ABOD()
clfs.append(["ABOD", clf_ABOD])

# ## 1.2 KNN 检测器
# ### 对于任意数据点,其到第k个邻居的距离可以作为其离群得分,PyOD提供三种不同的KNN检测器:Largest: 使用第k个邻居的距离来作为离群得分;Mean: 使用全部k个邻居的平均距离作为离群得分;Median:使用k个邻居的距离的中位数作为离群得分。这里采用Largest检测器。

# In[1]:

from pyod.models.knn import KNN
clf_KNN = KNN()
clfs.append(["KNN", clf_KNN])

# ## 1.3 Isolation Forest
# ### 内部使用sklearn,此方法中,使用一个集合的树来完成数据分区。Isolation Forest提供一个离群得分来判定一个数据点在结构中有多孤立。其离群得分用来将它与正常观测数据区分开来。Isolation Forest在多维数据上表现很好

# In[4]:
コード例 #28
0
ファイル: CompareAllModels.py プロジェクト: adir1002/ML_Logs
    LOF(n_neighbors=10),
    LOF(n_neighbors=15),
    LOF(n_neighbors=20),
    LOF(n_neighbors=25),
    LOF(n_neighbors=30),
    LOF(n_neighbors=35),
    LOF(n_neighbors=40),
    LOF(n_neighbors=45),
    LOF(n_neighbors=50)
]

random_state = 42
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(check_estimator=False, random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35), random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(),
    'Isolation Forest':
    IForest(random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(),
    'Average KNN':
    KNN(method='mean'),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
コード例 #29
0
ファイル: pyod_tools.py プロジェクト: chomiestyle/Unitti
def pyod_anomaly_detection(type, contamination):
    X_train, y_train, X_test, y_test = data(type=type,
                                            contamination=contamination)
    if type == 'MAD':
        # train MAD detector
        clf_name = 'MAD'
        clf = MAD(threshold=3.5)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores
        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        # making dimensions = 2 for visualising purpose only. By repeating same data each dimension.
        visualize(clf_name,
                  np.hstack((X_train, X_train)),
                  y_train,
                  np.hstack((X_test, X_test)),
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'ABOD':
        # train ABOD detector
        clf_name = 'ABOD'
        clf = ABOD()
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)

        # visualize the results
        visualize(clf_name,
                  X_train,
                  y_train,
                  X_test,
                  y_test,
                  y_train_pred,
                  y_test_pred,
                  show_figure=True,
                  save_figure=False)
    elif type == 'AutoEncoder':
        # train AutoEncoder detector
        clf_name = 'AutoEncoder'
        clf = AutoEncoder(epochs=30, contamination=contamination)
        clf.fit(X_train)

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # evaluate and print the results
        print("\nOn Training Data:")
        evaluate_print(clf_name, y_train, y_train_scores)
        print("\nOn Test Data:")
        evaluate_print(clf_name, y_test, y_test_scores)
コード例 #30
0
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.font_manager
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
# from pyod.models.cblof import CBLOF
from pyod.models.lof import LOF
from sklearn.utils import *

pd.set_option('display.max_column',100)
n_clusters=8
classifiers={
    'abod':ABOD(n_neighbors=15),
    'knn':KNN(),
    # 'cblof':CBLOF(n_clusters=n_clusters),
    'fg':FeatureBagging(),
    'hbos':HBOS(),
    'if':IForest(),
    'lof':LOF()
}
dict={'csvname':[],
      'roc_abod_train':[],
      'roc_abod_test':[],
      'prn_abod_train':[],
      'prn_abod_test':[],
      'roc_knn_train':[],
      'roc_knn_test':[],
      'prn_knn_train':[],