Example no. 1
def feature_bagging(X_train, X_test, Y_train, Y_test):
    import numpy as np
    from pyod.models.feature_bagging import FeatureBagging

    # unsupervised fit on the training features only
    model = FeatureBagging(random_state=1)
    model.fit(X_train)

    # binary test predictions (0: inlier, 1: outlier) scored against ground truth
    pred = model.predict(X_test)
    acc = np.sum(pred == Y_test) / X_test.shape[0]
    print(acc)
    return acc * 100
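A minimal, hypothetical driver for feature_bagging, using pyod's synthetic data helper with the same unpacking order as the test fixtures later in this collection:

from pyod.utils.data import generate_data

# 10% outliers; ground truth returned alongside the features
X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)

print(feature_bagging(X_train, X_test, y_train, y_test))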
Example no. 2
def main():

    scalers = ['no', 'std', 'minmax']
    root = 'Unsupervised_Anamaly_Detection_csv'
    start = 0
    counts = 90
    CPUS = 3
    CPUS_Models = 4
    sklearn_models = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    models = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    name = "30_Models"

    Parallel(n_jobs=CPUS) \
        (delayed(runByScaler)
         (root, scaler, models, start, counts,
          other_models=sklearn_models,
          CPUS=CPUS_Models,
          save_name=name)
         for scaler in scalers)
Example no. 3
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example no. 4
def getOulierFeatureBagging(dataset):
    '''
    @brief Function that runs the Feature Bagging algorithm on the dataset and obtains
    labels indicating which instance is an inlier (0) or an outlier (1)
    @param dataset Dataset on which to run the algorithm
    @return A list of labels: 0 means inlier, 1 means outlier
    '''
    # Initializing the model without verbose output
    fb = FeatureBagging(verbose=0)
    # Fits the data and obtains labels
    fb.fit(dataset)
    # Return labels
    return fb.labels_
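A hypothetical call for getOulierFeatureBagging, which accepts any 2-D feature array; here reusing pyod's generator:

from pyod.utils.data import generate_data

X_train, y_train, X_test, y_test = generate_data(
    n_train=200, n_test=100, contamination=0.1, random_state=42)
labels = getOulierFeatureBagging(X_train)
print('flagged as outliers:', int(labels.sum()))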
Example no. 5
def get_model_bagging(percentage_of_outliers=0.002,
                      num_estimators=2,
                      combination='max'):
    """Create a Feature Bagging model.

    Args:
        percentage_of_outliers: percentage of fraud in the data
        num_estimators: number of base estimators in the ensemble.
        combination: if 'average': take the average of all detectors
                     if 'max': take the maximum scores of all detectors

    Returns:
        model: Feature Bagging model
    """
    utils.save_log('{0} :: {1}'.format(
        get_model_bagging.__module__,
        get_model_bagging.__name__))

    model = FeatureBagging(contamination=percentage_of_outliers,
                           n_estimators=num_estimators,
                           combination=combination,
                           random_state=config.random_seed,
                           n_jobs=config.num_jobs)

    return model
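utils.save_log, config.random_seed and config.num_jobs are project-level helpers not shown here. Assuming they exist, the returned object behaves like any pyod detector; a sketch with a hypothetical feature matrix X:

model = get_model_bagging(percentage_of_outliers=0.002,
                          num_estimators=2,
                          combination='max')
model.fit(X)               # X: 2-D numeric array, unsupervised fit
labels = model.predict(X)  # 0 = inlier, 1 = outlier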
Example no. 6
 def model_init(self, model):
     """Model initialisation of a single model.
     """
     if self.model == 'pca':
         self.models[model] = PCA(contamination=self.contamination)
     elif self.model == 'loda':
         self.models[model] = LODA(contamination=self.contamination)
     elif self.model == 'iforest':
         self.models[model] = IForest(n_estimators=50,
                                      bootstrap=True,
                                      behaviour='new',
                                      contamination=self.contamination)
     elif self.model == 'cblof':
         self.models[model] = CBLOF(n_clusters=3,
                                    contamination=self.contamination)
     elif self.model == 'feature_bagging':
         self.models[model] = FeatureBagging(
             base_estimator=PCA(contamination=self.contamination),
             contamination=self.contamination)
     elif self.model == 'copod':
         self.models[model] = COPOD(contamination=self.contamination)
     elif self.model == 'hbos':
         self.models[model] = HBOS(contamination=self.contamination)
     else:
         self.models[model] = HBOS(contamination=self.contamination)
     self.custom_model_scalers[model] = MinMaxScaler()
Example no. 7
def define_classifiers(random_state, outliers_fraction):
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
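A minimal sketch of how the returned dictionary is usually consumed, assuming arrays X_train/X_test with ground-truth labels y_test; roc_auc_score comes from sklearn.metrics:

from sklearn.metrics import roc_auc_score

classifiers = define_classifiers(random_state=42, outliers_fraction=0.05)
for clf_name, clf in classifiers.items():
    clf.fit(X_train)                        # unsupervised fit
    scores = clf.decision_function(X_test)  # raw outlier scores
    print(clf_name, roc_auc_score(y_test, scores))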
Example no. 8
    def __load_classifiers(self):
        outliers_fraction = 0.05
        random_state = np.random.RandomState(0)

        classifiers = {
            'Cluster-based Local Outlier Factor (CBLOF)':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
            'Feature Bagging':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           random_state=random_state),
            'Histogram-base Outlier Detection (HBOS)':
            HBOS(contamination=outliers_fraction),
            'Isolation Forest':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
            'K Nearest Neighbors (KNN)':
            KNN(contamination=outliers_fraction),
            'Average KNN':
            KNN(method='mean', contamination=outliers_fraction),
            'Local Outlier Factor (LOF)':
            LOF(n_neighbors=35, contamination=outliers_fraction),
            'Minimum Covariance Determinant (MCD)':
            MCD(contamination=outliers_fraction, random_state=random_state),
            'One-class SVM (OCSVM)':
            OCSVM(contamination=outliers_fraction),
        }

        return classifiers
Example no. 9
def load_classifiers(outliers_fraction):
    outliers_fraction = min(0.5, outliers_fraction)
    random_state = np.random.RandomState(42)
    # Define nine outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction,
                random_state=random_state,
                behaviour="new"),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(n_neighbors=35, contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state)
    }
    return classifiers
Example no. 10
def outlier_detection(x_raw, y_raw):
    """
    Filter all outlier points
    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    # TODO Filter the outliers.
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)
    # all outlier detection method candidate list as follows
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(contamination=outliers_fraction,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Local Outlier Factor (LOF)':
        LOF(contamination=outliers_fraction),
        'Minimum Covariance Determinant (MCD)':
        MCD(contamination=outliers_fraction, random_state=random_state),
        'One-class SVM (OCSVM)':
        OCSVM(contamination=outliers_fraction),
        'Principal Component Analysis (PCA)':
        PCA(contamination=outliers_fraction, random_state=random_state),
        'Improving Supervised Outlier Detection with Unsupervised Representation Learning':
        XGBOD(contamination=outliers_fraction),
    }
    clf_name = 'Isolation Forest'
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    # clf_name = 'Angle-based Outlier Detector (ABOD)'
    # clf = ABOD(contamination=outliers_fraction, method='default')
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)
    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    idx_y_pred = [i for i in range(len(y_pred)) if y_pred[i] == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
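del_rowsorcolumns is an external helper that is not shown in this example. A minimal stand-in with the call shape used above, based on np.delete, might be:

import numpy as np

def del_rowsorcolumns(arr, idx, axis):
    # drop the rows (axis=0) or columns (axis=1) at the given indices
    return np.delete(arr, idx, axis=axis)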
Example no. 11
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)
Example no. 12
 def models_init(self):
     """Models initialisation.
     """
     self.model = self.configuration.get('model', 'pca')
     if self.model == 'pca':
         self.models = {
             model: PCA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'loda':
         self.models = {
             model: LODA(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'iforest':
         self.models = {
             model: IForest(n_estimators=50,
                            bootstrap=True,
                            behaviour='new',
                            contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'cblof':
         self.models = {
             model: CBLOF(n_clusters=3, contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'feature_bagging':
         self.models = {
             model: FeatureBagging(
                 base_estimator=PCA(contamination=self.contamination),
                 contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'copod':
         self.models = {
             model: COPOD(contamination=self.contamination)
             for model in self.models_in_scope
         }
     elif self.model == 'hbos':
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     else:
         self.models = {
             model: HBOS(contamination=self.contamination)
             for model in self.models_in_scope
         }
     self.custom_model_scalers = {
         model: MinMaxScaler()
         for model in self.models_in_scope
     }
Example no. 13
def out_lier_score(df, target, num_var):

    scaler = MinMaxScaler(feature_range=(0, 1))
    df = scaler.fit_transform(df.loc[:, num_var])
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05

    X = df
    df_out_score = []
    # Define seven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
    }
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1
        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        df_out_score.append(y_pred.tolist())

    df_out_score = pd.DataFrame(df_out_score).T
    df_out_score.columns = list(classifiers.keys())
    return df_out_score
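Each column of the returned frame holds one detector's 0/1 predictions, so a simple majority vote can combine them. A hedged sketch, with hypothetical column names:

votes = out_lier_score(df, target='label', num_var=['f1', 'f2', 'f3'])
majority = (votes.sum(axis=1) > len(votes.columns) / 2).astype(int)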
Example no. 14
def choose_model(model, nnet):
    """ among implemented in PyOD """
    clfs = {
        'AE':
        AutoEncoder(hidden_neurons=nnet, contamination=0.1, epochs=15),
        'VAE':
        VAE(encoder_neurons=nnet[:5],
            decoder_neurons=nnet[4:],
            contamination=0.1,
            epochs=13),
        'ABOD':
        ABOD(),
        'FeatureBagging':
        FeatureBagging(),
        'HBOS':
        HBOS(),
        'IForest':
        IForest(),
        'KNN':
        KNN(),
        'LOF':
        LOF(),
        'OCSVM':
        OCSVM(),
        'PCA':
        PCA(),
        'SOS':
        SOS(),
        'COF':
        COF(),
        'CBLOF':
        CBLOF(),
        'SOD':
        SOD(),
        'LOCI':
        LOCI(),
        'MCD':
        MCD()
    }
    return clfs[model]
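choose_model only instantiates a detector; fitting is left to the caller. Note that the whole clfs dictionary is built eagerly, so the neural models (AE/VAE) are constructed even when another key is requested. A hypothetical call, with nnet sized so the encoder slice nnet[:5] and decoder slice nnet[4:] mirror each other:

nnet = [64, 32, 16, 8, 4, 8, 16, 32, 64]
clf = choose_model('AE', nnet)
clf.fit(X_train)  # X_train assumed to be a 2-D float array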
Example no. 15
def fun(dir_path):
    file_list = []
    total_roc = []
    total_prn = []
    count = 0
    for home, dirs, files in os.walk("./"+dir_path+"/benchmarks"):
        for filename in files:
            fullname = os.path.join(home, filename)
            file_list.append(fullname)
    for file_csv in file_list:

        # if count == 2:
        #     break
        
        df = pd.read_csv(file_csv)
        columns = df.columns
        # df = df[columns].fillna('nan')

        data = df.drop(columns = ['point.id', 'motherset', 'origin'])

        class_mapping = {"anomaly": 1, "nominal": 0}
        data['ground.truth'] = data['ground.truth'].map(class_mapping)

        y = data['ground.truth']

        x = data.drop('ground.truth',axis=1)

        X_train, X_test, y_train, y_test = train_test_split(
                x, y, test_size=0.2, random_state=28)

        random_state = np.random.RandomState(42)
        outliers_fraction = 0.05
        # Define seven outlier detection tools to be compared
        classifiers = {
                'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction),
                'Cluster-based Local Outlier Factor (CBLOF)':CBLOF(contamination=outliers_fraction,check_estimator=False, random_state=random_state),
                'Feature Bagging':FeatureBagging(LOF(n_neighbors=35),contamination=outliers_fraction,check_estimator=False,random_state=random_state),
                'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction),
                'Isolation Forest': IForest(contamination=outliers_fraction,random_state=random_state),
                'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction),
                'Average KNN': KNN(method='mean',contamination=outliers_fraction)
        }
        p_prn = []
        p_roc = []
        for i, (clf_name, clf) in enumerate(classifiers.items()):
            try:
                clf.fit(X_train)

                # get the prediction labels and outlier scores of the training data
                y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
                y_train_scores = clf.decision_scores_  # raw outlier scores

                # get the prediction on the test data
                y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
                y_test_scores = clf.decision_function(X_test)  # outlier scores

                # evaluate and print the results
                print(str(count) + " is being analysed")
                print("\nOn Training Data:")
        
                evaluate_print(clf_name, y_train, y_train_scores)
                print("\nOn Test Data:")
                evaluate_print(clf_name, y_test, y_test_scores)
                roc = np.round(roc_auc_score(y_train, y_train_scores), decimals=4)
                prn = np.round(precision_n_scores(y_test, y_test_scores), decimals=4)

                p_prn.append(prn)
                p_roc.append(roc)
            except Exception:
                p_prn.append(-1)
                p_roc.append(-1)

        total_prn.append(p_prn)
        total_roc.append(p_roc)    
        count += 1
            
    total_prn = json.dumps(total_prn)
    total_roc = json.dumps(total_roc)
    a = open(dir_path+"_prn_list.txt", "w",encoding='UTF-8')
    a.write(total_prn)
    a.close()
    a = open(dir_path+"_roc_list.txt", "w",encoding='UTF-8')
    a.write(total_roc)
    a.close()
Example no. 16
            X_train, X_test, y_train, y_test = \
                train_test_split(X, y, test_size=0.4, random_state=random_state)

            # standardizing data for processing
            X_train_norm, X_test_norm = standardizer(X_train, X_test)

            classifiers = {
                'Angle-based Outlier Detector (ABOD)':
                ABOD(contamination=outliers_fraction),
                'Cluster-based Local Outlier Factor':
                CBLOF(n_clusters=10,
                      contamination=outliers_fraction,
                      check_estimator=False,
                      random_state=random_state),
                'Feature Bagging':
                FeatureBagging(contamination=outliers_fraction,
                               random_state=random_state),
                'Histogram-base Outlier Detection (HBOS)':
                HBOS(contamination=outliers_fraction),
                'Isolation Forest':
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
                'K Nearest Neighbors (KNN)':
                KNN(contamination=outliers_fraction),
                'Local Outlier Factor (LOF)':
                LOF(contamination=outliers_fraction),
                'Minimum Covariance Determinant (MCD)':
                MCD(contamination=outliers_fraction,
                    random_state=random_state),
                'One-class SVM (OCSVM)':
                OCSVM(contamination=outliers_fraction),
                'Principal Component Analysis (PCA)':
Example no. 17
class TestFeatureBagging(unittest.TestCase):
    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'estimators_')
                and self.clf.estimators_ is not None)
        assert (hasattr(self.clf, 'estimators_features_')
                and self.clf.estimators_features_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
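Since the suite follows the stock unittest layout, the standard entry point is enough to run it directly; a minimal sketch:

if __name__ == '__main__':
    unittest.main()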
Example no. 18
    # 60% data for training and 40% for testing
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.4, random_state=random_state)

    # standardizing data for processing
    X_train_norm, X_test_norm = standardizer(X_train, X_test)

    if sys.argv[1] == 'abod':
        classifiers = {'Angle-based Outlier Detector (ABOD)': ABOD(contamination=outliers_fraction)}
        classifiers_indices = {
            'Angle-based Outlier Detector (ABOD)': 0}
    elif sys.argv[1] == 'cblof':
        classifiers = {'Cluster-based Local Outlier Factor': CBLOF(contamination=outliers_fraction, check_estimator=False, random_state=random_state)}
        classifiers_indices = {'Cluster-based Local Outlier Factor': 0}
    elif sys.argv[1] == 'fb':
        classifiers = {'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state)}
        classifiers_indices = {'Feature Bagging': 0}
    elif sys.argv[1] == 'hbos':
        classifiers = {'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction)}
        classifiers_indices = {'Histogram-base Outlier Detection (HBOS)': 0}
    elif sys.argv[1] == 'iforest':
        classifiers = {'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state)}
        classifiers_indices = {'Isolation Forest': 0}
    elif sys.argv[1] == 'knn':
        classifiers = {'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction)}
        classifiers_indices = {'K Nearest Neighbors (KNN)': 0}
    elif sys.argv[1] == 'lof':
        classifiers = {'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction)}
        classifiers_indices = {'Local Outlier Factor (LOF)': 0}
    elif sys.argv[1] == 'mcd':
        classifiers = {'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state)}
Example no. 19
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train FeatureBagging detector
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example no. 20
    LOF(n_neighbors=30),
    LOF(n_neighbors=35),
    LOF(n_neighbors=40),
    LOF(n_neighbors=45),
    LOF(n_neighbors=50)
]

random_state = 42
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(check_estimator=False, random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35), random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(),
    'Isolation Forest':
    IForest(random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(),
    'Average KNN':
    KNN(method='mean'),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
Example no. 21
    'Ground truth shape is {shape}. Outliers are 1 and inliers are 0.\n'.format(
        shape=ground_truth.shape))
print(ground_truth, '\n')

random_state = np.random.RandomState(42)
# Define nine outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(contamination=outliers_fraction,
          check_estimator=False,
          random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35),
                   contamination=outliers_fraction,
                   random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(contamination=outliers_fraction),
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
Example no. 22
    def analysis():
        roc_df = pd.DataFrame(columns=df_columns)
        prn_df = pd.DataFrame(columns=df_columns)
        count = 0

        for doc in fileList:
            print(doc)
            df = pd.read_csv(doc, encoding='utf-8')
            # x =df.loc[:,('V1','V2','V3','V4','V5','V6','V7')]
            x = df.loc[:, ('R', 'G', 'B')]
            # x=df.iloc[:,6:57]
            y = df.loc[:, 'original.label']
            roc_list = [count, doc]
            count = count + 1
            roc_mat = np.zeros(6)
            # set the fraction of outlier data
            random_state = np.random.RandomState(42)
            outliers_fraction = 0.02
            # define the 6 outlier detection models used below
            classifiers = {
                "Feature Bagging":
                FeatureBagging(LOF(n_neighbors=35),
                               contamination=outliers_fraction,
                               check_estimator=False,
                               random_state=random_state),
                "Isolation Forest":
                IForest(contamination=outliers_fraction,
                        random_state=random_state),
                "KNN":
                KNN(contamination=outliers_fraction),
                'Local Outlier Factor':
                LOF(contamination=outliers_fraction),
                'One-class SVM':
                OCSVM(contamination=outliers_fraction),
                'Principal Component Analysis':
                PCA(contamination=outliers_fraction,
                    random_state=random_state),
            }
            classifiers_indices = {
                'Feature Bagging': 0,
                'Isolation Forest': 1,
                'KNN': 2,
                'Local Outlier Factor': 3,
                'One-class SVM': 4,
                'Principal Component Analysis': 5,
            }
            # 60% data for training and 40% for testing
            X_train, X_test, y_train, y_test = \
                train_test_split(x, y, test_size=0.4, random_state=random_state)

            # standardizing data for processing
            X_train_norm, X_test_norm = standardizer(X_train, X_test)
            for i, (clf_name, clf) in enumerate(classifiers.items()):
                clf.fit(X_train_norm)  # unsupervised fit; labels are ignored
                # predict outlier scores on the held-out split
                scores_pred = clf.decision_function(X_test_norm)
                try:
                    roc = round(roc_auc_score(y_test, scores_pred), ndigits=4)
                    roc_mat[classifiers_indices[clf_name]] = roc
                except ValueError:
                    continue
            roc_list = roc_list + roc_mat.tolist()
            temp_df = pd.DataFrame(roc_list).transpose()
            temp_df.columns = [
                'Data', 'dir', 'FB', 'IForest', 'Average KNN', 'LOF', 'OCSVM',
                'PCA'
            ]
            roc_df = pd.concat([roc_df, temp_df], axis=0)

            roc_df.to_csv("roc.csv", index=False, float_format="%.3f")
Example no. 23
mat_file_name = mat_file.replace('.mat', '')
print("\n... Processing", mat_file_name, '...')
mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

X = mat['X']
y = mat['y']

X = StandardScaler().fit_transform(X)

# load the pre-trained model cost predictor
clf = load('rf_predictor.joblib')

classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

clfs = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        size=n_estimators_total)
clfs_real = []

for estimator in clfs:
Example no. 24
def bivariate_outliers(df, method, x_col, y_col, outliers_fraction, visualize):
    dfx = df.loc[:, [x_col, y_col]]
    scaler = MinMaxScaler(feature_range=(0, 1))
    dfx.loc[:, [x_col, y_col]] = scaler.fit_transform(dfx.loc[:,
                                                              [x_col, y_col]])

    X1 = dfx[x_col].values.reshape(-1, 1)
    X2 = dfx[y_col].values.reshape(-1, 1)

    X = np.concatenate((X1, X2), axis=1)

    random_state = np.random.RandomState(42)

    classifiers_name = {
        'IForest': 'Isolation Forest',
        'CBLOF': 'Cluster-based Local Outlier Factor (CBLOF)',
        'ABOD': 'Angle-based Outlier Detector (ABOD)',
        'Feature Bagging': 'Feature Bagging',
        'HBOS': 'Histogram-base Outlier Detection (HBOS)',
        'KNN': 'K Nearest Neighbors (KNN)',
        'AvgKNN': 'Average KNN'
    }

    # Seven outlier detection tools to be used
    classifiers = {
        'Isolation Forest':
        IForest(behaviour='new',
                contamination=outliers_fraction,
                random_state=random_state),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
    }

    clf = classifiers[classifiers_name[method]]
    clf.fit(X)

    # prediction of a dfpoint category outlier or inlier
    y_pred = clf.predict(X)

    if not visualize:
        df[x_col] = y_pred.tolist()
        return df
    else:
        xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)
        plt.figure(figsize=(16, 8))

        # copy of dfframe
        dfx['outlier'] = y_pred.tolist()

        # IX1 - inlier feature 1,  IX2 - inlier feature 2
        IX1 = np.array(dfx[x_col][dfx['outlier'] == 0]).reshape(-1, 1)
        IX2 = np.array(dfx[y_col][dfx['outlier'] == 0]).reshape(-1, 1)

        # OX1 - outlier feature 1, OX2 - outlier feature 2
        OX1 = dfx[x_col][dfx['outlier'] == 1].values.reshape(-1, 1)
        OX2 = dfx[y_col][dfx['outlier'] == 1].values.reshape(-1, 1)

        print('OUTLIERS: ', n_outliers, ',', 'INLIERS: ', n_inliers, ',',
              'Detection Method:', classifiers_name[method])

        # threshold value to consider a dfpoint inlier or outlier
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)

        # decision function calculates the raw anomaly score for every point
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)

        # fill blue map colormap from minimum anomaly score to threshold value
        plt.contourf(xx,
                     yy,
                     Z,
                     levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)

        # draw red contour line where anomaly score is equal to threshold
        a = plt.contour(xx,
                        yy,
                        Z,
                        levels=[threshold],
                        linewidths=2,
                        colors='red')

        # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

        b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')

        c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')

        plt.axis('tight')

        # let matplotlib pick the least-crowded corner for the legend
        plt.legend([a.collections[0], b, c],
                   ['learned decision function', 'inliers', 'outliers'],
                   prop=matplotlib.font_manager.FontProperties(size=16),
                   loc='best')

        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.title(method, fontsize=20)
        plt.xlabel(x_col, fontsize=16)
        plt.ylabel(y_col, fontsize=16)
        plt.show()
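A hypothetical invocation of bivariate_outliers, assuming a DataFrame df with numeric columns 'x1' and 'x2'. Note that the non-visual branch overwrites x_col in the returned frame with the 0/1 predictions, which is worth keeping in mind when reusing the result:

flagged = bivariate_outliers(df, method='KNN', x_col='x1', y_col='x2',
                             outliers_fraction=0.05, visualize=False)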
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train Feature Bagging detector
    clf_name = 'FeatureBagging'
    clf = FeatureBagging()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
Example no. 26
    def initialise_pyod_classifiers(self, outlier_fraction):
        # Test every query against every class, predicting only if it belongs to the same class
        classifiers = {}
        #Proximity based
        classifiers['K Nearest Neighbors (KNN)'] = []
        classifiers['Average K Nearest Neighbors (AvgKNN)'] = []
        classifiers['Median K Nearest Neighbors (MedKNN)'] = []
        classifiers['Local Outlier Factor (LOF)'] = []
        classifiers['Connectivity-Based Outlier Factor (COF)'] = []
        #classifiers['Clustering-Based Local Outlier Factor (CBLOF)'] = []
        classifiers['LOCI'] = []
        #classifiers['Histogram-based Outlier Score (HBOS)'] = []
        classifiers['Subspace Outlier Detection (SOD)'] = []
        #Linear models
        classifiers['Principal Component Analysis (PCA)'] = []
        #classifiers['Minimum Covariance Determinant (MCD)'] = []           # too slow
        classifiers['One-Class Support Vector Machines (OCSVM)'] = []
        classifiers['Deviation-based Outlier Detection (LMDD)'] = []
        #Probabilistic
        classifiers['Angle-Based Outlier Detection (ABOD)'] = []
        classifiers['Stochastic Outlier Selection (SOS)'] = []
        #Outlier Ensembles
        classifiers['Isolation Forest (IForest)'] = []
        classifiers['Feature Bagging'] = []
        classifiers['Lightweight On-line Detector of Anomalies (LODA)'] = []

        for i in range(self.k_way):
                classifiers['K Nearest Neighbors (KNN)'].append(
                    KNN(method='largest',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Average K Nearest Neighbors (AvgKNN)'].append(
                    KNN(method='mean',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Median K Nearest Neighbors (MedKNN)'].append(
                    KNN(method='median',
                        n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Local Outlier Factor (LOF)'].append(
                    LOF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['Connectivity-Based Outlier Factor (COF)'].append(
                    COF(n_neighbors=int(self.n_shot / 3) + 1,
                        contamination=outlier_fraction))
                classifiers['LOCI'].append(
                    LOCI(contamination=outlier_fraction))
                classifiers['Subspace Outlier Detection (SOD)'].append(
                    SOD(n_neighbors=int(self.n_shot / 3) + 2,
                        contamination=outlier_fraction,
                        ref_set=max(2, int((int(self.n_shot / 3) + 2) / 3))))
                classifiers['Principal Component Analysis (PCA)'].append(
                    PCA(contamination=outlier_fraction))
                classifiers[
                    'One-Class Support Vector Machines (OCSVM)'].append(
                        OCSVM(contamination=outlier_fraction))
                classifiers['Deviation-based Outlier Detection (LMDD)'].append(
                    LMDD(contamination=outlier_fraction))
                classifiers['Angle-Based Outlier Detection (ABOD)'].append(
                    ABOD(contamination=outlier_fraction))
                classifiers['Stochastic Outlier Selection (SOS)'].append(
                    SOS(contamination=outlier_fraction))
                classifiers['Isolation Forest (IForest)'].append(
                    IForest(contamination=outlier_fraction))
                classifiers['Feature Bagging'].append(
                    FeatureBagging(contamination=outlier_fraction))
                classifiers[
                    'Lightweight On-line Detector of Anomalies (LODA)'].append(
                        LODA(contamination=outlier_fraction))
        self.num_different_models = len(classifiers)
        return classifiers
Example no. 27
class TestFeatureBagging(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = FeatureBagging(contamination=self.contamination)
        self.clf.fit(self.X_train)

    # TODO: fails because sklearn's estimator check uses 2-feature examples.
    # def test_sklearn_estimator(self):
    #     check_estimator(self.clf)

    def test_parameters(self):
        assert_true(hasattr(self.clf, 'decision_scores_') and
                    self.clf.decision_scores_ is not None)
        assert_true(hasattr(self.clf, 'labels_') and
                    self.clf.labels_ is not None)
        assert_true(hasattr(self.clf, 'threshold_') and
                    self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and
                    self.clf._mu is not None)
        assert_true(hasattr(self.clf, '_sigma') and
                    self.clf._sigma is not None)
        assert_true(hasattr(self.clf, 'estimators_') and
                    self.clf.estimators_ is not None)
        assert_true(hasattr(self.clf, 'estimators_features_') and
                    self.clf.estimators_features_ is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example no. 28
def main():

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Example no. 29
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import matplotlib.font_manager as mfm
from sklearn.metrics import accuracy_score,recall_score
# set 10% of the data as outliers
random_state = np.random.RandomState(42)
outliers_fraction = 0.1
classifiers = {
        "FB": FeatureBagging(LOF(n_neighbors=35), contamination=outliers_fraction, check_estimator=False,
                                          random_state=random_state),
        "IForest": IForest(contamination=outliers_fraction, random_state=random_state),
        "Average KNN": KNN(contamination=outliers_fraction),
        'LOF': LOF(
            contamination=outliers_fraction),
        'OCSVM': OCSVM(contamination=outliers_fraction),
        'PCA': PCA(
            contamination=outliers_fraction, random_state=random_state),
    }
# read the roc and original files
path="D:\\BIT\\Course\\sjwj\\homework\\12\\abalone\\skin_roc.csv"
f=open(path,encoding='utf-8')
df=pd.read_csv(f)

dff_orignal = pd.read_csv('D:\\BIT\\Course\\sjwj\\homework\\12\\skin_benchmarks\\skin\\meta_data\\skin.original.csv',encoding='utf-8')
x_orignal = dff_orignal.loc[:, ('R', 'G', 'B')]
Example no. 30
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
# from pyod.models.cblof import CBLOF
from pyod.models.lof import LOF
from sklearn.utils import *

pd.set_option('display.max_columns', 100)
n_clusters=8
classifiers={
    'abod':ABOD(n_neighbors=15),
    'knn':KNN(),
    # 'cblof':CBLOF(n_clusters=n_clusters),
    'fg':FeatureBagging(),
    'hbos':HBOS(),
    'if':IForest(),
    'lof':LOF()
}
dict={'csvname':[],
      'roc_abod_train':[],
      'roc_abod_test':[],
      'prn_abod_train':[],
      'prn_abod_test':[],
      'roc_knn_train':[],
      'roc_knn_test':[],
      'prn_knn_train':[],
      'prn_knn_test':[],
      # 'roc_cblof_train':[],
      # 'roc_cblof_test':[],
Example no. 31
def plot_out_liers(df, cur_var, target):

    plt.scatter(df[cur_var], df[target])
    plt.show(block=False)
    plt.pause(5)
    plt.close()

    scaler = MinMaxScaler(feature_range=(0, 1))
    df[[cur_var, target]] = scaler.fit_transform(df[[cur_var, target]])

    X1 = df[cur_var].values.reshape(-1, 1)
    X2 = df[target].values.reshape(-1, 1)

    X = np.concatenate((X1, X2), axis=1)
    random_state = np.random.RandomState(42)
    outliers_fraction = 0.05
    # Define seven outlier detection tools to be compared
    classifiers = {
        'Angle-based Outlier Detector (ABOD)':
        ABOD(contamination=outliers_fraction),
        'Cluster-based Local Outlier Factor (CBLOF)':
        CBLOF(contamination=outliers_fraction,
              check_estimator=False,
              random_state=random_state),
        'Feature Bagging':
        FeatureBagging(LOF(n_neighbors=35),
                       contamination=outliers_fraction,
                       check_estimator=False,
                       random_state=random_state),
        'Histogram-base Outlier Detection (HBOS)':
        HBOS(contamination=outliers_fraction),
        'Isolation Forest':
        IForest(contamination=outliers_fraction, random_state=random_state),
        'K Nearest Neighbors (KNN)':
        KNN(contamination=outliers_fraction),
        'Average KNN':
        KNN(method='mean', contamination=outliers_fraction)
    }

    xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X)
        # predict raw anomaly score
        scores_pred = clf.decision_function(X) * -1

        # prediction of a datapoint category outlier or inlier
        y_pred = clf.predict(X)
        n_inliers = len(y_pred) - np.count_nonzero(y_pred)
        n_outliers = np.count_nonzero(y_pred == 1)
        plt.figure(figsize=(10, 10))

        # copy of dataframe
        dfx = df
        dfx['outlier'] = y_pred.tolist()

        # IX1 - inlier feature 1,  IX2 - inlier feature 2
        IX1 = np.array(dfx[cur_var][dfx['outlier'] == 0]).reshape(-1, 1)
        IX2 = np.array(dfx[target][dfx['outlier'] == 0]).reshape(-1, 1)

        # OX1 - outlier feature 1, OX2 - outlier feature 2
        OX1 = dfx[cur_var][dfx['outlier'] == 1].values.reshape(-1, 1)
        OX2 = dfx[target][dfx['outlier'] == 1].values.reshape(-1, 1)

        print('OUTLIERS : ', n_outliers, 'INLIERS : ', n_inliers, clf_name)

        # threshold value to consider a datapoint inlier or outlier
        threshold = stats.scoreatpercentile(scores_pred,
                                            100 * outliers_fraction)

        # decision function calculates the raw anomaly score for every point
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) * -1
        Z = Z.reshape(xx.shape)

        # fill blue map colormap from minimum anomaly score to threshold value
        plt.contourf(xx,
                     yy,
                     Z,
                     levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)

        # draw red contour line where anomaly score is equal to threshold
        a = plt.contour(xx,
                        yy,
                        Z,
                        levels=[threshold],
                        linewidths=2,
                        colors='red')

        # fill orange contour lines where range of anomaly score is from threshold to maximum anomaly score
        plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')

        b = plt.scatter(IX1, IX2, c='white', s=20, edgecolor='k')

        c = plt.scatter(OX1, OX2, c='black', s=20, edgecolor='k')

        plt.axis('tight')

        # loc=2 is used for the top left corner
        plt.legend([a.collections[0], b, c],
                   ['learned decision function', 'inliers', 'outliers'],
                   prop=matplotlib.font_manager.FontProperties(size=20),
                   loc=2)

        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.title(clf_name)
        plt.show(block=False)
        plt.pause(5)
        plt.close()
Example no. 32
X2 = df['G'].values.reshape(-1, 1)
X = np.concatenate((X1, X2), axis=1)

random_state = np.random.RandomState(42)
outliers_fraction = 0.05
# Define seven outlier detection tools to be compared
classifiers = {
    'Angle-based Outlier Detector (ABOD)':
    ABOD(contamination=outliers_fraction),
    'Cluster-based Local Outlier Factor (CBLOF)':
    CBLOF(contamination=outliers_fraction,
          check_estimator=False,
          random_state=random_state),
    'Feature Bagging':
    FeatureBagging(LOF(n_neighbors=35),
                   contamination=outliers_fraction,
                   check_estimator=False,
                   random_state=random_state),
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(contamination=outliers_fraction),
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction)
}

xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))

for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(X)
Example no. 33
    def outlier_detector(self,
                         clustered_data,
                         outliers_fraction=0.05,
                         method='Voting',
                         cluster_number=3):

        random_state = np.random.RandomState(42)
        outliers_df = pd.DataFrame()
        classifiers = {
            #Cluster-based Local Outlier Factor
            'CBLOF':
            CBLOF(contamination=outliers_fraction,
                  check_estimator=False,
                  random_state=random_state),
            #Feature Bagging
            'FB':
            FeatureBagging(LOF(n_neighbors=35),
                           contamination=outliers_fraction,
                           check_estimator=False,
                           random_state=random_state),
            #Histogram-base Outlier Detection
            'HBOS':
            HBOS(contamination=outliers_fraction),
            #Isolation Forest
            'IF':
            IForest(contamination=outliers_fraction,
                    random_state=random_state),
            #K Nearest Neighbors
            'KNN':
            KNN(contamination=outliers_fraction)
        }
        detectors_list = []
        for k in range(cluster_number):
            curr_cluster = clustered_data[clustered_data['Cluster'] == k]
            X_train = curr_cluster.drop(['consumer_id', 'Cluster'], axis=1)
            for i, (clf_name, clf) in enumerate(classifiers.items()):
                clf_pred = clf_name + '_Decision'
                clf.fit(X_train)
                if (method == 'Voting'):
                    if (clf_name == 'KNN'):  #just save KNN for inference
                        detectors_list.append(clf)
                elif (method != 'Voting'):
                    if (clf_name == method):
                        detectors_list.append(clf)
        # predict raw anomaly score
                scores_pred = clf.decision_function(X_train)
                scores_pred_df = pd.DataFrame(list(scores_pred),
                                              columns=[clf_name],
                                              index=curr_cluster.index.copy())
                curr_cluster = pd.concat([curr_cluster, scores_pred_df],
                                         axis=1)

                outliers_pred = clf.predict(X_train)
                outliers_pred_df = pd.DataFrame(
                    list(outliers_pred),
                    columns=[clf_pred],
                    index=curr_cluster.index.copy())
                curr_cluster = pd.concat([curr_cluster, outliers_pred_df],
                                         axis=1)

            outliers_df = pd.concat([outliers_df, curr_cluster])

        if (method == 'Voting'):
            outliers_df['Voting'] = outliers_df.filter(regex='Decision').sum(
                axis=1)
            outliers_df['bad_customer'] = 0
            outliers_df.loc[(outliers_df.Voting > len(classifiers) / 2),
                            'bad_customer'] = 1
        else:
            decision = method + '_Decision'
            outliers_df['bad_customer'] = outliers_df[decision]

        return outliers_df, detectors_list
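A hedged usage sketch for outlier_detector, assuming an instance detector of the enclosing class and a clustered_data frame that carries 'consumer_id' and 'Cluster' columns next to the numeric features:

outliers_df, detectors = detector.outlier_detector(clustered_data,
                                                   outliers_fraction=0.05,
                                                   method='Voting',
                                                   cluster_number=3)
flagged = outliers_df[outliers_df['bad_customer'] == 1]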