Example #1
0
def main():
    """Fit an MCD outlier detector on the CSV files found in ``./csv_files``.

    Steps performed:

    1. Read every regular file in ``./csv_files`` as CSV and concatenate them.
    2. Label-encode the nominal columns ``ip_flags``, ``tcp_udp_flags`` and
       ``payload``.
    3. Drop columns whose standard deviation is not positive and columns that
       are perfectly correlated (``|corr| == 1``) with an earlier column.
    4. Fit ``MCD`` on the remaining columns and store its predictions in a
       new ``label`` column.
    5. Write the labelled frame to ``./processed_csv/processed.csv`` and
       pickle the fitted model to ``model.sav``.
    """
    # Read all the csv files
    csvPath = "./csv_files"
    csvFiles = [f for f in listdir(csvPath) if isfile(join(csvPath, f))]

    dfs = []
    for cv in csvFiles:
        print("CSV Processing: " + cv)
        dfs.append(pd.read_csv(join(csvPath, cv), index_col=False))

    df = pd.concat(dfs, ignore_index=True)

    # Turn the nominal columns into integer codes
    nom_cols = ['ip_flags', 'tcp_udp_flags', 'payload']
    for c in nom_cols:
        le = LabelEncoder()
        df[c] = le.fit_transform(df[c])

    # Remove the cols with zero standard deviation
    df = df.loc[:, df.std() > 0.0]

    # Calculate the absolute correlation matrix
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix.
    # The builtin ``bool`` is used because the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in NumPy 1.24.
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find the feature columns that are perfectly correlated with an
    # earlier column (absolute correlation exactly equal to 1)
    to_drop = [column for column in upper.columns if any(upper[column] == 1)]

    df = df.drop(columns=to_drop)

    print(df.head())

    # Fit the first model
    clf = MCD().fit(df)

    df['label'] = clf.predict(df)
    print(df)

    totalNormal = len(df[df['label'] == 0])
    totalAnomalies = len(df[df['label'] == 1])
    print("Normal: " + str(totalNormal))
    print("Anomaly: " + str(totalAnomalies))
    # NOTE: this value is the share of rows predicted as label 0; no ground
    # truth is involved, so it is not an accuracy in the usual sense.
    print('Accuracy: ' +
          str(totalNormal / float(totalNormal + totalAnomalies)))
    df.to_csv('./processed_csv/' + 'processed.csv', index=False)

    # Save the model; the context manager guarantees the file is closed
    filename = 'model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)
Example #2
0
    def setUp(self):
        """Create a synthetic train/test split and fit the MCD detector."""
        # Fixture configuration shared by the test methods.
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        # generate_data hands back the four splits in this order.
        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        # Detector under test, fitted on the training split only.
        detector = MCD(contamination=self.contamination, random_state=42)
        self.clf = detector
        detector.fit(self.X_train)
def getOutlierMCD(dataset):
    '''
    @brief Fit a default-configured MCD detector on the dataset and report,
    for every instance, whether it was flagged as inlier (0) or outlier (1)
    @param dataset Dataset the detector is fitted on
    @return The labels_ attribute of the fitted detector
    (0 means inlier, 1 means outlier)
    '''
    # fit() hands back the fitted detector, so the labels can be read
    # straight off the call chain.
    return MCD().fit(dataset).labels_
Example #4
0
def define_classifiers(random_state, outliers_fraction):
    """Build the named set of outlier detectors to compare.

    Every detector receives ``outliers_fraction`` as its ``contamination``;
    the detectors that accept a ``random_state`` additionally receive the
    given one.

    :param random_state: random state forwarded to the seeded detectors
    :param outliers_fraction: contamination value forwarded to all detectors
    :return: dict mapping a display name to an unfitted detector
    """
    # Keyword sets shared by the constructors below.
    plain = {'contamination': outliers_fraction}
    seeded = dict(plain, random_state=random_state)

    classifiers = {}
    classifiers['Angle-based Outlier Detector (ABOD)'] = ABOD(**plain)
    classifiers['Cluster-based Local Outlier Factor'] = CBLOF(
        check_estimator=False, **seeded)
    classifiers['Feature Bagging'] = FeatureBagging(**seeded)
    classifiers['Histogram-base Outlier Detection (HBOS)'] = HBOS(**plain)
    classifiers['Isolation Forest'] = IForest(**seeded)
    classifiers['K Nearest Neighbors (KNN)'] = KNN(**plain)
    classifiers['Local Outlier Factor (LOF)'] = LOF(**plain)
    classifiers['Minimum Covariance Determinant (MCD)'] = MCD(**seeded)
    classifiers['One-class SVM (OCSVM)'] = OCSVM(**plain)
    classifiers['Principal Component Analysis (PCA)'] = PCA(**seeded)
    return classifiers
Example #5
0
def load_classifiers(outliers_fraction):
    """Build the named set of outlier detectors to compare.

    The contamination passed to every detector is ``outliers_fraction``
    capped at 0.5. The seeded detectors share one
    ``np.random.RandomState(42)`` instance.

    :param outliers_fraction: requested contamination (capped at 0.5)
    :return: dict mapping a display name to an unfitted detector
    """
    capped_fraction = min(0.5, outliers_fraction)
    rng = np.random.RandomState(42)

    # Keyword sets shared by the constructors below.
    plain = {'contamination': capped_fraction}
    seeded = dict(plain, random_state=rng)

    detectors = {}
    detectors['Angle-based Outlier Detector (ABOD)'] = ABOD(**plain)
    detectors['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
        check_estimator=False, **seeded)
    detectors['Feature Bagging'] = FeatureBagging(
        LOF(n_neighbors=35), **seeded)
    detectors['Histogram-base Outlier Detection (HBOS)'] = HBOS(**plain)
    detectors['Isolation Forest'] = IForest(behaviour="new", **seeded)
    detectors['K Nearest Neighbors (KNN)'] = KNN(**plain)
    detectors['Average KNN'] = KNN(method='mean', **plain)
    detectors['Local Outlier Factor (LOF)'] = LOF(n_neighbors=35, **plain)
    detectors['Minimum Covariance Determinant (MCD)'] = MCD(**seeded)
    detectors['One-class SVM (OCSVM)'] = OCSVM(**plain)
    detectors['Principal Component Analysis (PCA)'] = PCA(**seeded)
    return detectors
Example #6
0
    def __load_classifiers(self):
        """Build the named set of outlier detectors used by this object.

        All detectors use a contamination of 0.05; the seeded ones share a
        single ``np.random.RandomState(0)`` instance.

        :return: dict mapping a display name to an unfitted detector
        """
        rng = np.random.RandomState(0)

        # Keyword sets shared by the constructors below.
        plain = {'contamination': 0.05}
        seeded = dict(plain, random_state=rng)

        detectors = {}
        detectors['Cluster-based Local Outlier Factor (CBLOF)'] = CBLOF(
            check_estimator=False, **seeded)
        detectors['Feature Bagging'] = FeatureBagging(
            LOF(n_neighbors=35), **seeded)
        detectors['Histogram-base Outlier Detection (HBOS)'] = HBOS(**plain)
        detectors['Isolation Forest'] = IForest(**seeded)
        detectors['K Nearest Neighbors (KNN)'] = KNN(**plain)
        detectors['Average KNN'] = KNN(method='mean', **plain)
        detectors['Local Outlier Factor (LOF)'] = LOF(
            n_neighbors=35, **plain)
        detectors['Minimum Covariance Determinant (MCD)'] = MCD(**seeded)
        detectors['One-class SVM (OCSVM)'] = OCSVM(**plain)
        return detectors
Example #7
0
def main():
    """Run ``runByScaler`` once per scaler option, three jobs in parallel.

    Each job receives the dataset root folder, the scaler name, the full
    model dictionary, the ``start``/``counts`` values (0 and 90), the list of
    model names passed as ``other_models``, ``CPUS=4`` and the save name
    ``"30_Models"``.
    """
    # Every model handed to the runner, keyed by its short name.
    detectors = {
        'BRM': BRM(bootstrap_sample_percent=70),
        'GM': GaussianMixture(),
        'IF': IsolationForest(),
        'OCSVM': OneClassSVM(),
        'EE': EllipticEnvelope(),
        'AvgKNN': KNN(method='mean'),
        'LargestKNN': KNN(method='largest'),
        'MedKNN': KNN(method='median'),
        'PCA': PCA(),
        'COF': COF(),
        'LODA': LODA(),
        'LOF': LOF(),
        'HBOS': HBOS(),
        'MCD': MCD(),
        'AvgBagging': FeatureBagging(combination='average'),
        'MaxBagging': FeatureBagging(combination='max'),
        'CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'FactorAnalysis': FactorAnalysis(),
        'KernelDensity': KernelDensity(),
        'COPOD': COPOD(),
        'SOD': SOD(),
        'LSCPwithLODA': LSCP([LODA(), LODA()]),
        'AveLMDD': LMDD(dis_measure='aad'),
        'VarLMDD': LMDD(dis_measure='var'),
        'IqrLMDD': LMDD(dis_measure='iqr'),
        'SoGaal': SO_GAAL(),
        'MoGaal': MO_GAAL(),
        'VAE': VAE(encoder_neurons=[8, 4, 2]),
        'AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'OCKRA': m_OCKRA(),
    }

    # Names forwarded as ``other_models``.
    # NOTE(review): 'IForest' appears here but the dictionary key above is
    # 'IF' -- confirm how runByScaler matches these names.
    other_model_names = [
        'AvgKNN', 'LargestKNN', 'MedKNN', 'PCA', 'COF', 'LODA', 'LOF', 'HBOS',
        'MCD', 'AvgBagging', 'MaxBagging', 'IForest', 'CBLOF', 'COPOD', 'SOD',
        'LSCPwithLODA', 'AveLMDD', 'VarLMDD', 'IqrLMDD', 'SoGaal', 'MoGaal',
        'VAE', 'AutoEncoder'
    ]

    data_root = 'Unsupervised_Anamaly_Detection_csv'
    scaler_options = ['no', 'std', 'minmax']
    first_index, n_items = 0, 90
    outer_jobs, inner_cpus = 3, 4
    run_label = "30_Models"

    # One delayed call per scaler option, dispatched on the outer pool.
    pool = Parallel(n_jobs=outer_jobs)
    pool(
        delayed(runByScaler)(
            data_root, scaler, detectors, first_index, n_items,
            other_models=other_model_names,
            CPUS=inner_cpus,
            save_name=run_label,
        )
        for scaler in scaler_options
    )
Example #8
0
def outlier_detection(x_raw, y_raw):
    """
    Filter out the samples that an Isolation Forest flags as outliers.

    The detector is fitted on ``x_raw`` with a contamination of 0.04 and a
    ``np.random.RandomState(42)`` seed; rows predicted as outliers are then
    removed from both ``x_raw`` and ``y_raw``.

    :param x_raw: feature in ndarray
    :param y_raw: label in ndarray
    :return x_clean, y_clean: cleaned feature and label in ndarray
    """
    print()
    print("Detecting outliers...")
    print("Before outlier detection: {}".format(x_raw.shape))
    outliers_fraction = 0.04
    random_state = np.random.RandomState(42)

    # Only the Isolation Forest is used; the other candidate detectors that
    # used to be instantiated here were never referenced and were removed.
    clf = IForest(contamination=outliers_fraction, random_state=random_state)
    clf.fit(x_raw)
    y_pred = clf.predict(x_raw)

    # for pyod, 1 means outliers and 0 means inliers
    # for sklearn,  -1 means outliers and 1 means inliers
    # Iterate over the actual predictions instead of a hard-coded row count
    # so the function works for inputs of any length.
    idx_y_pred = [i for i, label in enumerate(y_pred) if label == 1]
    x_clean = del_rowsorcolumns(x_raw, idx_y_pred, axis=0)
    y_clean = del_rowsorcolumns(y_raw, idx_y_pred, axis=0)
    print("After outlier detection: {}".format(x_clean.shape))
    assert (x_clean.shape[0] == y_clean.shape[0])
    return x_clean, y_clean
Example #9
0
    def setUp(self):
        """Create a synthetic train/test split and fit the MCD detector."""
        # Fixture configuration shared by the test methods.
        self.n_train, self.n_test = 100, 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        # generate_data hands back the four splits in this order.
        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        # Detector under test, fitted on the training split only.
        detector = MCD(contamination=self.contamination, random_state=42)
        self.clf = detector
        detector.fit(self.X_train)
def choose_model(model, nnet):
    """Return a new, unfitted detector selected by its short name.

    Only the requested detector is instantiated. ``nnet`` is therefore only
    touched for the neural models ('AE' and 'VAE') and may be anything
    (e.g. ``None``) for the others.

    :param model: one of 'AE', 'VAE', 'ABOD', 'FeatureBagging', 'HBOS',
        'IForest', 'KNN', 'LOF', 'OCSVM', 'PCA', 'SOS', 'COF', 'CBLOF',
        'SOD', 'LOCI', 'MCD'
    :param nnet: layer sizes; passed whole as ``hidden_neurons`` for 'AE',
        split into ``nnet[:5]`` (encoder) and ``nnet[4:]`` (decoder) for 'VAE'
    :return: the freshly constructed detector
    :raises KeyError: if ``model`` is not one of the supported names
    """
    # Factories instead of instances: building all sixteen detectors just to
    # return one is wasteful and made every call depend on a valid ``nnet``.
    factories = {
        'AE':
        lambda: AutoEncoder(hidden_neurons=nnet, contamination=0.1,
                            epochs=15),
        'VAE':
        lambda: VAE(encoder_neurons=nnet[:5],
                    decoder_neurons=nnet[4:],
                    contamination=0.1,
                    epochs=13),
        'ABOD': lambda: ABOD(),
        'FeatureBagging': lambda: FeatureBagging(),
        'HBOS': lambda: HBOS(),
        'IForest': lambda: IForest(),
        'KNN': lambda: KNN(),
        'LOF': lambda: LOF(),
        'OCSVM': lambda: OCSVM(),
        'PCA': lambda: PCA(),
        'SOS': lambda: SOS(),
        'COF': lambda: COF(),
        'CBLOF': lambda: CBLOF(),
        'SOD': lambda: SOD(),
        'LOCI': lambda: LOCI(),
        'MCD': lambda: MCD(),
    }
    return factories[model]()
Example #11
0
def train(doc_list, dataset_name, clf_name, max_files=10):
    """Fit one detector per CSV file and report the averaged scores.

    For each of the first ``max_files`` paths in ``doc_list`` the file is
    read, the columns named by the module-level ``drop`` and ``ground_truth``
    lists are removed to form the features, and the ground-truth values are
    mapped through the module-level ``transfor`` lookup to form the labels.
    The detector is refitted on every file and scored on its own training
    scores (``decision_scores_``).

    :param doc_list: sequence of CSV file paths
    :param dataset_name: name used only in the printed summary
    :param clf_name: one of "PCA", "MCD", "LOF", "KNN", "LODA"
    :param max_files: maximum number of files to process (default 10, the
        previously hard-coded value); ``None`` processes all of them
    :return: tuple ``(mean ROC AUC, mean precision @ n)`` over the files
    :raises ValueError: if ``clf_name`` is not a supported detector name
    """
    if clf_name == "PCA":
        clf = PCA()
    elif clf_name == "MCD":
        clf = MCD()
    elif clf_name == "LOF":
        clf = LOF()
    elif clf_name == "KNN":
        clf = KNN()
    elif clf_name == "LODA":
        clf = LODA()
    else:
        # Previously an unknown name left ``clf`` unbound and failed later
        # with an UnboundLocalError.
        raise ValueError("Unsupported clf_name: {!r}".format(clf_name))

    model_roc = []
    model_prc = []
    # Slicing tolerates lists shorter than ``max_files`` instead of raising
    # an IndexError.
    for file_no, path in enumerate(doc_list[:max_files], start=1):
        data = pd.read_csv(path, header=0, index_col=0)
        train_x = data.drop(drop + ground_truth, axis=1).values
        train_y = np.array([
            transfor[x]
            for x in list(_flatten(data[ground_truth].values.tolist()))
        ])
        clf.fit(train_x)
        predict = clf.decision_scores_
        roc = roc_auc_score(train_y, predict)
        prc = precision_n_scores(train_y, predict)
        # Progress report every 200 files.
        if file_no % 200 == 0:
            print("第" + str(file_no) + "个文件结果:")
            evaluate_print(clf_name, train_y, predict)
        model_roc.append(roc)
        model_prc.append(prc)
    model_roc_avg = np.mean(model_roc)
    model_prc_avg = np.mean(model_prc)
    print("模型" + clf_name + "在数据集" + dataset_name + "的平均roc_auc为" +
          str(round(model_roc_avg, 4)) + ",平均prc为" +
          str(round(model_prc_avg, 4)) + "。")

    return model_roc_avg, model_prc_avg
Example #12
0
class TestMCD(unittest.TestCase):
    """Unit tests for the MCD detector on generated data."""

    def setUp(self):
        """Create a synthetic train/test split and fit the MCD detector."""
        self.n_train, self.n_test = 200, 100
        self.contamination = 0.1
        self.roc_floor = 0.8

        splits = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42,
        )
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        detector = MCD(contamination=self.contamination, random_state=42)
        self.clf = detector
        detector.fit(self.X_train)

    def test_parameters(self):
        """Every fitted attribute must exist and be populated."""
        fitted_attributes = (
            'decision_scores_', 'labels_', 'threshold_', '_mu', '_sigma',
            'raw_location_', 'raw_covariance_', 'raw_support_',
            'location_', 'covariance_', 'precision_', 'support_',
        )
        for attribute in fitted_attributes:
            assert hasattr(self.clf, attribute)
            assert getattr(self.clf, attribute) is not None

    def test_train_scores(self):
        """One training score per training sample."""
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        """Test scores have the right length and reach the ROC floor."""
        scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(scores.shape[0], self.X_test.shape[0])

        # detection quality must not fall below the configured floor
        assert roc_auc_score(self.y_test, scores) >= self.roc_floor

    def test_prediction_labels(self):
        """Predicted labels have the same shape as the true labels."""
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        """Default probabilities lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test)
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_linear(self):
        """Probabilities from the 'linear' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='linear')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_unify(self):
        """Probabilities from the 'unify' method lie within [0, 1]."""
        proba = self.clf.predict_proba(self.X_test, method='unify')
        assert proba.min() >= 0
        assert proba.max() <= 1

    def test_prediction_proba_parameter(self):
        """An unknown probability method is rejected with ValueError."""
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        """fit_predict returns one label per training sample."""
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        """Supported scorers run; an unknown scorer is rejected."""
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for scorer in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=scorer)
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        """Ranks follow the score order and stay within the train size."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2.5)
        assert_array_less(ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, ranks)

    def test_predict_rank_normalized(self):
        """Normalized ranks follow the score order and stay below 1.01."""
        scores = self.clf.decision_function(self.X_test)
        ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # the ordering of ranks must match the ordering of scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2.5)
        assert_array_less(ranks, 1.01)
        assert_array_less(-0.1, ranks)

    def tearDown(self):
        """Nothing to clean up."""
        pass
# NOTE(review): `clf`, `clf_name`, `new_origin_all`, `pos`, `df` and
# `show_scatter` used in this first cell are defined outside this excerpt.
# Rows before `pos` are the training part, rows from `pos` on the test part.

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores
# get the prediction on the test data
y_test_pred = clf.predict(new_origin_all[pos:])  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores

# plot the training predictions of the detector named by clf_name
show_scatter(clf_name, df, y_train_pred, pos)


# In[173]:


# train MCD detector (default parameters) on the rows before `pos`
clf_name = 'MCD'
clf = MCD()
clf.fit(new_origin_all[:pos])

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(new_origin_all[pos:])  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(new_origin_all[pos:])  # outlier scores

# plot the MCD training predictions
show_scatter(clf_name, df, y_train_pred, pos)


# In[174]:
# Rescale X with StandardScaler (X itself is defined outside this excerpt)
X = StandardScaler().fit_transform(X)

# load the pre-trained model cost predictor
clf = load('rf_predictor.joblib')

# Pool of base detectors, keyed by a 1-based integer id
classifiers = {
    1: ABOD(n_neighbors=10),
    2: CBLOF(check_estimator=False),
    3: FeatureBagging(LOF()),
    4: HBOS(),
    5: IForest(),
    6: KNN(),
    7: KNN(method='mean'),
    8: LOF(),
    9: MCD(),
    10: OCSVM(),
    11: PCA(),
}

# Randomly draw n_estimators_total detector ids from 1..11
# (n_estimators_total is defined outside this excerpt)
clfs = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                        size=n_estimators_total)
clfs_real = []

# Look up the detector object for every drawn id.
# NOTE(review): an id drawn more than once yields the SAME detector instance
# several times in clfs_real -- confirm this sharing is intended.
for estimator in clfs:
    clfs_real.append(classifiers[estimator])
# Shift the ids to be zero-based before encoding them over 11 classes
# (presumably a one-hot matrix; indices_to_one_hot is defined elsewhere)
X_w = indices_to_one_hot(clfs - 1, 11)
# Single 1x2 row holding X.shape[0] and X.shape[1]
X_d1 = np.array([X.shape[0], X.shape[1]]).reshape(1, 2)
# Repeat that row once per drawn detector
X_d = np.repeat(X_d1, len(clfs), axis=0)

# Join the shape columns and the encoded detector ids column-wise
X_c = np.concatenate((X_d, X_w), axis=1)
Example #15
0
# Example script: fit MCD on generated 2-feature data and print the
# evaluation on the training and test splits.
if __name__ == "__main__":
    contamination = 0.1  # percentage of outliers
    n_train = 200  # number of training points
    n_test = 100  # number of testing points

    # Generate sample data
    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      n_features=2,
                      contamination=contamination,
                      random_state=42)

    # train MCD detector (default parameters; the `contamination` value
    # above is only passed to generate_data, not to the detector)
    clf_name = 'MCD'
    clf = MCD()
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
# NOTE(review): this title is set before plt.figure() opens a new figure, so
# it applies to whichever figure was current before this excerpt -- confirm.
plt.title('Test Set Ground Truth')
plt.figure()

# Scatter the first two columns of the two prediction groups
# (presumably predicted inliers in green, predicted outliers in red, going
# by the legend order; both arrays are defined outside this excerpt)
plt.scatter(X_test_pred_d2_0[:, 0], X_test_pred_d2_0[:, 1], c='g', marker='o')
plt.scatter(X_test_pred_d2_1[:, 0], X_test_pred_d2_1[:, 1], c='r', marker='d')
plt.legend((u'inliers', u'outliers'), loc=2)
plt.title('Test Set Prediction')
plt.show()

# # MCD

# In[8]:

# MCD detector with default parameters
clf_name = 'MCD'
clf = MCD()

# In[9]:

# Train on the training set
clf.fit(X_train)
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_
y_test_pred = clf.predict(X_test)
y_test_scores = clf.decision_function(X_test)
# Evaluate performance: ROC AUC and precision @ n on both splits,
# rounded to four decimals
roc_train = round(roc_auc_score(y_train, y_train_scores), 4)
prn_train = round(precision_n_scores(y_train, y_train_scores), ndigits=4)
roc_test = round(roc_auc_score(y_test, y_test_scores), 4)
prn_test = round(precision_n_scores(y_test, y_test_scores), ndigits=4)
Example #17
0
def main():
    """Rank the anomaly detectors: predict, score, then plot.

    The three parts below run in sequence: ``ADRanker`` produces the
    predictions of every model for the ``"datasets"`` folder, the
    predictions are scored with the 'auc' and 'ave' metrics, and ``Plots``
    draws the basic plots and the CD diagrams from the resulting CSV files.
    """

    # PART 1:
    # Getting the predictions for each classifier
    # SK means: The classifier is from sklearn or works like sklearn
    # PY means: The classifier is from pyod or works like pyod

    models = {
        'SK_EE': EllipticEnvelope(),
        'SK_GM': GaussianMixture(),
        'SK_IF': IsolationForest(),
        'SK_OCSVM': OneClassSVM(),
        'SK_FA': FactorAnalysis(),
        'SK_KD': KernelDensity(),
        'PY_PCA': PCA(),
        'PY_COF': COF(),
        'PY_LODA': LODA(),
        'PY_LOF': LOF(),
        'PY_HBOS': HBOS(),
        'PY_MCD': MCD(),
        'PY_AvgKNN': KNN(method='mean'),
        'PY_LargestKNN': KNN(method='largest'),
        'PY_MedKNN': KNN(method='median'),
        'PY_AvgBagging': FeatureBagging(combination='average'),
        'PY_MaxBagging': FeatureBagging(combination='max'),
        'PY_CBLOF': CBLOF(n_clusters=10, n_jobs=4),
        'PY_COPOD': COPOD(),
        'PY_SOD': SOD(),
        'PY_LSCPwithLODA': LSCP([LODA(), LODA()]),
        'PY_AveLMDD': LMDD(dis_measure='aad'),
        'PY_VarLMDD': LMDD(dis_measure='var'),
        'PY_IqrLMDD': LMDD(dis_measure='iqr'),
        'PY_VAE': VAE(encoder_neurons=[8, 4, 2]),
        'PY_AutoEncoder': AutoEncoder(hidden_neurons=[6, 3, 3, 6]),
        'SK_BRM': BRM(bootstrap_sample_percent=70),
        'SK_OCKRA': m_OCKRA(),
        'PY_SoGaal': SO_GAAL(),
        'PY_MoGaal': MO_GAAL()
    }
    ranker = ADRanker(data="datasets", models=models)
    ranker.get_predictions()

    # PART 2:
    # After predictions, we can evaluate our classifiers using different scores
    # You can add manually a new metric by modifying 'metrics.py'

    ranker.get_scores(scores={'auc': Metrics.get_roc, 'ave': Metrics.get_ave})

    # PART 3:
    # Finally, it is time to summarize the results by plotting different graphs
    # You can add your own graphs by modifying ' plots.py'
    plot = Plots()
    plot.make_plot_basic(paths=[
        'results/scores/auc/no/results.csv',
        'results/scores/auc/minmax/results.csv',
        'results/scores/auc/std/results.csv',
        'results/scores/ave/no/results.csv',
        'results/scores/ave/minmax/results.csv',
        'results/scores/ave/std/results.csv'
    ],
                         scalers=[
                             'Without scaler', 'Min max scaler',
                             'Standard scaler', 'Without scaler',
                             'Min max scaler', 'Standard scaler'
                         ])

    # The i-th path, name and title belong together. The second path used to
    # point at the unscaled 'ave/no' results although its name and title
    # describe the min-max scaled average precision; it now reads the
    # matching 'ave/minmax' file.
    plot.make_cd_plot(
        paths=[
            'results/scores/auc/minmax/results.csv',
            'results/scores/ave/minmax/results.csv',
            'results/scores/auc/no/results.csv',
            'results/scores/ave/no/results.csv',
            'results/scores/auc/std/results.csv',
            'results/scores/ave/std/results.csv'
        ],
        names=[
            'CD auc minmax scale', 'CD ave minmax scale', 'CD auc no scale',
            'CD ave no scale', 'CD auc std scale', 'CD ave std scale'
        ],
        titles=[
            'CD diagram - AUC with min max scaling',
            'CD diagram - Average precision with min max scaling',
            'CD diagram - AUC without scaling',
            'CD diagram - Average precision without scaling',
            'CD diagram - AUC with standard scaling',
            'CD diagram - Average precision with  standard scaling'
        ])
Example #18
0
def get_estimators(contamination):
    """Build the fixed pool of 600 base outlier detectors.

    The pool is deterministic: every call produces the same detector
    types with the same hyperparameters in the same order. Each entry is
    a newly constructed instance; nothing is fitted here.

    Parameters
    ----------
    contamination : float
        Forwarded unchanged as the ``contamination`` keyword argument of
        every detector constructor.

    Returns
    -------
    base_detectors : list
        A list of 600 initialized base outlier detectors.

    """
    # Hyperparameter grids that recur throughout the pool.
    lof_ks = (5, 10, 15, 25, 35, 45, 50, 55, 60,
              65, 70, 75, 80, 85, 90, 95, 100)
    abod_ks = (5, 10, 15, 20, 25, 30, 35, 40)
    # 85 appears three times on purpose: the pool contains those duplicates.
    knn_ks = (5, 15, 25, 35, 45, 50, 55, 65, 75, 85, 85, 85, 95, 100)
    forest_sizes = (50, 100, 150, 200, 50, 100, 150, 200)

    def lof(ks=lof_ks):
        # One LOF per neighbourhood size.
        return [LOF(n_neighbors=k, contamination=contamination) for k in ks]

    def abod(ks=abod_ks):
        # One ABOD per neighbourhood size.
        return [ABOD(n_neighbors=k, contamination=contamination) for k in ks]

    def knn():
        # One KNN per neighbourhood size.
        return [KNN(n_neighbors=k, contamination=contamination)
                for k in knn_ks]

    def iforest(sizes=forest_sizes):
        # One IForest per ensemble size.
        return [IForest(n_estimators=n, contamination=contamination)
                for n in sizes]

    def copies(detector_cls, count):
        # `count` separate instances (not one shared object) of a detector
        # that is configured with `contamination` only.
        return [detector_cls(contamination=contamination)
                for _ in range(count)]

    # Groups are listed in the exact order the detectors must appear in the
    # returned list; the tuple is evaluated left to right.
    groups = (
        lof(), abod(), lof(),
        copies(HBOS, 20), copies(PCA, 8), knn(), iforest(),
        lof(), lof(), lof(),
        copies(HBOS, 20), copies(PCA, 10), knn(), iforest(),
        lof(), lof(), lof(),
        copies(HBOS, 20), copies(PCA, 10), knn(), iforest(),
        lof(), lof(), lof(),
        copies(HBOS, 20), abod(), iforest(forest_sizes + (150, 200)),
        copies(PCA, 10), knn(), iforest(),
        lof(), abod(), lof(), lof(),
        copies(HBOS, 20), copies(PCA, 10), knn(), iforest(),
        lof(), abod(abod_ks + (45,)),
        copies(OCSVM, 10), copies(MCD, 20),
        lof(lof_ks[-6:]), abod(),
    )

    return [detector for group in groups for detector in group]
Example #19
0
                res_df[k].append(res[k])
        res_df = pd.DataFrame(data=res_df)
        res_df.to_csv(os.path.join(res_dir, 'result.csv'), index=False)


if __name__ == "__main__":
    # Experiment settings: input data directory, output directory, and the
    # feature columns to use for each dataset.
    # NOTE(review): main() takes no arguments, so it presumably reads `opt`
    # and `models` as module-level names - confirm against main().
    opt = dict(
        data_dir='../data/anomaly_detection',
        result_dir='./result',
        dataset=dict(
            abalone=['V' + str(i) for i in range(1, 8)],
            skin=list('RGB'),
        ),
    )

    # Detectors keyed by display name; every detector other than the KNN
    # variants is created with its default arguments.
    models = dict([
        ('KNN largest', KNN(method='largest')),
        ('KNN mean', KNN(method='mean')),
        ('KNN median', KNN(method='median')),
        ('CBLOF', CBLOF()),
        ('LOF', LOF()),
        ('FeatureBagging', FeatureBagging()),
        ('HBOS', HBOS()),
        ('IForest', IForest()),
        ('MCD', MCD()),
        ('OCSVM', OCSVM()),
        ('PCA', PCA()),
    ])

    main()
    f.write("Model: " + modelname + "\n")
    f.write("Dataset " + str(datasetnumber) + ": " + datasetname + "\n")
    f.write("Time taken: " + str(time) + " seg.\n")
    f.write("Accuracy: " + str(accuracy) + "\n")
    if accuracy!=None:
        f.write("@scores\n")
        for score in model.decision_scores_:
            f.write(str(score) + "\n")
    f.close()

# Directory containing the .mat datasets. The relative path assumes the
# script is executed from the "experiments" folder.
ROUTE = "../datasets/outlier_ground_truth/"
# File names of the datasets to process, resolved relative to ROUTE.
datasets = ["annthyroid.mat", "arrhythmia.mat", "breastw.mat", "cardio.mat", "glass.mat", "ionosphere.mat", "letter.mat", "lympho.mat", "mammography.mat", "mnist.mat", "musk.mat", "optdigits.mat", "pendigits.mat", "pima.mat", "satellite.mat", "satimage-2.mat", "speech.mat", "thyroid.mat", "vertebral.mat", "vowels.mat", "wbc.mat", "wine.mat"]
# Detectors and their display names. These are parallel lists that are
# zipped together below, so names[i] must label models[i].
models = [ABOD(), COF(), HBOS(), KNN(), LOF(), MCD(), OCSVM(), PCA(), SOD(), SOS()]
names = ["ABOD", "COF", "HBOS", "KNN", "LOF", "MCD", "OCSVM", "PCA", "SOD", "SOS"]
# NOTE(review): presumably collects per-model results; it is not filled in
# the portion of the script visible here - confirm.
accuracies = []

# Outer loop: one pass per detector, with a progress banner "MODEL <name> i/N".
for name, model in zip(names, models):
    print("\n\n#################################################################")
    print("MODEL " + name + " " + str(names.index(name)+1) + "/" + str(len(names)))
    print("#################################################################")
    # NOTE(review): presumably the per-dataset results for this model - confirm.
    acc = []
    # Inner loop: one pass per dataset.
    for dat in datasets:
        # ABOD is not run on these three datasets; the result is left as None.
        # NOTE(review): the reason for the exclusion is not stated here.
        if name=="ABOD" and dat in ["breastw.mat", "letter.mat", "satellite.mat"]:
            result = None
        else:
            print("Computing dataset " + dat + " " + str(datasets.index(dat)+1) + "/" + str(len(datasets)))
            # Load the data and its ground-truth labels from ROUTE + file name.
            dataset, labels = readDataset(ROUTE + dat)
Example #21
0
    def execute(self):
        """Train the autoencoder detector and benchmark it against baselines.

        The input CSV (``self.input_file``, no header row) is read in chunks
        of ``self.chunk_size`` rows. Its last column is the label: ``1`` for
        positive rows and ``-1`` for negative rows. A random 70% of the
        positive rows is used for training; the remaining positives together
        with the negatives form the validation set.

        When ``self.neg_cont`` is set and positive, the negatives are limited
        to ``floor(neg_cont * (n_negative + n_positive_validation))`` rows
        taken from a fixed-seed shuffle of the data.

        After the autoencoder has been trained and scored, the detectors
        selected by ``self.eval_cat`` ("linear", "prob", "ensemble",
        "proximity" or "nn"; any other value runs none) are evaluated on the
        same validation set. Results are printed as tables and, when
        ``self.printout`` is set, written to that path as a CSV without
        header or index.

        Returns:
            None.
        """
        result_headers = [
            "ALGO", "TP", "TN", "FP", "FN", "TPR", "TNR", "PPV", "NPV",
            "TS", "PT", "ACC", "F1", "MCC"
        ]
        evaluation_results = []

        print("Loading training data...")
        chunks = []
        for i, chunk in enumerate(
                pd.read_csv(self.input_file,
                            header=None,
                            chunksize=self.chunk_size)):
            print("Reading chunk: %d" % (i + 1))
            chunks.append(chunk)

        # Concatenate once: DataFrame.append was removed in pandas 2.0 and
        # re-copied the accumulated frame on every iteration.
        data = pd.concat(chunks) if chunks else pd.DataFrame()

        # The label lives in the last column; everything before it is input.
        label_col = len(data.columns) - 1
        input_dimensionality = label_col
        print("Input Dimensionality: %d" % (input_dimensionality))

        positive_data = data[data[label_col] == 1].iloc[:, :label_col]
        negative_data = data[data[label_col] == -1].iloc[:, :label_col]

        # 70% of the positives train the autoencoder, the rest validate it.
        training_data = positive_data.sample(frac=0.70)
        positive_validation_data = positive_data.drop(training_data.index)

        if self.neg_cont and self.neg_cont > 0:
            print("Negative Contamination: %0.4f" % (self.neg_cont))
            num_negative = math.floor(
                self.neg_cont *
                (len(negative_data) + len(positive_validation_data)))
            # Build the mask from the shuffled frame itself so pandas does
            # not have to re-align a mask taken from the unshuffled frame.
            shuffled = data.sample(frac=1, random_state=200)
            negative_data = shuffled[shuffled[label_col] == -1].iloc[
                :num_negative, :label_col]

        negative_validation_data = negative_data.copy()

        # Re-attach labels so positives and negatives can be stacked into a
        # single validation frame.
        temp_positive = positive_validation_data.copy()
        temp_positive[input_dimensionality] = 1

        temp_negative = negative_data.copy()
        temp_negative[input_dimensionality] = -1

        validation_data_with_labels = pd.concat([temp_positive, temp_negative],
                                                ignore_index=True)
        validation_data = validation_data_with_labels.iloc[:, :label_col]
        validation_labels = validation_data_with_labels.iloc[:, -1:].values

        # Convert to tensor
        positive_data = torch.tensor(positive_data.values).float().to(
            self.device)
        negative_data = torch.tensor(negative_data.values).float().to(
            self.device)
        training_data = torch.tensor(training_data.values).float()
        validation_data = torch.tensor(validation_data.values).float()

        print("Validation Data:")
        print(validation_data)

        ## AE-D TRAINING ##
        print("Initializing autoencoder...")
        net = Autoencoder(layers=self.layers,
                          device=self.device,
                          add_syn=self.add_syn)
        net.to(self.device)

        print(net)

        print("Training Stochastic Autoencoder...")
        net.fit(training_data,
                epochs=self.epochs,
                lr=self.lr,
                batch_size=self.batch_size)

        predictions = net.predict(validation_data)

        tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc = performance_metrics(
            validation_labels, predictions)

        r = ["AE-D", tp, tn, fp, fn, tpr, tnr, ppv, npv, ts, pt, acc, f1, mcc]

        evaluation_results.append(r)

        print("AE-D Results:")
        print(tabulate([r], result_headers, tablefmt="grid"))

        # Convert back to CPU before other methods
        validation_data = validation_data.cpu()

        # Train only linear classifiers
        if self.eval_cat == "linear":
            print("Initiating training for linear detectors...")

            ## MCD ##
            print("Training MCD...")
            result = train_and_evaluate_classifier("MCD", MCD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ROBUST COVARIANCE ##
            print("Training Robust Covariance...")
            result = train_and_evaluate_classifier("ROB-COV",
                                                   EllipticEnvelope(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## ONE CLASS SVM TRAINING ##
            print("Training OneClassSVM...")
            result = train_and_evaluate_classifier(
                "OC-SVM", svm.OneClassSVM(gamma="auto"), validation_data,
                validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "prob":
            # ABOD and SOS are currently disabled in this category.

            ## COPOD ##
            print("Training COPOD...")
            result = train_and_evaluate_classifier("COPOD", COPOD(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "ensemble":
            ## ISOLATION FOREST TRAINING ##
            print("Training Isolation Forest...")
            result = train_and_evaluate_classifier(
                "ISO-F", IsolationForest(random_state=0), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## LODA ##
            print("Training LODA...")
            result = train_and_evaluate_classifier("LODA", LODA(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            # LSCP is currently disabled in this category.

        elif self.eval_cat == "proximity":
            ## LOCAL OUTLIER FACTOR ##
            print("Training Local Outlier Factor...")
            result = train_and_evaluate_classifier(
                "LOC-OF", LocalOutlierFactor(novelty=True), validation_data,
                validation_labels)
            evaluation_results.append(result)

            ## CBLOF ##
            print("Training CBLOF...")
            result = train_and_evaluate_classifier("CBLOF", CBLOF(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

            ## HBOS ##
            print("Training HBOS...")
            result = train_and_evaluate_classifier("HBOS", HBOS(),
                                                   validation_data,
                                                   validation_labels)
            evaluation_results.append(result)

        elif self.eval_cat == "nn":
            ## VAE ##
            print("Training VAE...")
            # list.reverse() returns None and reverses self.layers in place,
            # so pass a reversed copy to the decoder and leave self.layers
            # untouched for the encoder.
            result = train_and_evaluate_classifier(
                "VAE",
                VAE(encoder_neurons=self.layers,
                    decoder_neurons=list(reversed(self.layers))),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## SO_GAAL ##
            print("Training SO_GAAL...")
            result = train_and_evaluate_classifier(
                "SO_GAAL", SO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

            ## MO_GAAL ##
            print("Training MO_GAAL...")
            result = train_and_evaluate_classifier(
                "MO_GAAL", MO_GAAL(lr_d=self.lr, stop_epochs=self.epochs),
                validation_data, validation_labels)
            evaluation_results.append(result)

        ## EVALUATE RESULTS ##
        if self.eval_cat != "none":
            print("Aggregated Results:")
            print(tabulate(evaluation_results, result_headers,
                           tablefmt="grid"))

        ## DATASET METRICS ##
        len_training_data_points = len(training_data)
        len_positive_validations = len(positive_validation_data)
        len_negative_validations = len(negative_validation_data)
        len_validations = len_positive_validations + len_negative_validations

        # Guard against an empty validation set instead of dividing by zero.
        if len_validations:
            contamination_percentage = math.floor(
                (len_negative_validations / len_validations) * 100)
        else:
            contamination_percentage = 0

        metrics_results = [
            ["Training Data Points", len_training_data_points],
            ["# Normal Points", len_positive_validations],
            ["# Anomalies", len_negative_validations],
            ["Contamination Percentage", contamination_percentage],
        ]

        ## EVALUATE RESULTS ##
        print(tabulate(metrics_results, ["Metric", "Value"], tablefmt="grid"))

        if self.printout:
            print("Saving results to %s" % (self.printout))
            df = pd.DataFrame(evaluation_results)
            df.to_csv(self.printout, header=None, index=False)
Example #22
0
File: utility.py Project: yyht/SUOD
def get_estimators(contamination):
    BASE_ESTIMATORS = [
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        HBOS(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        PCA(contamination=contamination),
        KNN(n_neighbors=5, contamination=contamination),
        KNN(n_neighbors=15, contamination=contamination),
        KNN(n_neighbors=25, contamination=contamination),
        KNN(n_neighbors=35, contamination=contamination),
        KNN(n_neighbors=45, contamination=contamination),
        KNN(n_neighbors=50, contamination=contamination),
        KNN(n_neighbors=55, contamination=contamination),
        KNN(n_neighbors=65, contamination=contamination),
        KNN(n_neighbors=75, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=85, contamination=contamination),
        KNN(n_neighbors=95, contamination=contamination),
        KNN(n_neighbors=100, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        IForest(n_estimators=50, contamination=contamination),
        IForest(n_estimators=100, contamination=contamination),
        IForest(n_estimators=150, contamination=contamination),
        IForest(n_estimators=200, contamination=contamination),
        LOF(n_neighbors=5, contamination=contamination),
        LOF(n_neighbors=10, contamination=contamination),
        LOF(n_neighbors=15, contamination=contamination),
        LOF(n_neighbors=25, contamination=contamination),
        LOF(n_neighbors=35, contamination=contamination),
        LOF(n_neighbors=45, contamination=contamination),
        LOF(n_neighbors=50, contamination=contamination),
        LOF(n_neighbors=55, contamination=contamination),
        LOF(n_neighbors=60, contamination=contamination),
        LOF(n_neighbors=65, contamination=contamination),
        LOF(n_neighbors=70, contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
        ABOD(n_neighbors=45, contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        OCSVM(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        MCD(contamination=contamination),
        LOF(n_neighbors=75, contamination=contamination),
        LOF(n_neighbors=80, contamination=contamination),
        LOF(n_neighbors=85, contamination=contamination),
        LOF(n_neighbors=90, contamination=contamination),
        LOF(n_neighbors=95, contamination=contamination),
        LOF(n_neighbors=100, contamination=contamination),
        ABOD(n_neighbors=5, contamination=contamination),
        ABOD(n_neighbors=10, contamination=contamination),
        ABOD(n_neighbors=15, contamination=contamination),
        ABOD(n_neighbors=20, contamination=contamination),
        ABOD(n_neighbors=25, contamination=contamination),
        ABOD(n_neighbors=30, contamination=contamination),
        ABOD(n_neighbors=35, contamination=contamination),
        ABOD(n_neighbors=40, contamination=contamination),
    ]

    return BASE_ESTIMATORS
Example #23
0
def outlier_ensemble(df):
    """Score and flag outliers with a weighted ensemble of detectors.

    Ensemble method based on the paper "An unsupervised approach for
    combining scores of outlier detection techniques, based on similarity
    measures".

    Every detector is fitted on a random subset of the numeric columns and
    its scores are standardized. The per-detector scores are then combined
    using a vote count and a correlation-based weight per detector, and the
    combined score is thresholded with a two-stage IQR rule.

    :param df: pandas DataFrame; only its numeric columns are used for
        detection, the other columns are carried through to the result.
    :return: a copy of ``df`` with two extra columns, ``anomaly_score``
        (combined score) and ``prediction`` (1 when the score exceeds the
        threshold, else 0), sorted by ``anomaly_score`` in descending order.
    :raises ValueError: if ``df`` has no numeric columns.
    """
    # keep only numeric type features
    df_numeric = df.select_dtypes(include=[np.number])
    d = len(df_numeric.columns)
    if d == 0:
        raise ValueError(
            "outlier_ensemble requires at least one numeric column")

    # NOTE(review): `knn` is lower-case here while the other detectors use
    # upper-case class names -- confirm it matches the name imported by
    # this module.
    algorithms = [
        MCD(), PCA(),
        knn(), ABOD(),
        HBOS(), LOF(),
        OCSVM(),
        IForest()
    ]
    n_algorithms = len(algorithms)

    anomaly_scores_matr = []
    for clf in algorithms:
        # sample features; `high` is exclusive, so at most d - 1 features
        # are drawn when d > 1
        if d > 1:
            nr_of_features = np.random.randint(low=int(d / 2), high=d)
        else:
            # randint(0, 1) can only return 0, which would leave the
            # detector with no feature to fit on
            nr_of_features = 1
        sampled_features = list(
            np.random.choice(d, nr_of_features, replace=False))
        df_numeric_sample = df_numeric.iloc[:, sampled_features]

        # run classifier
        clf.fit(X=df_numeric_sample)
        anomaly_score = clf.decision_function(df_numeric_sample).reshape(-1, 1)
        # standardize so that scores of different detectors are comparable
        anomaly_score = preprocessing.StandardScaler().fit_transform(
            anomaly_score).flatten()
        anomaly_scores_matr.append(anomaly_score)

    # rows = samples, columns = detectors
    anomaly_scores_matr = pd.DataFrame(np.array(anomaly_scores_matr).T)

    # Create votes matrix (multiple votes): the IQR of every detector's
    # scores defines one threshold, and every score in the matrix collects
    # one vote for each threshold it exceeds.
    score_values = anomaly_scores_matr.values
    votes_matr = np.zeros(anomaly_scores_matr.shape, dtype='int')
    for threshold_col in anomaly_scores_matr:
        IQR_col = anomaly_scores_matr[threshold_col].quantile(
            0.75) - anomaly_scores_matr[threshold_col].quantile(0.25)
        votes_matr += (score_values > 1.5 * IQR_col).astype('int')

    votes_matr = pd.DataFrame(votes_matr)

    # determine weights
    # EDCV: mean correlation of a detector's scores with the other
    # detectors (the "- 1" removes the self-correlation on the diagonal)
    C = anomaly_scores_matr.corr()
    weights = ((C.sum(axis=0) - 1) / (n_algorithms - 1)).values

    # combine scores to get final score (row-wise weighted, voted sum)
    final_score = ((anomaly_scores_matr * votes_matr * weights).sum(axis=1)
                   / n_algorithms).values

    # Two stage thresholding: first drop the scores above the upper IQR
    # fence, then recompute the fence on the remaining scores only.
    q25, q75 = np.percentile(final_score, [25, 75])
    mask_stage1 = final_score <= (q75 + 1.5 * (q75 - q25))
    stage1_scores = final_score[mask_stage1]
    q25_stage1, q75_stage1 = np.percentile(stage1_scores, [25, 75])
    threshold = q75_stage1 + 1.5 * (q75_stage1 - q25_stage1)
    predictions = (final_score > threshold).astype(int)

    df_sorted = df.copy()
    df_sorted['anomaly_score'] = final_score
    df_sorted['prediction'] = predictions
    # number of samples flagged as outliers
    print(predictions.sum())
    df_sorted = df_sorted.sort_values(by='anomaly_score', ascending=False)

    return df_sorted
Example #24
0
if __name__ == "__main__":
    # Experiment settings
    contamination = 0.1  # fraction of outliers in the generated data
    n_train = 200  # size of the training split
    n_test = 100  # size of the test split

    # Build a synthetic two-feature dataset with a fixed seed
    X_train, y_train, X_test, y_test = generate_data(
        n_train=n_train,
        n_test=n_test,
        n_features=2,
        contamination=contamination,
        random_state=42,
    )

    # Fit the MCD detector on the training split
    clf_name = 'MCD'
    clf = MCD()
    clf.fit(X_train)

    # Results for the training data are stored on the fitted detector
    y_train_scores = clf.decision_scores_  # raw outlier scores
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)

    # Results for the unseen test data
    y_test_scores = clf.decision_function(X_test)  # outlier scores
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)

    # Report the evaluation for both splits
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)

    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)
                       random_state=random_state),
    '(HBOS) Histogram-base Outlier Detection': HBOS(
        contamination=outliers_fraction),
    'Isolation Forest': IForest(contamination=outliers_fraction,
                                random_state=random_state),
    '(KNN) K Nearest Neighbors ': KNN(
        contamination=outliers_fraction),
    'Average KNN': KNN(method='mean',
                       contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    '(LOF) Local Outlier Factor ':
        LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    '(MCD) Minimum Covariance Determinant ': MCD(
        contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
    '(PCA) Principal Component Analysis ': PCA(
        contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    '(LSCP) Locally Selective Combination ': LSCP(
        detector_list, contamination=outliers_fraction,
        random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}
st.subheader('SELECT AN ALGORITHM:')
Example #26
0
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(),
    'Isolation Forest':
    IForest(random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(),
    'Average KNN':
    KNN(method='mean'),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(),
    'Principal Component Analysis (PCA)':
    PCA(random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list, random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}

# Show all detectors
Example #27
0
# Last day of the training date range; get_train_data passes it to
# pd.date_range together with `begin`.
end = "2020-02-15"

# NOTE(review): presumably the day that is scored after training -- confirm
# against the code that reads it.
test_date = "2020-02-16"

# One pre-configured detector instance per algorithm. Every detector uses
# contamination=0.05; the neural models additionally fix their epochs and
# layer sizes.
KNN_clf = KNN(contamination=0.05)
PCA_clf = PCA(contamination=0.05)
VAE_clf = VAE(contamination=0.05, epochs=30, encoder_neurons=[9, 4], decoder_neurons=[4, 9])
LOF_clf = LOF(contamination=0.05)
IForest_clf = IForest(contamination=0.05)
AutoEncoder_clf = AutoEncoder(contamination=0.05, epochs=30, hidden_neurons=[9, 4, 4, 9])
FeatureBagging_clf = FeatureBagging(contamination=0.05, check_estimator=False)
ABOD_clf = ABOD(contamination=0.05)
HBOS_clf = HBOS(contamination=0.05)
CBLOF_clf = CBLOF(contamination=0.05)
LODA_clf = LODA(contamination=0.05)
MCD_clf = MCD(contamination=0.05)
MO_GAAL_clf = MO_GAAL(k=3, stop_epochs=2, contamination=0.05)
SO_GAAL_clf = SO_GAAL(contamination=0.05)
# Placeholder: no KNN_MAH detector is constructed at this point.
# NOTE(review): presumably assigned later, once training data is available
# -- confirm.
KNN_MAH_clf = None

# Detector names split into two groups; each name matches the prefix of a
# `<name>_clf` variable above.
# NOTE(review): the S_/K_ prefixes presumably separate the classical models
# from the neural-network ones -- confirm against the code that reads them.
S_models = ["KNN", "LOF", "PCA", "IForest", "HBOS", "LODA", "MCD", "CBLOF", "FeatureBagging", "ABOD", "KNN_MAH"]
K_models = ["AutoEncoder", "SO_GAAL", "VAE"]

def get_train_data():
    """
    获取训练样本
    :return:    x_train 9特征训练样本
                df 原训练数据
    """
    acc_date = pd.date_range(begin, end, freq='1D')
    for day in acc_date:
Example #28
0
        classifiers = {'Feature Bagging': FeatureBagging(contamination=outliers_fraction, check_estimator=False, random_state=random_state)}
        classifiers_indices = {'Feature Bagging': 0}
    elif sys.argv[1] == 'hbos':
        classifiers = {'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=outliers_fraction)}
        classifiers_indices = {'Histogram-base Outlier Detection (HBOS)': 0}
    elif sys.argv[1] == 'iforest':
        classifiers = {'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state)}
        classifiers_indices = {'Isolation Forest': 0}
    elif sys.argv[1] == 'knn':
        classifiers = {'K Nearest Neighbors (KNN)': KNN(contamination=outliers_fraction)}
        classifiers_indices = {'K Nearest Neighbors (KNN)': 0}
    elif sys.argv[1] == 'lof':
        classifiers = {'Local Outlier Factor (LOF)': LOF(contamination=outliers_fraction)}
        classifiers_indices = {'Local Outlier Factor (LOF)': 0}
    elif sys.argv[1] == 'mcd':
        classifiers = {'Minimum Covariance Determinant (MCD)': MCD(contamination=outliers_fraction, random_state=random_state)}
        classifiers_indices = {'Minimum Covariance Determinant (MCD)': 0}
    for clf_name, clf in classifiers.items():
        print("\n\nAlgorithm: ", clf_name)
        t0 = time()
        clf.fit(X_train_norm)
        test_scores = clf.decision_function(X_test_norm)
        t1 = time()
        duration = round(t1 - t0, ndigits=4)

        roc = round(roc_auc_score(y_test, test_scores), ndigits=4)
        prn = round(precision_n_scores(y_test, test_scores), ndigits=4)

        print('ROC:{roc}, precision @ rank n:{prn}, '
              'execution time: {duration}s'.format(roc=roc, prn=prn, duration=duration))
# In[6]:

# Select the rows of car '195061' and plot its 'Speed diff' against 'Time'.
data195061 = df[(df['CarId'] == '195061')]
x = data195061['Time']
y = data195061['Speed diff']

plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 195061')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[7]:

# Fit an LSCP ensemble of two MCD detectors on the 'Speed diff' column of
# the whole frame (reshaped to a single-feature 2-D array).
lscp = LSCP(detector_list=[MCD(), MCD()])
lscp.fit(df['Speed diff'].values.reshape(-1, 1))
# Evenly spaced grid from the smallest to the largest observed value, with
# as many points as there are rows in df.
xx = np.linspace(df['Speed diff'].min(), df['Speed diff'].max(),
                 len(df)).reshape(-1, 1)
# Score the grid to show how the anomaly score varies with 'Speed diff'.
anomaly_score = lscp.decision_function(xx)
# Predicted labels for the grid; not used by the plot below.
outlier = lscp.predict(xx)
plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()

# In[8]:

# Rows with 'Speed diff' above 10. The result is not assigned, so it is
# only visible when this runs as a notebook cell.
df.loc[df['Speed diff'] > 10]
Example #30
0
        'V.17', 'V.18', 'V.19', 'V.20'
    ]
    x = data[cols].values

    #把label标签加入,把该问题当成有监督问题来处理
    #y=data['original.label']
    data['s'] = data['original.label']
    data.loc[data['original.label'] != 1, 's'] = 0
    y = data['s']

    #划分测试集和训练集
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

    #使用pyod中的MCD算法拟合数据
    clf_name = 'MCD'
    clf = MCD()
    clf.fit(X_train)

    #预测得到由0和1组成的数组,1表示离群点,0表示飞离群点
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores,The outlier scores of the training data.

    #预测样本是不是离群点,返回0和1 的数组
    y_test_pred = clf.predict(X_test)

    y_test_scores = clf.decision_function(
        X_test)  # outlier scores,The anomaly score of the input samples.
    #使用sklearn中的roc_auc_score方法得到auc值,即roc曲线下面的面积
    try:
        sumAuc_train += sklearn.metrics.roc_auc_score(y_train,
                                                      y_train_scores,
Example #31
0
class TestMCD(unittest.TestCase):
    """Unit tests for the MCD outlier detector on generated data."""

    def setUp(self):
        """Generate a seeded dataset and fit the detector on its train split."""
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6

        splits = generate_data(n_train=self.n_train,
                               n_test=self.n_test,
                               contamination=self.contamination,
                               random_state=42)
        self.X_train, self.y_train, self.X_test, self.y_test = splits

        self.clf = MCD(contamination=self.contamination, random_state=42)
        self.clf.fit(self.X_train)

    def _assert_unit_interval(self, values):
        """Check that every entry of ``values`` lies within [0, 1]."""
        assert_greater_equal(values.min(), 0)
        assert_less_equal(values.max(), 1)

    def _assert_rank_order(self, ranks, upper_bound):
        """Check ranks follow the score order and stay within the bounds."""
        scores = self.clf.decision_function(self.X_test)

        # the ordering of the ranks must agree with that of the raw scores
        assert_allclose(rankdata(ranks), rankdata(scores), atol=2)
        assert_array_less(ranks, upper_bound)
        assert_array_less(-0.1, ranks)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # every attribute must exist on the fitted detector and be set
        fitted_attributes = (
            'decision_scores_',
            'labels_',
            'threshold_',
            '_mu',
            '_sigma',
            'raw_location_',
            'raw_covariance_',
            'raw_support_',
            'location_',
            'covariance_',
            'precision_',
            'support_',
        )
        for attribute in fitted_attributes:
            assert_true(hasattr(self.clf, attribute) and
                        getattr(self.clf, attribute) is not None)

    def test_train_scores(self):
        n_scores = len(self.clf.decision_scores_)
        assert_equal(n_scores, self.X_train.shape[0])

    def test_prediction_scores(self):
        test_scores = self.clf.decision_function(self.X_test)

        # one score per test sample
        assert_equal(test_scores.shape[0], self.X_test.shape[0])

        # detection quality must exceed the configured floor
        roc = roc_auc_score(self.y_test, test_scores)
        assert_greater(roc, self.roc_floor)

    def test_prediction_labels(self):
        labels = self.clf.predict(self.X_test)
        assert_equal(labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        self._assert_unit_interval(self.clf.predict_proba(self.X_test))

    def test_prediction_proba_linear(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='linear'))

    def test_prediction_proba_unify(self):
        self._assert_unit_interval(
            self.clf.predict_proba(self.X_test, method='unify'))

    def test_prediction_proba_parameter(self):
        # an unknown conversion method must be rejected
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        labels = self.clf.fit_predict(self.X_train)
        assert_equal(labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        # default scoring first, then each supported metric by name
        self.clf.fit_predict_score(self.X_test, self.y_test)
        for metric in ('roc_auc_score', 'prc_n_score'):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring=metric)

        # an unknown metric name must be rejected
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        ranks = self.clf._predict_rank(self.X_test)
        self._assert_rank_order(ranks, self.X_train.shape[0] + 1)

    def test_predict_rank_normalized(self):
        ranks = self.clf._predict_rank(self.X_test, normalized=True)
        self._assert_rank_order(ranks, 1.01)

    def tearDown(self):
        pass
    'Histogram-base Outlier Detection (HBOS)':
    HBOS(contamination=outliers_fraction),
    'Isolation Forest':
    IForest(contamination=outliers_fraction, random_state=random_state),
    'K Nearest Neighbors (KNN)':
    KNN(contamination=outliers_fraction),
    'Average KNN':
    KNN(method='mean', contamination=outliers_fraction),
    # 'Median KNN': KNN(method='median',
    #                   contamination=outliers_fraction),
    'Local Outlier Factor (LOF)':
    LOF(n_neighbors=35, contamination=outliers_fraction),
    # 'Local Correlation Integral (LOCI)':
    #     LOCI(contamination=outliers_fraction),
    'Minimum Covariance Determinant (MCD)':
    MCD(contamination=outliers_fraction, random_state=random_state),
    'One-class SVM (OCSVM)':
    OCSVM(contamination=outliers_fraction),
    'Principal Component Analysis (PCA)':
    PCA(contamination=outliers_fraction, random_state=random_state),
    # 'Stochastic Outlier Selection (SOS)': SOS(
    #     contamination=outliers_fraction),
    'Locally Selective Combination (LSCP)':
    LSCP(detector_list,
         contamination=outliers_fraction,
         random_state=random_state),
    # 'Connectivity-Based Outlier Factor (COF)':
    #     COF(n_neighbors=35, contamination=outliers_fraction),
    # 'Subspace Outlier Detection (SOD)':
    #     SOD(contamination=outliers_fraction),
}
Example #33
0
    mat = sp.io.loadmat(os.path.join('../datasets', mat_file))

    X = mat['X']
    y = mat['y']

    X = StandardScaler().fit_transform(X)

    classifiers = {
        1: ABOD(n_neighbors=10),
        2: CBLOF(check_estimator=False),
        3: FeatureBagging(LOF()),
        4: HBOS(),
        5: IForest(),
        6: KNN(),
        7: LOF(),
        8: MCD(),
        9: OCSVM(),
        10: PCA(),
    }

    idx_clf_mapping = {
        1: 'ABOD',
        2: 'CBLOF',
        3: 'FeatureBagging',
        4: 'HBOS',
        5: 'IForest',
        6: 'KNN',
        7: 'LOF',
        8: 'MCD',
        9: 'OCSVM',
        10: 'PCA',