Example #1
    clf_name = 'LSCP'
    detector_list = [
        LOF(n_neighbors=15),
        LOF(n_neighbors=20),
        LOF(n_neighbors=25),
        LOF(n_neighbors=35)
    ]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False)
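
The snippet above omits its setup. A minimal sketch of the missing imports and
synthetic data, assuming pyod's bundled utilities (generate_data,
evaluate_print and visualize) and sizes chosen purely for illustration:

    from pyod.models.lof import LOF
    from pyod.models.lscp import LSCP
    from pyod.utils.data import generate_data, evaluate_print
    from pyod.utils.example import visualize

    contamination = 0.1  # assumed fraction of outliers
    X_train, y_train, X_test, y_test = generate_data(
        n_train=200, n_test=100, contamination=contamination,
        random_state=42)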
Example #2
import numpy as np
import pandas as pd
import seaborn as sns

from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA


def identify_outliers(df,
                      features,
                      contamination=0.1,
                      algorithms=['Isolation Forest']):
    """Cleans the outliers.

    Outlier detection using LSCP: Locally selective combination in parallel outlier ensembles.
    https://arxiv.org/abs/1812.01528


    Parameters
    ----------
    df : DataFrame
        The data to be examined.

    features : list
        List of feature names.

    contamination : float in (0., 0.5)
        The proportion of outliers in the data set.

    algorithms : list
        List with the names of the algorithms to use; when two or more are
        given they are combined with LSCP, and a single name runs that
        detector on its own. Supported algorithms:

        ['Isolation Forest', 'Cluster-based Local Outlier Factor',
         'Minimum Covariance Determinant (MCD)',
         'Principal Component Analysis (PCA)',
         'Angle-based Outlier Detector (ABOD)',
         'Histogram-based Outlier Detection (HBOS)',
         'K Nearest Neighbors (KNN)', 'Local Outlier Factor (LOF)',
         'Feature Bagging', 'One-class SVM (OCSVM)']



    Returns
    -------

    df_sorted : DataFrame
        Original data with three new columns (anomaly_score, probability and
        prediction), sorted by descending anomaly score.

    df_styled : DataFrame
        Styled version of df_sorted for use in a Jupyter notebook
        (i.e. display(df_styled)).
    """

    df_numeric = df.select_dtypes(
        include=[np.number])  # keep only numeric type features
    X = np.asarray(df_numeric)

    classifiers = {
        'Isolation Forest': IForest,
        'Cluster-based Local Outlier Factor': CBLOF,
        'Minimum Covariance Determinant (MCD)': MCD,
        'Principal Component Analysis (PCA)': PCA,
        'Angle-based Outlier Detector (ABOD)': ABOD,
        'Histogram-based Outlier Detection (HBOS)': HBOS,
        'K Nearest Neighbors (KNN)': KNN,
        'Local Outlier Factor (LOF)': LOF,
        'Feature Bagging': FeatureBagging,
        'One-class SVM (OCSVM)': OCSVM,
    }

    if len(algorithms) > 1:
        selected_classifiers = [classifiers[x]() for x in algorithms]
        clf = LSCP(selected_classifiers, contamination=contamination)
    else:
        clf = classifiers[algorithms[0]](contamination=contamination)

    clf.fit(X)
    y_pred = clf.predict(X)

    # probability of being an outlier: second column of predict_proba's output
    y_predict_proba = clf.predict_proba(X, method='unify')[:, 1]

    outlier_index, = np.where(y_pred == 1)

    anomaly_score = clf.decision_function(X)
    anomaly_score = pd.DataFrame(anomaly_score, columns=['anomaly_score'])

    y_predict_proba = pd.DataFrame(y_predict_proba, columns=['probability'])
    prediction = pd.DataFrame(y_pred, columns=['prediction'])

    df.columns = features
    # reset the index so the concat below aligns row-by-row with the newly
    # built, default-indexed score/probability/prediction frames
    df = df.reset_index(drop=True)
    df_with_anomaly_score = pd.concat(
        [df, anomaly_score, y_predict_proba, prediction], axis=1)

    df_sorted = df_with_anomaly_score.sort_values(by='anomaly_score',
                                                  ascending=False)
    cm = sns.diverging_palette(220, 10, sep=80, n=7, as_cmap=True)
    df_styled = df_sorted.style.background_gradient(cmap=cm, subset=['anomaly_score']) \
        .apply(lambda x: ['background: MistyRose' if x.name in outlier_index.tolist() else '' for i in x], axis=1,
               subset=df_sorted.columns[:-3])

    return df_sorted, df_styled
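
A hypothetical usage sketch for identify_outliers (the toy DataFrame, column
names and contamination value are made up for illustration; numpy and pandas
are imported above):

    rng = np.random.RandomState(42)
    data = pd.DataFrame({'height': rng.normal(170, 10, 100),
                         'weight': rng.normal(70, 8, 100)})
    data.loc[:2, 'weight'] = 200  # inject a few obvious outliers

    df_sorted, df_styled = identify_outliers(
        data,
        features=['height', 'weight'],
        contamination=0.05,
        algorithms=['Isolation Forest', 'Local Outlier Factor (LOF)'])
    print(df_sorted.head())  # rows with the highest anomaly scores first
    # display(df_styled)     # styled view inside a Jupyter notebook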
Example #3
import unittest
from os import path

from numpy.testing import (assert_allclose, assert_array_less, assert_equal,
                           assert_raises)
from scipy.io import loadmat
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y

from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.utils.data import generate_data


class TestLSCP(unittest.TestCase):
    def setUp(self):
        # Define data file and read X and y
        # Generate some data if the source data is missing
        this_directory = path.abspath(path.dirname(__file__))
        mat_file = 'cardio.mat'
        try:
            mat = loadmat(path.join(*[this_directory, 'data', mat_file]))
        except (TypeError, IOError):
            print('{data_file} does not exist. Use generated data'.format(
                data_file=mat_file))
            X, y = generate_data(train_only=True)  # generate synthetic data
        else:
            X = mat['X']
            y = mat['y'].ravel()
            X, y = check_X_y(X, y)

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(X, y, test_size=0.4, random_state=42)

        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list)
        self.clf.fit(self.X_train)
        self.roc_floor = 0.6

    def test_parameters(self):
        assert (hasattr(self.clf, 'decision_scores_')
                and self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_')
                and self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert (hasattr(self.clf, 'detector_list')
                and self.clf.detector_list is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert (roc_auc_score(self.y_test, pred_scores) >= self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert (pred_proba.min() >= 0)
        assert (pred_proba.max() <= 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
Example #4
import numpy as np
import matplotlib.pyplot as plt

from pyod.models.lscp import LSCP
from pyod.models.mcd import MCD

# df, data195061 and the time axis x come from earlier cells (not shown);
# both DataFrames are assumed to have a 'Speed diff' column
y = data195061['Speed diff']

plt.figure(figsize=(10, 4))
plt.plot(x, y, label='Car 195061')
plt.xlabel('Time')
plt.ylabel('Speed diff')
plt.show()

# In[7]:

lscp = LSCP(detector_list=[MCD(), MCD()])
lscp.fit(df['Speed diff'].values.reshape(-1, 1))
xx = np.linspace(df['Speed diff'].min(), df['Speed diff'].max(),
                 len(df)).reshape(-1, 1)
anomaly_score = lscp.decision_function(xx)
outlier = lscp.predict(xx)
plt.figure(figsize=(10, 4))
plt.plot(xx, anomaly_score, label='anomaly score')
plt.ylabel('anomaly score')
plt.xlabel('Speed diff')
plt.show()
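
The outlier labels computed above (outlier = lscp.predict(xx)) are never used
in the visible cells; a hypothetical follow-up cell could mark the grid points
LSCP flags as outliers on the score curve:

xs = xx.ravel()
plt.figure(figsize=(10, 4))
plt.plot(xs, anomaly_score, label='anomaly score')
plt.scatter(xs[outlier == 1], anomaly_score[outlier == 1],
            color='red', s=10, label='predicted outlier')
plt.xlabel('Speed diff')
plt.ylabel('anomaly score')
plt.legend()
plt.show()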

# In[8]:

df.loc[df['Speed diff'] > 10]

# ### Graphical analysis

# In[9]:

plt.figure(figsize=(10, 10))
Example #5
import unittest

from numpy.testing import assert_allclose, assert_array_less
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
# note: sklearn.utils.testing is deprecated (removed in scikit-learn 0.24);
# on modern versions replace these helpers with plain asserts as in Example #3
from sklearn.utils.testing import (assert_equal, assert_greater,
                                   assert_greater_equal, assert_less_equal,
                                   assert_raises, assert_true)

from pyod.models.lof import LOF
from pyod.models.lscp import LSCP
from pyod.utils.data import generate_data
from pyod.utils.utility import standardizer


class TestLSCP(unittest.TestCase):
    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train,
            n_test=self.n_test,
            contamination=self.contamination,
            random_state=42)
        self.X_train, self.X_test = standardizer(self.X_train, self.X_test)
        self.detector_list = [LOF(), LOF()]
        self.clf = LSCP(self.detector_list, contamination=self.contamination)
        self.clf.fit(self.X_train)

    def test_parameters(self):
        assert_true(
            hasattr(self.clf, 'decision_scores_')
            and self.clf.decision_scores_ is not None)
        assert_true(
            hasattr(self.clf, 'labels_') and self.clf.labels_ is not None)
        assert_true(
            hasattr(self.clf, 'threshold_')
            and self.clf.threshold_ is not None)
        assert_true(hasattr(self.clf, '_mu') and self.clf._mu is not None)
        assert_true(
            hasattr(self.clf, '_sigma') and self.clf._sigma is not None)
        assert_true(
            hasattr(self.clf, 'detector_list')
            and self.clf.detector_list is not None)

    def test_train_scores(self):
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)

        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])

        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba(self):
        pred_proba = self.clf.predict_proba(self.X_test)
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test,
                                   self.y_test,
                                   scoring='prc_n_score')
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test,
                                       self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

        # assert the order is preserved
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=3)
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
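
To run either test case directly rather than through a test runner, the usual
unittest entry point can be appended at module level:

if __name__ == '__main__':
    unittest.main()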
Example #6
    # example sizes; these values are assumed for illustration (the snippet
    # is excerpted from a larger script that defines them earlier)
    n_train = 200  # number of training points
    n_test = 100  # number of testing points
    contamination = 0.1  # proportion of outliers

    X_train, y_train, X_test, y_test = \
        generate_data(n_train=n_train,
                      n_test=n_test,
                      contamination=contamination,
                      random_state=42)
    X_train, X_test = standardizer(X_train, X_test)

    # train lscp
    clf_name = 'LSCP'
    detector_list = [LOF(), LOF()]
    clf = LSCP(detector_list, random_state=42)
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # evaluate and print the results
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    print("\nOn Test Data:")
    evaluate_print(clf_name, y_test, y_test_scores)

    # visualize the results
    visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False)
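
Every listing above builds LSCP from a homogeneous list of LOF detectors,
while the LSCP paper also covers heterogeneous pools. A minimal sketch that
mixes detector types, reusing X_train/X_test from Example #6 and the imports
sketched under Example #1 (parameter values are illustrative, not tuned):

    from pyod.models.iforest import IForest
    from pyod.models.knn import KNN

    detector_list = [LOF(n_neighbors=20), LOF(n_neighbors=35),
                     KNN(n_neighbors=20), IForest(n_estimators=100)]
    clf = LSCP(detector_list, local_region_size=30, random_state=42)
    clf.fit(X_train)
    y_test_scores = clf.decision_function(X_test)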