Ejemplo n.º 1
0
def data_overview(file_id, dataset_title):
    """Render a 2-D t-SNE overview of a dataset and save it as a PNG.

    The data frame for ``file_id`` is loaded through ``db_queries``,
    index-like columns are dropped, the remaining feature columns are
    embedded into two dimensions with t-SNE, and the points are split into
    inliers and outliers according to the ``outlier`` column before being
    drawn as two scatter series.

    Parameters
    ----------
    file_id
        Identifier passed to ``db_queries.get_dataframe``; it is also
        embedded in the name of the output file.

    dataset_title : str
        Source of the plot title. The last four characters are cut off
        before use (presumably a file extension such as ``.csv`` -- confirm
        against callers).

    Returns
    -------
    filename : str
        Name of the PNG written into ``./images/`` (file name only, without
        the directory part).

    """
    mat = db_queries.get_dataframe(file_id)

    # Make sure the output directory exists: os.listdir() below would
    # otherwise raise FileNotFoundError on the very first call.
    images_dir = "./images/"
    os.makedirs(images_dir, exist_ok=True)

    # The number of files already present acts as a running suffix.
    filename = "overview_{}_{}.png".format(file_id, len(os.listdir(images_dir)))
    path = "./images/{}".format(filename)

    # Drop index-like columns when present; missing ones are ignored.
    mat = mat.drop(["Unnamed: 0", "Index", "id", "Id"], axis=1, errors="ignore")

    y = mat["outlier"].values
    X = mat.drop("outlier", axis=1).values

    X_embedded = TSNE(n_components=2).fit_transform(X)
    # get_outliers_inliers returns (outliers, inliers), in that order.
    X_out, X_in = get_outliers_inliers(X_embedded, y)

    fig = plt.figure(figsize=(6, 6))
    try:
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        ttl = plt.title(dataset_title[:-4])
        lgd = plt.legend(
            labels=["Нормальные данные", "Аномальные данные"],
            title="Обозначения",
            shadow=True,
            ncol=1,
            fontsize=12,
            loc="center left",
            bbox_to_anchor=(1, 0.5),
        )
        plt.subplots_adjust(hspace=0.3)
        # The legend sits outside the axes, so it has to be listed as an
        # extra artist for the tight bounding box to include it.
        plt.savefig(path, dpi=100, bbox_extra_artists=(lgd, ttl), bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    return filename
Ejemplo n.º 2
0
    def test_get_outliers_inliers(self):
        """Check that ``get_outliers_inliers`` splits generated data in two.

        Training data is generated from ``self.n_train`` and
        ``self.contamination``. The test expects the leading rows of the
        generated matrix to equal the returned inliers and the remaining
        rows to equal the returned outliers.
        """
        features, labels = generate_data(n_train=self.n_train,
                                         train_only=True,
                                         contamination=self.contamination)

        # The helper returns the outliers first and the inliers second.
        outlier_rows, inlier_rows = get_outliers_inliers(features, labels)

        # Row index at which the outliers are expected to start.
        split_at = int(self.n_train * (1 - self.contamination))

        assert_allclose(features[:split_at, :], inlier_rows)
        assert_allclose(features[split_at:, :], outlier_rows)
Ejemplo n.º 3
0
def visualize(clf_name,
              X_train,
              y_train,
              X_test,
              y_test,
              y_train_pred,
              y_test_pred,
              show_figure=True,
              save_figure=False):
    """Draw a 2x2 comparison of ground-truth and predicted outliers.

    Four subplots are created in this order: training ground truth,
    training prediction, test ground truth, test prediction. Intended for
    use in the examples only.

    Parameters
    ----------
    clf_name : str
        Detector name; used in the figure title and in the name of the
        saved image.

    X_train : numpy array of shape (n_samples, 2)
        Training samples.

    y_train : list or array of shape (n_samples,)
        Ground-truth labels of the training samples.

    X_test : numpy array of shape (n_samples, 2)
        Test samples.

    y_test : list or array of shape (n_samples,)
        Ground-truth labels of the test samples.

    y_train_pred : list or array of shape (n_samples,)
        Predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        Predicted binary labels of the test samples.

    show_figure : bool, optional (default=True)
        Whether to display the figure with ``plt.show()``.

    save_figure : bool, optional (default=False)
        Whether to write the figure to ``<clf_name>.png`` at 300 dpi.

    Raises
    ------
    ValueError
        If ``X_train`` does not have exactly two columns.

    """
    def _draw_panel(inlier_points, outlier_points, panel_title,
                    inlier_color, outlier_color):
        """Scatter inliers and outliers onto the current subplot.

        ``inlier_points`` and ``outlier_points`` are 2-column arrays;
        the colors apply to the respective point sets.
        """
        plt.axis("equal")
        plt.scatter(inlier_points[:, 0], inlier_points[:, 1],
                    label='inliers', color=inlier_color, s=40)
        plt.scatter(outlier_points[:, 0], outlier_points[:, 1],
                    label='outliers', color=outlier_color, s=50, marker='^')
        plt.title(panel_title, fontsize=15)
        # Tick marks are removed from both axes.
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc=3, prop={'size': 10})

    # Pass all inputs through the shared shape check before using them.
    (X_train, y_train, X_test, y_test,
     y_train_pred, y_test_pred) = check_consistent_shape(
        X_train, y_train, X_test, y_test, y_train_pred, y_test_pred)

    if X_train.shape[1] != 2:
        raise ValueError("Input data has to be 2-d for visualization. The "
                         "input data has {shape}.".format(shape=X_train.shape))

    # get_outliers_inliers returns (outliers, inliers), in that order.
    train_out, train_in = get_outliers_inliers(X_train, y_train)
    train_out_pred, train_in_pred = get_outliers_inliers(X_train,
                                                         y_train_pred)
    test_out, test_in = get_outliers_inliers(X_test, y_test)
    test_out_pred, test_in_pred = get_outliers_inliers(X_test, y_test_pred)

    # One entry per subplot: (inliers, outliers, title, colors...).
    panels = (
        (train_in, train_out, 'Train Set Ground Truth', 'blue', 'orange'),
        (train_in_pred, train_out_pred, 'Train Set Prediction',
         'blue', 'orange'),
        (test_in, test_out, 'Test Set Ground Truth', 'green', 'red'),
        (test_in_pred, test_out_pred, 'Test Set Prediction', 'green', 'red'),
    )

    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name} Detector".format(clf_name=clf_name),
                 fontsize=15)

    # Subplot codes 221..224 address the four cells of the 2x2 grid.
    for position, panel in enumerate(panels, start=221):
        fig.add_subplot(position)
        _draw_panel(*panel)

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)

    if show_figure:
        plt.show()
Ejemplo n.º 4
0
def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False):
    """Plot ground truth next to detector predictions for train and test data.

    Produces one figure with four scatter subplots (train truth, train
    prediction, test truth, test prediction). Meant for internal use by the
    examples.

    Parameters
    ----------
    clf_name : str
        The name of the detector; appears in the figure title and in the
        file name of the saved image.

    X_train : numpy array of shape (n_samples, 2)
        The training samples.

    y_train : list or array of shape (n_samples,)
        The ground truth of the training samples.

    X_test : numpy array of shape (n_samples, 2)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of the test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    show_figure : bool, optional (default=True)
        If True, display the figure.

    save_figure : bool, optional (default=False)
        If True, save the figure as ``<clf_name>.png`` (300 dpi).

    Raises
    ------
    ValueError
        If ``X_train`` does not have exactly two columns.

    """

    def _plot_split(figure, position, inliers, outliers, title,
                    inlier_color, outlier_color):
        """Add one subplot to ``figure`` showing inliers and outliers.

        Parameters
        ----------
        figure : matplotlib figure
            Figure that receives the new subplot.

        position : int
            Three-digit subplot code handed to ``figure.add_subplot``.

        inliers : numpy array of shape (n_samples, 2)
            Inlier points.

        outliers : numpy array of shape (n_samples, 2)
            Outlier points.

        title : str
            Subplot title.

        inlier_color : str
            The color of inliers.

        outlier_color : str
            The color of outliers.

        """
        figure.add_subplot(position)
        plt.axis("equal")

        inlier_x, inlier_y = inliers[:, 0], inliers[:, 1]
        plt.scatter(inlier_x, inlier_y, label='inliers',
                    color=inlier_color, s=40)

        outlier_x, outlier_y = outliers[:, 0], outliers[:, 1]
        plt.scatter(outlier_x, outlier_y, label='outliers',
                    color=outlier_color, s=50, marker='^')

        plt.title(title, fontsize=15)
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc=3, prop={'size': 10})

    # Run every input through the shared shape check first.
    checked = check_consistent_shape(X_train, y_train, X_test, y_test,
                                     y_train_pred, y_test_pred)
    X_train, y_train, X_test, y_test, y_train_pred, y_test_pred = checked

    n_columns = X_train.shape[1]
    if n_columns != 2:
        raise ValueError("Input data has to be 2-d for visualization. The "
                         "input data has {shape}.".format(shape=X_train.shape))

    # Each call yields the pair (outliers, inliers).
    truth_train = get_outliers_inliers(X_train, y_train)
    pred_train = get_outliers_inliers(X_train, y_train_pred)
    truth_test = get_outliers_inliers(X_test, y_test)
    pred_test = get_outliers_inliers(X_test, y_test_pred)

    canvas = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name} Detector".format(clf_name=clf_name),
                 fontsize=15)

    # Training data: subplot codes 221 and 222.
    _plot_split(canvas, 221, truth_train[1], truth_train[0],
                'Train Set Ground Truth', 'blue', 'orange')
    _plot_split(canvas, 222, pred_train[1], pred_train[0],
                'Train Set Prediction', 'blue', 'orange')

    # Test data: subplot codes 223 and 224.
    _plot_split(canvas, 223, truth_test[1], truth_test[0],
                'Test Set Ground Truth', 'green', 'red')
    _plot_split(canvas, 224, pred_test[1], pred_test[0],
                'Test Set Prediction', 'green', 'red')

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)

    if show_figure:
        plt.show()
Ejemplo n.º 5
0
import matplotlib.pyplot as plt
import matplotlib.font_manager
# numpy is required by the meshgrid call below; without this import the
# script stops with a NameError.
import numpy as np

from pyod.models.abod import ABOD
from pyod.models.knn import KNN

from pyod.utils.data import generate_data, get_outliers_inliers

# Generate random training data with two features.
X_train, Y_train = generate_data(n_train=200, train_only=True, n_features=2)

# Outlier fraction; the original author notes that generate_data uses 0.1
# by default.
outlier_fraction = 0.1

# Store outliers and inliers in separate numpy arrays
# (the helper returns the outliers first).
x_outliers, x_inliers = get_outliers_inliers(X_train, Y_train)

n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

# Separate the two features as column vectors and use them to plot the data.
F1 = X_train[:, [0]].reshape(-1, 1)
F2 = X_train[:, [1]].reshape(-1, 1)

# Create a 200 x 200 meshgrid covering [-10, 10] on both axes.
xx, yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))

# Scatter plot of the two features.
plt.scatter(F1, F2)
plt.xlabel('F1')
Ejemplo n.º 6
0
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, get_outliers_inliers
from scipy import stats

if __name__ == '__main__':

    # generate estimated training data
    # X_train -> training data
    # y_train -> training ground truth
    X_train, y_train = generate_data(n_train=300, n_features=2, contamination=0.2, train_only=True, random_state=20)
    outlier_fraction = 0.2

    X_outliers, X_inliers = get_outliers_inliers(X_train, y_train)
    n_outliers = len(X_outliers)
    n_inliers = len(X_inliers)


    F1 = X_train[:,0].reshape(-1,1)
    F2 = X_train[:,1].reshape(-1,1)
    plt.scatter(F1, F2)
    plt.xlabel('F1')
    plt.ylabel('F2')
    plt.show()

    '''
        KNN -> K-Nearest Neighbors Detector
        For an observation, its distance to its kth nearest neighbors could be viewed as the outlying scores
        Method: -Largest -Average -Median