def data_overview(file_id, dataset_title):
    """Render a 2-D overview scatter plot of a labelled dataset as a PNG.

    The dataframe for ``file_id`` is loaded through ``db_queries``, a fixed
    set of index-like columns is dropped, the remaining feature columns are
    embedded into two dimensions with ``TSNE`` and the embedded points are
    split into outliers and inliers according to the ``outlier`` column.
    Both groups are drawn on one scatter plot which is saved under
    ``./images/``.

    Parameters
    ----------
    file_id
        Identifier handed to ``db_queries.get_dataframe``; it is also
        embedded in the name of the generated image.
    dataset_title : str
        Title of the dataset. Its last four characters are cut off before
        it is used as the plot title (presumably a file extension such as
        ".csv" -- TODO confirm against callers).

    Returns
    -------
    str
        File name (not the full path) of the image written to ``./images/``.
    """
    mat = db_queries.get_dataframe(file_id)

    # The current number of entries in ./images/ is used as a suffix of the
    # output file name.
    filename = "overview_{}_{}.png".format(file_id, len(os.listdir("./images/")))
    path = "./images/{}".format(filename)

    # Drop index-like columns if present; errors="ignore" tolerates datasets
    # that lack some (or all) of them.
    mat = mat.drop(["Unnamed: 0", "Index", "id", "Id"], axis=1, errors="ignore")
    y = mat["outlier"].values
    X = mat.drop("outlier", axis=1).values

    X_embedded = TSNE(n_components=2).fit_transform(X)
    X_out, X_in = get_outliers_inliers(X_embedded, y)

    fig = plt.figure(figsize=(6, 6))
    try:
        # Inliers are drawn first so that the legend labels below line up
        # with the scatter series in the same order.
        plt.scatter(X_in[:, 0], X_in[:, 1], color="blue", marker="^", alpha=0.4)
        plt.scatter(X_out[:, 0], X_out[:, 1], color="orange", marker="h", alpha=0.5)
        ttl = plt.title(dataset_title[:-4])
        lgd = plt.legend(
            labels=["Нормальные данные", "Аномальные данные"],
            title="Обозначения",
            shadow=True,
            ncol=1,
            fontsize=12,
            loc="center left",
            bbox_to_anchor=(1, 0.5),
        )
        plt.subplots_adjust(hspace=0.3)
        # The legend sits outside the axes, so it is passed as an extra
        # artist to keep it inside the tight bounding box of the saved image.
        plt.savefig(path, dpi=100, bbox_extra_artists=(lgd, ttl), bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise every failed call would leave a figure open in pyplot.
        plt.close(fig)
    return filename
def test_get_outliers_inliers(self):
    """Verify that ``get_outliers_inliers`` separates the generated samples.

    Data is generated with ``self.n_train`` samples and a
    ``self.contamination`` outlier ratio. The test expects the leading
    ``int(n_train * (1 - contamination))`` rows of the generated matrix to
    come back as inliers and all remaining rows as outliers.
    """
    n_samples = self.n_train
    outlier_ratio = self.contamination

    features, labels = generate_data(
        n_train=n_samples,
        train_only=True,
        contamination=outlier_ratio,
    )
    outlier_rows, inlier_rows = get_outliers_inliers(features, labels)

    # Row index at which the expected inlier part ends and the outlier part
    # begins.
    split_at = int(n_samples * (1 - outlier_ratio))

    assert_allclose(features[:split_at, :], inlier_rows)
    assert_allclose(features[split_at:, :], outlier_rows)
def visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
              y_test_pred, show_figure=True, save_figure=False):
    """Plot ground truth next to predictions for a 2-d detection example.

    A 2x2 figure is produced: the top row shows the training set (ground
    truth, then prediction) and the bottom row shows the test set (ground
    truth, then prediction). Internal use only.

    Parameters
    ----------
    clf_name : str
        The name of the detector; used in the figure title and, when
        saving, as the file name stem.

    X_train : numpy array of shape (n_samples, n_features)
        The training samples. Must have exactly two feature columns.

    y_train : list or array of shape (n_samples,)
        The ground truth of training samples.

    X_test : numpy array of shape (n_samples, n_features)
        The test samples.

    y_test : list or array of shape (n_samples,)
        The ground truth of test samples.

    y_train_pred : list or array of shape (n_samples,)
        The predicted binary labels of the training samples.

    y_test_pred : list or array of shape (n_samples,)
        The predicted binary labels of the test samples.

    show_figure : bool, optional (default=True)
        If set to True, show the figure.

    save_figure : bool, optional (default=False)
        If set to True, save the figure as ``<clf_name>.png``.

    Raises
    ------
    ValueError
        If ``X_train`` does not have exactly two columns.
    """

    def _draw_panel(inlier_pts, outlier_pts, panel_title, inlier_color,
                    outlier_color):
        """Draw inliers and outliers into the currently active subplot.

        Parameters
        ----------
        inlier_pts : numpy array of shape (n_samples, n_features)
            Inliers.

        outlier_pts : numpy array of shape (n_samples, n_features)
            Outliers.

        panel_title : str
            Subplot title.

        inlier_color : str
            The color of inliers.

        outlier_color : str
            The color of outliers.
        """
        plt.axis("equal")
        plt.scatter(inlier_pts[:, 0], inlier_pts[:, 1], label='inliers',
                    color=inlier_color, s=40)
        plt.scatter(outlier_pts[:, 0], outlier_pts[:, 1], label='outliers',
                    color=outlier_color, s=50, marker='^')
        plt.title(panel_title, fontsize=15)
        # Tick marks are hidden on both axes.
        plt.xticks([])
        plt.yticks([])
        plt.legend(loc=3, prop={'size': 10})

    # Validate that all inputs agree in shape before anything is drawn.
    (X_train, y_train, X_test, y_test,
     y_train_pred, y_test_pred) = check_consistent_shape(
        X_train, y_train, X_test, y_test, y_train_pred, y_test_pred)

    if X_train.shape[1] != 2:
        raise ValueError("Input data has to be 2-d for visualization. The "
                         "input data has {shape}.".format(shape=X_train.shape))

    # Split every data set twice: once by true labels, once by predictions.
    train_true_out, train_true_in = get_outliers_inliers(X_train, y_train)
    train_pred_out, train_pred_in = get_outliers_inliers(X_train,
                                                         y_train_pred)
    test_true_out, test_true_in = get_outliers_inliers(X_test, y_test)
    test_pred_out, test_pred_in = get_outliers_inliers(X_test, y_test_pred)

    # Ground truth vs. predicted results.
    fig = plt.figure(figsize=(12, 10))
    plt.suptitle("Demo of {clf_name} Detector".format(clf_name=clf_name),
                 fontsize=15)

    # (subplot position, inliers, outliers, title, inlier color,
    #  outlier color) for each of the four panels, in drawing order.
    panels = (
        (221, train_true_in, train_true_out,
         'Train Set Ground Truth', 'blue', 'orange'),
        (222, train_pred_in, train_pred_out,
         'Train Set Prediction', 'blue', 'orange'),
        (223, test_true_in, test_true_out,
         'Test Set Ground Truth', 'green', 'red'),
        (224, test_pred_in, test_pred_out,
         'Test Set Prediction', 'green', 'red'),
    )
    for position, inliers, outliers, title, in_color, out_color in panels:
        fig.add_subplot(position)
        _draw_panel(inliers, outliers, title, in_color, out_color)

    if save_figure:
        plt.savefig('{clf_name}.png'.format(clf_name=clf_name), dpi=300)

    if show_figure:
        plt.show()
import matplotlib.pyplot as plt
import matplotlib.font_manager
import numpy as np  # needed for np.meshgrid / np.linspace below
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, get_outliers_inliers

# Generate random training data with two features.
X_train, Y_train = generate_data(n_train=200, train_only=True, n_features=2)

# By default the outlier fraction is 0.1 in the generate_data function.
outlier_fraction = 0.1

# Store outliers and inliers in different numpy arrays.
x_outliers, x_inliers = get_outliers_inliers(X_train, Y_train)
n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

# Separate the two features into column vectors and use them to plot the data.
F1 = X_train[:, [0]].reshape(-1, 1)
F2 = X_train[:, [1]].reshape(-1, 1)

# Create a 200 x 200 meshgrid covering [-10, 10] on both axes.
xx, yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))

# Scatter plot of the raw training data.
plt.scatter(F1, F2)
plt.xlabel('F1')
import matplotlib.font_manager import matplotlib.pyplot as plt import numpy as np from pyod.models.knn import KNN from pyod.utils.data import generate_data, get_outliers_inliers from scipy import stats if __name__ == '__main__': # generate estimated training data # X_train -> training data # y_train -> training ground truth X_train, y_train = generate_data(n_train=300, n_features=2, contamination=0.2, train_only=True, random_state=20) outlier_fraction = 0.2 X_outliers, X_inliers = get_outliers_inliers(X_train, y_train) n_outliers = len(X_outliers) n_inliers = len(X_inliers) F1 = X_train[:,0].reshape(-1,1) F2 = X_train[:,1].reshape(-1,1) plt.scatter(F1, F2) plt.xlabel('F1') plt.ylabel('F2') plt.show() ''' KNN -> K-Nearest Neighbors Detector For an observation, its distance to its kth nearest neighbors could be viewed as the outlying scores Method: -Largest -Average -Median