# Record data-set dimensions, then restrict the data to images of "2".
N, M = np.shape(X)
X = X[y.A.ravel() == 2, :]
N, M = np.shape(X)

### Gausian Kernel density estimator
# Cross-validate the kernel width by leave-one-out cross-validation
# (efficient implementation in the gausKernelDensity function):
# score each candidate width by the total leave-one-out log-density.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[i] = log_density.sum()
val = logP.max()
ind = logP.argmax()
width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))

# Evaluate the density at the selected width and sort it ascending, so the
# lowest-density (most outlier-like) observations come first.
density, log_density = gausKernelDensity(X, width)
i = (density.argsort(axis=0)).ravel()
density = density[i]
# Plot density estimate of outlier score
# Pull the data matrix and labels out of the loaded .mat structure.
X = np.matrix(matdata['X'])
y = np.matrix(matdata['y'])
N, M = np.shape(X)

# Restrict the data to images of "2".
X = X[y.A.ravel() == 2, :]
N, M = np.shape(X)

### Gausian Kernel density estimator
# Cross-validate the kernel width by leave-one-out cross-validation
# (efficient implementation in the gausKernelDensity function):
# score each candidate width by the total leave-one-out log-density.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[i] = log_density.sum()
val = logP.max()
ind = logP.argmax()
width = widths[ind]
print('Optimal estimated width is: {0}'.format(width))

# Evaluate the density at the selected width and sort it ascending, so the
# lowest-density (most outlier-like) observations come first.
density, log_density = gausKernelDensity(X, width)
i = (density.argsort(axis=0)).ravel()
density = density[i]
# Plot density estimate of outlier score
def outlierDetection(X, objects=20):
    """Rank the observations of X by four outlier scores and report each.

    For each method the `objects` most outlier-like observation indices
    are printed and a bar chart of the corresponding scores is shown.

    Parameters
    ----------
    X : array-like, shape (N, M)
        Data matrix, one observation per row.
    objects : int, optional
        Number of candidate outliers to report per method (default 20).
    """
    ### Gausian Kernel density estimator
    # Cross-validate kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity function):
    # score each candidate width by the total leave-one-out log-density.
    widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()
    width = widths[logP.argmax()]
    print("Optimal estimated width is: {0}".format(width))

    # Evaluate density for the estimated width; sort ascending so the
    # lowest-density (most outlier-like) observations come first.
    density, log_density = gausKernelDensity(X, width)
    i = (density.argsort(axis=0)).ravel()
    density = density[i]

    # Plot density estimate of outlier score.
    figure()
    bar(range(objects), density[:objects])
    title("Density estimate")

    # Print possible outliers.
    # BUG FIX: the original used Python 2 print statements here while the
    # rest of the function used print() calls; unified on Python 3.
    print("For Gaussian Kernel Density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-neighbors density estimator
    # Neighbor count to use:
    K = 5
    # Find the K nearest neighbors; density is the inverse mean distance.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1.0 / (D.sum(axis=1) / K)

    # Sort the scores ascending (lowest density first).
    i = density.argsort()
    density = density[i]

    # Plot k-neighbor estimate of outlier score (distances).
    figure()
    bar(range(objects), density[:objects])
    title("KNN density: Outlier score")

    print("\n")
    print("For KNN density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-nearest neigbor average relative density
    # Density of each point relative to the mean density of its neighbors
    # (column 0 of the neighbor index is the point itself, hence i[:, 1:]).
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1.0 / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg.rel.densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]

    figure()
    bar(range(objects), avg_rel_density[:objects])
    title("KNN average relative density: Outlier score")

    print("\n")
    print("For KNN average relative density")
    for k in range(1, objects + 1):
        print(i_avg_rel[k])

    ### Distance to K'th nearest neighbor outlier score
    K = 25
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbor (large = outlier).
    score = D[:, K - 1]
    i = score.argsort()
    score = score[i[::-1]]

    figure()
    bar(range(objects), score[:objects])
    title("25th neighbor distance: Outlier score")

    print("\n")
    print("For 5'th neighbour distance")
    # BUG FIX: the original printed i[462 - k], hard-coding the data-set
    # size; index from the end so the highest scores are reported for any N.
    for k in range(1, objects + 1):
        print(i[-k])

    show()
import matplotlib.pyplot as plot
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Load the data, drop the target column and standardize the features.
data = pd.read_csv('data.csv')
X = data.drop("class", axis=1)
# BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
X = scaler.fit_transform(X.to_numpy())

# Select the kernel bandwidth that maximizes the total log-density.
# BUG FIX: the original grid started at 0, which is not a valid bandwidth
# (a zero-width Gaussian kernel degenerates); start just above zero with
# the same grid spacing instead.  `np.inf` replaces the bare `inf` name.
best_logP = -np.inf
best_bandwidth = None
best_density = None
for bandwidth in np.linspace(0.005, 5, 1000):
    density, log_density = tb.gausKernelDensity(X, bandwidth)
    logP = log_density.sum()
    if logP > best_logP:
        best_logP = logP
        best_bandwidth = bandwidth
        best_density = density

# Per-observation density under the best bandwidth (column vector -> 1-D).
kde_density = best_density[:, 0]

# Number of neighbors
K = 200

# Find the k nearest neighbors.
knn = NearestNeighbors(n_neighbors=K).fit(X)
D, i = knn.kneighbors(X)
# NOTE(review): these three plotting statements reference d['ard'] and
# n_outlier, which are defined in a preceding chunk of this file — they are
# the tail of the previous section's output, not part of the KDE step below.
plt.bar(range(n_outlier), d['ard'][:n_outlier])
plt.title('KNN average relative density: Outlier score')
plt.show()

# Fresh containers for the density scores and their sort orders.
d = {}
d_idx = {}

### Attribute normalization
X = X / np.max(X, axis=0)

print('Calculating Gaussian Kernel density...')

### Gausian Kernel
# Score each candidate width by the total leave-one-out log-density and
# keep the best one.
widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
logP = np.zeros(np.size(widths))
for i, w in enumerate(widths):
    density, log_density = gausKernelDensity(X, w)
    logP[i] = log_density.sum()
val = logP.max()
ind = logP.argmax()
width = widths[ind]
# width = 0.32228417991810204
print('\tOptimal estimated width is: {0}'.format(width))

# Evaluate density for the estimated width, then sort it ascending and
# flatten to a 1-D vector; the sort order is kept alongside the scores.
d['kde'], log_density = gausKernelDensity(X, width)
d_idx['kde'] = (d['kde'].argsort(axis=0)).ravel()
d['kde'] = d['kde'][d_idx['kde']].reshape(d['kde'].shape[0])
def outlierDetection(X, objects=20):
    """Rank the observations of X by four outlier scores and report each.

    For each method the `objects` most outlier-like observation indices
    are printed and a bar chart of the corresponding scores is shown.

    Parameters
    ----------
    X : array-like, shape (N, M)
        Data matrix, one observation per row.
    objects : int, optional
        Number of candidate outliers to report per method (default 20).
    """
    ### Gausian Kernel density estimator
    # Cross-validate kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity function):
    # score each candidate width by the total leave-one-out log-density.
    widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()
    width = widths[logP.argmax()]
    print('Optimal estimated width is: {0}'.format(width))

    # Evaluate density for the estimated width; sort ascending so the
    # lowest-density (most outlier-like) observations come first.
    density, log_density = gausKernelDensity(X, width)
    i = (density.argsort(axis=0)).ravel()
    density = density[i]

    # Plot density estimate of outlier score.
    figure()
    bar(range(objects), density[:objects])
    title('Density estimate')

    # Print possible outliers.
    # BUG FIX: the original used Python 2 print statements here while the
    # rest of the function used print() calls; unified on Python 3.
    print("For Gaussian Kernel Density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-neighbors density estimator
    # Neighbor count to use:
    K = 5
    # Find the K nearest neighbors; density is the inverse mean distance.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)

    # Sort the scores ascending (lowest density first).
    i = density.argsort()
    density = density[i]

    # Plot k-neighbor estimate of outlier score (distances).
    figure()
    bar(range(objects), density[:objects])
    title('KNN density: Outlier score')

    print("\n")
    print("For KNN density")
    for k in range(1, objects + 1):
        print(i[k])

    ### K-nearest neigbor average relative density
    # Density of each point relative to the mean density of its neighbors
    # (column 0 of the neighbor index is the point itself, hence i[:, 1:]).
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg.rel.densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]

    figure()
    bar(range(objects), avg_rel_density[:objects])
    title('KNN average relative density: Outlier score')

    print("\n")
    print("For KNN average relative density")
    for k in range(1, objects + 1):
        print(i_avg_rel[k])

    ### Distance to K'th nearest neighbor outlier score
    K = 25
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbor (large = outlier).
    score = D[:, K - 1]
    i = score.argsort()
    score = score[i[::-1]]

    figure()
    bar(range(objects), score[:objects])
    title('25th neighbor distance: Outlier score')

    print("\n")
    print("For 5'th neighbour distance")
    # BUG FIX: the original printed i[462 - k], hard-coding the data-set
    # size; index from the end so the highest scores are reported for any N.
    for k in range(1, objects + 1):
        print(i[-k])

    show()
def Outlier(input_data, index_to_check):
    """Score observations by four outlier measures and plot/print the results.

    Runs a Gaussian kernel density estimate (width chosen by leave-one-out
    cross-validation), a K-nearest-neighbor density, a KNN average relative
    density, and a K'th-neighbor-distance score, printing the five most
    extreme indices/values of each and plotting the 20 most extreme scores.

    Parameters
    ----------
    input_data :
        Raw data passed to split_train_test (project helper; schema defined
        elsewhere in the project).
    index_to_check :
        Selector forwarded to split_train_test.
    """
    X, y = split_train_test(input_data, index_to_check)

    ### Gausian Kernel density estimator
    # Cross-validate kernel width by leave-one-out cross-validation
    # (efficient implementation in gausKernelDensity function):
    # score each candidate width by the total leave-one-out log-density.
    widths = X.var(axis=0).max() * (2.0**np.arange(-10, 3))
    logP = np.zeros(np.size(widths))
    for i, w in enumerate(widths):
        print('Fold {:2d}, w={:f}'.format(i, w))
        density, log_density = gausKernelDensity(X, w)
        logP[i] = log_density.sum()
    width = widths[logP.argmax()]
    print('Optimal estimated width is: {0}'.format(width))

    # Evaluate density for the estimated width; sort ascending so the
    # lowest-density (most outlier-like) observations come first.
    density, log_density = gausKernelDensity(X, width)
    i = (density.argsort(axis=0)).ravel()
    density = density[i].reshape(-1, )
    print('The index of the lowest GKD estimator object: {0}'.format(i[0:5]))
    print('The value of the lowest GKD estimator object: {0}'.format(
        density[0:5]))

    # Plot density estimate of outlier score.
    figure(1)
    bar(range(20), density[:20])
    title('Density estimate')

    ### K-neighbors density estimator
    # Neighbor count to use:
    K = 5
    # Find the K nearest neighbors; density is the inverse mean distance.
    # NOTE(review): kneighbors on the training set returns each point as its
    # own first neighbor (distance 0), so the mean is over K-1 real
    # neighbors plus self — confirm this is the intended estimator.
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)

    # Sort the scores ascending.
    i = density.argsort()
    density = density[i]
    print(
        'The index of the lowest KNN 5 neighbours density object: {0}'.format(
            i[0:5]))
    print(
        'The value of the lowest KNN 5 neighbours density object: {0}'.format(
            density[0:5]))

    # Plot k-neighbor estimate of outlier score (distances).
    figure(3)
    bar(range(20), density[:20])
    title('KNN density: Outlier score')

    ### K-nearest neigbor average relative density
    # Density of each point relative to the mean density of its neighbors
    # (column 0 of the neighbor index is the point itself, hence i[:, 1:]).
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)
    density = 1. / (D.sum(axis=1) / K)
    avg_rel_density = density / (density[i[:, 1:]].sum(axis=1) / K)

    # Sort the avg.rel.densities ascending.
    i_avg_rel = avg_rel_density.argsort()
    avg_rel_density = avg_rel_density[i_avg_rel]
    print('The index of the lowest KNN average relative density object: {0}'.
          format(i_avg_rel[0:5]))
    print('The value of the lowest KNN average relative density object: {0}'.
          format(avg_rel_density[0:5]))

    # Plot k-neighbor estimate of outlier score (distances).
    figure(5)
    bar(range(20), avg_rel_density[:20])
    title('KNN average relative density: Outlier score')

    ### Distance to 5'th nearest neighbor outlier score
    K = 5
    knn = NearestNeighbors(n_neighbors=K).fit(X)
    D, i = knn.kneighbors(X)

    # Outlier score: distance to the K'th neighbor, sorted descending so
    # the largest (most outlier-like) distances come first.
    score = D[:, K - 1]
    i = score.argsort()
    score = score[i[::-1]]
    print(
        'The index of the highest KNN 5 neighbours outlier score: {0}'.format(
            i[0:5]))
    print(
        'The value of the highest KNN 5 neighbours outlier score: {0}'.format(
            score[0:5]))

    # Plot k-neighbor estimate of outlier score (distances).
    figure(7)
    bar(range(20), score[:20])
    title('5th neighbor distance: Outlier score')

    show()
    print('Ran Exercise 11.4.1')