def visualizeFit(X, mu, sigma2):
    """Plot the data points and contour lines of the fitted Gaussian.

    The density is evaluated with ``multivariateGaussian`` on a square grid
    covering [0, 35) in both features (step 0.5) and drawn as contour lines
    at levels 1e-20, 1e-17, ..., 1e-2 on top of a scatter of ``X``.

    Args:
        X: 2-D array; columns 0 and 1 are drawn as blue crosses.
        mu: Forwarded unchanged to ``multivariateGaussian``.
        sigma2: Forwarded unchanged to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Flatten and reshape in the same (row-major) order so that Z[i, j] is
    # the density at (X1[i, j], X2[i, j]).  Flattening the transposes and
    # then reshaping row-major would hand contour() a transposed surface.
    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2).reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=5)

    # Do not draw contours if the density contains infinities.
    if not np.isinf(Z).any():
        plt.contour(X1, X2, Z, 10.0**np.arange(-20, 0, 3))
def visualizeFit(X, mu, sigma2):
    """Overlay Gaussian density contours on a scatter plot of the data.

    ``multivariateGaussian`` is evaluated on a 0.5-spaced grid over
    [0, 35) x [0, 35); the result is contoured at 10**-20, 10**-17, ...,
    10**-2.

    Args:
        X: 2-D array whose first two columns are plotted as blue crosses.
        mu: Passed through to ``multivariateGaussian``.
        sigma2: Passed through to ``multivariateGaussian``.
    """
    axis_values = np.arange(0, 35, 0.5)
    X1, X2 = np.meshgrid(axis_values, axis_values)

    # One grid point per row, enumerated row-major.  The reshape below uses
    # the same order, which keeps Z aligned element-for-element with X1/X2
    # (mixing a transposed flatten with a row-major reshape transposes Z).
    grid = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=5)

    # Skip the contours when any density value is infinite.
    if np.isinf(Z).any():
        return
    plt.contour(X1, X2, Z, 10.0**np.arange(-20, 0, 3))
def visualizeFit(p, mu, sigma2):
    """Draw contour lines of the fitted Gaussian and label the axes.

    The density from ``multivariateGaussian`` is evaluated on a 70 x 70 grid
    over [0, 35] x [0, 35] and contoured at 1e-20, 1e-17, ..., 1e-2.

    Args:
        p: Accepted for interface compatibility; not used here.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.linspace(0, 35, num=70)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Two-column matrix: one grid point per row.
    grid_points = np.hstack((X1.reshape(-1, 1), X2.reshape(-1, 1)))
    density = multivariateGaussian(grid_points, mu, sigma2)
    density_grid = np.reshape(density, X1.shape)

    exponents = np.arange(-20., 1, 3)
    plt.contour(X1, X2, density_grid, 10**exponents)

    plt.xlim(0, 35)
    plt.ylim(0, 35)
    plt.xlabel("Latency (ms)")
    plt.ylabel("Throughput (mb/s)")
def visualizeFit(X, mu, sigma2):
    """Scatter the data and draw contours of the fitted Gaussian density.

    The grid spans 0 to 35 inclusive in steps of 0.5 on both axes.  Contours
    are skipped if any density value is infinite.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35.1, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Column-major flattening, undone by the column-major reshape below.
    points = np.c_[X1.ravel(order='F'), X2.ravel(order='F')]
    Z = multivariateGaussian(points, mu, sigma2)
    Z = Z.reshape(X1.shape, order='F')

    plt.plot(X[:, 0], X[:, 1], 'bx')

    has_infinities = np.isinf(Z).any()
    if not has_infinities:
        exponents = np.arange(-20, 0.1, 3)
        plt.contour(X1, X2, Z, np.power(10, exponents))
def visualizeFit(X, mu, sigma2):
    """Show the probability density function of the Gaussian distribution.

    Each example is drawn at a location (x1, x2) given by its first two
    feature values; contour lines of the density returned by
    ``multivariateGaussian`` are drawn on top at 1e-20, 1e-17, ..., 1e-2.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded unchanged to ``multivariateGaussian``.
        sigma2: Forwarded unchanged to ``multivariateGaussian``.
    """
    ticks = np.linspace(0, 35, 71)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Flatten row-major and reshape row-major so Z[i, j] corresponds to
    # (X1[i, j], X2[i, j]).  Flattening the transposed grids while reshaping
    # row-major would mirror the contours across the diagonal.
    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Do not plot if there are infinities.
    if not np.isinf(Z).any():
        plt.contour(X1, X2, Z, 10.0**np.arange(-20, 0, 3))
def visualizeFit(X, mu, sigma2):
    """Show the probability density function of the Gaussian distribution.

    Each example is drawn at a location (x1, x2) given by its first two
    feature values; contour lines of the density returned by
    ``multivariateGaussian`` are drawn on top, then ``show()`` is called.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded unchanged to ``multivariateGaussian``.
        sigma2: Forwarded unchanged to ``multivariateGaussian``.
    """
    ticks = np.linspace(0, 35, 71)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Keep the flatten order and the reshape order identical (row-major);
    # otherwise the density surface is transposed relative to X1/X2.
    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Do not plot contours if there are infinities.
    if not np.isinf(Z).any():
        plt.contour(X1, X2, Z, 10.0**np.arange(-20, 0, 3))

    # NOTE(review): assumed to run regardless of the infinity check above.
    show()
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Shows the probability density function of the Gaussian distribution.
    Each example has a location (x1, x2) that depends on its feature values.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded unchanged to ``multivariateGaussian``.
        sigma2: Forwarded unchanged to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)
    grid_points = np.vstack((X1.ravel(), X2.ravel())).T

    # reshape() rather than in-place resize(): it leaves the array returned
    # by multivariateGaussian untouched and also works when that array is a
    # non-contiguous view.
    Z = multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Do not plot if there are infinities.
    if np.sum(np.isinf(Z)) == 0:
        plt.contour(X1, X2, Z, 10.0**np.arange(-20, 0, 3))
def visualizeFit(X, mu, Sigma2):
    """Open a new figure with the data and contours of the fitted Gaussian.

    The density is evaluated on a 71 x 71 grid over [0, 35] x [0, 35]; the
    visible axes are limited to [0, 30] x [0, 30].  Contour levels are
    ten to the power of seven evenly spaced exponents from -20 to 0.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        Sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.linspace(0, 35, 71)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, Sigma2)
    Z = Z.reshape(X1.shape)

    plt.figure()
    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=4)
    plt.axis([0, 30, 0, 30])

    # Contours are only drawn for a density free of infinities.
    if np.isinf(Z).any():
        return
    exponents = np.linspace(-20, 0, 7)
    plt.contour(X1, X2, Z, np.power(10, exponents))
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Plots the examples as large blue crosses and, unless the density
    contains infinities, overlays contour lines of the density returned by
    ``multivariateGaussian`` on a 0.5-spaced grid from 0 to 35.

    Args:
        X: 2-D array; columns 0 and 1 give each example's plot location.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    grid_axis = np.arange(0, 35.1, 0.5)
    X1, X2 = np.meshgrid(grid_axis, grid_axis)

    # Column-major ('F') order is used both to flatten and to restore.
    flat_x1 = X1.flatten(order='F')
    flat_x2 = X2.flatten(order='F')
    Z = multivariateGaussian(np.column_stack((flat_x1, flat_x2)), mu, sigma2)
    Z = Z.reshape(X1.shape, order='F')

    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=13, markeredgewidth=1)

    # Do not plot if there are infinities.
    if np.count_nonzero(np.isinf(Z)) == 0:
        levels = np.power(10, np.arange(-20, 0.1, 3))
        plt.contour(X1, X2, Z, levels)
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Draws the examples as blue crosses and overlays contour lines of the
    density returned by ``multivariateGaussian``, evaluated on a 0.5-spaced
    grid from 0 to 35 on both axes.

    Args:
        X: 2-D array; columns 0 and 1 give each example's plot location.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2).reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Levels 1e-20, 1e-17, ..., 1e-2.
    cont_levels = []
    for exponent in range(-20, 0, 3):
        cont_levels.append(10 ** exponent)
    plt.contour(X1, X2, Z, levels=cont_levels, cmap=plt.cm.Paired, alpha=0.9)
def visualizeFit(X, mu, sigma2):
    """Plot the examples and the contours of their estimated density.

    The density comes from ``multivariateGaussian`` evaluated at every node
    of a grid running from 0 to 35 (step 0.5) along both features.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    axis_values = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(axis_values, axis_values)

    # One (x1, x2) grid node per row, in row-major order.
    nodes = np.stack((X1.ravel(), X2.ravel()), axis=1)
    Z = multivariateGaussian(nodes, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    contour_style = {'cmap': plt.cm.Paired, 'alpha': 0.9}
    cont_levels = [10**power for power in range(-20, 0, 3)]
    plt.contour(X1, X2, Z, levels=cont_levels, **contour_style)
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Shows the probability density function of the Gaussian distribution.
    Each example has a location (x1, x2) that depends on its feature values.
    Contours are drawn above other artists (``zorder=100``) and are skipped
    if the density contains infinities.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')

    # Do not plot if there are infinities.
    if np.isinf(Z).any():
        return
    exponents = np.arange(-20., 1, 3)
    plt.contour(X1, X2, Z, levels=10**exponents, zorder=100)
def visualizeFit(X, mu, sigma2):
    """Show the data and the contours of the fitted Gaussian in a new figure.

    Each example is drawn at the location given by its first two feature
    values.  The density from ``multivariateGaussian`` is evaluated on a
    0.5-spaced grid from 0 to 35 and contoured unless its sum is infinite.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as small blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    axis_values = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(axis_values, axis_values)

    # Column-major flatten, matched by the column-major reshape.
    grid = np.column_stack((X1.ravel(order='F'), X2.ravel(order='F')))
    Z = multivariateGaussian(grid, mu, sigma2)
    Z = np.reshape(Z, X1.shape, order='F')

    plt.figure()
    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=3, markeredgewidth=0.5)

    # Do not plot if there are infinities.
    density_total = np.sum(Z)
    if isinf(density_total):
        return
    plt.contour(X1, X2, Z, 10**np.arange(-20, 0, 3, dtype='float'))
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Shows the probability density function of the Gaussian distribution.
    Each example has a location (x1, x2) that depends on its feature values.
    The density is evaluated on a 61 x 61 grid over [0, 30] x [0, 30].

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded unchanged to ``multivariateGaussian``.
        sigma2: Forwarded unchanged to ``multivariateGaussian``.
    """
    coords = linspace(0, 30, 61)
    X1, X2 = meshgrid(coords, coords)
    Z = multivariateGaussian(column_stack((X1.ravel(), X2.ravel())), mu, sigma2)
    Z = reshape(Z, shape(X1))

    # Successive plotting calls accumulate on the current axes by default,
    # so no hold(True)/hold(False) toggling is needed (hold() no longer
    # exists in Matplotlib >= 3.0).
    plot(X[:, 0], X[:, 1], 'bx')

    # Do not plot if there are infinities.  The ndarray method is used so
    # the check does not depend on whether `any` is the builtin or NumPy's.
    if not isinf(Z).any():
        contour(X1, X2, Z, power(10., arange(-20, 0, 3)))
def visualizeFit(X, mu, sigma2):
    """Draw contour lines of the estimated Gaussian density.

    The density returned by ``multivariateGaussian`` is evaluated on a
    0.5-spaced grid over [0, 35) x [0, 35).

    Args:
        X: Accepted for interface compatibility; not used here.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    Z = multivariateGaussian(grid_points, mu, sigma2).reshape(X1.shape)

    # These levels are the reference values given in the assignment; they
    # could also be derived from the computed probabilities.
    cont_levels = [10**exponent for exponent in range(-20, 0, 3)]
    plt.contour(X1, X2, Z, cont_levels)
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution.

    Shows the probability density function of the Gaussian distribution.
    Each example has a location (x1, x2) that depends on its feature values.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as large blue crosses.
        mu: Forwarded unchanged to ``mvg.multivariateGaussian``.
        sigma2: Forwarded unchanged to ``mvg.multivariateGaussian``.
    """
    ticks = np.arange(0, 35.1, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Column-major ('F') order is used both to flatten and to restore.
    grid_points = np.column_stack(
        (X1.reshape(X1.size, order='F'), X2.reshape(X2.size, order='F')))
    Z = mvg.multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape, order='F')

    # pyplot draws successive artists on the same axes by default, so the
    # former plt.hold(True)/plt.hold(False) calls are unnecessary; plt.hold
    # was removed in Matplotlib 3.0 and would raise AttributeError.
    plt.plot(X[:, 0], X[:, 1], 'bx', markersize=13, markeredgewidth=1)

    # Do not plot if there are infinities.
    if np.sum(np.isinf(Z)) == 0:
        plt.contour(X1, X2, Z, np.power(10, np.arange(-20, 0.1, 3)))
def visualizeFit(X, mu, sigma2):
    """Plot the data and the Gaussian density contours on a fresh figure.

    Each example has a location (x1, x2) that depends on its feature values.
    The density is evaluated on a 0.5-spaced grid from 0 to 35; the visible
    axes are limited to [0, 30] x [0, 30] and labelled.

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    import numpy as np
    from multivariateGaussian import multivariateGaussian
    import matplotlib.pyplot as plt

    ticks = np.arange(0, 35.5, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.column_stack((X1.ravel(), X2.ravel()))
    density = multivariateGaussian(grid_points, mu, sigma2)
    Z = np.reshape(density, X1.shape)

    _, ax = plt.subplots()
    plt.plot(X[:, 0], X[:, 1], 'bx')
    levels = 10.**np.arange(-20, 0, 3)
    plt.contour(X1, X2, Z, levels)

    ax.set_xlim(0, 30)
    ax.set_ylim(0, 30)
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
def visualizeFit(X, mu, sigma2):
    """Visualize the dataset and its estimated distribution, then show it.

    Shows the probability density function of the Gaussian distribution.
    Each example has a location (x1, x2) that depends on its feature values.
    The contours are drawn unconditionally (no infinity check).

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded to ``multivariateGaussian``.
        sigma2: Forwarded to ``multivariateGaussian``.
    """
    ticks = np.arange(0, 35, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    # Stack the two flattened grids side by side: one grid point per row.
    first_column = X1.reshape(-1, 1)
    second_column = X2.reshape(-1, 1)
    grid_points = np.hstack((first_column, second_column))

    Z = multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    plt.plot(X[:, 0], X[:, 1], 'bx')
    levels = 10.**np.arange(-20, 0, 3)
    plt.contour(X1, X2, Z, levels)

    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
    plt.show()
def visualizeFit(X, mu, sigma2):
    """Scatter the data, overlay density contours, and show the figure.

    The density from ``mG.multivariateGaussian`` is evaluated on a
    0.5-spaced grid over [0, 35) x [0, 35) and drawn as 10 automatically
    chosen black contour levels.  The visible axes are limited to
    [0, 30] x [0, 30].

    Args:
        X: 2-D array; columns 0 and 1 are plotted as blue crosses.
        mu: Forwarded unchanged to ``mG.multivariateGaussian``.
        sigma2: Forwarded unchanged to ``mG.multivariateGaussian``.
    """
    ticks = np.arange(0, 35, 0.5)
    X1, X2 = np.meshgrid(ticks, ticks)

    grid_points = np.c_[X1.ravel(), X2.ravel()]
    Z = mG.multivariateGaussian(grid_points, mu, sigma2)
    Z = Z.reshape(X1.shape)

    # Plot the points in the bottom layer.
    plt.scatter(X[:, 0], X[:, 1], marker="x", color='b', linewidths=0.01)
    plt.axis([0, 30, 0, 30])
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')

    # Plot contours; do not plot if there are infinities.
    if not np.isinf(Z).any():
        # contour() takes `linewidths`; a singular `linewidth` keyword is
        # not recognised and the requested width would not be applied.
        plt.contour(X1, X2, Z, 10, colors='black', linewidths=0.5)

    plt.show()
## ================== Part 2: Estimate the dataset statistics ===================
#  For this exercise, we assume a Gaussian distribution for the dataset.
#
#  We first estimate the parameters of our assumed Gaussian distribution,
#  then compute the probabilities for each of the points and then visualize
#  both the overall distribution and where each of the points falls in
#  terms of that distribution.
#
# A single parenthesised argument prints identically on Python 2 and 3.
print('Visualizing Gaussian fit.')

#  Estimate mu and sigma2
mu, sigma2 = estimateGaussian(X)

#  Returns the density of the multivariate normal at each data point (row)
#  of X
p = multivariateGaussian(X, mu, sigma2)

#  Visualize the fit
visualizeFit(X, mu, sigma2)
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
show()

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    _prompt = raw_input
except NameError:
    _prompt = input
_prompt("Program paused. Press Enter to continue...")

## ================== Part 3: Find Outliers ===================
#  Now you will find a good epsilon threshold using a cross-validation set
#  probabilities given the estimated Gaussian distribution
#
pval = multivariateGaussian(Xval, mu, sigma2)
from sklearn.svm import SVC
from estimateGaussian import estimateGaussian
from multivariateGaussian import multivariateGaussian
from visualizeFit import visualizeFit
from selectThreshold import selectThreshold

# 1.1: load the first dataset and scatter the training examples.
data = loadmat('ex8data1.mat')
X, Xval, yval = data['X'], data["Xval"], data['yval']
plt.plot(X[:, 0], X[:, 1], 'bx')

# 1.2: estimate the Gaussian parameters and evaluate the training examples.
mu, sig2 = estimateGaussian(X)
p = multivariateGaussian(X, mu, sig2)

# 1.3: evaluate the validation examples, obtain a threshold (epi) and its
# score (f1) from selectThreshold, and flag training examples below it.
pcv = multivariateGaussian(Xval, mu, sig2)
epi, f1 = selectThreshold(pcv, yval)
outliers = p < epi

# 1.4: load the second dataset.
mat2 = loadmat("ex8data2.mat")
X2 = mat2["X"]
import scipy.io as spio
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import pandas as pd
from estimateGaussian import estimateGaussian
from multivariateGaussian import multivariateGaussian

# Load the dataset; squeeze_me=True squeezes unit-length array dimensions.
mat = spio.loadmat('ex8data1.mat', squeeze_me=True)
data, xval, yval = mat['X'], mat['Xval'], mat['yval']

# Scatter the two feature columns under their display names.
df = pd.DataFrame(data, columns=["Latency (ms)", "Throughput (mb/s)"])
sns.scatterplot(data=df, x="Latency (ms)", y="Throughput (mb/s)")
plt.show()

# Estimate the Gaussian parameters and report the variance term.
mu, sigma2 = estimateGaussian(data)
print(np.shape(sigma2))
print(sigma2)

p = multivariateGaussian(data, mu, sigma2)
## ================== Part 2: Estimate the dataset statistics ===================
#  For this exercise, we assume a Gaussian distribution for the dataset.
#
#  We first estimate the parameters of our assumed Gaussian distribution,
#  then compute the probabilities for each of the points and then visualize
#  both the overall distribution and where each of the points falls in
#  terms of that distribution.
#
# A single parenthesised argument prints identically on Python 2 and 3.
print('Visualizing Gaussian fit.\n')

#  Estimate mu and sigma2
mu, sigma2 = estimateGaussian(X)

#  Returns the density of the multivariate normal at each data point (row)
#  of X
p = multivariateGaussian(X, mu, sigma2)

#  Visualize the fit
fig = figure()
visualizeFit(X, mu, sigma2)
xlabel('Latency (ms)')
ylabel('Throughput (mb/s)')
fig.show()

print('Program paused. Press enter to continue.')
# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    _wait_for_enter = raw_input
except NameError:
    _wait_for_enter = input
_wait_for_enter()

## ================== Part 3: Find Outliers ===================
#  Now you will find a good epsilon threshold using a cross-validation set
#  probabilities given the estimated Gaussian distribution
#
def ex8():
    """Run Exercise 8 (anomaly detection) from start to finish.

    Part 1 loads ``ex8data1.mat`` and plots it; Part 2 estimates the Gaussian
    parameters and plots the fit; Part 3 selects a threshold on the
    cross-validation set and marks the training outliers; Part 4 repeats the
    estimate/threshold steps on ``ex8data2.mat``.  The three plots are saved
    as ``figure1.png``, ``figure2.png`` and ``figure3.png`` and progress is
    printed to stdout.

    The exercise expects ``estimateGaussian``, ``selectThreshold`` and
    ``cofiCostFunc`` to be completed elsewhere; nothing in this function
    needs to change.
    """
    ## ================== Part 1: Load Example Dataset ===================
    #  A small, easy-to-visualize dataset of 2 network server statistics
    #  across several machines: the latency and throughput of each machine.
    print('Visualizing example dataset for outlier detection.\n')

    dataset = scipy.io.loadmat('ex8data1.mat')
    X, Xval, yval = dataset['X'], dataset['Xval'], dataset['yval']

    #  Visualize the example dataset
    plt.plot(X[:, 0], X[:, 1], 'bx')
    plt.axis([0, 30, 0, 30])
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
    plt.savefig('figure1.png')

    print('Program paused. Press enter to continue.')

    ## ================== Part 2: Estimate the dataset statistics ===================
    #  Assume a Gaussian distribution: estimate its parameters, compute the
    #  probability of each point, then visualize the distribution and where
    #  each point falls in it.
    print('Visualizing Gaussian fit.\n')

    mu, sigma2 = estimateGaussian(X)

    #  Density of the multivariate normal at each data point (row) of X
    p = multivariateGaussian(X, mu, sigma2)

    visualizeFit(X, mu, sigma2)
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
    plt.savefig('figure2.png')

    print('Program paused. Press enter to continue.\n')

    ## ================== Part 3: Find Outliers ===================
    #  Find a good epsilon threshold using the cross-validation set
    #  probabilities given the estimated Gaussian distribution.
    pval = multivariateGaussian(Xval, mu, sigma2)
    epsilon, F1 = selectThreshold(yval, pval)

    print('Best epsilon found using cross-validation: %e' % epsilon)
    print('Best F1 on Cross Validation Set: %f' % F1)
    print(' (you should see a value epsilon of about 8.99e-05)\n')

    #  Training examples whose density falls below the threshold
    outliers = p < epsilon

    #  Draw a red circle around those outliers
    plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, ms=10)
    plt.savefig('figure3.png')

    print('Program paused. Press enter to continue.\n')

    ## ================== Part 4: Multidimensional Outliers ===================
    #  Apply the same steps to a harder problem in which more features
    #  describe each datapoint and only some features indicate whether a
    #  point is an outlier.
    dataset = scipy.io.loadmat('ex8data2.mat')
    X, Xval, yval = dataset['X'], dataset['Xval'], dataset['yval']

    mu, sigma2 = estimateGaussian(X)

    #  Training set, then cross-validation set
    p = multivariateGaussian(X, mu, sigma2)
    pval = multivariateGaussian(Xval, mu, sigma2)

    #  Find the best threshold
    epsilon, F1 = selectThreshold(yval, pval)

    print('Best epsilon found using cross-validation: %e' % epsilon)
    print('Best F1 on Cross Validation Set: %f' % F1)
    print('# Outliers found: %d ' % np.sum(p < epsilon))
    print(' (you should see a value epsilon of about 1.38e-18)\n')