Ejemplo n.º 1
0
def output(partId):
    """Return the formatted result string for one graded part of the exercise.

    Small deterministic fixtures are built from ``sin``/``cos`` sequences,
    the function under test for the requested part is called on them, and
    its numeric results are passed through ``formatter`` with the
    ``'%0.5f '`` format.

    Parameters
    ----------
    partId : str
        Part selector, one of ``'1'`` .. ``'6'``:

        * ``'1'`` -- ``estimateGaussian(X)``: mu followed by sigma2.
        * ``'2'`` -- ``selectThreshold(yval, pval)``: epsilon followed by F1.
        * ``'3'`` / ``'5'`` -- first return value of ``cofiCostFunc`` with a
          last argument of ``0`` / ``1.5``.
        * ``'4'`` / ``'6'`` -- second return value of ``cofiCostFunc`` with a
          last argument of ``0`` / ``1.5``.

    Returns
    -------
    The concatenation of the ``formatter`` results for the selected part
    (presumably a string -- ``formatter`` is defined outside this block).

    Raises
    ------
    ValueError
        If ``partId`` is not one of the six known selectors.  Previously
        this case fell through to ``return out`` and failed with an
        ``UnboundLocalError`` that did not name the offending value.
    """
    # Random Test Cases
    n_u = 3
    n_m = 4
    n = 5
    # order='F' fills the matrices column by column.
    X = np.sin(np.arange(1, 1 + n_m * n)).reshape(n_m, n, order='F')
    Theta = np.cos(np.arange(1, 1 + n_u * n)).reshape(n_u, n, order='F')
    Y = np.sin(np.arange(1, 1 + 2 * n_m * n_u, 2)).reshape(n_m, n_u, order='F')
    R = Y > 0.5
    # pval must be taken from Y *before* Y is masked by R below.
    pval = np.concatenate([abs(Y.ravel('F')), [0.001], [1]])
    Y = Y * R
    yval = np.concatenate([R.ravel('F'), [1], [0]])
    # Row-major flattening; the gradient is unpacked the same way below.
    params = np.concatenate([X.ravel(), Theta.ravel()])

    def _cofi_cost(last_arg):
        # Format the first value returned by cofiCostFunc.
        # presumably last_arg is the regularisation strength -- TODO confirm
        J, _ = cofiCostFunc(params, Y, R, n_u, n_m, n, last_arg)
        return formatter('%0.5f ', J.ravel())

    def _cofi_grad(last_arg):
        # Split the flat gradient into its X and Theta parts and format both
        # in column-major order.
        _, grad = cofiCostFunc(params, Y, R, n_u, n_m, n, last_arg)
        X_grad = grad[:n_m * n].reshape(n_m, n)
        Theta_grad = grad[n_m * n:].reshape(n_u, n)
        return formatter(
            '%0.5f ',
            np.concatenate([X_grad.ravel('F'),
                            Theta_grad.ravel('F')]))

    if partId == '1':
        mu, sigma2 = estimateGaussian(X)
        out = formatter('%0.5f ', mu.ravel())
        out += formatter('%0.5f ', sigma2.ravel())
    elif partId == '2':
        bestEpsilon, bestF1 = selectThreshold(yval, pval)
        out = formatter('%0.5f ', bestEpsilon.ravel())
        out += formatter('%0.5f ', bestF1.ravel())
    elif partId == '3':
        out = _cofi_cost(0)
    elif partId == '4':
        out = _cofi_grad(0)
    elif partId == '5':
        out = _cofi_cost(1.5)
    elif partId == '6':
        out = _cofi_grad(1.5)
    else:
        # Fail loudly instead of reaching `return out` with `out` unbound.
        raise ValueError('unknown partId: %r' % (partId,))
    return out
Ejemplo n.º 2
0
# Display the current figure, then wait for the user before moving on.
# NOTE(review): `show` is called unqualified here while `plt.` is used below;
# both names must be imported outside this excerpt -- confirm.
show()
raw_input("Program paused. Press Enter to continue...")  


## ================== Part 2: Estimate the dataset statistics ===================
#  For this exercise, we assume a Gaussian distribution for the dataset.
#
#  We first estimate the parameters of our assumed Gaussian distribution, 
#  then compute the probabilities for each of the points and then visualize 
#  both the overall distribution and where each of the points falls in 
#  terms of that distribution.
#
print 'Visualizing Gaussian fit.'

#  Estimate mu and sigma2 from the data matrix X (defined earlier in the
#  script, outside this excerpt)
mu, sigma2 = estimateGaussian(X)

#  Returns the density of the multivariate normal at each data point (row) 
#  of X
p = multivariateGaussian(X, mu, sigma2)

#  Visualize the fit, label the axes and display the figure
visualizeFit(X,  mu, sigma2)
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
show()

#  Wait for the user before continuing with the next part
raw_input("Program paused. Press Enter to continue...")  

## ================== Part 3: Find Outliers ===================
#  Now you will find a good epsilon threshold using a cross-validation set
Ejemplo n.º 3
0
# Announce the pause, then block until the user presses Enter.
print 'Program paused. Press enter to continue.'
raw_input()


## ================== Part 2: Estimate the dataset statistics ===================
#  For this exercise, we assume a Gaussian distribution for the dataset.
#
#  We first estimate the parameters of our assumed Gaussian distribution,
#  then compute the probabilities for each of the points and then visualize
#  both the overall distribution and where each of the points falls in
#  terms of that distribution.
#
print 'Visualizing Gaussian fit.\n'

#  Estimate mu and sigma2 from the data matrix X (defined earlier in the
#  script, outside this excerpt)
mu, sigma2 = estimateGaussian(X)

#  Returns the density of the multivariate normal at each data point (row)
#  of X
p = multivariateGaussian(X, mu, sigma2)

#  Visualize the fit: open a figure, draw the fit, label the axes and show it.
#  NOTE(review): `figure`, `xlabel` and `ylabel` are used unqualified, so they
#  are presumably star-imported from pylab outside this excerpt -- confirm.
fig = figure()
visualizeFit(X,  mu, sigma2)
xlabel('Latency (ms)')
ylabel('Throughput (mb/s)')
fig.show()

#  Announce the pause, then block until the user presses Enter.
print 'Program paused. Press enter to continue.'
raw_input()
Ejemplo n.º 4
0
def ex8():
    """Run the anomaly-detection walkthrough of Exercise 8.

    Machine Learning Online Class -- Exercise 8 | Anomaly Detection and
    Collaborative Filtering.

    This driver relies on ``estimateGaussian``, ``multivariateGaussian``,
    ``visualizeFit`` and ``selectThreshold`` being available in the enclosing
    module.  The exercise asks you to complete ``estimateGaussian``,
    ``selectThreshold`` and ``cofiCostFunc``; this function itself does not
    need to be changed.

    The four parts, in order:

    1. Load ``ex8data1.mat`` and plot the two server statistics (latency and
       throughput) of each machine; saved as ``figure1.png``.
    2. Fit the Gaussian parameters and draw the fit on top; saved as
       ``figure2.png``.
    3. Pick a threshold on the cross-validation set and mark the training
       points that fall below it; saved as ``figure3.png``.
    4. Repeat the fit and threshold selection on ``ex8data2.mat``, where more
       features describe each data point, and print the results.

    Progress messages are printed along the way.  Nothing is returned.
    """

    def _read_dataset(path):
        # Fetch the training matrix and the cross-validation pair from a
        # .mat file.
        contents = scipy.io.loadmat(path)
        return contents['X'], contents['Xval'], contents['yval']

    def _label_axes():
        # Axis captions shared by the plots of the first dataset.
        plt.xlabel('Latency (ms)')
        plt.ylabel('Throughput (mb/s)')

    # ---------------- Part 1: Load Example Dataset ----------------
    # A small, easy-to-visualize dataset: two network server statistics
    # per machine, used to look for possibly faulty (or very fast) machines.
    print('Visualizing example dataset for outlier detection.\n')

    X, Xval, yval = _read_dataset('ex8data1.mat')

    # Plot the raw training points.
    plt.plot(X[:, 0], X[:, 1], 'bx')
    plt.axis([0, 30, 0, 30])
    _label_axes()
    plt.savefig('figure1.png')

    print('Program paused. Press enter to continue.')

    # ---------------- Part 2: Estimate the dataset statistics ----------------
    # Assume a Gaussian distribution: estimate its parameters, evaluate
    # every training point under it, and draw the fit.
    print('Visualizing Gaussian fit.\n')

    # Estimate mu and sigma2.
    mu, sigma2 = estimateGaussian(X)

    # Density of the multivariate normal at each data point (row) of X.
    train_density = multivariateGaussian(X, mu, sigma2)

    # Drawn onto the figure that is already open.
    visualizeFit(X, mu, sigma2)
    _label_axes()
    plt.savefig('figure2.png')

    print('Program paused. Press enter to continue.\n')

    # ---------------- Part 3: Find Outliers ----------------
    # Choose a threshold from the cross-validation probabilities under the
    # estimated Gaussian distribution.
    cv_density = multivariateGaussian(Xval, mu, sigma2)

    threshold, score = selectThreshold(yval, cv_density)
    print('Best epsilon found using cross-validation: %e' % threshold)
    print('Best F1 on Cross Validation Set:  %f' % score)
    print('   (you should see a value epsilon of about 8.99e-05)\n')

    # Training points whose density falls below the threshold.
    flagged = train_density < threshold

    # Draw a red circle around those outliers.
    plt.plot(X[flagged, 0], X[flagged, 1], 'ro', lw=2, ms=10)
    plt.savefig('figure3.png')

    print('Program paused. Press enter to continue.\n')

    # ---------------- Part 4: Multidimensional Outliers ----------------
    # Same steps on a harder problem: more features describe each data
    # point and only some of them indicate whether a point is an outlier.
    X, Xval, yval = _read_dataset('ex8data2.mat')

    mu, sigma2 = estimateGaussian(X)

    # Training set first, then the cross-validation set.
    train_density = multivariateGaussian(X, mu, sigma2)
    cv_density = multivariateGaussian(Xval, mu, sigma2)

    # Find the best threshold.
    threshold, score = selectThreshold(yval, cv_density)

    print('Best epsilon found using cross-validation: %e' % threshold)
    print('Best F1 on Cross Validation Set:  %f' % score)
    print('# Outliers found: %d ' % np.sum(train_density < threshold))
    print('   (you should see a value epsilon of about 1.38e-18)\n')
Ejemplo n.º 5
0
import scipy.io as spio
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import pandas as pd
from estimateGaussian import estimateGaussian
from multivariateGaussian import multivariateGaussian

# Load the first exercise dataset. squeeze_me=True asks loadmat to drop
# unit-length dimensions from the arrays it returns.
mat = spio.loadmat('ex8data1.mat', squeeze_me=True)
# print(mat.keys())
data = mat['X']
xval = mat['Xval']
yval = mat['yval']

# Scatter-plot the training data. The labels assume column 0 is latency and
# column 1 is throughput -- TODO confirm against the dataset description.
df = pd.DataFrame(data, columns=["Latency (ms)", "Throughput (mb/s)"])
sns.scatterplot(x="Latency (ms)", y="Throughput (mb/s)", data=df)

plt.show()

# Estimate the Gaussian parameters from the training data and print the shape
# and value of sigma2 for inspection.
[mu, sigma2] = estimateGaussian(data)
# print(mu)
print(np.shape(sigma2))
print(sigma2)
# Evaluate the fitted model on the training data (presumably one density
# value per row of `data` -- verify against multivariateGaussian).
p = multivariateGaussian(data, mu, sigma2)
import scipy.optimize as opt
from sklearn.svm import SVC
from estimateGaussian import estimateGaussian
from multivariateGaussian import multivariateGaussian
from visualizeFit import visualizeFit
from selectThreshold import selectThreshold

# Load the first dataset and pull out the training matrix and the
# cross-validation pair.
# NOTE(review): a bare `loadmat` is not among the imports visible in this
# part of the file -- confirm it is imported elsewhere.
data = loadmat('ex8data1.mat')
X = data['X']
Xval = data["Xval"]
yval = data['yval']
# Plot the two columns of X against each other as blue crosses.
plt.plot(X[:, 0], X[:, 1], 'bx')
#plt.show()

# Section 1.2: estimate the Gaussian parameters and evaluate the training data
(mu, sig2) = estimateGaussian(X)
p = multivariateGaussian(X, mu, sig2)
#visualizeFit(p,mu,sig2)
#plt.show()

# Section 1.3: choose a threshold on the cross-validation set and flag the
# training points whose value falls below it

pcv = multivariateGaussian(Xval, mu, sig2)

# NOTE(review): the arguments are passed as (pcv, yval); confirm this matches
# the parameter order of the imported selectThreshold.
(epi, f1) = selectThreshold(pcv, yval)
outliers = (p < epi)
#plt.scatter(X[outliers,0],X[outliers,1],marker ="o",facecolor="none",edgecolor="r",s=70)
#plt.show()

# Section 1.4: load the second dataset
mat2 = loadmat("ex8data2.mat")
Ejemplo n.º 7
0
# NOTE(review): `sco`, `eG`, `mG`, `plot` and `sT` are module aliases imported
# outside this excerpt -- confirm what each one refers to.
print('Visualizing example dataset for outlier detection.')
# Load the dataset and unpack the training matrix and cross-validation pair.
data = sco.loadmat("ex8data1.mat")
X, Xval, yval = data["X"], data["Xval"], data["yval"]

#  Visualize the example dataset
plt.scatter(X[:, 0], X[:, 1], marker="x", color='b', linewidths=0.01)
plt.axis([0, 30, 0, 30])
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
plt.show()

## ================== Part 2: Estimate the dataset statistics ===================
print('Visualizing Gaussian fit.')
#  Estimate mu and sigma2
mu, sigma2 = eG.estimateGaussian(X)

# Returns the density of the multivariate normal at each data point (row) of X
p = mG.multivariateGaussian(X, mu, sigma2)
#  Visualize the fit
plot.visualizeFit(X, mu, sigma2)

## ================== Part 3: Find Outliers ===================
#  Now you will find a good epsilon threshold using a cross-validation set probabilities
#  given the estimated Gaussian distribution

# Evaluate the cross-validation points under the parameters fitted on X.
pval = mG.multivariateGaussian(Xval, mu, sigma2)

# Select the threshold from the cross-validation labels and values, then
# report it together with its F1 score.
epsilon, F1 = sT.selectThreshold(yval, pval)
print('Best epsilon found using cross-validation: %e\n' % epsilon)
print('Best F1 on Cross Validation Set:  %f\n' % F1)