YAhat = learnerA.predict(XA)

mse = mean_squared_error(YA, YAhat)
print("Error rate on data set A: {}\n".format(mse))

YBhat = learnerB.predict(XB)

mseB = mean_squared_error(YB, YBhat)
print("Error rate on data set B: {}".format(mseB))

#ml.plotClassify2D(learnerA, XA, YAhat)
#plt.show()

#ml.plotClassify2D(learnerB, XB, YBhat)
#plt.show()

learnerA.train(XA, YA)
#plt.show()

learnerB.train(XB, YB)
#plt.show()

ml.plotClassify2D(learnerA, XA, YA)
plt.show()

ml.plotClassify2D(learnerB, XB, YB)
plt.show()

#plt.draw()
Example #2
X, Y = iris[:,0:2], iris[:,-1]

# Problem 1: Basics of Clustering

# 1A
plt.scatter(X[:,0],X[:,1],color='b')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.title('Iris data before clustering')
plt.show()

# 1B

z,c,d = ml.cluster.kmeans(X,5)
ml.plotClassify2D(None, X, z)
plt.title('K = 5')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.show()

z,c,d = ml.cluster.kmeans(X,20)
ml.plotClassify2D(None, X, z)
plt.title('K = 20')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.show()

# 1C

z, c = ml.cluster.agglomerative(X, 5, method='min')
Example #3
model = ml.knn.knnClassify(Xtr, Ytr)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))
# 0.0 0.0666666666667
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))
# -0.0 inf

model = ml.knn.knnClassify(Xtr, Ytr, K=5)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))
# 0.0254237288136 0.0333333333333
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))
# 0.0672872960653 inf

model = ml.knn.knnClassify(Xtr, Ytr, K=100, alpha=0.1)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))
# 0.0932203389831 0.133333333333
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))
# 0.641303145273 0.673828596126

model = ml.knn.knnClassify(Xtr[:, :2], Ytr)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()

model = ml.knn.knnClassify(Xtr[:, :2], Ytr, K=5)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()

model = ml.knn.knnClassify(Xtr[:, :2], Ytr, K=100, alpha=.1)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()
# 0.0202702702703
print(model.nll(X,Y))
# 0.0360394614996

model = ml.bayes.gaussClassify( X, Y, equal=True );
print(model.err(X,Y))
# 0.0135135135135
print(model.nll(X,Y))
# 0.0880380736893

model = ml.bayes.gaussClassify( X, Y, diagonal=True );
print(model.err(X,Y))
# 0.0405405405405
print(model.nll(X,Y))
# 0.112463365158


model = ml.bayes.gaussClassify( X[:,:2], Y );
ml.plotClassify2D( model, X[:,:2], Y)
plt.show()

model = ml.bayes.gaussClassify( X[:,:2], Y, equal=True );
ml.plotClassify2D( model, X[:,:2], Y)
plt.show()

model = ml.bayes.gaussClassify( X[:,:2], Y, diagonal=True );
ml.plotClassify2D( model, X[:,:2], Y)
plt.show()


Example #5
# sys.path.append('/path/to/parent/dir/');

X,Y = ml.shuffleData(X,Y); # shuffle data randomly

# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)

Xtr,Xva,Ytr,Yva = ml.splitData(X,Y, 0.75); # split data into 75/25 train/validation

for K in [1, 5, 10, 50]: ## visualize classification boundary
    knn = ml.knn.knnClassify() # create the object and train it
    knn.train(Xtr, Ytr, K) # where K is an integer, e.g. 1 for nearest neighbor prediction
    YvaHat = knn.predict(Xva) # get estimates of y for each data point in Xva

    ml.plotClassify2D( knn, Xtr, Ytr, axis=plt ) # make 2D classification plot with data (Xtr,Ytr)
    plt.close()

## b ##

K=[1,2,5,10,50,100,200]
errTrain = []
errValidation = []
for i,k in enumerate(K):
    learner = ml.knn.knnClassify() ## train
    learner.train(Xtr[:,0:2], Ytr, k)
    Yhat = learner.predict(Xtr[:,0:2]) #predict
    print(Yhat)
    errTrain.append(learner.err(Xtr[:,0:2], Ytr)) # fraction of training predictions that are wrong
    learner2 = ml.knn.knnClassify()  ## train
    learner2.train(Xva[:, 0:2], Yva, k)
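    # Hedged sketch of how the truncated loop presumably continues: record the
    # validation error. The conventional choice is to score `learner` (trained on
    # Xtr) against the held-out data, rather than learner2 above.
    errValidation.append(learner.err(Xva[:, 0:2], Yva))

# Plot both error curves against K on a log-scaled axis (mirrors later examples).
plt.semilogx(K, errTrain, 'r-', label='training error')
plt.semilogx(K, errValidation, 'g-', label='validation error')
plt.legend()
plt.show()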
Example #6
learner.classes = np.unique(YA)  # define class labels using YA or YB
wts = np.array([.5, 1, -.25])
# TODO: fill in values
learner.theta = wts
# set the learner's parameters
learner.plotBoundary(XA, YA)
plt.close()
learner.plotBoundary(XB, YB)
plt.close()

## part c ##

print "Error Rate (dataset A): "
print np.mean(YA != learner.predict(XA))  ## equivalent to expected 0.0505

print "Error Rate (dataset B): "
print np.mean(YB != learner.predict(XB))  ## .5454

## part d ##
learner.classes = np.unique(YA)
ml.plotClassify2D(learner, XA, YA, axis=plt)
plt.close()

learner.classes = np.unique(YB)
ml.plotClassify2D(learner, XB, YB, axis=plt)
plt.close()
## the resulting decision boundaries match the ones computed analytically,
## mostly for XA, since the error rate for XB is quite bad (~0.54)

## part e ##
Example #7
    if y_pred_a[i] != YA[i]:
        count_a += 1
print('error for A set', count_a / len(y_pred_a))

count_b = 0
lr_b = logisticClassify2()
lr_b.classes = np.unique(YB)
lr_b.theta = wts
y_pred_b = lr_b.predict(XB)
for i in range(len(y_pred_b)):
    if y_pred_b[i] != YB[i]:
        count_b += 1
print('error for B set', count_b / len(y_pred_b))

# (d)
ml.plotClassify2D(lr_a, XA, YA)
plt.show()
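# Presumably part (d) also shows the boundary for the second learner; a minimal
# sketch mirroring the line above for dataset B:
ml.plotClassify2D(lr_b, XB, YB)
plt.show()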

# (e)

# (f)

# (g)
lr_a.train(XA, YA)
plt.close()
ml.plotClassify2D(lr_a, XA, YA)
plt.show()
lr_b.train(XB, YB)
plt.close()
ml.plotClassify2D(lr_b, XB, YB)
plt.show()
iris = np.genfromtxt("data/iris.txt", delimiter = None)
X = iris[:,0:-1]

# (a) Loading the first two features of the iris data set and plotting to check
#   clustering
X_two = X[:,0:2]
plt.scatter(X[:,0], X[:,1], c='b')
plt.title('Plotting the first two features')
plt.show()


# (b) Running k means with k=5 and k=20 and plotting the same
k_clusters = [5, 20]
for k in k_clusters:
    (z, c, sumd) = ml.cluster.kmeans(X_two, k)
    ml.plotClassify2D(None, X_two, z)
    plt.title('k-Means Clustering with k = ' + str(k))
    plt.show()

initializations = ['random', 'farthest', 'k++']
parameters = []
for initialization in initializations:
    for k in k_clusters:
        parameters.append((k, initialization))


sumd_s = []
z_s = []
for parameter in parameters:
    (z, c, sumd) = ml.cluster.kmeans(X_two, parameter[0], parameter[1])
    z_s.append(z)
    sumd_s.append(sumd)  # record the cost so the initializations can be compared
#try for different initializations and select the model values with the least cost
mincost = np.inf
for i in range(10):
    z, Y, l = ml.cluster.kmeans(X, 5)
    if l < mincost:
        mincost = l
        z_leastcost = z
        Y_leastcost = Y

# In[42]:

#ml.plotClassify2D(None,X,Y[0])
plt.scatter(Y_leastcost[:, 0], Y_leastcost[:, 1], c='r',
            marker='x')  #mark centers
ml.plotClassify2D(None, X, z_leastcost)  #color points based on clustering
plt.title("k-means clustering for k=5")

# In[43]:

#try for different initializations and select the model values with the least cost
mincost = np.inf
for i in range(10):
    z, Y, l = ml.cluster.kmeans(X, 20)
    if l < mincost:
        mincost = l
        z_leastcost = z
        Y_leastcost = Y

# In[44]:
print('Error rate (dataset B): %0.4f' %
      (np.mean(yBhat.reshape(YBtemp.shape) != YBtemp)))

# d
X1s = np.linspace(-3, 3, 100)  # densely sample possible x-values
X2s = np.linspace(-10, 10, 200)
Xs = np.zeros((X1s.shape[0] * X2s.shape[0], 2))
k = 0
l1 = X1s.shape[0]
l2 = X2s.shape[0]
for i in range(l1):
    Xs[k * l2:(k + 1) * l2, 0] = X1s[i]
    for j in range(k * l2, (k + 1) * l2):
        Xs[j, 1] = X2s[j % l2]
    k += 1
Ys = learner.predict(Xs)
ml.plotClassify2D(learner, Xs, Ys)
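# A more compact way to build the same dense grid (a sketch, not the snippet's own
# code; Xs_grid is an illustrative name). np.meshgrid yields the same set of points,
# though in a different row order than the loop above.
XX1, XX2 = np.meshgrid(X1s, X2s)
Xs_grid = np.column_stack([XX1.ravel(), XX2.ravel()])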

# e
# dJ(j)/d(theta) = (sigma(x^(j)theta^T) - y^(j))x^(j)
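# A minimal sketch of the single-example gradient step implied by the formula above
# (theta, x_j, y_j, stepsize are illustrative names, not from the assignment code):
def logistic_sgd_step(theta, x_j, y_j, stepsize):
    sig = 1.0 / (1.0 + np.exp(-x_j.dot(theta)))   # sigma(x^(j) theta^T)
    return theta - stepsize * (sig - y_j) * x_j   # theta <- theta - stepsize * dJ_j/dtheta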

# f

# g

learnerA = lc2.logisticClassify2()
[it, J01, Jsur] = learnerA.train(XA,
                                 YA,
                                 initStep=1.0,
                                 stopTol=1e-4,
                                 stopIter=1001,
                                 plot=None)
from numpy import asmatrix as arr
from importlib import reload

np.random.seed(0)
'''
Problem 1: Basics of Clustering
'''
# a) Load Iris data restricted to the first two features. Observe clusters
iris = np.genfromtxt("data/iris.txt", delimiter=None)  # load the text file
Y = iris[:, -1]  # target value is the last column
X = iris[:, 0:2]  # first two features
plt.plot(iris[:, 0], iris[:, 1], 'bo', linewidth=2)
plt.xlabel('First feature')
plt.ylabel('Second feature')
plt.show()
# b) k-means on data for k = 5 and 20. Try a few different initializations

# random init
[z5, c5, sumd5] = ml.cluster.kmeans(X, 5, init='random', max_iter=100)
[z20, c20, sumd20] = ml.cluster.kmeans(X, 20, init='random', max_iter=100)
fig, ax = plt.subplots(nrows = 1, ncols =2, figsize = (12, 6))
ml.plotClassify2D(None, X, z5, axis=ax[0])
ax[0].plot(c5[:,0], c5[:,1], 'r*', linewidth=10)
ml.plotClassify2D(None, X, z20, axis=ax[1])
ax[1].plot(c20[:,0], c20[:,1], 'r*', linewidth=10)
plt.show()
print('Error rate k = 5, random: %0.4f' % (np.mean(z5.reshape(Y.shape) != Y)))
print('Error rate k = 20, random: %0.4f' % (np.mean(z20.reshape(Y.shape) != Y)))

# k++ init
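# Hedged sketch of the truncated k++ block, mirroring the random-init code above:
[z5, c5, sumd5] = ml.cluster.kmeans(X, 5, init='k++', max_iter=100)
[z20, c20, sumd20] = ml.cluster.kmeans(X, 20, init='k++', max_iter=100)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
ml.plotClassify2D(None, X, z5, axis=ax[0])
ax[0].plot(c5[:, 0], c5[:, 1], 'r*', linewidth=10)
ml.plotClassify2D(None, X, z20, axis=ax[1])
ax[1].plot(c20[:, 0], c20[:, 1], 'r*', linewidth=10)
plt.show()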
Example #12
learner.theta = wts
# set the learner's parameters

#learner.plotBoundary(XA,YA);

#(c)
YAhat = [None] * len(XA)
YAhat = learner.predict(XA)

i = 0
err = 0
for i in range(len(YAhat)):
    err += 1 if (YAhat[i] != YA[i]) else 0

fracterr = err / (len(YAhat))

#Repeat for XB,YB
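# Hedged sketch of that repetition (same learner and weights, evaluated on dataset B):
YBhat = learner.predict(XB)
errB = 0
for i in range(len(YBhat)):
    errB += 1 if (YBhat[i] != YB[i]) else 0
fracterrB = errB / len(YBhat)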

#(d)
plt.figure()
ml.plotClassify2D(learner, XA, YA)
#The figures match

#(e)
wts = np.random.rand(3)
wts = wts[:, np.newaxis]
learner.theta = wts
# set the learner's parameters
learner.train(XA, YA)
plt.figure()
ml.plotClassify2D(learner, XA, YA)
Example #13
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from logisticClassify2 import *

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, _ = ml.transforms.rescale(X)  # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

print("Part 1")
ml.plotClassify2D(None, XA, YA)
plt.show()
ml.plotClassify2D(None, XB, YB)
plt.show()

learner = logisticClassify2()
# create "blank" learner
learner.classes = np.unique(YA)  # define class labels using YA or YB
wts = np.array([.5, -.25, 1])
# TODO: fill in values
learner.theta = wts
# set the learner’s parameters
print("Part 2")
learner.plotBoundary(XA, YA)
plt.show()

print("Part 3")
print("ERROR:", learner.err(XA, YA))
Example #14
X = iris[:, 0:2]  ## restrict iris to 2 features, ignore class var

## part b ##

sumd = float("inf")
for i in range(5):
    Zi, Ci, SUMDi = cl.kmeans(X, 5, 'random')  ## 5 clusters
    if sumd > SUMDi:
        Z = Zi
        C = Ci
        sumd = SUMDi

print "Best Score (5 Clusters): "
print sumd

ml.plotClassify2D(None, X, Z)
# plt.show()

sumd = float("inf")
for i in range(5):
    Zi, Ci, SUMDi = cl.kmeans(X, 20, 'random')  ## 20 clusters
    if sumd > SUMDi:
        Z = Zi
        C = Ci
        sumd = SUMDi

print "Best Score (20 Clusters): "
print sumd

ml.plotClassify2D(None, X, Z)
# plt.show()
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
iris = np.genfromtxt("data/iris.txt", delimiter=None)
Y = iris[:, -1]  # target value is the last column
X = iris[:, 0:-1]  # features are the other columns

X, Y = ml.shuffleData(X, Y)
# shuffle data randomly
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)
# split data into 75/25 train/test

# (a) Plotting classification boundary for two features in the iris dataset
K = [1, 5, 10, 50]
for i in K:
    knn = ml.knn.knnClassify()
    knn.train(Xtr[:, 0:2], Ytr, i)
    ml.plotClassify2D(knn, Xtr[:, 0:2], Ytr)
    plt.show()

# (b) Compute the training and test error rates of a trained kNN classifier for
#     several values of k, then plot error rate vs. k (see the sketch after the loop)
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = []
errTest = []
for i, k in enumerate(K):
    learner = ml.knn.knnClassify()
    learner.train(Xtr[:, 0:2], Ytr, k)
    YTrainPred = learner.predict(Xtr[:, 0:2])
    errTrain.append(float(np.sum(YTrainPred != Ytr)) / float(Xtr.shape[0]))

    YTestPred = learner.predict(Xte[:, 0:2])
    errTest.append(float(np.sum(YTestPred != Yte)) / float(Xte.shape[0]))
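# The "error rate vs. k" plot described in the comment above is cut off here; a
# minimal sketch using a log-scaled x axis:
plt.semilogx(K, errTrain, 'r-', label='training error')
plt.semilogx(K, errTest, 'g-', label='test error')
plt.xlabel('k')
plt.ylabel('error rate')
plt.legend()
plt.show()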
Example #16
# import sys
# sys.path.append('/path/to/parent/dir/');
# X,Y = ml.shuffleData(X,Y); # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
# Xtr,Xte,Ytr,Yte = ml.splitData(X,Y, 0.75); # split data into 75/25 train/test

# (a)
# Use only first two features of X
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)
# Visualize classification boundary for varying values of K = [1,5,10,50]

for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)

# (b) Prediction/ error for training set and test set
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(7)
errTest = np.zeros(7)
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
    errTrain[i] = (np.sum(Yhat_tr != Ytr)) / len(Ytr)
    errTest[i] = (np.sum(Yhat_te != Yte)) / len(Yte)
    plt.semilogx(k, errTrain[i], c='r', marker='o')
    plt.semilogx(k, errTest[i], c='g', marker='s')
plt.show()
Example #17
import matplotlib.pyplot as plt
import mltools as ml
import logisticClassify2 as lC

# Part A

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, _ = ml.rescale(X)  # works much better on rescaled data

XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

plt.title("Class 0 vs Class 1")
ml.plotClassify2D(None, XA, YA)
plt.show()

plt.title("Class 1 vs Class 2")
ml.plotClassify2D(None, XB, YB)
plt.show()

# Part B

learnerA = lC.logisticClassify2()
learnerA.classes = np.unique(YA)
wts = np.array([.5, 1, -.25])
learnerA.theta = wts

plt.title("Class 0 vs Class 1")
learnerA.plotBoundary(XA, YA)