YAhat = learnerA.predict(XA)
mse = mean_squared_error(YA, YAhat)   # for unit-spaced labels this equals the error rate
print("Error rate on data set A: {}\n".format(mse))

YBhat = learnerB.predict(XB)
mseB = mean_squared_error(YB, YBhat)
print("Error rate on data set B: {}".format(mseB))

learnerA.train(XA, YA)
learnerB.train(XB, YB)

ml.plotClassify2D(learnerA, XA, YA)
plt.show()
ml.plotClassify2D(learnerB, XB, YB)
plt.show()
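# Aside (illustration, not part of the assignment): mean_squared_error doubles
# as the misclassification rate here because, for unit-spaced class labels, the
# squared difference is 1 exactly when the prediction is wrong. A minimal
# sketch with made-up labels to confirm the identity:
y_toy    = np.array([0, 1, 1, 0, 1])   # made-up labels
yhat_toy = np.array([0, 1, 0, 0, 0])   # made-up predictions
assert np.mean((y_toy - yhat_toy) ** 2) == np.mean(y_toy != yhat_toy)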
X, Y = iris[:,0:2], iris[:,-1]

# Problem 1: Basics of Clustering
# 1A: scatter the data before clustering
plt.scatter(X[:,0], X[:,1], color='b')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.title('Data before clustering')
plt.show()

# 1B: k-means with K = 5 and K = 20
z, c, d = ml.cluster.kmeans(X, 5)
ml.plotClassify2D(None, X, z)
plt.title('K = 5')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.show()

z, c, d = ml.cluster.kmeans(X, 20)
ml.plotClassify2D(None, X, z)
plt.title('K = 20')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.show()

# 1C: agglomerative clustering, 5 clusters, single ('min') linkage
z, c = ml.cluster.agglomerative(X, 5, method='min')
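# The 1C agglomerative result can be visualized the same way as the k-means
# runs above; a short sketch, assuming the z returned by agglomerative():
ml.plotClassify2D(None, X, z)
plt.title('Agglomerative, 5 clusters, single linkage')
plt.xlabel('feature x_1')
plt.ylabel('feature x_2')
plt.show()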
model = ml.knn.knnClassify(Xtr, Ytr)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))   # 0.0 0.0666666666667
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))   # -0.0 inf

model = ml.knn.knnClassify(Xtr, Ytr, K=5)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))   # 0.0254237288136 0.0333333333333
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))   # 0.0672872960653 inf

model = ml.knn.knnClassify(Xtr, Ytr, K=100, alpha=0.1)
print(model.err(Xtr, Ytr), model.err(Xva, Yva))   # 0.0932203389831 0.133333333333
print(model.nll(Xtr, Ytr), model.nll(Xva, Yva))   # 0.641303145273 0.673828596126

model = ml.knn.knnClassify(Xtr[:, :2], Ytr)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()

model = ml.knn.knnClassify(Xtr[:, :2], Ytr, K=5)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()

model = ml.knn.knnClassify(Xtr[:, :2], Ytr, K=100, alpha=.1)
ml.plotClassify2D(model, Xtr[:, :2], Ytr)
plt.show()
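# For reference, the alpha parameter above makes the kNN vote distance-weighted.
# A self-contained sketch of that idea (my own toy helper, assuming
# exp(-alpha * distance) weights -- not mltools' exact implementation):
def weighted_knn_predict(Xtr, Ytr, x, K=5, alpha=0.1):
    d = np.sqrt(((Xtr - x) ** 2).sum(axis=1))    # Euclidean distances to x
    nearest = np.argsort(d)[:K]                  # indices of the K closest points
    w = np.exp(-alpha * d[nearest])              # closer neighbors vote more
    classes = np.unique(Ytr)
    votes = [w[Ytr[nearest] == c].sum() for c in classes]
    return classes[np.argmax(votes)]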
# 0.0202702702703
print(model.nll(X, Y))
# 0.0360394614996

model = ml.bayes.gaussClassify(X, Y, equal=True)
print(model.err(X, Y))
# 0.0135135135135
print(model.nll(X, Y))
# 0.0880380736893

model = ml.bayes.gaussClassify(X, Y, diagonal=True)
print(model.err(X, Y))
# 0.0405405405405
print(model.nll(X, Y))
# 0.112463365158

model = ml.bayes.gaussClassify(X[:, :2], Y)
ml.plotClassify2D(model, X[:, :2], Y)
plt.show()

model = ml.bayes.gaussClassify(X[:, :2], Y, equal=True)
ml.plotClassify2D(model, X[:, :2], Y)
plt.show()

model = ml.bayes.gaussClassify(X[:, :2], Y, diagonal=True)
ml.plotClassify2D(model, X[:, :2], Y)
plt.show()
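# A reminder of what the flags mean (assuming gaussClassify follows the usual
# conventions): equal=True ties one covariance across all classes, which gives
# linear decision boundaries; diagonal=True drops feature correlations, so each
# class-conditional density factors into 1-D Gaussians. A sketch of the
# diagonal-covariance log-density (illustration only):
def diag_gauss_logpdf(x, mu, var):
    # log N(x; mu, diag(var)) = -0.5 * sum( log(2*pi*var) + (x - mu)^2 / var )
    return -0.5 * np.sum(np.log(2 * np.pi * var) + (x - mu) ** 2 / var)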
# sys.path.append('/path/to/parent/dir/')

X, Y = ml.shuffleData(X, Y)   # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y, 0.75)   # split data into 75/25 train/validation

## a: visualize classification boundary for several K ##
for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify()   # create the object and train it
    knn.train(Xtr, Ytr, K)       # K is an integer, e.g. 1 for nearest-neighbor prediction
    YvaHat = knn.predict(Xva)    # get estimates of y for each data point in Xva
    ml.plotClassify2D(knn, Xtr, Ytr, axis=plt)   # 2D classification plot with data (Xtr, Ytr)
    plt.show()

## b: training vs. validation error as a function of K ##
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = []
errValidation = []
for i, k in enumerate(K):
    learner = ml.knn.knnClassify()
    learner.train(Xtr[:, 0:2], Ytr, k)               # train on the first two features
    errTrain.append(learner.err(Xtr[:, 0:2], Ytr))   # err() = fraction of predictions that are wrong
    # Validation error comes from the SAME learner, evaluated on held-out data;
    # training a second learner on the validation set would not measure generalization:
    errValidation.append(learner.err(Xva[:, 0:2], Yva))
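# To finish part b, plot both error curves against K on a log scale; a sketch,
# assuming errTrain and errValidation were filled as above:
plt.semilogx(K, errTrain, 'r-o', label='training error')
plt.semilogx(K, errValidation, 'g-s', label='validation error')
plt.xlabel('K')
plt.ylabel('error rate')
plt.legend()
plt.show()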
learner.classes = np.unique(YA)   # define class labels using YA or YB
wts = np.array([.5, 1, -.25])     # weights given in the problem statement
learner.theta = wts               # set the learner's parameters
learner.plotBoundary(XA, YA)
plt.close()
learner.plotBoundary(XB, YB)
plt.close()

## part c ##
print("Error Rate (dataset A):")
print(np.mean(YA != learner.predict(XA)))   # matches the expected 0.0505
print("Error Rate (dataset B):")
print(np.mean(YB != learner.predict(XB)))   # 0.5454

## part d ##
learner.classes = np.unique(YA)
ml.plotClassify2D(learner, XA, YA, axis=plt)
plt.close()
learner.classes = np.unique(YB)
ml.plotClassify2D(learner, XB, YB, axis=plt)
plt.close()
## The resulting decision boundaries match the ones computed analytically,
## at least for XA; for XB the error rate is quite bad (~0.54).
## (A sketch of the analytic boundary follows below.)

## part e ##
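# Re: part d above -- the "computed analytically" check is just solving
# theta0 + theta1*x1 + theta2*x2 = 0 for x2; with theta = (.5, 1, -.25) the
# line is x2 = 2 + 4*x1. A sketch overlaying it on data set A (illustration only):
theta = np.array([.5, 1, -.25])
x1 = np.linspace(XA[:, 0].min(), XA[:, 0].max(), 50)
x2 = -(theta[0] + theta[1] * x1) / theta[2]   # = 2 + 4*x1 for these weights
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.plot(x1, x2, 'k-')
plt.show()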
    if y_pred_a[i] != YA[i]:
        count_a += 1
print('error for A set', count_a / len(y_pred_a))

count_b = 0
lr_b = logisticClassify2()
lr_b.classes = np.unique(YB)
lr_b.theta = wts
y_pred_b = lr_b.predict(XB)
for i in range(len(y_pred_b)):
    if y_pred_b[i] != YB[i]:
        count_b += 1
print('error for B set', count_b / len(y_pred_b))

# (d) plot each learner's boundary on its own data set
ml.plotClassify2D(lr_a, XA, YA)
plt.show()

# (e)
# (f)
# (g) train each learner, then re-plot the learned boundaries
lr_a.train(XA, YA)
plt.close()                     # close any figure produced during training
ml.plotClassify2D(lr_a, XA, YA)
plt.show()

lr_b.train(XB, YB)
plt.close()
ml.plotClassify2D(lr_b, XB, YB)
plt.show()
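# For reference, the counting loops in part (c) above can be replaced by
# one-line vectorized computations; after training, the same pattern gives
# the final error rates:
print('trained error for A set', np.mean(lr_a.predict(XA) != YA))
print('trained error for B set', np.mean(lr_b.predict(XB) != YB))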
iris = np.genfromtxt("data/iris.txt", delimiter=None)
X = iris[:, 0:-1]

# (a) Load the first two features of the iris data set and plot them to
# eyeball the clustering
X_two = X[:, 0:2]
plt.scatter(X_two[:, 0], X_two[:, 1], c='b')
plt.title('Plotting the first two features')
plt.show()

# (b) Run k-means with k=5 and k=20 and plot the results
k_clusters = [5, 20]
for k in k_clusters:
    z, c, sumd = ml.cluster.kmeans(X_two, k)
    ml.plotClassify2D(None, X_two, z)
    plt.title('k-Means Clustering with k = ' + str(k))
    plt.show()

# Compare the available initializations for each k
initializations = ['random', 'farthest', 'k++']
parameters = []
for initialization in initializations:
    for k in k_clusters:
        parameters.append((k, initialization))

sumd_s = []
z_s = []
for parameter in parameters:
    z, c, sumd = ml.cluster.kmeans(X_two, parameter[0], parameter[1])
    z_s.append(z)
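# Presumably the loop above also records sumd_s.append(sumd); with that, the
# runs can be compared and the lowest-cost clustering plotted (sketch only):
for (k, init), cost in zip(parameters, sumd_s):
    print('k = %2d, init = %-8s  cost = %.4f' % (k, init, cost))
best = int(np.argmin(sumd_s))               # lowest-cost run overall
ml.plotClassify2D(None, X_two, z_s[best])
plt.title('Best run: k = %d, init = %s' % parameters[best])
plt.show()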
# Try several initializations and keep the clustering with the least cost
mincost = np.inf
for i in range(10):
    z, c, cost = ml.cluster.kmeans(X, 5)
    if cost < mincost:
        mincost = cost
        z_leastcost = z
        c_leastcost = c

plt.scatter(c_leastcost[:, 0], c_leastcost[:, 1], c='r', marker='x')   # mark centers
ml.plotClassify2D(None, X, z_leastcost)    # color points based on clustering
plt.title("k-means clustering for k=5")
plt.show()

# Repeat for k = 20
mincost = np.inf
for i in range(10):
    z, c, cost = ml.cluster.kmeans(X, 20)
    if cost < mincost:
        mincost = cost
        z_leastcost = z
        c_leastcost = c
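# Since the same restart loop appears for both k = 5 and k = 20, it could be
# factored into a helper; a sketch using the same kmeans interface as above:
def best_kmeans(X, k, n_restarts=10):
    best_z, best_c, best_cost = None, None, np.inf
    for _ in range(n_restarts):
        z, c, cost = ml.cluster.kmeans(X, k)
        if cost < best_cost:
            best_z, best_c, best_cost = z, c, cost
    return best_z, best_c, best_cost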
          (np.mean(yBhat.reshape(YBtemp.shape) != YBtemp)))

# (d) densely sample the feature space and plot the learner's predictions
X1s = np.linspace(-3, 3, 100)     # densely sample possible x1-values
X2s = np.linspace(-10, 10, 200)   # ... and x2-values
G1, G2 = np.meshgrid(X1s, X2s)    # every (x1, x2) combination on the grid
Xs = np.column_stack((G1.ravel(), G2.ravel()))
Ys = learner.predict(Xs)
ml.plotClassify2D(learner, Xs, Ys)

# (e) gradient of the surrogate loss for a single data point j:
#     dJ_j/dtheta = (sigma(x^(j) . theta) - y^(j)) * x^(j)

# (f)
# (g)
learnerA = lc2.logisticClassify2()
it, J01, Jsur = learnerA.train(XA, YA, initStep=1.0, stopTol=1e-4,
                               stopIter=1001, plot=None)
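# For reference, the part (e) gradient corresponds to a stochastic update like
# the following; a sketch of the idea, not the actual logisticClassify2.train:
def sigma(r):
    return 1.0 / (1.0 + np.exp(-r))

def sgd_epoch(theta, X, Y, step):
    # one stochastic-gradient pass over the data, constant feature prepended
    for j in range(X.shape[0]):
        xj = np.concatenate(([1.0], X[j]))
        grad = (sigma(theta @ xj) - Y[j]) * xj   # (sigma(x.theta) - y) * x
        theta = theta - step * grad
    return theta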
from numpy import asmatrix as arr
from imp import reload

np.random.seed(0)

''' Problem 1: Basics of Clustering '''

# a) Load Iris data restricted to the first two features. Observe clusters
iris = np.genfromtxt("data/iris.txt", delimiter=None)   # load the text file
Y = iris[:, -1]     # target value is the last column
X = iris[:, 0:2]    # first two features
plt.plot(X[:, 0], X[:, 1], 'bo')
plt.xlabel('First feature')
plt.ylabel('Second feature')
plt.show()

# b) k-means on the data for k = 5 and 20. Try a few different initializations
# random init
z5, c5, sumd5 = ml.cluster.kmeans(X, 5, init='random', max_iter=100)
z20, c20, sumd20 = ml.cluster.kmeans(X, 20, init='random', max_iter=100)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
ml.plotClassify2D(None, X, z5, axis=ax[0])
ax[0].plot(c5[:, 0], c5[:, 1], 'r*', markersize=10)
ml.plotClassify2D(None, X, z20, axis=ax[1])
ax[1].plot(c20[:, 0], c20[:, 1], 'r*', markersize=10)
plt.show()
print('Error rate k = 5, random: %0.4f' % np.mean(z5.reshape(Y.shape) != Y))
print('Error rate k = 20, random: %0.4f' % np.mean(z20.reshape(Y.shape) != Y))

# k++ init
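# Aside on the error-rate prints in part b above: raw cluster IDs are
# arbitrary, so comparing z directly against Y understates agreement. Mapping
# each cluster to its majority class first is more meaningful (my own helper,
# not part of mltools):
def cluster_error(z, Y):
    z = np.asarray(z).reshape(Y.shape)
    yhat = np.zeros_like(Y)
    for c in np.unique(z):
        members = (z == c)
        vals, counts = np.unique(Y[members], return_counts=True)
        yhat[members] = vals[np.argmax(counts)]   # majority class in cluster c
    return np.mean(yhat != Y)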
learner.theta = wts               # set the learner's parameters
# learner.plotBoundary(XA, YA)

# (c) fraction of points misclassified
YAhat = learner.predict(XA)
err = 0
for i in range(len(YAhat)):
    err += 1 if (YAhat[i] != YA[i]) else 0
fracterr = err / len(YAhat)
# Repeat for XB, YB

# (d)
plt.figure()
ml.plotClassify2D(learner, XA, YA)
# The figures match

# (e) retrain starting from random initial weights
wts = np.random.rand(3)
wts = wts[:, np.newaxis]
learner.theta = wts               # set the learner's parameters
learner.train(XA, YA)
plt.figure()
ml.plotClassify2D(learner, XA, YA)
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from logisticClassify2 import *

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]      # get first two features & target
X, Y = ml.shuffleData(X, Y)           # reorder randomly (important later)
X, _ = ml.transforms.rescale(X)       # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]        # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]        # get class 1 vs 2

print("Part 1")
ml.plotClassify2D(None, XA, YA)
plt.show()
ml.plotClassify2D(None, XB, YB)
plt.show()

learner = logisticClassify2()         # create "blank" learner
learner.classes = np.unique(YA)       # define class labels using YA or YB
wts = np.array([.5, -.25, 1])         # weights specified for this part
learner.theta = wts                   # set the learner's parameters

print("Part 2")
learner.plotBoundary(XA, YA)
plt.show()

print("Part 3")
print("ERROR:", learner.err(XA, YA))
X = iris[:, 0:2]   ## restrict iris to 2 features, ignore class var

## part b ##
sumd = float("inf")
for i in range(5):
    Zi, Ci, SUMDi = cl.kmeans(X, 5, 'random')    ## 5 clusters
    if sumd > SUMDi:
        Z = Zi
        C = Ci
        sumd = SUMDi
print("Best Score (5 Clusters):", sumd)
ml.plotClassify2D(None, X, Z)
plt.show()

sumd = float("inf")
for i in range(5):
    Zi, Ci, SUMDi = cl.kmeans(X, 20, 'random')   ## 20 clusters
    if sumd > SUMDi:
        Z = Zi
        C = Ci
        sumd = SUMDi
print("Best Score (20 Clusters):", sumd)
ml.plotClassify2D(None, X, Z)
plt.show()
import mltools as ml

iris = np.genfromtxt("data/iris.txt", delimiter=None)
Y = iris[:, -1]      # target value is the last column
X = iris[:, 0:-1]    # features are the other columns
X, Y = ml.shuffleData(X, Y)                      # shuffle data randomly
Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)    # split data into 75/25 train/test

# (a) Plot the classification boundary for two features of the iris data set
K = [1, 5, 10, 50]
for i in K:
    knn = ml.knn.knnClassify()
    knn.train(Xtr[:, 0:2], Ytr, i)
    ml.plotClassify2D(knn, Xtr[:, 0:2], Ytr)
    plt.show()

# (b) Compute the error rate on the training and test data for a trained kNN
# classifier, then plot error rate vs. k
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = []
errTest = []
for i, k in enumerate(K):
    learner = ml.knn.knnClassify()
    learner.train(Xtr[:, 0:2], Ytr, k)
    YTrainPred = learner.predict(Xtr[:, 0:2])
    errTrain.append(float(np.sum(YTrainPred != Ytr)) / float(Xtr.shape[0]))
    YTestPred = learner.predict(Xte[:, 0:2])
    errTest.append(float(np.sum(YTestPred != Yte)) / float(Xte.shape[0]))
# import sys
# sys.path.append('/path/to/parent/dir/')

# X, Y = ml.shuffleData(X, Y)   # shuffle data randomly
# (This is a good idea in case your data are ordered in some pathological way,
# as the Iris data are)
# Xtr, Xte, Ytr, Yte = ml.splitData(X, Y, 0.75)   # split data into 75/25 train/test

# (a) Use only the first two features of X
X_new, Y_new = ml.shuffleData(X[:, [0, 1]], Y)
Xtr, Xte, Ytr, Yte = ml.splitData(X_new, Y_new, 0.75)

# Visualize the classification boundary for K = 1, 5, 10, 50
for K in [1, 5, 10, 50]:
    knn = ml.knn.knnClassify(Xtr, Ytr, K)
    ml.plotClassify2D(knn, Xtr, Ytr)
    plt.show()

# (b) Prediction / error for the training set and test set
K = [1, 2, 5, 10, 50, 100, 200]
errTrain = np.zeros(7)
errTest = np.zeros(7)
for i, k in enumerate(K):
    learner = ml.knn.knnClassify(Xtr, Ytr, k)
    Yhat_tr = learner.predict(Xtr)
    Yhat_te = learner.predict(Xte)
    errTrain[i] = np.sum(Yhat_tr != Ytr) / len(Ytr)
    errTest[i] = np.sum(Yhat_te != Yte) / len(Yte)

# Plot both error curves on a log-scaled x-axis
plt.semilogx(K, errTrain, c='r', marker='o', label='training error')
plt.semilogx(K, errTest, c='g', marker='s', label='test error')
plt.legend()
plt.show()
import matplotlib.pyplot as plt
import mltools as ml
import logisticClassify2 as lC

# Part A
iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]    # get first two features & target
X, Y = ml.shuffleData(X, Y)         # reorder randomly (important later)
X, _ = ml.rescale(X)                # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]      # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]      # get class 1 vs 2

plt.title("Class 0 vs Class 1")
ml.plotClassify2D(None, XA, YA)
plt.show()
plt.title("Class 1 vs Class 2")
ml.plotClassify2D(None, XB, YB)
plt.show()

# Part B
learnerA = lC.logisticClassify2()
learnerA.classes = np.unique(YA)
wts = np.array([.5, 1, -.25])
learnerA.theta = wts
plt.title("Class 0 vs Class 1")
learnerA.plotBoundary(XA, YA)